From 63b700f5352f5915b1e7f81f3f8b3359b24c8105 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Sun, 25 Jan 2026 00:57:29 -0500 Subject: [PATCH 01/26] fix(markdown): normalize link destinations and titles in html test renderer Decode HTML entities and percent-encode URLs in destinations/titles for CommonMark conformance. Apply the same normalization to autolink hrefs, unescape info strings, and preserve tab/NBSP characters in paragraph trimming. Add percent-encoding as a test_utils dependency. --- Cargo.lock | 1 + Cargo.toml | 1 + crates/biome_markdown_parser/Cargo.toml | 3 +- crates/biome_markdown_parser/src/to_html.rs | 62 +++++++++++++++++++-- 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bbd9fd5d9208..d7054e2153fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1301,6 +1301,7 @@ dependencies = [ "biome_unicode_table", "htmlize", "insta", + "percent-encoding", "quickcheck", "quickcheck_macros", "serde", diff --git a/Cargo.toml b/Cargo.toml index 77ef95cebe5f..81c03fdeecd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,6 +130,7 @@ libc = "0.2.180" mimalloc = "0.1.48" papaya = "0.2.3" path-absolutize = { version = "3.1.1", features = ["use_unix_paths_on_wasm"], optional = false } +percent-encoding = "2.3.2" phf = { version = "0.13.1", features = ["macros"] } proc-macro-error2 = { version = "2.0.1", default-features = false } proc-macro2 = "1.0.106" diff --git a/crates/biome_markdown_parser/Cargo.toml b/crates/biome_markdown_parser/Cargo.toml index 561f1054cd32..c3087a08e71c 100644 --- a/crates/biome_markdown_parser/Cargo.toml +++ b/crates/biome_markdown_parser/Cargo.toml @@ -25,6 +25,7 @@ biome_rowan = { workspace = true } biome_unicode_table = { workspace = true } # Optional dependency for test_utils feature (HTML rendering for spec tests) htmlize = { version = "1.0.6", features = ["unescape"], optional = true } +percent-encoding = { workspace = true, optional = true } 
tracing = { workspace = true } unicode-bom = { workspace = true } @@ -44,7 +45,7 @@ tests_macros = { path = "../tests_macros" } [features] # Enables test utilities (to_html module) for CommonMark spec compliance testing. # Not included in production builds to avoid unnecessary dependencies and code. -test_utils = ["dep:htmlize"] +test_utils = ["dep:htmlize", "dep:percent-encoding"] [lints] workspace = true diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index 35e1b04b89b0..f3dee25a4aa1 100644 --- a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -42,6 +42,7 @@ use biome_markdown_syntax::{ MdReferenceImage, MdReferenceLink, MdSetextHeader, MdTextual, }; use biome_rowan::{AstNode, AstNodeList, Direction, TextRange}; +use percent_encoding::{AsciiSet, CONTROLS, utf8_percent_encode}; use std::collections::HashMap; use crate::link_reference::normalize_reference_label; @@ -446,7 +447,10 @@ fn render_paragraph( } // Trim both ends - leading whitespace can appear from parser including // the space after list markers in the paragraph content - let content = strip_paragraph_indent(content.trim()); + let content = strip_paragraph_indent( + content + .trim_matches(|c| c == ' ' || c == '\n' || c == '\r') + ); if in_tight_list { // In tight lists, paragraphs are rendered without

tags @@ -602,10 +606,11 @@ fn render_fenced_code_block( // Extract just the language part (before first space) and process escapes let language = info_string.split_whitespace().next().unwrap_or(""); let language = process_escapes(language); + let language = htmlize::unescape(&language); if !language.is_empty() { out.push_str(" class=\"language-"); - out.push_str(&escape_html_attribute(&language)); + out.push_str(&escape_html_attribute(language.as_ref())); out.push('"'); } @@ -1194,7 +1199,7 @@ fn render_autolink(autolink: &MdAutolink, out: &mut String) { let href = if is_email { format!("mailto:{}", content) } else { - content.clone() + process_link_destination(&content) }; out.push_str(" String { }; // Process escapes - process_escapes(dest) + let dest = process_escapes(dest); + let decoded = htmlize::unescape(&dest).into_owned(); + percent_encode_uri(&decoded) } /// Process a link title (remove quotes, decode escapes). @@ -1379,9 +1386,54 @@ fn process_link_title(title: &str) -> String { }; // Process escapes - process_escapes(title) + let title = process_escapes(title); + htmlize::unescape(&title).into_owned() } +fn percent_encode_uri(value: &str) -> String { + let mut result = String::new(); + let mut last = 0; + + for (i, c) in value.char_indices() { + if c == '%' { + let bytes = value.as_bytes(); + if i + 2 < bytes.len() + && bytes[i + 1].is_ascii_alphanumeric() + && bytes[i + 2].is_ascii_alphanumeric() + { + if last < i { + result.push_str( + &utf8_percent_encode(&value[last..i], URI_ENCODE_SET).to_string(), + ); + } + result.push_str(&value[i..i + 3]); + last = i + 3; + } + } + } + + if last < value.len() { + result.push_str(&utf8_percent_encode(&value[last..], URI_ENCODE_SET).to_string()); + } + + result +} + +const URI_ENCODE_SET: &AsciiSet = &CONTROLS + .add(b' ') + .add(b'"') + .add(b'%') + .add(b'<') + .add(b'>') + .add(b'\\') + .add(b'[') + .add(b']') + .add(b'^') + .add(b'`') + .add(b'{') + .add(b'|') + .add(b'}'); + // 
============================================================================ // HTML Escaping // ============================================================================ From 118e210920d8247bb73ff5c7dacd236d4000ce16 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Sun, 25 Jan 2026 01:03:39 -0500 Subject: [PATCH 02/26] fix(markdown): improve link and reference parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit consolidates several fixes related to link parsing, reference definitions, and URL validation to align with CommonMark specifications. Changes include: - Enforce strict parenthesis depth counting in link destinations to correctly handle nested parens. - Tighten link title parsing to handle edge cases with escapes and mismatched delimiters. - Improve validation of link destinations, specifically handling whitespace and control characters. - Fix inline link parsing fallback mechanisms when patterns do not match. - Correctly handle whitespace in inline links and truncated destinations. - Resolve multiple CommonMark compliance failures related to link reference definitions and general link rendering. Includes regression tests for percent-encoding and complex link structures. 
diff --git a/Cargo.lock b/Cargo.lock index bfc6c1c8c5..72c683148d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1295,6 +1295,7 @@ dependencies = [ "biome_markdown_syntax", "biome_parser", "biome_rowan", + "biome_string_case", "biome_test_utils", "biome_unicode_table", "htmlize", diff --git a/crates/biome_markdown_parser/Cargo.toml b/crates/biome_markdown_parser/Cargo.toml index c3087a08e7..3f100686af 100644 --- a/crates/biome_markdown_parser/Cargo.toml +++ b/crates/biome_markdown_parser/Cargo.toml @@ -22,6 +22,7 @@ biome_markdown_factory = { workspace = true } biome_markdown_syntax = { workspace = true } biome_parser = { workspace = true } biome_rowan = { workspace = true } +biome_string_case = { workspace = true } biome_unicode_table = { workspace = true } # Optional dependency for test_utils feature (HTML rendering for spec tests) htmlize = { version = "1.0.6", features = ["unescape"], optional = true } diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs index 7edfb0b9db..9bd74f48f7 100644 --- a/crates/biome_markdown_parser/src/link_reference.rs +++ b/crates/biome_markdown_parser/src/link_reference.rs @@ -1,5 +1,7 @@ use std::collections::HashSet; +use biome_string_case::StrOnlyExtension; + use biome_markdown_syntax::{MdLinkLabel, MdLinkReferenceDefinition}; use biome_rowan::{AstNode, Direction}; @@ -29,7 +31,7 @@ pub(crate) fn normalize_reference_label(text: &str) -> String { push_normalized_char(&mut out, c, &mut saw_whitespace); } - out + out.as_str().to_lowercase_cow().to_uppercase() } fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { @@ -37,9 +39,7 @@ fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { out.push(' '); } *saw_whitespace = false; - for lower in c.to_lowercase() { - out.push(lower); - } + out.push(c); } pub(crate) fn collect_link_reference_definitions( diff --git a/crates/biome_markdown_parser/src/syntax.rs 
b/crates/biome_markdown_parser/src/syntax.rs index 1c9bdac152..9ceac84a0f 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -60,6 +60,9 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; +/// Maximum paren nesting allowed in link destinations per CommonMark. +pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32; + /// CommonMark requires 4 or more spaces for indented code blocks. const INDENT_CODE_BLOCK_SPACES: usize = 4; @@ -71,6 +74,98 @@ pub(crate) fn parse_document(p: &mut MarkdownParser) { m.complete(p, MD_DOCUMENT); } +/// Result of updating parenthesis depth when scanning link destinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ParenDepthResult { + /// Depth updated successfully, contains new depth value + Ok(i32), + /// Depth would exceed the maximum (too many nested opening parens). + /// Per cmark, this truncates the destination at this point. + DepthExceeded, + /// Unmatched closing paren (would go below 0). + /// This typically means the `)` belongs to the enclosing construct. 
+ UnmatchedClose, +} + +pub(crate) fn try_update_paren_depth(text: &str, depth: i32, max: i32) -> ParenDepthResult { + let mut depth = depth; + let mut chars = text.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' && matches!(chars.peek(), Some('(' | ')')) { + chars.next(); + continue; + } + + if c == '(' { + if depth == max { + return ParenDepthResult::DepthExceeded; + } + depth += 1; + } else if c == ')' { + if depth == 0 { + return ParenDepthResult::UnmatchedClose; + } + depth -= 1; + } + } + + ParenDepthResult::Ok(depth) +} + +pub(crate) enum LinkDestinationKind { + Enclosed, + Raw, +} + +pub(crate) fn validate_link_destination_text( + text: &str, + kind: LinkDestinationKind, + pending_escape: &mut bool, +) -> bool { + for c in text.chars() { + if *pending_escape { + if c.is_ascii_punctuation() { + *pending_escape = false; + continue; + } + *pending_escape = false; + } + + if c == '\\' { + *pending_escape = true; + continue; + } + + if c.is_ascii_control() { + return false; + } + + if matches!(kind, LinkDestinationKind::Enclosed) && c == '<' { + return false; + } + } + + true +} + +pub(crate) fn ends_with_unescaped_close(text: &str, close_char: char) -> bool { + if !text.ends_with(close_char) { + return false; + } + + let mut backslashes = 0; + for c in text.chars().rev().skip(1) { + if c == '\\' { + backslashes += 1; + } else { + break; + } + } + + backslashes % 2 == 0 +} + pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax { let mut list = DocumentBlockList; Present(list.parse_list(p)) diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index 275a09ddb6..44f0eed223 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -611,15 +611,35 @@ pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax { parse_emphasis_from_context(p, false) } -fn 
parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { +fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool { let m = p.start(); let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + let mut has_nested_link = false; loop { - if p.at(stop) || p.at_inline_end() { + if p.at_inline_end() { break; } + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + if p.at(L_BRACK) { + if !has_nested_link && nested_link_starts_here(p) { + has_nested_link = true; + } + bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + if parse_any_inline_no_links(p).is_absent() { break; } @@ -627,13 +647,53 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS m.complete(p, MD_INLINE_ITEM_LIST); p.set_emphasis_context(prev_context); + has_nested_link +} + +fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + if !p.at(L_BRACK) { + return false; + } + + p.bump(L_BRACK); + let mut depth = 0usize; + + loop { + if p.at(EOF) || p.at_inline_end() { + return false; + } + + if p.at(L_BRACK) { + depth += 1; + p.bump(L_BRACK); + continue; + } + + if p.at(R_BRACK) { + if depth > 0 { + depth -= 1; + p.bump(R_BRACK); + continue; + } + p.bump(R_BRACK); + return p.at(L_PAREN) || p.at(L_BRACK); + } + + p.bump(p.cur()); + } + }) } fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax { - if (p.at(BANG) && p.nth_at(1, L_BRACK)) || p.at(L_BRACK) { + if p.at(L_BRACK) { return super::parse_textual(p); } + if p.at(BANG) && p.nth_at(1, L_BRACK) { + return parse_inline_image(p); + } + parse_any_inline(p) } @@ -826,10 +886,15 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - parse_inline_item_list_until_no_links(p, 
R_BRACK); + let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); // ] - if missing at inline end, emit diagnostic; otherwise rewind if !p.eat(R_BRACK) { + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } if p.at_inline_end() { // Unclosed link/image at end of inline content - emit diagnostic // Expand range to include the text content, not just the opening bracket @@ -843,19 +908,50 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn p.rewind(checkpoint); return Absent; } + let text_end_offset = p.cur_range().start(); + + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } // Now decide based on what follows ] - if p.at(L_PAREN) { + let link_validation = if p.at(L_PAREN) { + inline_link_is_valid(p) + } else { + InlineLinkValidation::Invalid + }; + + if matches!( + link_validation, + InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded + ) { // Inline link/image: [text](url) or ![alt](url) // Bump past ( and lex the following tokens in LinkDefinition context // so whitespace separates destination and title. p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition); let destination = p.start(); - parse_inline_link_destination_tokens(p); + let destination_result = parse_inline_link_destination_tokens(p); + + // When depth exceeded, destination is truncated but link is still valid. + // Complete the destination and link immediately without looking for closing paren. 
+ if destination_result == DestinationScanResult::DepthExceeded { + destination.complete(p, MD_INLINE_ITEM_LIST); + return Present(m.complete(p, kind.inline_kind())); + } + let has_title = inline_title_starts_after_whitespace_tokens(p); - while is_whitespace_token(p) { - bump_textual_link_def(p); + while is_title_separator_token(p) { + bump_link_def_separator(p); + } + if destination_result == DestinationScanResult::Invalid { + destination.abandon(p); + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } destination.complete(p, MD_INLINE_ITEM_LIST); @@ -867,8 +963,18 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn title_m.complete(p, MD_LINK_TITLE); } + while is_title_separator_token(p) { + bump_link_def_separator(p); + } + if !p.eat(R_PAREN) { - kind.report_unclosed_destination(p, opening_range); + if p.at_inline_end() { + kind.report_unclosed_destination(p, opening_range); + } + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } Present(m.complete(p, kind.inline_kind())) @@ -888,7 +994,7 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) @@ -901,14 +1007,13 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) } } struct ReferenceLinkLookahead { - end_offset: TextSize, label_raw: String, is_shortcut: bool, } @@ -947,7 +1052,6 @@ fn lookahead_reference_common( p.bump(L_BRACK); let link_text = collect_bracket_text(p)?; - let end_offset = p.cur_range().end(); p.bump(R_BRACK); if p.at(L_PAREN) { 
@@ -963,10 +1067,8 @@ fn lookahead_reference_common( } else { label_text }; - let end_offset = p.cur_range().end(); p.bump(R_BRACK); return Some(ReferenceLinkLookahead { - end_offset, label_raw: label, is_shortcut: false, }); @@ -974,7 +1076,6 @@ fn lookahead_reference_common( } Some(ReferenceLinkLookahead { - end_offset, label_raw: link_text, is_shortcut: true, }) @@ -1025,48 +1126,296 @@ fn is_whitespace_token(p: &MarkdownParser) -> bool { fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - while is_whitespace_token(p) { - bump_textual_link_def(p); + let mut saw_whitespace = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + saw_whitespace = true; } - get_title_close_char(p).is_some() + saw_whitespace && get_title_close_char(p).is_some() }) } -fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) { +/// Result of validating an inline link. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InlineLinkValidation { + /// Link is valid with complete destination + Valid, + /// Link is invalid + Invalid, + /// Link is valid but destination was truncated due to paren depth limit. + /// The link should be closed immediately without looking for `)`. 
+ DepthExceeded, +} + +fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation { + p.lookahead(|p| { + if !p.at(L_PAREN) { + return InlineLinkValidation::Invalid; + } + + p.bump(L_PAREN); + p.re_lex_link_definition(); + + let destination_result = scan_inline_link_destination_tokens(p); + + // If depth exceeded, link is valid but truncated - no need to check for closing paren + if destination_result == DestinationScanResult::DepthExceeded { + return InlineLinkValidation::DepthExceeded; + } + + if destination_result == DestinationScanResult::Invalid { + return InlineLinkValidation::Invalid; + } + + let mut saw_separator = false; + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + saw_separator = true; + } + let has_title = saw_separator && get_title_close_char(p).is_some(); + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if has_title { + scan_title_content(p, get_title_close_char(p)); + } + + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if p.at(R_PAREN) { + InlineLinkValidation::Valid + } else { + InlineLinkValidation::Invalid + } + }) +} + +/// Result of scanning a link destination. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationScanResult { + /// Destination is valid and complete + Valid, + /// Destination is invalid (contains invalid characters, etc.) + Invalid, + /// Destination was truncated because paren depth exceeded the limit. + /// In this case, the link is considered valid but closed at the truncation point. 
+ DepthExceeded, +} + +fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; + if p.at(L_ANGLE) { + p.bump_link_definition(); + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } + if p.at(R_ANGLE) { + if pending_escape { + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + continue; + } + p.bump_link_definition(); + return DestinationScanResult::Valid; + } + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + } + } + + let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while !p.at(EOF) && !p.at(NEWLINE) { + if is_whitespace_token(p) { + break; + } + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + p.bump_link_definition(); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + // Per CommonMark/cmark, the link is still valid but closed here. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
+ break; + } + } + } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid +} + +fn scan_title_content(p: &mut MarkdownParser, close_char: Option) { + let Some(close_char) = close_char else { + return; + }; + + let text = p.cur_text(); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); + + p.bump_link_definition(); + if is_complete { + return; + } + + loop { + if p.at(EOF) || p.at(NEWLINE) { + return; + } + + let text = p.cur_text(); + if super::ends_with_unescaped_close(text, close_char) { + p.bump_link_definition(); + return; + } + + p.bump_link_definition(); + } +} + +fn skip_link_def_separator_tokens(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + p.bump(NEWLINE); + } else { + p.bump_link_definition(); + } +} + +fn is_title_separator_token(p: &MarkdownParser) -> bool { + is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) +} + +fn bump_link_def_separator(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + let item = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + item.complete(p, MD_TEXTUAL); + } else { + bump_textual_link_def(p); + } +} + +fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { p.re_lex_link_definition(); + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; if p.at(L_ANGLE) { bump_textual_link_def(p); - while !p.at(EOF) && !p.at(NEWLINE) { + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } if p.at(R_ANGLE) { + if pending_escape { + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + bump_textual_link_def(p); + continue; + } bump_textual_link_def(p); - break; + return 
DestinationScanResult::Valid; } - if is_whitespace_token(p) { - break; + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; } bump_textual_link_def(p); } - return; } let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + } while !p.at(EOF) && !p.at(NEWLINE) { if is_whitespace_token(p) { break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth == 0 { + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
break; } - paren_depth -= 1; } - - bump_textual_link_def(p); } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid } fn get_title_close_char(p: &MarkdownParser) -> Option { @@ -1088,9 +1437,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { }; let text = p.cur_text(); - let is_complete = text.len() >= 2 - && ((close_char == ')' && text.ends_with(')')) - || (close_char != ')' && text.ends_with(close_char))); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); bump_textual_link_def(p); if is_complete { @@ -1103,7 +1450,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { } let text = p.cur_text(); - if text.ends_with(close_char) { + if super::ends_with_unescaped_close(text, close_char) { bump_textual_link_def(p); return; } diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 86d9f57354..7422a055fb 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -184,15 +184,35 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { if p.at(L_ANGLE) { // Angle-bracketed destination p.bump_link_definition(); + let mut pending_escape = false; loop { if p.at(EOF) || p.at(NEWLINE) { return false; // Unterminated angle bracket } if p.at(R_ANGLE) { - p.bump_link_definition(); - // Consume separator whitespace into destination - skip_whitespace_tokens(p); - return true; + if pending_escape { + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return false; + } + p.bump_link_definition(); + continue; + } else { + p.bump_link_definition(); + // Consume separator whitespace into 
destination + skip_whitespace_tokens(p); + return true; + } + } + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return false; } p.bump_link_definition(); } @@ -201,6 +221,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { let mut paren_depth = 0i32; let mut has_content = false; let mut saw_separator = false; + let mut pending_escape = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); @@ -217,19 +238,31 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends destination - } + if !crate::syntax::validate_link_destination_text( + text, + crate::syntax::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return false; } - has_content = true; - saw_separator = false; - p.bump_link_definition(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + has_content = true; + saw_separator = false; + paren_depth = next_depth; + p.bump_link_definition(); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + // For link reference definitions, both cases end the destination + break; + } + } } has_content } @@ -249,17 +282,10 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { // Check if first token is complete (e.g., `"title"`) let first_text = p.cur_text(); - if first_text.len() >= 2 { - let is_complete = if close_char == ')' { - first_text.ends_with(')') - } else { - first_text.ends_with(close_char) - }; - if is_complete { - p.bump_link_definition(); - skip_whitespace_tokens(p); - return p.at(EOF) || p.at(NEWLINE); - } + if first_text.len() >= 2 && 
crate::syntax::ends_with_unescaped_close(first_text, close_char) { + p.bump_link_definition(); + skip_whitespace_tokens(p); + return p.at(EOF) || p.at(NEWLINE); } p.bump_link_definition(); @@ -271,11 +297,7 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { } // Check for closing delimiter - let is_close = if close_char == ')' { - p.at(R_PAREN) - } else { - p.cur_text().ends_with(close_char) - }; + let is_close = crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char); if is_close { p.bump_link_definition(); @@ -393,17 +415,21 @@ fn parse_link_destination(p: &mut MarkdownParser) { break; // Bare destination stops at first whitespace } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends bare destination + let text = p.cur_text(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + break; } } - - bump_textual_link_def(p); } } diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index f3dee25a4a..0512db55fe 100644 --- a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -447,10 +447,8 @@ fn render_paragraph( } // Trim both ends - leading whitespace can appear from parser including // the space after list markers in the paragraph content - let content = strip_paragraph_indent( - content - .trim_matches(|c| c == ' ' || c == '\n' || c == '\r') - ); + let content = + strip_paragraph_indent(content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r')); if in_tight_list { // In tight lists, paragraphs are rendered without

<p> tags @@ -1160,7 +1158,11 @@ where { if let Some(node) = label_node { let text = label_text(&node); - (text.clone(), Some(text)) + if text.trim().is_empty() { + (fallback, None) + } else { + (text.clone(), Some(text)) + } } else { (fallback, None) } @@ -1602,4 +1604,64 @@ mod tests { // U+0000 should become replacement character assert_eq!(decode_entity("&#0;"), Some("\u{FFFD}".to_string())); } + + #[test] + fn test_percent_encode_uri() { + let input = format!("https://a{}b.c/%20/%", '\u{1F44D}'); + let encoded = percent_encode_uri(&input); + assert_eq!(encoded, "https://a%F0%9F%91%8Db.c/%20/%25"); + } + + #[test] + fn test_process_link_destination_decodes_entities() { + let encoded = process_link_destination("https://example.com/&lt;"); + assert_eq!(encoded, "https://example.com/%3C"); + } + + #[test] + fn test_paren_depth_limit_in_destination() { + let dest = format!("x{}y{}", "(".repeat(32), ")".repeat(32)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected = format!("

a

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_paren_depth_limit_exceeded_in_destination() { + let dest = format!("x{}y{}", "(".repeat(33), ")".repeat(33)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected_dest = format!("x{}", "(".repeat(32)); + let trailing = ")".repeat(34); + let expected = format!("

a(y{trailing}

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_title_with_escaped_closing_quote() { + let parsed = parse_markdown("[a](/url \"title with \\\" quote\")\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

a

\n" + ); + } } diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md new file mode 100644 index 0000000000..3cbf1f91d3 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md @@ -0,0 +1 @@ +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap new file mode 100644 index 0000000000..236bd2046f --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap @@ -0,0 +1,399 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdInlineLink { + l_brack_token: L_BRACK@0..1 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@1..2 "a" [] [], + }, + ], + r_brack_token: R_BRACK@2..3 "]" [] [], + l_paren_token: L_PAREN@3..4 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "x" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..6 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@6..7 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@8..9 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..12 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..13 "(" [] [], + }, + MdTextual { + value_token: 
MD_TEXTUAL_LITERAL@13..14 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@14..15 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..17 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@17..18 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@18..19 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@19..20 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@20..21 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@21..22 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@22..23 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@24..25 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@26..27 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@27..28 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@28..29 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@29..30 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@30..31 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@31..32 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@32..33 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@33..34 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@34..35 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@35..36 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@36..37 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@37..38 "y" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@38..39 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@39..40 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 ")" [] [], + }, + MdTextual { + value_token: 
MD_TEXTUAL_LITERAL@41..42 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..43 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@43..44 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@44..45 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@45..46 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@46..47 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@47..48 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..49 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@49..50 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@50..51 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..52 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@52..53 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@53..54 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@54..55 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@55..56 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..59 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@59..60 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@60..61 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@61..62 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@62..63 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@63..64 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@64..65 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@65..66 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@66..67 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@67..68 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 ")" [] [], + }, + MdTextual { + value_token: 
MD_TEXTUAL_LITERAL@69..70 ")" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@70..71 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@71..72 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@72..72 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..72 + 0: (empty) + 1: MD_BLOCK_LIST@0..72 + 0: MD_PARAGRAPH@0..72 + 0: MD_INLINE_ITEM_LIST@0..72 + 0: MD_INLINE_LINK@0..71 + 0: L_BRACK@0..1 "[" [] [] + 1: MD_INLINE_ITEM_LIST@1..2 + 0: MD_TEXTUAL@1..2 + 0: MD_TEXTUAL_LITERAL@1..2 "a" [] [] + 2: R_BRACK@2..3 "]" [] [] + 3: L_PAREN@3..4 "(" [] [] + 4: MD_INLINE_ITEM_LIST@4..70 + 0: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "x" [] [] + 1: MD_TEXTUAL@5..6 + 0: MD_TEXTUAL_LITERAL@5..6 "(" [] [] + 2: MD_TEXTUAL@6..7 + 0: MD_TEXTUAL_LITERAL@6..7 "(" [] [] + 3: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "(" [] [] + 4: MD_TEXTUAL@8..9 + 0: MD_TEXTUAL_LITERAL@8..9 "(" [] [] + 5: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "(" [] [] + 6: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "(" [] [] + 7: MD_TEXTUAL@11..12 + 0: MD_TEXTUAL_LITERAL@11..12 "(" [] [] + 8: MD_TEXTUAL@12..13 + 0: MD_TEXTUAL_LITERAL@12..13 "(" [] [] + 9: MD_TEXTUAL@13..14 + 0: MD_TEXTUAL_LITERAL@13..14 "(" [] [] + 10: MD_TEXTUAL@14..15 + 0: MD_TEXTUAL_LITERAL@14..15 "(" [] [] + 11: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "(" [] [] + 12: MD_TEXTUAL@16..17 + 0: MD_TEXTUAL_LITERAL@16..17 "(" [] [] + 13: MD_TEXTUAL@17..18 + 0: MD_TEXTUAL_LITERAL@17..18 "(" [] [] + 14: MD_TEXTUAL@18..19 + 0: MD_TEXTUAL_LITERAL@18..19 "(" [] [] + 15: MD_TEXTUAL@19..20 + 0: MD_TEXTUAL_LITERAL@19..20 "(" [] [] + 16: MD_TEXTUAL@20..21 + 0: MD_TEXTUAL_LITERAL@20..21 "(" [] [] + 17: MD_TEXTUAL@21..22 + 0: MD_TEXTUAL_LITERAL@21..22 "(" [] [] + 18: MD_TEXTUAL@22..23 + 0: MD_TEXTUAL_LITERAL@22..23 "(" [] [] + 19: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "(" [] [] + 20: MD_TEXTUAL@24..25 + 0: MD_TEXTUAL_LITERAL@24..25 "(" [] [] + 21: 
MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "(" [] [] + 22: MD_TEXTUAL@26..27 + 0: MD_TEXTUAL_LITERAL@26..27 "(" [] [] + 23: MD_TEXTUAL@27..28 + 0: MD_TEXTUAL_LITERAL@27..28 "(" [] [] + 24: MD_TEXTUAL@28..29 + 0: MD_TEXTUAL_LITERAL@28..29 "(" [] [] + 25: MD_TEXTUAL@29..30 + 0: MD_TEXTUAL_LITERAL@29..30 "(" [] [] + 26: MD_TEXTUAL@30..31 + 0: MD_TEXTUAL_LITERAL@30..31 "(" [] [] + 27: MD_TEXTUAL@31..32 + 0: MD_TEXTUAL_LITERAL@31..32 "(" [] [] + 28: MD_TEXTUAL@32..33 + 0: MD_TEXTUAL_LITERAL@32..33 "(" [] [] + 29: MD_TEXTUAL@33..34 + 0: MD_TEXTUAL_LITERAL@33..34 "(" [] [] + 30: MD_TEXTUAL@34..35 + 0: MD_TEXTUAL_LITERAL@34..35 "(" [] [] + 31: MD_TEXTUAL@35..36 + 0: MD_TEXTUAL_LITERAL@35..36 "(" [] [] + 32: MD_TEXTUAL@36..37 + 0: MD_TEXTUAL_LITERAL@36..37 "(" [] [] + 33: MD_TEXTUAL@37..38 + 0: MD_TEXTUAL_LITERAL@37..38 "y" [] [] + 34: MD_TEXTUAL@38..39 + 0: MD_TEXTUAL_LITERAL@38..39 ")" [] [] + 35: MD_TEXTUAL@39..40 + 0: MD_TEXTUAL_LITERAL@39..40 ")" [] [] + 36: MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 ")" [] [] + 37: MD_TEXTUAL@41..42 + 0: MD_TEXTUAL_LITERAL@41..42 ")" [] [] + 38: MD_TEXTUAL@42..43 + 0: MD_TEXTUAL_LITERAL@42..43 ")" [] [] + 39: MD_TEXTUAL@43..44 + 0: MD_TEXTUAL_LITERAL@43..44 ")" [] [] + 40: MD_TEXTUAL@44..45 + 0: MD_TEXTUAL_LITERAL@44..45 ")" [] [] + 41: MD_TEXTUAL@45..46 + 0: MD_TEXTUAL_LITERAL@45..46 ")" [] [] + 42: MD_TEXTUAL@46..47 + 0: MD_TEXTUAL_LITERAL@46..47 ")" [] [] + 43: MD_TEXTUAL@47..48 + 0: MD_TEXTUAL_LITERAL@47..48 ")" [] [] + 44: MD_TEXTUAL@48..49 + 0: MD_TEXTUAL_LITERAL@48..49 ")" [] [] + 45: MD_TEXTUAL@49..50 + 0: MD_TEXTUAL_LITERAL@49..50 ")" [] [] + 46: MD_TEXTUAL@50..51 + 0: MD_TEXTUAL_LITERAL@50..51 ")" [] [] + 47: MD_TEXTUAL@51..52 + 0: MD_TEXTUAL_LITERAL@51..52 ")" [] [] + 48: MD_TEXTUAL@52..53 + 0: MD_TEXTUAL_LITERAL@52..53 ")" [] [] + 49: MD_TEXTUAL@53..54 + 0: MD_TEXTUAL_LITERAL@53..54 ")" [] [] + 50: MD_TEXTUAL@54..55 + 0: MD_TEXTUAL_LITERAL@54..55 ")" [] [] + 51: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 
")" [] [] + 52: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 ")" [] [] + 53: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 ")" [] [] + 54: MD_TEXTUAL@58..59 + 0: MD_TEXTUAL_LITERAL@58..59 ")" [] [] + 55: MD_TEXTUAL@59..60 + 0: MD_TEXTUAL_LITERAL@59..60 ")" [] [] + 56: MD_TEXTUAL@60..61 + 0: MD_TEXTUAL_LITERAL@60..61 ")" [] [] + 57: MD_TEXTUAL@61..62 + 0: MD_TEXTUAL_LITERAL@61..62 ")" [] [] + 58: MD_TEXTUAL@62..63 + 0: MD_TEXTUAL_LITERAL@62..63 ")" [] [] + 59: MD_TEXTUAL@63..64 + 0: MD_TEXTUAL_LITERAL@63..64 ")" [] [] + 60: MD_TEXTUAL@64..65 + 0: MD_TEXTUAL_LITERAL@64..65 ")" [] [] + 61: MD_TEXTUAL@65..66 + 0: MD_TEXTUAL_LITERAL@65..66 ")" [] [] + 62: MD_TEXTUAL@66..67 + 0: MD_TEXTUAL_LITERAL@66..67 ")" [] [] + 63: MD_TEXTUAL@67..68 + 0: MD_TEXTUAL_LITERAL@67..68 ")" [] [] + 64: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 ")" [] [] + 65: MD_TEXTUAL@69..70 + 0: MD_TEXTUAL_LITERAL@69..70 ")" [] [] + 5: (empty) + 6: R_PAREN@70..71 ")" [] [] + 1: MD_TEXTUAL@71..72 + 0: MD_TEXTUAL_LITERAL@71..72 "\n" [] [] + 1: (empty) + 2: EOF@72..72 "" [] [] + +``` diff --git a/Cargo.lock b/Cargo.lock index bfc6c1c8c5..72c683148d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1295,6 +1295,7 @@ dependencies = [ "biome_markdown_syntax", "biome_parser", "biome_rowan", + "biome_string_case", "biome_test_utils", "biome_unicode_table", "htmlize", diff --git a/crates/biome_markdown_parser/Cargo.toml b/crates/biome_markdown_parser/Cargo.toml index c3087a08e7..3f100686af 100644 --- a/crates/biome_markdown_parser/Cargo.toml +++ b/crates/biome_markdown_parser/Cargo.toml @@ -22,6 +22,7 @@ biome_markdown_factory = { workspace = true } biome_markdown_syntax = { workspace = true } biome_parser = { workspace = true } biome_rowan = { workspace = true } +biome_string_case = { workspace = true } biome_unicode_table = { workspace = true } # Optional dependency for test_utils feature (HTML rendering for spec tests) htmlize = { version = "1.0.6", features = ["unescape"], optional = true } diff 
--git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs index 7edfb0b9db..e67f38759a 100644 --- a/crates/biome_markdown_parser/src/link_reference.rs +++ b/crates/biome_markdown_parser/src/link_reference.rs @@ -1,5 +1,7 @@ use std::collections::HashSet; +use biome_string_case::StrOnlyExtension; + use biome_markdown_syntax::{MdLinkLabel, MdLinkReferenceDefinition}; use biome_rowan::{AstNode, Direction}; @@ -8,19 +10,20 @@ use crate::MarkdownParseOptions; use crate::parser::MarkdownParser; use crate::syntax::parse_document; +/// Normalize a reference label per CommonMark spec. +/// +/// Per CommonMark, label normalization involves: +/// 1. Collapsing consecutive whitespace into a single space +/// 2. Case-folding (case-insensitive matching) +/// +/// IMPORTANT: Backslash escapes are NOT stripped during normalization. +/// This means `[foo\!]` does NOT match `[foo!]` - the backslash is preserved. +/// This matches cmark's reference implementation behavior. 
pub(crate) fn normalize_reference_label(text: &str) -> String { let mut out = String::new(); - let mut chars = text.chars().peekable(); let mut saw_whitespace = false; - while let Some(c) = chars.next() { - if c == '\\' { - if let Some(next) = chars.next() { - push_normalized_char(&mut out, next, &mut saw_whitespace); - } - continue; - } - + for c in text.chars() { if c.is_whitespace() { saw_whitespace = true; continue; @@ -29,7 +32,7 @@ pub(crate) fn normalize_reference_label(text: &str) -> String { push_normalized_char(&mut out, c, &mut saw_whitespace); } - out + out.as_str().to_lowercase_cow().to_uppercase() } fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { @@ -37,9 +40,7 @@ fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { out.push(' '); } *saw_whitespace = false; - for lower in c.to_lowercase() { - out.push(lower); - } + out.push(c); } pub(crate) fn collect_link_reference_definitions( diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs index 1c9bdac152..9ceac84a0f 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -60,6 +60,9 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; +/// Maximum paren nesting allowed in link destinations per CommonMark. +pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32; + /// CommonMark requires 4 or more spaces for indented code blocks. const INDENT_CODE_BLOCK_SPACES: usize = 4; @@ -71,6 +74,98 @@ pub(crate) fn parse_document(p: &mut MarkdownParser) { m.complete(p, MD_DOCUMENT); } +/// Result of updating parenthesis depth when scanning link destinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ParenDepthResult { + /// Depth updated successfully, contains new depth value + Ok(i32), + /// Depth would exceed the maximum (too many nested opening parens). 
+ /// Per cmark, this truncates the destination at this point. + DepthExceeded, + /// Unmatched closing paren (would go below 0). + /// This typically means the `)` belongs to the enclosing construct. + UnmatchedClose, +} + +pub(crate) fn try_update_paren_depth(text: &str, depth: i32, max: i32) -> ParenDepthResult { + let mut depth = depth; + let mut chars = text.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' && matches!(chars.peek(), Some('(' | ')')) { + chars.next(); + continue; + } + + if c == '(' { + if depth == max { + return ParenDepthResult::DepthExceeded; + } + depth += 1; + } else if c == ')' { + if depth == 0 { + return ParenDepthResult::UnmatchedClose; + } + depth -= 1; + } + } + + ParenDepthResult::Ok(depth) +} + +pub(crate) enum LinkDestinationKind { + Enclosed, + Raw, +} + +pub(crate) fn validate_link_destination_text( + text: &str, + kind: LinkDestinationKind, + pending_escape: &mut bool, +) -> bool { + for c in text.chars() { + if *pending_escape { + if c.is_ascii_punctuation() { + *pending_escape = false; + continue; + } + *pending_escape = false; + } + + if c == '\\' { + *pending_escape = true; + continue; + } + + if c.is_ascii_control() { + return false; + } + + if matches!(kind, LinkDestinationKind::Enclosed) && c == '<' { + return false; + } + } + + true +} + +pub(crate) fn ends_with_unescaped_close(text: &str, close_char: char) -> bool { + if !text.ends_with(close_char) { + return false; + } + + let mut backslashes = 0; + for c in text.chars().rev().skip(1) { + if c == '\\' { + backslashes += 1; + } else { + break; + } + } + + backslashes % 2 == 0 +} + pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax { let mut list = DocumentBlockList; Present(list.parse_list(p)) diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index 275a09ddb6..f336b37b33 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ 
b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -611,15 +611,46 @@ pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax { parse_emphasis_from_context(p, false) } -fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { +fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool { let m = p.start(); let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + let mut has_nested_link = false; loop { - if p.at(stop) || p.at_inline_end() { + // Per CommonMark, link text can span lines, but blank lines end the link. + // Check for blank line (NEWLINE followed by NEWLINE or EOF after optional whitespace) + if p.at(NEWLINE) { + if p.at_blank_line() { + break; // Blank line ends link text + } + // Single newline inside link text - consume and continue + let _ = super::parse_textual(p); + continue; + } + + if p.at(T![EOF]) { break; } + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + if p.at(L_BRACK) { + if !has_nested_link && nested_link_starts_here(p) { + has_nested_link = true; + } + bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + if parse_any_inline_no_links(p).is_absent() { break; } @@ -627,13 +658,53 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS m.complete(p, MD_INLINE_ITEM_LIST); p.set_emphasis_context(prev_context); + has_nested_link +} + +fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + if !p.at(L_BRACK) { + return false; + } + + p.bump(L_BRACK); + let mut depth = 0usize; + + loop { + if p.at(EOF) || p.at_inline_end() { + return false; + } + + if p.at(L_BRACK) { + depth += 1; + p.bump(L_BRACK); + continue; + } + + if p.at(R_BRACK) { + if depth > 0 { + depth -= 1; + p.bump(R_BRACK); + continue; + } + p.bump(R_BRACK); + 
return p.at(L_PAREN) || p.at(L_BRACK); + } + + p.bump(p.cur()); + } + }) } fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax { - if (p.at(BANG) && p.nth_at(1, L_BRACK)) || p.at(L_BRACK) { + if p.at(L_BRACK) { return super::parse_textual(p); } + if p.at(BANG) && p.nth_at(1, L_BRACK) { + return parse_inline_image(p); + } + parse_any_inline(p) } @@ -826,10 +897,15 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - parse_inline_item_list_until_no_links(p, R_BRACK); + let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); // ] - if missing at inline end, emit diagnostic; otherwise rewind if !p.eat(R_BRACK) { + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } if p.at_inline_end() { // Unclosed link/image at end of inline content - emit diagnostic // Expand range to include the text content, not just the opening bracket @@ -843,19 +919,50 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn p.rewind(checkpoint); return Absent; } + let text_end_offset = p.cur_range().start(); + + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } // Now decide based on what follows ] - if p.at(L_PAREN) { + let link_validation = if p.at(L_PAREN) { + inline_link_is_valid(p) + } else { + InlineLinkValidation::Invalid + }; + + if matches!( + link_validation, + InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded + ) { // Inline link/image: [text](url) or ![alt](url) // Bump past ( and lex the following tokens in LinkDefinition context // so whitespace separates destination and title. 
p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition); let destination = p.start(); - parse_inline_link_destination_tokens(p); + let destination_result = parse_inline_link_destination_tokens(p); + + // When depth exceeded, destination is truncated but link is still valid. + // Complete the destination and link immediately without looking for closing paren. + if destination_result == DestinationScanResult::DepthExceeded { + destination.complete(p, MD_INLINE_ITEM_LIST); + return Present(m.complete(p, kind.inline_kind())); + } + let has_title = inline_title_starts_after_whitespace_tokens(p); - while is_whitespace_token(p) { - bump_textual_link_def(p); + while is_title_separator_token(p) { + bump_link_def_separator(p); + } + if destination_result == DestinationScanResult::Invalid { + destination.abandon(p); + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } destination.complete(p, MD_INLINE_ITEM_LIST); @@ -867,8 +974,20 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn title_m.complete(p, MD_LINK_TITLE); } + // Skip trailing whitespace/newlines before closing paren without creating nodes + // (creating nodes would violate the MD_INLINE_LINK grammar which expects exactly 7 children) + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if !p.eat(R_PAREN) { - kind.report_unclosed_destination(p, opening_range); + if p.at_inline_end() { + kind.report_unclosed_destination(p, opening_range); + } + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } Present(m.complete(p, kind.inline_kind())) @@ -888,7 +1007,7 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) @@ -901,14 +1020,13 @@ fn 
parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) } } struct ReferenceLinkLookahead { - end_offset: TextSize, label_raw: String, is_shortcut: bool, } @@ -947,7 +1065,13 @@ fn lookahead_reference_common( p.bump(L_BRACK); let link_text = collect_bracket_text(p)?; - let end_offset = p.cur_range().end(); + + // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized_link = normalize_reference_label(&link_text); + if normalized_link.is_empty() { + return None; + } + p.bump(R_BRACK); if p.at(L_PAREN) { @@ -961,12 +1085,15 @@ fn lookahead_reference_common( let label = if label_text.is_empty() { link_text.clone() } else { + // Explicit label must also normalize to non-empty + let normalized_label = normalize_reference_label(&label_text); + if normalized_label.is_empty() { + return None; + } label_text }; - let end_offset = p.cur_range().end(); p.bump(R_BRACK); return Some(ReferenceLinkLookahead { - end_offset, label_raw: label, is_shortcut: false, }); @@ -974,7 +1101,6 @@ fn lookahead_reference_common( } Some(ReferenceLinkLookahead { - end_offset, label_raw: link_text, is_shortcut: true, }) @@ -1025,48 +1151,307 @@ fn is_whitespace_token(p: &MarkdownParser) -> bool { fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - while is_whitespace_token(p) { - bump_textual_link_def(p); + let mut saw_whitespace = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + saw_whitespace = true; + } + saw_whitespace && get_title_close_char(p).is_some() + }) +} + +/// Result of validating an inline link. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InlineLinkValidation { + /// Link is valid with complete destination + Valid, + /// Link is invalid + Invalid, + /// Link is valid but destination was truncated due to paren depth limit. + /// The link should be closed immediately without looking for `)`. + DepthExceeded, +} + +fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation { + p.lookahead(|p| { + if !p.at(L_PAREN) { + return InlineLinkValidation::Invalid; + } + + p.bump(L_PAREN); + p.re_lex_link_definition(); + + let destination_result = scan_inline_link_destination_tokens(p); + + // If depth exceeded, link is valid but truncated - no need to check for closing paren + if destination_result == DestinationScanResult::DepthExceeded { + return InlineLinkValidation::DepthExceeded; + } + + if destination_result == DestinationScanResult::Invalid { + return InlineLinkValidation::Invalid; + } + + let mut saw_separator = false; + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + saw_separator = true; + } + let has_title = saw_separator && get_title_close_char(p).is_some(); + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if has_title { + scan_title_content(p, get_title_close_char(p)); + } + + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if p.at(R_PAREN) { + InlineLinkValidation::Valid + } else { + InlineLinkValidation::Invalid } - get_title_close_char(p).is_some() }) } -fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) { +/// Result of scanning a link destination. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationScanResult { + /// Destination is valid and complete + Valid, + /// Destination is invalid (contains invalid characters, etc.) + Invalid, + /// Destination was truncated because paren depth exceeded the limit. + /// In this case, the link is considered valid but closed at the truncation point. 
+ DepthExceeded, +} + +fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; + // Skip leading whitespace to match parse_inline_link_destination_tokens behavior + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if p.at(L_ANGLE) { + p.bump_link_definition(); + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } + if p.at(R_ANGLE) { + if pending_escape { + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + continue; + } + p.bump_link_definition(); + return DestinationScanResult::Valid; + } + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + } + } + + let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while !p.at(EOF) && !p.at(NEWLINE) { + if is_whitespace_token(p) { + break; + } + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + p.bump_link_definition(); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + // Per CommonMark/cmark, the link is still valid but closed here. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
+ break; + } + } + } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid +} + +fn scan_title_content(p: &mut MarkdownParser, close_char: Option) { + let Some(close_char) = close_char else { + return; + }; + + let text = p.cur_text(); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); + + p.bump_link_definition(); + if is_complete { + return; + } + + loop { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { + return; + } + + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + skip_link_def_separator_tokens(p); + continue; + } + + let text = p.cur_text(); + if super::ends_with_unescaped_close(text, close_char) { + p.bump_link_definition(); + return; + } + + p.bump_link_definition(); + } +} + +fn skip_link_def_separator_tokens(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + p.bump(NEWLINE); + } else { + p.bump_link_definition(); + } +} + +fn is_title_separator_token(p: &MarkdownParser) -> bool { + is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) +} + +fn bump_link_def_separator(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + let item = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + item.complete(p, MD_TEXTUAL); + } else { + bump_textual_link_def(p); + } +} + +fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { p.re_lex_link_definition(); + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; if p.at(L_ANGLE) { bump_textual_link_def(p); - while !p.at(EOF) && !p.at(NEWLINE) { + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } if p.at(R_ANGLE) { + if pending_escape { + if 
!super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + bump_textual_link_def(p); + continue; + } bump_textual_link_def(p); - break; + return DestinationScanResult::Valid; } - if is_whitespace_token(p) { - break; + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; } bump_textual_link_def(p); } - return; } let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + } while !p.at(EOF) && !p.at(NEWLINE) { if is_whitespace_token(p) { break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth == 0 { + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
break; } - paren_depth -= 1; } - - bump_textual_link_def(p); } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid } fn get_title_close_char(p: &MarkdownParser) -> Option { @@ -1088,9 +1473,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { }; let text = p.cur_text(); - let is_complete = text.len() >= 2 - && ((close_char == ')' && text.ends_with(')')) - || (close_char != ')' && text.ends_with(close_char))); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); bump_textual_link_def(p); if is_complete { @@ -1098,12 +1481,19 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { } loop { - if p.at(EOF) || p.at(NEWLINE) { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { return; } + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + bump_link_def_separator(p); + continue; + } + let text = p.cur_text(); - if text.ends_with(close_char) { + if super::ends_with_unescaped_close(text, close_char) { bump_textual_link_def(p); return; } diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 86d9f57354..69d29d3bd4 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -73,7 +73,9 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { p.bump_any(); // Parse label: consume tokens until ] or invalid state + // Also collect the label text for normalization check. 
let mut label_len = 0; + let mut label_text = String::new(); loop { if p.at(EOF) { return false; @@ -89,6 +91,8 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } let text = p.cur_text(); + label_text.push_str(text); + // Check for escape sequences if text.starts_with('\\') && text.len() > 1 { label_len += 1; // Count escaped char @@ -107,6 +111,12 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { return false; } + // Label must also be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized = crate::link_reference::normalize_reference_label(&label_text); + if normalized.is_empty() { + return false; + } + // Expect ] if !p.at(R_BRACK) { return false; @@ -122,18 +132,31 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { // Re-lex the current token in LinkDefinition context so whitespace is tokenized. p.re_lex_link_definition(); - // Destination is required - if p.at(EOF) || p.at(NEWLINE) { - return false; + // Skip optional whitespace after colon (before destination or newline) + skip_whitespace_tokens(p); + + // Per CommonMark §4.7, destination can be on the next line if there's a + // single non-blank newline after the colon. 
+ if p.at(NEWLINE) { + if p.at_blank_line() { + return false; // Blank line = no destination + } + // Single newline - allow destination on next line + p.bump_link_definition(); + skip_whitespace_tokens(p); } - // Skip destination - if !skip_destination_tokens(p) { + // Destination is required (can be on same line or next line now) + if p.at(EOF) || p.at_blank_line() { return false; } - // Skip optional whitespace after destination (lookahead only) - skip_whitespace_tokens(p); + // Skip destination and track whether there was whitespace after it + let dest_result = skip_destination_tokens(p); + if dest_result == DestinationResult::Invalid { + return false; + } + let had_separator = dest_result == DestinationResult::ValidWithSeparator; // Check what follows destination if p.at(EOF) { @@ -141,19 +164,27 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } if p.at(NEWLINE) { - // Check for title on next line + // Check for title on next line (newline counts as separator) p.bump_link_definition(); skip_whitespace_tokens(p); if at_title_start(p) { - return skip_title_tokens(p); + // If title looks valid, it's included in the definition. + // If title has trailing content, it's invalid - but the definition + // is still valid (destination-only). The invalid title line will + // be parsed as a paragraph. Per CommonMark §4.7. 
+ let _ = skip_title_tokens(p); // Ignore result - definition is valid either way } - // No title on next line - destination-only is valid + // Destination-only is valid, or destination+valid_title is valid return true; } - // Check for optional title on same line + // Check for optional title on same line - MUST be preceded by whitespace if at_title_start(p) { + if !had_separator { + // Title without preceding whitespace is invalid (e.g., `(baz)`) + return false; + } return skip_title_tokens(p); } @@ -163,14 +194,22 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { /// Skip whitespace tokens (spaces/tabs) in lookahead. fn skip_whitespace_tokens(p: &mut MarkdownParser) { + skip_whitespace_tokens_tracked(p); +} + +/// Skip whitespace tokens (spaces/tabs) in lookahead and return whether any were skipped. +fn skip_whitespace_tokens_tracked(p: &mut MarkdownParser) -> bool { + let mut skipped = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { p.bump_link_definition(); + skipped = true; } else { break; } } + skipped } /// Check if at a title start token. @@ -179,20 +218,65 @@ fn at_title_start(p: &MarkdownParser) -> bool { text.starts_with('"') || text.starts_with('\'') || p.at(L_PAREN) } -/// Skip destination tokens in lookahead. Returns false if destination is invalid. -fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { +/// Result of skipping destination tokens. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationResult { + /// Invalid destination + Invalid, + /// Valid destination, no trailing whitespace found before title + ValidNoSeparator, + /// Valid destination with trailing whitespace (separator before potential title) + ValidWithSeparator, +} + +/// Skip destination tokens in lookahead. Returns the destination result. 
+fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { + // Skip optional leading whitespace before destination + while !p.at(EOF) && !p.at(NEWLINE) { + let text = p.cur_text(); + if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { + p.bump_link_definition(); + } else { + break; + } + } + if p.at(L_ANGLE) { // Angle-bracketed destination p.bump_link_definition(); + let mut pending_escape = false; loop { if p.at(EOF) || p.at(NEWLINE) { - return false; // Unterminated angle bracket + return DestinationResult::Invalid; // Unterminated angle bracket } if p.at(R_ANGLE) { - p.bump_link_definition(); - // Consume separator whitespace into destination - skip_whitespace_tokens(p); - return true; + if pending_escape { + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; + } + p.bump_link_definition(); + continue; + } else { + p.bump_link_definition(); + // Check for trailing whitespace (separator) + let had_sep = skip_whitespace_tokens_tracked(p); + return if had_sep { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator + }; + } + } + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } p.bump_link_definition(); } @@ -201,6 +285,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { let mut paren_depth = 0i32; let mut has_content = false; let mut saw_separator = false; + let mut pending_escape = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); @@ -214,24 +299,43 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { } if at_title_start(p) && has_content && saw_separator { + // Break here - we've found separator before title break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if 
paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends destination - } + if !crate::syntax::validate_link_destination_text( + text, + crate::syntax::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } - has_content = true; - saw_separator = false; - p.bump_link_definition(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + has_content = true; + saw_separator = false; + paren_depth = next_depth; + p.bump_link_definition(); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + // For link reference definitions, both cases end the destination + break; + } + } + } + if !has_content { + DestinationResult::Invalid + } else if saw_separator { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator } - has_content } } @@ -249,17 +353,10 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { // Check if first token is complete (e.g., `"title"`) let first_text = p.cur_text(); - if first_text.len() >= 2 { - let is_complete = if close_char == ')' { - first_text.ends_with(')') - } else { - first_text.ends_with(close_char) - }; - if is_complete { - p.bump_link_definition(); - skip_whitespace_tokens(p); - return p.at(EOF) || p.at(NEWLINE); - } + if first_text.len() >= 2 && crate::syntax::ends_with_unescaped_close(first_text, close_char) { + p.bump_link_definition(); + skip_whitespace_tokens(p); + return p.at(EOF) || p.at(NEWLINE); } p.bump_link_definition(); @@ -271,11 +368,7 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { } // Check for closing delimiter - let is_close = if close_char == ')' { - p.at(R_PAREN) - } else { - p.cur_text().ends_with(close_char) - }; + let is_close = crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char); if is_close { 
p.bump_link_definition(); @@ -326,12 +419,35 @@ pub(crate) fn parse_link_block(p: &mut MarkdownParser) -> ParsedSyntax { parse_link_destination(p); // Optional title - can be on same line or next line per CommonMark §4.7 + // First, check for title on same line (at_link_title skips whitespace in lookahead) if at_link_title(p) { parse_link_title(p); - } else if p.at(NEWLINE) && title_on_next_line(p) { - // Title is on the next line per CommonMark §4.7 - // We parse the newline and whitespace as part of the title - parse_link_title_after_newline(p); + } else { + // Check for title on next line - need to skip trailing whitespace first + // Also validate that the title is complete and has no trailing content + let has_valid_title_after_newline = p.lookahead(|p| { + while is_whitespace_token(p) { + p.bump_link_definition(); + } + if p.at(NEWLINE) && !p.at_blank_line() { + // Check if there's a title starter on next line + if !title_on_next_line(p) { + return false; + } + // Also validate that the title is complete (no trailing content) + p.bump_link_definition(); // consume newline + skip_whitespace_tokens(p); // skip leading whitespace on title line + skip_title_tokens(p) // returns true only if title ends at EOL/EOF + } else { + false + } + }); + + if has_valid_title_after_newline { + // Title is on the next line per CommonMark §4.7 + // Include trailing whitespace + newline + leading whitespace as part of title + parse_link_title_with_trailing_ws(p); + } } Present(m.complete(p, MD_LINK_REFERENCE_DEFINITION)) @@ -375,6 +491,14 @@ fn parse_link_destination(p: &mut MarkdownParser) { bump_textual_link_def(p); } + // Per CommonMark §4.7, destination can be on the next line + if p.at(NEWLINE) && !p.at_blank_line() { + bump_textual_link_def(p); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + } + if p.at(L_ANGLE) { // Angle-bracketed: consume < ... 
> bump_textual_link_def(p); @@ -393,17 +517,21 @@ fn parse_link_destination(p: &mut MarkdownParser) { break; // Bare destination stops at first whitespace } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends bare destination + let text = p.cur_text(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + break; } } - - bump_textual_link_def(p); } } @@ -457,16 +585,24 @@ fn title_on_next_line(p: &MarkdownParser) -> bool { // Check for title starter trimmed.starts_with('"') || trimmed.starts_with('\'') || trimmed.starts_with('(') } - -/// Parse a link title that appears on the next line after a newline. +/// Parse a link title that appears on next line, including trailing whitespace before newline. /// -/// Per CommonMark §4.7, titles can appear on the line following the destination. -fn parse_link_title_after_newline(p: &mut MarkdownParser) { +/// This is used when there's trailing whitespace after the destination but before +/// the newline that precedes the title. The trailing whitespace is included in the +/// title node to maintain the grammar structure. 
+fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) { let m = p.start(); let list = p.start(); - // Include the newline as textual content - bump_textual_link_def(p); + // Include trailing whitespace after destination + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + + // Include the newline + if p.at(NEWLINE) { + bump_textual_link_def(p); + } // Include leading whitespace on title line while is_whitespace_token(p) { @@ -517,9 +653,10 @@ fn get_title_close_char(p: &MarkdownParser) -> Option { } } -/// Parse title content until closing delimiter. +/// Parse title content until closing delimiter, including trailing whitespace. /// /// Inside title quotes, we use Regular context so whitespace doesn't split tokens. +/// Trailing whitespace after the title is also consumed to prevent spurious paragraphs. fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { let Some(close_char) = close_char else { return; @@ -535,6 +672,11 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { bump_textual_link_def(p); if is_complete { + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } return; } @@ -545,15 +687,20 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { break; } - // Check for closing delimiter + // Check for closing delimiter (must be unescaped) let is_close = if close_char == ')' { p.at(R_PAREN) } else { - p.cur_text().ends_with(close_char) + crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char) }; if is_close { // Use Regular context for title content bump_textual(p); + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } break; } diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index f3dee25a4a..0512db55fe 100644 --- 
a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -447,10 +447,8 @@ fn render_paragraph( } // Trim both ends - leading whitespace can appear from parser including // the space after list markers in the paragraph content - let content = strip_paragraph_indent( - content - .trim_matches(|c| c == ' ' || c == '\n' || c == '\r') - ); + let content = + strip_paragraph_indent(content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r')); if in_tight_list { // In tight lists, paragraphs are rendered without

tags @@ -1160,7 +1158,11 @@ where { if let Some(node) = label_node { let text = label_text(&node); - (text.clone(), Some(text)) + if text.trim().is_empty() { + (fallback, None) + } else { + (text.clone(), Some(text)) + } } else { (fallback, None) } @@ -1602,4 +1604,64 @@ mod tests { // U+0000 should become replacement character assert_eq!(decode_entity("�"), Some("\u{FFFD}".to_string())); } + + #[test] + fn test_percent_encode_uri() { + let input = format!("https://a{}b.c/%20/%", '\u{1F44D}'); + let encoded = percent_encode_uri(&input); + assert_eq!(encoded, "https://a%F0%9F%91%8Db.c/%20/%25"); + } + + #[test] + fn test_process_link_destination_decodes_entities() { + let encoded = process_link_destination("https://example.com/<"); + assert_eq!(encoded, "https://example.com/%3C"); + } + + #[test] + fn test_paren_depth_limit_in_destination() { + let dest = format!("x{}y{}", "(".repeat(32), ")".repeat(32)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected = format!("

a

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_paren_depth_limit_exceeded_in_destination() { + let dest = format!("x{}y{}", "(".repeat(33), ")".repeat(33)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected_dest = format!("x{}", "(".repeat(32)); + let trailing = ")".repeat(34); + let expected = format!("

a(y{trailing}

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_title_with_escaped_closing_quote() { + let parsed = parse_markdown("[a](/url \"title with \\\" quote\")\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

a

\n" + ); + } } diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md new file mode 100644 index 0000000000..502a100473 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md @@ -0,0 +1,17 @@ +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap new file mode 100644 index 0000000000..6a103654f6 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap @@ -0,0 +1,405 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@41..42 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..46 "link" [] [], + }, + ], + r_brack_token: R_BRACK@46..47 "]" [] [], + l_paren_token: L_PAREN@47..48 "(" [] [], 
+ destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..51 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@55..56 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@65..68 " )" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@69..70 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@91..92 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@92..93 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@93..97 "link" [] [], + }, + ], + r_brack_token: R_BRACK@97..98 "]" [] [], + l_paren_token: L_PAREN@98..99 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@99..103 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@103..104 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@110..111 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@121..122 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@122..123 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@123..124 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@165..166 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@166..167 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@167..171 "link" [] [], + }, + ], + r_brack_token: R_BRACK@171..172 "]" [] [], + l_paren_token: L_PAREN@172..173 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@177..178 "\n" [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@185..186 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@186..187 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@187..188 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@226..227 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@227..228 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@228..232 "link" [] [], + }, + ], + r_brack_token: R_BRACK@232..233 "]" [] [], + l_paren_token: L_PAREN@233..234 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@234..237 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@237..241 "/url" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@241..242 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@242..243 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@243..244 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: 
MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@283..284 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@284..285 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@285..289 "link" [] [], + }, + ], + r_brack_token: R_BRACK@289..290 "]" [] [], + l_paren_token: L_PAREN@290..291 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@291..295 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@295..298 " " [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@298..299 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@299..300 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@300..300 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..300 + 0: (empty) + 1: MD_BLOCK_LIST@0..300 + 0: MD_PARAGRAPH@0..69 + 0: MD_INLINE_ITEM_LIST@0..69 + 0: MD_TEXTUAL@0..40 + 0: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [] + 1: MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 "\n" [] [] + 2: MD_INLINE_LINK@41..68 + 0: L_BRACK@41..42 "[" [] [] + 1: MD_INLINE_ITEM_LIST@42..46 + 0: MD_TEXTUAL@42..46 + 0: MD_TEXTUAL_LITERAL@42..46 "link" [] [] + 2: R_BRACK@46..47 "]" [] [] + 3: L_PAREN@47..48 "(" [] [] + 4: MD_INLINE_ITEM_LIST@48..58 + 0: MD_TEXTUAL@48..51 + 0: MD_TEXTUAL_LITERAL@48..51 " " [] [] + 1: MD_TEXTUAL@51..55 + 0: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [] + 2: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 "\n" [] [] + 3: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 " " [] [] + 4: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 " " [] [] + 5: MD_LINK_TITLE@58..65 + 0: MD_INLINE_ITEM_LIST@58..65 + 0: MD_TEXTUAL@58..65 + 0: MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [] + 6: R_PAREN@65..68 " )" [] [] + 3: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 "\n" [] [] + 1: (empty) + 1: MD_NEWLINE@69..70 + 0: 
NEWLINE@69..70 "\n" [] [] + 2: MD_PARAGRAPH@70..123 + 0: MD_INLINE_ITEM_LIST@70..123 + 0: MD_TEXTUAL@70..91 + 0: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [] + 1: MD_TEXTUAL@91..92 + 0: MD_TEXTUAL_LITERAL@91..92 "\n" [] [] + 2: MD_INLINE_LINK@92..122 + 0: L_BRACK@92..93 "[" [] [] + 1: MD_INLINE_ITEM_LIST@93..97 + 0: MD_TEXTUAL@93..97 + 0: MD_TEXTUAL_LITERAL@93..97 "link" [] [] + 2: R_BRACK@97..98 "]" [] [] + 3: L_PAREN@98..99 "(" [] [] + 4: MD_INLINE_ITEM_LIST@99..104 + 0: MD_TEXTUAL@99..103 + 0: MD_TEXTUAL_LITERAL@99..103 "/url" [] [] + 1: MD_TEXTUAL@103..104 + 0: MD_TEXTUAL_LITERAL@103..104 " " [] [] + 5: MD_LINK_TITLE@104..121 + 0: MD_INLINE_ITEM_LIST@104..121 + 0: MD_TEXTUAL@104..110 + 0: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [] + 1: MD_TEXTUAL@110..111 + 0: MD_TEXTUAL_LITERAL@110..111 "\n" [] [] + 2: MD_TEXTUAL@111..121 + 0: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [] + 6: R_PAREN@121..122 ")" [] [] + 3: MD_TEXTUAL@122..123 + 0: MD_TEXTUAL_LITERAL@122..123 "\n" [] [] + 1: (empty) + 3: MD_NEWLINE@123..124 + 0: NEWLINE@123..124 "\n" [] [] + 4: MD_PARAGRAPH@124..187 + 0: MD_INLINE_ITEM_LIST@124..187 + 0: MD_TEXTUAL@124..165 + 0: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [] + 1: MD_TEXTUAL@165..166 + 0: MD_TEXTUAL_LITERAL@165..166 "\n" [] [] + 2: MD_INLINE_LINK@166..186 + 0: L_BRACK@166..167 "[" [] [] + 1: MD_INLINE_ITEM_LIST@167..171 + 0: MD_TEXTUAL@167..171 + 0: MD_TEXTUAL_LITERAL@167..171 "link" [] [] + 2: R_BRACK@171..172 "]" [] [] + 3: L_PAREN@172..173 "(" [] [] + 4: MD_INLINE_ITEM_LIST@173..178 + 0: MD_TEXTUAL@173..177 + 0: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [] + 1: MD_TEXTUAL@177..178 + 0: MD_TEXTUAL_LITERAL@177..178 "\n" [] [] + 5: MD_LINK_TITLE@178..185 + 0: MD_INLINE_ITEM_LIST@178..185 + 0: MD_TEXTUAL@178..185 + 0: MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [] + 6: R_PAREN@185..186 ")" [] [] + 3: MD_TEXTUAL@186..187 + 0: MD_TEXTUAL_LITERAL@186..187 "\n" [] [] + 1: (empty) + 5: MD_NEWLINE@187..188 + 0: 
NEWLINE@187..188 "\n" [] [] + 6: MD_PARAGRAPH@188..243 + 0: MD_INLINE_ITEM_LIST@188..243 + 0: MD_TEXTUAL@188..226 + 0: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [] + 1: MD_TEXTUAL@226..227 + 0: MD_TEXTUAL_LITERAL@226..227 "\n" [] [] + 2: MD_INLINE_LINK@227..242 + 0: L_BRACK@227..228 "[" [] [] + 1: MD_INLINE_ITEM_LIST@228..232 + 0: MD_TEXTUAL@228..232 + 0: MD_TEXTUAL_LITERAL@228..232 "link" [] [] + 2: R_BRACK@232..233 "]" [] [] + 3: L_PAREN@233..234 "(" [] [] + 4: MD_INLINE_ITEM_LIST@234..241 + 0: MD_TEXTUAL@234..237 + 0: MD_TEXTUAL_LITERAL@234..237 " " [] [] + 1: MD_TEXTUAL@237..241 + 0: MD_TEXTUAL_LITERAL@237..241 "/url" [] [] + 5: (empty) + 6: R_PAREN@241..242 ")" [] [] + 3: MD_TEXTUAL@242..243 + 0: MD_TEXTUAL_LITERAL@242..243 "\n" [] [] + 1: (empty) + 7: MD_NEWLINE@243..244 + 0: NEWLINE@243..244 "\n" [] [] + 8: MD_PARAGRAPH@244..300 + 0: MD_INLINE_ITEM_LIST@244..300 + 0: MD_TEXTUAL@244..283 + 0: MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [] + 1: MD_TEXTUAL@283..284 + 0: MD_TEXTUAL_LITERAL@283..284 "\n" [] [] + 2: MD_INLINE_LINK@284..299 + 0: L_BRACK@284..285 "[" [] [] + 1: MD_INLINE_ITEM_LIST@285..289 + 0: MD_TEXTUAL@285..289 + 0: MD_TEXTUAL_LITERAL@285..289 "link" [] [] + 2: R_BRACK@289..290 "]" [] [] + 3: L_PAREN@290..291 "(" [] [] + 4: MD_INLINE_ITEM_LIST@291..298 + 0: MD_TEXTUAL@291..295 + 0: MD_TEXTUAL_LITERAL@291..295 "/url" [] [] + 1: MD_TEXTUAL@295..298 + 0: MD_TEXTUAL_LITERAL@295..298 " " [] [] + 5: (empty) + 6: R_PAREN@298..299 ")" [] [] + 3: MD_TEXTUAL@299..300 + 0: MD_TEXTUAL_LITERAL@299..300 "\n" [] [] + 1: (empty) + 2: EOF@300..300 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap index 2737efcebe..c81a255370 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap +++ 
b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap @@ -601,48 +601,44 @@ MdDocument { MdNewline { value_token: NEWLINE@473..474 "\n" [] [], }, - MdLinkReferenceDefinition { - l_brack_token: L_BRACK@474..475 "[" [] [], - label: MdLinkLabel { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], - }, - ], - }, - r_brack_token: R_BRACK@489..490 "]" [] [], - colon_token: COLON@490..491 ":" [] [], - destination: MdLinkDestination { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], - }, - ], - }, - title: missing (optional), - }, MdParagraph { list: MdInlineItemList [ MdTextual { - value_token: MD_TEXTUAL_LITERAL@498..499 " " [] [], + value_token: MD_TEXTUAL_LITERAL@474..475 "[" [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [], + value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@489..490 "]" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@490..491 ":" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], + }, + MdInlineHtml { + value: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], + }, + ], + }, + MdTextual { 
+ value_token: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@506..507 "\n" [] [], @@ -1021,36 +1017,33 @@ MdDocument { 1: (empty) 39: MD_NEWLINE@473..474 0: NEWLINE@473..474 "\n" [] [] - 40: MD_LINK_REFERENCE_DEFINITION@474..498 - 0: L_BRACK@474..475 "[" [] [] - 1: MD_LINK_LABEL@475..489 - 0: MD_INLINE_ITEM_LIST@475..489 - 0: MD_TEXTUAL@475..480 - 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] - 1: MD_TEXTUAL@480..481 - 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] - 2: MD_TEXTUAL@481..489 - 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] - 2: R_BRACK@489..490 "]" [] [] - 3: COLON@490..491 ":" [] [] - 4: MD_LINK_DESTINATION@491..498 - 0: MD_INLINE_ITEM_LIST@491..498 - 0: MD_TEXTUAL@491..492 - 0: MD_TEXTUAL_LITERAL@491..492 " " [] [] - 1: MD_TEXTUAL@492..493 - 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] - 2: MD_TEXTUAL@493..497 - 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] - 3: MD_TEXTUAL@497..498 - 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] - 5: (empty) - 41: MD_PARAGRAPH@498..507 - 0: MD_INLINE_ITEM_LIST@498..507 - 0: MD_TEXTUAL@498..499 - 0: MD_TEXTUAL_LITERAL@498..499 " " [] [] - 1: MD_TEXTUAL@499..506 - 0: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [] - 2: MD_TEXTUAL@506..507 + 40: MD_PARAGRAPH@474..507 + 0: MD_INLINE_ITEM_LIST@474..507 + 0: MD_TEXTUAL@474..475 + 0: MD_TEXTUAL_LITERAL@474..475 "[" [] [] + 1: MD_TEXTUAL@475..480 + 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] + 2: MD_TEXTUAL@480..481 + 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] + 3: MD_TEXTUAL@481..489 + 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] + 4: MD_TEXTUAL@489..490 + 0: MD_TEXTUAL_LITERAL@489..490 "]" [] [] + 5: MD_TEXTUAL@490..491 + 0: MD_TEXTUAL_LITERAL@490..491 ":" [] [] + 6: MD_TEXTUAL@491..492 + 0: MD_TEXTUAL_LITERAL@491..492 " " [] [] + 7: MD_INLINE_HTML@492..498 + 0: MD_INLINE_ITEM_LIST@492..498 + 0: MD_TEXTUAL@492..493 + 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] + 1: MD_TEXTUAL@493..497 + 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] + 2: 
MD_TEXTUAL@497..498 + 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] + 8: MD_TEXTUAL@498..506 + 0: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [] + 9: MD_TEXTUAL@506..507 0: MD_TEXTUAL_LITERAL@506..507 "\n" [] [] 1: (empty) 2: EOF@507..507 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap index 2d3cb5a3c2..3be4200606 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap @@ -1,6 +1,5 @@ --- source: crates/biome_markdown_parser/tests/spec_test.rs -assertion_line: 131 expression: snapshot --- ## Input @@ -184,11 +183,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@353..354 "\n" [] [], }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@354..355 "[" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@355..356 "]" [] [], + MdReferenceLink { + l_brack_token: L_BRACK@354..355 "[" [] [], + text: MdInlineItemList [], + r_brack_token: R_BRACK@355..356 "]" [] [], + label: missing (optional), }, MdTextual { value_token: MD_TEXTUAL_LITERAL@356..357 ":" [] [], @@ -315,15 +314,16 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@330..353 "Empty label is invalid:" [] [] 1: MD_TEXTUAL@353..354 0: MD_TEXTUAL_LITERAL@353..354 "\n" [] [] - 2: MD_TEXTUAL@354..355 - 0: MD_TEXTUAL_LITERAL@354..355 "[" [] [] - 3: MD_TEXTUAL@355..356 - 0: MD_TEXTUAL_LITERAL@355..356 "]" [] [] - 4: MD_TEXTUAL@356..357 + 2: MD_REFERENCE_LINK@354..356 + 0: L_BRACK@354..355 "[" [] [] + 1: MD_INLINE_ITEM_LIST@355..355 + 2: R_BRACK@355..356 "]" [] [] + 3: (empty) + 3: MD_TEXTUAL@356..357 0: MD_TEXTUAL_LITERAL@356..357 ":" [] [] - 5: MD_TEXTUAL@357..362 + 4: MD_TEXTUAL@357..362 0: MD_TEXTUAL_LITERAL@357..362 " /url" [] [] - 6: MD_TEXTUAL@362..363 + 5: MD_TEXTUAL@362..363 0: MD_TEXTUAL_LITERAL@362..363 "\n" [] [] 1: (empty) 2: EOF@363..363 "" 
[] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap similarity index 50% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap index 7d67fcd2a8..f0ecbb1ced 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap @@ -25,19 +25,16 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@1..4 "foo" [] [], }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], + }, ], - r_brack_token: missing (required), + r_brack_token: R_BRACK@8..9 "]" [] [], label: missing (optional), }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@8..9 "]" [] [], - }, MdTextual { value_token: MD_TEXTUAL_LITERAL@9..10 "\n" [] [], }, @@ -57,45 +54,20 @@ MdDocument { 1: MD_BLOCK_LIST@0..10 0: MD_PARAGRAPH@0..10 0: MD_INLINE_ITEM_LIST@0..10 - 0: MD_REFERENCE_LINK@0..4 + 0: MD_REFERENCE_LINK@0..9 0: L_BRACK@0..1 "[" [] [] - 1: MD_INLINE_ITEM_LIST@1..4 + 1: MD_INLINE_ITEM_LIST@1..8 0: MD_TEXTUAL@1..4 0: MD_TEXTUAL_LITERAL@1..4 "foo" [] [] - 2: (empty) + 1: MD_TEXTUAL@4..5 + 0: 
MD_TEXTUAL_LITERAL@4..5 "\n" [] [] + 2: MD_TEXTUAL@5..8 + 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] + 2: R_BRACK@8..9 "]" [] [] 3: (empty) - 1: MD_TEXTUAL@4..5 - 0: MD_TEXTUAL_LITERAL@4..5 "\n" [] [] - 2: MD_TEXTUAL@5..8 - 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] - 3: MD_TEXTUAL@8..9 - 0: MD_TEXTUAL_LITERAL@8..9 "]" [] [] - 4: MD_TEXTUAL@9..10 + 1: MD_TEXTUAL@9..10 0: MD_TEXTUAL_LITERAL@9..10 "\n" [] [] 1: (empty) 2: EOF@10..10 "" [] [] ``` - -## Diagnostics - -``` -multiline_label_reference.md:1:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i link started here - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md new file mode 100644 index 0000000000..3cbf1f91d3 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md @@ -0,0 +1 @@ +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap new file mode 100644 index 0000000000..236bd2046f --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap @@ -0,0 +1,399 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdInlineLink { + l_brack_token: L_BRACK@0..1 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@1..2 "a" [] [], + }, + ], + r_brack_token: R_BRACK@2..3 
"]" [] [], + l_paren_token: L_PAREN@3..4 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "x" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..6 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@6..7 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@8..9 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..12 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..13 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@13..14 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@14..15 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..17 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@17..18 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@18..19 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@19..20 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@20..21 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@21..22 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@22..23 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@24..25 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@26..27 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@27..28 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@28..29 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@29..30 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@30..31 "(" [] [], + }, + 
MdTextual { + value_token: MD_TEXTUAL_LITERAL@31..32 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@32..33 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@33..34 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@34..35 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@35..36 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@36..37 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@37..38 "y" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@38..39 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@39..40 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@41..42 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..43 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@43..44 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@44..45 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@45..46 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@46..47 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@47..48 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..49 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@49..50 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@50..51 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..52 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@52..53 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@53..54 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@54..55 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@55..56 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..59 ")" [] [], + }, + MdTextual { 
+ value_token: MD_TEXTUAL_LITERAL@59..60 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@60..61 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@61..62 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@62..63 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@63..64 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@64..65 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@65..66 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@66..67 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@67..68 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@69..70 ")" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@70..71 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@71..72 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@72..72 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..72 + 0: (empty) + 1: MD_BLOCK_LIST@0..72 + 0: MD_PARAGRAPH@0..72 + 0: MD_INLINE_ITEM_LIST@0..72 + 0: MD_INLINE_LINK@0..71 + 0: L_BRACK@0..1 "[" [] [] + 1: MD_INLINE_ITEM_LIST@1..2 + 0: MD_TEXTUAL@1..2 + 0: MD_TEXTUAL_LITERAL@1..2 "a" [] [] + 2: R_BRACK@2..3 "]" [] [] + 3: L_PAREN@3..4 "(" [] [] + 4: MD_INLINE_ITEM_LIST@4..70 + 0: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "x" [] [] + 1: MD_TEXTUAL@5..6 + 0: MD_TEXTUAL_LITERAL@5..6 "(" [] [] + 2: MD_TEXTUAL@6..7 + 0: MD_TEXTUAL_LITERAL@6..7 "(" [] [] + 3: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "(" [] [] + 4: MD_TEXTUAL@8..9 + 0: MD_TEXTUAL_LITERAL@8..9 "(" [] [] + 5: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "(" [] [] + 6: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "(" [] [] + 7: MD_TEXTUAL@11..12 + 0: MD_TEXTUAL_LITERAL@11..12 "(" [] [] + 8: MD_TEXTUAL@12..13 + 0: MD_TEXTUAL_LITERAL@12..13 "(" [] [] + 9: MD_TEXTUAL@13..14 + 0: MD_TEXTUAL_LITERAL@13..14 "(" [] [] + 10: 
MD_TEXTUAL@14..15 + 0: MD_TEXTUAL_LITERAL@14..15 "(" [] [] + 11: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "(" [] [] + 12: MD_TEXTUAL@16..17 + 0: MD_TEXTUAL_LITERAL@16..17 "(" [] [] + 13: MD_TEXTUAL@17..18 + 0: MD_TEXTUAL_LITERAL@17..18 "(" [] [] + 14: MD_TEXTUAL@18..19 + 0: MD_TEXTUAL_LITERAL@18..19 "(" [] [] + 15: MD_TEXTUAL@19..20 + 0: MD_TEXTUAL_LITERAL@19..20 "(" [] [] + 16: MD_TEXTUAL@20..21 + 0: MD_TEXTUAL_LITERAL@20..21 "(" [] [] + 17: MD_TEXTUAL@21..22 + 0: MD_TEXTUAL_LITERAL@21..22 "(" [] [] + 18: MD_TEXTUAL@22..23 + 0: MD_TEXTUAL_LITERAL@22..23 "(" [] [] + 19: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "(" [] [] + 20: MD_TEXTUAL@24..25 + 0: MD_TEXTUAL_LITERAL@24..25 "(" [] [] + 21: MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "(" [] [] + 22: MD_TEXTUAL@26..27 + 0: MD_TEXTUAL_LITERAL@26..27 "(" [] [] + 23: MD_TEXTUAL@27..28 + 0: MD_TEXTUAL_LITERAL@27..28 "(" [] [] + 24: MD_TEXTUAL@28..29 + 0: MD_TEXTUAL_LITERAL@28..29 "(" [] [] + 25: MD_TEXTUAL@29..30 + 0: MD_TEXTUAL_LITERAL@29..30 "(" [] [] + 26: MD_TEXTUAL@30..31 + 0: MD_TEXTUAL_LITERAL@30..31 "(" [] [] + 27: MD_TEXTUAL@31..32 + 0: MD_TEXTUAL_LITERAL@31..32 "(" [] [] + 28: MD_TEXTUAL@32..33 + 0: MD_TEXTUAL_LITERAL@32..33 "(" [] [] + 29: MD_TEXTUAL@33..34 + 0: MD_TEXTUAL_LITERAL@33..34 "(" [] [] + 30: MD_TEXTUAL@34..35 + 0: MD_TEXTUAL_LITERAL@34..35 "(" [] [] + 31: MD_TEXTUAL@35..36 + 0: MD_TEXTUAL_LITERAL@35..36 "(" [] [] + 32: MD_TEXTUAL@36..37 + 0: MD_TEXTUAL_LITERAL@36..37 "(" [] [] + 33: MD_TEXTUAL@37..38 + 0: MD_TEXTUAL_LITERAL@37..38 "y" [] [] + 34: MD_TEXTUAL@38..39 + 0: MD_TEXTUAL_LITERAL@38..39 ")" [] [] + 35: MD_TEXTUAL@39..40 + 0: MD_TEXTUAL_LITERAL@39..40 ")" [] [] + 36: MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 ")" [] [] + 37: MD_TEXTUAL@41..42 + 0: MD_TEXTUAL_LITERAL@41..42 ")" [] [] + 38: MD_TEXTUAL@42..43 + 0: MD_TEXTUAL_LITERAL@42..43 ")" [] [] + 39: MD_TEXTUAL@43..44 + 0: MD_TEXTUAL_LITERAL@43..44 ")" [] [] + 40: MD_TEXTUAL@44..45 + 0: MD_TEXTUAL_LITERAL@44..45 
")" [] [] + 41: MD_TEXTUAL@45..46 + 0: MD_TEXTUAL_LITERAL@45..46 ")" [] [] + 42: MD_TEXTUAL@46..47 + 0: MD_TEXTUAL_LITERAL@46..47 ")" [] [] + 43: MD_TEXTUAL@47..48 + 0: MD_TEXTUAL_LITERAL@47..48 ")" [] [] + 44: MD_TEXTUAL@48..49 + 0: MD_TEXTUAL_LITERAL@48..49 ")" [] [] + 45: MD_TEXTUAL@49..50 + 0: MD_TEXTUAL_LITERAL@49..50 ")" [] [] + 46: MD_TEXTUAL@50..51 + 0: MD_TEXTUAL_LITERAL@50..51 ")" [] [] + 47: MD_TEXTUAL@51..52 + 0: MD_TEXTUAL_LITERAL@51..52 ")" [] [] + 48: MD_TEXTUAL@52..53 + 0: MD_TEXTUAL_LITERAL@52..53 ")" [] [] + 49: MD_TEXTUAL@53..54 + 0: MD_TEXTUAL_LITERAL@53..54 ")" [] [] + 50: MD_TEXTUAL@54..55 + 0: MD_TEXTUAL_LITERAL@54..55 ")" [] [] + 51: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 ")" [] [] + 52: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 ")" [] [] + 53: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 ")" [] [] + 54: MD_TEXTUAL@58..59 + 0: MD_TEXTUAL_LITERAL@58..59 ")" [] [] + 55: MD_TEXTUAL@59..60 + 0: MD_TEXTUAL_LITERAL@59..60 ")" [] [] + 56: MD_TEXTUAL@60..61 + 0: MD_TEXTUAL_LITERAL@60..61 ")" [] [] + 57: MD_TEXTUAL@61..62 + 0: MD_TEXTUAL_LITERAL@61..62 ")" [] [] + 58: MD_TEXTUAL@62..63 + 0: MD_TEXTUAL_LITERAL@62..63 ")" [] [] + 59: MD_TEXTUAL@63..64 + 0: MD_TEXTUAL_LITERAL@63..64 ")" [] [] + 60: MD_TEXTUAL@64..65 + 0: MD_TEXTUAL_LITERAL@64..65 ")" [] [] + 61: MD_TEXTUAL@65..66 + 0: MD_TEXTUAL_LITERAL@65..66 ")" [] [] + 62: MD_TEXTUAL@66..67 + 0: MD_TEXTUAL_LITERAL@66..67 ")" [] [] + 63: MD_TEXTUAL@67..68 + 0: MD_TEXTUAL_LITERAL@67..68 ")" [] [] + 64: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 ")" [] [] + 65: MD_TEXTUAL@69..70 + 0: MD_TEXTUAL_LITERAL@69..70 ")" [] [] + 5: (empty) + 6: R_PAREN@70..71 ")" [] [] + 1: MD_TEXTUAL@71..72 + 0: MD_TEXTUAL_LITERAL@71..72 "\n" [] [] + 1: (empty) + 2: EOF@72..72 "" [] [] + +``` diff --git a/Cargo.lock b/Cargo.lock index bfc6c1c8c5..72c683148d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1295,6 +1295,7 @@ dependencies = [ "biome_markdown_syntax", "biome_parser", "biome_rowan", + 
"biome_string_case", "biome_test_utils", "biome_unicode_table", "htmlize", diff --git a/crates/biome_markdown_parser/Cargo.toml b/crates/biome_markdown_parser/Cargo.toml index c3087a08e7..3f100686af 100644 --- a/crates/biome_markdown_parser/Cargo.toml +++ b/crates/biome_markdown_parser/Cargo.toml @@ -22,6 +22,7 @@ biome_markdown_factory = { workspace = true } biome_markdown_syntax = { workspace = true } biome_parser = { workspace = true } biome_rowan = { workspace = true } +biome_string_case = { workspace = true } biome_unicode_table = { workspace = true } # Optional dependency for test_utils feature (HTML rendering for spec tests) htmlize = { version = "1.0.6", features = ["unescape"], optional = true } diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs index 7edfb0b9db..e67f38759a 100644 --- a/crates/biome_markdown_parser/src/link_reference.rs +++ b/crates/biome_markdown_parser/src/link_reference.rs @@ -1,5 +1,7 @@ use std::collections::HashSet; +use biome_string_case::StrOnlyExtension; + use biome_markdown_syntax::{MdLinkLabel, MdLinkReferenceDefinition}; use biome_rowan::{AstNode, Direction}; @@ -8,19 +10,20 @@ use crate::MarkdownParseOptions; use crate::parser::MarkdownParser; use crate::syntax::parse_document; +/// Normalize a reference label per CommonMark spec. +/// +/// Per CommonMark, label normalization involves: +/// 1. Collapsing consecutive whitespace into a single space +/// 2. Case-folding (case-insensitive matching) +/// +/// IMPORTANT: Backslash escapes are NOT stripped during normalization. +/// This means `[foo\!]` does NOT match `[foo!]` - the backslash is preserved. +/// This matches cmark's reference implementation behavior. 
pub(crate) fn normalize_reference_label(text: &str) -> String { let mut out = String::new(); - let mut chars = text.chars().peekable(); let mut saw_whitespace = false; - while let Some(c) = chars.next() { - if c == '\\' { - if let Some(next) = chars.next() { - push_normalized_char(&mut out, next, &mut saw_whitespace); - } - continue; - } - + for c in text.chars() { if c.is_whitespace() { saw_whitespace = true; continue; @@ -29,7 +32,7 @@ pub(crate) fn normalize_reference_label(text: &str) -> String { push_normalized_char(&mut out, c, &mut saw_whitespace); } - out + out.as_str().to_lowercase_cow().to_uppercase() } fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { @@ -37,9 +40,7 @@ fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { out.push(' '); } *saw_whitespace = false; - for lower in c.to_lowercase() { - out.push(lower); - } + out.push(c); } pub(crate) fn collect_link_reference_definitions( diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs index 1c9bdac152..e4fecdd82c 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -60,6 +60,9 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; +/// Maximum paren nesting allowed in link destinations per CommonMark. +pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32; + /// CommonMark requires 4 or more spaces for indented code blocks. const INDENT_CODE_BLOCK_SPACES: usize = 4; @@ -71,6 +74,98 @@ pub(crate) fn parse_document(p: &mut MarkdownParser) { m.complete(p, MD_DOCUMENT); } +/// Result of updating parenthesis depth when scanning link destinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ParenDepthResult { + /// Depth updated successfully, contains new depth value + Ok(i32), + /// Depth would exceed the maximum (too many nested opening parens). 
+ /// Per cmark, this truncates the destination at this point. + DepthExceeded, + /// Unmatched closing paren (would go below 0). + /// This typically means the `)` belongs to the enclosing construct. + UnmatchedClose, +} + +pub(crate) fn try_update_paren_depth(text: &str, depth: i32, max: i32) -> ParenDepthResult { + let mut depth = depth; + let mut chars = text.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' && matches!(chars.peek(), Some('(' | ')')) { + chars.next(); + continue; + } + + if c == '(' { + if depth == max { + return ParenDepthResult::DepthExceeded; + } + depth += 1; + } else if c == ')' { + if depth == 0 { + return ParenDepthResult::UnmatchedClose; + } + depth -= 1; + } + } + + ParenDepthResult::Ok(depth) +} + +pub(crate) enum LinkDestinationKind { + Enclosed, + Raw, +} + +pub(crate) fn validate_link_destination_text( + text: &str, + kind: LinkDestinationKind, + pending_escape: &mut bool, +) -> bool { + for c in text.chars() { + if *pending_escape { + if c.is_ascii_punctuation() { + *pending_escape = false; + continue; + } + *pending_escape = false; + } + + if c == '\\' { + *pending_escape = true; + continue; + } + + if c.is_ascii_control() { + return false; + } + + if matches!(kind, LinkDestinationKind::Enclosed) && c == '<' { + return false; + } + } + + true +} + +pub(crate) fn ends_with_unescaped_close(text: &str, close_char: char) -> bool { + if !text.ends_with(close_char) { + return false; + } + + let mut backslashes = 0; + for c in text.chars().rev().skip(1) { + if c == '\\' { + backslashes += 1; + } else { + break; + } + } + + backslashes % 2 == 0 +} + pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax { let mut list = DocumentBlockList; Present(list.parse_list(p)) @@ -837,7 +932,10 @@ fn set_inline_emphasis_context( source }; let base_offset = u32::from(p.cur_range().start()) as usize; - let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset); + // Create a reference 
checker closure that uses the parser's link reference definitions + let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| { + p.has_link_reference_definition(label) + }); p.set_emphasis_context(Some(context)) } diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index 275a09ddb6..f336b37b33 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -611,15 +611,46 @@ pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax { parse_emphasis_from_context(p, false) } -fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { +fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool { let m = p.start(); let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + let mut has_nested_link = false; loop { - if p.at(stop) || p.at_inline_end() { + // Per CommonMark, link text can span lines, but blank lines end the link. 
+ // Check for blank line (NEWLINE followed by NEWLINE or EOF after optional whitespace) + if p.at(NEWLINE) { + if p.at_blank_line() { + break; // Blank line ends link text + } + // Single newline inside link text - consume and continue + let _ = super::parse_textual(p); + continue; + } + + if p.at(T![EOF]) { break; } + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + if p.at(L_BRACK) { + if !has_nested_link && nested_link_starts_here(p) { + has_nested_link = true; + } + bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + if parse_any_inline_no_links(p).is_absent() { break; } @@ -627,13 +658,53 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS m.complete(p, MD_INLINE_ITEM_LIST); p.set_emphasis_context(prev_context); + has_nested_link +} + +fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + if !p.at(L_BRACK) { + return false; + } + + p.bump(L_BRACK); + let mut depth = 0usize; + + loop { + if p.at(EOF) || p.at_inline_end() { + return false; + } + + if p.at(L_BRACK) { + depth += 1; + p.bump(L_BRACK); + continue; + } + + if p.at(R_BRACK) { + if depth > 0 { + depth -= 1; + p.bump(R_BRACK); + continue; + } + p.bump(R_BRACK); + return p.at(L_PAREN) || p.at(L_BRACK); + } + + p.bump(p.cur()); + } + }) } fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax { - if (p.at(BANG) && p.nth_at(1, L_BRACK)) || p.at(L_BRACK) { + if p.at(L_BRACK) { return super::parse_textual(p); } + if p.at(BANG) && p.nth_at(1, L_BRACK) { + return parse_inline_image(p); + } + parse_any_inline(p) } @@ -826,10 +897,15 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - parse_inline_item_list_until_no_links(p, R_BRACK); + let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); // ] - if missing at 
inline end, emit diagnostic; otherwise rewind if !p.eat(R_BRACK) { + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } if p.at_inline_end() { // Unclosed link/image at end of inline content - emit diagnostic // Expand range to include the text content, not just the opening bracket @@ -843,19 +919,50 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn p.rewind(checkpoint); return Absent; } + let text_end_offset = p.cur_range().start(); + + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } // Now decide based on what follows ] - if p.at(L_PAREN) { + let link_validation = if p.at(L_PAREN) { + inline_link_is_valid(p) + } else { + InlineLinkValidation::Invalid + }; + + if matches!( + link_validation, + InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded + ) { // Inline link/image: [text](url) or ![alt](url) // Bump past ( and lex the following tokens in LinkDefinition context // so whitespace separates destination and title. p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition); let destination = p.start(); - parse_inline_link_destination_tokens(p); + let destination_result = parse_inline_link_destination_tokens(p); + + // When depth exceeded, destination is truncated but link is still valid. + // Complete the destination and link immediately without looking for closing paren. 
+ if destination_result == DestinationScanResult::DepthExceeded { + destination.complete(p, MD_INLINE_ITEM_LIST); + return Present(m.complete(p, kind.inline_kind())); + } + let has_title = inline_title_starts_after_whitespace_tokens(p); - while is_whitespace_token(p) { - bump_textual_link_def(p); + while is_title_separator_token(p) { + bump_link_def_separator(p); + } + if destination_result == DestinationScanResult::Invalid { + destination.abandon(p); + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } destination.complete(p, MD_INLINE_ITEM_LIST); @@ -867,8 +974,20 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn title_m.complete(p, MD_LINK_TITLE); } + // Skip trailing whitespace/newlines before closing paren without creating nodes + // (creating nodes would violate the MD_INLINE_LINK grammar which expects exactly 7 children) + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if !p.eat(R_PAREN) { - kind.report_unclosed_destination(p, opening_range); + if p.at_inline_end() { + kind.report_unclosed_destination(p, opening_range); + } + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } Present(m.complete(p, kind.inline_kind())) @@ -888,7 +1007,7 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) @@ -901,14 +1020,13 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) } } struct ReferenceLinkLookahead { - end_offset: TextSize, label_raw: String, is_shortcut: bool, } 
@@ -947,7 +1065,13 @@ fn lookahead_reference_common( p.bump(L_BRACK); let link_text = collect_bracket_text(p)?; - let end_offset = p.cur_range().end(); + + // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized_link = normalize_reference_label(&link_text); + if normalized_link.is_empty() { + return None; + } + p.bump(R_BRACK); if p.at(L_PAREN) { @@ -961,12 +1085,15 @@ fn lookahead_reference_common( let label = if label_text.is_empty() { link_text.clone() } else { + // Explicit label must also normalize to non-empty + let normalized_label = normalize_reference_label(&label_text); + if normalized_label.is_empty() { + return None; + } label_text }; - let end_offset = p.cur_range().end(); p.bump(R_BRACK); return Some(ReferenceLinkLookahead { - end_offset, label_raw: label, is_shortcut: false, }); @@ -974,7 +1101,6 @@ fn lookahead_reference_common( } Some(ReferenceLinkLookahead { - end_offset, label_raw: link_text, is_shortcut: true, }) @@ -1025,48 +1151,307 @@ fn is_whitespace_token(p: &MarkdownParser) -> bool { fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - while is_whitespace_token(p) { - bump_textual_link_def(p); + let mut saw_whitespace = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + saw_whitespace = true; + } + saw_whitespace && get_title_close_char(p).is_some() + }) +} + +/// Result of validating an inline link. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InlineLinkValidation { + /// Link is valid with complete destination + Valid, + /// Link is invalid + Invalid, + /// Link is valid but destination was truncated due to paren depth limit. + /// The link should be closed immediately without looking for `)`. 
+ DepthExceeded, +} + +fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation { + p.lookahead(|p| { + if !p.at(L_PAREN) { + return InlineLinkValidation::Invalid; + } + + p.bump(L_PAREN); + p.re_lex_link_definition(); + + let destination_result = scan_inline_link_destination_tokens(p); + + // If depth exceeded, link is valid but truncated - no need to check for closing paren + if destination_result == DestinationScanResult::DepthExceeded { + return InlineLinkValidation::DepthExceeded; + } + + if destination_result == DestinationScanResult::Invalid { + return InlineLinkValidation::Invalid; + } + + let mut saw_separator = false; + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + saw_separator = true; + } + let has_title = saw_separator && get_title_close_char(p).is_some(); + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if has_title { + scan_title_content(p, get_title_close_char(p)); + } + + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if p.at(R_PAREN) { + InlineLinkValidation::Valid + } else { + InlineLinkValidation::Invalid } - get_title_close_char(p).is_some() }) } -fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) { +/// Result of scanning a link destination. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationScanResult { + /// Destination is valid and complete + Valid, + /// Destination is invalid (contains invalid characters, etc.) + Invalid, + /// Destination was truncated because paren depth exceeded the limit. + /// In this case, the link is considered valid but closed at the truncation point. 
+ DepthExceeded, +} + +fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; + // Skip leading whitespace to match parse_inline_link_destination_tokens behavior + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if p.at(L_ANGLE) { + p.bump_link_definition(); + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } + if p.at(R_ANGLE) { + if pending_escape { + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + continue; + } + p.bump_link_definition(); + return DestinationScanResult::Valid; + } + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + } + } + + let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while !p.at(EOF) && !p.at(NEWLINE) { + if is_whitespace_token(p) { + break; + } + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + p.bump_link_definition(); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + // Per CommonMark/cmark, the link is still valid but closed here. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
+ break; + } + } + } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid +} + +fn scan_title_content(p: &mut MarkdownParser, close_char: Option) { + let Some(close_char) = close_char else { + return; + }; + + let text = p.cur_text(); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); + + p.bump_link_definition(); + if is_complete { + return; + } + + loop { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { + return; + } + + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + skip_link_def_separator_tokens(p); + continue; + } + + let text = p.cur_text(); + if super::ends_with_unescaped_close(text, close_char) { + p.bump_link_definition(); + return; + } + + p.bump_link_definition(); + } +} + +fn skip_link_def_separator_tokens(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + p.bump(NEWLINE); + } else { + p.bump_link_definition(); + } +} + +fn is_title_separator_token(p: &MarkdownParser) -> bool { + is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) +} + +fn bump_link_def_separator(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + let item = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + item.complete(p, MD_TEXTUAL); + } else { + bump_textual_link_def(p); + } +} + +fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { p.re_lex_link_definition(); + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; if p.at(L_ANGLE) { bump_textual_link_def(p); - while !p.at(EOF) && !p.at(NEWLINE) { + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } if p.at(R_ANGLE) { + if pending_escape { + if 
!super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + bump_textual_link_def(p); + continue; + } bump_textual_link_def(p); - break; + return DestinationScanResult::Valid; } - if is_whitespace_token(p) { - break; + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; } bump_textual_link_def(p); } - return; } let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + } while !p.at(EOF) && !p.at(NEWLINE) { if is_whitespace_token(p) { break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth == 0 { + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
break; } - paren_depth -= 1; } - - bump_textual_link_def(p); } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid } fn get_title_close_char(p: &MarkdownParser) -> Option { @@ -1088,9 +1473,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { }; let text = p.cur_text(); - let is_complete = text.len() >= 2 - && ((close_char == ')' && text.ends_with(')')) - || (close_char != ')' && text.ends_with(close_char))); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); bump_textual_link_def(p); if is_complete { @@ -1098,12 +1481,19 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { } loop { - if p.at(EOF) || p.at(NEWLINE) { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { return; } + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + bump_link_def_separator(p); + continue; + } + let text = p.cur_text(); - if text.ends_with(close_char) { + if super::ends_with_unescaped_close(text, close_char) { bump_textual_link_def(p); return; } diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 86d9f57354..69d29d3bd4 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -73,7 +73,9 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { p.bump_any(); // Parse label: consume tokens until ] or invalid state + // Also collect the label text for normalization check. 
let mut label_len = 0; + let mut label_text = String::new(); loop { if p.at(EOF) { return false; @@ -89,6 +91,8 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } let text = p.cur_text(); + label_text.push_str(text); + // Check for escape sequences if text.starts_with('\\') && text.len() > 1 { label_len += 1; // Count escaped char @@ -107,6 +111,12 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { return false; } + // Label must also be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized = crate::link_reference::normalize_reference_label(&label_text); + if normalized.is_empty() { + return false; + } + // Expect ] if !p.at(R_BRACK) { return false; @@ -122,18 +132,31 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { // Re-lex the current token in LinkDefinition context so whitespace is tokenized. p.re_lex_link_definition(); - // Destination is required - if p.at(EOF) || p.at(NEWLINE) { - return false; + // Skip optional whitespace after colon (before destination or newline) + skip_whitespace_tokens(p); + + // Per CommonMark §4.7, destination can be on the next line if there's a + // single non-blank newline after the colon. 
+ if p.at(NEWLINE) { + if p.at_blank_line() { + return false; // Blank line = no destination + } + // Single newline - allow destination on next line + p.bump_link_definition(); + skip_whitespace_tokens(p); } - // Skip destination - if !skip_destination_tokens(p) { + // Destination is required (can be on same line or next line now) + if p.at(EOF) || p.at_blank_line() { return false; } - // Skip optional whitespace after destination (lookahead only) - skip_whitespace_tokens(p); + // Skip destination and track whether there was whitespace after it + let dest_result = skip_destination_tokens(p); + if dest_result == DestinationResult::Invalid { + return false; + } + let had_separator = dest_result == DestinationResult::ValidWithSeparator; // Check what follows destination if p.at(EOF) { @@ -141,19 +164,27 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } if p.at(NEWLINE) { - // Check for title on next line + // Check for title on next line (newline counts as separator) p.bump_link_definition(); skip_whitespace_tokens(p); if at_title_start(p) { - return skip_title_tokens(p); + // If title looks valid, it's included in the definition. + // If title has trailing content, it's invalid - but the definition + // is still valid (destination-only). The invalid title line will + // be parsed as a paragraph. Per CommonMark §4.7. 
+ let _ = skip_title_tokens(p); // Ignore result - definition is valid either way } - // No title on next line - destination-only is valid + // Destination-only is valid, or destination+valid_title is valid return true; } - // Check for optional title on same line + // Check for optional title on same line - MUST be preceded by whitespace if at_title_start(p) { + if !had_separator { + // Title without preceding whitespace is invalid (e.g., `(baz)`) + return false; + } return skip_title_tokens(p); } @@ -163,14 +194,22 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { /// Skip whitespace tokens (spaces/tabs) in lookahead. fn skip_whitespace_tokens(p: &mut MarkdownParser) { + skip_whitespace_tokens_tracked(p); +} + +/// Skip whitespace tokens (spaces/tabs) in lookahead and return whether any were skipped. +fn skip_whitespace_tokens_tracked(p: &mut MarkdownParser) -> bool { + let mut skipped = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { p.bump_link_definition(); + skipped = true; } else { break; } } + skipped } /// Check if at a title start token. @@ -179,20 +218,65 @@ fn at_title_start(p: &MarkdownParser) -> bool { text.starts_with('"') || text.starts_with('\'') || p.at(L_PAREN) } -/// Skip destination tokens in lookahead. Returns false if destination is invalid. -fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { +/// Result of skipping destination tokens. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationResult { + /// Invalid destination + Invalid, + /// Valid destination, no trailing whitespace found before title + ValidNoSeparator, + /// Valid destination with trailing whitespace (separator before potential title) + ValidWithSeparator, +} + +/// Skip destination tokens in lookahead. Returns the destination result. 
+fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { + // Skip optional leading whitespace before destination + while !p.at(EOF) && !p.at(NEWLINE) { + let text = p.cur_text(); + if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { + p.bump_link_definition(); + } else { + break; + } + } + if p.at(L_ANGLE) { // Angle-bracketed destination p.bump_link_definition(); + let mut pending_escape = false; loop { if p.at(EOF) || p.at(NEWLINE) { - return false; // Unterminated angle bracket + return DestinationResult::Invalid; // Unterminated angle bracket } if p.at(R_ANGLE) { - p.bump_link_definition(); - // Consume separator whitespace into destination - skip_whitespace_tokens(p); - return true; + if pending_escape { + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; + } + p.bump_link_definition(); + continue; + } else { + p.bump_link_definition(); + // Check for trailing whitespace (separator) + let had_sep = skip_whitespace_tokens_tracked(p); + return if had_sep { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator + }; + } + } + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } p.bump_link_definition(); } @@ -201,6 +285,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { let mut paren_depth = 0i32; let mut has_content = false; let mut saw_separator = false; + let mut pending_escape = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); @@ -214,24 +299,43 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { } if at_title_start(p) && has_content && saw_separator { + // Break here - we've found separator before title break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if 
paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends destination - } + if !crate::syntax::validate_link_destination_text( + text, + crate::syntax::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } - has_content = true; - saw_separator = false; - p.bump_link_definition(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + has_content = true; + saw_separator = false; + paren_depth = next_depth; + p.bump_link_definition(); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + // For link reference definitions, both cases end the destination + break; + } + } + } + if !has_content { + DestinationResult::Invalid + } else if saw_separator { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator } - has_content } } @@ -249,17 +353,10 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { // Check if first token is complete (e.g., `"title"`) let first_text = p.cur_text(); - if first_text.len() >= 2 { - let is_complete = if close_char == ')' { - first_text.ends_with(')') - } else { - first_text.ends_with(close_char) - }; - if is_complete { - p.bump_link_definition(); - skip_whitespace_tokens(p); - return p.at(EOF) || p.at(NEWLINE); - } + if first_text.len() >= 2 && crate::syntax::ends_with_unescaped_close(first_text, close_char) { + p.bump_link_definition(); + skip_whitespace_tokens(p); + return p.at(EOF) || p.at(NEWLINE); } p.bump_link_definition(); @@ -271,11 +368,7 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { } // Check for closing delimiter - let is_close = if close_char == ')' { - p.at(R_PAREN) - } else { - p.cur_text().ends_with(close_char) - }; + let is_close = crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char); if is_close { 
p.bump_link_definition(); @@ -326,12 +419,35 @@ pub(crate) fn parse_link_block(p: &mut MarkdownParser) -> ParsedSyntax { parse_link_destination(p); // Optional title - can be on same line or next line per CommonMark §4.7 + // First, check for title on same line (at_link_title skips whitespace in lookahead) if at_link_title(p) { parse_link_title(p); - } else if p.at(NEWLINE) && title_on_next_line(p) { - // Title is on the next line per CommonMark §4.7 - // We parse the newline and whitespace as part of the title - parse_link_title_after_newline(p); + } else { + // Check for title on next line - need to skip trailing whitespace first + // Also validate that the title is complete and has no trailing content + let has_valid_title_after_newline = p.lookahead(|p| { + while is_whitespace_token(p) { + p.bump_link_definition(); + } + if p.at(NEWLINE) && !p.at_blank_line() { + // Check if there's a title starter on next line + if !title_on_next_line(p) { + return false; + } + // Also validate that the title is complete (no trailing content) + p.bump_link_definition(); // consume newline + skip_whitespace_tokens(p); // skip leading whitespace on title line + skip_title_tokens(p) // returns true only if title ends at EOL/EOF + } else { + false + } + }); + + if has_valid_title_after_newline { + // Title is on the next line per CommonMark §4.7 + // Include trailing whitespace + newline + leading whitespace as part of title + parse_link_title_with_trailing_ws(p); + } } Present(m.complete(p, MD_LINK_REFERENCE_DEFINITION)) @@ -375,6 +491,14 @@ fn parse_link_destination(p: &mut MarkdownParser) { bump_textual_link_def(p); } + // Per CommonMark §4.7, destination can be on the next line + if p.at(NEWLINE) && !p.at_blank_line() { + bump_textual_link_def(p); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + } + if p.at(L_ANGLE) { // Angle-bracketed: consume < ... 
> bump_textual_link_def(p); @@ -393,17 +517,21 @@ fn parse_link_destination(p: &mut MarkdownParser) { break; // Bare destination stops at first whitespace } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends bare destination + let text = p.cur_text(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + break; } } - - bump_textual_link_def(p); } } @@ -457,16 +585,24 @@ fn title_on_next_line(p: &MarkdownParser) -> bool { // Check for title starter trimmed.starts_with('"') || trimmed.starts_with('\'') || trimmed.starts_with('(') } - -/// Parse a link title that appears on the next line after a newline. +/// Parse a link title that appears on next line, including trailing whitespace before newline. /// -/// Per CommonMark §4.7, titles can appear on the line following the destination. -fn parse_link_title_after_newline(p: &mut MarkdownParser) { +/// This is used when there's trailing whitespace after the destination but before +/// the newline that precedes the title. The trailing whitespace is included in the +/// title node to maintain the grammar structure. 
+fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) { let m = p.start(); let list = p.start(); - // Include the newline as textual content - bump_textual_link_def(p); + // Include trailing whitespace after destination + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + + // Include the newline + if p.at(NEWLINE) { + bump_textual_link_def(p); + } // Include leading whitespace on title line while is_whitespace_token(p) { @@ -517,9 +653,10 @@ fn get_title_close_char(p: &MarkdownParser) -> Option { } } -/// Parse title content until closing delimiter. +/// Parse title content until closing delimiter, including trailing whitespace. /// /// Inside title quotes, we use Regular context so whitespace doesn't split tokens. +/// Trailing whitespace after the title is also consumed to prevent spurious paragraphs. fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { let Some(close_char) = close_char else { return; @@ -535,6 +672,11 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { bump_textual_link_def(p); if is_complete { + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } return; } @@ -545,15 +687,20 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { break; } - // Check for closing delimiter + // Check for closing delimiter (must be unescaped) let is_close = if close_char == ')' { p.at(R_PAREN) } else { - p.cur_text().ends_with(close_char) + crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char) }; if is_close { // Use Regular context for title content bump_textual(p); + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } break; } diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index f3dee25a4a..0512db55fe 100644 --- 
a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -447,10 +447,8 @@ fn render_paragraph( } // Trim both ends - leading whitespace can appear from parser including // the space after list markers in the paragraph content - let content = strip_paragraph_indent( - content - .trim_matches(|c| c == ' ' || c == '\n' || c == '\r') - ); + let content = + strip_paragraph_indent(content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r')); if in_tight_list { // In tight lists, paragraphs are rendered without

tags @@ -1160,7 +1158,11 @@ where { if let Some(node) = label_node { let text = label_text(&node); - (text.clone(), Some(text)) + if text.trim().is_empty() { + (fallback, None) + } else { + (text.clone(), Some(text)) + } } else { (fallback, None) } @@ -1602,4 +1604,64 @@ mod tests { // U+0000 should become replacement character assert_eq!(decode_entity("�"), Some("\u{FFFD}".to_string())); } + + #[test] + fn test_percent_encode_uri() { + let input = format!("https://a{}b.c/%20/%", '\u{1F44D}'); + let encoded = percent_encode_uri(&input); + assert_eq!(encoded, "https://a%F0%9F%91%8Db.c/%20/%25"); + } + + #[test] + fn test_process_link_destination_decodes_entities() { + let encoded = process_link_destination("https://example.com/<"); + assert_eq!(encoded, "https://example.com/%3C"); + } + + #[test] + fn test_paren_depth_limit_in_destination() { + let dest = format!("x{}y{}", "(".repeat(32), ")".repeat(32)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected = format!("

a

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_paren_depth_limit_exceeded_in_destination() { + let dest = format!("x{}y{}", "(".repeat(33), ")".repeat(33)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected_dest = format!("x{}", "(".repeat(32)); + let trailing = ")".repeat(34); + let expected = format!("

a(y{trailing}

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_title_with_escaped_closing_quote() { + let parsed = parse_markdown("[a](/url \"title with \\\" quote\")\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

a

\n" + ); + } } diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap deleted file mode 100644 index 3e7f69d76b..0000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap +++ /dev/null @@ -1,90 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has ![unclosed image - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdReferenceImage { - excl_token: BANG@9..10 "!" [] [], - l_brack_token: L_BRACK@10..11 "[" [] [], - alt: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@25..26 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@26..26 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..26 - 0: (empty) - 1: MD_BLOCK_LIST@0..26 - 0: MD_PARAGRAPH@0..26 - 0: MD_INLINE_ITEM_LIST@0..26 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_REFERENCE_IMAGE@9..25 - 0: BANG@9..10 "!" [] [] - 1: L_BRACK@10..11 "[" [] [] - 2: MD_INLINE_ITEM_LIST@11..25 - 0: MD_TEXTUAL@11..25 - 0: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [] - 3: (empty) - 4: (empty) - 2: MD_TEXTUAL@25..26 - 0: MD_TEXTUAL_LITERAL@25..26 "\n" [] [] - 1: (empty) - 2: EOF@26..26 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_image.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed image, expected `]` to close alt text. 
- - > 1 │ This has ![unclosed image - │ ^^^^^^^^^^^^^^^^ - 2 │ - - i image started here - - > 1 │ This has ![unclosed image - │ ^^^^^^^^^^^^^^^^ - 2 │ - - i Format: ![alt text](image-url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap deleted file mode 100644 index e9bb02cd26..0000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap +++ /dev/null @@ -1,88 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has [unclosed link - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdReferenceLink { - l_brack_token: L_BRACK@9..10 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@24..24 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..24 - 0: (empty) - 1: MD_BLOCK_LIST@0..24 - 0: MD_PARAGRAPH@0..24 - 0: MD_INLINE_ITEM_LIST@0..24 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_REFERENCE_LINK@9..23 - 0: L_BRACK@9..10 "[" [] [] - 1: MD_INLINE_ITEM_LIST@10..23 - 0: MD_TEXTUAL@10..23 - 0: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [] - 2: (empty) - 3: (empty) - 2: MD_TEXTUAL@23..24 - 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] - 1: (empty) - 2: EOF@24..24 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_link.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. 
- - > 1 │ This has [unclosed link - │ ^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has [unclosed link - │ ^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md new file mode 100644 index 0000000000..502a100473 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md @@ -0,0 +1,17 @@ +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap new file mode 100644 index 0000000000..6a103654f6 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap @@ -0,0 +1,405 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@41..42 "[" [] [], + text: MdInlineItemList [ 
+ MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..46 "link" [] [], + }, + ], + r_brack_token: R_BRACK@46..47 "]" [] [], + l_paren_token: L_PAREN@47..48 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..51 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@55..56 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@65..68 " )" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@69..70 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@91..92 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@92..93 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@93..97 "link" [] [], + }, + ], + r_brack_token: R_BRACK@97..98 "]" [] [], + l_paren_token: L_PAREN@98..99 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@99..103 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@103..104 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@110..111 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@121..122 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@122..123 "\n" [] [], + }, + 
], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@123..124 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@165..166 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@166..167 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@167..171 "link" [] [], + }, + ], + r_brack_token: R_BRACK@171..172 "]" [] [], + l_paren_token: L_PAREN@172..173 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@177..178 "\n" [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@185..186 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@186..187 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@187..188 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@226..227 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@227..228 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@228..232 "link" [] [], + }, + ], + r_brack_token: R_BRACK@232..233 "]" [] [], + l_paren_token: L_PAREN@233..234 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@234..237 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@237..241 "/url" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@241..242 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@242..243 "\n" [] [], + }, + ], + 
hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@243..244 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@283..284 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@284..285 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@285..289 "link" [] [], + }, + ], + r_brack_token: R_BRACK@289..290 "]" [] [], + l_paren_token: L_PAREN@290..291 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@291..295 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@295..298 " " [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@298..299 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@299..300 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@300..300 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..300 + 0: (empty) + 1: MD_BLOCK_LIST@0..300 + 0: MD_PARAGRAPH@0..69 + 0: MD_INLINE_ITEM_LIST@0..69 + 0: MD_TEXTUAL@0..40 + 0: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [] + 1: MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 "\n" [] [] + 2: MD_INLINE_LINK@41..68 + 0: L_BRACK@41..42 "[" [] [] + 1: MD_INLINE_ITEM_LIST@42..46 + 0: MD_TEXTUAL@42..46 + 0: MD_TEXTUAL_LITERAL@42..46 "link" [] [] + 2: R_BRACK@46..47 "]" [] [] + 3: L_PAREN@47..48 "(" [] [] + 4: MD_INLINE_ITEM_LIST@48..58 + 0: MD_TEXTUAL@48..51 + 0: MD_TEXTUAL_LITERAL@48..51 " " [] [] + 1: MD_TEXTUAL@51..55 + 0: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [] + 2: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 "\n" [] [] + 3: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 " " [] [] + 4: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 " " [] [] + 5: MD_LINK_TITLE@58..65 + 0: MD_INLINE_ITEM_LIST@58..65 + 0: MD_TEXTUAL@58..65 + 0: 
MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [] + 6: R_PAREN@65..68 " )" [] [] + 3: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 "\n" [] [] + 1: (empty) + 1: MD_NEWLINE@69..70 + 0: NEWLINE@69..70 "\n" [] [] + 2: MD_PARAGRAPH@70..123 + 0: MD_INLINE_ITEM_LIST@70..123 + 0: MD_TEXTUAL@70..91 + 0: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [] + 1: MD_TEXTUAL@91..92 + 0: MD_TEXTUAL_LITERAL@91..92 "\n" [] [] + 2: MD_INLINE_LINK@92..122 + 0: L_BRACK@92..93 "[" [] [] + 1: MD_INLINE_ITEM_LIST@93..97 + 0: MD_TEXTUAL@93..97 + 0: MD_TEXTUAL_LITERAL@93..97 "link" [] [] + 2: R_BRACK@97..98 "]" [] [] + 3: L_PAREN@98..99 "(" [] [] + 4: MD_INLINE_ITEM_LIST@99..104 + 0: MD_TEXTUAL@99..103 + 0: MD_TEXTUAL_LITERAL@99..103 "/url" [] [] + 1: MD_TEXTUAL@103..104 + 0: MD_TEXTUAL_LITERAL@103..104 " " [] [] + 5: MD_LINK_TITLE@104..121 + 0: MD_INLINE_ITEM_LIST@104..121 + 0: MD_TEXTUAL@104..110 + 0: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [] + 1: MD_TEXTUAL@110..111 + 0: MD_TEXTUAL_LITERAL@110..111 "\n" [] [] + 2: MD_TEXTUAL@111..121 + 0: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [] + 6: R_PAREN@121..122 ")" [] [] + 3: MD_TEXTUAL@122..123 + 0: MD_TEXTUAL_LITERAL@122..123 "\n" [] [] + 1: (empty) + 3: MD_NEWLINE@123..124 + 0: NEWLINE@123..124 "\n" [] [] + 4: MD_PARAGRAPH@124..187 + 0: MD_INLINE_ITEM_LIST@124..187 + 0: MD_TEXTUAL@124..165 + 0: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [] + 1: MD_TEXTUAL@165..166 + 0: MD_TEXTUAL_LITERAL@165..166 "\n" [] [] + 2: MD_INLINE_LINK@166..186 + 0: L_BRACK@166..167 "[" [] [] + 1: MD_INLINE_ITEM_LIST@167..171 + 0: MD_TEXTUAL@167..171 + 0: MD_TEXTUAL_LITERAL@167..171 "link" [] [] + 2: R_BRACK@171..172 "]" [] [] + 3: L_PAREN@172..173 "(" [] [] + 4: MD_INLINE_ITEM_LIST@173..178 + 0: MD_TEXTUAL@173..177 + 0: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [] + 1: MD_TEXTUAL@177..178 + 0: MD_TEXTUAL_LITERAL@177..178 "\n" [] [] + 5: MD_LINK_TITLE@178..185 + 0: MD_INLINE_ITEM_LIST@178..185 + 0: MD_TEXTUAL@178..185 + 0: 
MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [] + 6: R_PAREN@185..186 ")" [] [] + 3: MD_TEXTUAL@186..187 + 0: MD_TEXTUAL_LITERAL@186..187 "\n" [] [] + 1: (empty) + 5: MD_NEWLINE@187..188 + 0: NEWLINE@187..188 "\n" [] [] + 6: MD_PARAGRAPH@188..243 + 0: MD_INLINE_ITEM_LIST@188..243 + 0: MD_TEXTUAL@188..226 + 0: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [] + 1: MD_TEXTUAL@226..227 + 0: MD_TEXTUAL_LITERAL@226..227 "\n" [] [] + 2: MD_INLINE_LINK@227..242 + 0: L_BRACK@227..228 "[" [] [] + 1: MD_INLINE_ITEM_LIST@228..232 + 0: MD_TEXTUAL@228..232 + 0: MD_TEXTUAL_LITERAL@228..232 "link" [] [] + 2: R_BRACK@232..233 "]" [] [] + 3: L_PAREN@233..234 "(" [] [] + 4: MD_INLINE_ITEM_LIST@234..241 + 0: MD_TEXTUAL@234..237 + 0: MD_TEXTUAL_LITERAL@234..237 " " [] [] + 1: MD_TEXTUAL@237..241 + 0: MD_TEXTUAL_LITERAL@237..241 "/url" [] [] + 5: (empty) + 6: R_PAREN@241..242 ")" [] [] + 3: MD_TEXTUAL@242..243 + 0: MD_TEXTUAL_LITERAL@242..243 "\n" [] [] + 1: (empty) + 7: MD_NEWLINE@243..244 + 0: NEWLINE@243..244 "\n" [] [] + 8: MD_PARAGRAPH@244..300 + 0: MD_INLINE_ITEM_LIST@244..300 + 0: MD_TEXTUAL@244..283 + 0: MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [] + 1: MD_TEXTUAL@283..284 + 0: MD_TEXTUAL_LITERAL@283..284 "\n" [] [] + 2: MD_INLINE_LINK@284..299 + 0: L_BRACK@284..285 "[" [] [] + 1: MD_INLINE_ITEM_LIST@285..289 + 0: MD_TEXTUAL@285..289 + 0: MD_TEXTUAL_LITERAL@285..289 "link" [] [] + 2: R_BRACK@289..290 "]" [] [] + 3: L_PAREN@290..291 "(" [] [] + 4: MD_INLINE_ITEM_LIST@291..298 + 0: MD_TEXTUAL@291..295 + 0: MD_TEXTUAL_LITERAL@291..295 "/url" [] [] + 1: MD_TEXTUAL@295..298 + 0: MD_TEXTUAL_LITERAL@295..298 " " [] [] + 5: (empty) + 6: R_PAREN@298..299 ")" [] [] + 3: MD_TEXTUAL@299..300 + 0: MD_TEXTUAL_LITERAL@299..300 "\n" [] [] + 1: (empty) + 2: EOF@300..300 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap 
b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap index 2737efcebe..c81a255370 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap @@ -601,48 +601,44 @@ MdDocument { MdNewline { value_token: NEWLINE@473..474 "\n" [] [], }, - MdLinkReferenceDefinition { - l_brack_token: L_BRACK@474..475 "[" [] [], - label: MdLinkLabel { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], - }, - ], - }, - r_brack_token: R_BRACK@489..490 "]" [] [], - colon_token: COLON@490..491 ":" [] [], - destination: MdLinkDestination { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], - }, - ], - }, - title: missing (optional), - }, MdParagraph { list: MdInlineItemList [ MdTextual { - value_token: MD_TEXTUAL_LITERAL@498..499 " " [] [], + value_token: MD_TEXTUAL_LITERAL@474..475 "[" [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [], + value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@489..490 "]" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@490..491 ":" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], + }, + MdInlineHtml { + value: MdInlineItemList [ + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], + }, + ], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@506..507 "\n" [] [], @@ -1021,36 +1017,33 @@ MdDocument { 1: (empty) 39: MD_NEWLINE@473..474 0: NEWLINE@473..474 "\n" [] [] - 40: MD_LINK_REFERENCE_DEFINITION@474..498 - 0: L_BRACK@474..475 "[" [] [] - 1: MD_LINK_LABEL@475..489 - 0: MD_INLINE_ITEM_LIST@475..489 - 0: MD_TEXTUAL@475..480 - 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] - 1: MD_TEXTUAL@480..481 - 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] - 2: MD_TEXTUAL@481..489 - 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] - 2: R_BRACK@489..490 "]" [] [] - 3: COLON@490..491 ":" [] [] - 4: MD_LINK_DESTINATION@491..498 - 0: MD_INLINE_ITEM_LIST@491..498 - 0: MD_TEXTUAL@491..492 - 0: MD_TEXTUAL_LITERAL@491..492 " " [] [] - 1: MD_TEXTUAL@492..493 - 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] - 2: MD_TEXTUAL@493..497 - 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] - 3: MD_TEXTUAL@497..498 - 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] - 5: (empty) - 41: MD_PARAGRAPH@498..507 - 0: MD_INLINE_ITEM_LIST@498..507 - 0: MD_TEXTUAL@498..499 - 0: MD_TEXTUAL_LITERAL@498..499 " " [] [] - 1: MD_TEXTUAL@499..506 - 0: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [] - 2: MD_TEXTUAL@506..507 + 40: MD_PARAGRAPH@474..507 + 0: MD_INLINE_ITEM_LIST@474..507 + 0: MD_TEXTUAL@474..475 + 0: MD_TEXTUAL_LITERAL@474..475 "[" [] [] + 1: MD_TEXTUAL@475..480 + 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] + 2: MD_TEXTUAL@480..481 + 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] + 3: MD_TEXTUAL@481..489 + 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] + 4: MD_TEXTUAL@489..490 + 0: MD_TEXTUAL_LITERAL@489..490 "]" [] [] + 5: MD_TEXTUAL@490..491 + 0: MD_TEXTUAL_LITERAL@490..491 ":" [] [] + 6: MD_TEXTUAL@491..492 + 0: 
MD_TEXTUAL_LITERAL@491..492 " " [] [] + 7: MD_INLINE_HTML@492..498 + 0: MD_INLINE_ITEM_LIST@492..498 + 0: MD_TEXTUAL@492..493 + 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] + 1: MD_TEXTUAL@493..497 + 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] + 2: MD_TEXTUAL@497..498 + 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] + 8: MD_TEXTUAL@498..506 + 0: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [] + 9: MD_TEXTUAL@506..507 0: MD_TEXTUAL_LITERAL@506..507 "\n" [] [] 1: (empty) 2: EOF@507..507 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap index 2d3cb5a3c2..3be4200606 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap @@ -1,6 +1,5 @@ --- source: crates/biome_markdown_parser/tests/spec_test.rs -assertion_line: 131 expression: snapshot --- ## Input @@ -184,11 +183,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@353..354 "\n" [] [], }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@354..355 "[" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@355..356 "]" [] [], + MdReferenceLink { + l_brack_token: L_BRACK@354..355 "[" [] [], + text: MdInlineItemList [], + r_brack_token: R_BRACK@355..356 "]" [] [], + label: missing (optional), }, MdTextual { value_token: MD_TEXTUAL_LITERAL@356..357 ":" [] [], @@ -315,15 +314,16 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@330..353 "Empty label is invalid:" [] [] 1: MD_TEXTUAL@353..354 0: MD_TEXTUAL_LITERAL@353..354 "\n" [] [] - 2: MD_TEXTUAL@354..355 - 0: MD_TEXTUAL_LITERAL@354..355 "[" [] [] - 3: MD_TEXTUAL@355..356 - 0: MD_TEXTUAL_LITERAL@355..356 "]" [] [] - 4: MD_TEXTUAL@356..357 + 2: MD_REFERENCE_LINK@354..356 + 0: L_BRACK@354..355 "[" [] [] + 1: MD_INLINE_ITEM_LIST@355..355 + 2: R_BRACK@355..356 "]" [] [] + 3: (empty) + 3: MD_TEXTUAL@356..357 0: 
MD_TEXTUAL_LITERAL@356..357 ":" [] [] - 5: MD_TEXTUAL@357..362 + 4: MD_TEXTUAL@357..362 0: MD_TEXTUAL_LITERAL@357..362 " /url" [] [] - 6: MD_TEXTUAL@362..363 + 5: MD_TEXTUAL@362..363 0: MD_TEXTUAL_LITERAL@362..363 "\n" [] [] 1: (empty) 2: EOF@363..363 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap similarity index 50% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap index 7d67fcd2a8..f0ecbb1ced 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap @@ -25,19 +25,16 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@1..4 "foo" [] [], }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], + }, ], - r_brack_token: missing (required), + r_brack_token: R_BRACK@8..9 "]" [] [], label: missing (optional), }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@8..9 "]" [] [], - }, MdTextual { value_token: MD_TEXTUAL_LITERAL@9..10 "\n" [] [], }, @@ -57,45 +54,20 @@ MdDocument { 1: MD_BLOCK_LIST@0..10 0: MD_PARAGRAPH@0..10 0: 
MD_INLINE_ITEM_LIST@0..10 - 0: MD_REFERENCE_LINK@0..4 + 0: MD_REFERENCE_LINK@0..9 0: L_BRACK@0..1 "[" [] [] - 1: MD_INLINE_ITEM_LIST@1..4 + 1: MD_INLINE_ITEM_LIST@1..8 0: MD_TEXTUAL@1..4 0: MD_TEXTUAL_LITERAL@1..4 "foo" [] [] - 2: (empty) + 1: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "\n" [] [] + 2: MD_TEXTUAL@5..8 + 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] + 2: R_BRACK@8..9 "]" [] [] 3: (empty) - 1: MD_TEXTUAL@4..5 - 0: MD_TEXTUAL_LITERAL@4..5 "\n" [] [] - 2: MD_TEXTUAL@5..8 - 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] - 3: MD_TEXTUAL@8..9 - 0: MD_TEXTUAL_LITERAL@8..9 "]" [] [] - 4: MD_TEXTUAL@9..10 + 1: MD_TEXTUAL@9..10 0: MD_TEXTUAL_LITERAL@9..10 "\n" [] [] 1: (empty) 2: EOF@10..10 "" [] [] ``` - -## Diagnostics - -``` -multiline_label_reference.md:1:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i link started here - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md new file mode 100644 index 0000000000..3cbf1f91d3 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md @@ -0,0 +1 @@ +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap new file mode 100644 index 0000000000..236bd2046f --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap @@ -0,0 +1,399 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing 
(optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdInlineLink { + l_brack_token: L_BRACK@0..1 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@1..2 "a" [] [], + }, + ], + r_brack_token: R_BRACK@2..3 "]" [] [], + l_paren_token: L_PAREN@3..4 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "x" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..6 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@6..7 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@8..9 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..12 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..13 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@13..14 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@14..15 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..17 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@17..18 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@18..19 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@19..20 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@20..21 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@21..22 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@22..23 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@24..25 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@26..27 "(" [] [], + }, + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@27..28 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@28..29 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@29..30 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@30..31 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@31..32 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@32..33 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@33..34 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@34..35 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@35..36 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@36..37 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@37..38 "y" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@38..39 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@39..40 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@41..42 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..43 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@43..44 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@44..45 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@45..46 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@46..47 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@47..48 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..49 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@49..50 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@50..51 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..52 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@52..53 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@53..54 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@54..55 ")" [] [], + }, + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@55..56 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..59 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@59..60 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@60..61 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@61..62 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@62..63 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@63..64 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@64..65 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@65..66 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@66..67 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@67..68 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@69..70 ")" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@70..71 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@71..72 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@72..72 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..72 + 0: (empty) + 1: MD_BLOCK_LIST@0..72 + 0: MD_PARAGRAPH@0..72 + 0: MD_INLINE_ITEM_LIST@0..72 + 0: MD_INLINE_LINK@0..71 + 0: L_BRACK@0..1 "[" [] [] + 1: MD_INLINE_ITEM_LIST@1..2 + 0: MD_TEXTUAL@1..2 + 0: MD_TEXTUAL_LITERAL@1..2 "a" [] [] + 2: R_BRACK@2..3 "]" [] [] + 3: L_PAREN@3..4 "(" [] [] + 4: MD_INLINE_ITEM_LIST@4..70 + 0: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "x" [] [] + 1: MD_TEXTUAL@5..6 + 0: MD_TEXTUAL_LITERAL@5..6 "(" [] [] + 2: MD_TEXTUAL@6..7 + 0: MD_TEXTUAL_LITERAL@6..7 "(" [] [] + 3: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "(" [] [] + 4: MD_TEXTUAL@8..9 + 0: MD_TEXTUAL_LITERAL@8..9 "(" [] [] + 5: MD_TEXTUAL@9..10 + 0: 
MD_TEXTUAL_LITERAL@9..10 "(" [] [] + 6: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "(" [] [] + 7: MD_TEXTUAL@11..12 + 0: MD_TEXTUAL_LITERAL@11..12 "(" [] [] + 8: MD_TEXTUAL@12..13 + 0: MD_TEXTUAL_LITERAL@12..13 "(" [] [] + 9: MD_TEXTUAL@13..14 + 0: MD_TEXTUAL_LITERAL@13..14 "(" [] [] + 10: MD_TEXTUAL@14..15 + 0: MD_TEXTUAL_LITERAL@14..15 "(" [] [] + 11: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "(" [] [] + 12: MD_TEXTUAL@16..17 + 0: MD_TEXTUAL_LITERAL@16..17 "(" [] [] + 13: MD_TEXTUAL@17..18 + 0: MD_TEXTUAL_LITERAL@17..18 "(" [] [] + 14: MD_TEXTUAL@18..19 + 0: MD_TEXTUAL_LITERAL@18..19 "(" [] [] + 15: MD_TEXTUAL@19..20 + 0: MD_TEXTUAL_LITERAL@19..20 "(" [] [] + 16: MD_TEXTUAL@20..21 + 0: MD_TEXTUAL_LITERAL@20..21 "(" [] [] + 17: MD_TEXTUAL@21..22 + 0: MD_TEXTUAL_LITERAL@21..22 "(" [] [] + 18: MD_TEXTUAL@22..23 + 0: MD_TEXTUAL_LITERAL@22..23 "(" [] [] + 19: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "(" [] [] + 20: MD_TEXTUAL@24..25 + 0: MD_TEXTUAL_LITERAL@24..25 "(" [] [] + 21: MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "(" [] [] + 22: MD_TEXTUAL@26..27 + 0: MD_TEXTUAL_LITERAL@26..27 "(" [] [] + 23: MD_TEXTUAL@27..28 + 0: MD_TEXTUAL_LITERAL@27..28 "(" [] [] + 24: MD_TEXTUAL@28..29 + 0: MD_TEXTUAL_LITERAL@28..29 "(" [] [] + 25: MD_TEXTUAL@29..30 + 0: MD_TEXTUAL_LITERAL@29..30 "(" [] [] + 26: MD_TEXTUAL@30..31 + 0: MD_TEXTUAL_LITERAL@30..31 "(" [] [] + 27: MD_TEXTUAL@31..32 + 0: MD_TEXTUAL_LITERAL@31..32 "(" [] [] + 28: MD_TEXTUAL@32..33 + 0: MD_TEXTUAL_LITERAL@32..33 "(" [] [] + 29: MD_TEXTUAL@33..34 + 0: MD_TEXTUAL_LITERAL@33..34 "(" [] [] + 30: MD_TEXTUAL@34..35 + 0: MD_TEXTUAL_LITERAL@34..35 "(" [] [] + 31: MD_TEXTUAL@35..36 + 0: MD_TEXTUAL_LITERAL@35..36 "(" [] [] + 32: MD_TEXTUAL@36..37 + 0: MD_TEXTUAL_LITERAL@36..37 "(" [] [] + 33: MD_TEXTUAL@37..38 + 0: MD_TEXTUAL_LITERAL@37..38 "y" [] [] + 34: MD_TEXTUAL@38..39 + 0: MD_TEXTUAL_LITERAL@38..39 ")" [] [] + 35: MD_TEXTUAL@39..40 + 0: MD_TEXTUAL_LITERAL@39..40 ")" [] [] + 36: 
MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 ")" [] [] + 37: MD_TEXTUAL@41..42 + 0: MD_TEXTUAL_LITERAL@41..42 ")" [] [] + 38: MD_TEXTUAL@42..43 + 0: MD_TEXTUAL_LITERAL@42..43 ")" [] [] + 39: MD_TEXTUAL@43..44 + 0: MD_TEXTUAL_LITERAL@43..44 ")" [] [] + 40: MD_TEXTUAL@44..45 + 0: MD_TEXTUAL_LITERAL@44..45 ")" [] [] + 41: MD_TEXTUAL@45..46 + 0: MD_TEXTUAL_LITERAL@45..46 ")" [] [] + 42: MD_TEXTUAL@46..47 + 0: MD_TEXTUAL_LITERAL@46..47 ")" [] [] + 43: MD_TEXTUAL@47..48 + 0: MD_TEXTUAL_LITERAL@47..48 ")" [] [] + 44: MD_TEXTUAL@48..49 + 0: MD_TEXTUAL_LITERAL@48..49 ")" [] [] + 45: MD_TEXTUAL@49..50 + 0: MD_TEXTUAL_LITERAL@49..50 ")" [] [] + 46: MD_TEXTUAL@50..51 + 0: MD_TEXTUAL_LITERAL@50..51 ")" [] [] + 47: MD_TEXTUAL@51..52 + 0: MD_TEXTUAL_LITERAL@51..52 ")" [] [] + 48: MD_TEXTUAL@52..53 + 0: MD_TEXTUAL_LITERAL@52..53 ")" [] [] + 49: MD_TEXTUAL@53..54 + 0: MD_TEXTUAL_LITERAL@53..54 ")" [] [] + 50: MD_TEXTUAL@54..55 + 0: MD_TEXTUAL_LITERAL@54..55 ")" [] [] + 51: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 ")" [] [] + 52: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 ")" [] [] + 53: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 ")" [] [] + 54: MD_TEXTUAL@58..59 + 0: MD_TEXTUAL_LITERAL@58..59 ")" [] [] + 55: MD_TEXTUAL@59..60 + 0: MD_TEXTUAL_LITERAL@59..60 ")" [] [] + 56: MD_TEXTUAL@60..61 + 0: MD_TEXTUAL_LITERAL@60..61 ")" [] [] + 57: MD_TEXTUAL@61..62 + 0: MD_TEXTUAL_LITERAL@61..62 ")" [] [] + 58: MD_TEXTUAL@62..63 + 0: MD_TEXTUAL_LITERAL@62..63 ")" [] [] + 59: MD_TEXTUAL@63..64 + 0: MD_TEXTUAL_LITERAL@63..64 ")" [] [] + 60: MD_TEXTUAL@64..65 + 0: MD_TEXTUAL_LITERAL@64..65 ")" [] [] + 61: MD_TEXTUAL@65..66 + 0: MD_TEXTUAL_LITERAL@65..66 ")" [] [] + 62: MD_TEXTUAL@66..67 + 0: MD_TEXTUAL_LITERAL@66..67 ")" [] [] + 63: MD_TEXTUAL@67..68 + 0: MD_TEXTUAL_LITERAL@67..68 ")" [] [] + 64: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 ")" [] [] + 65: MD_TEXTUAL@69..70 + 0: MD_TEXTUAL_LITERAL@69..70 ")" [] [] + 5: (empty) + 6: R_PAREN@70..71 ")" [] [] + 1: 
MD_TEXTUAL@71..72 + 0: MD_TEXTUAL_LITERAL@71..72 "\n" [] [] + 1: (empty) + 2: EOF@72..72 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md index 50ee14bae9..1fd8994439 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md @@ -31,3 +31,7 @@ Nested in paragraph: This is a paragraph with [a reference][foo] in the middle. Case-insensitive: [case label] Whitespace normalized: [case label] + +[label\]]: https://escaped.example + +Escaped bracket in label: [text][label\]] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap index 8895f1a6a9..ebd7da580a 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap @@ -39,6 +39,10 @@ Case-insensitive: [case label] Whitespace normalized: [case label] +[label\]]: https://escaped.example + +Escaped bracket in label: [text][label\]] + ``` @@ -580,17 +584,84 @@ MdDocument { ], hard_line: missing (optional), }, + MdNewline { + value_token: NEWLINE@678..679 "\n" [] [], + }, + MdLinkReferenceDefinition { + l_brack_token: L_BRACK@679..680 "[" [] [], + label: MdLinkLabel { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@680..685 "label" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@685..687 "\\]" [] [], + }, + ], + }, + r_brack_token: R_BRACK@687..688 "]" [] [], + colon_token: COLON@688..689 ":" [] [], + destination: MdLinkDestination { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@689..690 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@690..713 "https://escaped.example" [] [], + }, + ], + }, + 
title: missing (optional), + }, + MdNewline { + value_token: NEWLINE@713..714 "\n" [] [], + }, + MdNewline { + value_token: NEWLINE@714..715 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@715..741 "Escaped bracket in label: " [] [], + }, + MdReferenceLink { + l_brack_token: L_BRACK@741..742 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@742..746 "text" [] [], + }, + ], + r_brack_token: R_BRACK@746..747 "]" [] [], + label: MdReferenceLinkLabel { + l_brack_token: L_BRACK@747..748 "[" [] [], + label: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@748..753 "label" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@753..755 "\\]" [] [], + }, + ], + r_brack_token: R_BRACK@755..756 "]" [] [], + }, + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@756..757 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, ], - eof_token: EOF@678..678 "" [] [], + eof_token: EOF@757..757 "" [] [], } ``` ## CST ``` -0: MD_DOCUMENT@0..678 +0: MD_DOCUMENT@0..757 0: (empty) - 1: MD_BLOCK_LIST@0..678 + 1: MD_BLOCK_LIST@0..757 0: MD_LINK_REFERENCE_DEFINITION@0..46 0: L_BRACK@0..1 "[" [] [] 1: MD_LINK_LABEL@1..8 @@ -944,6 +1015,50 @@ MdDocument { 2: MD_TEXTUAL@677..678 0: MD_TEXTUAL_LITERAL@677..678 "\n" [] [] 1: (empty) - 2: EOF@678..678 "" [] [] + 37: MD_NEWLINE@678..679 + 0: NEWLINE@678..679 "\n" [] [] + 38: MD_LINK_REFERENCE_DEFINITION@679..713 + 0: L_BRACK@679..680 "[" [] [] + 1: MD_LINK_LABEL@680..687 + 0: MD_INLINE_ITEM_LIST@680..687 + 0: MD_TEXTUAL@680..685 + 0: MD_TEXTUAL_LITERAL@680..685 "label" [] [] + 1: MD_TEXTUAL@685..687 + 0: MD_TEXTUAL_LITERAL@685..687 "\\]" [] [] + 2: R_BRACK@687..688 "]" [] [] + 3: COLON@688..689 ":" [] [] + 4: MD_LINK_DESTINATION@689..713 + 0: MD_INLINE_ITEM_LIST@689..713 + 0: MD_TEXTUAL@689..690 + 0: MD_TEXTUAL_LITERAL@689..690 " " [] [] + 1: MD_TEXTUAL@690..713 + 0: MD_TEXTUAL_LITERAL@690..713 "https://escaped.example" [] 
[] + 5: (empty) + 39: MD_NEWLINE@713..714 + 0: NEWLINE@713..714 "\n" [] [] + 40: MD_NEWLINE@714..715 + 0: NEWLINE@714..715 "\n" [] [] + 41: MD_PARAGRAPH@715..757 + 0: MD_INLINE_ITEM_LIST@715..757 + 0: MD_TEXTUAL@715..741 + 0: MD_TEXTUAL_LITERAL@715..741 "Escaped bracket in label: " [] [] + 1: MD_REFERENCE_LINK@741..756 + 0: L_BRACK@741..742 "[" [] [] + 1: MD_INLINE_ITEM_LIST@742..746 + 0: MD_TEXTUAL@742..746 + 0: MD_TEXTUAL_LITERAL@742..746 "text" [] [] + 2: R_BRACK@746..747 "]" [] [] + 3: MD_REFERENCE_LINK_LABEL@747..756 + 0: L_BRACK@747..748 "[" [] [] + 1: MD_INLINE_ITEM_LIST@748..755 + 0: MD_TEXTUAL@748..753 + 0: MD_TEXTUAL_LITERAL@748..753 "label" [] [] + 1: MD_TEXTUAL@753..755 + 0: MD_TEXTUAL_LITERAL@753..755 "\\]" [] [] + 2: R_BRACK@755..756 "]" [] [] + 2: MD_TEXTUAL@756..757 + 0: MD_TEXTUAL_LITERAL@756..757 "\n" [] [] + 1: (empty) + 2: EOF@757..757 "" [] [] ``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap new file mode 100644 index 0000000000..34a87e78ae --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap @@ -0,0 +1,65 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has ![unclosed image + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "!" 
[] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@26..26 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..26 + 0: (empty) + 1: MD_BLOCK_LIST@0..26 + 0: MD_PARAGRAPH@0..26 + 0: MD_INLINE_ITEM_LIST@0..26 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "!" [] [] + 2: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "[" [] [] + 3: MD_TEXTUAL@11..25 + 0: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [] + 4: MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "\n" [] [] + 1: (empty) + 2: EOF@26..26 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap new file mode 100644 index 0000000000..54da8f2881 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap @@ -0,0 +1,60 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has [unclosed link + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] 
[], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@24..24 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..24 + 0: (empty) + 1: MD_BLOCK_LIST@0..24 + 0: MD_PARAGRAPH@0..24 + 0: MD_INLINE_ITEM_LIST@0..24 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "[" [] [] + 2: MD_TEXTUAL@10..23 + 0: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [] + 3: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] + 1: (empty) + 2: EOF@24..24 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap similarity index 58% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap index 9e23a10310..eff2d1ec3c 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap @@ -33,15 +33,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@14..15 "]" [] [], }, - MdReferenceLink { - l_brack_token: L_BRACK@15..16 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], - }, - ], - r_brack_token: missing 
(required), - label: missing (optional), + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@30..31 "\n" [] [], @@ -72,37 +68,13 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@11..14 "alt" [] [] 4: MD_TEXTUAL@14..15 0: MD_TEXTUAL_LITERAL@14..15 "]" [] [] - 5: MD_REFERENCE_LINK@15..30 - 0: L_BRACK@15..16 "[" [] [] - 1: MD_INLINE_ITEM_LIST@16..30 - 0: MD_TEXTUAL@16..30 - 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] - 2: (empty) - 3: (empty) - 6: MD_TEXTUAL@30..31 + 5: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "[" [] [] + 6: MD_TEXTUAL@16..30 + 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] + 7: MD_TEXTUAL@30..31 0: MD_TEXTUAL_LITERAL@30..31 "\n" [] [] 1: (empty) 2: EOF@31..31 "" [] [] ``` - -## Diagnostics - -``` -unclosed_reference_image_label.md:1:16 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. 
- - > 1 │ This has ![alt][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has ![alt][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap similarity index 55% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap index 4414aecd6f..56fa9b0978 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap @@ -30,15 +30,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@14..15 "]" [] [], }, - MdReferenceLink { - l_brack_token: L_BRACK@15..16 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@30..31 "\n" [] [], @@ -67,37 +63,13 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@10..14 "text" [] [] 3: MD_TEXTUAL@14..15 0: MD_TEXTUAL_LITERAL@14..15 "]" [] [] - 4: MD_REFERENCE_LINK@15..30 - 0: L_BRACK@15..16 
"[" [] [] - 1: MD_INLINE_ITEM_LIST@16..30 - 0: MD_TEXTUAL@16..30 - 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] - 2: (empty) - 3: (empty) - 5: MD_TEXTUAL@30..31 + 4: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "[" [] [] + 5: MD_TEXTUAL@16..30 + 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] + 6: MD_TEXTUAL@30..31 0: MD_TEXTUAL_LITERAL@30..31 "\n" [] [] 1: (empty) 2: EOF@31..31 "" [] [] ``` - -## Diagnostics - -``` -unclosed_reference_link_label.md:1:16 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. - - > 1 │ This has [text][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has [text][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` --- Cargo.lock | 1 + crates/biome_markdown_parser/Cargo.toml | 1 + .../src/link_reference.rs | 27 +- crates/biome_markdown_parser/src/syntax.rs | 100 +++- .../src/syntax/inline.rs | 464 ++++++++++++++++-- .../src/syntax/link_block.rs | 283 ++++++++--- crates/biome_markdown_parser/src/to_html.rs | 72 ++- .../error/unclosed_image.md.snap | 90 ---- .../md_test_suite/error/unclosed_link.md.snap | 88 ---- .../ok/inline_link_whitespace.md | 17 + .../ok/inline_link_whitespace.md.snap | 405 +++++++++++++++ .../ok/link_definition_edge_cases.md.snap | 127 +++-- .../ok/link_definition_invalid.md.snap | 26 +- .../multiline_label_reference.md | 0 .../multiline_label_reference.md.snap | 58 +-- .../md_test_suite/ok/paren_depth_limit.md | 1 + .../ok/paren_depth_limit.md.snap | 399 +++++++++++++++ .../tests/md_test_suite/ok/reference_links.md | 4 + .../md_test_suite/ok/reference_links.md.snap | 123 ++++- .../{error => ok}/unclosed_image.md | 0 .../md_test_suite/ok/unclosed_image.md.snap | 65 +++ .../{error => ok}/unclosed_link.md | 0 .../md_test_suite/ok/unclosed_link.md.snap | 60 +++ .../unclosed_reference_image_label.md | 0 .../unclosed_reference_image_label.md.snap | 48 +- 
.../unclosed_reference_link_label.md | 0 .../unclosed_reference_link_label.md.snap | 48 +- 27 files changed, 2002 insertions(+), 505 deletions(-) delete mode 100644 crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap delete mode 100644 crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/multiline_label_reference.md (100%) rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/multiline_label_reference.md.snap (50%) create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_image.md (100%) create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_link.md (100%) create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_reference_image_label.md (100%) rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_reference_image_label.md.snap (58%) rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_reference_link_label.md (100%) rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_reference_link_label.md.snap (55%) diff --git a/Cargo.lock b/Cargo.lock index d7054e2153fd..fc2f323c10e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1297,6 +1297,7 @@ dependencies = [ "biome_markdown_syntax", "biome_parser", "biome_rowan", + "biome_string_case", 
"biome_test_utils", "biome_unicode_table", "htmlize", diff --git a/crates/biome_markdown_parser/Cargo.toml b/crates/biome_markdown_parser/Cargo.toml index c3087a08e71c..3f100686af81 100644 --- a/crates/biome_markdown_parser/Cargo.toml +++ b/crates/biome_markdown_parser/Cargo.toml @@ -22,6 +22,7 @@ biome_markdown_factory = { workspace = true } biome_markdown_syntax = { workspace = true } biome_parser = { workspace = true } biome_rowan = { workspace = true } +biome_string_case = { workspace = true } biome_unicode_table = { workspace = true } # Optional dependency for test_utils feature (HTML rendering for spec tests) htmlize = { version = "1.0.6", features = ["unescape"], optional = true } diff --git a/crates/biome_markdown_parser/src/link_reference.rs b/crates/biome_markdown_parser/src/link_reference.rs index 7edfb0b9db9f..e67f38759aa9 100644 --- a/crates/biome_markdown_parser/src/link_reference.rs +++ b/crates/biome_markdown_parser/src/link_reference.rs @@ -1,5 +1,7 @@ use std::collections::HashSet; +use biome_string_case::StrOnlyExtension; + use biome_markdown_syntax::{MdLinkLabel, MdLinkReferenceDefinition}; use biome_rowan::{AstNode, Direction}; @@ -8,19 +10,20 @@ use crate::MarkdownParseOptions; use crate::parser::MarkdownParser; use crate::syntax::parse_document; +/// Normalize a reference label per CommonMark spec. +/// +/// Per CommonMark, label normalization involves: +/// 1. Collapsing consecutive whitespace into a single space +/// 2. Case-folding (case-insensitive matching) +/// +/// IMPORTANT: Backslash escapes are NOT stripped during normalization. +/// This means `[foo\!]` does NOT match `[foo!]` - the backslash is preserved. +/// This matches cmark's reference implementation behavior. 
pub(crate) fn normalize_reference_label(text: &str) -> String { let mut out = String::new(); - let mut chars = text.chars().peekable(); let mut saw_whitespace = false; - while let Some(c) = chars.next() { - if c == '\\' { - if let Some(next) = chars.next() { - push_normalized_char(&mut out, next, &mut saw_whitespace); - } - continue; - } - + for c in text.chars() { if c.is_whitespace() { saw_whitespace = true; continue; @@ -29,7 +32,7 @@ pub(crate) fn normalize_reference_label(text: &str) -> String { push_normalized_char(&mut out, c, &mut saw_whitespace); } - out + out.as_str().to_lowercase_cow().to_uppercase() } fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { @@ -37,9 +40,7 @@ fn push_normalized_char(out: &mut String, c: char, saw_whitespace: &mut bool) { out.push(' '); } *saw_whitespace = false; - for lower in c.to_lowercase() { - out.push(lower); - } + out.push(c); } pub(crate) fn collect_link_reference_definitions( diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs index 1c9bdac15277..e4fecdd82c55 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -60,6 +60,9 @@ use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block}; use crate::MarkdownParser; +/// Maximum paren nesting allowed in link destinations per CommonMark. +pub(crate) const MAX_LINK_DESTINATION_PAREN_DEPTH: i32 = 32; + /// CommonMark requires 4 or more spaces for indented code blocks. const INDENT_CODE_BLOCK_SPACES: usize = 4; @@ -71,6 +74,98 @@ pub(crate) fn parse_document(p: &mut MarkdownParser) { m.complete(p, MD_DOCUMENT); } +/// Result of updating parenthesis depth when scanning link destinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ParenDepthResult { + /// Depth updated successfully, contains new depth value + Ok(i32), + /// Depth would exceed the maximum (too many nested opening parens). 
+ /// Per cmark, this truncates the destination at this point. + DepthExceeded, + /// Unmatched closing paren (would go below 0). + /// This typically means the `)` belongs to the enclosing construct. + UnmatchedClose, +} + +pub(crate) fn try_update_paren_depth(text: &str, depth: i32, max: i32) -> ParenDepthResult { + let mut depth = depth; + let mut chars = text.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' && matches!(chars.peek(), Some('(' | ')')) { + chars.next(); + continue; + } + + if c == '(' { + if depth == max { + return ParenDepthResult::DepthExceeded; + } + depth += 1; + } else if c == ')' { + if depth == 0 { + return ParenDepthResult::UnmatchedClose; + } + depth -= 1; + } + } + + ParenDepthResult::Ok(depth) +} + +pub(crate) enum LinkDestinationKind { + Enclosed, + Raw, +} + +pub(crate) fn validate_link_destination_text( + text: &str, + kind: LinkDestinationKind, + pending_escape: &mut bool, +) -> bool { + for c in text.chars() { + if *pending_escape { + if c.is_ascii_punctuation() { + *pending_escape = false; + continue; + } + *pending_escape = false; + } + + if c == '\\' { + *pending_escape = true; + continue; + } + + if c.is_ascii_control() { + return false; + } + + if matches!(kind, LinkDestinationKind::Enclosed) && c == '<' { + return false; + } + } + + true +} + +pub(crate) fn ends_with_unescaped_close(text: &str, close_char: char) -> bool { + if !text.ends_with(close_char) { + return false; + } + + let mut backslashes = 0; + for c in text.chars().rev().skip(1) { + if c == '\\' { + backslashes += 1; + } else { + break; + } + } + + backslashes % 2 == 0 +} + pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax { let mut list = DocumentBlockList; Present(list.parse_list(p)) @@ -837,7 +932,10 @@ fn set_inline_emphasis_context( source }; let base_offset = u32::from(p.cur_range().start()) as usize; - let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset); + // Create a reference 
checker closure that uses the parser's link reference definitions + let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| { + p.has_link_reference_definition(label) + }); p.set_emphasis_context(Some(context)) } diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index 275a09ddb6d2..f336b37b33a6 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -611,15 +611,46 @@ pub(crate) fn parse_inline_italic(p: &mut MarkdownParser) -> ParsedSyntax { parse_emphasis_from_context(p, false) } -fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { +fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) -> bool { let m = p.start(); let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + let mut has_nested_link = false; loop { - if p.at(stop) || p.at_inline_end() { + // Per CommonMark, link text can span lines, but blank lines end the link. 
+ // Check for blank line (NEWLINE followed by NEWLINE or EOF after optional whitespace) + if p.at(NEWLINE) { + if p.at_blank_line() { + break; // Blank line ends link text + } + // Single newline inside link text - consume and continue + let _ = super::parse_textual(p); + continue; + } + + if p.at(T![EOF]) { break; } + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + if p.at(L_BRACK) { + if !has_nested_link && nested_link_starts_here(p) { + has_nested_link = true; + } + bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + if parse_any_inline_no_links(p).is_absent() { break; } @@ -627,13 +658,53 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS m.complete(p, MD_INLINE_ITEM_LIST); p.set_emphasis_context(prev_context); + has_nested_link +} + +fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + if !p.at(L_BRACK) { + return false; + } + + p.bump(L_BRACK); + let mut depth = 0usize; + + loop { + if p.at(EOF) || p.at_inline_end() { + return false; + } + + if p.at(L_BRACK) { + depth += 1; + p.bump(L_BRACK); + continue; + } + + if p.at(R_BRACK) { + if depth > 0 { + depth -= 1; + p.bump(R_BRACK); + continue; + } + p.bump(R_BRACK); + return p.at(L_PAREN) || p.at(L_BRACK); + } + + p.bump(p.cur()); + } + }) } fn parse_any_inline_no_links(p: &mut MarkdownParser) -> ParsedSyntax { - if (p.at(BANG) && p.nth_at(1, L_BRACK)) || p.at(L_BRACK) { + if p.at(L_BRACK) { return super::parse_textual(p); } + if p.at(BANG) && p.nth_at(1, L_BRACK) { + return parse_inline_image(p); + } + parse_any_inline(p) } @@ -826,10 +897,15 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - parse_inline_item_list_until_no_links(p, R_BRACK); + let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); // ] - if missing at 
inline end, emit diagnostic; otherwise rewind if !p.eat(R_BRACK) { + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } if p.at_inline_end() { // Unclosed link/image at end of inline content - emit diagnostic // Expand range to include the text content, not just the opening bracket @@ -843,19 +919,50 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn p.rewind(checkpoint); return Absent; } + let text_end_offset = p.cur_range().start(); + + if matches!(kind, LinkParseKind::Link) && has_nested_link { + m.abandon(p); + p.rewind(checkpoint); + return Absent; + } // Now decide based on what follows ] - if p.at(L_PAREN) { + let link_validation = if p.at(L_PAREN) { + inline_link_is_valid(p) + } else { + InlineLinkValidation::Invalid + }; + + if matches!( + link_validation, + InlineLinkValidation::Valid | InlineLinkValidation::DepthExceeded + ) { // Inline link/image: [text](url) or ![alt](url) // Bump past ( and lex the following tokens in LinkDefinition context // so whitespace separates destination and title. p.expect_with_context(L_PAREN, crate::lexer::MarkdownLexContext::LinkDefinition); let destination = p.start(); - parse_inline_link_destination_tokens(p); + let destination_result = parse_inline_link_destination_tokens(p); + + // When depth exceeded, destination is truncated but link is still valid. + // Complete the destination and link immediately without looking for closing paren. 
+ if destination_result == DestinationScanResult::DepthExceeded { + destination.complete(p, MD_INLINE_ITEM_LIST); + return Present(m.complete(p, kind.inline_kind())); + } + let has_title = inline_title_starts_after_whitespace_tokens(p); - while is_whitespace_token(p) { - bump_textual_link_def(p); + while is_title_separator_token(p) { + bump_link_def_separator(p); + } + if destination_result == DestinationScanResult::Invalid { + destination.abandon(p); + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } destination.complete(p, MD_INLINE_ITEM_LIST); @@ -867,8 +974,20 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn title_m.complete(p, MD_LINK_TITLE); } + // Skip trailing whitespace/newlines before closing paren without creating nodes + // (creating nodes would violate the MD_INLINE_LINK grammar which expects exactly 7 children) + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if !p.eat(R_PAREN) { - kind.report_unclosed_destination(p, opening_range); + if p.at_inline_end() { + kind.report_unclosed_destination(p, opening_range); + } + m.abandon(p); + p.rewind(checkpoint); + p.force_relex_regular(); + return Absent; } Present(m.complete(p, kind.inline_kind())) @@ -888,7 +1007,7 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) @@ -901,14 +1020,13 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, reference.end_offset); + return consume_textual_until_offset(p, text_end_offset); } Present(m.complete(p, kind.reference_kind())) } } struct ReferenceLinkLookahead { - end_offset: TextSize, label_raw: String, is_shortcut: bool, } 
@@ -947,7 +1065,13 @@ fn lookahead_reference_common( p.bump(L_BRACK); let link_text = collect_bracket_text(p)?; - let end_offset = p.cur_range().end(); + + // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized_link = normalize_reference_label(&link_text); + if normalized_link.is_empty() { + return None; + } + p.bump(R_BRACK); if p.at(L_PAREN) { @@ -961,12 +1085,15 @@ fn lookahead_reference_common( let label = if label_text.is_empty() { link_text.clone() } else { + // Explicit label must also normalize to non-empty + let normalized_label = normalize_reference_label(&label_text); + if normalized_label.is_empty() { + return None; + } label_text }; - let end_offset = p.cur_range().end(); p.bump(R_BRACK); return Some(ReferenceLinkLookahead { - end_offset, label_raw: label, is_shortcut: false, }); @@ -974,7 +1101,6 @@ fn lookahead_reference_common( } Some(ReferenceLinkLookahead { - end_offset, label_raw: link_text, is_shortcut: true, }) @@ -1025,48 +1151,307 @@ fn is_whitespace_token(p: &MarkdownParser) -> bool { fn inline_title_starts_after_whitespace_tokens(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - while is_whitespace_token(p) { - bump_textual_link_def(p); + let mut saw_whitespace = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + saw_whitespace = true; + } + saw_whitespace && get_title_close_char(p).is_some() + }) +} + +/// Result of validating an inline link. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum InlineLinkValidation { + /// Link is valid with complete destination + Valid, + /// Link is invalid + Invalid, + /// Link is valid but destination was truncated due to paren depth limit. + /// The link should be closed immediately without looking for `)`. 
+ DepthExceeded, +} + +fn inline_link_is_valid(p: &mut MarkdownParser) -> InlineLinkValidation { + p.lookahead(|p| { + if !p.at(L_PAREN) { + return InlineLinkValidation::Invalid; + } + + p.bump(L_PAREN); + p.re_lex_link_definition(); + + let destination_result = scan_inline_link_destination_tokens(p); + + // If depth exceeded, link is valid but truncated - no need to check for closing paren + if destination_result == DestinationScanResult::DepthExceeded { + return InlineLinkValidation::DepthExceeded; + } + + if destination_result == DestinationScanResult::Invalid { + return InlineLinkValidation::Invalid; + } + + let mut saw_separator = false; + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + saw_separator = true; + } + let has_title = saw_separator && get_title_close_char(p).is_some(); + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if has_title { + scan_title_content(p, get_title_close_char(p)); + } + + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + + if p.at(R_PAREN) { + InlineLinkValidation::Valid + } else { + InlineLinkValidation::Invalid } - get_title_close_char(p).is_some() }) } -fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) { +/// Result of scanning a link destination. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationScanResult { + /// Destination is valid and complete + Valid, + /// Destination is invalid (contains invalid characters, etc.) + Invalid, + /// Destination was truncated because paren depth exceeded the limit. + /// In this case, the link is considered valid but closed at the truncation point. 
+ DepthExceeded, +} + +fn scan_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; + // Skip leading whitespace to match parse_inline_link_destination_tokens behavior + while is_title_separator_token(p) { + skip_link_def_separator_tokens(p); + } + if p.at(L_ANGLE) { + p.bump_link_definition(); + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } + if p.at(R_ANGLE) { + if pending_escape { + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + continue; + } + p.bump_link_definition(); + return DestinationScanResult::Valid; + } + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + p.bump_link_definition(); + } + } + + let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while !p.at(EOF) && !p.at(NEWLINE) { + if is_whitespace_token(p) { + break; + } + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + p.bump_link_definition(); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + // Per CommonMark/cmark, the link is still valid but closed here. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
+ break; + } + } + } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid +} + +fn scan_title_content(p: &mut MarkdownParser, close_char: Option) { + let Some(close_char) = close_char else { + return; + }; + + let text = p.cur_text(); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); + + p.bump_link_definition(); + if is_complete { + return; + } + + loop { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { + return; + } + + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + skip_link_def_separator_tokens(p); + continue; + } + + let text = p.cur_text(); + if super::ends_with_unescaped_close(text, close_char) { + p.bump_link_definition(); + return; + } + + p.bump_link_definition(); + } +} + +fn skip_link_def_separator_tokens(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + p.bump(NEWLINE); + } else { + p.bump_link_definition(); + } +} + +fn is_title_separator_token(p: &MarkdownParser) -> bool { + is_whitespace_token(p) || (p.at(NEWLINE) && !p.at_blank_line()) +} + +fn bump_link_def_separator(p: &mut MarkdownParser) { + if p.at(NEWLINE) { + let item = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + item.complete(p, MD_TEXTUAL); + } else { + bump_textual_link_def(p); + } +} + +fn parse_inline_link_destination_tokens(p: &mut MarkdownParser) -> DestinationScanResult { p.re_lex_link_definition(); + const MAX_PAREN_DEPTH: i32 = super::MAX_LINK_DESTINATION_PAREN_DEPTH; if p.at(L_ANGLE) { bump_textual_link_def(p); - while !p.at(EOF) && !p.at(NEWLINE) { + let mut pending_escape = false; + loop { + if p.at(EOF) || p.at(NEWLINE) { + return DestinationScanResult::Invalid; + } if p.at(R_ANGLE) { + if pending_escape { + if 
!super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + bump_textual_link_def(p); + continue; + } bump_textual_link_def(p); - break; + return DestinationScanResult::Valid; } - if is_whitespace_token(p) { - break; + if !super::validate_link_destination_text( + p.cur_text(), + super::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; } bump_textual_link_def(p); } - return; } let mut paren_depth: i32 = 0; + let mut pending_escape = false; + while is_title_separator_token(p) { + bump_link_def_separator(p); + } while !p.at(EOF) && !p.at(NEWLINE) { if is_whitespace_token(p) { break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth == 0 { + let text = p.cur_text(); + if !super::validate_link_destination_text( + text, + super::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationScanResult::Invalid; + } + match super::try_update_paren_depth(text, paren_depth, MAX_PAREN_DEPTH) { + super::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + super::ParenDepthResult::DepthExceeded => { + // Paren depth exceeded - destination is truncated at this point. + return DestinationScanResult::DepthExceeded; + } + super::ParenDepthResult::UnmatchedClose => { + // Unmatched closing paren - destination ends here normally. + // The `)` belongs to the enclosing construct (inline link closer). 
break; } - paren_depth -= 1; } - - bump_textual_link_def(p); } + if p.at(EOF) { + return DestinationScanResult::Invalid; + } + if p.at(NEWLINE) { + return if p.at_blank_line() { + DestinationScanResult::Invalid + } else { + DestinationScanResult::Valid + }; + } + DestinationScanResult::Valid } fn get_title_close_char(p: &MarkdownParser) -> Option { @@ -1088,9 +1473,7 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { }; let text = p.cur_text(); - let is_complete = text.len() >= 2 - && ((close_char == ')' && text.ends_with(')')) - || (close_char != ')' && text.ends_with(close_char))); + let is_complete = text.len() >= 2 && super::ends_with_unescaped_close(text, close_char); bump_textual_link_def(p); if is_complete { @@ -1098,12 +1481,19 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { } loop { - if p.at(EOF) || p.at(NEWLINE) { + // Stop on EOF or blank line (titles cannot span blank lines per CommonMark) + if p.at(EOF) || p.at_blank_line() { return; } + // Continue through single newlines (titles can span non-blank lines) + if p.at(NEWLINE) { + bump_link_def_separator(p); + continue; + } + let text = p.cur_text(); - if text.ends_with(close_char) { + if super::ends_with_unescaped_close(text, close_char) { bump_textual_link_def(p); return; } diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 86d9f5735443..69d29d3bd40e 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -73,7 +73,9 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { p.bump_any(); // Parse label: consume tokens until ] or invalid state + // Also collect the label text for normalization check. 
let mut label_len = 0; + let mut label_text = String::new(); loop { if p.at(EOF) { return false; @@ -89,6 +91,8 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } let text = p.cur_text(); + label_text.push_str(text); + // Check for escape sequences if text.starts_with('\\') && text.len() > 1 { label_len += 1; // Count escaped char @@ -107,6 +111,12 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { return false; } + // Label must also be non-empty after normalization (e.g., `[\n ]` normalizes to empty) + let normalized = crate::link_reference::normalize_reference_label(&label_text); + if normalized.is_empty() { + return false; + } + // Expect ] if !p.at(R_BRACK) { return false; @@ -122,18 +132,31 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { // Re-lex the current token in LinkDefinition context so whitespace is tokenized. p.re_lex_link_definition(); - // Destination is required - if p.at(EOF) || p.at(NEWLINE) { - return false; + // Skip optional whitespace after colon (before destination or newline) + skip_whitespace_tokens(p); + + // Per CommonMark §4.7, destination can be on the next line if there's a + // single non-blank newline after the colon. 
+ if p.at(NEWLINE) { + if p.at_blank_line() { + return false; // Blank line = no destination + } + // Single newline - allow destination on next line + p.bump_link_definition(); + skip_whitespace_tokens(p); } - // Skip destination - if !skip_destination_tokens(p) { + // Destination is required (can be on same line or next line now) + if p.at(EOF) || p.at_blank_line() { return false; } - // Skip optional whitespace after destination (lookahead only) - skip_whitespace_tokens(p); + // Skip destination and track whether there was whitespace after it + let dest_result = skip_destination_tokens(p); + if dest_result == DestinationResult::Invalid { + return false; + } + let had_separator = dest_result == DestinationResult::ValidWithSeparator; // Check what follows destination if p.at(EOF) { @@ -141,19 +164,27 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { } if p.at(NEWLINE) { - // Check for title on next line + // Check for title on next line (newline counts as separator) p.bump_link_definition(); skip_whitespace_tokens(p); if at_title_start(p) { - return skip_title_tokens(p); + // If title looks valid, it's included in the definition. + // If title has trailing content, it's invalid - but the definition + // is still valid (destination-only). The invalid title line will + // be parsed as a paragraph. Per CommonMark §4.7. 
+ let _ = skip_title_tokens(p); // Ignore result - definition is valid either way } - // No title on next line - destination-only is valid + // Destination-only is valid, or destination+valid_title is valid return true; } - // Check for optional title on same line + // Check for optional title on same line - MUST be preceded by whitespace if at_title_start(p) { + if !had_separator { + // Title without preceding whitespace is invalid (e.g., `(baz)`) + return false; + } return skip_title_tokens(p); } @@ -163,14 +194,22 @@ fn is_valid_link_definition_lookahead(p: &mut MarkdownParser) -> bool { /// Skip whitespace tokens (spaces/tabs) in lookahead. fn skip_whitespace_tokens(p: &mut MarkdownParser) { + skip_whitespace_tokens_tracked(p); +} + +/// Skip whitespace tokens (spaces/tabs) in lookahead and return whether any were skipped. +fn skip_whitespace_tokens_tracked(p: &mut MarkdownParser) -> bool { + let mut skipped = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { p.bump_link_definition(); + skipped = true; } else { break; } } + skipped } /// Check if at a title start token. @@ -179,20 +218,65 @@ fn at_title_start(p: &MarkdownParser) -> bool { text.starts_with('"') || text.starts_with('\'') || p.at(L_PAREN) } -/// Skip destination tokens in lookahead. Returns false if destination is invalid. -fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { +/// Result of skipping destination tokens. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DestinationResult { + /// Invalid destination + Invalid, + /// Valid destination, no trailing whitespace found before title + ValidNoSeparator, + /// Valid destination with trailing whitespace (separator before potential title) + ValidWithSeparator, +} + +/// Skip destination tokens in lookahead. Returns the destination result. 
+fn skip_destination_tokens(p: &mut MarkdownParser) -> DestinationResult { + // Skip optional leading whitespace before destination + while !p.at(EOF) && !p.at(NEWLINE) { + let text = p.cur_text(); + if text.chars().all(|c| c == ' ' || c == '\t') && !text.is_empty() { + p.bump_link_definition(); + } else { + break; + } + } + if p.at(L_ANGLE) { // Angle-bracketed destination p.bump_link_definition(); + let mut pending_escape = false; loop { if p.at(EOF) || p.at(NEWLINE) { - return false; // Unterminated angle bracket + return DestinationResult::Invalid; // Unterminated angle bracket } if p.at(R_ANGLE) { - p.bump_link_definition(); - // Consume separator whitespace into destination - skip_whitespace_tokens(p); - return true; + if pending_escape { + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; + } + p.bump_link_definition(); + continue; + } else { + p.bump_link_definition(); + // Check for trailing whitespace (separator) + let had_sep = skip_whitespace_tokens_tracked(p); + return if had_sep { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator + }; + } + } + if !crate::syntax::validate_link_destination_text( + p.cur_text(), + crate::syntax::LinkDestinationKind::Enclosed, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } p.bump_link_definition(); } @@ -201,6 +285,7 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { let mut paren_depth = 0i32; let mut has_content = false; let mut saw_separator = false; + let mut pending_escape = false; while !p.at(EOF) && !p.at(NEWLINE) { let text = p.cur_text(); @@ -214,24 +299,43 @@ fn skip_destination_tokens(p: &mut MarkdownParser) -> bool { } if at_title_start(p) && has_content && saw_separator { + // Break here - we've found separator before title break; } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if 
paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends destination - } + if !crate::syntax::validate_link_destination_text( + text, + crate::syntax::LinkDestinationKind::Raw, + &mut pending_escape, + ) { + return DestinationResult::Invalid; } - has_content = true; - saw_separator = false; - p.bump_link_definition(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + has_content = true; + saw_separator = false; + paren_depth = next_depth; + p.bump_link_definition(); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + // For link reference definitions, both cases end the destination + break; + } + } + } + if !has_content { + DestinationResult::Invalid + } else if saw_separator { + DestinationResult::ValidWithSeparator + } else { + DestinationResult::ValidNoSeparator } - has_content } } @@ -249,17 +353,10 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { // Check if first token is complete (e.g., `"title"`) let first_text = p.cur_text(); - if first_text.len() >= 2 { - let is_complete = if close_char == ')' { - first_text.ends_with(')') - } else { - first_text.ends_with(close_char) - }; - if is_complete { - p.bump_link_definition(); - skip_whitespace_tokens(p); - return p.at(EOF) || p.at(NEWLINE); - } + if first_text.len() >= 2 && crate::syntax::ends_with_unescaped_close(first_text, close_char) { + p.bump_link_definition(); + skip_whitespace_tokens(p); + return p.at(EOF) || p.at(NEWLINE); } p.bump_link_definition(); @@ -271,11 +368,7 @@ fn skip_title_tokens(p: &mut MarkdownParser) -> bool { } // Check for closing delimiter - let is_close = if close_char == ')' { - p.at(R_PAREN) - } else { - p.cur_text().ends_with(close_char) - }; + let is_close = crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char); if is_close { 
p.bump_link_definition(); @@ -326,12 +419,35 @@ pub(crate) fn parse_link_block(p: &mut MarkdownParser) -> ParsedSyntax { parse_link_destination(p); // Optional title - can be on same line or next line per CommonMark §4.7 + // First, check for title on same line (at_link_title skips whitespace in lookahead) if at_link_title(p) { parse_link_title(p); - } else if p.at(NEWLINE) && title_on_next_line(p) { - // Title is on the next line per CommonMark §4.7 - // We parse the newline and whitespace as part of the title - parse_link_title_after_newline(p); + } else { + // Check for title on next line - need to skip trailing whitespace first + // Also validate that the title is complete and has no trailing content + let has_valid_title_after_newline = p.lookahead(|p| { + while is_whitespace_token(p) { + p.bump_link_definition(); + } + if p.at(NEWLINE) && !p.at_blank_line() { + // Check if there's a title starter on next line + if !title_on_next_line(p) { + return false; + } + // Also validate that the title is complete (no trailing content) + p.bump_link_definition(); // consume newline + skip_whitespace_tokens(p); // skip leading whitespace on title line + skip_title_tokens(p) // returns true only if title ends at EOL/EOF + } else { + false + } + }); + + if has_valid_title_after_newline { + // Title is on the next line per CommonMark §4.7 + // Include trailing whitespace + newline + leading whitespace as part of title + parse_link_title_with_trailing_ws(p); + } } Present(m.complete(p, MD_LINK_REFERENCE_DEFINITION)) @@ -375,6 +491,14 @@ fn parse_link_destination(p: &mut MarkdownParser) { bump_textual_link_def(p); } + // Per CommonMark §4.7, destination can be on the next line + if p.at(NEWLINE) && !p.at_blank_line() { + bump_textual_link_def(p); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + } + if p.at(L_ANGLE) { // Angle-bracketed: consume < ... 
> bump_textual_link_def(p); @@ -393,17 +517,21 @@ fn parse_link_destination(p: &mut MarkdownParser) { break; // Bare destination stops at first whitespace } - if p.at(L_PAREN) { - paren_depth += 1; - } else if p.at(R_PAREN) { - if paren_depth > 0 { - paren_depth -= 1; - } else { - break; // Unbalanced ) ends bare destination + let text = p.cur_text(); + match crate::syntax::try_update_paren_depth( + text, + paren_depth, + crate::syntax::MAX_LINK_DESTINATION_PAREN_DEPTH, + ) { + crate::syntax::ParenDepthResult::Ok(next_depth) => { + paren_depth = next_depth; + bump_textual_link_def(p); + } + crate::syntax::ParenDepthResult::DepthExceeded + | crate::syntax::ParenDepthResult::UnmatchedClose => { + break; } } - - bump_textual_link_def(p); } } @@ -457,16 +585,24 @@ fn title_on_next_line(p: &MarkdownParser) -> bool { // Check for title starter trimmed.starts_with('"') || trimmed.starts_with('\'') || trimmed.starts_with('(') } - -/// Parse a link title that appears on the next line after a newline. +/// Parse a link title that appears on next line, including trailing whitespace before newline. /// -/// Per CommonMark §4.7, titles can appear on the line following the destination. -fn parse_link_title_after_newline(p: &mut MarkdownParser) { +/// This is used when there's trailing whitespace after the destination but before +/// the newline that precedes the title. The trailing whitespace is included in the +/// title node to maintain the grammar structure. 
+fn parse_link_title_with_trailing_ws(p: &mut MarkdownParser) { let m = p.start(); let list = p.start(); - // Include the newline as textual content - bump_textual_link_def(p); + // Include trailing whitespace after destination + while is_whitespace_token(p) { + bump_textual_link_def(p); + } + + // Include the newline + if p.at(NEWLINE) { + bump_textual_link_def(p); + } // Include leading whitespace on title line while is_whitespace_token(p) { @@ -517,9 +653,10 @@ fn get_title_close_char(p: &MarkdownParser) -> Option { } } -/// Parse title content until closing delimiter. +/// Parse title content until closing delimiter, including trailing whitespace. /// /// Inside title quotes, we use Regular context so whitespace doesn't split tokens. +/// Trailing whitespace after the title is also consumed to prevent spurious paragraphs. fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { let Some(close_char) = close_char else { return; @@ -535,6 +672,11 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { bump_textual_link_def(p); if is_complete { + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } return; } @@ -545,15 +687,20 @@ fn parse_title_content(p: &mut MarkdownParser, close_char: Option) { break; } - // Check for closing delimiter + // Check for closing delimiter (must be unescaped) let is_close = if close_char == ')' { p.at(R_PAREN) } else { - p.cur_text().ends_with(close_char) + crate::syntax::ends_with_unescaped_close(p.cur_text(), close_char) }; if is_close { // Use Regular context for title content bump_textual(p); + // Consume trailing whitespace after title (before newline) + p.re_lex_link_definition(); + while is_whitespace_token(p) { + bump_textual_link_def(p); + } break; } diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index f3dee25a4aa1..0512db55febe 100644 --- 
a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -447,10 +447,8 @@ fn render_paragraph( } // Trim both ends - leading whitespace can appear from parser including // the space after list markers in the paragraph content - let content = strip_paragraph_indent( - content - .trim_matches(|c| c == ' ' || c == '\n' || c == '\r') - ); + let content = + strip_paragraph_indent(content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r')); if in_tight_list { // In tight lists, paragraphs are rendered without

tags @@ -1160,7 +1158,11 @@ where { if let Some(node) = label_node { let text = label_text(&node); - (text.clone(), Some(text)) + if text.trim().is_empty() { + (fallback, None) + } else { + (text.clone(), Some(text)) + } } else { (fallback, None) } @@ -1602,4 +1604,64 @@ mod tests { // U+0000 should become replacement character assert_eq!(decode_entity("�"), Some("\u{FFFD}".to_string())); } + + #[test] + fn test_percent_encode_uri() { + let input = format!("https://a{}b.c/%20/%", '\u{1F44D}'); + let encoded = percent_encode_uri(&input); + assert_eq!(encoded, "https://a%F0%9F%91%8Db.c/%20/%25"); + } + + #[test] + fn test_process_link_destination_decodes_entities() { + let encoded = process_link_destination("https://example.com/<"); + assert_eq!(encoded, "https://example.com/%3C"); + } + + #[test] + fn test_paren_depth_limit_in_destination() { + let dest = format!("x{}y{}", "(".repeat(32), ")".repeat(32)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected = format!("

a

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_paren_depth_limit_exceeded_in_destination() { + let dest = format!("x{}y{}", "(".repeat(33), ")".repeat(33)); + let input = format!("[a]({dest})\n"); + let parsed = parse_markdown(&input); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + let expected_dest = format!("x{}", "(".repeat(32)); + let trailing = ")".repeat(34); + let expected = format!("

a(y{trailing}

\n"); + assert_eq!(html, expected); + } + + #[test] + fn test_title_with_escaped_closing_quote() { + let parsed = parse_markdown("[a](/url \"title with \\\" quote\")\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

a

\n" + ); + } } diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap deleted file mode 100644 index 3e7f69d76bd7..000000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md.snap +++ /dev/null @@ -1,90 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has ![unclosed image - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdReferenceImage { - excl_token: BANG@9..10 "!" [] [], - l_brack_token: L_BRACK@10..11 "[" [] [], - alt: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@25..26 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@26..26 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..26 - 0: (empty) - 1: MD_BLOCK_LIST@0..26 - 0: MD_PARAGRAPH@0..26 - 0: MD_INLINE_ITEM_LIST@0..26 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_REFERENCE_IMAGE@9..25 - 0: BANG@9..10 "!" [] [] - 1: L_BRACK@10..11 "[" [] [] - 2: MD_INLINE_ITEM_LIST@11..25 - 0: MD_TEXTUAL@11..25 - 0: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [] - 3: (empty) - 4: (empty) - 2: MD_TEXTUAL@25..26 - 0: MD_TEXTUAL_LITERAL@25..26 "\n" [] [] - 1: (empty) - 2: EOF@26..26 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_image.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed image, expected `]` to close alt text. 
- - > 1 │ This has ![unclosed image - │ ^^^^^^^^^^^^^^^^ - 2 │ - - i image started here - - > 1 │ This has ![unclosed image - │ ^^^^^^^^^^^^^^^^ - 2 │ - - i Format: ![alt text](image-url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap deleted file mode 100644 index e9bb02cd26cb..000000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md.snap +++ /dev/null @@ -1,88 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has [unclosed link - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdReferenceLink { - l_brack_token: L_BRACK@9..10 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@24..24 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..24 - 0: (empty) - 1: MD_BLOCK_LIST@0..24 - 0: MD_PARAGRAPH@0..24 - 0: MD_INLINE_ITEM_LIST@0..24 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_REFERENCE_LINK@9..23 - 0: L_BRACK@9..10 "[" [] [] - 1: MD_INLINE_ITEM_LIST@10..23 - 0: MD_TEXTUAL@10..23 - 0: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [] - 2: (empty) - 3: (empty) - 2: MD_TEXTUAL@23..24 - 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] - 1: (empty) - 2: EOF@24..24 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_link.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. 
- - > 1 │ This has [unclosed link - │ ^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has [unclosed link - │ ^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md new file mode 100644 index 000000000000..502a1004733e --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md @@ -0,0 +1,17 @@ +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap new file mode 100644 index 000000000000..6a103654f6d6 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/inline_link_whitespace.md.snap @@ -0,0 +1,405 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +Whitespace around destination and title: +[link]( /uri + "title" ) + +Title spanning lines: +[link](/url "title +continued") + +Line break between destination and title: +[link](/uri +"title") + +Leading whitespace before destination: +[link]( /url) + +Trailing whitespace before close paren: +[link](/url ) + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@41..42 "[" [] [], + text: 
MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..46 "link" [] [], + }, + ], + r_brack_token: R_BRACK@46..47 "]" [] [], + l_paren_token: L_PAREN@47..48 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..51 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@55..56 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@65..68 " )" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@69..70 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@91..92 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@92..93 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@93..97 "link" [] [], + }, + ], + r_brack_token: R_BRACK@97..98 "]" [] [], + l_paren_token: L_PAREN@98..99 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@99..103 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@103..104 " " [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@110..111 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@121..122 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@122..123 
"\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@123..124 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@165..166 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@166..167 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@167..171 "link" [] [], + }, + ], + r_brack_token: R_BRACK@171..172 "]" [] [], + l_paren_token: L_PAREN@172..173 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@177..178 "\n" [] [], + }, + ], + title: MdLinkTitle { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [], + }, + ], + }, + r_paren_token: R_PAREN@185..186 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@186..187 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@187..188 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@226..227 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@227..228 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@228..232 "link" [] [], + }, + ], + r_brack_token: R_BRACK@232..233 "]" [] [], + l_paren_token: L_PAREN@233..234 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@234..237 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@237..241 "/url" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@241..242 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@242..243 "\n" [] [], 
+ }, + ], + hard_line: missing (optional), + }, + MdNewline { + value_token: NEWLINE@243..244 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@283..284 "\n" [] [], + }, + MdInlineLink { + l_brack_token: L_BRACK@284..285 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@285..289 "link" [] [], + }, + ], + r_brack_token: R_BRACK@289..290 "]" [] [], + l_paren_token: L_PAREN@290..291 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@291..295 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@295..298 " " [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@298..299 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@299..300 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@300..300 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..300 + 0: (empty) + 1: MD_BLOCK_LIST@0..300 + 0: MD_PARAGRAPH@0..69 + 0: MD_INLINE_ITEM_LIST@0..69 + 0: MD_TEXTUAL@0..40 + 0: MD_TEXTUAL_LITERAL@0..40 "Whitespace around destination and title:" [] [] + 1: MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 "\n" [] [] + 2: MD_INLINE_LINK@41..68 + 0: L_BRACK@41..42 "[" [] [] + 1: MD_INLINE_ITEM_LIST@42..46 + 0: MD_TEXTUAL@42..46 + 0: MD_TEXTUAL_LITERAL@42..46 "link" [] [] + 2: R_BRACK@46..47 "]" [] [] + 3: L_PAREN@47..48 "(" [] [] + 4: MD_INLINE_ITEM_LIST@48..58 + 0: MD_TEXTUAL@48..51 + 0: MD_TEXTUAL_LITERAL@48..51 " " [] [] + 1: MD_TEXTUAL@51..55 + 0: MD_TEXTUAL_LITERAL@51..55 "/uri" [] [] + 2: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 "\n" [] [] + 3: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 " " [] [] + 4: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 " " [] [] + 5: MD_LINK_TITLE@58..65 + 0: MD_INLINE_ITEM_LIST@58..65 + 0: MD_TEXTUAL@58..65 + 0: 
MD_TEXTUAL_LITERAL@58..65 "\"title\"" [] [] + 6: R_PAREN@65..68 " )" [] [] + 3: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 "\n" [] [] + 1: (empty) + 1: MD_NEWLINE@69..70 + 0: NEWLINE@69..70 "\n" [] [] + 2: MD_PARAGRAPH@70..123 + 0: MD_INLINE_ITEM_LIST@70..123 + 0: MD_TEXTUAL@70..91 + 0: MD_TEXTUAL_LITERAL@70..91 "Title spanning lines:" [] [] + 1: MD_TEXTUAL@91..92 + 0: MD_TEXTUAL_LITERAL@91..92 "\n" [] [] + 2: MD_INLINE_LINK@92..122 + 0: L_BRACK@92..93 "[" [] [] + 1: MD_INLINE_ITEM_LIST@93..97 + 0: MD_TEXTUAL@93..97 + 0: MD_TEXTUAL_LITERAL@93..97 "link" [] [] + 2: R_BRACK@97..98 "]" [] [] + 3: L_PAREN@98..99 "(" [] [] + 4: MD_INLINE_ITEM_LIST@99..104 + 0: MD_TEXTUAL@99..103 + 0: MD_TEXTUAL_LITERAL@99..103 "/url" [] [] + 1: MD_TEXTUAL@103..104 + 0: MD_TEXTUAL_LITERAL@103..104 " " [] [] + 5: MD_LINK_TITLE@104..121 + 0: MD_INLINE_ITEM_LIST@104..121 + 0: MD_TEXTUAL@104..110 + 0: MD_TEXTUAL_LITERAL@104..110 "\"title" [] [] + 1: MD_TEXTUAL@110..111 + 0: MD_TEXTUAL_LITERAL@110..111 "\n" [] [] + 2: MD_TEXTUAL@111..121 + 0: MD_TEXTUAL_LITERAL@111..121 "continued\"" [] [] + 6: R_PAREN@121..122 ")" [] [] + 3: MD_TEXTUAL@122..123 + 0: MD_TEXTUAL_LITERAL@122..123 "\n" [] [] + 1: (empty) + 3: MD_NEWLINE@123..124 + 0: NEWLINE@123..124 "\n" [] [] + 4: MD_PARAGRAPH@124..187 + 0: MD_INLINE_ITEM_LIST@124..187 + 0: MD_TEXTUAL@124..165 + 0: MD_TEXTUAL_LITERAL@124..165 "Line break between destination and title:" [] [] + 1: MD_TEXTUAL@165..166 + 0: MD_TEXTUAL_LITERAL@165..166 "\n" [] [] + 2: MD_INLINE_LINK@166..186 + 0: L_BRACK@166..167 "[" [] [] + 1: MD_INLINE_ITEM_LIST@167..171 + 0: MD_TEXTUAL@167..171 + 0: MD_TEXTUAL_LITERAL@167..171 "link" [] [] + 2: R_BRACK@171..172 "]" [] [] + 3: L_PAREN@172..173 "(" [] [] + 4: MD_INLINE_ITEM_LIST@173..178 + 0: MD_TEXTUAL@173..177 + 0: MD_TEXTUAL_LITERAL@173..177 "/uri" [] [] + 1: MD_TEXTUAL@177..178 + 0: MD_TEXTUAL_LITERAL@177..178 "\n" [] [] + 5: MD_LINK_TITLE@178..185 + 0: MD_INLINE_ITEM_LIST@178..185 + 0: MD_TEXTUAL@178..185 + 0: 
MD_TEXTUAL_LITERAL@178..185 "\"title\"" [] [] + 6: R_PAREN@185..186 ")" [] [] + 3: MD_TEXTUAL@186..187 + 0: MD_TEXTUAL_LITERAL@186..187 "\n" [] [] + 1: (empty) + 5: MD_NEWLINE@187..188 + 0: NEWLINE@187..188 "\n" [] [] + 6: MD_PARAGRAPH@188..243 + 0: MD_INLINE_ITEM_LIST@188..243 + 0: MD_TEXTUAL@188..226 + 0: MD_TEXTUAL_LITERAL@188..226 "Leading whitespace before destination:" [] [] + 1: MD_TEXTUAL@226..227 + 0: MD_TEXTUAL_LITERAL@226..227 "\n" [] [] + 2: MD_INLINE_LINK@227..242 + 0: L_BRACK@227..228 "[" [] [] + 1: MD_INLINE_ITEM_LIST@228..232 + 0: MD_TEXTUAL@228..232 + 0: MD_TEXTUAL_LITERAL@228..232 "link" [] [] + 2: R_BRACK@232..233 "]" [] [] + 3: L_PAREN@233..234 "(" [] [] + 4: MD_INLINE_ITEM_LIST@234..241 + 0: MD_TEXTUAL@234..237 + 0: MD_TEXTUAL_LITERAL@234..237 " " [] [] + 1: MD_TEXTUAL@237..241 + 0: MD_TEXTUAL_LITERAL@237..241 "/url" [] [] + 5: (empty) + 6: R_PAREN@241..242 ")" [] [] + 3: MD_TEXTUAL@242..243 + 0: MD_TEXTUAL_LITERAL@242..243 "\n" [] [] + 1: (empty) + 7: MD_NEWLINE@243..244 + 0: NEWLINE@243..244 "\n" [] [] + 8: MD_PARAGRAPH@244..300 + 0: MD_INLINE_ITEM_LIST@244..300 + 0: MD_TEXTUAL@244..283 + 0: MD_TEXTUAL_LITERAL@244..283 "Trailing whitespace before close paren:" [] [] + 1: MD_TEXTUAL@283..284 + 0: MD_TEXTUAL_LITERAL@283..284 "\n" [] [] + 2: MD_INLINE_LINK@284..299 + 0: L_BRACK@284..285 "[" [] [] + 1: MD_INLINE_ITEM_LIST@285..289 + 0: MD_TEXTUAL@285..289 + 0: MD_TEXTUAL_LITERAL@285..289 "link" [] [] + 2: R_BRACK@289..290 "]" [] [] + 3: L_PAREN@290..291 "(" [] [] + 4: MD_INLINE_ITEM_LIST@291..298 + 0: MD_TEXTUAL@291..295 + 0: MD_TEXTUAL_LITERAL@291..295 "/url" [] [] + 1: MD_TEXTUAL@295..298 + 0: MD_TEXTUAL_LITERAL@295..298 " " [] [] + 5: (empty) + 6: R_PAREN@298..299 ")" [] [] + 3: MD_TEXTUAL@299..300 + 0: MD_TEXTUAL_LITERAL@299..300 "\n" [] [] + 1: (empty) + 2: EOF@300..300 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap 
b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap index 2737efcebe70..c81a255370a3 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_edge_cases.md.snap @@ -601,48 +601,44 @@ MdDocument { MdNewline { value_token: NEWLINE@473..474 "\n" [] [], }, - MdLinkReferenceDefinition { - l_brack_token: L_BRACK@474..475 "[" [] [], - label: MdLinkLabel { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], - }, - ], - }, - r_brack_token: R_BRACK@489..490 "]" [] [], - colon_token: COLON@490..491 ":" [] [], - destination: MdLinkDestination { - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], - }, - ], - }, - title: missing (optional), - }, MdParagraph { list: MdInlineItemList [ MdTextual { - value_token: MD_TEXTUAL_LITERAL@498..499 " " [] [], + value_token: MD_TEXTUAL_LITERAL@474..475 "[" [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [], + value_token: MD_TEXTUAL_LITERAL@475..480 "angle" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@480..481 "-" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@489..490 "]" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@490..491 ":" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@491..492 " " [] [], + }, + MdInlineHtml { + value: MdInlineItemList [ + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@492..493 "<" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@493..497 "/url" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@497..498 ">" [] [], + }, + ], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@506..507 "\n" [] [], @@ -1021,36 +1017,33 @@ MdDocument { 1: (empty) 39: MD_NEWLINE@473..474 0: NEWLINE@473..474 "\n" [] [] - 40: MD_LINK_REFERENCE_DEFINITION@474..498 - 0: L_BRACK@474..475 "[" [] [] - 1: MD_LINK_LABEL@475..489 - 0: MD_INLINE_ITEM_LIST@475..489 - 0: MD_TEXTUAL@475..480 - 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] - 1: MD_TEXTUAL@480..481 - 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] - 2: MD_TEXTUAL@481..489 - 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] - 2: R_BRACK@489..490 "]" [] [] - 3: COLON@490..491 ":" [] [] - 4: MD_LINK_DESTINATION@491..498 - 0: MD_INLINE_ITEM_LIST@491..498 - 0: MD_TEXTUAL@491..492 - 0: MD_TEXTUAL_LITERAL@491..492 " " [] [] - 1: MD_TEXTUAL@492..493 - 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] - 2: MD_TEXTUAL@493..497 - 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] - 3: MD_TEXTUAL@497..498 - 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] - 5: (empty) - 41: MD_PARAGRAPH@498..507 - 0: MD_INLINE_ITEM_LIST@498..507 - 0: MD_TEXTUAL@498..499 - 0: MD_TEXTUAL_LITERAL@498..499 " " [] [] - 1: MD_TEXTUAL@499..506 - 0: MD_TEXTUAL_LITERAL@499..506 "invalid" [] [] - 2: MD_TEXTUAL@506..507 + 40: MD_PARAGRAPH@474..507 + 0: MD_INLINE_ITEM_LIST@474..507 + 0: MD_TEXTUAL@474..475 + 0: MD_TEXTUAL_LITERAL@474..475 "[" [] [] + 1: MD_TEXTUAL@475..480 + 0: MD_TEXTUAL_LITERAL@475..480 "angle" [] [] + 2: MD_TEXTUAL@480..481 + 0: MD_TEXTUAL_LITERAL@480..481 "-" [] [] + 3: MD_TEXTUAL@481..489 + 0: MD_TEXTUAL_LITERAL@481..489 "trailing" [] [] + 4: MD_TEXTUAL@489..490 + 0: MD_TEXTUAL_LITERAL@489..490 "]" [] [] + 5: MD_TEXTUAL@490..491 + 0: MD_TEXTUAL_LITERAL@490..491 ":" [] [] + 6: MD_TEXTUAL@491..492 + 0: 
MD_TEXTUAL_LITERAL@491..492 " " [] [] + 7: MD_INLINE_HTML@492..498 + 0: MD_INLINE_ITEM_LIST@492..498 + 0: MD_TEXTUAL@492..493 + 0: MD_TEXTUAL_LITERAL@492..493 "<" [] [] + 1: MD_TEXTUAL@493..497 + 0: MD_TEXTUAL_LITERAL@493..497 "/url" [] [] + 2: MD_TEXTUAL@497..498 + 0: MD_TEXTUAL_LITERAL@497..498 ">" [] [] + 8: MD_TEXTUAL@498..506 + 0: MD_TEXTUAL_LITERAL@498..506 " invalid" [] [] + 9: MD_TEXTUAL@506..507 0: MD_TEXTUAL_LITERAL@506..507 "\n" [] [] 1: (empty) 2: EOF@507..507 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap index 2d3cb5a3c2c5..3be42006062e 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/link_definition_invalid.md.snap @@ -1,6 +1,5 @@ --- source: crates/biome_markdown_parser/tests/spec_test.rs -assertion_line: 131 expression: snapshot --- ## Input @@ -184,11 +183,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@353..354 "\n" [] [], }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@354..355 "[" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@355..356 "]" [] [], + MdReferenceLink { + l_brack_token: L_BRACK@354..355 "[" [] [], + text: MdInlineItemList [], + r_brack_token: R_BRACK@355..356 "]" [] [], + label: missing (optional), }, MdTextual { value_token: MD_TEXTUAL_LITERAL@356..357 ":" [] [], @@ -315,15 +314,16 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@330..353 "Empty label is invalid:" [] [] 1: MD_TEXTUAL@353..354 0: MD_TEXTUAL_LITERAL@353..354 "\n" [] [] - 2: MD_TEXTUAL@354..355 - 0: MD_TEXTUAL_LITERAL@354..355 "[" [] [] - 3: MD_TEXTUAL@355..356 - 0: MD_TEXTUAL_LITERAL@355..356 "]" [] [] - 4: MD_TEXTUAL@356..357 + 2: MD_REFERENCE_LINK@354..356 + 0: L_BRACK@354..355 "[" [] [] + 1: MD_INLINE_ITEM_LIST@355..355 + 2: R_BRACK@355..356 "]" [] [] + 3: (empty) + 3: MD_TEXTUAL@356..357 0: 
MD_TEXTUAL_LITERAL@356..357 ":" [] [] - 5: MD_TEXTUAL@357..362 + 4: MD_TEXTUAL@357..362 0: MD_TEXTUAL_LITERAL@357..362 " /url" [] [] - 6: MD_TEXTUAL@362..363 + 5: MD_TEXTUAL@362..363 0: MD_TEXTUAL_LITERAL@362..363 "\n" [] [] 1: (empty) 2: EOF@363..363 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap similarity index 50% rename from crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap index 7d67fcd2a850..f0ecbb1ceda5 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/multiline_label_reference.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/multiline_label_reference.md.snap @@ -25,19 +25,16 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@1..4 "foo" [] [], }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], + }, ], - r_brack_token: missing (required), + r_brack_token: R_BRACK@8..9 "]" [] [], label: missing (optional), }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@4..5 "\n" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@5..8 "bar" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@8..9 "]" [] [], - }, MdTextual { value_token: MD_TEXTUAL_LITERAL@9..10 "\n" [] [], }, @@ -57,45 +54,20 @@ MdDocument { 1: MD_BLOCK_LIST@0..10 0: MD_PARAGRAPH@0..10 0: 
MD_INLINE_ITEM_LIST@0..10 - 0: MD_REFERENCE_LINK@0..4 + 0: MD_REFERENCE_LINK@0..9 0: L_BRACK@0..1 "[" [] [] - 1: MD_INLINE_ITEM_LIST@1..4 + 1: MD_INLINE_ITEM_LIST@1..8 0: MD_TEXTUAL@1..4 0: MD_TEXTUAL_LITERAL@1..4 "foo" [] [] - 2: (empty) + 1: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "\n" [] [] + 2: MD_TEXTUAL@5..8 + 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] + 2: R_BRACK@8..9 "]" [] [] 3: (empty) - 1: MD_TEXTUAL@4..5 - 0: MD_TEXTUAL_LITERAL@4..5 "\n" [] [] - 2: MD_TEXTUAL@5..8 - 0: MD_TEXTUAL_LITERAL@5..8 "bar" [] [] - 3: MD_TEXTUAL@8..9 - 0: MD_TEXTUAL_LITERAL@8..9 "]" [] [] - 4: MD_TEXTUAL@9..10 + 1: MD_TEXTUAL@9..10 0: MD_TEXTUAL_LITERAL@9..10 "\n" [] [] 1: (empty) 2: EOF@10..10 "" [] [] ``` - -## Diagnostics - -``` -multiline_label_reference.md:1:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i link started here - - > 1 │ [foo - │ ^^^^ - 2 │ bar] - 3 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md new file mode 100644 index 000000000000..3cbf1f91d3e1 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md @@ -0,0 +1 @@ +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap new file mode 100644 index 000000000000..236bd2046fec --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/paren_depth_limit.md.snap @@ -0,0 +1,399 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +[a](x((((((((((((((((((((((((((((((((y))))))))))))))))))))))))))))))))) + +``` + + +## AST + +``` +MdDocument { + bom_token: 
missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdInlineLink { + l_brack_token: L_BRACK@0..1 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@1..2 "a" [] [], + }, + ], + r_brack_token: R_BRACK@2..3 "]" [] [], + l_paren_token: L_PAREN@3..4 "(" [] [], + destination: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..5 "x" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..6 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@6..7 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@8..9 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..12 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..13 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@13..14 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@14..15 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..17 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@17..18 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@18..19 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@19..20 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@20..21 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@21..22 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@22..23 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@24..25 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@26..27 "(" [] [], + }, + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@27..28 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@28..29 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@29..30 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@30..31 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@31..32 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@32..33 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@33..34 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@34..35 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@35..36 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@36..37 "(" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@37..38 "y" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@38..39 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@39..40 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@40..41 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@41..42 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@42..43 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@43..44 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@44..45 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@45..46 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@46..47 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@47..48 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@48..49 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@49..50 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@50..51 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@51..52 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@52..53 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@53..54 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@54..55 ")" [] [], + }, + MdTextual { + 
value_token: MD_TEXTUAL_LITERAL@55..56 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@56..57 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@57..58 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@58..59 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@59..60 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@60..61 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@61..62 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@62..63 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@63..64 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@64..65 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@65..66 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@66..67 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@67..68 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@68..69 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@69..70 ")" [] [], + }, + ], + title: missing (optional), + r_paren_token: R_PAREN@70..71 ")" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@71..72 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@72..72 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..72 + 0: (empty) + 1: MD_BLOCK_LIST@0..72 + 0: MD_PARAGRAPH@0..72 + 0: MD_INLINE_ITEM_LIST@0..72 + 0: MD_INLINE_LINK@0..71 + 0: L_BRACK@0..1 "[" [] [] + 1: MD_INLINE_ITEM_LIST@1..2 + 0: MD_TEXTUAL@1..2 + 0: MD_TEXTUAL_LITERAL@1..2 "a" [] [] + 2: R_BRACK@2..3 "]" [] [] + 3: L_PAREN@3..4 "(" [] [] + 4: MD_INLINE_ITEM_LIST@4..70 + 0: MD_TEXTUAL@4..5 + 0: MD_TEXTUAL_LITERAL@4..5 "x" [] [] + 1: MD_TEXTUAL@5..6 + 0: MD_TEXTUAL_LITERAL@5..6 "(" [] [] + 2: MD_TEXTUAL@6..7 + 0: MD_TEXTUAL_LITERAL@6..7 "(" [] [] + 3: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "(" [] [] + 4: MD_TEXTUAL@8..9 + 0: MD_TEXTUAL_LITERAL@8..9 "(" [] [] + 5: MD_TEXTUAL@9..10 + 0: 
MD_TEXTUAL_LITERAL@9..10 "(" [] [] + 6: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "(" [] [] + 7: MD_TEXTUAL@11..12 + 0: MD_TEXTUAL_LITERAL@11..12 "(" [] [] + 8: MD_TEXTUAL@12..13 + 0: MD_TEXTUAL_LITERAL@12..13 "(" [] [] + 9: MD_TEXTUAL@13..14 + 0: MD_TEXTUAL_LITERAL@13..14 "(" [] [] + 10: MD_TEXTUAL@14..15 + 0: MD_TEXTUAL_LITERAL@14..15 "(" [] [] + 11: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "(" [] [] + 12: MD_TEXTUAL@16..17 + 0: MD_TEXTUAL_LITERAL@16..17 "(" [] [] + 13: MD_TEXTUAL@17..18 + 0: MD_TEXTUAL_LITERAL@17..18 "(" [] [] + 14: MD_TEXTUAL@18..19 + 0: MD_TEXTUAL_LITERAL@18..19 "(" [] [] + 15: MD_TEXTUAL@19..20 + 0: MD_TEXTUAL_LITERAL@19..20 "(" [] [] + 16: MD_TEXTUAL@20..21 + 0: MD_TEXTUAL_LITERAL@20..21 "(" [] [] + 17: MD_TEXTUAL@21..22 + 0: MD_TEXTUAL_LITERAL@21..22 "(" [] [] + 18: MD_TEXTUAL@22..23 + 0: MD_TEXTUAL_LITERAL@22..23 "(" [] [] + 19: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "(" [] [] + 20: MD_TEXTUAL@24..25 + 0: MD_TEXTUAL_LITERAL@24..25 "(" [] [] + 21: MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "(" [] [] + 22: MD_TEXTUAL@26..27 + 0: MD_TEXTUAL_LITERAL@26..27 "(" [] [] + 23: MD_TEXTUAL@27..28 + 0: MD_TEXTUAL_LITERAL@27..28 "(" [] [] + 24: MD_TEXTUAL@28..29 + 0: MD_TEXTUAL_LITERAL@28..29 "(" [] [] + 25: MD_TEXTUAL@29..30 + 0: MD_TEXTUAL_LITERAL@29..30 "(" [] [] + 26: MD_TEXTUAL@30..31 + 0: MD_TEXTUAL_LITERAL@30..31 "(" [] [] + 27: MD_TEXTUAL@31..32 + 0: MD_TEXTUAL_LITERAL@31..32 "(" [] [] + 28: MD_TEXTUAL@32..33 + 0: MD_TEXTUAL_LITERAL@32..33 "(" [] [] + 29: MD_TEXTUAL@33..34 + 0: MD_TEXTUAL_LITERAL@33..34 "(" [] [] + 30: MD_TEXTUAL@34..35 + 0: MD_TEXTUAL_LITERAL@34..35 "(" [] [] + 31: MD_TEXTUAL@35..36 + 0: MD_TEXTUAL_LITERAL@35..36 "(" [] [] + 32: MD_TEXTUAL@36..37 + 0: MD_TEXTUAL_LITERAL@36..37 "(" [] [] + 33: MD_TEXTUAL@37..38 + 0: MD_TEXTUAL_LITERAL@37..38 "y" [] [] + 34: MD_TEXTUAL@38..39 + 0: MD_TEXTUAL_LITERAL@38..39 ")" [] [] + 35: MD_TEXTUAL@39..40 + 0: MD_TEXTUAL_LITERAL@39..40 ")" [] [] + 36: 
MD_TEXTUAL@40..41 + 0: MD_TEXTUAL_LITERAL@40..41 ")" [] [] + 37: MD_TEXTUAL@41..42 + 0: MD_TEXTUAL_LITERAL@41..42 ")" [] [] + 38: MD_TEXTUAL@42..43 + 0: MD_TEXTUAL_LITERAL@42..43 ")" [] [] + 39: MD_TEXTUAL@43..44 + 0: MD_TEXTUAL_LITERAL@43..44 ")" [] [] + 40: MD_TEXTUAL@44..45 + 0: MD_TEXTUAL_LITERAL@44..45 ")" [] [] + 41: MD_TEXTUAL@45..46 + 0: MD_TEXTUAL_LITERAL@45..46 ")" [] [] + 42: MD_TEXTUAL@46..47 + 0: MD_TEXTUAL_LITERAL@46..47 ")" [] [] + 43: MD_TEXTUAL@47..48 + 0: MD_TEXTUAL_LITERAL@47..48 ")" [] [] + 44: MD_TEXTUAL@48..49 + 0: MD_TEXTUAL_LITERAL@48..49 ")" [] [] + 45: MD_TEXTUAL@49..50 + 0: MD_TEXTUAL_LITERAL@49..50 ")" [] [] + 46: MD_TEXTUAL@50..51 + 0: MD_TEXTUAL_LITERAL@50..51 ")" [] [] + 47: MD_TEXTUAL@51..52 + 0: MD_TEXTUAL_LITERAL@51..52 ")" [] [] + 48: MD_TEXTUAL@52..53 + 0: MD_TEXTUAL_LITERAL@52..53 ")" [] [] + 49: MD_TEXTUAL@53..54 + 0: MD_TEXTUAL_LITERAL@53..54 ")" [] [] + 50: MD_TEXTUAL@54..55 + 0: MD_TEXTUAL_LITERAL@54..55 ")" [] [] + 51: MD_TEXTUAL@55..56 + 0: MD_TEXTUAL_LITERAL@55..56 ")" [] [] + 52: MD_TEXTUAL@56..57 + 0: MD_TEXTUAL_LITERAL@56..57 ")" [] [] + 53: MD_TEXTUAL@57..58 + 0: MD_TEXTUAL_LITERAL@57..58 ")" [] [] + 54: MD_TEXTUAL@58..59 + 0: MD_TEXTUAL_LITERAL@58..59 ")" [] [] + 55: MD_TEXTUAL@59..60 + 0: MD_TEXTUAL_LITERAL@59..60 ")" [] [] + 56: MD_TEXTUAL@60..61 + 0: MD_TEXTUAL_LITERAL@60..61 ")" [] [] + 57: MD_TEXTUAL@61..62 + 0: MD_TEXTUAL_LITERAL@61..62 ")" [] [] + 58: MD_TEXTUAL@62..63 + 0: MD_TEXTUAL_LITERAL@62..63 ")" [] [] + 59: MD_TEXTUAL@63..64 + 0: MD_TEXTUAL_LITERAL@63..64 ")" [] [] + 60: MD_TEXTUAL@64..65 + 0: MD_TEXTUAL_LITERAL@64..65 ")" [] [] + 61: MD_TEXTUAL@65..66 + 0: MD_TEXTUAL_LITERAL@65..66 ")" [] [] + 62: MD_TEXTUAL@66..67 + 0: MD_TEXTUAL_LITERAL@66..67 ")" [] [] + 63: MD_TEXTUAL@67..68 + 0: MD_TEXTUAL_LITERAL@67..68 ")" [] [] + 64: MD_TEXTUAL@68..69 + 0: MD_TEXTUAL_LITERAL@68..69 ")" [] [] + 65: MD_TEXTUAL@69..70 + 0: MD_TEXTUAL_LITERAL@69..70 ")" [] [] + 5: (empty) + 6: R_PAREN@70..71 ")" [] [] + 1: 
MD_TEXTUAL@71..72 + 0: MD_TEXTUAL_LITERAL@71..72 "\n" [] [] + 1: (empty) + 2: EOF@72..72 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md index 50ee14bae96d..1fd899443983 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md @@ -31,3 +31,7 @@ Nested in paragraph: This is a paragraph with [a reference][foo] in the middle. Case-insensitive: [case label] Whitespace normalized: [case label] + +[label\]]: https://escaped.example + +Escaped bracket in label: [text][label\]] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap index 8895f1a6a9ee..ebd7da580ada 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/reference_links.md.snap @@ -39,6 +39,10 @@ Case-insensitive: [case label] Whitespace normalized: [case label] +[label\]]: https://escaped.example + +Escaped bracket in label: [text][label\]] + ``` @@ -580,17 +584,84 @@ MdDocument { ], hard_line: missing (optional), }, + MdNewline { + value_token: NEWLINE@678..679 "\n" [] [], + }, + MdLinkReferenceDefinition { + l_brack_token: L_BRACK@679..680 "[" [] [], + label: MdLinkLabel { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@680..685 "label" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@685..687 "\\]" [] [], + }, + ], + }, + r_brack_token: R_BRACK@687..688 "]" [] [], + colon_token: COLON@688..689 ":" [] [], + destination: MdLinkDestination { + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@689..690 " " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@690..713 "https://escaped.example" [] [], + }, + ], + 
}, + title: missing (optional), + }, + MdNewline { + value_token: NEWLINE@713..714 "\n" [] [], + }, + MdNewline { + value_token: NEWLINE@714..715 "\n" [] [], + }, + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@715..741 "Escaped bracket in label: " [] [], + }, + MdReferenceLink { + l_brack_token: L_BRACK@741..742 "[" [] [], + text: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@742..746 "text" [] [], + }, + ], + r_brack_token: R_BRACK@746..747 "]" [] [], + label: MdReferenceLinkLabel { + l_brack_token: L_BRACK@747..748 "[" [] [], + label: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@748..753 "label" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@753..755 "\\]" [] [], + }, + ], + r_brack_token: R_BRACK@755..756 "]" [] [], + }, + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@756..757 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, ], - eof_token: EOF@678..678 "" [] [], + eof_token: EOF@757..757 "" [] [], } ``` ## CST ``` -0: MD_DOCUMENT@0..678 +0: MD_DOCUMENT@0..757 0: (empty) - 1: MD_BLOCK_LIST@0..678 + 1: MD_BLOCK_LIST@0..757 0: MD_LINK_REFERENCE_DEFINITION@0..46 0: L_BRACK@0..1 "[" [] [] 1: MD_LINK_LABEL@1..8 @@ -944,6 +1015,50 @@ MdDocument { 2: MD_TEXTUAL@677..678 0: MD_TEXTUAL_LITERAL@677..678 "\n" [] [] 1: (empty) - 2: EOF@678..678 "" [] [] + 37: MD_NEWLINE@678..679 + 0: NEWLINE@678..679 "\n" [] [] + 38: MD_LINK_REFERENCE_DEFINITION@679..713 + 0: L_BRACK@679..680 "[" [] [] + 1: MD_LINK_LABEL@680..687 + 0: MD_INLINE_ITEM_LIST@680..687 + 0: MD_TEXTUAL@680..685 + 0: MD_TEXTUAL_LITERAL@680..685 "label" [] [] + 1: MD_TEXTUAL@685..687 + 0: MD_TEXTUAL_LITERAL@685..687 "\\]" [] [] + 2: R_BRACK@687..688 "]" [] [] + 3: COLON@688..689 ":" [] [] + 4: MD_LINK_DESTINATION@689..713 + 0: MD_INLINE_ITEM_LIST@689..713 + 0: MD_TEXTUAL@689..690 + 0: MD_TEXTUAL_LITERAL@689..690 " " [] [] + 1: MD_TEXTUAL@690..713 + 0: MD_TEXTUAL_LITERAL@690..713 
"https://escaped.example" [] [] + 5: (empty) + 39: MD_NEWLINE@713..714 + 0: NEWLINE@713..714 "\n" [] [] + 40: MD_NEWLINE@714..715 + 0: NEWLINE@714..715 "\n" [] [] + 41: MD_PARAGRAPH@715..757 + 0: MD_INLINE_ITEM_LIST@715..757 + 0: MD_TEXTUAL@715..741 + 0: MD_TEXTUAL_LITERAL@715..741 "Escaped bracket in label: " [] [] + 1: MD_REFERENCE_LINK@741..756 + 0: L_BRACK@741..742 "[" [] [] + 1: MD_INLINE_ITEM_LIST@742..746 + 0: MD_TEXTUAL@742..746 + 0: MD_TEXTUAL_LITERAL@742..746 "text" [] [] + 2: R_BRACK@746..747 "]" [] [] + 3: MD_REFERENCE_LINK_LABEL@747..756 + 0: L_BRACK@747..748 "[" [] [] + 1: MD_INLINE_ITEM_LIST@748..755 + 0: MD_TEXTUAL@748..753 + 0: MD_TEXTUAL_LITERAL@748..753 "label" [] [] + 1: MD_TEXTUAL@753..755 + 0: MD_TEXTUAL_LITERAL@753..755 "\\]" [] [] + 2: R_BRACK@755..756 "]" [] [] + 2: MD_TEXTUAL@756..757 + 0: MD_TEXTUAL_LITERAL@756..757 "\n" [] [] + 1: (empty) + 2: EOF@757..757 "" [] [] ``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_image.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap new file mode 100644 index 000000000000..34a87e78ae26 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_image.md.snap @@ -0,0 +1,65 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has ![unclosed image + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "!" 
[] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@25..26 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@26..26 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..26 + 0: (empty) + 1: MD_BLOCK_LIST@0..26 + 0: MD_PARAGRAPH@0..26 + 0: MD_INLINE_ITEM_LIST@0..26 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "!" [] [] + 2: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "[" [] [] + 3: MD_TEXTUAL@11..25 + 0: MD_TEXTUAL_LITERAL@11..25 "unclosed image" [] [] + 4: MD_TEXTUAL@25..26 + 0: MD_TEXTUAL_LITERAL@25..26 "\n" [] [] + 1: (empty) + 2: EOF@26..26 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_link.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap new file mode 100644 index 000000000000..54da8f2881d6 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_link.md.snap @@ -0,0 +1,60 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has [unclosed link + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed link" 
[] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@24..24 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..24 + 0: (empty) + 1: MD_BLOCK_LIST@0..24 + 0: MD_PARAGRAPH@0..24 + 0: MD_INLINE_ITEM_LIST@0..24 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "[" [] [] + 2: MD_TEXTUAL@10..23 + 0: MD_TEXTUAL_LITERAL@10..23 "unclosed link" [] [] + 3: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] + 1: (empty) + 2: EOF@24..24 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap similarity index 58% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap index 9e23a10310a5..eff2d1ec3c51 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_image_label.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_image_label.md.snap @@ -33,15 +33,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@14..15 "]" [] [], }, - MdReferenceLink { - l_brack_token: L_BRACK@15..16 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], - }, - ], - r_brack_token: missing 
(required), - label: missing (optional), + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@30..31 "\n" [] [], @@ -72,37 +68,13 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@11..14 "alt" [] [] 4: MD_TEXTUAL@14..15 0: MD_TEXTUAL_LITERAL@14..15 "]" [] [] - 5: MD_REFERENCE_LINK@15..30 - 0: L_BRACK@15..16 "[" [] [] - 1: MD_INLINE_ITEM_LIST@16..30 - 0: MD_TEXTUAL@16..30 - 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] - 2: (empty) - 3: (empty) - 6: MD_TEXTUAL@30..31 + 5: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "[" [] [] + 6: MD_TEXTUAL@16..30 + 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] + 7: MD_TEXTUAL@30..31 0: MD_TEXTUAL_LITERAL@30..31 "\n" [] [] 1: (empty) 2: EOF@31..31 "" [] [] ``` - -## Diagnostics - -``` -unclosed_reference_image_label.md:1:16 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. 
- - > 1 │ This has ![alt][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has ![alt][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap similarity index 55% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap index 4414aecd6fdf..56fa9b097806 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_reference_link_label.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_reference_link_label.md.snap @@ -30,15 +30,11 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@14..15 "]" [] [], }, - MdReferenceLink { - l_brack_token: L_BRACK@15..16 "[" [] [], - text: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], - }, - ], - r_brack_token: missing (required), - label: missing (optional), + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "[" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@30..31 "\n" [] [], @@ -67,37 +63,13 @@ MdDocument { 0: MD_TEXTUAL_LITERAL@10..14 "text" [] [] 3: MD_TEXTUAL@14..15 0: MD_TEXTUAL_LITERAL@14..15 "]" [] [] - 4: MD_REFERENCE_LINK@15..30 - 0: 
L_BRACK@15..16 "[" [] [] - 1: MD_INLINE_ITEM_LIST@16..30 - 0: MD_TEXTUAL@16..30 - 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] - 2: (empty) - 3: (empty) - 5: MD_TEXTUAL@30..31 + 4: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "[" [] [] + 5: MD_TEXTUAL@16..30 + 0: MD_TEXTUAL_LITERAL@16..30 "unclosed label" [] [] + 6: MD_TEXTUAL@30..31 0: MD_TEXTUAL_LITERAL@30..31 "\n" [] [] 1: (empty) 2: EOF@31..31 "" [] [] ``` - -## Diagnostics - -``` -unclosed_reference_link_label.md:1:16 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed link, expected `]` to close link text. - - > 1 │ This has [text][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i link started here - - > 1 │ This has [text][unclosed label - │ ^^^^^^^^^^^^^^^ - 2 │ - - i Format: [link text](url) - -``` From 9a4dae0de22f6e0fb300bbf6132338d01b2da55c Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Sun, 25 Jan 2026 10:18:09 -0500 Subject: [PATCH 03/26] fix(markdown): enhance inline element parsing and emphasis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refines the parsing logic for inline elements, focusing on emphasis delimiters, code spans, and line breaks. Changes include: - Refactor emphasis matching to strictly follow the "rule of 3" (left-flanking/right-flanking) logic. - Allow emphasis and code spans to span multiple lines correctly. - Treat unmatched code span openers strictly as literal text. - Introduce a shared CommonMark punctuation table for consistent character handling. - Fix edge cases for hard line breaks and whitespace handling within inline elements. - Address specific CommonMark examples (518–603, 636-646) related to inline nesting and precedence. 
diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs index ec5a9ae0f2..8122d861de 100644 --- a/crates/biome_markdown_parser/src/lexer/mod.rs +++ b/crates/biome_markdown_parser/src/lexer/mod.rs @@ -20,6 +20,8 @@ use biome_unicode_table::lookup_byte; /// - `FencedCodeBlock`: Inside fenced code block, no markdown parsing /// - `HtmlBlock`: Inside HTML block, minimal markdown parsing /// - `LinkDefinition`: Inside link reference definition, whitespace separates tokens +/// - `CodeSpan`: Inside inline code span, backslashes are literal (no escapes) +/// - `EmphasisInline`: Emit single STAR/UNDERSCORE tokens for partial delimiter consumption #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum MarkdownLexContext { /// Normal markdown parsing with full inline element detection. @@ -39,6 +41,16 @@ pub enum MarkdownLexContext { /// In this context, whitespace is significant and separates destination from title. /// Text tokens stop at whitespace to allow proper parsing. LinkDefinition, + /// Inside an inline code span. + /// Per CommonMark §6.1, backslash escapes are not processed inside code spans. + /// Backslash is treated as a literal character, not an escape. + CodeSpan, + /// Inside emphasis delimiter processing. + /// In this context, `*` and `_` are always emitted as single-character tokens + /// (STAR, UNDERSCORE) rather than double tokens (DOUBLE_STAR, DOUBLE_UNDERSCORE). + /// This allows partial consumption of delimiter runs when the match algorithm + /// determines only 1 char should be used from a 2-char run. + EmphasisInline, } impl LexContext for MarkdownLexContext { @@ -57,6 +69,10 @@ pub enum MarkdownReLexContext { Regular, /// Re-lex for link definition context where whitespace is significant. LinkDefinition, + /// Re-lex for emphasis inline context where `*` and `_` emit single tokens. 
+ /// Used when the emphasis matching algorithm needs to partially consume + /// a DOUBLE_STAR or DOUBLE_UNDERSCORE token. + EmphasisInline, } /// An extremely fast, lookup table based, lossless Markdown lexer @@ -230,9 +246,14 @@ impl<'src> MarkdownLexer<'src> { // - In middle of line: whitespace is just text content, include in textual token // - Exception: 2+ spaces before newline is a hard line break // - In LinkDefinition context: whitespace is always significant (separates destination from title) + // - In CodeSpan context: whitespace is literal content, no hard-line-break detection WHS => { if current == b'\n' || current == b'\r' { self.consume_newline() + } else if matches!(context, MarkdownLexContext::CodeSpan) { + // In code span context, whitespace is literal content. + // No hard-line-break detection - the renderer normalizes line endings to spaces. + self.consume_textual(context) } else if matches!(context, MarkdownLexContext::LinkDefinition) { // In link definition context, whitespace separates tokens. // We consume it as textual literal so it's not treated as trivia by the parser. @@ -267,7 +288,15 @@ impl<'src> MarkdownLexer<'src> { PNC => self.consume_byte(R_PAREN), COL => self.consume_byte(COLON), AMP => self.consume_entity_or_textual(context), - BSL => self.consume_escape(), + BSL => { + // Per CommonMark §6.1, backslash escapes are NOT processed inside code spans. + // Backslash is literal, so `\`` produces a literal backslash followed by backtick. + if matches!(context, MarkdownLexContext::CodeSpan) { + self.consume_textual(context) + } else { + self.consume_escape() + } + } // = at line start could be setext heading underline EQL if self.after_newline => self.consume_setext_underline_or_textual(), _ => { @@ -753,6 +782,19 @@ impl<'src> MarkdownLexer<'src> { // Not a thematic break - restore position and consume as emphasis marker self.position = start_position; + // In EmphasisInline context, always emit single tokens for * and _. 
+ // This allows partial consumption of delimiter runs when the match algorithm + // determines only 1 char should be used from a 2-char run. + if matches!(context, MarkdownLexContext::EmphasisInline) { + self.advance(1); + return match start_char { + b'*' => STAR, + b'_' => UNDERSCORE, + b'-' => MINUS, + _ => unreachable!(), + }; + } + // Check for double emphasis markers (**, __) // Note: -- is not valid markdown emphasis, so we don't check for it if start_char != b'-' && self.peek_byte() == Some(start_char) { @@ -1200,6 +1242,7 @@ impl<'src> ReLexer<'src> for MarkdownLexer<'src> { let lex_context = match context { MarkdownReLexContext::Regular => MarkdownLexContext::Regular, MarkdownReLexContext::LinkDefinition => MarkdownLexContext::LinkDefinition, + MarkdownReLexContext::EmphasisInline => MarkdownLexContext::EmphasisInline, }; let re_lexed_kind = match self.current_byte() { diff --git a/crates/biome_markdown_parser/src/parser.rs b/crates/biome_markdown_parser/src/parser.rs index 2ac966faa0..877750fa61 100644 --- a/crates/biome_markdown_parser/src/parser.rs +++ b/crates/biome_markdown_parser/src/parser.rs @@ -208,6 +208,28 @@ impl<'source> MarkdownParser<'source> { .force_relex_in_context(crate::lexer::MarkdownLexContext::Regular); } + /// Force re-lex the current token in CodeSpan context. + /// In this context, backslash is literal (not an escape character). + /// Used for autolinks where `\>` should be `\` + `>` as separate tokens. + pub(crate) fn force_relex_code_span(&mut self) { + self.source + .force_relex_in_context(crate::lexer::MarkdownLexContext::CodeSpan); + } + + /// Re-lex the current token as single-char emphasis delimiter. + /// + /// Use this when the emphasis matching algorithm needs to partially consume + /// a DOUBLE_STAR or DOUBLE_UNDERSCORE token. After re-lexing, the token will + /// be either STAR or UNDERSCORE (single char). + /// + /// # Safety + /// Only call on the current token, NOT inside lookahead closures. 
+ /// This invalidates any buffered lookahead, so ensure no lookahead is active. + pub(crate) fn force_relex_emphasis_inline(&mut self) -> MarkdownSyntaxKind { + self.source + .re_lex(crate::lexer::MarkdownReLexContext::EmphasisInline) + } + pub(crate) fn set_force_ordered_list_marker(&mut self, value: bool) { self.source.set_force_ordered_list_marker(value); } @@ -218,6 +240,7 @@ impl<'source> MarkdownParser<'source> { self.source.bump_link_definition(); } + pub fn checkpoint(&self) -> MarkdownParserCheckpoint { MarkdownParserCheckpoint { context: self.context.checkpoint(), diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index f336b37b33..87d649e360 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -41,8 +41,9 @@ use biome_markdown_syntax::T; use biome_markdown_syntax::kind::MarkdownSyntaxKind::*; use biome_parser::Parser; use biome_parser::prelude::ParsedSyntax::{self, *}; +use biome_unicode_table::is_unicode_punctuation; -use biome_rowan::{TextRange, TextSize}; +use biome_rowan::TextRange; use crate::MarkdownParser; use crate::link_reference::normalize_reference_label; @@ -71,6 +72,10 @@ struct DelimRun { can_close: bool, /// Byte offset in the source where this run starts start_offset: usize, + /// Bracket nesting depth for scoping emphasis within link text. + /// Delimiters inside brackets (links) should only match with each other, + /// not with delimiters outside the brackets. 0 = outside brackets. + label_id: usize, } /// A matched emphasis span (opener + closer) @@ -89,45 +94,14 @@ fn is_whitespace(c: char) -> bool { c.is_whitespace() } +fn is_emphasis_marker(c: char) -> bool { + matches!(c, '*' | '_') +} + /// Check if a character is Unicode punctuation for flanking rules. /// Per CommonMark spec, this includes ASCII punctuation and Unicode punctuation categories. 
fn is_punctuation(c: char) -> bool { - // ASCII punctuation + Unicode punctuation categories - matches!( - c, - '!' | '"' - | '#' - | '$' - | '%' - | '&' - | '\'' - | '(' - | ')' - | '*' - | '+' - | ',' - | '-' - | '.' - | '/' - | ':' - | ';' - | '<' - | '=' - | '>' - | '?' - | '@' - | '[' - | '\\' - | ']' - | '^' - | '_' - | '`' - | '{' - | '|' - | '}' - | '~' - ) || c.is_ascii_punctuation() - || matches!(c, '\u{2000}'..='\u{206F}' | '\u{2E00}'..='\u{2E7F}') + is_unicode_punctuation(c) } /// Check if an opening delimiter is left-flanking per CommonMark rules. @@ -138,6 +112,7 @@ fn is_left_flanking_delimiter(char_after: Option, char_before: Option false, // At end of input, can't be left-flanking Some(c) if is_whitespace(c) => false, // Followed by whitespace + Some(c) if is_emphasis_marker(c) => true, Some(c) if is_punctuation(c) => { // Followed by punctuation - only left-flanking if preceded by whitespace or punctuation match char_before { @@ -157,6 +132,7 @@ fn is_right_flanking_delimiter(char_before: Option, char_after: Option false, // At start of input, can't be right-flanking Some(c) if is_whitespace(c) => false, // Preceded by whitespace + Some(c) if is_emphasis_marker(c) => true, Some(c) if is_punctuation(c) => { // Preceded by punctuation - only right-flanking if followed by whitespace or punctuation match char_after { @@ -209,14 +185,147 @@ fn can_underscore_close(char_before: Option, char_after: Option) -> /// This is the first pass of the CommonMark emphasis algorithm. It scans /// the source text and identifies all potential delimiter runs (sequences /// of `*` or `_`), computing their flanking status. -fn collect_delimiter_runs(source: &str) -> Vec { +/// Result of checking if a bracket forms a valid link. +/// Contains the closing bracket position if found. 
+struct BracketCheckResult { + /// Position of the closing `]` (or 0 if not found) + close_pos: usize, + /// Whether this is a valid inline link `[...](` or full reference `[...][` + is_inline_or_full_ref: bool, +} + +/// Check if a bracket at position `start` forms a valid link pattern. +/// Returns the closing bracket position and whether it's an inline link or full reference. +fn check_bracket_pattern(bytes: &[u8], start: usize) -> Option { + if start >= bytes.len() || bytes[start] != b'[' { + return None; + } + + // Find matching ] with proper nesting + let mut depth = 1; + let mut i = start + 1; + while i < bytes.len() && depth > 0 { + match bytes[i] { + b'[' => depth += 1, + b']' => depth -= 1, + b'\\' if i + 1 < bytes.len() => i += 1, // Skip escaped char + b'`' => { + // Skip code spans + let backtick_count = { + let mut c = 1; + while i + c < bytes.len() && bytes[i + c] == b'`' { + c += 1; + } + c + }; + i += backtick_count; + while i < bytes.len() { + if bytes[i] == b'`' { + let close_count = { + let mut c = 1; + while i + c < bytes.len() && bytes[i + c] == b'`' { + c += 1; + } + c + }; + i += close_count; + if close_count == backtick_count { + break; + } + } else { + i += 1; + } + } + continue; + } + b'<' => { + // Skip potential HTML/autolinks + i += 1; + while i < bytes.len() && bytes[i] != b'>' && bytes[i] != b'\n' { + i += 1; + } + if i < bytes.len() && bytes[i] == b'>' { + i += 1; + } + continue; + } + _ => {} + } + i += 1; + } + + if depth != 0 { + return None; + } + + // i now points to position after `]` + let close_pos = i - 1; + let is_inline_or_full_ref = i < bytes.len() && (bytes[i] == b'(' || bytes[i] == b'['); + + Some(BracketCheckResult { + close_pos, + is_inline_or_full_ref, + }) +} + +/// Extract label text from a bracket pattern for reference lookup. 
+fn extract_label_text(source: &str, start: usize, close_pos: usize) -> &str { + if start < close_pos && close_pos <= source.len() { + &source[start + 1..close_pos] + } else { + "" + } +} + +fn collect_delimiter_runs(source: &str, reference_checker: impl Fn(&str) -> bool) -> Vec { let mut runs = Vec::new(); let bytes = source.as_bytes(); let mut i = 0; + // Pre-compute valid link bracket positions. + // A bracket is considered a valid link if: + // 1. It's followed by `(` (inline link) or `[` (full reference), OR + // 2. It's a shortcut reference with a defined reference (checked via reference_checker) + let mut link_bracket_starts = Vec::new(); + for pos in 0..bytes.len() { + if bytes[pos] == b'[' + && let Some(result) = check_bracket_pattern(bytes, pos) + { + if result.is_inline_or_full_ref { + // Inline link or full reference link + link_bracket_starts.push(pos); + } else { + // Could be a shortcut reference - check if definition exists + let label = extract_label_text(source, pos, result.close_pos); + let normalized = normalize_reference_label(label); + if !normalized.is_empty() && reference_checker(&normalized) { + link_bracket_starts.push(pos); + } + } + } + } + + // Track bracket depth, but only for valid link brackets + let mut bracket_depth = 0usize; + let mut active_link_brackets: Vec = Vec::new(); + while i < bytes.len() { let b = bytes[i]; + // Track bracket depth for valid links only + if b == b'[' && link_bracket_starts.contains(&i) { + bracket_depth += 1; + active_link_brackets.push(i); + i += 1; + continue; + } + if b == b']' && !active_link_brackets.is_empty() { + bracket_depth = bracket_depth.saturating_sub(1); + active_link_brackets.pop(); + i += 1; + continue; + } + // Check for delimiter characters if b == b'*' || b == b'_' { let kind = if b == b'*' { @@ -265,6 +374,10 @@ fn collect_delimiter_runs(source: &str) -> Vec { can_open, can_close, start_offset, + // Only scope by bracket depth when inside a valid link pattern. 
+ // This prevents emphasis from spanning link boundaries, but allows + // emphasis to span brackets that don't form valid links. + label_id: bracket_depth, }); i = end_offset; @@ -330,41 +443,38 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { if runs[idx].can_close && runs[idx].count > 0 { loop { let mut opener_stack_pos = None; - let prefer_strong = runs[idx].count >= 2; - - for pass in 0..2 { - for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() { - let opener = &runs[opener_idx]; - let closer = &runs[idx]; - if opener.kind != closer.kind || !opener.can_open || opener.count == 0 { - continue; - } - - if prefer_strong && pass == 0 && opener.count < 2 { - continue; - } - - // Rule of 3: if (opener_count + closer_count) % 3 == 0 and - // the closer can open or the opener can close, skip unless - // both counts are divisible by 3 - let opener_count = opener.count; - let closer_count = closer.count; - if ((opener.can_open && opener.can_close) - || (closer.can_open && closer.can_close)) - && (opener_count + closer_count).is_multiple_of(3) - && (!opener_count.is_multiple_of(3) || !closer_count.is_multiple_of(3)) - { - continue; - } + // Search backward for the closest matching opener. + // Per CommonMark spec, we find any matching opener first, + // then determine strong vs regular based on both counts. + for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() { + let opener = &runs[opener_idx]; + let closer = &runs[idx]; + + // Only match within same bracket scope (label_id). + // This prevents emphasis from spanning link boundaries. 
+ if opener.label_id != closer.label_id { + continue; + } - opener_stack_pos = Some(pos); - break; + if opener.kind != closer.kind || !opener.can_open || opener.count == 0 { + continue; } - if opener_stack_pos.is_some() { - break; + // Rule of 3: if (opener_count + closer_count) % 3 == 0 and + // the closer can open or the opener can close, skip unless + // both counts are divisible by 3 + let opener_count = opener.count; + let closer_count = closer.count; + if (opener.can_close || closer.can_open) + && !closer_count.is_multiple_of(3) + && (opener_count + closer_count).is_multiple_of(3) + { + continue; } + + opener_stack_pos = Some(pos); + break; } let Some(pos) = opener_stack_pos else { break }; @@ -375,7 +485,11 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { 1 }; - let opener_start = runs[opener_idx].start_offset; + // Openers consume from END of run (leftover stays at beginning). + // This ensures for `***foo***`, the inner `**` is consumed leaving `*` at start. + let opener_start = + runs[opener_idx].start_offset + runs[opener_idx].count - use_count; + // Closers consume from BEGINNING of what remains. let closer_start = runs[idx].start_offset; matches.push(EmphasisMatch { @@ -384,8 +498,9 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { is_strong: use_count == 2, }); + // Opener: reduce count but keep start_offset (leftover is at beginning) runs[opener_idx].count -= use_count; - runs[opener_idx].start_offset += use_count; + // Closer: reduce count and advance start_offset (leftover is at end) runs[idx].count -= use_count; runs[idx].start_offset += use_count; @@ -395,10 +510,10 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { opener_stack.pop(); } - if use_count == 2 && runs[opener_idx].count > 0 && runs[idx].count > 0 { - // Avoid crossing matches from odd-length runs (e.g. ***foo***). 
- break; - } + // Note: With the "consume from END" algorithm for openers, + // crossing matches are no longer an issue because the leftover + // chars end up at the beginning of the opener run (wrapping + // around the inner match), not at the end (which would cross). if runs[idx].count == 0 { break; @@ -426,10 +541,26 @@ pub(crate) struct EmphasisContext { base_offset: usize, } +/// Information about a match found within a token's range. +/// Used when the opener doesn't start at the exact token boundary. +#[derive(Debug)] +struct OpenerMatch<'a> { + /// The matched emphasis span + matched: &'a EmphasisMatch, + /// How many chars before opener_start (literal prefix to emit) + prefix_len: usize, +} + impl EmphasisContext { - /// Create a new emphasis context by analyzing the source text - pub(crate) fn new(source: &str, base_offset: usize) -> Self { - let mut runs = collect_delimiter_runs(source); + /// Create a new emphasis context by analyzing the source text. + /// The reference_checker function is used to determine if a bracket pattern + /// is a valid shortcut reference link. + pub(crate) fn new( + source: &str, + base_offset: usize, + reference_checker: impl Fn(&str) -> bool, + ) -> Self { + let mut runs = collect_delimiter_runs(source, reference_checker); let matches = match_delimiters(&mut runs); Self { matches, @@ -437,12 +568,44 @@ impl EmphasisContext { } } - /// Check if there's an emphasis opener at the given offset - fn opener_at(&self, offset: usize) -> Option<&EmphasisMatch> { - let abs_offset = offset; - self.matches - .iter() - .find(|m| m.opener_start + self.base_offset == abs_offset) + /// Find the *earliest* match whose opener_start is within [token_start, token_end) + /// and matches the expected `is_strong` value. + /// Returns None if no match found, or the match plus prefix length. 
+ /// + /// This is used instead of exact offset matching because with the "consume from END" + /// algorithm, an opener might start in the middle of a DOUBLE_STAR token. + fn opener_within( + &self, + token_start: usize, + token_len: usize, + expect_strong: bool, + ) -> Option> { + let token_end = token_start + token_len; + let mut best: Option> = None; + + for m in &self.matches { + // Filter by expected emphasis type + if m.is_strong != expect_strong { + continue; + } + + let abs_opener = m.opener_start + self.base_offset; + if abs_opener >= token_start && abs_opener < token_end { + let candidate = OpenerMatch { + matched: m, + prefix_len: abs_opener - token_start, + }; + // Pick the earliest match (smallest prefix_len) + if best + .as_ref() + .is_none_or(|b| candidate.prefix_len < b.prefix_len) + { + best = Some(candidate); + } + } + } + + best } } @@ -458,72 +621,143 @@ pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax { return Absent; } + let ends_block = p.lookahead(|p| { + p.bump(MD_HARD_LINE_LITERAL); + p.at(NEWLINE) || p.at(EOF) + }); + + if ends_block { + return super::parse_textual(p); + } + let m = p.start(); p.bump(MD_HARD_LINE_LITERAL); Present(m.complete(p, MD_HARD_LINE)) } +/// Check if there's a matching closing backtick sequence before EOF/blank line. +/// +/// Per CommonMark §6.1, a code span opener must have a matching closer with the +/// same number of backticks. If no match exists, the opener should be treated +/// as literal text, not an unclosed code span. +/// +/// Returns false if no match found (opener should become literal text). 
+ +fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool { + use crate::lexer::MarkdownLexContext; + + p.lookahead(|p| { + // Skip the opening backticks + p.bump(BACKTICK); + + loop { + // EOF = no matching closer found + if p.at(T![EOF]) { + return false; + } + + // Blank line = paragraph boundary, terminates search + if p.at(NEWLINE) && p.at_blank_line() { + return false; + } + + // Per CommonMark §4.3, setext heading underlines take priority over + // inline code spans. If crossing a newline would land on a setext + // underline, the code span is invalid — the underline forms a heading. + if p.at(NEWLINE) { + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); + if crate::syntax::at_setext_underline_after_newline(p).is_some() { + return false; + } + continue; + } + + // Found backticks - check if they match + if p.at(BACKTICK) { + let closing_count = p.cur_text().len(); + if closing_count == opening_count { + return true; + } + // Not matching - continue searching + p.bump(BACKTICK); + continue; + } + + // Consume token and continue (use CodeSpan context for proper backslash handling) + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); + } + }) +} + /// Parse inline code span (`` `code` `` or ``` `` `code` `` ```). /// /// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`' /// -/// Per CommonMark, code spans can use multiple backticks to allow literal -/// backticks inside: ``` `` `code` `` ``` wraps around code containing backticks. -/// The opening and closing backtick strings must be the same length. 
+/// Per CommonMark §6.1: +/// - Code spans can use multiple backticks to allow literal backticks inside +/// - The opening and closing backtick strings must be the same length +/// - Backslash escapes are NOT processed inside code spans (\` is literal `\``) +/// - If no matching closer exists, the opener is treated as literal text pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax { + use crate::lexer::MarkdownLexContext; + if !p.at(BACKTICK) { return Absent; } - let m = p.start(); - - // Count opening backticks from token text let opening_count = p.cur_text().len(); - let opening_range = p.cur_range(); + + // DESIGN PRINCIPLE #2 & #4: Check for matching closer BEFORE creating any nodes. + // If no match exists, return Absent so backticks become literal text. + // This avoids synthesizing MD_INLINE_CODE with missing r_tick_token. + if !has_matching_code_span_closer(p, opening_count) { + return Absent; // Caller will treat backtick as literal MD_TEXTUAL + } + + // We have a valid code span - now parse it + let m = p.start(); // Opening backtick(s) p.bump(BACKTICK); - // Content - parse until we find a BACKTICK with matching count, or EOF + // Content - parse until we find matching closing backticks + // Per CommonMark, code spans can span multiple lines (newlines become spaces in output) + // All content is lexed in CodeSpan context to keep backslash literal and avoid + // hard-line-break detection. 
let content = p.start(); - let mut found_closing = false; loop { - if p.at_inline_end() { + // EOF should not happen (lookahead guaranteed a closer), but handle defensively + if p.at(T![EOF]) { break; } - // Check for matching closing backticks - if p.at(BACKTICK) { - let closing_count = p.cur_text().len(); - if closing_count == opening_count { - // Found matching closing backticks - found_closing = true; - break; + // DESIGN PRINCIPLE #3: Terminate on blank line (paragraph boundary) + if p.at(NEWLINE) { + if p.at_blank_line() { + break; // Paragraph boundary - stop } - // Not matching - consume as content + // Soft line break - consume NEWLINE as content and continue + // Use CodeSpan context so next token is also lexed without escape processing let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); text_m.complete(p, MD_TEXTUAL); continue; } - // Regular content + // Found matching closing backticks + if p.at(BACKTICK) && p.cur_text().len() == opening_count { + break; + } + + // DESIGN PRINCIPLE #1: Use CodeSpan context so backslash is literal let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); text_m.complete(p, MD_TEXTUAL); } content.complete(p, MD_INLINE_ITEM_LIST); - // Closing backtick(s) - emit custom diagnostic if missing - if found_closing { - p.bump(BACKTICK); - } else { - p.error(super::parse_error::unclosed_code_span( - p, - opening_range, - opening_count, - )); - } + // Closing backticks (guaranteed to exist due to lookahead check) + p.bump(BACKTICK); Present(m.complete(p, MD_INLINE_CODE)) } @@ -535,46 +769,95 @@ fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> P None => return Absent, }; - let offset = u32::from(p.cur_range().start()) as usize; - let matched = match context.opener_at(offset) { - Some(matched) => matched, + // Must be at an emphasis 
token + if !p.at(DOUBLE_STAR) && !p.at(DOUBLE_UNDERSCORE) && !p.at(T![*]) && !p.at(UNDERSCORE) { + return Absent; + } + + // Get current token info BEFORE any re-lex + let token_start = u32::from(p.cur_range().start()) as usize; + let token_len: usize = p.cur_range().len().into(); + + // Find match within current token's range that has the expected is_strong value + let opener_match = match context.opener_within(token_start, token_len, expect_strong) { + Some(m) => m, None => return Absent, }; - if matched.is_strong != expect_strong { + // If the opener doesn't start at the exact token boundary, return Absent. + // The caller (parse_any_inline) will emit literal text, advancing the parser position. + // On subsequent calls, we'll eventually be at the correct position with prefix_len == 0. + if opener_match.prefix_len > 0 { return Absent; } - let (opener_kind, closer_kind, opener_text) = if expect_strong { - if p.at(DOUBLE_STAR) { - (DOUBLE_STAR, DOUBLE_STAR, "**") - } else if p.at(DOUBLE_UNDERSCORE) { - (DOUBLE_UNDERSCORE, DOUBLE_UNDERSCORE, "__") - } else { - return Absent; - } - } else if p.at(T![*]) { - (T![*], T![*], "*") - } else if p.at(UNDERSCORE) { - (UNDERSCORE, UNDERSCORE, "_") - } else { - return Absent; + // Extract values before dropping the borrow on context + let use_count = if expect_strong { 2 } else { 1 }; + let closer_offset = opener_match.matched.closer_start + context.base_offset; + // Use the correct delimiter character for error messages + let is_underscore = p.at(DOUBLE_UNDERSCORE) || p.at(UNDERSCORE); + let opener_text = match (expect_strong, is_underscore) { + (true, true) => "__", + (true, false) => "**", + (false, true) => "_", + (false, false) => "*", }; - let closer_offset = matched.closer_start + context.base_offset; let m = p.start(); let opening_range = p.cur_range(); - p.bump(opener_kind); + // Consume opener tokens + // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one. 
+ // Only re-lex when we need to consume a partial token or single chars. + if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) { + // Bump the double token as a single unit + p.bump_any(); + } else { + // Consume individual tokens + for _ in 0..use_count { + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + p.bump_any(); + } + } + // Parse content until we reach the closer let content = p.start(); loop { - if p.at_inline_end() { + // EOF always ends content + if p.at(T![EOF]) { break; } let current_offset = u32::from(p.cur_range().start()) as usize; - if current_offset == closer_offset { + let current_len: usize = p.cur_range().len().into(); + + // Check if closer is AT or WITHIN current token + if closer_offset >= current_offset && closer_offset < current_offset + current_len { + break; + } + + // Check if we've passed the closer (can happen when link parsing consumes past it) + if current_offset > closer_offset { + break; + } + + // Handle NEWLINE: emphasis can span multiple lines per CommonMark + // But blank lines end paragraphs, so stop there + if p.at(NEWLINE) { + if p.at_blank_line() { + // Blank line = paragraph boundary, emphasis is unclosed + break; + } + if closer_offset > current_offset { + // Soft line break - consume NEWLINE as textual content and continue + let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + continue; + } + // Closer should have been at or before this newline - stop break; } @@ -584,9 +867,45 @@ fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> P } content.complete(p, MD_INLINE_ITEM_LIST); - if p.at(closer_kind) && u32::from(p.cur_range().start()) as usize == closer_offset { - p.bump(closer_kind); + // Consume closer tokens (1 or 2) + // Handle partial closer consumption (e.g., `*foo**` where closer might be at offset 4 + // but token DOUBLE_STAR spans 4-6) + let current_offset = 
u32::from(p.cur_range().start()) as usize; + let closer_prefix_len = closer_offset.saturating_sub(current_offset); + + if closer_prefix_len > 0 { + // Closer starts AFTER token start - emit prefix as literal + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + for _ in 0..closer_prefix_len { + let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + } + } + + // Now consume actual closer delimiters + // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one. + let mut consumed_closer = 0; + if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) { + p.bump_any(); + consumed_closer = 2; } else { + for _ in 0..use_count { + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + if p.at(T![*]) || p.at(UNDERSCORE) || p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.bump_any(); + consumed_closer += 1; + } else { + break; + } + } + } + + if consumed_closer < use_count { p.error(super::parse_error::unclosed_emphasis( p, opening_range, @@ -633,6 +952,31 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS break; } + // IMPORTANT: Parse constructs that can contain `]` BEFORE checking for stop token. + // Per CommonMark, `]` inside code spans, autolinks, and HTML doesn't terminate links. 
+ + // Code spans can contain `]` + if p.at(BACKTICK) { + if parse_inline_code(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // Autolinks and inline HTML can contain `]` + if p.at(L_ANGLE) { + if parse_autolink(p).is_present() { + continue; + } + if parse_inline_html(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // NOW check for stop token (after constructs that can contain it) if p.at(stop) { if bracket_depth == 0 { break; @@ -661,6 +1005,86 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS has_nested_link } +/// Parse inline items until `stop` token, allowing full inline parsing including links. +/// Used for image alt text where nested links/images should be fully parsed +/// so their text content can be extracted for the alt attribute. +fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { + let m = p.start(); + let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + + loop { + if p.at(NEWLINE) { + if p.at_blank_line() { + break; + } + let _ = super::parse_textual(p); + continue; + } + + if p.at(T![EOF]) { + break; + } + + // Code spans can contain `]` + if p.at(BACKTICK) { + if parse_inline_code(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // Autolinks and inline HTML can contain `]` + if p.at(L_ANGLE) { + if parse_autolink(p).is_present() { + continue; + } + if parse_inline_html(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + // For image alt: allow full inline parsing including links and images + if p.at(L_BRACK) { + let result = parse_link_or_image(p, LinkParseKind::Link); + if result.is_present() { + continue; + } + 
bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + + if p.at(BANG) && p.nth_at(1, L_BRACK) { + let result = parse_link_or_image(p, LinkParseKind::Image); + if result.is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + if parse_any_inline(p).is_absent() { + break; + } + } + + m.complete(p, MD_INLINE_ITEM_LIST); + p.set_emphasis_context(prev_context); +} + fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { if !p.at(L_BRACK) { @@ -720,7 +1144,10 @@ fn set_inline_emphasis_context_until( source }; let base_offset = u32::from(p.cur_range().start()) as usize; - let context = EmphasisContext::new(inline_source, base_offset); + // Create a reference checker closure that uses the parser's link reference definitions + let context = EmphasisContext::new(inline_source, base_offset, |label| { + p.has_link_reference_definition(label) + }); p.set_emphasis_context(Some(context)) } @@ -851,21 +1278,6 @@ impl LinkParseKind { } } - fn report_unclosed_text(self, p: &mut MarkdownParser, opening_range: TextRange) { - match self { - Self::Link => p.error(super::parse_error::unclosed_link( - p, - opening_range, - "expected `]` to close link text", - )), - Self::Image => p.error(super::parse_error::unclosed_image( - p, - opening_range, - "expected `]` to close alt text", - )), - } - } - fn report_unclosed_destination(self, p: &mut MarkdownParser, opening_range: TextRange) { match self { Self::Link => p.error(super::parse_error::unclosed_link( @@ -897,30 +1309,28 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); + let has_nested_link = if matches!(kind, LinkParseKind::Image) { + // For images, allow full inline parsing (including links) in alt text. + // This lets nested links/images be parsed so their text can be extracted for alt. 
+ parse_inline_item_list_until(p, R_BRACK); + false + } else { + parse_inline_item_list_until_no_links(p, R_BRACK) + }; - // ] - if missing at inline end, emit diagnostic; otherwise rewind + // ] - if missing, rewind and treat [ as literal text. + // Per CommonMark, if there's no valid ] to close the link (e.g., all ] + // characters are inside code spans or HTML), the [ is literal text. + // NOTE: We intentionally do NOT emit an "unclosed link" diagnostic here. + // CommonMark treats unmatched `[` as literal text, not an error. if !p.eat(R_BRACK) { - if matches!(kind, LinkParseKind::Link) && has_nested_link { - m.abandon(p); - p.rewind(checkpoint); - return Absent; - } - if p.at_inline_end() { - // Unclosed link/image at end of inline content - emit diagnostic - // Expand range to include the text content, not just the opening bracket - let full_range = TextRange::new(opening_range.start(), p.cur_range().start()); - kind.report_unclosed_text(p, full_range); - // Return as reference link/image (shortcut) with missing closing bracket - return Present(m.complete(p, kind.reference_kind())); - } - // Not at inline end but missing ] - rewind and treat as text m.abandon(p); p.rewind(checkpoint); return Absent; } - let text_end_offset = p.cur_range().start(); + // Per CommonMark, a link (not image) whose text contains another link must fail. + // The inner link wins and the outer `[` becomes literal text. if matches!(kind, LinkParseKind::Link) && has_nested_link { m.abandon(p); p.rewind(checkpoint); @@ -1007,7 +1417,10 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, text_end_offset); + // Return Absent - the caller will treat `[` as textual. + // Don't consume the whole bracket sequence to avoid consuming + // past emphasis closers. 
+ return Absent; } Present(m.complete(p, kind.reference_kind())) @@ -1020,7 +1433,10 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, text_end_offset); + // Return Absent - the caller will treat `[` as textual. + // Don't consume the whole bracket sequence to avoid consuming + // past emphasis closers. + return Absent; } Present(m.complete(p, kind.reference_kind())) } @@ -1064,7 +1480,7 @@ fn lookahead_reference_common( p.bump(L_BRACK); - let link_text = collect_bracket_text(p)?; + let link_text = collect_link_text(p)?; // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty) let normalized_link = normalize_reference_label(&link_text); @@ -1080,7 +1496,7 @@ fn lookahead_reference_common( if p.at(L_BRACK) { p.bump(L_BRACK); - let label_text = collect_bracket_text(p); + let label_text = collect_label_text_simple(p); if let Some(label_text) = label_text { let label = if label_text.is_empty() { link_text.clone() @@ -1107,13 +1523,31 @@ fn lookahead_reference_common( }) } -fn collect_bracket_text(p: &mut MarkdownParser) -> Option { +/// Collect text for a link label (e.g., the `label` in `[text][label]`). +/// +/// Per CommonMark §4.7, link labels have specific rules: +/// - Unescaped square brackets are NOT allowed inside labels (see example 555) +/// - Backslash escapes ARE allowed (e.g., `\]` is a literal `]` in the label) +/// - No inline parsing (backticks, HTML, etc. are literal characters) +/// +/// We stop at the first R_BRACK token (unescaped `]`). Escaped brackets like `\]` +/// are lexed as MD_TEXTUAL_LITERAL, not R_BRACK, so they're included in the label. 
+fn collect_label_text_simple(p: &mut MarkdownParser) -> Option { let mut text = String::new(); + loop { if p.at(T![EOF]) || p.at_inline_end() { return None; } + // Blank lines terminate + if p.at(NEWLINE) && p.at_blank_line() { + return None; + } + + // R_BRACK token = unescaped `]` closes the label. + // Note: Escaped brackets (`\]`) are lexed as MD_TEXTUAL_LITERAL, + // not R_BRACK, so they're correctly included in the label text. if p.at(R_BRACK) { return Some(text); } @@ -1123,18 +1557,94 @@ fn collect_bracket_text(p: &mut MarkdownParser) -> Option { } } -fn consume_textual_until_offset(p: &mut MarkdownParser, end_offset: TextSize) -> ParsedSyntax { - let mut last = Absent; +/// Collect text for link text (e.g., the `text` in `[text](url)` or `[text][label]`). +/// Per CommonMark, link text CAN contain inline elements - code spans, autolinks, HTML. +/// `]` inside these constructs does NOT close the link text. +fn collect_link_text(p: &mut MarkdownParser) -> Option { + let mut text = String::new(); + let mut bracket_depth = 0usize; - while !p.at(T![EOF]) { - let end = p.cur_range().end(); - last = super::parse_textual(p); - if end >= end_offset { - break; + loop { + if p.at(T![EOF]) || p.at_inline_end() { + return None; } - } - last + // Per CommonMark, blank lines terminate link text + if p.at(NEWLINE) && p.at_blank_line() { + return None; + } + + // Code spans can contain `]` - skip them entirely. + // Per CommonMark, `]` inside code spans doesn't terminate link text. 
+ if p.at(BACKTICK) { + let opening_count = p.cur_text().len(); + text.push_str(p.cur_text()); + p.bump(p.cur()); + + // Find matching closing backticks + let mut found_close = false; + while !p.at(T![EOF]) && !p.at_inline_end() { + if p.at(NEWLINE) && p.at_blank_line() { + break; // Blank line terminates + } + if p.at(BACKTICK) && p.cur_text().len() == opening_count { + text.push_str(p.cur_text()); + p.bump(p.cur()); + found_close = true; + break; + } + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + if !found_close { + // Unclosed code span - treat opening backticks as literal + // (already added to text, continue normally) + } + continue; + } + + // Autolinks and inline HTML can contain `]` - skip them entirely. + // Per CommonMark, `]` inside `<...>` constructs doesn't terminate link text. + if p.at(L_ANGLE) { + text.push_str(p.cur_text()); + p.bump(p.cur()); + + // Consume until `>` or newline + while !p.at(T![EOF]) && !p.at_inline_end() && !p.at(R_ANGLE) { + if p.at(NEWLINE) { + // Newlines end autolinks/HTML tags + break; + } + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + if p.at(R_ANGLE) { + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + continue; + } + + if p.at(L_BRACK) { + bracket_depth += 1; + text.push_str(p.cur_text()); + p.bump(p.cur()); + continue; + } + + if p.at(R_BRACK) { + if bracket_depth == 0 { + return Some(text); + } + bracket_depth -= 1; + text.push_str(p.cur_text()); + p.bump(p.cur()); + continue; + } + + text.push_str(p.cur_text()); + p.bump(p.cur()); + } } fn bump_textual_link_def(p: &mut MarkdownParser) { @@ -1522,19 +2032,30 @@ pub(crate) fn parse_inline_image(p: &mut MarkdownParser) -> ParsedSyntax { /// - Processing instructions: `` /// - Declarations: `` /// - CDATA: `` -fn is_inline_html(text: &str) -> Option { +pub(crate) fn is_inline_html(text: &str) -> Option { let bytes = text.as_bytes(); if bytes.len() < 2 || bytes[0] != b'<' { return None; } // HTML comment: + // Per CommonMark 0.31.2 §6.8, an HTML 
comment consists of ``, + // where text does not start with `>` or `->`, and does not end with `-`. + // Additionally, `` and `` are valid (degenerate) comments. if bytes.starts_with(b" + let rest = &bytes[4..]; + // Handle degenerate comments: and + if rest.starts_with(b">") { + return Some(5); // + } + if rest.starts_with(b"->") { + return Some(6); // + } + // Find closing --> after ") { let body = &text[4..4 + pos]; - // CommonMark: comment cannot start with '>' or '->', and must not contain "--" - if body.starts_with('>') || body.starts_with("->") || body.contains("--") { + // Body must not end with '-' + if body.ends_with('-') { return None; } return Some(4 + pos + 3); @@ -1658,8 +2179,17 @@ fn is_inline_html(text: &str) -> Option { let is_attr_name_continue = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b == b':' || b == b'.' || b == b'-'; + let mut need_space = true; + // We already know the boundary char was whitespace, so first iteration has space. + let mut had_space = true; + loop { - let had_space = skip_spaces(&mut i)?; + if need_space { + let s = skip_spaces(&mut i)?; + had_space = had_space || s; + } + need_space = true; + if i >= bytes.len() { return None; } @@ -1690,7 +2220,7 @@ fn is_inline_html(text: &str) -> Option { } // Optional whitespace and value - skip_spaces(&mut i)?; + had_space = skip_spaces(&mut i)?; if i < bytes.len() && bytes[i] == b'=' { i += 1; skip_spaces(&mut i)?; @@ -1740,7 +2270,11 @@ fn is_inline_html(text: &str) -> Option { } } } + // After value, need to find whitespace at top of loop + had_space = false; } + // If no '=' was found, `had_space` from skip_spaces above carries over + // as the separator for the next attribute (boolean attribute case). } } @@ -1785,6 +2319,12 @@ pub(crate) fn parse_inline_html(p: &mut MarkdownParser) -> ParsedSyntax { None => return Absent, }; + // Per CommonMark §4.3, setext heading underlines take priority over inline HTML. 
+ // If this HTML tag spans across a line that is a setext underline, treat `<` as literal. + if crate::syntax::inline_span_crosses_setext(p, html_len) { + return Absent; + } + // Valid inline HTML - create the node // Use checkpoint so we can rewind if token boundaries don't align let checkpoint = p.checkpoint(); @@ -1957,18 +2497,29 @@ pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax { // < p.bump(L_ANGLE); - // Content as inline item list containing textual nodes - let content = p.start(); - while !p.at(R_ANGLE) && !p.at_inline_end() { + // Content as inline item list containing textual nodes. + // Autolinks don't process backslash escapes, but the lexer may combine + // `\>` into a single escape token. We re-lex in CodeSpan context where + // backslash is literal, so `\` and `>` are separate tokens. + p.force_relex_code_span(); + + let content_m = p.start(); + while !p.at(R_ANGLE) && !p.at(T![EOF]) && !p.at_inline_end() { let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context( + MD_TEXTUAL_LITERAL, + crate::lexer::MarkdownLexContext::CodeSpan, + ); text_m.complete(p, MD_TEXTUAL); } - content.complete(p, MD_INLINE_ITEM_LIST); + content_m.complete(p, MD_INLINE_ITEM_LIST); // > p.expect(R_ANGLE); + // Re-lex back to regular context + p.force_relex_regular(); + Present(m.complete(p, MD_AUTOLINK)) } @@ -1977,15 +2528,29 @@ pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax { if p.at(MD_HARD_LINE_LITERAL) { parse_hard_line(p) } else if p.at(BACKTICK) { - parse_inline_code(p) - } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { - // Try emphasis, fall back to literal text if flanking rules fail - let result = parse_inline_emphasis(p); + // Try code span, fall back to literal text if no matching closer exists + let result = parse_inline_code(p); if result.is_absent() { super::parse_textual(p) } else { result } + } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + // For cases like 
`***foo***`, the em match starts at the exact token boundary + // (prefix_len=0) while the strong match starts at offset 1 (prefix_len=1). + // Try italic first to handle nested emphasis correctly, then try strong. + let result = parse_inline_italic(p); + if result.is_present() { + return result; + } + let result = parse_inline_emphasis(p); + if result.is_present() { + return result; + } + // Neither matched - re-lex to single token and emit just one char as literal. + // This handles cases like `**foo*` where opener is at offset 1. + p.force_relex_emphasis_inline(); + super::parse_textual(p) } else if p.at(T![*]) || p.at(UNDERSCORE) { // Try italic, fall back to literal text if flanking rules fail let result = parse_inline_italic(p); diff --git a/crates/biome_markdown_parser/src/syntax/parse_error.rs b/crates/biome_markdown_parser/src/syntax/parse_error.rs index 8f864cc0ac..97ce5288ac 100644 --- a/crates/biome_markdown_parser/src/syntax/parse_error.rs +++ b/crates/biome_markdown_parser/src/syntax/parse_error.rs @@ -29,26 +29,6 @@ pub(crate) fn unclosed_emphasis( )) } -/// Unclosed inline code span. -/// -/// ```markdown -/// `code -/// ^ expected closing ` -/// ``` -pub(crate) fn unclosed_code_span( - p: &MarkdownParser, - opening_range: TextRange, - backtick_count: usize, -) -> ParseDiagnostic { - let backticks = "`".repeat(backtick_count); - p.err_builder( - format!("Unclosed code span, expected closing {backticks}."), - opening_range, - ) - .with_detail(opening_range, "code span started here") - .with_hint(format!("Add closing {backticks} to close the code span.")) -} - /// Unclosed inline link. 
/// /// ```markdown diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index 0512db55fe..90d75ad412 100644 --- a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -1047,9 +1047,7 @@ fn render_inline_link(link: &MdInlineLink, ctx: &HtmlRenderContext, out: &mut St /// Render an inline image. fn render_inline_image(img: &MdInlineImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = render_inline_list(&img.alt(), ctx); - // Strip HTML tags from alt text - let alt = strip_html_tags(&alt); + let alt = extract_alt_text(&img.alt(), ctx); let dest = collect_inline_text(&img.destination()); let dest = process_link_destination(&dest); @@ -1109,8 +1107,7 @@ fn render_reference_link(link: &MdReferenceLink, ctx: &HtmlRenderContext, out: & /// Render a reference image. fn render_reference_image(img: &MdReferenceImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = render_inline_list(&img.alt(), ctx); - let alt = strip_html_tags(&alt); + let alt = extract_alt_text(&img.alt(), ctx); let alt_raw = collect_inline_text(&img.alt()); render_reference_common( @@ -1198,10 +1195,12 @@ fn render_autolink(autolink: &MdAutolink, out: &mut String) { // Check if it's an email autolink let is_email = content.contains('@') && !content.contains(':'); + // Autolinks must NOT process backslash escapes or entity decoding. + // Only percent-encode for URL safety. let href = if is_email { format!("mailto:{}", content) } else { - process_link_destination(&content) + percent_encode_uri(&content) }; out.push_str(" String { escape_html(text) } -/// Strip HTML tags from text (for image alt text). -fn strip_html_tags(text: &str) -> String { +/// Extract plain text for image alt attribute. +/// Per CommonMark, the alt text is the content with inline formatting stripped +/// but text from nested links/images preserved (recursively extracting their text). 
+fn extract_alt_text( + list: &biome_markdown_syntax::MdInlineItemList, + ctx: &HtmlRenderContext, +) -> String { let mut result = String::new(); - let mut in_tag = false; + for item in list.iter() { + extract_alt_text_inline(&item, ctx, &mut result); + } + result +} - for c in text.chars() { - if c == '<' { - in_tag = true; - } else if c == '>' { - in_tag = false; - } else if !in_tag { - result.push(c); +fn extract_alt_text_inline(inline: &AnyMdInline, ctx: &HtmlRenderContext, out: &mut String) { + match inline { + AnyMdInline::MdTextual(text) => { + render_textual(text, out); + } + AnyMdInline::MdInlineEmphasis(em) => { + out.push_str(&extract_alt_text(&em.content(), ctx)); + } + AnyMdInline::MdInlineItalic(italic) => { + out.push_str(&extract_alt_text(&italic.content(), ctx)); + } + AnyMdInline::MdInlineCode(code) => { + // Plain text only — no tags for alt attribute + let content = collect_raw_inline_text(&code.content()); + let content = content.replace('\n', " "); + let content = if content.starts_with(' ') + && content.ends_with(' ') + && content.len() > 2 + && content.chars().any(|c| c != ' ') + { + content[1..content.len() - 1].to_string() + } else { + content + }; + out.push_str(&escape_html(&content)); + } + AnyMdInline::MdInlineLink(link) => { + // Extract text content from link text + out.push_str(&extract_alt_text(&link.text(), ctx)); + } + AnyMdInline::MdInlineImage(img) => { + // Recursively extract alt text from nested image + out.push_str(&extract_alt_text(&img.alt(), ctx)); + } + AnyMdInline::MdReferenceLink(link) => { + out.push_str(&extract_alt_text(&link.text(), ctx)); + } + AnyMdInline::MdReferenceImage(img) => { + out.push_str(&extract_alt_text(&img.alt(), ctx)); + } + AnyMdInline::MdAutolink(autolink) => { + let content = collect_raw_inline_text(&autolink.value()); + out.push_str(&escape_html(&content)); + } + AnyMdInline::MdHardLine(_) | AnyMdInline::MdSoftBreak(_) => { + out.push(' '); + } + AnyMdInline::MdEntityReference(entity) => { + 
render_entity_reference(entity, out); + } + AnyMdInline::MdInlineHtml(_) | AnyMdInline::MdHtmlBlock(_) => { + // HTML tags are stripped in alt text } } - - result } // ============================================================================ @@ -1590,6 +1641,80 @@ mod tests { assert_eq!(html, "

italic and bold

\n"); } + #[test] + fn test_emphasis_complex_cases() { + // Test: Nested + let parsed = parse_markdown("**bold *and italic* text**\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Nested failed: {}", + parsed.syntax() + ); + + // Test: Rule of 3 + let parsed = parse_markdown("***bold italic***\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Rule of 3 failed: {}", + parsed.syntax() + ); + + // Test: Multiple runs + let parsed = parse_markdown("*a **b** c*\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Multiple runs failed: {}", + parsed.syntax() + ); + + // Test: Overlapping + let parsed = parse_markdown("*foo**bar**baz*\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Overlapping failed: {}", + parsed.syntax() + ); + + // Test: Unbalanced emphasis (CommonMark example 442) + // **foo* should produce *foo + let parsed = parse_markdown("**foo*\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

*foo

\n", + "Unbalanced: {}", + parsed.syntax() + ); + } + + #[test] + fn test_example_431() { + // Test: Example 431 - nested emphasis with triple star closer + // **foo *bar*** should produce foo bar + let parsed = parse_markdown("**foo *bar***\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

foo bar

\n", + "Example 431: {}", + parsed.syntax() + ); + } + #[test] fn test_escape_html() { assert_eq!(escape_html("a & b < c > d"), "a & b < c > d"); diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap deleted file mode 100644 index c5e71c93f2..0000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap +++ /dev/null @@ -1,86 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has `unclosed code - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdInlineCode { - l_tick_token: BACKTICK@9..10 "`" [] [], - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [], - }, - ], - r_tick_token: missing (required), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@24..24 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..24 - 0: (empty) - 1: MD_BLOCK_LIST@0..24 - 0: MD_PARAGRAPH@0..24 - 0: MD_INLINE_ITEM_LIST@0..24 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_INLINE_CODE@9..23 - 0: BACKTICK@9..10 "`" [] [] - 1: MD_INLINE_ITEM_LIST@10..23 - 0: MD_TEXTUAL@10..23 - 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [] - 2: (empty) - 2: MD_TEXTUAL@23..24 - 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] - 1: (empty) - 2: EOF@24..24 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_code_span.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed code span, expected closing `. 
- - > 1 │ This has `unclosed code - │ ^ - 2 │ - - i code span started here - - > 1 │ This has `unclosed code - │ ^ - 2 │ - - i Add closing ` to close the code span. - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap index 10e49ec3d4..f0790723e0 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap @@ -103,20 +103,20 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] [], }, - MdInlineEmphasis { - l_fence: DOUBLE_STAR@77..79 "**" [] [], + MdInlineItalic { + l_fence: STAR@77..78 "*" [] [], content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@79..80 "*" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [], + MdInlineEmphasis { + l_fence: DOUBLE_STAR@78..80 "**" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [], + }, + ], + r_fence: DOUBLE_STAR@91..93 "**" [] [], }, ], - r_fence: DOUBLE_STAR@91..93 "**" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@93..94 "*" [] [], + r_fence: STAR@93..94 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@94..95 "\n" [] [], @@ -301,17 +301,17 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@66..95 0: MD_TEXTUAL@66..77 0: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] [] - 1: MD_INLINE_EMPHASIS@77..93 - 0: DOUBLE_STAR@77..79 "**" [] [] - 1: MD_INLINE_ITEM_LIST@79..91 - 0: MD_TEXTUAL@79..80 - 0: MD_TEXTUAL_LITERAL@79..80 "*" [] [] - 1: MD_TEXTUAL@80..91 - 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [] - 2: DOUBLE_STAR@91..93 "**" [] [] - 2: MD_TEXTUAL@93..94 - 0: MD_TEXTUAL_LITERAL@93..94 "*" [] [] - 3: MD_TEXTUAL@94..95 + 1: MD_INLINE_ITALIC@77..94 + 0: STAR@77..78 "*" [] [] + 1: MD_INLINE_ITEM_LIST@78..93 + 0: MD_INLINE_EMPHASIS@78..93 + 
0: DOUBLE_STAR@78..80 "**" [] [] + 1: MD_INLINE_ITEM_LIST@80..91 + 0: MD_TEXTUAL@80..91 + 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [] + 2: DOUBLE_STAR@91..93 "**" [] [] + 2: STAR@93..94 "*" [] [] + 2: MD_TEXTUAL@94..95 0: MD_TEXTUAL_LITERAL@94..95 "\n" [] [] 1: (empty) 5: MD_NEWLINE@95..96 diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap index bdbaf86e82..94116b93fa 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap @@ -21,26 +21,32 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] [], }, - MdInlineEmphasis { - l_fence: DOUBLE_STAR@10..12 "**" [] [], + MdInlineItalic { + l_fence: STAR@10..11 "*" [] [], content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@14..15 "*" [] [], + MdInlineItalic { + l_fence: STAR@11..12 "*" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [], + }, + MdInlineItalic { + l_fence: STAR@14..15 "*" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [], + }, + ], + r_fence: STAR@16..17 "*" [] [], + }, + ], + r_fence: STAR@17..18 "*" [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [], + value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [], }, ], - r_fence: DOUBLE_STAR@16..18 "**" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@20..21 "*" [] [], + r_fence: STAR@20..21 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@21..22 "\n" [] [], @@ -63,21 +69,25 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@0..22 0: MD_TEXTUAL@0..10 0: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] [] - 1: 
MD_INLINE_EMPHASIS@10..18 - 0: DOUBLE_STAR@10..12 "**" [] [] - 1: MD_INLINE_ITEM_LIST@12..16 - 0: MD_TEXTUAL@12..14 - 0: MD_TEXTUAL_LITERAL@12..14 "a " [] [] - 1: MD_TEXTUAL@14..15 - 0: MD_TEXTUAL_LITERAL@14..15 "*" [] [] - 2: MD_TEXTUAL@15..16 - 0: MD_TEXTUAL_LITERAL@15..16 "b" [] [] - 2: DOUBLE_STAR@16..18 "**" [] [] - 2: MD_TEXTUAL@18..20 - 0: MD_TEXTUAL_LITERAL@18..20 " c" [] [] - 3: MD_TEXTUAL@20..21 - 0: MD_TEXTUAL_LITERAL@20..21 "*" [] [] - 4: MD_TEXTUAL@21..22 + 1: MD_INLINE_ITALIC@10..21 + 0: STAR@10..11 "*" [] [] + 1: MD_INLINE_ITEM_LIST@11..20 + 0: MD_INLINE_ITALIC@11..18 + 0: STAR@11..12 "*" [] [] + 1: MD_INLINE_ITEM_LIST@12..17 + 0: MD_TEXTUAL@12..14 + 0: MD_TEXTUAL_LITERAL@12..14 "a " [] [] + 1: MD_INLINE_ITALIC@14..17 + 0: STAR@14..15 "*" [] [] + 1: MD_INLINE_ITEM_LIST@15..16 + 0: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "b" [] [] + 2: STAR@16..17 "*" [] [] + 2: STAR@17..18 "*" [] [] + 1: MD_TEXTUAL@18..20 + 0: MD_TEXTUAL_LITERAL@18..20 " c" [] [] + 2: STAR@20..21 "*" [] [] + 2: MD_TEXTUAL@21..22 0: MD_TEXTUAL_LITERAL@21..22 "\n" [] [] 1: (empty) 2: EOF@22..22 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap index 1183219b2b..abc0b3d482 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap @@ -22,7 +22,10 @@ MdDocument { value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@9..11 "**" [] [], + value_token: MD_TEXTUAL_LITERAL@9..10 "*" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] [], @@ -48,11 +51,13 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@0..25 0: MD_TEXTUAL@0..9 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_TEXTUAL@9..11 - 0: MD_TEXTUAL_LITERAL@9..11 
"**" [] [] - 2: MD_TEXTUAL@11..24 + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "*" [] [] + 2: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "*" [] [] + 3: MD_TEXTUAL@11..24 0: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] [] - 3: MD_TEXTUAL@24..25 + 4: MD_TEXTUAL@24..25 0: MD_TEXTUAL_LITERAL@24..25 "\n" [] [] 1: (empty) 2: EOF@25..25 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap new file mode 100644 index 0000000000..eefa654d5e --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap @@ -0,0 +1,60 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has `unclosed code + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "`" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@24..24 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..24 + 0: (empty) + 1: MD_BLOCK_LIST@0..24 + 0: MD_PARAGRAPH@0..24 + 0: MD_INLINE_ITEM_LIST@0..24 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "`" [] [] + 2: 
MD_TEXTUAL@10..23 + 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [] + 3: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] + 1: (empty) + 2: EOF@24..24 "" [] [] + +``` diff --git a/crates/biome_unicode_table/src/lib.rs b/crates/biome_unicode_table/src/lib.rs index 3d2c3a82cd..9a5495ebc4 100644 --- a/crates/biome_unicode_table/src/lib.rs +++ b/crates/biome_unicode_table/src/lib.rs @@ -4,9 +4,11 @@ use crate::bytes::DISPATCHER; use crate::tables::derived_property::{ID_Continue, ID_Start}; mod bytes; +mod punctuation; mod tables; pub use crate::bytes::Dispatch; +pub use crate::punctuation::is_unicode_punctuation; /// Tests if `c` is a valid start of a CSS identifier #[inline] diff --git a/crates/biome_unicode_table/src/punctuation.rs b/crates/biome_unicode_table/src/punctuation.rs new file mode 100644 index 0000000000..c3575a823d --- /dev/null +++ b/crates/biome_unicode_table/src/punctuation.rs @@ -0,0 +1,413 @@ +//! CommonMark Unicode punctuation table. +//! +//! Derived from the markdown-rs Unicode punctuation list used for CommonMark. +//! Per CommonMark, "Unicode punctuation" includes characters from both the +//! General_Category=Punctuation (P*) and General_Category=Symbol (S*) categories. +//! This is used for CommonMark flanking rules in emphasis parsing. + +// Note: duplicated from generated unicode tables to keep this module standalone. 
+#[inline] +fn bsearch_range_table(c: char, r: &[(char, char)]) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + r.binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() +} + +const PUNCTUATION_RANGES: &[(char, char)] = &[ + ('\u{0021}', '\u{002F}'), + ('\u{003A}', '\u{0040}'), + ('\u{005B}', '\u{0060}'), + ('\u{007B}', '\u{007E}'), + ('\u{00A1}', '\u{00A9}'), + ('\u{00AB}', '\u{00AC}'), + ('\u{00AE}', '\u{00B1}'), + ('\u{00B4}', '\u{00B4}'), + ('\u{00B6}', '\u{00B8}'), + ('\u{00BB}', '\u{00BB}'), + ('\u{00BF}', '\u{00BF}'), + ('\u{00D7}', '\u{00D7}'), + ('\u{00F7}', '\u{00F7}'), + ('\u{02C2}', '\u{02C5}'), + ('\u{02D2}', '\u{02DF}'), + ('\u{02E5}', '\u{02EB}'), + ('\u{02ED}', '\u{02ED}'), + ('\u{02EF}', '\u{02FF}'), + ('\u{0375}', '\u{0375}'), + ('\u{037E}', '\u{037E}'), + ('\u{0384}', '\u{0385}'), + ('\u{0387}', '\u{0387}'), + ('\u{03F6}', '\u{03F6}'), + ('\u{0482}', '\u{0482}'), + ('\u{055A}', '\u{055F}'), + ('\u{0589}', '\u{058A}'), + ('\u{058D}', '\u{058F}'), + ('\u{05BE}', '\u{05BE}'), + ('\u{05C0}', '\u{05C0}'), + ('\u{05C3}', '\u{05C3}'), + ('\u{05C6}', '\u{05C6}'), + ('\u{05F3}', '\u{05F4}'), + ('\u{0606}', '\u{060F}'), + ('\u{061B}', '\u{061B}'), + ('\u{061D}', '\u{061F}'), + ('\u{066A}', '\u{066D}'), + ('\u{06D4}', '\u{06D4}'), + ('\u{06DE}', '\u{06DE}'), + ('\u{06E9}', '\u{06E9}'), + ('\u{06FD}', '\u{06FE}'), + ('\u{0700}', '\u{070D}'), + ('\u{07F6}', '\u{07F9}'), + ('\u{07FE}', '\u{07FF}'), + ('\u{0830}', '\u{083E}'), + ('\u{085E}', '\u{085E}'), + ('\u{0888}', '\u{0888}'), + ('\u{0964}', '\u{0965}'), + ('\u{0970}', '\u{0970}'), + ('\u{09F2}', '\u{09F3}'), + ('\u{09FA}', '\u{09FB}'), + ('\u{09FD}', '\u{09FD}'), + ('\u{0A76}', '\u{0A76}'), + ('\u{0AF0}', '\u{0AF1}'), + ('\u{0B70}', '\u{0B70}'), + ('\u{0BF3}', '\u{0BFA}'), + ('\u{0C77}', '\u{0C77}'), + ('\u{0C7F}', '\u{0C7F}'), + ('\u{0C84}', '\u{0C84}'), + ('\u{0D4F}', '\u{0D4F}'), + ('\u{0D79}', '\u{0D79}'), + ('\u{0DF4}', 
'\u{0DF4}'), + ('\u{0E3F}', '\u{0E3F}'), + ('\u{0E4F}', '\u{0E4F}'), + ('\u{0E5A}', '\u{0E5B}'), + ('\u{0F01}', '\u{0F17}'), + ('\u{0F1A}', '\u{0F1F}'), + ('\u{0F34}', '\u{0F34}'), + ('\u{0F36}', '\u{0F36}'), + ('\u{0F38}', '\u{0F38}'), + ('\u{0F3A}', '\u{0F3D}'), + ('\u{0F85}', '\u{0F85}'), + ('\u{0FBE}', '\u{0FC5}'), + ('\u{0FC7}', '\u{0FCC}'), + ('\u{0FCE}', '\u{0FDA}'), + ('\u{104A}', '\u{104F}'), + ('\u{109E}', '\u{109F}'), + ('\u{10FB}', '\u{10FB}'), + ('\u{1360}', '\u{1368}'), + ('\u{1390}', '\u{1399}'), + ('\u{1400}', '\u{1400}'), + ('\u{166D}', '\u{166E}'), + ('\u{169B}', '\u{169C}'), + ('\u{16EB}', '\u{16ED}'), + ('\u{1735}', '\u{1736}'), + ('\u{17D4}', '\u{17D6}'), + ('\u{17D8}', '\u{17DB}'), + ('\u{1800}', '\u{180A}'), + ('\u{1940}', '\u{1940}'), + ('\u{1944}', '\u{1945}'), + ('\u{19DE}', '\u{19FF}'), + ('\u{1A1E}', '\u{1A1F}'), + ('\u{1AA0}', '\u{1AA6}'), + ('\u{1AA8}', '\u{1AAD}'), + ('\u{1B4E}', '\u{1B4F}'), + ('\u{1B5A}', '\u{1B6A}'), + ('\u{1B74}', '\u{1B7F}'), + ('\u{1BFC}', '\u{1BFF}'), + ('\u{1C3B}', '\u{1C3F}'), + ('\u{1C7E}', '\u{1C7F}'), + ('\u{1CC0}', '\u{1CC7}'), + ('\u{1CD3}', '\u{1CD3}'), + ('\u{1FBD}', '\u{1FBD}'), + ('\u{1FBF}', '\u{1FC1}'), + ('\u{1FCD}', '\u{1FCF}'), + ('\u{1FDD}', '\u{1FDF}'), + ('\u{1FED}', '\u{1FEF}'), + ('\u{1FFD}', '\u{1FFE}'), + ('\u{2010}', '\u{2027}'), + ('\u{2030}', '\u{205E}'), + ('\u{207A}', '\u{207E}'), + ('\u{208A}', '\u{208E}'), + ('\u{20A0}', '\u{20C0}'), + ('\u{2100}', '\u{2101}'), + ('\u{2103}', '\u{2106}'), + ('\u{2108}', '\u{2109}'), + ('\u{2114}', '\u{2114}'), + ('\u{2116}', '\u{2118}'), + ('\u{211E}', '\u{2123}'), + ('\u{2125}', '\u{2125}'), + ('\u{2127}', '\u{2127}'), + ('\u{2129}', '\u{2129}'), + ('\u{212E}', '\u{212E}'), + ('\u{213A}', '\u{213B}'), + ('\u{2140}', '\u{2144}'), + ('\u{214A}', '\u{214D}'), + ('\u{214F}', '\u{214F}'), + ('\u{218A}', '\u{218B}'), + ('\u{2190}', '\u{2429}'), + ('\u{2440}', '\u{244A}'), + ('\u{249C}', '\u{24E9}'), + ('\u{2500}', '\u{2775}'), + ('\u{2794}', 
'\u{2B73}'), + ('\u{2B76}', '\u{2B95}'), + ('\u{2B97}', '\u{2BFF}'), + ('\u{2CE5}', '\u{2CEA}'), + ('\u{2CF9}', '\u{2CFC}'), + ('\u{2CFE}', '\u{2CFF}'), + ('\u{2D70}', '\u{2D70}'), + ('\u{2E00}', '\u{2E2E}'), + ('\u{2E30}', '\u{2E5D}'), + ('\u{2E80}', '\u{2E99}'), + ('\u{2E9B}', '\u{2EF3}'), + ('\u{2F00}', '\u{2FD5}'), + ('\u{2FF0}', '\u{2FFF}'), + ('\u{3001}', '\u{3004}'), + ('\u{3008}', '\u{3020}'), + ('\u{3030}', '\u{3030}'), + ('\u{3036}', '\u{3037}'), + ('\u{303D}', '\u{303F}'), + ('\u{309B}', '\u{309C}'), + ('\u{30A0}', '\u{30A0}'), + ('\u{30FB}', '\u{30FB}'), + ('\u{3190}', '\u{3191}'), + ('\u{3196}', '\u{319F}'), + ('\u{31C0}', '\u{31E5}'), + ('\u{31EF}', '\u{31EF}'), + ('\u{3200}', '\u{321E}'), + ('\u{322A}', '\u{3247}'), + ('\u{3250}', '\u{3250}'), + ('\u{3260}', '\u{327F}'), + ('\u{328A}', '\u{32B0}'), + ('\u{32C0}', '\u{33FF}'), + ('\u{4DC0}', '\u{4DFF}'), + ('\u{A490}', '\u{A4C6}'), + ('\u{A4FE}', '\u{A4FF}'), + ('\u{A60D}', '\u{A60F}'), + ('\u{A673}', '\u{A673}'), + ('\u{A67E}', '\u{A67E}'), + ('\u{A6F2}', '\u{A6F7}'), + ('\u{A700}', '\u{A716}'), + ('\u{A720}', '\u{A721}'), + ('\u{A789}', '\u{A78A}'), + ('\u{A828}', '\u{A82B}'), + ('\u{A836}', '\u{A839}'), + ('\u{A874}', '\u{A877}'), + ('\u{A8CE}', '\u{A8CF}'), + ('\u{A8F8}', '\u{A8FA}'), + ('\u{A8FC}', '\u{A8FC}'), + ('\u{A92E}', '\u{A92F}'), + ('\u{A95F}', '\u{A95F}'), + ('\u{A9C1}', '\u{A9CD}'), + ('\u{A9DE}', '\u{A9DF}'), + ('\u{AA5C}', '\u{AA5F}'), + ('\u{AA77}', '\u{AA79}'), + ('\u{AADE}', '\u{AADF}'), + ('\u{AAF0}', '\u{AAF1}'), + ('\u{AB5B}', '\u{AB5B}'), + ('\u{AB6A}', '\u{AB6B}'), + ('\u{ABEB}', '\u{ABEB}'), + ('\u{FB29}', '\u{FB29}'), + ('\u{FBB2}', '\u{FBC2}'), + ('\u{FD3E}', '\u{FD4F}'), + ('\u{FDCF}', '\u{FDCF}'), + ('\u{FDFC}', '\u{FDFF}'), + ('\u{FE10}', '\u{FE19}'), + ('\u{FE30}', '\u{FE52}'), + ('\u{FE54}', '\u{FE66}'), + ('\u{FE68}', '\u{FE6B}'), + ('\u{FF01}', '\u{FF0F}'), + ('\u{FF1A}', '\u{FF20}'), + ('\u{FF3B}', '\u{FF40}'), + ('\u{FF5B}', '\u{FF65}'), + ('\u{FFE0}', 
'\u{FFE6}'), + ('\u{FFE8}', '\u{FFEE}'), + ('\u{FFFC}', '\u{FFFD}'), + ('\u{10100}', '\u{10102}'), + ('\u{10137}', '\u{1013F}'), + ('\u{10179}', '\u{10189}'), + ('\u{1018C}', '\u{1018E}'), + ('\u{10190}', '\u{1019C}'), + ('\u{101A0}', '\u{101A0}'), + ('\u{101D0}', '\u{101FC}'), + ('\u{1039F}', '\u{1039F}'), + ('\u{103D0}', '\u{103D0}'), + ('\u{1056F}', '\u{1056F}'), + ('\u{10857}', '\u{10857}'), + ('\u{10877}', '\u{10878}'), + ('\u{1091F}', '\u{1091F}'), + ('\u{1093F}', '\u{1093F}'), + ('\u{10A50}', '\u{10A58}'), + ('\u{10A7F}', '\u{10A7F}'), + ('\u{10AC8}', '\u{10AC8}'), + ('\u{10AF0}', '\u{10AF6}'), + ('\u{10B39}', '\u{10B3F}'), + ('\u{10B99}', '\u{10B9C}'), + ('\u{10D6E}', '\u{10D6E}'), + ('\u{10D8E}', '\u{10D8F}'), + ('\u{10EAD}', '\u{10EAD}'), + ('\u{10F55}', '\u{10F59}'), + ('\u{10F86}', '\u{10F89}'), + ('\u{11047}', '\u{1104D}'), + ('\u{110BB}', '\u{110BC}'), + ('\u{110BE}', '\u{110C1}'), + ('\u{11140}', '\u{11143}'), + ('\u{11174}', '\u{11175}'), + ('\u{111C5}', '\u{111C8}'), + ('\u{111CD}', '\u{111CD}'), + ('\u{111DB}', '\u{111DB}'), + ('\u{111DD}', '\u{111DF}'), + ('\u{11238}', '\u{1123D}'), + ('\u{112A9}', '\u{112A9}'), + ('\u{113D4}', '\u{113D5}'), + ('\u{113D7}', '\u{113D8}'), + ('\u{1144B}', '\u{1144F}'), + ('\u{1145A}', '\u{1145B}'), + ('\u{1145D}', '\u{1145D}'), + ('\u{114C6}', '\u{114C6}'), + ('\u{115C1}', '\u{115D7}'), + ('\u{11641}', '\u{11643}'), + ('\u{11660}', '\u{1166C}'), + ('\u{116B9}', '\u{116B9}'), + ('\u{1173C}', '\u{1173F}'), + ('\u{1183B}', '\u{1183B}'), + ('\u{11944}', '\u{11946}'), + ('\u{119E2}', '\u{119E2}'), + ('\u{11A3F}', '\u{11A46}'), + ('\u{11A9A}', '\u{11A9C}'), + ('\u{11A9E}', '\u{11AA2}'), + ('\u{11B00}', '\u{11B09}'), + ('\u{11BE1}', '\u{11BE1}'), + ('\u{11C41}', '\u{11C45}'), + ('\u{11C70}', '\u{11C71}'), + ('\u{11EF7}', '\u{11EF8}'), + ('\u{11F43}', '\u{11F4F}'), + ('\u{11FD5}', '\u{11FF1}'), + ('\u{11FFF}', '\u{11FFF}'), + ('\u{12470}', '\u{12474}'), + ('\u{12FF1}', '\u{12FF2}'), + ('\u{16A6E}', '\u{16A6F}'), + 
('\u{16AF5}', '\u{16AF5}'), + ('\u{16B37}', '\u{16B3F}'), + ('\u{16B44}', '\u{16B45}'), + ('\u{16D6D}', '\u{16D6F}'), + ('\u{16E97}', '\u{16E9A}'), + ('\u{16FE2}', '\u{16FE2}'), + ('\u{1BC9C}', '\u{1BC9C}'), + ('\u{1BC9F}', '\u{1BC9F}'), + ('\u{1CC00}', '\u{1CCEF}'), + ('\u{1CD00}', '\u{1CEB3}'), + ('\u{1CF50}', '\u{1CFC3}'), + ('\u{1D000}', '\u{1D0F5}'), + ('\u{1D100}', '\u{1D126}'), + ('\u{1D129}', '\u{1D164}'), + ('\u{1D16A}', '\u{1D16C}'), + ('\u{1D183}', '\u{1D184}'), + ('\u{1D18C}', '\u{1D1A9}'), + ('\u{1D1AE}', '\u{1D1EA}'), + ('\u{1D200}', '\u{1D241}'), + ('\u{1D245}', '\u{1D245}'), + ('\u{1D300}', '\u{1D356}'), + ('\u{1D6C1}', '\u{1D6C1}'), + ('\u{1D6DB}', '\u{1D6DB}'), + ('\u{1D6FB}', '\u{1D6FB}'), + ('\u{1D715}', '\u{1D715}'), + ('\u{1D735}', '\u{1D735}'), + ('\u{1D74F}', '\u{1D74F}'), + ('\u{1D76F}', '\u{1D76F}'), + ('\u{1D789}', '\u{1D789}'), + ('\u{1D7A9}', '\u{1D7A9}'), + ('\u{1D7C3}', '\u{1D7C3}'), + ('\u{1D800}', '\u{1D9FF}'), + ('\u{1DA37}', '\u{1DA3A}'), + ('\u{1DA6D}', '\u{1DA74}'), + ('\u{1DA76}', '\u{1DA83}'), + ('\u{1DA85}', '\u{1DA8B}'), + ('\u{1E14F}', '\u{1E14F}'), + ('\u{1E2FF}', '\u{1E2FF}'), + ('\u{1E5FF}', '\u{1E5FF}'), + ('\u{1E95E}', '\u{1E95F}'), + ('\u{1ECAC}', '\u{1ECAC}'), + ('\u{1ECB0}', '\u{1ECB0}'), + ('\u{1ED2E}', '\u{1ED2E}'), + ('\u{1EEF0}', '\u{1EEF1}'), + ('\u{1F000}', '\u{1F02B}'), + ('\u{1F030}', '\u{1F093}'), + ('\u{1F0A0}', '\u{1F0AE}'), + ('\u{1F0B1}', '\u{1F0BF}'), + ('\u{1F0C1}', '\u{1F0CF}'), + ('\u{1F0D1}', '\u{1F0F5}'), + ('\u{1F10D}', '\u{1F1AD}'), + ('\u{1F1E6}', '\u{1F202}'), + ('\u{1F210}', '\u{1F23B}'), + ('\u{1F240}', '\u{1F248}'), + ('\u{1F250}', '\u{1F251}'), + ('\u{1F260}', '\u{1F265}'), + ('\u{1F300}', '\u{1F6D7}'), + ('\u{1F6DC}', '\u{1F6EC}'), + ('\u{1F6F0}', '\u{1F6FC}'), + ('\u{1F700}', '\u{1F776}'), + ('\u{1F77B}', '\u{1F7D9}'), + ('\u{1F7E0}', '\u{1F7EB}'), + ('\u{1F7F0}', '\u{1F7F0}'), + ('\u{1F800}', '\u{1F80B}'), + ('\u{1F810}', '\u{1F847}'), + ('\u{1F850}', '\u{1F859}'), + ('\u{1F860}', 
'\u{1F887}'), + ('\u{1F890}', '\u{1F8AD}'), + ('\u{1F8B0}', '\u{1F8BB}'), + ('\u{1F8C0}', '\u{1F8C1}'), + ('\u{1F900}', '\u{1FA53}'), + ('\u{1FA60}', '\u{1FA6D}'), + ('\u{1FA70}', '\u{1FA7C}'), + ('\u{1FA80}', '\u{1FA89}'), + ('\u{1FA8F}', '\u{1FAC6}'), + ('\u{1FACE}', '\u{1FADC}'), + ('\u{1FADF}', '\u{1FAE9}'), + ('\u{1FAF0}', '\u{1FAF8}'), + ('\u{1FB00}', '\u{1FB92}'), + ('\u{1FB94}', '\u{1FBEF}'), +]; + +/// Check if a character is Unicode punctuation per CommonMark. +#[inline] +pub fn is_unicode_punctuation(c: char) -> bool { + bsearch_range_table(c, PUNCTUATION_RANGES) +} + +#[cfg(test)] +mod tests { + use super::PUNCTUATION_RANGES; + use super::is_unicode_punctuation; + + #[test] + fn ascii_punctuation() { + assert!(is_unicode_punctuation('!')); + assert!(is_unicode_punctuation('.')); + assert!(is_unicode_punctuation('(')); + } + + #[test] + fn non_punctuation() { + assert!(!is_unicode_punctuation('a')); + assert!(!is_unicode_punctuation(' ')); + assert!(!is_unicode_punctuation('0')); + } + + #[test] + fn unicode_punctuation() { + assert!(is_unicode_punctuation('\u{2014}')); + assert!(is_unicode_punctuation('\u{00BF}')); + } + + #[test] + fn table_is_sorted() { + for window in PUNCTUATION_RANGES.windows(2) { + assert!(window[0].1 < window[1].0, "Ranges must be sorted"); + } + } +} --- crates/biome_markdown_parser/src/lexer/mod.rs | 45 +- crates/biome_markdown_parser/src/parser.rs | 23 + crates/biome_markdown_parser/src/syntax.rs | 18 +- .../src/syntax/header.rs | 9 +- .../src/syntax/inline.rs | 965 ++++++++++++++---- .../src/syntax/parse_error.rs | 20 - crates/biome_markdown_parser/src/to_html.rs | 161 ++- .../error/unclosed_code_span.md.snap | 86 -- .../md_test_suite/ok/emphasis_complex.md.snap | 44 +- .../ok/emphasis_crossing.md.snap | 70 +- .../md_test_suite/ok/unclosed_bold.md.snap | 15 +- .../{error => ok}/unclosed_code_span.md | 0 .../ok/unclosed_code_span.md.snap | 60 ++ crates/biome_unicode_table/src/lib.rs | 2 + 
crates/biome_unicode_table/src/punctuation.rs | 413 ++++++++ 15 files changed, 1546 insertions(+), 385 deletions(-) delete mode 100644 crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap rename crates/biome_markdown_parser/tests/md_test_suite/{error => ok}/unclosed_code_span.md (100%) create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap create mode 100644 crates/biome_unicode_table/src/punctuation.rs diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs index ec5a9ae0f27d..8122d861de06 100644 --- a/crates/biome_markdown_parser/src/lexer/mod.rs +++ b/crates/biome_markdown_parser/src/lexer/mod.rs @@ -20,6 +20,8 @@ use biome_unicode_table::lookup_byte; /// - `FencedCodeBlock`: Inside fenced code block, no markdown parsing /// - `HtmlBlock`: Inside HTML block, minimal markdown parsing /// - `LinkDefinition`: Inside link reference definition, whitespace separates tokens +/// - `CodeSpan`: Inside inline code span, backslashes are literal (no escapes) +/// - `EmphasisInline`: Emit single STAR/UNDERSCORE tokens for partial delimiter consumption #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum MarkdownLexContext { /// Normal markdown parsing with full inline element detection. @@ -39,6 +41,16 @@ pub enum MarkdownLexContext { /// In this context, whitespace is significant and separates destination from title. /// Text tokens stop at whitespace to allow proper parsing. LinkDefinition, + /// Inside an inline code span. + /// Per CommonMark §6.1, backslash escapes are not processed inside code spans. + /// Backslash is treated as a literal character, not an escape. + CodeSpan, + /// Inside emphasis delimiter processing. + /// In this context, `*` and `_` are always emitted as single-character tokens + /// (STAR, UNDERSCORE) rather than double tokens (DOUBLE_STAR, DOUBLE_UNDERSCORE). 
+ /// This allows partial consumption of delimiter runs when the match algorithm + /// determines only 1 char should be used from a 2-char run. + EmphasisInline, } impl LexContext for MarkdownLexContext { @@ -57,6 +69,10 @@ pub enum MarkdownReLexContext { Regular, /// Re-lex for link definition context where whitespace is significant. LinkDefinition, + /// Re-lex for emphasis inline context where `*` and `_` emit single tokens. + /// Used when the emphasis matching algorithm needs to partially consume + /// a DOUBLE_STAR or DOUBLE_UNDERSCORE token. + EmphasisInline, } /// An extremely fast, lookup table based, lossless Markdown lexer @@ -230,9 +246,14 @@ impl<'src> MarkdownLexer<'src> { // - In middle of line: whitespace is just text content, include in textual token // - Exception: 2+ spaces before newline is a hard line break // - In LinkDefinition context: whitespace is always significant (separates destination from title) + // - In CodeSpan context: whitespace is literal content, no hard-line-break detection WHS => { if current == b'\n' || current == b'\r' { self.consume_newline() + } else if matches!(context, MarkdownLexContext::CodeSpan) { + // In code span context, whitespace is literal content. + // No hard-line-break detection - the renderer normalizes line endings to spaces. + self.consume_textual(context) } else if matches!(context, MarkdownLexContext::LinkDefinition) { // In link definition context, whitespace separates tokens. // We consume it as textual literal so it's not treated as trivia by the parser. @@ -267,7 +288,15 @@ impl<'src> MarkdownLexer<'src> { PNC => self.consume_byte(R_PAREN), COL => self.consume_byte(COLON), AMP => self.consume_entity_or_textual(context), - BSL => self.consume_escape(), + BSL => { + // Per CommonMark §6.1, backslash escapes are NOT processed inside code spans. + // Backslash is literal, so `\`` produces a literal backslash followed by backtick. 
+ if matches!(context, MarkdownLexContext::CodeSpan) { + self.consume_textual(context) + } else { + self.consume_escape() + } + } // = at line start could be setext heading underline EQL if self.after_newline => self.consume_setext_underline_or_textual(), _ => { @@ -753,6 +782,19 @@ impl<'src> MarkdownLexer<'src> { // Not a thematic break - restore position and consume as emphasis marker self.position = start_position; + // In EmphasisInline context, always emit single tokens for * and _. + // This allows partial consumption of delimiter runs when the match algorithm + // determines only 1 char should be used from a 2-char run. + if matches!(context, MarkdownLexContext::EmphasisInline) { + self.advance(1); + return match start_char { + b'*' => STAR, + b'_' => UNDERSCORE, + b'-' => MINUS, + _ => unreachable!(), + }; + } + // Check for double emphasis markers (**, __) // Note: -- is not valid markdown emphasis, so we don't check for it if start_char != b'-' && self.peek_byte() == Some(start_char) { @@ -1200,6 +1242,7 @@ impl<'src> ReLexer<'src> for MarkdownLexer<'src> { let lex_context = match context { MarkdownReLexContext::Regular => MarkdownLexContext::Regular, MarkdownReLexContext::LinkDefinition => MarkdownLexContext::LinkDefinition, + MarkdownReLexContext::EmphasisInline => MarkdownLexContext::EmphasisInline, }; let re_lexed_kind = match self.current_byte() { diff --git a/crates/biome_markdown_parser/src/parser.rs b/crates/biome_markdown_parser/src/parser.rs index 2ac966faa0c5..877750fa6127 100644 --- a/crates/biome_markdown_parser/src/parser.rs +++ b/crates/biome_markdown_parser/src/parser.rs @@ -208,6 +208,28 @@ impl<'source> MarkdownParser<'source> { .force_relex_in_context(crate::lexer::MarkdownLexContext::Regular); } + /// Force re-lex the current token in CodeSpan context. + /// In this context, backslash is literal (not an escape character). + /// Used for autolinks where `\>` should be `\` + `>` as separate tokens. 
+ pub(crate) fn force_relex_code_span(&mut self) { + self.source + .force_relex_in_context(crate::lexer::MarkdownLexContext::CodeSpan); + } + + /// Re-lex the current token as single-char emphasis delimiter. + /// + /// Use this when the emphasis matching algorithm needs to partially consume + /// a DOUBLE_STAR or DOUBLE_UNDERSCORE token. After re-lexing, the token will + /// be either STAR or UNDERSCORE (single char). + /// + /// # Safety + /// Only call on the current token, NOT inside lookahead closures. + /// This invalidates any buffered lookahead, so ensure no lookahead is active. + pub(crate) fn force_relex_emphasis_inline(&mut self) -> MarkdownSyntaxKind { + self.source + .re_lex(crate::lexer::MarkdownReLexContext::EmphasisInline) + } + pub(crate) fn set_force_ordered_list_marker(&mut self, value: bool) { self.source.set_force_ordered_list_marker(value); } @@ -218,6 +240,7 @@ impl<'source> MarkdownParser<'source> { self.source.bump_link_definition(); } + pub fn checkpoint(&self) -> MarkdownParserCheckpoint { MarkdownParserCheckpoint { context: self.context.checkpoint(), diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs index e4fecdd82c55..96037af6f798 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -908,9 +908,25 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) { } // Parse inline content (stops at NEWLINE via at_inline_end) - if parse_any_inline(p).is_absent() { + let parsed = parse_any_inline(p); + if parsed.is_absent() { break; } + let after_hard_break = + matches!(&parsed, Present(cm) if cm.kind(p) == MD_HARD_LINE); + + // Per CommonMark §6.7: after a hard line break, leading spaces on the + // next line are ignored. Skip whitespace-only textual tokens as trivia. 
+ if after_hard_break && p.at(MD_TEXTUAL_LITERAL) { + if p.cur_text().chars().all(|c| c == ' ' || c == '\t') { + while p.at(MD_TEXTUAL_LITERAL) + && p.cur_text().chars().all(|c| c == ' ' || c == '\t') + { + p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL)); + } + } + } + let inline_end: usize = p.cur_range().start().into(); has_content = inline_has_non_whitespace(p, inline_start, inline_end); } diff --git a/crates/biome_markdown_parser/src/syntax/header.rs b/crates/biome_markdown_parser/src/syntax/header.rs index 1dfefc0ed1fd..58127cfb57c3 100644 --- a/crates/biome_markdown_parser/src/syntax/header.rs +++ b/crates/biome_markdown_parser/src/syntax/header.rs @@ -150,8 +150,13 @@ fn parse_header_content(p: &mut MarkdownParser) { loop { if p.at(MD_HARD_LINE_LITERAL) { - // Trailing spaces before newline in ATX headings should be ignored. - p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_HARD_LINE_LITERAL)); + if p.cur_text().starts_with('\\') { + // Backslash at end of heading is literal text, not a hard break. + let _ = super::parse_textual(p); + } else { + // Trailing spaces before newline — skip as trivia. 
+ p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_HARD_LINE_LITERAL)); + } break; } diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index f336b37b33a6..87d649e36039 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -41,8 +41,9 @@ use biome_markdown_syntax::T; use biome_markdown_syntax::kind::MarkdownSyntaxKind::*; use biome_parser::Parser; use biome_parser::prelude::ParsedSyntax::{self, *}; +use biome_unicode_table::is_unicode_punctuation; -use biome_rowan::{TextRange, TextSize}; +use biome_rowan::TextRange; use crate::MarkdownParser; use crate::link_reference::normalize_reference_label; @@ -71,6 +72,10 @@ struct DelimRun { can_close: bool, /// Byte offset in the source where this run starts start_offset: usize, + /// Bracket nesting depth for scoping emphasis within link text. + /// Delimiters inside brackets (links) should only match with each other, + /// not with delimiters outside the brackets. 0 = outside brackets. + label_id: usize, } /// A matched emphasis span (opener + closer) @@ -89,45 +94,14 @@ fn is_whitespace(c: char) -> bool { c.is_whitespace() } +fn is_emphasis_marker(c: char) -> bool { + matches!(c, '*' | '_') +} + /// Check if a character is Unicode punctuation for flanking rules. /// Per CommonMark spec, this includes ASCII punctuation and Unicode punctuation categories. fn is_punctuation(c: char) -> bool { - // ASCII punctuation + Unicode punctuation categories - matches!( - c, - '!' | '"' - | '#' - | '$' - | '%' - | '&' - | '\'' - | '(' - | ')' - | '*' - | '+' - | ',' - | '-' - | '.' - | '/' - | ':' - | ';' - | '<' - | '=' - | '>' - | '?' 
- | '@' - | '[' - | '\\' - | ']' - | '^' - | '_' - | '`' - | '{' - | '|' - | '}' - | '~' - ) || c.is_ascii_punctuation() - || matches!(c, '\u{2000}'..='\u{206F}' | '\u{2E00}'..='\u{2E7F}') + is_unicode_punctuation(c) } /// Check if an opening delimiter is left-flanking per CommonMark rules. @@ -138,6 +112,7 @@ fn is_left_flanking_delimiter(char_after: Option, char_before: Option false, // At end of input, can't be left-flanking Some(c) if is_whitespace(c) => false, // Followed by whitespace + Some(c) if is_emphasis_marker(c) => true, Some(c) if is_punctuation(c) => { // Followed by punctuation - only left-flanking if preceded by whitespace or punctuation match char_before { @@ -157,6 +132,7 @@ fn is_right_flanking_delimiter(char_before: Option, char_after: Option false, // At start of input, can't be right-flanking Some(c) if is_whitespace(c) => false, // Preceded by whitespace + Some(c) if is_emphasis_marker(c) => true, Some(c) if is_punctuation(c) => { // Preceded by punctuation - only right-flanking if followed by whitespace or punctuation match char_after { @@ -209,14 +185,147 @@ fn can_underscore_close(char_before: Option, char_after: Option) -> /// This is the first pass of the CommonMark emphasis algorithm. It scans /// the source text and identifies all potential delimiter runs (sequences /// of `*` or `_`), computing their flanking status. -fn collect_delimiter_runs(source: &str) -> Vec { +/// Result of checking if a bracket forms a valid link. +/// Contains the closing bracket position if found. +struct BracketCheckResult { + /// Position of the closing `]` (or 0 if not found) + close_pos: usize, + /// Whether this is a valid inline link `[...](` or full reference `[...][` + is_inline_or_full_ref: bool, +} + +/// Check if a bracket at position `start` forms a valid link pattern. +/// Returns the closing bracket position and whether it's an inline link or full reference. 
+fn check_bracket_pattern(bytes: &[u8], start: usize) -> Option { + if start >= bytes.len() || bytes[start] != b'[' { + return None; + } + + // Find matching ] with proper nesting + let mut depth = 1; + let mut i = start + 1; + while i < bytes.len() && depth > 0 { + match bytes[i] { + b'[' => depth += 1, + b']' => depth -= 1, + b'\\' if i + 1 < bytes.len() => i += 1, // Skip escaped char + b'`' => { + // Skip code spans + let backtick_count = { + let mut c = 1; + while i + c < bytes.len() && bytes[i + c] == b'`' { + c += 1; + } + c + }; + i += backtick_count; + while i < bytes.len() { + if bytes[i] == b'`' { + let close_count = { + let mut c = 1; + while i + c < bytes.len() && bytes[i + c] == b'`' { + c += 1; + } + c + }; + i += close_count; + if close_count == backtick_count { + break; + } + } else { + i += 1; + } + } + continue; + } + b'<' => { + // Skip potential HTML/autolinks + i += 1; + while i < bytes.len() && bytes[i] != b'>' && bytes[i] != b'\n' { + i += 1; + } + if i < bytes.len() && bytes[i] == b'>' { + i += 1; + } + continue; + } + _ => {} + } + i += 1; + } + + if depth != 0 { + return None; + } + + // i now points to position after `]` + let close_pos = i - 1; + let is_inline_or_full_ref = i < bytes.len() && (bytes[i] == b'(' || bytes[i] == b'['); + + Some(BracketCheckResult { + close_pos, + is_inline_or_full_ref, + }) +} + +/// Extract label text from a bracket pattern for reference lookup. +fn extract_label_text(source: &str, start: usize, close_pos: usize) -> &str { + if start < close_pos && close_pos <= source.len() { + &source[start + 1..close_pos] + } else { + "" + } +} + +fn collect_delimiter_runs(source: &str, reference_checker: impl Fn(&str) -> bool) -> Vec { let mut runs = Vec::new(); let bytes = source.as_bytes(); let mut i = 0; + // Pre-compute valid link bracket positions. + // A bracket is considered a valid link if: + // 1. It's followed by `(` (inline link) or `[` (full reference), OR + // 2. 
It's a shortcut reference with a defined reference (checked via reference_checker) + let mut link_bracket_starts = Vec::new(); + for pos in 0..bytes.len() { + if bytes[pos] == b'[' + && let Some(result) = check_bracket_pattern(bytes, pos) + { + if result.is_inline_or_full_ref { + // Inline link or full reference link + link_bracket_starts.push(pos); + } else { + // Could be a shortcut reference - check if definition exists + let label = extract_label_text(source, pos, result.close_pos); + let normalized = normalize_reference_label(label); + if !normalized.is_empty() && reference_checker(&normalized) { + link_bracket_starts.push(pos); + } + } + } + } + + // Track bracket depth, but only for valid link brackets + let mut bracket_depth = 0usize; + let mut active_link_brackets: Vec = Vec::new(); + while i < bytes.len() { let b = bytes[i]; + // Track bracket depth for valid links only + if b == b'[' && link_bracket_starts.contains(&i) { + bracket_depth += 1; + active_link_brackets.push(i); + i += 1; + continue; + } + if b == b']' && !active_link_brackets.is_empty() { + bracket_depth = bracket_depth.saturating_sub(1); + active_link_brackets.pop(); + i += 1; + continue; + } + // Check for delimiter characters if b == b'*' || b == b'_' { let kind = if b == b'*' { @@ -265,6 +374,10 @@ fn collect_delimiter_runs(source: &str) -> Vec { can_open, can_close, start_offset, + // Only scope by bracket depth when inside a valid link pattern. + // This prevents emphasis from spanning link boundaries, but allows + // emphasis to span brackets that don't form valid links. 
+ label_id: bracket_depth, }); i = end_offset; @@ -330,41 +443,38 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { if runs[idx].can_close && runs[idx].count > 0 { loop { let mut opener_stack_pos = None; - let prefer_strong = runs[idx].count >= 2; - - for pass in 0..2 { - for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() { - let opener = &runs[opener_idx]; - let closer = &runs[idx]; - if opener.kind != closer.kind || !opener.can_open || opener.count == 0 { - continue; - } - - if prefer_strong && pass == 0 && opener.count < 2 { - continue; - } - - // Rule of 3: if (opener_count + closer_count) % 3 == 0 and - // the closer can open or the opener can close, skip unless - // both counts are divisible by 3 - let opener_count = opener.count; - let closer_count = closer.count; - if ((opener.can_open && opener.can_close) - || (closer.can_open && closer.can_close)) - && (opener_count + closer_count).is_multiple_of(3) - && (!opener_count.is_multiple_of(3) || !closer_count.is_multiple_of(3)) - { - continue; - } + // Search backward for the closest matching opener. + // Per CommonMark spec, we find any matching opener first, + // then determine strong vs regular based on both counts. + for (pos, &opener_idx) in opener_stack.iter().enumerate().rev() { + let opener = &runs[opener_idx]; + let closer = &runs[idx]; + + // Only match within same bracket scope (label_id). + // This prevents emphasis from spanning link boundaries. 
+ if opener.label_id != closer.label_id { + continue; + } - opener_stack_pos = Some(pos); - break; + if opener.kind != closer.kind || !opener.can_open || opener.count == 0 { + continue; } - if opener_stack_pos.is_some() { - break; + // Rule of 3: if (opener_count + closer_count) % 3 == 0 and + // the closer can open or the opener can close, skip unless + // both counts are divisible by 3 + let opener_count = opener.count; + let closer_count = closer.count; + if (opener.can_close || closer.can_open) + && !closer_count.is_multiple_of(3) + && (opener_count + closer_count).is_multiple_of(3) + { + continue; } + + opener_stack_pos = Some(pos); + break; } let Some(pos) = opener_stack_pos else { break }; @@ -375,7 +485,11 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { 1 }; - let opener_start = runs[opener_idx].start_offset; + // Openers consume from END of run (leftover stays at beginning). + // This ensures for `***foo***`, the inner `**` is consumed leaving `*` at start. + let opener_start = + runs[opener_idx].start_offset + runs[opener_idx].count - use_count; + // Closers consume from BEGINNING of what remains. let closer_start = runs[idx].start_offset; matches.push(EmphasisMatch { @@ -384,8 +498,9 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { is_strong: use_count == 2, }); + // Opener: reduce count but keep start_offset (leftover is at beginning) runs[opener_idx].count -= use_count; - runs[opener_idx].start_offset += use_count; + // Closer: reduce count and advance start_offset (leftover is at end) runs[idx].count -= use_count; runs[idx].start_offset += use_count; @@ -395,10 +510,10 @@ fn match_delimiters(runs: &mut [DelimRun]) -> Vec { opener_stack.pop(); } - if use_count == 2 && runs[opener_idx].count > 0 && runs[idx].count > 0 { - // Avoid crossing matches from odd-length runs (e.g. ***foo***). 
- break; - } + // Note: With the "consume from END" algorithm for openers, + // crossing matches are no longer an issue because the leftover + // chars end up at the beginning of the opener run (wrapping + // around the inner match), not at the end (which would cross). if runs[idx].count == 0 { break; @@ -426,10 +541,26 @@ pub(crate) struct EmphasisContext { base_offset: usize, } +/// Information about a match found within a token's range. +/// Used when the opener doesn't start at the exact token boundary. +#[derive(Debug)] +struct OpenerMatch<'a> { + /// The matched emphasis span + matched: &'a EmphasisMatch, + /// How many chars before opener_start (literal prefix to emit) + prefix_len: usize, +} + impl EmphasisContext { - /// Create a new emphasis context by analyzing the source text - pub(crate) fn new(source: &str, base_offset: usize) -> Self { - let mut runs = collect_delimiter_runs(source); + /// Create a new emphasis context by analyzing the source text. + /// The reference_checker function is used to determine if a bracket pattern + /// is a valid shortcut reference link. + pub(crate) fn new( + source: &str, + base_offset: usize, + reference_checker: impl Fn(&str) -> bool, + ) -> Self { + let mut runs = collect_delimiter_runs(source, reference_checker); let matches = match_delimiters(&mut runs); Self { matches, @@ -437,12 +568,44 @@ impl EmphasisContext { } } - /// Check if there's an emphasis opener at the given offset - fn opener_at(&self, offset: usize) -> Option<&EmphasisMatch> { - let abs_offset = offset; - self.matches - .iter() - .find(|m| m.opener_start + self.base_offset == abs_offset) + /// Find the *earliest* match whose opener_start is within [token_start, token_end) + /// and matches the expected `is_strong` value. + /// Returns None if no match found, or the match plus prefix length. 
+ /// + /// This is used instead of exact offset matching because with the "consume from END" + /// algorithm, an opener might start in the middle of a DOUBLE_STAR token. + fn opener_within( + &self, + token_start: usize, + token_len: usize, + expect_strong: bool, + ) -> Option> { + let token_end = token_start + token_len; + let mut best: Option> = None; + + for m in &self.matches { + // Filter by expected emphasis type + if m.is_strong != expect_strong { + continue; + } + + let abs_opener = m.opener_start + self.base_offset; + if abs_opener >= token_start && abs_opener < token_end { + let candidate = OpenerMatch { + matched: m, + prefix_len: abs_opener - token_start, + }; + // Pick the earliest match (smallest prefix_len) + if best + .as_ref() + .is_none_or(|b| candidate.prefix_len < b.prefix_len) + { + best = Some(candidate); + } + } + } + + best } } @@ -458,72 +621,143 @@ pub(crate) fn parse_hard_line(p: &mut MarkdownParser) -> ParsedSyntax { return Absent; } + let ends_block = p.lookahead(|p| { + p.bump(MD_HARD_LINE_LITERAL); + p.at(NEWLINE) || p.at(EOF) + }); + + if ends_block { + return super::parse_textual(p); + } + let m = p.start(); p.bump(MD_HARD_LINE_LITERAL); Present(m.complete(p, MD_HARD_LINE)) } +/// Check if there's a matching closing backtick sequence before EOF/blank line. +/// +/// Per CommonMark §6.1, a code span opener must have a matching closer with the +/// same number of backticks. If no match exists, the opener should be treated +/// as literal text, not an unclosed code span. +/// +/// Returns false if no match found (opener should become literal text). 
+ +fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) -> bool { + use crate::lexer::MarkdownLexContext; + + p.lookahead(|p| { + // Skip the opening backticks + p.bump(BACKTICK); + + loop { + // EOF = no matching closer found + if p.at(T![EOF]) { + return false; + } + + // Blank line = paragraph boundary, terminates search + if p.at(NEWLINE) && p.at_blank_line() { + return false; + } + + // Per CommonMark §4.3, setext heading underlines take priority over + // inline code spans. If crossing a newline would land on a setext + // underline, the code span is invalid — the underline forms a heading. + if p.at(NEWLINE) { + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); + if crate::syntax::at_setext_underline_after_newline(p).is_some() { + return false; + } + continue; + } + + // Found backticks - check if they match + if p.at(BACKTICK) { + let closing_count = p.cur_text().len(); + if closing_count == opening_count { + return true; + } + // Not matching - continue searching + p.bump(BACKTICK); + continue; + } + + // Consume token and continue (use CodeSpan context for proper backslash handling) + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); + } + }) +} + /// Parse inline code span (`` `code` `` or ``` `` `code` `` ```). /// /// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`' /// -/// Per CommonMark, code spans can use multiple backticks to allow literal -/// backticks inside: ``` `` `code` `` ``` wraps around code containing backticks. -/// The opening and closing backtick strings must be the same length. 
+/// Per CommonMark §6.1: +/// - Code spans can use multiple backticks to allow literal backticks inside +/// - The opening and closing backtick strings must be the same length +/// - Backslash escapes are NOT processed inside code spans (\` is literal `\``) +/// - If no matching closer exists, the opener is treated as literal text pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax { + use crate::lexer::MarkdownLexContext; + if !p.at(BACKTICK) { return Absent; } - let m = p.start(); - - // Count opening backticks from token text let opening_count = p.cur_text().len(); - let opening_range = p.cur_range(); + + // DESIGN PRINCIPLE #2 & #4: Check for matching closer BEFORE creating any nodes. + // If no match exists, return Absent so backticks become literal text. + // This avoids synthesizing MD_INLINE_CODE with missing r_tick_token. + if !has_matching_code_span_closer(p, opening_count) { + return Absent; // Caller will treat backtick as literal MD_TEXTUAL + } + + // We have a valid code span - now parse it + let m = p.start(); // Opening backtick(s) p.bump(BACKTICK); - // Content - parse until we find a BACKTICK with matching count, or EOF + // Content - parse until we find matching closing backticks + // Per CommonMark, code spans can span multiple lines (newlines become spaces in output) + // All content is lexed in CodeSpan context to keep backslash literal and avoid + // hard-line-break detection. 
let content = p.start(); - let mut found_closing = false; loop { - if p.at_inline_end() { + // EOF should not happen (lookahead guaranteed a closer), but handle defensively + if p.at(T![EOF]) { break; } - // Check for matching closing backticks - if p.at(BACKTICK) { - let closing_count = p.cur_text().len(); - if closing_count == opening_count { - // Found matching closing backticks - found_closing = true; - break; + // DESIGN PRINCIPLE #3: Terminate on blank line (paragraph boundary) + if p.at(NEWLINE) { + if p.at_blank_line() { + break; // Paragraph boundary - stop } - // Not matching - consume as content + // Soft line break - consume NEWLINE as content and continue + // Use CodeSpan context so next token is also lexed without escape processing let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); text_m.complete(p, MD_TEXTUAL); continue; } - // Regular content + // Found matching closing backticks + if p.at(BACKTICK) && p.cur_text().len() == opening_count { + break; + } + + // DESIGN PRINCIPLE #1: Use CodeSpan context so backslash is literal let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context(MD_TEXTUAL_LITERAL, MarkdownLexContext::CodeSpan); text_m.complete(p, MD_TEXTUAL); } content.complete(p, MD_INLINE_ITEM_LIST); - // Closing backtick(s) - emit custom diagnostic if missing - if found_closing { - p.bump(BACKTICK); - } else { - p.error(super::parse_error::unclosed_code_span( - p, - opening_range, - opening_count, - )); - } + // Closing backticks (guaranteed to exist due to lookahead check) + p.bump(BACKTICK); Present(m.complete(p, MD_INLINE_CODE)) } @@ -535,46 +769,95 @@ fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> P None => return Absent, }; - let offset = u32::from(p.cur_range().start()) as usize; - let matched = match context.opener_at(offset) { - Some(matched) => matched, + // Must be at an emphasis 
token + if !p.at(DOUBLE_STAR) && !p.at(DOUBLE_UNDERSCORE) && !p.at(T![*]) && !p.at(UNDERSCORE) { + return Absent; + } + + // Get current token info BEFORE any re-lex + let token_start = u32::from(p.cur_range().start()) as usize; + let token_len: usize = p.cur_range().len().into(); + + // Find match within current token's range that has the expected is_strong value + let opener_match = match context.opener_within(token_start, token_len, expect_strong) { + Some(m) => m, None => return Absent, }; - if matched.is_strong != expect_strong { + // If the opener doesn't start at the exact token boundary, return Absent. + // The caller (parse_any_inline) will emit literal text, advancing the parser position. + // On subsequent calls, we'll eventually be at the correct position with prefix_len == 0. + if opener_match.prefix_len > 0 { return Absent; } - let (opener_kind, closer_kind, opener_text) = if expect_strong { - if p.at(DOUBLE_STAR) { - (DOUBLE_STAR, DOUBLE_STAR, "**") - } else if p.at(DOUBLE_UNDERSCORE) { - (DOUBLE_UNDERSCORE, DOUBLE_UNDERSCORE, "__") - } else { - return Absent; - } - } else if p.at(T![*]) { - (T![*], T![*], "*") - } else if p.at(UNDERSCORE) { - (UNDERSCORE, UNDERSCORE, "_") - } else { - return Absent; + // Extract values before dropping the borrow on context + let use_count = if expect_strong { 2 } else { 1 }; + let closer_offset = opener_match.matched.closer_start + context.base_offset; + // Use the correct delimiter character for error messages + let is_underscore = p.at(DOUBLE_UNDERSCORE) || p.at(UNDERSCORE); + let opener_text = match (expect_strong, is_underscore) { + (true, true) => "__", + (true, false) => "**", + (false, true) => "_", + (false, false) => "*", }; - let closer_offset = matched.closer_start + context.base_offset; let m = p.start(); let opening_range = p.cur_range(); - p.bump(opener_kind); + // Consume opener tokens + // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one. 
+ // Only re-lex when we need to consume a partial token or single chars. + if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) { + // Bump the double token as a single unit + p.bump_any(); + } else { + // Consume individual tokens + for _ in 0..use_count { + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + p.bump_any(); + } + } + // Parse content until we reach the closer let content = p.start(); loop { - if p.at_inline_end() { + // EOF always ends content + if p.at(T![EOF]) { break; } let current_offset = u32::from(p.cur_range().start()) as usize; - if current_offset == closer_offset { + let current_len: usize = p.cur_range().len().into(); + + // Check if closer is AT or WITHIN current token + if closer_offset >= current_offset && closer_offset < current_offset + current_len { + break; + } + + // Check if we've passed the closer (can happen when link parsing consumes past it) + if current_offset > closer_offset { + break; + } + + // Handle NEWLINE: emphasis can span multiple lines per CommonMark + // But blank lines end paragraphs, so stop there + if p.at(NEWLINE) { + if p.at_blank_line() { + // Blank line = paragraph boundary, emphasis is unclosed + break; + } + if closer_offset > current_offset { + // Soft line break - consume NEWLINE as textual content and continue + let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + continue; + } + // Closer should have been at or before this newline - stop break; } @@ -584,9 +867,45 @@ fn parse_emphasis_from_context(p: &mut MarkdownParser, expect_strong: bool) -> P } content.complete(p, MD_INLINE_ITEM_LIST); - if p.at(closer_kind) && u32::from(p.cur_range().start()) as usize == closer_offset { - p.bump(closer_kind); + // Consume closer tokens (1 or 2) + // Handle partial closer consumption (e.g., `*foo**` where closer might be at offset 4 + // but token DOUBLE_STAR spans 4-6) + let current_offset = 
u32::from(p.cur_range().start()) as usize; + let closer_prefix_len = closer_offset.saturating_sub(current_offset); + + if closer_prefix_len > 0 { + // Closer starts AFTER token start - emit prefix as literal + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + for _ in 0..closer_prefix_len { + let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + } + } + + // Now consume actual closer delimiters + // For strong emphasis (use_count=2), we can bump DOUBLE_* directly if at one. + let mut consumed_closer = 0; + if use_count == 2 && (p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE)) { + p.bump_any(); + consumed_closer = 2; } else { + for _ in 0..use_count { + if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.force_relex_emphasis_inline(); + } + if p.at(T![*]) || p.at(UNDERSCORE) || p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + p.bump_any(); + consumed_closer += 1; + } else { + break; + } + } + } + + if consumed_closer < use_count { p.error(super::parse_error::unclosed_emphasis( p, opening_range, @@ -633,6 +952,31 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS break; } + // IMPORTANT: Parse constructs that can contain `]` BEFORE checking for stop token. + // Per CommonMark, `]` inside code spans, autolinks, and HTML doesn't terminate links. 
+ + // Code spans can contain `]` + if p.at(BACKTICK) { + if parse_inline_code(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // Autolinks and inline HTML can contain `]` + if p.at(L_ANGLE) { + if parse_autolink(p).is_present() { + continue; + } + if parse_inline_html(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // NOW check for stop token (after constructs that can contain it) if p.at(stop) { if bracket_depth == 0 { break; @@ -661,6 +1005,86 @@ fn parse_inline_item_list_until_no_links(p: &mut MarkdownParser, stop: MarkdownS has_nested_link } +/// Parse inline items until `stop` token, allowing full inline parsing including links. +/// Used for image alt text where nested links/images should be fully parsed +/// so their text content can be extracted for the alt attribute. +fn parse_inline_item_list_until(p: &mut MarkdownParser, stop: MarkdownSyntaxKind) { + let m = p.start(); + let prev_context = set_inline_emphasis_context_until(p, stop); + let mut bracket_depth = 0usize; + + loop { + if p.at(NEWLINE) { + if p.at_blank_line() { + break; + } + let _ = super::parse_textual(p); + continue; + } + + if p.at(T![EOF]) { + break; + } + + // Code spans can contain `]` + if p.at(BACKTICK) { + if parse_inline_code(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + // Autolinks and inline HTML can contain `]` + if p.at(L_ANGLE) { + if parse_autolink(p).is_present() { + continue; + } + if parse_inline_html(p).is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + if p.at(stop) { + if bracket_depth == 0 { + break; + } + bracket_depth = bracket_depth.saturating_sub(1); + let _ = super::parse_textual(p); + continue; + } + + // For image alt: allow full inline parsing including links and images + if p.at(L_BRACK) { + let result = parse_link_or_image(p, LinkParseKind::Link); + if result.is_present() { + continue; + } + 
bracket_depth += 1; + let _ = super::parse_textual(p); + continue; + } + + if p.at(BANG) && p.nth_at(1, L_BRACK) { + let result = parse_link_or_image(p, LinkParseKind::Image); + if result.is_present() { + continue; + } + let _ = super::parse_textual(p); + continue; + } + + if parse_any_inline(p).is_absent() { + break; + } + } + + m.complete(p, MD_INLINE_ITEM_LIST); + p.set_emphasis_context(prev_context); +} + fn nested_link_starts_here(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { if !p.at(L_BRACK) { @@ -720,7 +1144,10 @@ fn set_inline_emphasis_context_until( source }; let base_offset = u32::from(p.cur_range().start()) as usize; - let context = EmphasisContext::new(inline_source, base_offset); + // Create a reference checker closure that uses the parser's link reference definitions + let context = EmphasisContext::new(inline_source, base_offset, |label| { + p.has_link_reference_definition(label) + }); p.set_emphasis_context(Some(context)) } @@ -851,21 +1278,6 @@ impl LinkParseKind { } } - fn report_unclosed_text(self, p: &mut MarkdownParser, opening_range: TextRange) { - match self { - Self::Link => p.error(super::parse_error::unclosed_link( - p, - opening_range, - "expected `]` to close link text", - )), - Self::Image => p.error(super::parse_error::unclosed_image( - p, - opening_range, - "expected `]` to close alt text", - )), - } - } - fn report_unclosed_destination(self, p: &mut MarkdownParser, opening_range: TextRange) { match self { Self::Link => p.error(super::parse_error::unclosed_link( @@ -897,30 +1309,28 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn kind.bump_opening(p); // Link text / alt text - let has_nested_link = parse_inline_item_list_until_no_links(p, R_BRACK); + let has_nested_link = if matches!(kind, LinkParseKind::Image) { + // For images, allow full inline parsing (including links) in alt text. + // This lets nested links/images be parsed so their text can be extracted for alt. 
+ parse_inline_item_list_until(p, R_BRACK); + false + } else { + parse_inline_item_list_until_no_links(p, R_BRACK) + }; - // ] - if missing at inline end, emit diagnostic; otherwise rewind + // ] - if missing, rewind and treat [ as literal text. + // Per CommonMark, if there's no valid ] to close the link (e.g., all ] + // characters are inside code spans or HTML), the [ is literal text. + // NOTE: We intentionally do NOT emit an "unclosed link" diagnostic here. + // CommonMark treats unmatched `[` as literal text, not an error. if !p.eat(R_BRACK) { - if matches!(kind, LinkParseKind::Link) && has_nested_link { - m.abandon(p); - p.rewind(checkpoint); - return Absent; - } - if p.at_inline_end() { - // Unclosed link/image at end of inline content - emit diagnostic - // Expand range to include the text content, not just the opening bracket - let full_range = TextRange::new(opening_range.start(), p.cur_range().start()); - kind.report_unclosed_text(p, full_range); - // Return as reference link/image (shortcut) with missing closing bracket - return Present(m.complete(p, kind.reference_kind())); - } - // Not at inline end but missing ] - rewind and treat as text m.abandon(p); p.rewind(checkpoint); return Absent; } - let text_end_offset = p.cur_range().start(); + // Per CommonMark, a link (not image) whose text contains another link must fail. + // The inner link wins and the outer `[` becomes literal text. if matches!(kind, LinkParseKind::Link) && has_nested_link { m.abandon(p); p.rewind(checkpoint); @@ -1007,7 +1417,10 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, text_end_offset); + // Return Absent - the caller will treat `[` as textual. + // Don't consume the whole bracket sequence to avoid consuming + // past emphasis closers. 
+ return Absent; } Present(m.complete(p, kind.reference_kind())) @@ -1020,7 +1433,10 @@ fn parse_link_or_image(p: &mut MarkdownParser, kind: LinkParseKind) -> ParsedSyn { m.abandon(p); p.rewind(checkpoint); - return consume_textual_until_offset(p, text_end_offset); + // Return Absent - the caller will treat `[` as textual. + // Don't consume the whole bracket sequence to avoid consuming + // past emphasis closers. + return Absent; } Present(m.complete(p, kind.reference_kind())) } @@ -1064,7 +1480,7 @@ fn lookahead_reference_common( p.bump(L_BRACK); - let link_text = collect_bracket_text(p)?; + let link_text = collect_link_text(p)?; // Link text must be non-empty after normalization (e.g., `[\n ]` normalizes to empty) let normalized_link = normalize_reference_label(&link_text); @@ -1080,7 +1496,7 @@ fn lookahead_reference_common( if p.at(L_BRACK) { p.bump(L_BRACK); - let label_text = collect_bracket_text(p); + let label_text = collect_label_text_simple(p); if let Some(label_text) = label_text { let label = if label_text.is_empty() { link_text.clone() @@ -1107,13 +1523,31 @@ fn lookahead_reference_common( }) } -fn collect_bracket_text(p: &mut MarkdownParser) -> Option { +/// Collect text for a link label (e.g., the `label` in `[text][label]`). +/// +/// Per CommonMark §4.7, link labels have specific rules: +/// - Unescaped square brackets are NOT allowed inside labels (see example 555) +/// - Backslash escapes ARE allowed (e.g., `\]` is a literal `]` in the label) +/// - No inline parsing (backticks, HTML, etc. are literal characters) +/// +/// We stop at the first R_BRACK token (unescaped `]`). Escaped brackets like `\]` +/// are lexed as MD_TEXTUAL_LITERAL, not R_BRACK, so they're included in the label. 
+fn collect_label_text_simple(p: &mut MarkdownParser) -> Option { let mut text = String::new(); + loop { if p.at(T![EOF]) || p.at_inline_end() { return None; } + // Blank lines terminate + if p.at(NEWLINE) && p.at_blank_line() { + return None; + } + + // R_BRACK token = unescaped `]` closes the label. + // Note: Escaped brackets (`\]`) are lexed as MD_TEXTUAL_LITERAL, + // not R_BRACK, so they're correctly included in the label text. if p.at(R_BRACK) { return Some(text); } @@ -1123,18 +1557,94 @@ fn collect_bracket_text(p: &mut MarkdownParser) -> Option { } } -fn consume_textual_until_offset(p: &mut MarkdownParser, end_offset: TextSize) -> ParsedSyntax { - let mut last = Absent; +/// Collect text for link text (e.g., the `text` in `[text](url)` or `[text][label]`). +/// Per CommonMark, link text CAN contain inline elements - code spans, autolinks, HTML. +/// `]` inside these constructs does NOT close the link text. +fn collect_link_text(p: &mut MarkdownParser) -> Option { + let mut text = String::new(); + let mut bracket_depth = 0usize; - while !p.at(T![EOF]) { - let end = p.cur_range().end(); - last = super::parse_textual(p); - if end >= end_offset { - break; + loop { + if p.at(T![EOF]) || p.at_inline_end() { + return None; } - } - last + // Per CommonMark, blank lines terminate link text + if p.at(NEWLINE) && p.at_blank_line() { + return None; + } + + // Code spans can contain `]` - skip them entirely. + // Per CommonMark, `]` inside code spans doesn't terminate link text. 
+ if p.at(BACKTICK) { + let opening_count = p.cur_text().len(); + text.push_str(p.cur_text()); + p.bump(p.cur()); + + // Find matching closing backticks + let mut found_close = false; + while !p.at(T![EOF]) && !p.at_inline_end() { + if p.at(NEWLINE) && p.at_blank_line() { + break; // Blank line terminates + } + if p.at(BACKTICK) && p.cur_text().len() == opening_count { + text.push_str(p.cur_text()); + p.bump(p.cur()); + found_close = true; + break; + } + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + if !found_close { + // Unclosed code span - treat opening backticks as literal + // (already added to text, continue normally) + } + continue; + } + + // Autolinks and inline HTML can contain `]` - skip them entirely. + // Per CommonMark, `]` inside `<...>` constructs doesn't terminate link text. + if p.at(L_ANGLE) { + text.push_str(p.cur_text()); + p.bump(p.cur()); + + // Consume until `>` or newline + while !p.at(T![EOF]) && !p.at_inline_end() && !p.at(R_ANGLE) { + if p.at(NEWLINE) { + // Newlines end autolinks/HTML tags + break; + } + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + if p.at(R_ANGLE) { + text.push_str(p.cur_text()); + p.bump(p.cur()); + } + continue; + } + + if p.at(L_BRACK) { + bracket_depth += 1; + text.push_str(p.cur_text()); + p.bump(p.cur()); + continue; + } + + if p.at(R_BRACK) { + if bracket_depth == 0 { + return Some(text); + } + bracket_depth -= 1; + text.push_str(p.cur_text()); + p.bump(p.cur()); + continue; + } + + text.push_str(p.cur_text()); + p.bump(p.cur()); + } } fn bump_textual_link_def(p: &mut MarkdownParser) { @@ -1522,19 +2032,30 @@ pub(crate) fn parse_inline_image(p: &mut MarkdownParser) -> ParsedSyntax { /// - Processing instructions: `` /// - Declarations: `` /// - CDATA: `` -fn is_inline_html(text: &str) -> Option { +pub(crate) fn is_inline_html(text: &str) -> Option { let bytes = text.as_bytes(); if bytes.len() < 2 || bytes[0] != b'<' { return None; } // HTML comment: + // Per CommonMark 0.31.2 §6.8, an HTML 
comment consists of ``, + // where text does not start with `>` or `->`, and does not end with `-`. + // Additionally, `` and `` are valid (degenerate) comments. if bytes.starts_with(b" + let rest = &bytes[4..]; + // Handle degenerate comments: and + if rest.starts_with(b">") { + return Some(5); // + } + if rest.starts_with(b"->") { + return Some(6); // + } + // Find closing --> after ") { let body = &text[4..4 + pos]; - // CommonMark: comment cannot start with '>' or '->', and must not contain "--" - if body.starts_with('>') || body.starts_with("->") || body.contains("--") { + // Body must not end with '-' + if body.ends_with('-') { return None; } return Some(4 + pos + 3); @@ -1658,8 +2179,17 @@ fn is_inline_html(text: &str) -> Option { let is_attr_name_continue = |b: u8| b.is_ascii_alphanumeric() || b == b'_' || b == b':' || b == b'.' || b == b'-'; + let mut need_space = true; + // We already know the boundary char was whitespace, so first iteration has space. + let mut had_space = true; + loop { - let had_space = skip_spaces(&mut i)?; + if need_space { + let s = skip_spaces(&mut i)?; + had_space = had_space || s; + } + need_space = true; + if i >= bytes.len() { return None; } @@ -1690,7 +2220,7 @@ fn is_inline_html(text: &str) -> Option { } // Optional whitespace and value - skip_spaces(&mut i)?; + had_space = skip_spaces(&mut i)?; if i < bytes.len() && bytes[i] == b'=' { i += 1; skip_spaces(&mut i)?; @@ -1740,7 +2270,11 @@ fn is_inline_html(text: &str) -> Option { } } } + // After value, need to find whitespace at top of loop + had_space = false; } + // If no '=' was found, `had_space` from skip_spaces above carries over + // as the separator for the next attribute (boolean attribute case). } } @@ -1785,6 +2319,12 @@ pub(crate) fn parse_inline_html(p: &mut MarkdownParser) -> ParsedSyntax { None => return Absent, }; + // Per CommonMark §4.3, setext heading underlines take priority over inline HTML. 
+ // If this HTML tag spans across a line that is a setext underline, treat `<` as literal. + if crate::syntax::inline_span_crosses_setext(p, html_len) { + return Absent; + } + // Valid inline HTML - create the node // Use checkpoint so we can rewind if token boundaries don't align let checkpoint = p.checkpoint(); @@ -1957,18 +2497,29 @@ pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax { // < p.bump(L_ANGLE); - // Content as inline item list containing textual nodes - let content = p.start(); - while !p.at(R_ANGLE) && !p.at_inline_end() { + // Content as inline item list containing textual nodes. + // Autolinks don't process backslash escapes, but the lexer may combine + // `\>` into a single escape token. We re-lex in CodeSpan context where + // backslash is literal, so `\` and `>` are separate tokens. + p.force_relex_code_span(); + + let content_m = p.start(); + while !p.at(R_ANGLE) && !p.at(T![EOF]) && !p.at_inline_end() { let text_m = p.start(); - p.bump_remap(MD_TEXTUAL_LITERAL); + p.bump_remap_with_context( + MD_TEXTUAL_LITERAL, + crate::lexer::MarkdownLexContext::CodeSpan, + ); text_m.complete(p, MD_TEXTUAL); } - content.complete(p, MD_INLINE_ITEM_LIST); + content_m.complete(p, MD_INLINE_ITEM_LIST); // > p.expect(R_ANGLE); + // Re-lex back to regular context + p.force_relex_regular(); + Present(m.complete(p, MD_AUTOLINK)) } @@ -1977,15 +2528,29 @@ pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax { if p.at(MD_HARD_LINE_LITERAL) { parse_hard_line(p) } else if p.at(BACKTICK) { - parse_inline_code(p) - } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { - // Try emphasis, fall back to literal text if flanking rules fail - let result = parse_inline_emphasis(p); + // Try code span, fall back to literal text if no matching closer exists + let result = parse_inline_code(p); if result.is_absent() { super::parse_textual(p) } else { result } + } else if p.at(DOUBLE_STAR) || p.at(DOUBLE_UNDERSCORE) { + // For cases like 
`***foo***`, the em match starts at the exact token boundary + // (prefix_len=0) while the strong match starts at offset 1 (prefix_len=1). + // Try italic first to handle nested emphasis correctly, then try strong. + let result = parse_inline_italic(p); + if result.is_present() { + return result; + } + let result = parse_inline_emphasis(p); + if result.is_present() { + return result; + } + // Neither matched - re-lex to single token and emit just one char as literal. + // This handles cases like `**foo*` where opener is at offset 1. + p.force_relex_emphasis_inline(); + super::parse_textual(p) } else if p.at(T![*]) || p.at(UNDERSCORE) { // Try italic, fall back to literal text if flanking rules fail let result = parse_inline_italic(p); diff --git a/crates/biome_markdown_parser/src/syntax/parse_error.rs b/crates/biome_markdown_parser/src/syntax/parse_error.rs index 8f864cc0ac88..97ce5288ac56 100644 --- a/crates/biome_markdown_parser/src/syntax/parse_error.rs +++ b/crates/biome_markdown_parser/src/syntax/parse_error.rs @@ -29,26 +29,6 @@ pub(crate) fn unclosed_emphasis( )) } -/// Unclosed inline code span. -/// -/// ```markdown -/// `code -/// ^ expected closing ` -/// ``` -pub(crate) fn unclosed_code_span( - p: &MarkdownParser, - opening_range: TextRange, - backtick_count: usize, -) -> ParseDiagnostic { - let backticks = "`".repeat(backtick_count); - p.err_builder( - format!("Unclosed code span, expected closing {backticks}."), - opening_range, - ) - .with_detail(opening_range, "code span started here") - .with_hint(format!("Add closing {backticks} to close the code span.")) -} - /// Unclosed inline link. 
/// /// ```markdown diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index 0512db55febe..90d75ad4127a 100644 --- a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -1047,9 +1047,7 @@ fn render_inline_link(link: &MdInlineLink, ctx: &HtmlRenderContext, out: &mut St /// Render an inline image. fn render_inline_image(img: &MdInlineImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = render_inline_list(&img.alt(), ctx); - // Strip HTML tags from alt text - let alt = strip_html_tags(&alt); + let alt = extract_alt_text(&img.alt(), ctx); let dest = collect_inline_text(&img.destination()); let dest = process_link_destination(&dest); @@ -1109,8 +1107,7 @@ fn render_reference_link(link: &MdReferenceLink, ctx: &HtmlRenderContext, out: & /// Render a reference image. fn render_reference_image(img: &MdReferenceImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = render_inline_list(&img.alt(), ctx); - let alt = strip_html_tags(&alt); + let alt = extract_alt_text(&img.alt(), ctx); let alt_raw = collect_inline_text(&img.alt()); render_reference_common( @@ -1198,10 +1195,12 @@ fn render_autolink(autolink: &MdAutolink, out: &mut String) { // Check if it's an email autolink let is_email = content.contains('@') && !content.contains(':'); + // Autolinks must NOT process backslash escapes or entity decoding. + // Only percent-encode for URL safety. let href = if is_email { format!("mailto:{}", content) } else { - process_link_destination(&content) + percent_encode_uri(&content) }; out.push_str("
String { escape_html(text) } -/// Strip HTML tags from text (for image alt text). -fn strip_html_tags(text: &str) -> String { +/// Extract plain text for image alt attribute. +/// Per CommonMark, the alt text is the content with inline formatting stripped +/// but text from nested links/images preserved (recursively extracting their text). +fn extract_alt_text( + list: &biome_markdown_syntax::MdInlineItemList, + ctx: &HtmlRenderContext, +) -> String { let mut result = String::new(); - let mut in_tag = false; + for item in list.iter() { + extract_alt_text_inline(&item, ctx, &mut result); + } + result +} - for c in text.chars() { - if c == '<' { - in_tag = true; - } else if c == '>' { - in_tag = false; - } else if !in_tag { - result.push(c); +fn extract_alt_text_inline(inline: &AnyMdInline, ctx: &HtmlRenderContext, out: &mut String) { + match inline { + AnyMdInline::MdTextual(text) => { + render_textual(text, out); + } + AnyMdInline::MdInlineEmphasis(em) => { + out.push_str(&extract_alt_text(&em.content(), ctx)); + } + AnyMdInline::MdInlineItalic(italic) => { + out.push_str(&extract_alt_text(&italic.content(), ctx)); + } + AnyMdInline::MdInlineCode(code) => { + // Plain text only — no tags for alt attribute + let content = collect_raw_inline_text(&code.content()); + let content = content.replace('\n', " "); + let content = if content.starts_with(' ') + && content.ends_with(' ') + && content.len() > 2 + && content.chars().any(|c| c != ' ') + { + content[1..content.len() - 1].to_string() + } else { + content + }; + out.push_str(&escape_html(&content)); + } + AnyMdInline::MdInlineLink(link) => { + // Extract text content from link text + out.push_str(&extract_alt_text(&link.text(), ctx)); + } + AnyMdInline::MdInlineImage(img) => { + // Recursively extract alt text from nested image + out.push_str(&extract_alt_text(&img.alt(), ctx)); + } + AnyMdInline::MdReferenceLink(link) => { + out.push_str(&extract_alt_text(&link.text(), ctx)); + } + 
AnyMdInline::MdReferenceImage(img) => { + out.push_str(&extract_alt_text(&img.alt(), ctx)); + } + AnyMdInline::MdAutolink(autolink) => { + let content = collect_raw_inline_text(&autolink.value()); + out.push_str(&escape_html(&content)); + } + AnyMdInline::MdHardLine(_) | AnyMdInline::MdSoftBreak(_) => { + out.push(' '); + } + AnyMdInline::MdEntityReference(entity) => { + render_entity_reference(entity, out); + } + AnyMdInline::MdInlineHtml(_) | AnyMdInline::MdHtmlBlock(_) => { + // HTML tags are stripped in alt text } } - - result } // ============================================================================ @@ -1590,6 +1641,80 @@ mod tests { assert_eq!(html, "

italic and bold

\n"); } + #[test] + fn test_emphasis_complex_cases() { + // Test: Nested + let parsed = parse_markdown("**bold *and italic* text**\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Nested failed: {}", + parsed.syntax() + ); + + // Test: Rule of 3 + let parsed = parse_markdown("***bold italic***\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Rule of 3 failed: {}", + parsed.syntax() + ); + + // Test: Multiple runs + let parsed = parse_markdown("*a **b** c*\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Multiple runs failed: {}", + parsed.syntax() + ); + + // Test: Overlapping + let parsed = parse_markdown("*foo**bar**baz*\n"); + assert_eq!( + parsed.syntax().kind(), + biome_markdown_syntax::MarkdownSyntaxKind::MD_DOCUMENT, + "Overlapping failed: {}", + parsed.syntax() + ); + + // Test: Unbalanced emphasis (CommonMark example 442) + // **foo* should produce *<em>foo</em> + let parsed = parse_markdown("**foo*\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

<p>*<em>foo</em></p>

\n", + "Unbalanced: {}", + parsed.syntax() + ); + } + + #[test] + fn test_example_431() { + // Test: Example 431 - nested emphasis with triple star closer + // **foo *bar*** should produce <strong>foo <em>bar</em></strong> + let parsed = parse_markdown("**foo *bar***\n"); + let html = document_to_html( + &parsed.tree(), + parsed.list_tightness(), + parsed.list_item_indents(), + parsed.quote_indents(), + ); + assert_eq!( + html, + "

<p><strong>foo <em>bar</em></strong></p>

\n", + "Example 431: {}", + parsed.syntax() + ); + } + #[test] fn test_escape_html() { assert_eq!(escape_html("a & b < c > d"), "a & b < c > d"); diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap deleted file mode 100644 index c5e71c93f27d..000000000000 --- a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md.snap +++ /dev/null @@ -1,86 +0,0 @@ ---- -source: crates/biome_markdown_parser/tests/spec_test.rs -expression: snapshot ---- -## Input - -``` -This has `unclosed code - -``` - - -## AST - -``` -MdDocument { - bom_token: missing (optional), - value: MdBlockList [ - MdParagraph { - list: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], - }, - MdInlineCode { - l_tick_token: BACKTICK@9..10 "`" [] [], - content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [], - }, - ], - r_tick_token: missing (required), - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], - }, - ], - hard_line: missing (optional), - }, - ], - eof_token: EOF@24..24 "" [] [], -} -``` - -## CST - -``` -0: MD_DOCUMENT@0..24 - 0: (empty) - 1: MD_BLOCK_LIST@0..24 - 0: MD_PARAGRAPH@0..24 - 0: MD_INLINE_ITEM_LIST@0..24 - 0: MD_TEXTUAL@0..9 - 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_INLINE_CODE@9..23 - 0: BACKTICK@9..10 "`" [] [] - 1: MD_INLINE_ITEM_LIST@10..23 - 0: MD_TEXTUAL@10..23 - 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [] - 2: (empty) - 2: MD_TEXTUAL@23..24 - 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] - 1: (empty) - 2: EOF@24..24 "" [] [] - -``` - -## Diagnostics - -``` -unclosed_code_span.md:1:10 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - - × Unclosed code span, expected closing `. 
- - > 1 │ This has `unclosed code - │ ^ - 2 │ - - i code span started here - - > 1 │ This has `unclosed code - │ ^ - 2 │ - - i Add closing ` to close the code span. - -``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap index 10e49ec3d451..f0790723e0d0 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_complex.md.snap @@ -103,20 +103,20 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] [], }, - MdInlineEmphasis { - l_fence: DOUBLE_STAR@77..79 "**" [] [], + MdInlineItalic { + l_fence: STAR@77..78 "*" [] [], content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@79..80 "*" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [], + MdInlineEmphasis { + l_fence: DOUBLE_STAR@78..80 "**" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [], + }, + ], + r_fence: DOUBLE_STAR@91..93 "**" [] [], }, ], - r_fence: DOUBLE_STAR@91..93 "**" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@93..94 "*" [] [], + r_fence: STAR@93..94 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@94..95 "\n" [] [], @@ -301,17 +301,17 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@66..95 0: MD_TEXTUAL@66..77 0: MD_TEXTUAL_LITERAL@66..77 "Rule of 3: " [] [] - 1: MD_INLINE_EMPHASIS@77..93 - 0: DOUBLE_STAR@77..79 "**" [] [] - 1: MD_INLINE_ITEM_LIST@79..91 - 0: MD_TEXTUAL@79..80 - 0: MD_TEXTUAL_LITERAL@79..80 "*" [] [] - 1: MD_TEXTUAL@80..91 - 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [] - 2: DOUBLE_STAR@91..93 "**" [] [] - 2: MD_TEXTUAL@93..94 - 0: MD_TEXTUAL_LITERAL@93..94 "*" [] [] - 3: MD_TEXTUAL@94..95 + 1: MD_INLINE_ITALIC@77..94 + 0: STAR@77..78 "*" [] [] + 1: MD_INLINE_ITEM_LIST@78..93 + 0: MD_INLINE_EMPHASIS@78..93 
+ 0: DOUBLE_STAR@78..80 "**" [] [] + 1: MD_INLINE_ITEM_LIST@80..91 + 0: MD_TEXTUAL@80..91 + 0: MD_TEXTUAL_LITERAL@80..91 "bold italic" [] [] + 2: DOUBLE_STAR@91..93 "**" [] [] + 2: STAR@93..94 "*" [] [] + 2: MD_TEXTUAL@94..95 0: MD_TEXTUAL_LITERAL@94..95 "\n" [] [] 1: (empty) 5: MD_NEWLINE@95..96 diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap index bdbaf86e823d..94116b93fa48 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/emphasis_crossing.md.snap @@ -21,26 +21,32 @@ MdDocument { MdTextual { value_token: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] [], }, - MdInlineEmphasis { - l_fence: DOUBLE_STAR@10..12 "**" [] [], + MdInlineItalic { + l_fence: STAR@10..11 "*" [] [], content: MdInlineItemList [ - MdTextual { - value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@14..15 "*" [] [], + MdInlineItalic { + l_fence: STAR@11..12 "*" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@12..14 "a " [] [], + }, + MdInlineItalic { + l_fence: STAR@14..15 "*" [] [], + content: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [], + }, + ], + r_fence: STAR@16..17 "*" [] [], + }, + ], + r_fence: STAR@17..18 "*" [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@15..16 "b" [] [], + value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [], }, ], - r_fence: DOUBLE_STAR@16..18 "**" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@18..20 " c" [] [], - }, - MdTextual { - value_token: MD_TEXTUAL_LITERAL@20..21 "*" [] [], + r_fence: STAR@20..21 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@21..22 "\n" [] [], @@ -63,21 +69,25 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@0..22 0: MD_TEXTUAL@0..10 0: MD_TEXTUAL_LITERAL@0..10 "Crossing: " [] [] - 
1: MD_INLINE_EMPHASIS@10..18 - 0: DOUBLE_STAR@10..12 "**" [] [] - 1: MD_INLINE_ITEM_LIST@12..16 - 0: MD_TEXTUAL@12..14 - 0: MD_TEXTUAL_LITERAL@12..14 "a " [] [] - 1: MD_TEXTUAL@14..15 - 0: MD_TEXTUAL_LITERAL@14..15 "*" [] [] - 2: MD_TEXTUAL@15..16 - 0: MD_TEXTUAL_LITERAL@15..16 "b" [] [] - 2: DOUBLE_STAR@16..18 "**" [] [] - 2: MD_TEXTUAL@18..20 - 0: MD_TEXTUAL_LITERAL@18..20 " c" [] [] - 3: MD_TEXTUAL@20..21 - 0: MD_TEXTUAL_LITERAL@20..21 "*" [] [] - 4: MD_TEXTUAL@21..22 + 1: MD_INLINE_ITALIC@10..21 + 0: STAR@10..11 "*" [] [] + 1: MD_INLINE_ITEM_LIST@11..20 + 0: MD_INLINE_ITALIC@11..18 + 0: STAR@11..12 "*" [] [] + 1: MD_INLINE_ITEM_LIST@12..17 + 0: MD_TEXTUAL@12..14 + 0: MD_TEXTUAL_LITERAL@12..14 "a " [] [] + 1: MD_INLINE_ITALIC@14..17 + 0: STAR@14..15 "*" [] [] + 1: MD_INLINE_ITEM_LIST@15..16 + 0: MD_TEXTUAL@15..16 + 0: MD_TEXTUAL_LITERAL@15..16 "b" [] [] + 2: STAR@16..17 "*" [] [] + 2: STAR@17..18 "*" [] [] + 1: MD_TEXTUAL@18..20 + 0: MD_TEXTUAL_LITERAL@18..20 " c" [] [] + 2: STAR@20..21 "*" [] [] + 2: MD_TEXTUAL@21..22 0: MD_TEXTUAL_LITERAL@21..22 "\n" [] [] 1: (empty) 2: EOF@22..22 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap index 1183219b2b94..abc0b3d4823f 100644 --- a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_bold.md.snap @@ -22,7 +22,10 @@ MdDocument { value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], }, MdTextual { - value_token: MD_TEXTUAL_LITERAL@9..11 "**" [] [], + value_token: MD_TEXTUAL_LITERAL@9..10 "*" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..11 "*" [] [], }, MdTextual { value_token: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] [], @@ -48,11 +51,13 @@ MdDocument { 0: MD_INLINE_ITEM_LIST@0..25 0: MD_TEXTUAL@0..9 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] - 1: MD_TEXTUAL@9..11 - 0: 
MD_TEXTUAL_LITERAL@9..11 "**" [] [] - 2: MD_TEXTUAL@11..24 + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 "*" [] [] + 2: MD_TEXTUAL@10..11 + 0: MD_TEXTUAL_LITERAL@10..11 "*" [] [] + 3: MD_TEXTUAL@11..24 0: MD_TEXTUAL_LITERAL@11..24 "unclosed bold" [] [] - 3: MD_TEXTUAL@24..25 + 4: MD_TEXTUAL@24..25 0: MD_TEXTUAL_LITERAL@24..25 "\n" [] [] 1: (empty) 2: EOF@25..25 "" [] [] diff --git a/crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md similarity index 100% rename from crates/biome_markdown_parser/tests/md_test_suite/error/unclosed_code_span.md rename to crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap new file mode 100644 index 000000000000..eefa654d5e99 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/unclosed_code_span.md.snap @@ -0,0 +1,60 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +``` +This has `unclosed code + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@0..9 "This has " [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@9..10 "`" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@23..24 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + eof_token: EOF@24..24 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..24 + 0: (empty) + 1: MD_BLOCK_LIST@0..24 + 0: MD_PARAGRAPH@0..24 + 0: MD_INLINE_ITEM_LIST@0..24 + 0: MD_TEXTUAL@0..9 + 0: MD_TEXTUAL_LITERAL@0..9 "This has " [] [] + 1: MD_TEXTUAL@9..10 + 0: MD_TEXTUAL_LITERAL@9..10 
"`" [] [] + 2: MD_TEXTUAL@10..23 + 0: MD_TEXTUAL_LITERAL@10..23 "unclosed code" [] [] + 3: MD_TEXTUAL@23..24 + 0: MD_TEXTUAL_LITERAL@23..24 "\n" [] [] + 1: (empty) + 2: EOF@24..24 "" [] [] + +``` diff --git a/crates/biome_unicode_table/src/lib.rs b/crates/biome_unicode_table/src/lib.rs index 3d2c3a82cd4c..9a5495ebc4af 100644 --- a/crates/biome_unicode_table/src/lib.rs +++ b/crates/biome_unicode_table/src/lib.rs @@ -4,9 +4,11 @@ use crate::bytes::DISPATCHER; use crate::tables::derived_property::{ID_Continue, ID_Start}; mod bytes; +mod punctuation; mod tables; pub use crate::bytes::Dispatch; +pub use crate::punctuation::is_unicode_punctuation; /// Tests if `c` is a valid start of a CSS identifier #[inline] diff --git a/crates/biome_unicode_table/src/punctuation.rs b/crates/biome_unicode_table/src/punctuation.rs new file mode 100644 index 000000000000..c3575a823d6b --- /dev/null +++ b/crates/biome_unicode_table/src/punctuation.rs @@ -0,0 +1,413 @@ +//! CommonMark Unicode punctuation table. +//! +//! Derived from the markdown-rs Unicode punctuation list used for CommonMark. +//! Per CommonMark, "Unicode punctuation" includes characters from both the +//! General_Category=Punctuation (P*) and General_Category=Symbol (S*) categories. +//! This is used for CommonMark flanking rules in emphasis parsing. + +// Note: duplicated from generated unicode tables to keep this module standalone. 
+#[inline] +fn bsearch_range_table(c: char, r: &[(char, char)]) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + r.binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() +} + +const PUNCTUATION_RANGES: &[(char, char)] = &[ + ('\u{0021}', '\u{002F}'), + ('\u{003A}', '\u{0040}'), + ('\u{005B}', '\u{0060}'), + ('\u{007B}', '\u{007E}'), + ('\u{00A1}', '\u{00A9}'), + ('\u{00AB}', '\u{00AC}'), + ('\u{00AE}', '\u{00B1}'), + ('\u{00B4}', '\u{00B4}'), + ('\u{00B6}', '\u{00B8}'), + ('\u{00BB}', '\u{00BB}'), + ('\u{00BF}', '\u{00BF}'), + ('\u{00D7}', '\u{00D7}'), + ('\u{00F7}', '\u{00F7}'), + ('\u{02C2}', '\u{02C5}'), + ('\u{02D2}', '\u{02DF}'), + ('\u{02E5}', '\u{02EB}'), + ('\u{02ED}', '\u{02ED}'), + ('\u{02EF}', '\u{02FF}'), + ('\u{0375}', '\u{0375}'), + ('\u{037E}', '\u{037E}'), + ('\u{0384}', '\u{0385}'), + ('\u{0387}', '\u{0387}'), + ('\u{03F6}', '\u{03F6}'), + ('\u{0482}', '\u{0482}'), + ('\u{055A}', '\u{055F}'), + ('\u{0589}', '\u{058A}'), + ('\u{058D}', '\u{058F}'), + ('\u{05BE}', '\u{05BE}'), + ('\u{05C0}', '\u{05C0}'), + ('\u{05C3}', '\u{05C3}'), + ('\u{05C6}', '\u{05C6}'), + ('\u{05F3}', '\u{05F4}'), + ('\u{0606}', '\u{060F}'), + ('\u{061B}', '\u{061B}'), + ('\u{061D}', '\u{061F}'), + ('\u{066A}', '\u{066D}'), + ('\u{06D4}', '\u{06D4}'), + ('\u{06DE}', '\u{06DE}'), + ('\u{06E9}', '\u{06E9}'), + ('\u{06FD}', '\u{06FE}'), + ('\u{0700}', '\u{070D}'), + ('\u{07F6}', '\u{07F9}'), + ('\u{07FE}', '\u{07FF}'), + ('\u{0830}', '\u{083E}'), + ('\u{085E}', '\u{085E}'), + ('\u{0888}', '\u{0888}'), + ('\u{0964}', '\u{0965}'), + ('\u{0970}', '\u{0970}'), + ('\u{09F2}', '\u{09F3}'), + ('\u{09FA}', '\u{09FB}'), + ('\u{09FD}', '\u{09FD}'), + ('\u{0A76}', '\u{0A76}'), + ('\u{0AF0}', '\u{0AF1}'), + ('\u{0B70}', '\u{0B70}'), + ('\u{0BF3}', '\u{0BFA}'), + ('\u{0C77}', '\u{0C77}'), + ('\u{0C7F}', '\u{0C7F}'), + ('\u{0C84}', '\u{0C84}'), + ('\u{0D4F}', '\u{0D4F}'), + ('\u{0D79}', '\u{0D79}'), + ('\u{0DF4}', 
'\u{0DF4}'), + ('\u{0E3F}', '\u{0E3F}'), + ('\u{0E4F}', '\u{0E4F}'), + ('\u{0E5A}', '\u{0E5B}'), + ('\u{0F01}', '\u{0F17}'), + ('\u{0F1A}', '\u{0F1F}'), + ('\u{0F34}', '\u{0F34}'), + ('\u{0F36}', '\u{0F36}'), + ('\u{0F38}', '\u{0F38}'), + ('\u{0F3A}', '\u{0F3D}'), + ('\u{0F85}', '\u{0F85}'), + ('\u{0FBE}', '\u{0FC5}'), + ('\u{0FC7}', '\u{0FCC}'), + ('\u{0FCE}', '\u{0FDA}'), + ('\u{104A}', '\u{104F}'), + ('\u{109E}', '\u{109F}'), + ('\u{10FB}', '\u{10FB}'), + ('\u{1360}', '\u{1368}'), + ('\u{1390}', '\u{1399}'), + ('\u{1400}', '\u{1400}'), + ('\u{166D}', '\u{166E}'), + ('\u{169B}', '\u{169C}'), + ('\u{16EB}', '\u{16ED}'), + ('\u{1735}', '\u{1736}'), + ('\u{17D4}', '\u{17D6}'), + ('\u{17D8}', '\u{17DB}'), + ('\u{1800}', '\u{180A}'), + ('\u{1940}', '\u{1940}'), + ('\u{1944}', '\u{1945}'), + ('\u{19DE}', '\u{19FF}'), + ('\u{1A1E}', '\u{1A1F}'), + ('\u{1AA0}', '\u{1AA6}'), + ('\u{1AA8}', '\u{1AAD}'), + ('\u{1B4E}', '\u{1B4F}'), + ('\u{1B5A}', '\u{1B6A}'), + ('\u{1B74}', '\u{1B7F}'), + ('\u{1BFC}', '\u{1BFF}'), + ('\u{1C3B}', '\u{1C3F}'), + ('\u{1C7E}', '\u{1C7F}'), + ('\u{1CC0}', '\u{1CC7}'), + ('\u{1CD3}', '\u{1CD3}'), + ('\u{1FBD}', '\u{1FBD}'), + ('\u{1FBF}', '\u{1FC1}'), + ('\u{1FCD}', '\u{1FCF}'), + ('\u{1FDD}', '\u{1FDF}'), + ('\u{1FED}', '\u{1FEF}'), + ('\u{1FFD}', '\u{1FFE}'), + ('\u{2010}', '\u{2027}'), + ('\u{2030}', '\u{205E}'), + ('\u{207A}', '\u{207E}'), + ('\u{208A}', '\u{208E}'), + ('\u{20A0}', '\u{20C0}'), + ('\u{2100}', '\u{2101}'), + ('\u{2103}', '\u{2106}'), + ('\u{2108}', '\u{2109}'), + ('\u{2114}', '\u{2114}'), + ('\u{2116}', '\u{2118}'), + ('\u{211E}', '\u{2123}'), + ('\u{2125}', '\u{2125}'), + ('\u{2127}', '\u{2127}'), + ('\u{2129}', '\u{2129}'), + ('\u{212E}', '\u{212E}'), + ('\u{213A}', '\u{213B}'), + ('\u{2140}', '\u{2144}'), + ('\u{214A}', '\u{214D}'), + ('\u{214F}', '\u{214F}'), + ('\u{218A}', '\u{218B}'), + ('\u{2190}', '\u{2429}'), + ('\u{2440}', '\u{244A}'), + ('\u{249C}', '\u{24E9}'), + ('\u{2500}', '\u{2775}'), + ('\u{2794}', 
'\u{2B73}'), + ('\u{2B76}', '\u{2B95}'), + ('\u{2B97}', '\u{2BFF}'), + ('\u{2CE5}', '\u{2CEA}'), + ('\u{2CF9}', '\u{2CFC}'), + ('\u{2CFE}', '\u{2CFF}'), + ('\u{2D70}', '\u{2D70}'), + ('\u{2E00}', '\u{2E2E}'), + ('\u{2E30}', '\u{2E5D}'), + ('\u{2E80}', '\u{2E99}'), + ('\u{2E9B}', '\u{2EF3}'), + ('\u{2F00}', '\u{2FD5}'), + ('\u{2FF0}', '\u{2FFF}'), + ('\u{3001}', '\u{3004}'), + ('\u{3008}', '\u{3020}'), + ('\u{3030}', '\u{3030}'), + ('\u{3036}', '\u{3037}'), + ('\u{303D}', '\u{303F}'), + ('\u{309B}', '\u{309C}'), + ('\u{30A0}', '\u{30A0}'), + ('\u{30FB}', '\u{30FB}'), + ('\u{3190}', '\u{3191}'), + ('\u{3196}', '\u{319F}'), + ('\u{31C0}', '\u{31E5}'), + ('\u{31EF}', '\u{31EF}'), + ('\u{3200}', '\u{321E}'), + ('\u{322A}', '\u{3247}'), + ('\u{3250}', '\u{3250}'), + ('\u{3260}', '\u{327F}'), + ('\u{328A}', '\u{32B0}'), + ('\u{32C0}', '\u{33FF}'), + ('\u{4DC0}', '\u{4DFF}'), + ('\u{A490}', '\u{A4C6}'), + ('\u{A4FE}', '\u{A4FF}'), + ('\u{A60D}', '\u{A60F}'), + ('\u{A673}', '\u{A673}'), + ('\u{A67E}', '\u{A67E}'), + ('\u{A6F2}', '\u{A6F7}'), + ('\u{A700}', '\u{A716}'), + ('\u{A720}', '\u{A721}'), + ('\u{A789}', '\u{A78A}'), + ('\u{A828}', '\u{A82B}'), + ('\u{A836}', '\u{A839}'), + ('\u{A874}', '\u{A877}'), + ('\u{A8CE}', '\u{A8CF}'), + ('\u{A8F8}', '\u{A8FA}'), + ('\u{A8FC}', '\u{A8FC}'), + ('\u{A92E}', '\u{A92F}'), + ('\u{A95F}', '\u{A95F}'), + ('\u{A9C1}', '\u{A9CD}'), + ('\u{A9DE}', '\u{A9DF}'), + ('\u{AA5C}', '\u{AA5F}'), + ('\u{AA77}', '\u{AA79}'), + ('\u{AADE}', '\u{AADF}'), + ('\u{AAF0}', '\u{AAF1}'), + ('\u{AB5B}', '\u{AB5B}'), + ('\u{AB6A}', '\u{AB6B}'), + ('\u{ABEB}', '\u{ABEB}'), + ('\u{FB29}', '\u{FB29}'), + ('\u{FBB2}', '\u{FBC2}'), + ('\u{FD3E}', '\u{FD4F}'), + ('\u{FDCF}', '\u{FDCF}'), + ('\u{FDFC}', '\u{FDFF}'), + ('\u{FE10}', '\u{FE19}'), + ('\u{FE30}', '\u{FE52}'), + ('\u{FE54}', '\u{FE66}'), + ('\u{FE68}', '\u{FE6B}'), + ('\u{FF01}', '\u{FF0F}'), + ('\u{FF1A}', '\u{FF20}'), + ('\u{FF3B}', '\u{FF40}'), + ('\u{FF5B}', '\u{FF65}'), + ('\u{FFE0}', 
'\u{FFE6}'), + ('\u{FFE8}', '\u{FFEE}'), + ('\u{FFFC}', '\u{FFFD}'), + ('\u{10100}', '\u{10102}'), + ('\u{10137}', '\u{1013F}'), + ('\u{10179}', '\u{10189}'), + ('\u{1018C}', '\u{1018E}'), + ('\u{10190}', '\u{1019C}'), + ('\u{101A0}', '\u{101A0}'), + ('\u{101D0}', '\u{101FC}'), + ('\u{1039F}', '\u{1039F}'), + ('\u{103D0}', '\u{103D0}'), + ('\u{1056F}', '\u{1056F}'), + ('\u{10857}', '\u{10857}'), + ('\u{10877}', '\u{10878}'), + ('\u{1091F}', '\u{1091F}'), + ('\u{1093F}', '\u{1093F}'), + ('\u{10A50}', '\u{10A58}'), + ('\u{10A7F}', '\u{10A7F}'), + ('\u{10AC8}', '\u{10AC8}'), + ('\u{10AF0}', '\u{10AF6}'), + ('\u{10B39}', '\u{10B3F}'), + ('\u{10B99}', '\u{10B9C}'), + ('\u{10D6E}', '\u{10D6E}'), + ('\u{10D8E}', '\u{10D8F}'), + ('\u{10EAD}', '\u{10EAD}'), + ('\u{10F55}', '\u{10F59}'), + ('\u{10F86}', '\u{10F89}'), + ('\u{11047}', '\u{1104D}'), + ('\u{110BB}', '\u{110BC}'), + ('\u{110BE}', '\u{110C1}'), + ('\u{11140}', '\u{11143}'), + ('\u{11174}', '\u{11175}'), + ('\u{111C5}', '\u{111C8}'), + ('\u{111CD}', '\u{111CD}'), + ('\u{111DB}', '\u{111DB}'), + ('\u{111DD}', '\u{111DF}'), + ('\u{11238}', '\u{1123D}'), + ('\u{112A9}', '\u{112A9}'), + ('\u{113D4}', '\u{113D5}'), + ('\u{113D7}', '\u{113D8}'), + ('\u{1144B}', '\u{1144F}'), + ('\u{1145A}', '\u{1145B}'), + ('\u{1145D}', '\u{1145D}'), + ('\u{114C6}', '\u{114C6}'), + ('\u{115C1}', '\u{115D7}'), + ('\u{11641}', '\u{11643}'), + ('\u{11660}', '\u{1166C}'), + ('\u{116B9}', '\u{116B9}'), + ('\u{1173C}', '\u{1173F}'), + ('\u{1183B}', '\u{1183B}'), + ('\u{11944}', '\u{11946}'), + ('\u{119E2}', '\u{119E2}'), + ('\u{11A3F}', '\u{11A46}'), + ('\u{11A9A}', '\u{11A9C}'), + ('\u{11A9E}', '\u{11AA2}'), + ('\u{11B00}', '\u{11B09}'), + ('\u{11BE1}', '\u{11BE1}'), + ('\u{11C41}', '\u{11C45}'), + ('\u{11C70}', '\u{11C71}'), + ('\u{11EF7}', '\u{11EF8}'), + ('\u{11F43}', '\u{11F4F}'), + ('\u{11FD5}', '\u{11FF1}'), + ('\u{11FFF}', '\u{11FFF}'), + ('\u{12470}', '\u{12474}'), + ('\u{12FF1}', '\u{12FF2}'), + ('\u{16A6E}', '\u{16A6F}'), + 
('\u{16AF5}', '\u{16AF5}'), + ('\u{16B37}', '\u{16B3F}'), + ('\u{16B44}', '\u{16B45}'), + ('\u{16D6D}', '\u{16D6F}'), + ('\u{16E97}', '\u{16E9A}'), + ('\u{16FE2}', '\u{16FE2}'), + ('\u{1BC9C}', '\u{1BC9C}'), + ('\u{1BC9F}', '\u{1BC9F}'), + ('\u{1CC00}', '\u{1CCEF}'), + ('\u{1CD00}', '\u{1CEB3}'), + ('\u{1CF50}', '\u{1CFC3}'), + ('\u{1D000}', '\u{1D0F5}'), + ('\u{1D100}', '\u{1D126}'), + ('\u{1D129}', '\u{1D164}'), + ('\u{1D16A}', '\u{1D16C}'), + ('\u{1D183}', '\u{1D184}'), + ('\u{1D18C}', '\u{1D1A9}'), + ('\u{1D1AE}', '\u{1D1EA}'), + ('\u{1D200}', '\u{1D241}'), + ('\u{1D245}', '\u{1D245}'), + ('\u{1D300}', '\u{1D356}'), + ('\u{1D6C1}', '\u{1D6C1}'), + ('\u{1D6DB}', '\u{1D6DB}'), + ('\u{1D6FB}', '\u{1D6FB}'), + ('\u{1D715}', '\u{1D715}'), + ('\u{1D735}', '\u{1D735}'), + ('\u{1D74F}', '\u{1D74F}'), + ('\u{1D76F}', '\u{1D76F}'), + ('\u{1D789}', '\u{1D789}'), + ('\u{1D7A9}', '\u{1D7A9}'), + ('\u{1D7C3}', '\u{1D7C3}'), + ('\u{1D800}', '\u{1D9FF}'), + ('\u{1DA37}', '\u{1DA3A}'), + ('\u{1DA6D}', '\u{1DA74}'), + ('\u{1DA76}', '\u{1DA83}'), + ('\u{1DA85}', '\u{1DA8B}'), + ('\u{1E14F}', '\u{1E14F}'), + ('\u{1E2FF}', '\u{1E2FF}'), + ('\u{1E5FF}', '\u{1E5FF}'), + ('\u{1E95E}', '\u{1E95F}'), + ('\u{1ECAC}', '\u{1ECAC}'), + ('\u{1ECB0}', '\u{1ECB0}'), + ('\u{1ED2E}', '\u{1ED2E}'), + ('\u{1EEF0}', '\u{1EEF1}'), + ('\u{1F000}', '\u{1F02B}'), + ('\u{1F030}', '\u{1F093}'), + ('\u{1F0A0}', '\u{1F0AE}'), + ('\u{1F0B1}', '\u{1F0BF}'), + ('\u{1F0C1}', '\u{1F0CF}'), + ('\u{1F0D1}', '\u{1F0F5}'), + ('\u{1F10D}', '\u{1F1AD}'), + ('\u{1F1E6}', '\u{1F202}'), + ('\u{1F210}', '\u{1F23B}'), + ('\u{1F240}', '\u{1F248}'), + ('\u{1F250}', '\u{1F251}'), + ('\u{1F260}', '\u{1F265}'), + ('\u{1F300}', '\u{1F6D7}'), + ('\u{1F6DC}', '\u{1F6EC}'), + ('\u{1F6F0}', '\u{1F6FC}'), + ('\u{1F700}', '\u{1F776}'), + ('\u{1F77B}', '\u{1F7D9}'), + ('\u{1F7E0}', '\u{1F7EB}'), + ('\u{1F7F0}', '\u{1F7F0}'), + ('\u{1F800}', '\u{1F80B}'), + ('\u{1F810}', '\u{1F847}'), + ('\u{1F850}', '\u{1F859}'), + ('\u{1F860}', 
'\u{1F887}'), + ('\u{1F890}', '\u{1F8AD}'), + ('\u{1F8B0}', '\u{1F8BB}'), + ('\u{1F8C0}', '\u{1F8C1}'), + ('\u{1F900}', '\u{1FA53}'), + ('\u{1FA60}', '\u{1FA6D}'), + ('\u{1FA70}', '\u{1FA7C}'), + ('\u{1FA80}', '\u{1FA89}'), + ('\u{1FA8F}', '\u{1FAC6}'), + ('\u{1FACE}', '\u{1FADC}'), + ('\u{1FADF}', '\u{1FAE9}'), + ('\u{1FAF0}', '\u{1FAF8}'), + ('\u{1FB00}', '\u{1FB92}'), + ('\u{1FB94}', '\u{1FBEF}'), +]; + +/// Check if a character is Unicode punctuation per CommonMark. +#[inline] +pub fn is_unicode_punctuation(c: char) -> bool { + bsearch_range_table(c, PUNCTUATION_RANGES) +} + +#[cfg(test)] +mod tests { + use super::PUNCTUATION_RANGES; + use super::is_unicode_punctuation; + + #[test] + fn ascii_punctuation() { + assert!(is_unicode_punctuation('!')); + assert!(is_unicode_punctuation('.')); + assert!(is_unicode_punctuation('(')); + } + + #[test] + fn non_punctuation() { + assert!(!is_unicode_punctuation('a')); + assert!(!is_unicode_punctuation(' ')); + assert!(!is_unicode_punctuation('0')); + } + + #[test] + fn unicode_punctuation() { + assert!(is_unicode_punctuation('\u{2014}')); + assert!(is_unicode_punctuation('\u{00BF}')); + } + + #[test] + fn table_is_sorted() { + for window in PUNCTUATION_RANGES.windows(2) { + assert!(window[0].1 < window[1].0, "Ranges must be sorted"); + } + } +} From b6763e5a4850b2bddac6794031fc4b6335df8d7b Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Sun, 25 Jan 2026 01:29:45 -0500 Subject: [PATCH 04/26] fix(markdown): refine block structure and list handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Overhauls the handling of block-level elements, with a major focus on list "tightness" and HTML block detection. Changes include: - Align HTML block detection conditions with CommonMark specs (tightening start/end conditions). - Fix logic for "tight" vs "loose" lists, ensuring correct paragraph wrapping in HTML output. 
- Correct indentation handling for nested lists and empty list items. - Improve detection of blockquotes when they appear on the first line of a list item. - Fix Setext heading edge cases (examples 086–091). - Prevent hard line breaks from appearing incorrectly at the end of blocks. --- crates/biome_markdown_parser/src/lexer/mod.rs | 24 +- crates/biome_markdown_parser/src/syntax.rs | 245 ++++++++-- .../src/syntax/fenced_code_block.rs | 53 +- .../src/syntax/header.rs | 4 +- .../src/syntax/html_block.rs | 273 +++++++++-- .../src/syntax/inline.rs | 50 +- .../biome_markdown_parser/src/syntax/list.rs | 459 ++++++++++++++++-- .../biome_markdown_parser/src/syntax/quote.rs | 9 +- crates/biome_markdown_parser/src/to_html.rs | 60 ++- .../tests/md_test_suite/ok/edge_cases.md.snap | 59 +-- .../md_test_suite/ok/list_indentation.md.snap | 29 +- .../ok/list_interrupt_empty_bullet.md.snap | 35 +- .../md_test_suite/ok/list_tightness.md.snap | 113 ++--- .../md_test_suite/ok/multiline_list.md.snap | 57 +-- .../ok/setext_heading_edge_cases.md | 15 + .../ok/setext_heading_edge_cases.md.snap | 236 +++++++++ .../ok/setext_heading_negative.md | 13 + .../ok/setext_heading_negative.md.snap | 212 ++++++++ .../biome_markdown_parser/tests/spec_test.rs | 35 +- xtask/coverage/src/reporters.rs | 2 +- 20 files changed, 1631 insertions(+), 352 deletions(-) create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_edge_cases.md.snap create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md create mode 100644 crates/biome_markdown_parser/tests/md_test_suite/ok/setext_heading_negative.md.snap diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs index 8122d861de06..a41e12749383 100644 --- a/crates/biome_markdown_parser/src/lexer/mod.rs +++ 
b/crates/biome_markdown_parser/src/lexer/mod.rs @@ -151,6 +151,7 @@ impl<'src> Lexer<'src> for MarkdownLexer<'src> { // This ensures the *next* token (after NEWLINE) has PRECEDING_LINE_BREAK set. if !kind.is_trivia() && kind != NEWLINE + && kind != MD_HARD_LINE_LITERAL && !(kind == MD_TEXTUAL_LITERAL && self.after_newline && self.current_text_is_whitespace()) @@ -876,21 +877,18 @@ impl<'src> MarkdownLexer<'src> { let start_position = self.position; let mut eq_count = 0; - // Consume all `=` and spaces - loop { - match self.current_byte() { - Some(b'=') => { - self.advance(1); - eq_count += 1; - } - Some(b' ') => { - self.advance(1); - } - _ => break, - } + // Consume only `=` characters — no spaces between (CommonMark §4.3) + while let Some(b'=') = self.current_byte() { + self.advance(1); + eq_count += 1; + } + + // Allow optional trailing whitespace only + while matches!(self.current_byte(), Some(b' ' | b'\t')) { + self.advance(1); } - // Must have at least one `=` and be followed by newline or EOF + // Must have at least one `=` and nothing else before newline or EOF if eq_count >= 1 && matches!(self.current_byte(), Some(b'\n' | b'\r') | None) { return MD_SETEXT_UNDERLINE_LITERAL; } diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs index 96037af6f798..832d93c97246 100644 --- a/crates/biome_markdown_parser/src/syntax.rs +++ b/crates/biome_markdown_parser/src/syntax.rs @@ -590,13 +590,25 @@ pub(crate) fn parse_paragraph(p: &mut MarkdownParser) -> ParsedSyntax { // MD_SETEXT_UNDERLINE_LITERAL is for `=` underlines // MD_THEMATIC_BREAK_LITERAL with only `-` is also a setext underline (H2) let completed = if allow_setext && p.at(MD_SETEXT_UNDERLINE_LITERAL) { - // This is a setext heading (H1 with `=`) - consume the underline - p.bump(MD_SETEXT_UNDERLINE_LITERAL); - m.complete(p, MD_SETEXT_HEADER) + let indent = real_line_indent_from_source(p); + if indent < 4 { + // This is a setext heading (H1 with `=`) - consume 
the underline + p.bump(MD_SETEXT_UNDERLINE_LITERAL); + m.complete(p, MD_SETEXT_HEADER) + } else { + // 4+ spaces of indent: not a setext underline (CommonMark §4.3) + m.complete(p, MD_PARAGRAPH) + } } else if allow_setext && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) { - // This is a setext heading (H2 with `-`) - remap token and consume - p.bump_remap(MD_SETEXT_UNDERLINE_LITERAL); - m.complete(p, MD_SETEXT_HEADER) + let indent = real_line_indent_from_source(p); + if indent < 4 { + // This is a setext heading (H2 with `-`) - remap token and consume + p.bump_remap(MD_SETEXT_UNDERLINE_LITERAL); + m.complete(p, MD_SETEXT_HEADER) + } else { + // 4+ spaces of indent: not a setext underline (CommonMark §4.3) + m.complete(p, MD_PARAGRAPH) + } } else { m.complete(p, MD_PARAGRAPH) }; @@ -618,17 +630,105 @@ fn inline_has_non_whitespace(p: &MarkdownParser, start: usize, end: usize) -> bo .is_empty() } +/// Check if a thematic break text contains only dashes (used for setext H2 detection). +pub(crate) fn is_dash_only_thematic_break_text(text: &str) -> bool { + !text.is_empty() && text.trim().chars().all(|c| c == '-') +} + +/// Token-based check: is the current line a setext underline? +/// +/// Call after consuming a NEWLINE token. Skips 0–3 columns of leading whitespace +/// (tabs expand to the next tab stop per CommonMark §2.2), then checks for +/// `MD_SETEXT_UNDERLINE_LITERAL` or a dash-only `MD_THEMATIC_BREAK_LITERAL`. +/// +/// Returns `Some(bytes_consumed)` if the line is a setext underline, `None` otherwise. +/// The byte count includes only the whitespace tokens consumed during the indent skip, +/// NOT the underline token itself. Callers that track byte budgets must subtract this. +/// +/// This is the single source of truth for setext detection in inline contexts. +/// Used by `has_matching_code_span_closer`, `parse_inline_html`, and `parse_inline_item_list`. 
+/// +/// Context safety: this function does NOT call `allow_setext_heading` because the token +/// stream itself encodes context. In blockquotes, `R_ANGLE` tokens appear after NEWLINE +/// before content, so the whitespace-only skip naturally rejects those lines. In list +/// items, the indent reflected in the token stream is the raw line indent, and the +/// `columns < 4` check correctly rejects lines with 4+ columns of leading whitespace. +pub(crate) fn at_setext_underline_after_newline(p: &mut MarkdownParser) -> Option { + let mut columns = 0; + let mut bytes_consumed = 0; + while columns < INDENT_CODE_BLOCK_SPACES + && p.at(MD_TEXTUAL_LITERAL) + && p.cur_text().chars().all(|c| c == ' ' || c == '\t') + { + for c in p.cur_text().chars() { + match c { + ' ' => columns += 1, + '\t' => columns += 4 - (columns % 4), + _ => {} + } + } + bytes_consumed += p.cur_text().len(); + p.bump(MD_TEXTUAL_LITERAL); + } + if columns >= INDENT_CODE_BLOCK_SPACES { + return None; + } + let is_setext = p.at(MD_SETEXT_UNDERLINE_LITERAL) + || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break_text(p.cur_text())); + if is_setext { + Some(bytes_consumed) + } else { + None + } +} + +/// Token-based check: does an inline span of `byte_len` bytes cross a setext underline? +/// +/// Walks tokens via lookahead. At each NEWLINE, delegates to +/// [`at_setext_underline_after_newline`] — the same detection used by +/// `has_matching_code_span_closer` and `parse_inline_item_list`. 
+pub(crate) fn inline_span_crosses_setext(p: &mut MarkdownParser, byte_len: usize) -> bool { + p.lookahead(|p| { + let mut remaining = byte_len; + loop { + if remaining == 0 || p.at(T![EOF]) { + return false; + } + if p.at(NEWLINE) { + let nl_len = p.cur_text().len(); + if nl_len > remaining { + return false; + } + remaining -= nl_len; + p.bump(NEWLINE); + if let Some(ws_bytes) = at_setext_underline_after_newline(p) { + // Only flag if the whitespace consumed is still within our span + return ws_bytes <= remaining; + } + continue; + } + let tok_len = p.cur_text().len(); + if tok_len > remaining { + return false; + } + remaining -= tok_len; + p.bump_any(); + } + }) +} + /// Check if the current thematic break token contains only dashes. /// This is used to detect H2 setext underlines. fn is_dash_only_thematic_break(p: &MarkdownParser) -> bool { - let text = p.cur_text(); - !text.is_empty() && text.trim().chars().all(|c| c == '-') + is_dash_only_thematic_break_text(p.cur_text()) } fn allow_setext_heading(p: &MarkdownParser) -> bool { let required_indent = p.state().list_item_required_indent; if required_indent > 0 { - let indent = p.line_start_leading_indent(); + // Compute real indent from source text, since leading whitespace + // may have been consumed as trivia in list item context. + let indent = real_line_indent_from_source(p); if indent < required_indent { return false; } @@ -649,6 +749,31 @@ fn allow_setext_heading(p: &MarkdownParser) -> bool { line_has_quote_prefix(p, depth) } +/// Compute the real leading indent of the current line from source text. +/// This is needed because leading whitespace may have been consumed as trivia +/// in list item context, making `line_start_leading_indent()` return 0. 
+fn real_line_indent_from_source(p: &MarkdownParser) -> usize { + let source = p.source().source_text(); + let pos: usize = p.cur_range().start().into(); + + // Find the start of the current line + let line_start = source[..pos] + .rfind('\n') + .map(|i| i + 1) + .unwrap_or(0); + + // Count leading whitespace columns on this line + let mut column = 0; + for c in source[line_start..].chars() { + match c { + ' ' => column += 1, + '\t' => column += 4 - (column % 4), + _ => break, + } + } + column +} + fn line_has_quote_prefix(p: &MarkdownParser, depth: usize) -> bool { if depth == 0 { return false; @@ -804,13 +929,24 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) { consume_partial_quote_prefix(p, quote_depth); } - // After crossing a line, check for block-level constructs and setext underlines - // Check if we're at a setext heading underline + // After crossing a line, check for setext underlines. + // For non-list paragraphs, we need to look past up to 3 spaces of indent + // to detect setext underlines (CommonMark §4.3). + if has_content && p.state().list_item_required_indent == 0 { + let is_setext = p.lookahead(|p| { + at_setext_underline_after_newline(p).is_some() + }); + if is_setext { + // Skip the indent so parse_paragraph sees the underline + p.skip_line_indent(INDENT_CODE_BLOCK_SPACES); + break; + } + } + + // Check if we're at a setext heading underline (already past indent) if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) { break; } - - // Check if we're at a thematic break that could be a setext underline if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) { break; } @@ -820,6 +956,23 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) { // nested list markers like "\t - baz" to break out of the paragraph. let required_indent = p.state().list_item_required_indent; if required_indent > 0 { + // Check for setext underline after indent stripping. 
+ // The `---` or `===` may be indented by the list item's required indent, + // so we need to look past that indent. + let real_indent = real_line_indent_from_source(p); + if real_indent >= required_indent { + let is_setext = p.lookahead(|p| { + p.skip_line_indent(required_indent); + p.at(MD_SETEXT_UNDERLINE_LITERAL) + || (p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p)) + }); + if is_setext && has_content { + // Skip the indent so parse_paragraph sees the underline + p.skip_line_indent(required_indent); + break; + } + } + let indent = p.line_start_leading_indent(); if indent >= required_indent { let interrupts = p.lookahead(|p| { @@ -886,13 +1039,22 @@ pub(crate) fn parse_inline_item_list(p: &mut MarkdownParser) { } // Check if we're at a setext heading underline (stop for paragraph to handle) - if has_content && p.at(MD_SETEXT_UNDERLINE_LITERAL) && allow_setext_heading(p) { + // Per CommonMark §4.3, setext underlines can be indented 0-3 spaces only. + if has_content + && p.at(MD_SETEXT_UNDERLINE_LITERAL) + && real_line_indent_from_source(p) < INDENT_CODE_BLOCK_SPACES + && allow_setext_heading(p) + { break; } // Check if we're at a thematic break that could be a setext underline // (dash-only thematic breaks following paragraph content are setext H2) - if has_content && p.at(MD_THEMATIC_BREAK_LITERAL) && is_dash_only_thematic_break(p) { + if has_content + && p.at(MD_THEMATIC_BREAK_LITERAL) + && real_line_indent_from_source(p) < INDENT_CODE_BLOCK_SPACES + && is_dash_only_thematic_break(p) + { break; } @@ -949,9 +1111,10 @@ fn set_inline_emphasis_context( }; let base_offset = u32::from(p.cur_range().start()) as usize; // Create a reference checker closure that uses the parser's link reference definitions - let context = crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| { - p.has_link_reference_definition(label) - }); + let context = + crate::syntax::inline::EmphasisContext::new(inline_source, base_offset, |label| { + 
p.has_link_reference_definition(label) + }); p.set_emphasis_context(Some(context)) } @@ -1075,10 +1238,13 @@ fn line_starts_with_fence(p: &mut MarkdownParser) -> bool { } p.skip_line_indent(3); let rest = p.source_after_current(); - if rest.starts_with("```") { + let Some((fence_char, _len)) = fenced_code_block::detect_fence(rest) else { + return false; + }; + if fence_char == '`' { return !info_string_has_backtick(p); } - rest.starts_with("~~~") + true }) } @@ -1173,9 +1339,8 @@ pub(crate) fn at_block_interrupt(p: &mut MarkdownParser) -> bool { } // Bullet list item (-, *, +) - // Per CommonMark §5.2: bullet lists can interrupt paragraphs if: - // - The item has content, OR - // - The item is empty but followed by a blank line + // Per CommonMark §5.2: bullet lists can interrupt paragraphs only if the + // item has content (non-empty). Empty markers cannot interrupt paragraphs. // When inside a list, we also need to check for list items at ANY indent // (not just at the current context's indent) because a less-indented list // marker would end the current list item and start a sibling/parent item. @@ -1382,17 +1547,12 @@ fn at_order_list_item_textual(p: &mut MarkdownParser) -> bool { /// Check if a bullet list item can interrupt a top-level paragraph. /// -/// Per CommonMark §5.2: A bullet list can interrupt a paragraph if: -/// - The list item has content (at least one character after marker), OR -/// - The list item is empty but is followed by a blank line -/// -/// This allows patterns like: -/// ```markdown -/// Paragraph text -/// + +/// Per CommonMark §5.2: "A bullet list can interrupt a paragraph only if +/// it starts with a non-empty item (that is, a list item that contains +/// some non-blank character)." /// -/// Next paragraph (interrupted by empty bullet + blank line) -/// ``` +/// This means empty markers (marker followed by only whitespace/newline) +/// cannot interrupt paragraphs, regardless of what follows. 
fn can_bullet_interrupt_paragraph(p: &mut MarkdownParser) -> bool { let checkpoint = p.checkpoint(); @@ -1414,19 +1574,22 @@ fn can_bullet_interrupt_paragraph(p: &mut MarkdownParser) -> bool { } // Check what follows the marker + // Per CommonMark §5.2: "A bullet list can interrupt a paragraph only if + // it starts with a non-empty item (that is, a list item that contains + // some non-blank character)." let result = if p.at(T![EOF]) { - // Empty item at EOF - cannot interrupt (no blank line follows) + // Empty item at EOF - cannot interrupt false } else if p.at(NEWLINE) { - // Empty item - check if followed by blank line - p.at_blank_line() + // Empty item (marker + newline) - cannot interrupt paragraphs + false } else if p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { - p.bump(MD_TEXTUAL_LITERAL); - if p.at(NEWLINE) { - p.at_blank_line() - } else { - false + // Skip all whitespace tokens after marker + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); } + // If only whitespace followed by newline/EOF, item is empty and cannot interrupt + !(p.at(NEWLINE) || p.at(T![EOF])) } else { // Has content after marker - can interrupt true diff --git a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs index ccfd1430b225..37bba8046783 100644 --- a/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs +++ b/crates/biome_markdown_parser/src/syntax/fenced_code_block.rs @@ -33,7 +33,6 @@ use biome_parser::{ }; use super::parse_error::unterminated_fenced_code; -use super::quote::{consume_quote_prefix, has_quote_prefix}; /// Minimum number of fence characters required per CommonMark §4.5. const MIN_FENCE_LENGTH: usize = 3; @@ -135,7 +134,7 @@ fn find_line_start(before: &str) -> usize { /// Returns `Some((fence_char, length))` if a valid fence is found, /// where `length` is the actual number of fence characters (3 or more). 
/// Returns `None` if no valid fence is present. -fn detect_fence(s: &str) -> Option<(char, usize)> { +pub(crate) fn detect_fence(s: &str) -> Option<(char, usize)> { let first_char = s.chars().next()?; if first_char != '`' && first_char != '~' { @@ -272,21 +271,60 @@ fn parse_code_content( ) { let m = p.start(); let quote_depth = p.state().block_quote_depth; + let mut at_line_start = false; // Consume all tokens until we see the matching closing fence or EOF while !p.at(T![EOF]) { - if quote_depth > 0 && (p.at_line_start() || p.has_preceding_line_break()) { - if !has_quote_prefix(p, quote_depth) { + if at_line_start && quote_depth > 0 { + let prev_virtual = p.state().virtual_line_start; + p.state_mut().virtual_line_start = Some(p.cur_range().start()); + p.skip_line_indent(3); + p.state_mut().virtual_line_start = prev_virtual; + + let mut ok = true; + for _ in 0..quote_depth { + if p.at(MD_TEXTUAL_LITERAL) && p.cur_text().starts_with('>') { + p.force_relex_regular(); + } + + if p.at(T![>]) { + p.parse_as_skipped_trivia_tokens(|p| p.bump(T![>])); + } else if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">" { + p.parse_as_skipped_trivia_tokens(|p| p.bump_remap(T![>])); + } else { + ok = false; + break; + } + + if p.at(MD_TEXTUAL_LITERAL) { + let text = p.cur_text(); + if text == " " || text == "\t" { + p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL)); + } + } + } + + if !ok { break; } - consume_quote_prefix(p, quote_depth); + at_line_start = false; + } + + if p.at(NEWLINE) { + // Preserve newlines as code content and reset virtual line start. 
+ let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + p.set_virtual_line_start(); + at_line_start = true; + continue; } if at_closing_fence(p, is_tilde_fence, fence_len) { break; } - if p.at_line_start() && fence_indent > 0 { + if at_line_start && fence_indent > 0 { skip_fenced_content_indent(p, fence_indent); if at_closing_fence(p, is_tilde_fence, fence_len) { break; @@ -297,6 +335,7 @@ fn parse_code_content( let text_m = p.start(); p.bump_remap(MD_TEXTUAL_LITERAL); text_m.complete(p, MD_TEXTUAL); + at_line_start = false; } m.complete(p, MD_INLINE_ITEM_LIST); @@ -317,7 +356,7 @@ pub(crate) fn info_string_has_backtick(p: &mut MarkdownParser) -> bool { } while !p.at_inline_end() { - if p.at(BACKTICK) { + if p.at(BACKTICK) || p.at(T!["```"]) { return true; } p.bump(p.cur()); diff --git a/crates/biome_markdown_parser/src/syntax/header.rs b/crates/biome_markdown_parser/src/syntax/header.rs index 58127cfb57c3..555f1da00671 100644 --- a/crates/biome_markdown_parser/src/syntax/header.rs +++ b/crates/biome_markdown_parser/src/syntax/header.rs @@ -138,7 +138,7 @@ fn parse_hash_list(p: &mut MarkdownParser) -> usize { /// /// This stops at end of line (NEWLINE or EOF) or when trailing hashes are detected. /// Note: NEWLINE is an explicit token (not trivia), so we check `at_inline_end()`. -fn parse_header_content(p: &mut MarkdownParser) { +pub(crate) fn parse_header_content(p: &mut MarkdownParser) { // Check if there's any content (not at EOF or NEWLINE) if p.at_inline_end() { return; @@ -240,7 +240,7 @@ fn at_trailing_hashes_start(p: &mut MarkdownParser) -> bool { /// /// The lexer emits all consecutive `#` characters as a single HASH token. /// We wrap it in an MdHash node to match the grammar. 
-fn parse_trailing_hashes(p: &mut MarkdownParser) { +pub(crate) fn parse_trailing_hashes(p: &mut MarkdownParser) { let m = p.start(); if at_trailing_hashes_start(p) { diff --git a/crates/biome_markdown_parser/src/syntax/html_block.rs b/crates/biome_markdown_parser/src/syntax/html_block.rs index 7865eb63651b..8d2f51c43608 100644 --- a/crates/biome_markdown_parser/src/syntax/html_block.rs +++ b/crates/biome_markdown_parser/src/syntax/html_block.rs @@ -34,34 +34,164 @@ pub(crate) fn at_html_block(p: &mut MarkdownParser) -> bool { /// Check if content after `<` looks like HTML (tag, comment, declaration, etc.). fn is_html_like_content(p: &MarkdownParser) -> bool { + html_block_kind(p).is_some() +} + +#[derive(Clone, Copy)] +enum HtmlBlockKind { + Type1(Type1Tag), + Type2, + Type3, + Type4, + Type5, + Type6, + Type7, +} + +#[derive(Clone, Copy)] +enum Type1Tag { + Script, + Pre, + Style, + Textarea, +} + +fn html_block_kind(p: &MarkdownParser) -> Option { let remaining = p.source_after_current(); if !remaining.starts_with('<') { - return false; + return None; } let after_angle = &remaining[1..]; - // Comment, CDATA, declaration, or processing instruction - if after_angle.starts_with("!--") - || after_angle.starts_with("![CDATA[") - || after_angle.starts_with('?') - { - return true; + // Comment + if after_angle.starts_with("!--") { + return Some(HtmlBlockKind::Type2); + } + + // Processing instruction + if after_angle.starts_with('?') { + return Some(HtmlBlockKind::Type3); + } + + // CDATA + if after_angle.starts_with("![CDATA[") { + return Some(HtmlBlockKind::Type5); } // Declaration: Option<&str> { let tag_start = after_angle.strip_prefix('/').unwrap_or(after_angle); - tag_start - .chars() - .next() - .is_some_and(|c| c.is_ascii_alphabetic()) + let bytes = tag_start.as_bytes(); + let first = *bytes.first()?; + if !first.is_ascii_alphabetic() { + return None; + } + + let tag_end = bytes + .iter() + .position(|b| !b.is_ascii_alphanumeric() && *b != b'-') + 
.unwrap_or(tag_start.len()); + let tag_name = &tag_start[..tag_end]; + + let boundary = tag_start.as_bytes().get(tag_end).copied(); + if matches!( + boundary, + None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' | b'>' | b'/') + ) { + Some(tag_name) + } else { + None + } +} + +fn type1_tag(tag_name: &str) -> Option { + if tag_name.eq_ignore_ascii_case("script") { + Some(Type1Tag::Script) + } else if tag_name.eq_ignore_ascii_case("pre") { + Some(Type1Tag::Pre) + } else if tag_name.eq_ignore_ascii_case("style") { + Some(Type1Tag::Style) + } else if tag_name.eq_ignore_ascii_case("textarea") { + Some(Type1Tag::Textarea) + } else { + None + } +} + +fn first_line(text: &str) -> &str { + text.split_once(['\n', '\r']).map_or(text, |(line, _)| line) +} + +fn line_has_only_tag(line: &str) -> bool { + let bytes = line.as_bytes(); + if !bytes.starts_with(b"<") { + return false; + } + + let Some(end) = tag_end_index(bytes) else { + return false; + }; + + line[end + 1..].chars().all(|c| c == ' ' || c == '\t') +} + +fn tag_end_index(bytes: &[u8]) -> Option { + let mut i = 1; + let mut in_single = false; + let mut in_double = false; + + while i < bytes.len() { + let b = bytes[i]; + if in_single { + if b == b'\'' { + in_single = false; + } + i += 1; + continue; + } + if in_double { + if b == b'"' { + in_double = false; + } + i += 1; + continue; + } + + match b { + b'\'' => in_single = true, + b'"' => in_double = true, + b'>' => return Some(i), + _ => {} + } + i += 1; + } + + None } /// Block-level tags that can interrupt paragraphs. @@ -115,17 +245,12 @@ const BLOCK_TAGS: &[&str] = &[ "option", "p", "param", - "pre", - "script", + "search", "section", - "source", - "style", "summary", "table", "tbody", "td", - "template", - "textarea", "tfoot", "th", "thead", @@ -138,34 +263,19 @@ const BLOCK_TAGS: &[&str] = &[ /// Only block-level HTML and special constructs interrupt paragraphs. 
pub(crate) fn at_html_block_interrupt(p: &mut MarkdownParser) -> bool { p.lookahead(|p| { - if !at_html_block(p) { - return false; - } - - let remaining = p.source_after_current(); - if remaining.len() < 2 { + let Some(kind) = html_block_kind(p) else { return false; - } - - let after_angle = &remaining[1..]; - - // Special constructs always interrupt - if after_angle.starts_with("!--") - || after_angle.starts_with("![CDATA[") - || after_angle.starts_with('!') - || after_angle.starts_with('?') - { - return true; - } - - // Check for block-level tag - let tag_start = after_angle.strip_prefix('/').unwrap_or(after_angle); - let tag_name: String = tag_start - .chars() - .take_while(|c| c.is_ascii_alphanumeric()) - .collect(); - - BLOCK_TAGS.iter().any(|t| t.eq_ignore_ascii_case(&tag_name)) + }; + + matches!( + kind, + HtmlBlockKind::Type1 { .. } + | HtmlBlockKind::Type2 + | HtmlBlockKind::Type3 + | HtmlBlockKind::Type4 + | HtmlBlockKind::Type5 + | HtmlBlockKind::Type6 + ) }) } @@ -177,10 +287,29 @@ pub(crate) fn parse_html_block(p: &mut MarkdownParser) -> ParsedSyntax { return Absent; } + let Some(kind) = html_block_kind(p) else { + return Absent; + }; + let m = p.start(); let content_m = p.start(); - parse_until_blank_line(p); + match kind { + HtmlBlockKind::Type1(tag) => { + let terminator = match tag { + Type1Tag::Script => "", + Type1Tag::Pre => "", + Type1Tag::Style => "", + Type1Tag::Textarea => "", + }; + parse_until_terminator(p, terminator, true); + } + HtmlBlockKind::Type2 => parse_until_terminator(p, "-->", false), + HtmlBlockKind::Type3 => parse_until_terminator(p, "?>", false), + HtmlBlockKind::Type4 => parse_until_terminator(p, ">", false), + HtmlBlockKind::Type5 => parse_until_terminator(p, "]]>", false), + HtmlBlockKind::Type6 | HtmlBlockKind::Type7 => parse_until_blank_line(p), + } content_m.complete(p, MD_INLINE_ITEM_LIST); Present(m.complete(p, MD_HTML_BLOCK)) @@ -210,6 +339,56 @@ fn parse_until_blank_line(p: &mut MarkdownParser) { } } +fn 
parse_until_terminator(p: &mut MarkdownParser, terminator: &str, case_insensitive: bool) { + let mut line = String::new(); + + while !p.at(EOF) { + if at_container_boundary(p) { + break; + } + + let text = p.cur_text(); + let is_newline = p.at(NEWLINE); + line.push_str(text); + + let text_m = p.start(); + p.bump_remap(MD_TEXTUAL_LITERAL); + text_m.complete(p, MD_TEXTUAL); + + if is_newline { + if line_contains(&line, terminator, case_insensitive) { + break; + } + line.clear(); + skip_container_prefixes(p); + } + } +} + +fn line_contains(line: &str, needle: &str, case_insensitive: bool) -> bool { + if !case_insensitive { + return line.contains(needle); + } + + let hay = line.as_bytes(); + let needle = needle.as_bytes(); + if needle.is_empty() || hay.len() < needle.len() { + return false; + } + + for i in 0..=hay.len() - needle.len() { + if hay[i..i + needle.len()] + .iter() + .zip(needle.iter()) + .all(|(a, b)| a.to_ascii_lowercase() == b.to_ascii_lowercase()) + { + return true; + } + } + + false +} + fn skip_container_prefixes(p: &mut MarkdownParser) { let quote_depth = p.state().block_quote_depth; if quote_depth > 0 && has_quote_prefix(p, quote_depth) { diff --git a/crates/biome_markdown_parser/src/syntax/inline.rs b/crates/biome_markdown_parser/src/syntax/inline.rs index 87d649e36039..f00c96753a0e 100644 --- a/crates/biome_markdown_parser/src/syntax/inline.rs +++ b/crates/biome_markdown_parser/src/syntax/inline.rs @@ -647,8 +647,12 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) - use crate::lexer::MarkdownLexContext; p.lookahead(|p| { - // Skip the opening backticks - p.bump(BACKTICK); + // Skip the opening backticks (handle both BACKTICK and TRIPLE_BACKTICK) + if p.at(T!["```"]) { + p.bump(T!["```"]); + } else { + p.bump(BACKTICK); + } loop { // EOF = no matching closer found @@ -672,14 +676,18 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) - continue; } - // Found backticks - check if they 
match - if p.at(BACKTICK) { + // Found backticks - check if they match (handle both BACKTICK and TRIPLE_BACKTICK) + if p.at(BACKTICK) || p.at(T!["```"]) { let closing_count = p.cur_text().len(); if closing_count == opening_count { return true; } // Not matching - continue searching - p.bump(BACKTICK); + if p.at(T!["```"]) { + p.bump(T!["```"]); + } else { + p.bump(BACKTICK); + } continue; } @@ -701,7 +709,12 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) - pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax { use crate::lexer::MarkdownLexContext; - if !p.at(BACKTICK) { + // Handle both BACKTICK and TRIPLE_BACKTICK (T!["```"]) as code span openers. + // TRIPLE_BACKTICK can appear when backticks are at line start but info string + // contains backticks, making it not a fenced code block (CommonMark examples 138, 145). + let is_backtick = p.at(BACKTICK); + let is_triple_backtick = p.at(T!["```"]); + if !is_backtick && !is_triple_backtick { return Absent; } @@ -717,8 +730,12 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax { // We have a valid code span - now parse it let m = p.start(); - // Opening backtick(s) - p.bump(BACKTICK); + // Opening backtick(s) - remap TRIPLE_BACKTICK to BACKTICK for consistency + if is_triple_backtick { + p.bump_remap(BACKTICK); + } else { + p.bump(BACKTICK); + } // Content - parse until we find matching closing backticks // Per CommonMark, code spans can span multiple lines (newlines become spaces in output) @@ -744,8 +761,8 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> ParsedSyntax { continue; } - // Found matching closing backticks - if p.at(BACKTICK) && p.cur_text().len() == opening_count { + // Found matching closing backticks (handle both BACKTICK and TRIPLE_BACKTICK) + if (p.at(BACKTICK) || p.at(T!["```"])) && p.cur_text().len() == opening_count { break; } @@ -757,7 +774,12 @@ pub(crate) fn parse_inline_code(p: &mut MarkdownParser) -> 
ParsedSyntax { content.complete(p, MD_INLINE_ITEM_LIST); // Closing backticks (guaranteed to exist due to lookahead check) - p.bump(BACKTICK); + // Remap TRIPLE_BACKTICK to BACKTICK for consistency + if p.at(T!["```"]) { + p.bump_remap(BACKTICK); + } else { + p.bump(BACKTICK); + } Present(m.complete(p, MD_INLINE_CODE)) } @@ -2527,8 +2549,10 @@ pub(crate) fn parse_autolink(p: &mut MarkdownParser) -> ParsedSyntax { pub(crate) fn parse_any_inline(p: &mut MarkdownParser) -> ParsedSyntax { if p.at(MD_HARD_LINE_LITERAL) { parse_hard_line(p) - } else if p.at(BACKTICK) { - // Try code span, fall back to literal text if no matching closer exists + } else if p.at(BACKTICK) || p.at(T!["```"]) { + // Try code span, fall back to literal text if no matching closer exists. + // T!["```"] can appear when backticks are at line start but info string + // contains backticks, making it not a fenced code block (CommonMark examples 138, 145). let result = parse_inline_code(p); if result.is_absent() { super::parse_textual(p) diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs index 896bb92a02f5..43a1af47b93f 100644 --- a/crates/biome_markdown_parser/src/syntax/list.rs +++ b/crates/biome_markdown_parser/src/syntax/list.rs @@ -40,7 +40,10 @@ use biome_parser::prelude::ParsedSyntax::{self, *}; use biome_parser::prelude::{CompletedMarker, Marker, ParseDiagnostic, TokenSet}; use biome_parser::{Parser, token_set}; -use super::quote::{consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix}; +use super::quote::{ + consume_quote_prefix, consume_quote_prefix_without_virtual, has_quote_prefix, + parse_quote_block_list, +}; use biome_rowan::TextRange; use super::fenced_code_block::parse_fenced_code_block; @@ -63,6 +66,41 @@ const BLOCK_RECOVERY_SET: TokenSet = token_set![ /// CommonMark requires 4 or more spaces for indented code blocks. 
const INDENT_CODE_BLOCK_SPACES: usize = 4; +/// Compute the marker indent for list parsing. +/// +/// For normal cases, this returns the leading whitespace count from +/// `line_start_leading_indent()`. For virtual line start cases (nested list +/// detection), we compute the actual column position from the source text +/// to ensure correct indented code block detection in nested lists. +fn compute_marker_indent(p: &MarkdownParser) -> usize { + if p.state().virtual_line_start == Some(p.cur_range().start()) { + // Virtual line start: compute actual column from source text. + // The leading whitespace was skipped as trivia, but we need the + // real column for indented code block detection. + let source = p.source().source_text(); + let pos: usize = p.cur_range().start().into(); + + // Find the start of the current line + let line_start = source[..pos] + .rfind('\n') + .map(|i| i + 1) + .unwrap_or(0); + + // Count columns from line start to current position + let mut column = 0; + for c in source[line_start..pos].chars() { + match c { + '\t' => column += 4 - (column % 4), + _ => column += 1, + } + } + column + } else { + // Normal case: use the standard leading indent count + p.source().line_start_leading_indent() + } +} + /// Check if we're at the start of a bullet list item (`-`, `*`, or `+`). /// /// A bullet list marker at line start followed by content is a list item. @@ -183,9 +221,15 @@ fn skip_blank_lines_between_items( is_tight: &mut bool, last_item_ends_with_blank: &mut bool, ) { + // Skip blank lines between list items. // Per CommonMark §5.3, blank lines between items make the list loose // but don't end the list. + // + // Any NEWLINE we see at this position (after the item-terminating newline) + // represents a blank line between items. We don't use at_blank_line() here + // because it checks if what comes AFTER the newline is blank, but we're + // already past one newline - any additional newlines ARE blank lines. 
while p.at(NEWLINE) { // Only skip if there's another list item after the blank lines if !has_item_after_blank_lines(p) { @@ -204,6 +248,7 @@ fn update_list_tightness( is_tight: &mut bool, last_item_ends_with_blank: &mut bool, ) { + // Blank line between items makes the list loose if *last_item_ends_with_blank { *is_tight = false; @@ -230,6 +275,9 @@ where FMarker: Fn(&mut MarkdownParser) -> Option, FParse: Fn(&mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo), { + let prev_is_tight = *is_tight; + let prev_last_item_ends_with_blank = *last_item_ends_with_blank; + skip_blank_lines_between_items( p, has_item_after_blank_lines, @@ -242,7 +290,16 @@ where } let (parsed, blank_info) = parse_item(p); - update_list_tightness(blank_info, is_tight, last_item_ends_with_blank); + + if parsed.is_absent() { + // The blank lines we skipped didn't lead to a valid item in this list. + // Restore tightness — the blank lines belong to a parent context. + *is_tight = prev_is_tight; + *last_item_ends_with_blank = prev_last_item_ends_with_blank; + } else { + update_list_tightness(blank_info, is_tight, last_item_ends_with_blank); + } + parsed } @@ -308,14 +365,17 @@ struct BulletList { last_item_ends_with_blank: bool, /// The marker kind for this list (`-`, `*`, or `+`). marker_kind: Option, + /// The indentation level of the list marker (0 for top-level). + marker_indent: usize, } impl BulletList { - fn new() -> Self { + fn new(marker_indent: usize) -> Self { Self { is_tight: true, last_item_ends_with_blank: false, marker_kind: None, + marker_indent, } } } @@ -339,29 +399,67 @@ impl ParseNodeList for BulletList { } fn is_at_list_end(&self, p: &mut Self::Parser<'_>) -> bool { - is_at_list_end_common( + let marker_indent = self.marker_indent; + + // Check blank line at line start with indent awareness BEFORE + // delegating to is_at_list_end_common (which uses non-indent-aware check). 
+ if p.at_line_start() && at_blank_line_start(p) { + let result = !has_bullet_item_after_blank_lines_at_indent(p, marker_indent); + + return result; + } + + let result = is_at_list_end_common( p, self.marker_kind, at_bullet_list_item, current_bullet_marker, has_bullet_item_after_blank_lines, |p, _marker_kind| { - let next_is_bullet = p.lookahead(|p| { + let next_is_bullet_at_indent = p.lookahead(|p| { p.bump(NEWLINE); - skip_leading_whitespace_tokens(p); + // Count indent before marker (tabs expand to next tab stop) + let mut indent = 0usize; + while p.at(MD_TEXTUAL_LITERAL) { + let text = p.cur_text(); + if text == " " { + indent += 1; + p.bump(MD_TEXTUAL_LITERAL); + } else if text == "\t" { + indent += 4 - (indent % 4); + p.bump(MD_TEXTUAL_LITERAL); + } else { + break; + } + } + // Check indent matches this list's marker indent + let indent_ok = if marker_indent == 0 { + indent <= 3 + } else { + indent >= marker_indent && indent <= marker_indent + 3 + }; + if !indent_ok { + return false; + } if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) { p.bump(p.cur()); return marker_followed_by_whitespace_or_eol(p); } false }); - if next_is_bullet { + if next_is_bullet_at_indent { Some(false) } else { - Some(!has_bullet_item_after_blank_lines(p)) + // Check if bullet after blank lines is at correct indent + let has_item = p.lookahead(|p| { + has_bullet_item_after_blank_lines_at_indent(p, marker_indent) + }); + Some(!has_item) } }, - ) + ); + + result } fn recover( @@ -380,6 +478,7 @@ impl ParseNodeList for BulletList { fn finish_list(&mut self, p: &mut Self::Parser<'_>, m: Marker) -> CompletedMarker { let completed = m.complete(p, Self::LIST_KIND); let range = completed.range(p); + p.record_list_tightness(range, self.is_tight); completed } @@ -479,8 +578,11 @@ pub(crate) fn parse_bullet_list_item(p: &mut MarkdownParser) -> ParsedSyntax { // Increment list depth p.state_mut().list_nesting_depth += 1; + // Compute the marker indent (leading whitespace before the first marker) + 
let marker_indent = compute_marker_indent(p); + // Use ParseNodeList to parse the list with proper recovery - let mut list_helper = BulletList::new(); + let mut list_helper = BulletList::new(marker_indent); list_helper.parse_list(p); // Decrement list depth @@ -501,11 +603,10 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) { let m = p.start(); - let marker_indent = if p.state().virtual_line_start == Some(p.cur_range().start()) { - 0 - } else { - p.source().line_start_leading_indent() - }; + // Compute the marker indent, handling both normal and virtual line start cases. + // For virtual line start (nested list detection), we compute the actual column + // to ensure correct indented code block detection. + let marker_indent = compute_marker_indent(p); skip_list_marker_indent(p); // Bullet marker is 1 character (-, *, or +) @@ -536,18 +637,41 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) { // Count spaces after marker to determine required indentation. // Per CommonMark §5.2, content aligns to first non-space after marker. - let spaces_after_marker = if let Some(text) = marker_token_text.as_deref() { - count_spaces_after_dash_in_token(text, marker_indent + marker_width) + // + // For the setext-remapped case (marker_token_text is Some), the token includes + // trailing spaces before the newline. This means the first line is empty + // (marker + whitespace + newline), and the trailing spaces shouldn't count + // for indentation purposes. Per CommonMark, the required indent is marker_width + 1. + let (spaces_after_marker, first_line_empty) = if let Some(text) = marker_token_text.as_deref() { + // Setext token case: token is "- " or "- " etc. 
followed by newline + // The first line is empty, so use minimum indent (marker_width + 1) + let spaces = count_spaces_after_dash_in_token(text, marker_indent + marker_width); + (spaces, true) } else { - count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width) + let spaces = + count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width); + // Check if first line is empty by looking at what follows + let first_empty = p.lookahead(|p| { + // Skip any whitespace + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); + } + // If we hit newline or EOF, first line is empty + p.at(NEWLINE) || p.at(T![EOF]) + }); + (spaces, first_empty) }; // Set required indent for continuation lines // Required indent = marker width + spaces after marker (minimum 1) + // BUT: if first line is empty (marker + whitespace + newline), use minimum indent let prev_required_indent = p.state().list_item_required_indent; let prev_marker_indent = p.state().list_item_marker_indent; p.state_mut().list_item_required_indent = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { marker_indent + marker_width + 1 + } else if first_line_empty { + // Empty first line: use minimum indent (marker + 1 space) + marker_indent + marker_width + 1 } else { marker_indent + marker_width + spaces_after_marker.max(1) }; @@ -767,11 +891,10 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI let m = p.start(); - let marker_indent = if p.state().virtual_line_start == Some(p.cur_range().start()) { - 0 - } else { - p.source().line_start_leading_indent() - }; + // Compute the marker indent, handling both normal and virtual line start cases. + // For virtual line start (nested list detection), we compute the actual column + // to ensure correct indented code block detection. + let marker_indent = compute_marker_indent(p); skip_list_marker_indent(p); // Get marker width from actual token text (e.g., "1." 
= 2, "10." = 3) @@ -785,12 +908,24 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI let spaces_after_marker = count_spaces_after_marker(p.source_after_current(), marker_indent + marker_width); + // Check if first line is empty (marker followed by only whitespace + newline) + let first_line_empty = p.lookahead(|p| { + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); + } + p.at(NEWLINE) || p.at(T![EOF]) + }); + // Set required indent for continuation lines // Required indent = marker width + spaces after marker (minimum 1) + // BUT: if first line is empty (marker + whitespace + newline), use minimum indent let prev_required_indent = p.state().list_item_required_indent; let prev_marker_indent = p.state().list_item_marker_indent; p.state_mut().list_item_required_indent = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { marker_indent + marker_width + 1 + } else if first_line_empty { + // Empty first line: use minimum indent (marker + 1 space) + marker_indent + marker_width + 1 } else { marker_indent + marker_width + spaces_after_marker.max(1) }; @@ -1079,11 +1214,18 @@ fn parse_list_item_block_content( if !first_line && p.at(NEWLINE) && !p.at_blank_line() && !newline_has_quote_prefix { let action = classify_blank_line(p, required_indent, marker_indent); + // Check if the NEWLINE we're at is itself on a blank line + // (i.e., preceded by another newline). This distinguishes a real + // blank line from a content-terminating newline (e.g., after a + // fenced code block's closing fence). 
+ let is_blank = list_newline_is_blank_line(p); match action { BlankLineAction::ContinueItem => { consume_blank_line(p); - has_blank_line = true; - last_was_blank = true; + if is_blank { + has_blank_line = true; + } + last_was_blank = is_blank; continue; } BlankLineAction::EndItemAfterBlank => { @@ -1092,6 +1234,14 @@ fn parse_list_item_block_content( last_was_blank = true; break; } + BlankLineAction::EndItemAtBoundary => { + consume_blank_line(p); + if is_blank { + has_blank_line = true; + last_was_blank = true; + } + break; + } BlankLineAction::EndItemBeforeBlank => { break; } @@ -1112,7 +1262,12 @@ fn parse_list_item_block_content( at_blank_line_after_prefix(p) }; - if (p.at_line_start() || line_has_quote_prefix) && blank_line_after_prefix { + // On the first line (same line as marker), if we're at a blank line, + // this is a marker-only line followed by blank line. Handle this + // in the first_line && p.at(NEWLINE) block below, not here. + if first_line && blank_line_after_prefix && p.at(NEWLINE) { + // Fall through to the first_line && p.at(NEWLINE) handler below + } else if (p.at_line_start() || line_has_quote_prefix) && blank_line_after_prefix { if line_has_quote_prefix && quote_only_line_indent_at_current(p, quote_depth).is_some() && let Some(next_indent) = next_quote_content_indent(p, quote_depth) @@ -1163,6 +1318,19 @@ fn parse_list_item_block_content( last_was_blank = true; break; } + BlankLineAction::EndItemAtBoundary => { + // In the blank_line_after_prefix path, we know there's an + // actual blank line, so treat as EndItemAfterBlank. + if line_has_quote_prefix { + consume_quote_prefix(p, quote_depth); + } + consume_blank_line(p); + if !marker_line_break { + has_blank_line = true; + } + last_was_blank = true; + break; + } BlankLineAction::EndItemBeforeBlank => { break; } @@ -1195,6 +1363,25 @@ fn parse_list_item_block_content( if next_is_sibling { continue; } + + // Now check if we're at a blank line (the line immediately after marker is empty). 
+ // Per CommonMark: if marker-only line is followed by a blank line, + // the item is truly empty and subsequent content is outside the list. + let now_at_blank_line = p.lookahead(|p| { + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); + } + p.at(NEWLINE) || p.at(T![EOF]) + }); + + if now_at_blank_line { + // Item is empty - break out of the loop + break; + } + + // Continue to next iteration with fresh state to properly handle + // the continuation content on the next line. + continue; } if first_line { @@ -1247,6 +1434,138 @@ fn parse_list_item_block_content( } } + // Check for ATX heading on the first line of list item content. + // e.g., `- # Foo` should produce a heading inside the list item. + let atx_heading_info = p.lookahead(|p| { + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); + } + // # may be tokenized as HASH or MD_TEXTUAL_LITERAL + let is_hash = p.at(T![#]) + || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == '#')); + if !is_hash { + return None; + } + let text = p.cur_text(); + let hash_count = text.len(); + if hash_count < 1 || hash_count > 6 { + return None; + } + p.bump(p.cur()); + // Must be followed by space/tab, EOL, or EOF + if p.at(NEWLINE) || p.at(T![EOF]) { + return Some(hash_count); + } + if p.at(MD_TEXTUAL_LITERAL) { + let t = p.cur_text(); + if t.starts_with(' ') || t.starts_with('\t') { + return Some(hash_count); + } + } + None + }); + + if atx_heading_info.is_some() { + // Skip leading whitespace as trivia + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL)); + } + + // Manually build the heading node since we're on the first + // line and parse_header can't handle tokens here directly. 
+ let header_m = p.start(); + + // Build MdHashList > MdHash > T![#] + let hash_list_m = p.start(); + let hash_m = p.start(); + if p.at(T![#]) { + p.bump(T![#]); + } else { + p.bump_remap(T![#]); + } + hash_m.complete(p, MD_HASH); + hash_list_m.complete(p, MD_HASH_LIST); + + // Parse heading content (inline until end of line) + super::header::parse_header_content(p); + + // Parse trailing hashes + super::header::parse_trailing_hashes(p); + + header_m.complete(p, MD_HEADER); + + last_block_was_paragraph = false; + last_was_blank = false; + first_line = false; + continue; + } + + // Check for blockquote on the first line of list item content. + // Per CommonMark §5.2, list item content can include block-level + // elements like blockquotes on the same line as the marker. + // e.g., `> 1. > Blockquote` has a blockquote inside the list item. + let blockquote_start = p.lookahead(|p| { + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.bump(MD_TEXTUAL_LITERAL); + } + // Check for > as either T![>] or MD_TEXTUAL_LITERAL ">" + p.at(T![>]) + || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">") + }); + + if blockquote_start { + // Skip leading whitespace as trivia + while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { + p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL)); + } + + let prev_virtual = p.state().virtual_line_start; + let prev_required = p.state().list_item_required_indent; + p.state_mut().virtual_line_start = Some(p.cur_range().start()); + p.state_mut().list_item_required_indent = 0; + + // Remap textual ">" to T![>] so parse_quote can recognize it. + // parse_quote checks `p.at(T![>])` after skipping indent. + if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">" { + p.bump_remap(T![>]); + // We bumped the >, but parse_quote expects to bump it itself. + // Instead, manually build the quote node inline. 
+ let quote_m = p.start(); + p.state_mut().block_quote_depth += 1; + + // Skip optional space after > + if p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == " " { + p.parse_as_skipped_trivia_tokens(|p| p.bump(MD_TEXTUAL_LITERAL)); + } + p.state_mut().virtual_line_start = Some(p.cur_range().start()); + + parse_quote_block_list(p); + + p.state_mut().block_quote_depth -= 1; + quote_m.complete(p, MD_QUOTE); + + last_block_was_paragraph = false; + last_was_blank = false; + first_line = false; + p.state_mut().virtual_line_start = prev_virtual; + p.state_mut().list_item_required_indent = prev_required; + continue; + } + + // T![>] case: parse_quote can handle it directly + let parsed = super::quote::parse_quote(p); + if parsed.is_present() { + last_block_was_paragraph = false; + last_was_blank = false; + first_line = false; + p.state_mut().virtual_line_start = prev_virtual; + p.state_mut().list_item_required_indent = prev_required; + continue; + } + p.state_mut().virtual_line_start = prev_virtual; + p.state_mut().list_item_required_indent = prev_required; + } + let nested_marker = p.lookahead(|p| { while p.at(MD_TEXTUAL_LITERAL) && is_whitespace_only(p.cur_text()) { p.bump(MD_TEXTUAL_LITERAL); @@ -1501,7 +1820,10 @@ fn list_newline_is_blank_line(p: &MarkdownParser) -> bool { enum BlankLineAction { ContinueItem, + /// End item; actual blank lines were found before the next item. EndItemAfterBlank, + /// End item; no actual blank lines, just a normal item boundary. + EndItemAtBoundary, EndItemBeforeBlank, } @@ -1514,6 +1836,7 @@ fn classify_blank_line( // Skip ALL consecutive blank lines (not just one). // Per CommonMark §5.3, multiple blank lines between items still // belong to the same list - they just make it "loose". 
+ let mut blank_lines_found = 0usize; loop { let line_is_blank = p.lookahead(|p| { while p.at(MD_TEXTUAL_LITERAL) { @@ -1531,6 +1854,8 @@ fn classify_blank_line( break; } + blank_lines_found += 1; + while p.at(MD_TEXTUAL_LITERAL) { let text = p.cur_text(); if text == " " || text == "\t" { @@ -1563,7 +1888,12 @@ fn classify_blank_line( && (at_bullet_list_item_with_base_indent(p, marker_indent) || at_order_list_item_with_base_indent(p, marker_indent)) { - return BlankLineAction::EndItemAfterBlank; + // The first "blank line" is just the item-ending newline. + // Only report actual blank lines if more than 1 was found. + if blank_lines_found > 1 { + return BlankLineAction::EndItemAfterBlank; + } + return BlankLineAction::EndItemAtBoundary; } BlankLineAction::EndItemBeforeBlank @@ -1742,6 +2072,79 @@ fn has_bullet_item_after_blank_lines(p: &mut MarkdownParser) -> bool { }) } +/// Like `has_bullet_item_after_blank_lines` but also checks that the +/// bullet marker is at the expected indent level for this list. 
+fn has_bullet_item_after_blank_lines_at_indent( + p: &mut MarkdownParser, + expected_indent: usize, +) -> bool { + has_list_item_after_blank_lines_at_indent( + p, + expected_indent, + |p| { + if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) { + p.bump(p.cur()); + marker_followed_by_whitespace_or_eol(p) + } else { + false + } + }, + ) +} + +fn has_list_item_after_blank_lines_at_indent( + p: &mut MarkdownParser, + expected_indent: usize, + has_marker: F, +) -> bool +where + F: Fn(&mut MarkdownParser) -> bool, +{ + p.lookahead(|p| { + // Skip all blank lines + loop { + while p.at(MD_TEXTUAL_LITERAL) { + let text = p.cur_text(); + if text == " " || text == "\t" { + p.bump(MD_TEXTUAL_LITERAL); + } else { + break; + } + } + if p.at(NEWLINE) { + p.bump(NEWLINE); + continue; + } + break; + } + + let mut indent = 0; + while p.at(MD_TEXTUAL_LITERAL) { + let text = p.cur_text(); + if text == " " { + indent += 1; + p.bump(MD_TEXTUAL_LITERAL); + } else if text == "\t" { + indent += 4 - (indent % 4); + p.bump(MD_TEXTUAL_LITERAL); + } else { + break; + } + } + + // Check indent matches the list's marker indent range + if expected_indent == 0 { + if indent > 3 { + return false; + } + } else if indent < expected_indent || indent > expected_indent + 3 { + return false; + } + + has_marker(p) + }) +} + /// Check if there's an ordered list item after skipping blank lines. 
/// /// Per CommonMark §5.3, blank lines between list items don't end the list, @@ -1787,7 +2190,7 @@ where indent += 1; p.bump(MD_TEXTUAL_LITERAL); } else if text == "\t" { - indent += 4; + indent += 4 - (indent % 4); p.bump(MD_TEXTUAL_LITERAL); } else { break; diff --git a/crates/biome_markdown_parser/src/syntax/quote.rs b/crates/biome_markdown_parser/src/syntax/quote.rs index 33e4ac03fa61..13d1d2dd660b 100644 --- a/crates/biome_markdown_parser/src/syntax/quote.rs +++ b/crates/biome_markdown_parser/src/syntax/quote.rs @@ -195,7 +195,11 @@ impl ParseNodeList for QuoteBlockList { } // Parse regular block + // Treat content after '>' as column 0 for block parsing (fence detection). + let prev_virtual = p.state().virtual_line_start; + p.state_mut().virtual_line_start = Some(p.cur_range().start()); let parsed = super::parse_any_block_with_indent_code_policy(p, true); + p.state_mut().virtual_line_start = prev_virtual; if let Present(ref marker) = parsed { self.last_block_was_paragraph = is_paragraph_like(marker.kind(p)); } else { @@ -222,13 +226,13 @@ impl ParseNodeList for QuoteBlockList { } } -fn parse_quote_block_list(p: &mut MarkdownParser) { +pub(crate) fn parse_quote_block_list(p: &mut MarkdownParser) { let depth = p.state().block_quote_depth; let mut list = QuoteBlockList::new(depth); list.parse_list(p); } -fn line_has_quote_prefix_at_current(p: &MarkdownParser, depth: usize) -> bool { +pub(crate) fn line_has_quote_prefix_at_current(p: &MarkdownParser, depth: usize) -> bool { if depth == 0 { return false; } @@ -371,6 +375,7 @@ pub(crate) fn consume_quote_prefix(p: &mut MarkdownParser, depth: usize) -> bool consume_quote_prefix_impl(p, depth, true) } +/// Check if a quote prefix starts at the current position. 
pub(crate) fn consume_quote_prefix_without_virtual(p: &mut MarkdownParser, depth: usize) -> bool { if depth == 0 || !has_quote_prefix(p, depth) { return false; diff --git a/crates/biome_markdown_parser/src/to_html.rs b/crates/biome_markdown_parser/src/to_html.rs index 90d75ad4127a..2d6f912fa43e 100644 --- a/crates/biome_markdown_parser/src/to_html.rs +++ b/crates/biome_markdown_parser/src/to_html.rs @@ -626,6 +626,9 @@ fn render_fenced_code_block( if content_indent > 0 { content = strip_indent_preserve_tabs(&content, content_indent); } + if quote_indent > 0 { + content = strip_quote_prefixes(&content, quote_indent); + } // Escape HTML but preserve the content structure out.push_str(&escape_html(&content)); @@ -715,16 +718,7 @@ fn render_bullet_list( quote_indent: usize, ) { let range = list.syntax().text_trimmed_range(); - let mut is_tight = ctx.is_list_tight(range); - let has_blank_lines = list.md_bullet_list().iter().any(|bullet| { - bullet - .content() - .iter() - .any(|block| matches!(block, AnyMdBlock::AnyLeafBlock(AnyLeafBlock::MdNewline(_)))) - }); - if has_blank_lines { - is_tight = false; - } + let is_tight = ctx.is_list_tight(range); out.push_str("
\n"); + self.list_stack.pop(); + return; + } + + if MdOrderedListItem::cast(node.clone()).is_some() { + self.push_str("\n"); + self.list_stack.pop(); + return; + } + + if MdBullet::cast(node.clone()).is_some() { + let buffer = self.pop_buffer(); + let state = self.list_item_stack.pop(); + if let (BufferKind::ListItem, Some(state)) = (buffer.kind, state) { + if state.is_empty { + self.push_str("
  • \n"); + return; + } + + self.push_str("
  • "); + if state.leading_newline { + self.push_str("\n"); + } + + let mut content = buffer.content; + if state.trim_trailing_newline && content.ends_with('\n') { + content.pop(); + } + self.push_str(&content); + self.push_str("
  • \n"); + } + return; + } + + if MdInlineEmphasis::cast(node.clone()).is_some() { + self.push_str("
    "); + return; } - AnyContainerBlock::MdBulletListItem(list) => { - render_bullet_list(list, ctx, out, quote_indent); + + if MdInlineItalic::cast(node.clone()).is_some() { + self.push_str(""); + return; } - AnyContainerBlock::MdOrderedListItem(list) => { - render_ordered_list(list, ctx, out, quote_indent); + + if MdInlineLink::cast(node.clone()).is_some() { + self.suppressed_inline_nodes.pop(); + self.push_str(""); + return; + } + + if MdReferenceLink::cast(node).is_some() { + let buffer = self.pop_buffer(); + if let BufferKind::ReferenceLink(state) = buffer.kind { + if let Some(url) = state.url { + self.push_str(""); + self.push_str(&buffer.content); + self.push_str(""); + } else { + self.push_str("["); + self.push_str(&buffer.content); + self.push_str("]"); + if let Some(label) = state.label_display { + self.push_str("["); + self.push_str(&escape_html(&label)); + self.push_str("]"); + } + } + } } } -} -/// Render a paragraph. -fn render_paragraph( - para: &MdParagraph, - ctx: &HtmlRenderContext, - out: &mut String, - in_tight_list: bool, - quote_indent: usize, -) { - let mut content = render_inline_list(¶.list(), ctx); - if quote_indent > 0 { - content = strip_quote_prefixes(&content, quote_indent); + fn push_buffer(&mut self, kind: BufferKind) { + self.buffers.push(Buffer { + kind, + content: String::new(), + }); } - // Trim both ends - leading whitespace can appear from parser including - // the space after list markers in the paragraph content - let content = - strip_paragraph_indent(content.trim_matches(|c| c == ' ' || c == '\n' || c == '\r')); - - if in_tight_list { - // In tight lists, paragraphs are rendered without

    tags - out.push_str(&content); - out.push('\n'); - } else { - out.push_str("

    "); - out.push_str(&content); - out.push_str("

    \n"); + + fn pop_buffer(&mut self) -> Buffer { + self.buffers.pop().unwrap_or(Buffer { + kind: BufferKind::Root, + content: String::new(), + }) + } + + fn out_mut(&mut self) -> &mut String { + &mut self.buffers.last_mut().expect("missing buffer").content + } + + fn push_str(&mut self, value: &str) { + self.out_mut().push_str(value); } + + fn block_indent(&self, range: TextRange) -> BlockIndent { + self.list_item_stack + .last() + .and_then(|state| state.block_indents.get(&range).copied()) + .unwrap_or_default() + } +} + +fn is_last_inline_item(node: &SyntaxNode) -> bool { + let Some(parent) = node.parent() else { + return false; + }; + let Some(list) = biome_markdown_syntax::MdInlineItemList::cast(parent) else { + return false; + }; + list.iter().last().is_some_and(|item| item.syntax() == node) } /// Strip leading whitespace from paragraph continuation lines. @@ -492,36 +1083,23 @@ fn strip_paragraph_indent(content: &str) -> String { } /// Render an ATX header (# style). -fn render_atx_header(header: &MdHeader, ctx: &HtmlRenderContext, out: &mut String) { +fn header_level(header: &MdHeader) -> usize { // Count total hash characters in the before list. // The lexer emits all consecutive `#` chars as a single HASH token, // so we sum the text lengths of all hash tokens. // Use text_trimmed() to exclude any leading trivia (skipped indentation spaces). - let level = header + header .before() .iter() .filter_map(|h| h.hash_token().ok()) .map(|tok| tok.text_trimmed().len()) .sum::() - .clamp(1, 6); - - out.push_str("'); - - if let Some(content) = header.content() { - let text = render_inline_list(&content.list(), ctx); - out.push_str(text.trim()); - } - - out.push_str("\n"); + .clamp(1, 6) } /// Render a setext header (underline style). 
-fn render_setext_header(header: &MdSetextHeader, ctx: &HtmlRenderContext, out: &mut String) { - let level = if let Ok(underline) = header.underline_token() { +fn setext_header_level(header: &MdSetextHeader) -> usize { + if let Ok(underline) = header.underline_token() { let text = underline.text(); if text.trim_start().starts_with('=') { 1 @@ -530,41 +1108,13 @@ fn render_setext_header(header: &MdSetextHeader, ctx: &HtmlRenderContext, out: & } } else { 1 - }; - - out.push_str("'); - - let text = render_inline_list(&header.content(), ctx); - out.push_str(text.trim()); - - out.push_str("\n"); + } } // ============================================================================ // Code Block Rendering // ============================================================================ -/// Render a code block (fenced or indented). -fn render_code_block( - code: &AnyCodeBlock, - out: &mut String, - list_indent: usize, - quote_indent: usize, -) { - match code { - AnyCodeBlock::MdFencedCodeBlock(fenced) => { - render_fenced_code_block(fenced, out, list_indent, quote_indent); - } - AnyCodeBlock::MdIndentCodeBlock(indented) => { - render_indented_code_block(indented, out, list_indent, quote_indent); - } - } -} - /// Render a fenced code block. /// /// Handles the architectural issue where the CST content may include the @@ -697,25 +1247,6 @@ fn render_indented_code_block_in_list( out.push_str("\n"); } -fn render_block_in_list( - block: &AnyMdBlock, - ctx: &HtmlRenderContext, - out: &mut String, - in_tight_list: bool, - list_indent: usize, - quote_indent: usize, - first_line_column: usize, -) { - if let AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock(AnyCodeBlock::MdIndentCodeBlock( - code, - ))) = block - { - render_indented_code_block_in_list(code, out, list_indent, quote_indent, first_line_column); - } else { - render_block(block, ctx, out, in_tight_list, list_indent, quote_indent); - } -} - /// Render an HTML block. 
fn render_html_block( html: &MdHtmlBlock, @@ -736,388 +1267,6 @@ fn render_html_block( } } -// ============================================================================ -// Container Block Rendering -// ============================================================================ - -/// Render a blockquote. -fn render_blockquote( - quote: &MdQuote, - ctx: &HtmlRenderContext, - out: &mut String, - list_indent: usize, - quote_indent: usize, -) { - out.push_str("
    \n"); - - let content = quote.content(); - let marker_indent = ctx.quote_indent(quote.syntax().text_trimmed_range()); - for block in content.iter() { - render_block( - &block, - ctx, - out, - false, - list_indent, - quote_indent + marker_indent, - ); - } - - out.push_str("
    \n"); -} - -/// Render a bullet (unordered) list. -fn render_bullet_list( - list: &MdBulletListItem, - ctx: &HtmlRenderContext, - out: &mut String, - quote_indent: usize, -) { - let range = list.syntax().text_trimmed_range(); - let is_tight = ctx.is_list_tight(range); - - out.push_str("
      \n"); - - for bullet in list.md_bullet_list() { - render_list_item(&bullet, ctx, out, is_tight, quote_indent); - } - - out.push_str("
    \n"); -} - -/// Render an ordered list. -fn render_ordered_list( - list: &MdOrderedListItem, - ctx: &HtmlRenderContext, - out: &mut String, - quote_indent: usize, -) { - let range = list.syntax().text_trimmed_range(); - let is_tight = ctx.is_list_tight(range); - - // Get starting number from first item - let start = list - .md_bullet_list() - .first() - .and_then(|bullet| bullet.bullet().ok()) - .map_or(1, |marker| { - let text = marker.text(); - // Extract number from "1." or "1)" format - text.trim_start() - .chars() - .take_while(|c| c.is_ascii_digit()) - .collect::() - .parse::() - .unwrap_or(1) - }); - - if start == 1 { - out.push_str("
      \n"); - } else { - out.push_str("
        \n"); - } - - for bullet in list.md_bullet_list() { - render_list_item(&bullet, ctx, out, is_tight, quote_indent); - } - - out.push_str("
      \n"); -} - -/// Render a list item. -fn render_list_item( - bullet: &MdBullet, - ctx: &HtmlRenderContext, - out: &mut String, - is_tight: bool, - quote_indent: usize, -) { - out.push_str("
    1. "); - - let list_indent = ctx.list_item_indent(bullet.syntax().text_trimmed_range()); - let blocks: Vec<_> = bullet.content().iter().collect(); - // A blank line within an item requires two consecutive newline blocks - // (one ending the previous line, one for the blank line itself). - // A single MD_NEWLINE between blocks is just a structural separator. - let item_has_blank_line = blocks - .windows(2) - .any(|pair| is_newline_block(&pair[0]) && is_newline_block(&pair[1])); - let is_tight = is_tight && !item_has_blank_line; - - let (indent, first_line_code_indent, first_line_column) = match list_indent { - Some(entry) => { - let base = list_item_required_indent(entry); - let first_line_code = - (entry.spaces_after_marker > INDENT_CODE_BLOCK_SPACES).then_some(base); - let column = entry.marker_indent + entry.marker_width; - (base, first_line_code, column) - } - None => (0, None, 0), - }; - - if is_empty_content(&blocks) { - out.push_str("
    2. \n"); - return; - } - - if is_tight { - if blocks.len() == 1 && is_paragraph_block(&blocks[0]) { - // Tight list with single paragraph: no newline after
    3. - if let Some(block) = blocks.first() { - let block_indent = match (first_line_code_indent, block) { - ( - Some(code_indent), - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_), - )), - ) => code_indent, - _ => indent, - }; - let column_for_block = if first_line_code_indent.is_some() - && matches!( - block, - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_) - )) - ) { - first_line_column - } else { - 0 - }; - render_block_in_list( - block, - ctx, - out, - true, - block_indent, - quote_indent, - column_for_block, - ); - } - // Remove trailing newline for tight lists - if out.ends_with('\n') { - out.pop(); - } - } else if blocks.first().is_some_and(is_paragraph_block) { - // Tight list with multiple blocks: render paragraph inline with
    4. - if let Some(first) = blocks.first() { - let block_indent = match (first_line_code_indent, first) { - ( - Some(code_indent), - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_), - )), - ) => code_indent, - _ => indent, - }; - let column_for_block = if first_line_code_indent.is_some() - && matches!( - first, - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_) - )) - ) { - first_line_column - } else { - 0 - }; - render_block_in_list( - first, - ctx, - out, - true, - block_indent, - quote_indent, - column_for_block, - ); - } - for block in blocks.iter().skip(1) { - render_block_in_list(block, ctx, out, true, indent, quote_indent, 0); - } - } else { - out.push('\n'); - for (idx, block) in blocks.iter().enumerate() { - let block_indent = if idx == 0 { - match (first_line_code_indent, block) { - ( - Some(code_indent), - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_), - )), - ) => code_indent, - _ => indent, - } - } else { - indent - }; - let column_for_block = if idx == 0 - && first_line_code_indent.is_some() - && matches!( - block, - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_) - )) - ) { - first_line_column - } else { - 0 - }; - render_block_in_list( - block, - ctx, - out, - true, - block_indent, - quote_indent, - column_for_block, - ); - } - // Remove trailing newline when the last content block is a paragraph - // (tight list paragraphs should not have trailing newlines) - if blocks - .iter() - .rev() - .find(|b| !is_newline_block(b)) - .is_some_and(is_paragraph_block) - && out.ends_with('\n') - { - out.pop(); - } - } - } else { - // Loose list or multiple blocks - out.push('\n'); - for (idx, block) in blocks.iter().enumerate() { - let block_indent = if idx == 0 { - match (first_line_code_indent, block) { - ( - Some(code_indent), - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - 
AnyCodeBlock::MdIndentCodeBlock(_), - )), - ) => code_indent, - _ => indent, - } - } else { - indent - }; - let column_for_block = if idx == 0 - && first_line_code_indent.is_some() - && matches!( - block, - AnyMdBlock::AnyLeafBlock(AnyLeafBlock::AnyCodeBlock( - AnyCodeBlock::MdIndentCodeBlock(_) - )) - ) { - first_line_column - } else { - 0 - }; - render_block_in_list( - block, - ctx, - out, - false, - block_indent, - quote_indent, - column_for_block, - ); - } - } - - out.push_str("
    5. \n"); -} - -// ============================================================================ -// Inline Rendering -// ============================================================================ - -/// Render an inline item list to HTML string. -fn render_inline_list( - list: &biome_markdown_syntax::MdInlineItemList, - ctx: &HtmlRenderContext, -) -> String { - let mut result = String::new(); - let items: Vec<_> = list.iter().collect(); - let len = items.len(); - - for (i, item) in items.iter().enumerate() { - let is_last = i == len - 1; - - // Special handling for hard line breaks at end of block - if is_last && let AnyMdInline::MdHardLine(hard) = item { - // Per CommonMark: hard line break at end of block is ignored - // But if it was a backslash, output the backslash - if let Ok(token) = hard.value_token() - && token.text().starts_with('\\') - { - result.push('\\'); - } - // Otherwise (trailing spaces), output nothing - continue; - } - - render_inline(item, ctx, &mut result); - } - result -} - -/// Render an inline element. 
-fn render_inline(inline: &AnyMdInline, ctx: &HtmlRenderContext, out: &mut String) { - match inline { - AnyMdInline::MdTextual(text) => { - render_textual(text, out); - } - AnyMdInline::MdInlineEmphasis(em) => { - out.push_str(""); - out.push_str(&render_inline_list(&em.content(), ctx)); - out.push_str(""); - } - AnyMdInline::MdInlineItalic(italic) => { - out.push_str(""); - out.push_str(&render_inline_list(&italic.content(), ctx)); - out.push_str(""); - } - AnyMdInline::MdInlineCode(code) => { - render_inline_code(code, out); - } - AnyMdInline::MdInlineLink(link) => { - render_inline_link(link, ctx, out); - } - AnyMdInline::MdInlineImage(img) => { - render_inline_image(img, ctx, out); - } - AnyMdInline::MdReferenceLink(link) => { - render_reference_link(link, ctx, out); - } - AnyMdInline::MdReferenceImage(img) => { - render_reference_image(img, ctx, out); - } - AnyMdInline::MdAutolink(autolink) => { - render_autolink(autolink, out); - } - AnyMdInline::MdInlineHtml(html) => { - render_inline_html(html, out); - } - AnyMdInline::MdHtmlBlock(html) => { - // Inline HTML block (rare case) - let content = collect_raw_inline_text(&html.content()); - out.push_str(&content); - } - AnyMdInline::MdHardLine(_) => { - out.push_str("
      \n"); - } - AnyMdInline::MdSoftBreak(_) => { - out.push('\n'); - } - AnyMdInline::MdEntityReference(entity) => { - render_entity_reference(entity, out); - } - } -} - /// Render textual content. fn render_textual(text: &MdTextual, out: &mut String) { if let Ok(token) = text.value_token() { @@ -1152,125 +1301,6 @@ fn render_inline_code(code: &MdInlineCode, out: &mut String) { out.push_str(""); } -/// Render an inline link. -fn render_inline_link(link: &MdInlineLink, ctx: &HtmlRenderContext, out: &mut String) { - let text = render_inline_list(&link.text(), ctx); - let dest = collect_inline_text(&link.destination()); - let dest = process_link_destination(&dest); - - out.push_str("'); - out.push_str(&text); - out.push_str(""); -} - -/// Render an inline image. -fn render_inline_image(img: &MdInlineImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = extract_alt_text(&img.alt(), ctx); - - let dest = collect_inline_text(&img.destination()); - let dest = process_link_destination(&dest); - - out.push_str("\"");"); -} - -/// Render a reference link. -fn render_reference_link(link: &MdReferenceLink, ctx: &HtmlRenderContext, out: &mut String) { - let text = render_inline_list(&link.text(), ctx); - let text_raw = collect_inline_text(&link.text()); - - render_reference_common( - link.label(), - text_raw.clone(), - |label_node| collect_inline_text(&label_node.label()), - ctx, - out, - |url, title, out| { - out.push_str("'); - out.push_str(&text); - out.push_str(""); - }, - |label_display, out| { - // No definition found - output as literal text - out.push('['); - out.push_str(&text); - out.push(']'); - if let Some(label) = label_display { - out.push('['); - out.push_str(&escape_html(&label)); - out.push(']'); - } - }, - ); -} - -/// Render a reference image. 
-fn render_reference_image(img: &MdReferenceImage, ctx: &HtmlRenderContext, out: &mut String) { - let alt = extract_alt_text(&img.alt(), ctx); - let alt_raw = collect_inline_text(&img.alt()); - - render_reference_common( - img.label(), - alt_raw.clone(), - |label_node| collect_inline_text(&label_node.label()), - ctx, - out, - |url, title, out| { - out.push_str("\"");"); - }, - |label_display, out| { - // No definition found - output as literal text - out.push_str("!["); - out.push_str(&alt); - out.push(']'); - if let Some(label) = label_display { - out.push('['); - out.push_str(&escape_html(&label)); - out.push(']'); - } - }, - ); -} - fn resolve_reference_label( label_node: Option, fallback: String, @@ -1291,29 +1321,6 @@ where } } -fn render_reference_common( - label_node: Option, - fallback: String, - label_text: FLabel, - ctx: &HtmlRenderContext, - out: &mut String, - on_found: FFound, - on_missing: FMissing, -) where - FLabel: FnOnce(&L) -> String, - FFound: FnOnce(&str, Option<&str>, &mut String), - FMissing: FnOnce(Option, &mut String), -{ - // Get label (if explicit) or use the fallback - let (label, label_display) = resolve_reference_label(label_node, fallback, label_text); - - if let Some((url, title)) = ctx.get_link_definition(&label) { - on_found(url, title.as_deref(), out); - } else { - on_missing(label_display, out); - } -} - /// Render an autolink. fn render_autolink(autolink: &MdAutolink, out: &mut String) { let content = collect_raw_inline_text(&autolink.value()); From 80a87db2a06f48bf79d5baa81d6f6e66c4cbbfae Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Fri, 30 Jan 2026 11:43:57 -0500 Subject: [PATCH 12/26] refactor(markdown): tidy list marker detection Extract list-marker follow checks and document marker rules. 
--- .../src/syntax/inline/code_span.rs | 54 +++++++++---------- .../src/syntax/inline/links.rs | 2 +- .../src/syntax/link_block.rs | 2 +- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/crates/biome_markdown_parser/src/syntax/inline/code_span.rs b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs index 5147f83efbb2..d055a276e9a2 100644 --- a/crates/biome_markdown_parser/src/syntax/inline/code_span.rs +++ b/crates/biome_markdown_parser/src/syntax/inline/code_span.rs @@ -3,8 +3,8 @@ use biome_markdown_syntax::kind::MarkdownSyntaxKind::*; use biome_parser::Parser; use biome_parser::prelude::ParsedSyntax::{self, *}; -use crate::lexer::MarkdownLexContext; use crate::MarkdownParser; +use crate::lexer::MarkdownLexContext; /// Parse a hard line break. /// @@ -60,7 +60,7 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) - } // Per CommonMark, block interrupts (including list markers) can // terminate paragraphs. A code span cannot cross a block boundary. - if crate::syntax::at_block_interrupt(p) || at_list_marker_after_newline(p) { + if crate::syntax::at_block_interrupt(p) || is_at_list_marker_after_newline(p) { return false; } continue; @@ -89,10 +89,13 @@ fn has_matching_code_span_closer(p: &mut MarkdownParser, opening_count: usize) - /// Check if we're at a list marker after a newline. /// This is used to detect when a code span would cross a list item boundary. -fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { +fn is_at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { + // List markers can be indented up to 3 spaces; 4+ means indented code block. 
+ const LIST_MARKER_MAX_INDENT: usize = 4; + // Skip up to 3 spaces of indent (list markers can be indented 0-3 spaces) let mut columns = 0usize; - while columns < 4 + while columns < LIST_MARKER_MAX_INDENT && p.at(MD_TEXTUAL_LITERAL) && p.cur_text().chars().all(|c| c == ' ' || c == '\t') { @@ -103,7 +106,7 @@ fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { _ => {} } } - if columns >= 4 { + if columns >= LIST_MARKER_MAX_INDENT { return false; // Indented code block, not a list marker } p.bump(MD_TEXTUAL_LITERAL); @@ -112,16 +115,10 @@ fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { // Check for bullet list markers: -, *, + if p.at(T![-]) || p.at(T![*]) || p.at(T![+]) { let marker_text = p.cur_text(); + // Only a single -, *, or + is a list marker; longer runs are not. if marker_text.len() == 1 { p.bump_any(); - // Must be followed by space, tab, or EOL - if p.at(NEWLINE) || p.at(T![EOF]) { - return true; - } - if p.at(MD_TEXTUAL_LITERAL) { - let text = p.cur_text(); - return text.starts_with(' ') || text.starts_with('\t'); - } + return is_list_marker_followed_by_space_or_eol(p); } return false; } @@ -129,15 +126,7 @@ fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { // Check for ordered list marker: digits followed by . 
or ) if p.at(MD_ORDERED_LIST_MARKER) { p.bump(MD_ORDERED_LIST_MARKER); - // Must be followed by space, tab, or EOL - if p.at(NEWLINE) || p.at(T![EOF]) { - return true; - } - if p.at(MD_TEXTUAL_LITERAL) { - let text = p.cur_text(); - return text.starts_with(' ') || text.starts_with('\t'); - } - return false; + return is_list_marker_followed_by_space_or_eol(p); } // Check for textual bullet markers (lexed as MD_TEXTUAL_LITERAL in some contexts) @@ -145,20 +134,25 @@ fn at_list_marker_after_newline(p: &mut MarkdownParser) -> bool { let text = p.cur_text(); if text == "-" || text == "*" || text == "+" { p.bump(MD_TEXTUAL_LITERAL); - // Must be followed by space, tab, or EOL - if p.at(NEWLINE) || p.at(T![EOF]) { - return true; - } - if p.at(MD_TEXTUAL_LITERAL) { - let next = p.cur_text(); - return next.starts_with(' ') || next.starts_with('\t'); - } + return is_list_marker_followed_by_space_or_eol(p); } } false } +/// A list marker must be followed by space, tab, or end of line/input. +fn is_list_marker_followed_by_space_or_eol(p: &MarkdownParser) -> bool { + if p.at(NEWLINE) || p.at(T![EOF]) { + return true; + } + if p.at(MD_TEXTUAL_LITERAL) { + let text = p.cur_text(); + return text.starts_with(' ') || text.starts_with('\t'); + } + false +} + /// Parse inline code span (`` `code` `` or ``` `` `code` `` ```). 
/// /// Grammar: MdInlineCode = l_tick: '`' content: MdInlineItemList r_tick: '`' diff --git a/crates/biome_markdown_parser/src/syntax/inline/links.rs b/crates/biome_markdown_parser/src/syntax/inline/links.rs index 820a9d9968e0..ac75a99d4233 100644 --- a/crates/biome_markdown_parser/src/syntax/inline/links.rs +++ b/crates/biome_markdown_parser/src/syntax/inline/links.rs @@ -5,8 +5,8 @@ use biome_parser::Parser; use biome_parser::prelude::ParsedSyntax::{self, *}; use biome_rowan::TextRange; -use crate::lexer::MarkdownLexContext; use crate::MarkdownParser; +use crate::lexer::MarkdownLexContext; use crate::syntax::reference::normalize_reference_label; use crate::syntax::inline::{parse_inline_item_list_until, parse_inline_item_list_until_no_links}; diff --git a/crates/biome_markdown_parser/src/syntax/link_block.rs b/crates/biome_markdown_parser/src/syntax/link_block.rs index 82b2d4b06a38..84f18ba18d2a 100644 --- a/crates/biome_markdown_parser/src/syntax/link_block.rs +++ b/crates/biome_markdown_parser/src/syntax/link_block.rs @@ -24,8 +24,8 @@ use biome_markdown_syntax::MarkdownSyntaxKind::*; use biome_parser::Parser; use biome_parser::prelude::ParsedSyntax::{self, *}; -use crate::lexer::MarkdownLexContext; use crate::MarkdownParser; +use crate::lexer::MarkdownLexContext; /// Maximum label length per CommonMark spec (999 characters). 
const MAX_LABEL_LENGTH: usize = 999; From de946f8e83d94712ebe944638c228d52ef08eb95 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Fri, 30 Jan 2026 15:23:39 -0500 Subject: [PATCH 13/26] refactor(markdown): fix imports --- crates/biome_markdown_parser/src/parser.rs | 12 ++-- .../src/syntax/html_block.rs | 11 ++++ .../src/syntax/inline/code_span.rs | 5 +- .../src/syntax/inline/emphasis.rs | 7 +- .../src/syntax/inline/html.rs | 9 ++- .../src/syntax/inline/links.rs | 64 ++++++++++--------- .../src/syntax/link_block.rs | 47 ++++++-------- .../biome_markdown_parser/src/syntax/mod.rs | 10 +-- crates/biome_markdown_parser/src/to_html.rs | 18 +++--- 9 files changed, 96 insertions(+), 87 deletions(-) diff --git a/crates/biome_markdown_parser/src/parser.rs b/crates/biome_markdown_parser/src/parser.rs index 9a772601bd5a..07591b44235f 100644 --- a/crates/biome_markdown_parser/src/parser.rs +++ b/crates/biome_markdown_parser/src/parser.rs @@ -7,7 +7,9 @@ use biome_parser::{ParserContextCheckpoint, diagnostic::merge_diagnostics}; use biome_rowan::{TextRange, TextSize}; use std::collections::HashSet; +use crate::lexer::{MarkdownLexContext, MarkdownReLexContext}; use crate::syntax::inline::EmphasisContext; +use crate::syntax::parse_error::DEFAULT_MAX_NESTING_DEPTH; use crate::token_source::{MarkdownTokenSource, MarkdownTokenSourceCheckpoint}; /// Options for configuring the markdown parser. @@ -23,7 +25,7 @@ pub struct MarkdownParseOptions { impl Default for MarkdownParseOptions { fn default() -> Self { Self { - max_nesting_depth: crate::syntax::parse_error::DEFAULT_MAX_NESTING_DEPTH, + max_nesting_depth: DEFAULT_MAX_NESTING_DEPTH, } } } @@ -196,7 +198,7 @@ impl<'source> MarkdownParser<'source> { /// This makes whitespace produce separate tokens for destination/title parsing. 
pub(crate) fn re_lex_link_definition(&mut self) { self.source - .re_lex(crate::lexer::MarkdownReLexContext::LinkDefinition); + .re_lex(MarkdownReLexContext::LinkDefinition); } /// Force re-lex the current token in Regular context. @@ -205,7 +207,7 @@ impl<'source> MarkdownParser<'source> { /// e.g., when entering title content where whitespace should not split tokens. pub(crate) fn force_relex_regular(&mut self) { self.source - .force_relex_in_context(crate::lexer::MarkdownLexContext::Regular); + .force_relex_in_context(MarkdownLexContext::Regular); } /// Force re-lex the current token in CodeSpan context. @@ -213,7 +215,7 @@ impl<'source> MarkdownParser<'source> { /// Used for autolinks where `\>` should be `\` + `>` as separate tokens. pub(crate) fn force_relex_code_span(&mut self) { self.source - .force_relex_in_context(crate::lexer::MarkdownLexContext::CodeSpan); + .force_relex_in_context(MarkdownLexContext::CodeSpan); } /// Re-lex the current token as single-char emphasis delimiter. @@ -227,7 +229,7 @@ impl<'source> MarkdownParser<'source> { /// This invalidates any buffered lookahead, so ensure no lookahead is active. pub(crate) fn force_relex_emphasis_inline(&mut self) -> MarkdownSyntaxKind { self.source - .re_lex(crate::lexer::MarkdownReLexContext::EmphasisInline) + .re_lex(MarkdownReLexContext::EmphasisInline) } pub(crate) fn set_force_ordered_list_marker(&mut self, value: bool) { diff --git a/crates/biome_markdown_parser/src/syntax/html_block.rs b/crates/biome_markdown_parser/src/syntax/html_block.rs index 5264929c322f..43222184e82b 100644 --- a/crates/biome_markdown_parser/src/syntax/html_block.rs +++ b/crates/biome_markdown_parser/src/syntax/html_block.rs @@ -39,20 +39,31 @@ fn is_html_like_content(p: &MarkdownParser) -> bool { #[derive(Clone, Copy)] enum HtmlBlockKind { + /// CommonMark HTML block type 1: raw `