diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs index c932558fea91..cb38242334d3 100644 --- a/crates/biome_markdown_parser/src/lexer/mod.rs +++ b/crates/biome_markdown_parser/src/lexer/mod.rs @@ -15,6 +15,8 @@ use biome_unicode_table::lookup_byte; use crate::syntax::{MAX_BLOCK_PREFIX_INDENT, TAB_STOP_SPACES}; +const MAX_ORDERED_LIST_MARKER_DIGITS: usize = 9; + /// Lexer context for different markdown parsing modes #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum MarkdownLexContext { @@ -260,14 +262,19 @@ impl<'src> MarkdownLexer<'src> { // In link definition context, whitespace separates tokens. // We consume it as textual literal so it's not treated as trivia by the parser. self.consume_link_definition_whitespace() - } else if self.after_newline && matches!(current, b' ' | b'\t') { + } else if self.after_newline && is_space_or_tab_byte(current) { // At line start, emit single whitespace tokens to allow // indentation handling and quote marker spacing. self.consume_single_whitespace_as_text() - } else if matches!(current, b' ' | b'\t') && self.is_after_block_quote_marker() { + } else if is_space_or_tab_byte(current) && self.is_after_block_quote_marker() { // After a block quote marker, emit a single whitespace token // so the parser can skip the optional space. self.consume_single_whitespace_as_text() + } else if is_space_or_tab_byte(current) && self.is_in_list_marker_whitespace() { + // While consuming the leading whitespace after a list marker, + // emit one space/tab per token so the parser can distinguish + // the optional post-marker separator from content indent. 
+ self.consume_single_whitespace_as_text() } else if current == b' ' && !matches!(context, MarkdownLexContext::HeadingContent) && self.is_potential_hard_line_break() @@ -683,6 +690,118 @@ impl<'src> MarkdownLexer<'src> { saw_marker } + /// Returns true if the current whitespace is part of the leading + /// space/tab run immediately following a top-level list marker. + fn is_in_list_marker_whitespace(&self) -> bool { + let bytes = self.source.as_bytes(); + let Some(&current) = bytes.get(self.position) else { + return false; + }; + if !is_space_or_tab_byte(current) { + return false; + } + + let before = &self.source[..self.position]; + let last_newline_pos = before.rfind(['\n', '\r']); + let line_start = match last_newline_pos { + Some(pos) => { + let before_bytes = before.as_bytes(); + if before_bytes.get(pos) == Some(&b'\r') + && before_bytes.get(pos + 1) == Some(&b'\n') + { + pos + 2 + } else { + pos + 1 + } + } + None => 0, + }; + + let prefix = &bytes[line_start..self.position]; + let mut idx = 0usize; + let mut indent = 0usize; + + while prefix.get(idx).copied().is_some_and(is_space_or_tab_byte) { + if prefix[idx] == b'\t' { + indent += TAB_STOP_SPACES - (indent % TAB_STOP_SPACES); + } else { + indent += 1; + } + if indent > MAX_BLOCK_PREFIX_INDENT { + return false; + } + idx += 1; + } + + if idx >= prefix.len() { + return false; + } + + match lookup_byte(prefix[idx]) { + MIN | MUL | PLS => { + idx += 1; + } + ZER | DIG => { + let digit_start = idx; + while prefix.get(idx).copied().is_some_and(is_ascii_digit_byte) { + idx += 1; + if idx - digit_start > MAX_ORDERED_LIST_MARKER_DIGITS { + return false; + } + } + + let Some(delimiter) = prefix.get(idx).copied() else { + return false; + }; + if !matches!(lookup_byte(delimiter), PRD | PNC) { + return false; + } + idx += 1; + } + _ => return false, + } + + let trailing = &prefix[idx..]; + if trailing.is_empty() { + let mut saw_tab = current == b'\t'; + let mut next = self.position + 1; + while 
bytes.get(next).copied().is_some_and(is_space_or_tab_byte) { + if bytes[next] == b'\t' { + saw_tab = true; + } + next += 1; + } + + if !saw_tab { + return false; + } + + if current == b'\t' { + return !bytes + .get(self.position + 1) + .copied() + .is_some_and(is_space_or_tab_byte); + } + + return true; + } + + if !trailing.iter().copied().all(is_space_or_tab_byte) || trailing[0] != b' ' { + return false; + } + + let mut saw_tab = current == b'\t' || trailing.contains(&b'\t'); + let mut next = self.position + 1; + while bytes.get(next).copied().is_some_and(is_space_or_tab_byte) { + if bytes[next] == b'\t' { + saw_tab = true; + } + next += 1; + } + + saw_tab + } + /// Consumes thematic break, setext underline, or emphasis markers (*, -, _). /// /// For `-` at line start: @@ -1243,6 +1362,16 @@ impl<'src> MarkdownLexer<'src> { } } +#[inline] +fn is_space_or_tab_byte(byte: u8) -> bool { + matches!(lookup_byte(byte), WHS) && !matches!(byte, b'\n' | b'\r') +} + +#[inline] +fn is_ascii_digit_byte(byte: u8) -> bool { + matches!(lookup_byte(byte), ZER | DIG) +} + impl<'src> ReLexer<'src> for MarkdownLexer<'src> { fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind { let old_position = self.position; diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs index a04175cb1755..dd3ff198210c 100644 --- a/crates/biome_markdown_parser/src/syntax/list.rs +++ b/crates/biome_markdown_parser/src/syntax/list.rs @@ -191,15 +191,20 @@ fn emit_indent_char_list(p: &mut MarkdownParser, max_columns: usize) -> usize { } /// Consume the first whitespace token after the list marker as MD_LIST_POST_MARKER_SPACE. -/// Returns true if a space was consumed. -fn emit_list_post_marker_space(p: &mut MarkdownParser) -> bool { +/// Returns true if a space/tab separator was recognized. 
+fn emit_list_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool { if !p.at(MD_TEXTUAL_LITERAL) { return false; } let text = p.cur_text(); - if text == " " || text == "\t" { + if text == " " { p.bump_remap(MD_LIST_POST_MARKER_SPACE); true + } else if text == "\t" { + if !preserve_tab { + p.bump_remap(MD_LIST_POST_MARKER_SPACE); + } + true } else { false } @@ -834,12 +839,19 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) { // Post-marker space (first whitespace token after marker) if !setext_marker { - emit_list_post_marker_space(p); + emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES); } - // Content indent (remaining whitespace tokens on first line) + // Content indent (remaining whitespace tokens on first line). + // For first-line indented code, only the 4-column code indent is consumed + // here so any additional padding remains in the code content. if !setext_marker && !first_line_empty && spaces_after_marker > 1 { - emit_indent_char_list(p, 0); + let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { + INDENT_CODE_BLOCK_SPACES + } else { + 0 + }; + emit_indent_char_list(p, max_columns); } else { // Empty first line or no content indent -- emit empty MdIndentTokenList let empty_m = p.start(); @@ -1149,11 +1161,18 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI }); // Post-marker space - emit_list_post_marker_space(p); + emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES); - // Content indent + // Content indent. + // For first-line indented code, only the 4-column code indent is consumed + // here so any additional padding remains in the code content. 
if !first_line_empty && spaces_after_marker > 1 { - emit_indent_char_list(p, 0); + let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { + INDENT_CODE_BLOCK_SPACES + } else { + 0 + }; + emit_indent_char_list(p, max_columns); } else { let empty_m = p.start(); empty_m.complete(p, MD_INDENT_TOKEN_LIST); diff --git a/crates/biome_markdown_parser/src/syntax/quote.rs b/crates/biome_markdown_parser/src/syntax/quote.rs index 20e3153eeb56..53696bb8d2b8 100644 --- a/crates/biome_markdown_parser/src/syntax/quote.rs +++ b/crates/biome_markdown_parser/src/syntax/quote.rs @@ -300,7 +300,10 @@ fn emit_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool { // When preserve_tab is true (e.g. indented code in quote), the tab still // semantically counts as the optional post-marker separator, but remains // in the stream so the child block can claim it as indentation. - if !preserve_tab { + if !preserve_tab + || !quote_tab_has_following_indent(p) + || quote_tab_starts_nested_prefix(p) + { p.bump_remap(MD_QUOTE_POST_MARKER_SPACE); } true @@ -558,6 +561,23 @@ pub(crate) fn at_quote_indented_code_start(p: &MarkdownParser) -> bool { column >= INDENT_CODE_BLOCK_SPACES } +fn quote_tab_starts_nested_prefix(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + p.bump(MD_TEXTUAL_LITERAL); + p.at(T![>]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">") + }) +} + +fn quote_tab_has_following_indent(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + p.bump(MD_TEXTUAL_LITERAL); + p.source_after_current() + .chars() + .next() + .is_some_and(|c| c == ' ' || c == '\t') + }) +} + fn parse_quote_indented_code_block(p: &mut MarkdownParser, depth: usize) -> ParsedSyntax { let m = p.start(); let content = p.start(); @@ -641,7 +661,8 @@ pub(crate) fn emit_optional_marker_space(p: &mut MarkdownParser, preserve_tab: b return true; } if text == "\t" { - if !preserve_tab { + if !preserve_tab || !quote_tab_has_following_indent(p) || quote_tab_starts_nested_prefix(p) + 
{ p.bump_remap(MD_QUOTE_POST_MARKER_SPACE); } return true; diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md new file mode 100644 index 000000000000..61d4abed1d4e --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md @@ -0,0 +1 @@ +> > foo diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md.snap new file mode 100644 index 000000000000..54a3d82ebf88 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/block_quote_tab_separated.md.snap @@ -0,0 +1,81 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- + +## Input + +``` +> > foo + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdQuote { + prefix: MdQuotePrefix { + pre_marker_indent: MdQuoteIndentList [], + marker_token: R_ANGLE@0..1 ">" [] [], + post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@1..2 "\t" [] [], + }, + content: MdBlockList [ + MdQuote { + prefix: MdQuotePrefix { + pre_marker_indent: MdQuoteIndentList [], + marker_token: R_ANGLE@2..3 ">" [] [], + post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@3..4 "\t" [] [], + }, + content: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..7 "foo" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + }, + ], + }, + ], + eof_token: EOF@8..8 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..8 + 0: (empty) + 1: MD_BLOCK_LIST@0..8 + 0: MD_QUOTE@0..8 + 0: MD_QUOTE_PREFIX@0..2 + 0: MD_QUOTE_INDENT_LIST@0..0 + 1: R_ANGLE@0..1 ">" [] [] + 2: MD_QUOTE_POST_MARKER_SPACE@1..2 "\t" [] [] + 1: MD_BLOCK_LIST@2..8 + 0: MD_QUOTE@2..8 + 
0: MD_QUOTE_PREFIX@2..4 + 0: MD_QUOTE_INDENT_LIST@2..2 + 1: R_ANGLE@2..3 ">" [] [] + 2: MD_QUOTE_POST_MARKER_SPACE@3..4 "\t" [] [] + 1: MD_BLOCK_LIST@4..8 + 0: MD_PARAGRAPH@4..8 + 0: MD_INLINE_ITEM_LIST@4..8 + 0: MD_TEXTUAL@4..7 + 0: MD_TEXTUAL_LITERAL@4..7 "foo" [] [] + 1: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "\n" [] [] + 1: (empty) + 2: EOF@8..8 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md new file mode 100644 index 000000000000..b73ab1009419 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md @@ -0,0 +1 @@ +- foo diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md.snap new file mode 100644 index 000000000000..564ea121f6c0 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_space_tab_space.md.snap @@ -0,0 +1,85 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- + +## Input + +``` +- foo + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdBulletListItem { + md_bullet_list: MdBulletList [ + MdBullet { + prefix: MdListMarkerPrefix { + pre_marker_indent: MdIndentTokenList [], + marker: MINUS@0..1 "-" [] [], + post_marker_space_token: MD_LIST_POST_MARKER_SPACE@1..2 " " [] [], + content_indent: MdIndentTokenList [ + MdIndentToken { + md_indent_char_token: MD_INDENT_CHAR@2..3 "\t" [] [], + }, + MdIndentToken { + md_indent_char_token: MD_INDENT_CHAR@3..4 " " [] [], + }, + ], + }, + content: MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@4..7 "foo" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@7..8 "\n" [] [], + }, + ], + hard_line: missing 
(optional), + }, + ], + }, + ], + }, + ], + eof_token: EOF@8..8 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..8 + 0: (empty) + 1: MD_BLOCK_LIST@0..8 + 0: MD_BULLET_LIST_ITEM@0..8 + 0: MD_BULLET_LIST@0..8 + 0: MD_BULLET@0..8 + 0: MD_LIST_MARKER_PREFIX@0..4 + 0: MD_INDENT_TOKEN_LIST@0..0 + 1: MINUS@0..1 "-" [] [] + 2: MD_LIST_POST_MARKER_SPACE@1..2 " " [] [] + 3: MD_INDENT_TOKEN_LIST@2..4 + 0: MD_INDENT_TOKEN@2..3 + 0: MD_INDENT_CHAR@2..3 "\t" [] [] + 1: MD_INDENT_TOKEN@3..4 + 0: MD_INDENT_CHAR@3..4 " " [] [] + 1: MD_BLOCK_LIST@4..8 + 0: MD_PARAGRAPH@4..8 + 0: MD_INLINE_ITEM_LIST@4..8 + 0: MD_TEXTUAL@4..7 + 0: MD_TEXTUAL_LITERAL@4..7 "foo" [] [] + 1: MD_TEXTUAL@7..8 + 0: MD_TEXTUAL_LITERAL@7..8 "\n" [] [] + 1: (empty) + 2: EOF@8..8 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md new file mode 100644 index 000000000000..03891e4b2230 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md @@ -0,0 +1 @@ +- foo diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md.snap new file mode 100644 index 000000000000..e6264feee1c6 --- /dev/null +++ b/crates/biome_markdown_parser/tests/md_test_suite/ok/bullet_list_tab_separated.md.snap @@ -0,0 +1,74 @@ +--- +source: crates/biome_markdown_parser/tests/spec_test.rs +expression: snapshot +--- + +## Input + +``` +- foo + +``` + + +## AST + +``` +MdDocument { + bom_token: missing (optional), + value: MdBlockList [ + MdBulletListItem { + md_bullet_list: MdBulletList [ + MdBullet { + prefix: MdListMarkerPrefix { + pre_marker_indent: MdIndentTokenList [], + marker: MINUS@0..1 "-" [] [], + post_marker_space_token: MD_LIST_POST_MARKER_SPACE@1..2 "\t" [] [], + content_indent: MdIndentTokenList [], + }, + content: 
MdBlockList [ + MdParagraph { + list: MdInlineItemList [ + MdTextual { + value_token: MD_TEXTUAL_LITERAL@2..5 "foo" [] [], + }, + MdTextual { + value_token: MD_TEXTUAL_LITERAL@5..6 "\n" [] [], + }, + ], + hard_line: missing (optional), + }, + ], + }, + ], + }, + ], + eof_token: EOF@6..6 "" [] [], +} +``` + +## CST + +``` +0: MD_DOCUMENT@0..6 + 0: (empty) + 1: MD_BLOCK_LIST@0..6 + 0: MD_BULLET_LIST_ITEM@0..6 + 0: MD_BULLET_LIST@0..6 + 0: MD_BULLET@0..6 + 0: MD_LIST_MARKER_PREFIX@0..2 + 0: MD_INDENT_TOKEN_LIST@0..0 + 1: MINUS@0..1 "-" [] [] + 2: MD_LIST_POST_MARKER_SPACE@1..2 "\t" [] [] + 3: MD_INDENT_TOKEN_LIST@2..2 + 1: MD_BLOCK_LIST@2..6 + 0: MD_PARAGRAPH@2..6 + 0: MD_INLINE_ITEM_LIST@2..6 + 0: MD_TEXTUAL@2..5 + 0: MD_TEXTUAL_LITERAL@2..5 "foo" [] [] + 1: MD_TEXTUAL@5..6 + 0: MD_TEXTUAL_LITERAL@5..6 "\n" [] [] + 1: (empty) + 2: EOF@6..6 "" [] [] + +``` diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs index 233a48fc2d4b..ce943c2cd7b4 100644 --- a/crates/biome_markdown_parser/tests/spec_test.rs +++ b/crates/biome_markdown_parser/tests/spec_test.rs @@ -275,6 +275,11 @@ pub fn quick_test() { "> code\n> ---\n", "
\n\n", ); + test_example( + 99923, + ">\t>\tfoo\n", + "\ncode\n
\n
\n\n", + ); test_example( 9993, "- foo\n - bar\n", @@ -291,6 +296,8 @@ pub fn quick_test() { " - foo\n - bar\n\t - baz\n", "\n\nfoo
\n