From 24baa1eae6cd632c0a433caffab68f66014bee50 Mon Sep 17 00:00:00 2001 From: jfmcdowell <206422+jfmcdowell@users.noreply.github.com> Date: Mon, 13 Apr 2026 10:45:44 -0400 Subject: [PATCH 1/2] fix(markdown_parser): handle tab-separated container markers --- crates/biome_markdown_parser/src/lexer/mod.rs | 133 +++++++++++++++++- .../biome_markdown_parser/src/syntax/list.rs | 37 +++-- .../biome_markdown_parser/src/syntax/quote.rs | 25 +++- .../biome_markdown_parser/tests/spec_test.rs | 18 ++- 4 files changed, 193 insertions(+), 20 deletions(-) diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs index c932558fea91..cb38242334d3 100644 --- a/crates/biome_markdown_parser/src/lexer/mod.rs +++ b/crates/biome_markdown_parser/src/lexer/mod.rs @@ -15,6 +15,8 @@ use biome_unicode_table::lookup_byte; use crate::syntax::{MAX_BLOCK_PREFIX_INDENT, TAB_STOP_SPACES}; +const MAX_ORDERED_LIST_MARKER_DIGITS: usize = 9; + /// Lexer context for different markdown parsing modes #[derive(Debug, Copy, Clone, Eq, PartialEq, Default)] pub enum MarkdownLexContext { @@ -260,14 +262,19 @@ impl<'src> MarkdownLexer<'src> { // In link definition context, whitespace separates tokens. // We consume it as textual literal so it's not treated as trivia by the parser. self.consume_link_definition_whitespace() - } else if self.after_newline && matches!(current, b' ' | b'\t') { + } else if self.after_newline && is_space_or_tab_byte(current) { // At line start, emit single whitespace tokens to allow // indentation handling and quote marker spacing. self.consume_single_whitespace_as_text() - } else if matches!(current, b' ' | b'\t') && self.is_after_block_quote_marker() { + } else if is_space_or_tab_byte(current) && self.is_after_block_quote_marker() { // After a block quote marker, emit a single whitespace token // so the parser can skip the optional space. 
self.consume_single_whitespace_as_text() + } else if is_space_or_tab_byte(current) && self.is_in_list_marker_whitespace() { + // While consuming the leading whitespace after a list marker, + // emit one space/tab per token so the parser can distinguish + // the optional post-marker separator from content indent. + self.consume_single_whitespace_as_text() } else if current == b' ' && !matches!(context, MarkdownLexContext::HeadingContent) && self.is_potential_hard_line_break() @@ -683,6 +690,118 @@ impl<'src> MarkdownLexer<'src> { saw_marker } + /// Returns true if the current whitespace is part of the leading + /// space/tab run immediately following a top-level list marker. + fn is_in_list_marker_whitespace(&self) -> bool { + let bytes = self.source.as_bytes(); + let Some(&current) = bytes.get(self.position) else { + return false; + }; + if !is_space_or_tab_byte(current) { + return false; + } + + let before = &self.source[..self.position]; + let last_newline_pos = before.rfind(['\n', '\r']); + let line_start = match last_newline_pos { + Some(pos) => { + let before_bytes = before.as_bytes(); + if before_bytes.get(pos) == Some(&b'\r') + && before_bytes.get(pos + 1) == Some(&b'\n') + { + pos + 2 + } else { + pos + 1 + } + } + None => 0, + }; + + let prefix = &bytes[line_start..self.position]; + let mut idx = 0usize; + let mut indent = 0usize; + + while prefix.get(idx).copied().is_some_and(is_space_or_tab_byte) { + if prefix[idx] == b'\t' { + indent += TAB_STOP_SPACES - (indent % TAB_STOP_SPACES); + } else { + indent += 1; + } + if indent > MAX_BLOCK_PREFIX_INDENT { + return false; + } + idx += 1; + } + + if idx >= prefix.len() { + return false; + } + + match lookup_byte(prefix[idx]) { + MIN | MUL | PLS => { + idx += 1; + } + ZER | DIG => { + let digit_start = idx; + while prefix.get(idx).copied().is_some_and(is_ascii_digit_byte) { + idx += 1; + if idx - digit_start > MAX_ORDERED_LIST_MARKER_DIGITS { + return false; + } + } + + let Some(delimiter) = prefix.get(idx).copied() 
else { + return false; + }; + if !matches!(lookup_byte(delimiter), PRD | PNC) { + return false; + } + idx += 1; + } + _ => return false, + } + + let trailing = &prefix[idx..]; + if trailing.is_empty() { + let mut saw_tab = current == b'\t'; + let mut next = self.position + 1; + while bytes.get(next).copied().is_some_and(is_space_or_tab_byte) { + if bytes[next] == b'\t' { + saw_tab = true; + } + next += 1; + } + + if !saw_tab { + return false; + } + + if current == b'\t' { + return !bytes + .get(self.position + 1) + .copied() + .is_some_and(is_space_or_tab_byte); + } + + return true; + } + + if !trailing.iter().copied().all(is_space_or_tab_byte) || trailing[0] != b' ' { + return false; + } + + let mut saw_tab = current == b'\t' || trailing.contains(&b'\t'); + let mut next = self.position + 1; + while bytes.get(next).copied().is_some_and(is_space_or_tab_byte) { + if bytes[next] == b'\t' { + saw_tab = true; + } + next += 1; + } + + saw_tab + } + /// Consumes thematic break, setext underline, or emphasis markers (*, -, _). /// /// For `-` at line start: @@ -1243,6 +1362,16 @@ impl<'src> MarkdownLexer<'src> { } } +#[inline] +fn is_space_or_tab_byte(byte: u8) -> bool { + matches!(lookup_byte(byte), WHS) && !matches!(byte, b'\n' | b'\r') +} + +#[inline] +fn is_ascii_digit_byte(byte: u8) -> bool { + matches!(lookup_byte(byte), ZER | DIG) +} + impl<'src> ReLexer<'src> for MarkdownLexer<'src> { fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind { let old_position = self.position; diff --git a/crates/biome_markdown_parser/src/syntax/list.rs b/crates/biome_markdown_parser/src/syntax/list.rs index a04175cb1755..dd3ff198210c 100644 --- a/crates/biome_markdown_parser/src/syntax/list.rs +++ b/crates/biome_markdown_parser/src/syntax/list.rs @@ -191,15 +191,20 @@ fn emit_indent_char_list(p: &mut MarkdownParser, max_columns: usize) -> usize { } /// Consume the first whitespace token after the list marker as MD_LIST_POST_MARKER_SPACE. 
-/// Returns true if a space was consumed. -fn emit_list_post_marker_space(p: &mut MarkdownParser) -> bool { +/// Returns true if a space/tab separator was recognized. +fn emit_list_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool { if !p.at(MD_TEXTUAL_LITERAL) { return false; } let text = p.cur_text(); - if text == " " || text == "\t" { + if text == " " { p.bump_remap(MD_LIST_POST_MARKER_SPACE); true + } else if text == "\t" { + if !preserve_tab { + p.bump_remap(MD_LIST_POST_MARKER_SPACE); + } + true } else { false } @@ -834,12 +839,19 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) { // Post-marker space (first whitespace token after marker) if !setext_marker { - emit_list_post_marker_space(p); + emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES); } - // Content indent (remaining whitespace tokens on first line) + // Content indent (remaining whitespace tokens on first line). + // For first-line indented code, only the 4-column code indent is consumed + // here so any additional padding remains in the code content. if !setext_marker && !first_line_empty && spaces_after_marker > 1 { - emit_indent_char_list(p, 0); + let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { + INDENT_CODE_BLOCK_SPACES + } else { + 0 + }; + emit_indent_char_list(p, max_columns); } else { // Empty first line or no content indent -- emit empty MdIndentTokenList let empty_m = p.start(); @@ -1149,11 +1161,18 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI }); // Post-marker space - emit_list_post_marker_space(p); + emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES); - // Content indent + // Content indent. + // For first-line indented code, only the 4-column code indent is consumed + // here so any additional padding remains in the code content. 
if !first_line_empty && spaces_after_marker > 1 { - emit_indent_char_list(p, 0); + let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES { + INDENT_CODE_BLOCK_SPACES + } else { + 0 + }; + emit_indent_char_list(p, max_columns); } else { let empty_m = p.start(); empty_m.complete(p, MD_INDENT_TOKEN_LIST); diff --git a/crates/biome_markdown_parser/src/syntax/quote.rs b/crates/biome_markdown_parser/src/syntax/quote.rs index 20e3153eeb56..53696bb8d2b8 100644 --- a/crates/biome_markdown_parser/src/syntax/quote.rs +++ b/crates/biome_markdown_parser/src/syntax/quote.rs @@ -300,7 +300,10 @@ fn emit_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool { // When preserve_tab is true (e.g. indented code in quote), the tab still // semantically counts as the optional post-marker separator, but remains // in the stream so the child block can claim it as indentation. - if !preserve_tab { + if !preserve_tab + || !quote_tab_has_following_indent(p) + || quote_tab_starts_nested_prefix(p) + { p.bump_remap(MD_QUOTE_POST_MARKER_SPACE); } true @@ -558,6 +561,23 @@ pub(crate) fn at_quote_indented_code_start(p: &MarkdownParser) -> bool { column >= INDENT_CODE_BLOCK_SPACES } +fn quote_tab_starts_nested_prefix(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + p.bump(MD_TEXTUAL_LITERAL); + p.at(T![>]) || (p.at(MD_TEXTUAL_LITERAL) && p.cur_text() == ">") + }) +} + +fn quote_tab_has_following_indent(p: &mut MarkdownParser) -> bool { + p.lookahead(|p| { + p.bump(MD_TEXTUAL_LITERAL); + p.source_after_current() + .chars() + .next() + .is_some_and(|c| c == ' ' || c == '\t') + }) +} + fn parse_quote_indented_code_block(p: &mut MarkdownParser, depth: usize) -> ParsedSyntax { let m = p.start(); let content = p.start(); @@ -641,7 +661,8 @@ pub(crate) fn emit_optional_marker_space(p: &mut MarkdownParser, preserve_tab: b return true; } if text == "\t" { - if !preserve_tab { + if !preserve_tab || !quote_tab_has_following_indent(p) || quote_tab_starts_nested_prefix(p) + 
{ p.bump_remap(MD_QUOTE_POST_MARKER_SPACE); } return true; diff --git a/crates/biome_markdown_parser/tests/spec_test.rs b/crates/biome_markdown_parser/tests/spec_test.rs index 233a48fc2d4b..ce943c2cd7b4 100644 --- a/crates/biome_markdown_parser/tests/spec_test.rs +++ b/crates/biome_markdown_parser/tests/spec_test.rs @@ -275,6 +275,11 @@ pub fn quick_test() { "> code\n> ---\n", "
\n\n", ); + test_example( + 99923, + ">\t>\tfoo\n", + "\ncode\n
\n
\n\n", + ); test_example( 9993, "- foo\n - bar\n", @@ -291,6 +296,8 @@ pub fn quick_test() { " - foo\n - bar\n\t - baz\n", "\n\nfoo
\n