Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 131 additions & 2 deletions crates/biome_markdown_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ use biome_unicode_table::lookup_byte;

use crate::syntax::{MAX_BLOCK_PREFIX_INDENT, TAB_STOP_SPACES};

const MAX_ORDERED_LIST_MARKER_DIGITS: usize = 9;

/// Lexer context for different markdown parsing modes
#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
pub enum MarkdownLexContext {
Expand Down Expand Up @@ -260,14 +262,19 @@ impl<'src> MarkdownLexer<'src> {
// In link definition context, whitespace separates tokens.
// We consume it as textual literal so it's not treated as trivia by the parser.
self.consume_link_definition_whitespace()
} else if self.after_newline && matches!(current, b' ' | b'\t') {
} else if self.after_newline && is_space_or_tab_byte(current) {
// At line start, emit single whitespace tokens to allow
// indentation handling and quote marker spacing.
self.consume_single_whitespace_as_text()
} else if matches!(current, b' ' | b'\t') && self.is_after_block_quote_marker() {
} else if is_space_or_tab_byte(current) && self.is_after_block_quote_marker() {
// After a block quote marker, emit a single whitespace token
// so the parser can skip the optional space.
self.consume_single_whitespace_as_text()
} else if is_space_or_tab_byte(current) && self.is_in_list_marker_whitespace() {
// While consuming the leading whitespace after a list marker,
// emit one space/tab per token so the parser can distinguish
// the optional post-marker separator from content indent.
self.consume_single_whitespace_as_text()
} else if current == b' '
&& !matches!(context, MarkdownLexContext::HeadingContent)
&& self.is_potential_hard_line_break()
Expand Down Expand Up @@ -683,6 +690,118 @@ impl<'src> MarkdownLexer<'src> {
saw_marker
}

/// Returns true if the current whitespace is part of the leading
/// space/tab run immediately following a top-level list marker.
fn is_in_list_marker_whitespace(&self) -> bool {
let bytes = self.source.as_bytes();
let Some(&current) = bytes.get(self.position) else {
return false;
};
if !is_space_or_tab_byte(current) {
return false;
}

let before = &self.source[..self.position];
let last_newline_pos = before.rfind(['\n', '\r']);
let line_start = match last_newline_pos {
Some(pos) => {
let before_bytes = before.as_bytes();
if before_bytes.get(pos) == Some(&b'\r')
&& before_bytes.get(pos + 1) == Some(&b'\n')
{
pos + 2
} else {
pos + 1
}
}
None => 0,
};

let prefix = &bytes[line_start..self.position];
let mut idx = 0usize;
let mut indent = 0usize;

while prefix.get(idx).copied().is_some_and(is_space_or_tab_byte) {
if prefix[idx] == b'\t' {
indent += TAB_STOP_SPACES - (indent % TAB_STOP_SPACES);
} else {
indent += 1;
}
if indent > MAX_BLOCK_PREFIX_INDENT {
return false;
}
idx += 1;
}

if idx >= prefix.len() {
return false;
}

match lookup_byte(prefix[idx]) {
MIN | MUL | PLS => {
idx += 1;
}
ZER | DIG => {
let digit_start = idx;
while prefix.get(idx).copied().is_some_and(is_ascii_digit_byte) {
idx += 1;
if idx - digit_start > MAX_ORDERED_LIST_MARKER_DIGITS {
return false;
}
}

let Some(delimiter) = prefix.get(idx).copied() else {
return false;
};
if !matches!(lookup_byte(delimiter), PRD | PNC) {
return false;
}
idx += 1;
}
_ => return false,
}

let trailing = &prefix[idx..];
if trailing.is_empty() {
let mut saw_tab = current == b'\t';
let mut next = self.position + 1;
while bytes.get(next).copied().is_some_and(is_space_or_tab_byte) {
if bytes[next] == b'\t' {
saw_tab = true;
}
next += 1;
}

if !saw_tab {
return false;
}

if current == b'\t' {
return !bytes
.get(self.position + 1)
.copied()
.is_some_and(is_space_or_tab_byte);
}

return true;
}

if !trailing.iter().copied().all(is_space_or_tab_byte) || trailing[0] != b' ' {
return false;
}

let mut saw_tab = current == b'\t' || trailing.contains(&b'\t');
let mut next = self.position + 1;
while bytes.get(next).copied().is_some_and(is_space_or_tab_byte) {
if bytes[next] == b'\t' {
saw_tab = true;
}
next += 1;
}

saw_tab
}

/// Consumes thematic break, setext underline, or emphasis markers (*, -, _).
///
/// For `-` at line start:
Expand Down Expand Up @@ -1243,6 +1362,16 @@ impl<'src> MarkdownLexer<'src> {
}
}

/// Returns true only for an ASCII space (0x20) or a horizontal tab (0x09).
///
/// The byte is matched directly instead of through the dispatch table so that
/// other whitespace-like control bytes (vertical tab, form feed) and the line
/// breaks `\n` / `\r` are never treated as indentation.
#[inline]
fn is_space_or_tab_byte(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t')
}

/// Returns true when the dispatch table classifies `byte` as a digit
/// (either the `ZER` or the `DIG` class).
#[inline]
fn is_ascii_digit_byte(byte: u8) -> bool {
    let class = lookup_byte(byte);
    matches!(class, ZER | DIG)
}

impl<'src> ReLexer<'src> for MarkdownLexer<'src> {
fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind {
let old_position = self.position;
Expand Down
37 changes: 28 additions & 9 deletions crates/biome_markdown_parser/src/syntax/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,20 @@ fn emit_indent_char_list(p: &mut MarkdownParser, max_columns: usize) -> usize {
}

/// Consume the first whitespace token after the list marker as MD_LIST_POST_MARKER_SPACE.
/// Returns true if a space was consumed.
fn emit_list_post_marker_space(p: &mut MarkdownParser) -> bool {
/// Returns true if a space/tab separator was recognized.
fn emit_list_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool {
if !p.at(MD_TEXTUAL_LITERAL) {
return false;
}
let text = p.cur_text();
if text == " " || text == "\t" {
if text == " " {
p.bump_remap(MD_LIST_POST_MARKER_SPACE);
true
} else if text == "\t" {
if !preserve_tab {
p.bump_remap(MD_LIST_POST_MARKER_SPACE);
}
true
} else {
false
}
Expand Down Expand Up @@ -834,12 +839,19 @@ fn parse_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankInfo) {

// Post-marker space (first whitespace token after marker)
if !setext_marker {
emit_list_post_marker_space(p);
emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES);
}

// Content indent (remaining whitespace tokens on first line)
// Content indent (remaining whitespace tokens on first line).
// For first-line indented code, only the 4-column code indent is consumed
// here so any additional padding remains in the code content.
if !setext_marker && !first_line_empty && spaces_after_marker > 1 {
emit_indent_char_list(p, 0);
let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES {
INDENT_CODE_BLOCK_SPACES
} else {
0
};
emit_indent_char_list(p, max_columns);
} else {
// Empty first line or no content indent -- emit empty MdIndentTokenList
let empty_m = p.start();
Expand Down Expand Up @@ -1149,11 +1161,18 @@ fn parse_ordered_bullet(p: &mut MarkdownParser) -> (ParsedSyntax, ListItemBlankI
});

// Post-marker space
emit_list_post_marker_space(p);
emit_list_post_marker_space(p, spaces_after_marker > INDENT_CODE_BLOCK_SPACES);

// Content indent
// Content indent.
// For first-line indented code, only the 4-column code indent is consumed
// here so any additional padding remains in the code content.
if !first_line_empty && spaces_after_marker > 1 {
emit_indent_char_list(p, 0);
let max_columns = if spaces_after_marker > INDENT_CODE_BLOCK_SPACES {
INDENT_CODE_BLOCK_SPACES
} else {
0
};
emit_indent_char_list(p, max_columns);
} else {
let empty_m = p.start();
empty_m.complete(p, MD_INDENT_TOKEN_LIST);
Expand Down
25 changes: 23 additions & 2 deletions crates/biome_markdown_parser/src/syntax/quote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,10 @@ fn emit_post_marker_space(p: &mut MarkdownParser, preserve_tab: bool) -> bool {
// When preserve_tab is true (e.g. indented code in quote), the tab still
// semantically counts as the optional post-marker separator, but remains
// in the stream so the child block can claim it as indentation.
if !preserve_tab {
if !preserve_tab
|| !quote_tab_has_following_indent(p)
|| quote_tab_starts_nested_prefix(p)
{
p.bump_remap(MD_QUOTE_POST_MARKER_SPACE);
}
true
Expand Down Expand Up @@ -558,6 +561,23 @@ pub(crate) fn at_quote_indented_code_start(p: &MarkdownParser) -> bool {
column >= INDENT_CODE_BLOCK_SPACES
}

/// Looks past the current `MD_TEXTUAL_LITERAL` token and reports whether the
/// token after it is a `>`, either as the `>` punctuation token or as a
/// textual literal whose text is `">"`.
///
/// Runs inside `p.lookahead`; the current token must be an
/// `MD_TEXTUAL_LITERAL`, because it is bumped unconditionally.
fn quote_tab_starts_nested_prefix(p: &mut MarkdownParser) -> bool {
    p.lookahead(|ahead| {
        ahead.bump(MD_TEXTUAL_LITERAL);
        if ahead.at(T![>]) {
            return true;
        }
        ahead.at(MD_TEXTUAL_LITERAL) && ahead.cur_text() == ">"
    })
}

/// Looks past the current `MD_TEXTUAL_LITERAL` token and reports whether the
/// first character of `source_after_current()` is a space or a tab.
///
/// Runs inside `p.lookahead`; the current token must be an
/// `MD_TEXTUAL_LITERAL`, because it is bumped unconditionally.
fn quote_tab_has_following_indent(p: &mut MarkdownParser) -> bool {
    p.lookahead(|ahead| {
        ahead.bump(MD_TEXTUAL_LITERAL);
        let first_char = ahead.source_after_current().chars().next();
        matches!(first_char, Some(' ' | '\t'))
    })
}

fn parse_quote_indented_code_block(p: &mut MarkdownParser, depth: usize) -> ParsedSyntax {
let m = p.start();
let content = p.start();
Expand Down Expand Up @@ -641,7 +661,8 @@ pub(crate) fn emit_optional_marker_space(p: &mut MarkdownParser, preserve_tab: b
return true;
}
if text == "\t" {
if !preserve_tab {
if !preserve_tab || !quote_tab_has_following_indent(p) || quote_tab_starts_nested_prefix(p)
{
p.bump_remap(MD_QUOTE_POST_MARKER_SPACE);
}
return true;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
> > foo
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
expression: snapshot
---

## Input

```
> > foo

```


## AST

```
MdDocument {
bom_token: missing (optional),
value: MdBlockList [
MdQuote {
prefix: MdQuotePrefix {
pre_marker_indent: MdQuoteIndentList [],
marker_token: R_ANGLE@0..1 ">" [] [],
post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@1..2 "\t" [] [],
},
content: MdBlockList [
MdQuote {
prefix: MdQuotePrefix {
pre_marker_indent: MdQuoteIndentList [],
marker_token: R_ANGLE@2..3 ">" [] [],
post_marker_space_token: MD_QUOTE_POST_MARKER_SPACE@3..4 "\t" [] [],
},
content: MdBlockList [
MdParagraph {
list: MdInlineItemList [
MdTextual {
value_token: MD_TEXTUAL_LITERAL@4..7 "foo" [] [],
},
MdTextual {
value_token: MD_TEXTUAL_LITERAL@7..8 "\n" [] [],
},
],
hard_line: missing (optional),
},
],
},
],
},
],
eof_token: EOF@8..8 "" [] [],
}
```

## CST

```
0: MD_DOCUMENT@0..8
0: (empty)
1: MD_BLOCK_LIST@0..8
0: MD_QUOTE@0..8
0: MD_QUOTE_PREFIX@0..2
0: MD_QUOTE_INDENT_LIST@0..0
1: R_ANGLE@0..1 ">" [] []
2: MD_QUOTE_POST_MARKER_SPACE@1..2 "\t" [] []
1: MD_BLOCK_LIST@2..8
0: MD_QUOTE@2..8
0: MD_QUOTE_PREFIX@2..4
0: MD_QUOTE_INDENT_LIST@2..2
1: R_ANGLE@2..3 ">" [] []
2: MD_QUOTE_POST_MARKER_SPACE@3..4 "\t" [] []
1: MD_BLOCK_LIST@4..8
0: MD_PARAGRAPH@4..8
0: MD_INLINE_ITEM_LIST@4..8
0: MD_TEXTUAL@4..7
0: MD_TEXTUAL_LITERAL@4..7 "foo" [] []
1: MD_TEXTUAL@7..8
0: MD_TEXTUAL_LITERAL@7..8 "\n" [] []
1: (empty)
2: EOF@8..8 "" [] []

```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- foo
Loading
Loading