-
-
Notifications
You must be signed in to change notification settings - Fork 928
fix(parser/html): regex literals in frontmatter #9531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| --- | ||
| "@biomejs/biome": patch | ||
| --- | ||
|
|
||
| Fixed [#9187](https://github.com/biomejs/biome/issues/9187): Astro frontmatter containing regex literals with quotes (`/'/`, `/"/`) or dashes (`/---/`) no longer causes parse errors. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1555,9 +1555,10 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { | |
| } | ||
| } | ||
|
|
||
| /// Tracks whether the lexer is currently inside an open string literal while | ||
| /// scanning Astro frontmatter. Used to determine whether a `---` sequence is | ||
| /// a genuine closing fence or merely three dashes that appear inside a string. | ||
| /// Tracks whether the lexer is currently inside an open string literal, regex | ||
| /// literal, or comment while scanning Astro frontmatter. Used to determine | ||
| /// whether a `---` sequence is a genuine closing fence or merely three dashes | ||
| /// that appear inside a string or regex. | ||
| /// | ||
| /// ## Design | ||
| /// | ||
|
|
@@ -1568,6 +1569,11 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { | |
| /// already open; it closes the string only when it **matches** the opening | ||
| /// quote. For example, a `'` inside a `"…"` string is treated as a literal | ||
| /// character, not as a new string opener. | ||
| /// - The **regex flag** (`in_regex`): set when a `/` is encountered in a | ||
| /// position where it starts a regex literal (determined by the previous | ||
| /// non-whitespace byte). While set, all bytes are consumed until an | ||
| /// unescaped `/` closes the regex. Quotes and dashes inside a regex are | ||
| /// not treated as string delimiters or fence markers. | ||
| /// - The **comment state** (`comment`): distinguishes single-line (`//`) from | ||
| /// multi-line (`/* … */`) comments, so that quote characters inside comments | ||
| /// are not counted as string delimiters. | ||
|
|
@@ -1578,13 +1584,17 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { | |
| struct QuotesSeen { | ||
| /// The quote character that opened the current string, if any. | ||
| current_quote: Option<u8>, | ||
| /// Whether we are currently inside a regex literal (`/…/`). | ||
| in_regex: bool, | ||
| /// Current comment state. | ||
| comment: QuotesSeenComment, | ||
| /// Whether the previous byte was an unescaped backslash. | ||
| escaped: bool, | ||
| /// The previous byte, needed to detect `//` and `/* */` comment markers | ||
| /// and the `*/` block-comment terminator. | ||
| prev_byte: Option<u8>, | ||
| /// The previous non-whitespace byte, used for the regex-start heuristic. | ||
| prev_non_ws_byte: Option<u8>, | ||
| } | ||
|
|
||
| /// Distinguishes the kind of comment the lexer is currently inside. | ||
|
|
@@ -1602,9 +1612,11 @@ impl QuotesSeen { | |
| fn new() -> Self { | ||
| Self { | ||
| current_quote: None, | ||
| in_regex: false, | ||
| comment: QuotesSeenComment::None, | ||
| escaped: false, | ||
| prev_byte: None, | ||
| prev_non_ws_byte: None, | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1617,13 +1629,21 @@ impl QuotesSeen { | |
| self.comment = QuotesSeenComment::None; | ||
| } | ||
| self.prev_byte = Some(byte); | ||
| if !byte.is_ascii_whitespace() { | ||
| self.prev_non_ws_byte = Some(byte); | ||
| } | ||
| // Quotes inside comments are ignored. | ||
| return; | ||
| } | ||
| QuotesSeenComment::MultiLine => { | ||
| // Multi-line comment ends at `*/`. | ||
| if self.prev_byte == Some(b'*') && byte == b'/' { | ||
| self.comment = QuotesSeenComment::None; | ||
| // Use a neutral prev_byte so the closing `/` of `*/` is | ||
| // not mistaken for a potential regex or comment opener. | ||
| self.prev_byte = None; | ||
| self.prev_non_ws_byte = Some(b'/'); | ||
| return; | ||
| } | ||
| self.prev_byte = Some(byte); | ||
| // Quotes inside comments are ignored. | ||
|
|
@@ -1632,11 +1652,31 @@ impl QuotesSeen { | |
| QuotesSeenComment::None => {} | ||
| } | ||
|
|
||
| // Inside a regex literal: consume bytes until an unescaped `/` closes it. | ||
| if self.in_regex { | ||
| if byte == b'\\' { | ||
| self.escaped = !self.escaped; | ||
| self.prev_byte = Some(byte); | ||
| } else if byte == b'/' && !self.escaped { | ||
| self.in_regex = false; | ||
| self.escaped = false; | ||
| // Use a neutral prev_byte so the closing `/` of the regex is | ||
| // not mistaken for a deferred slash (comment/regex opener). | ||
| self.prev_byte = None; | ||
| self.prev_non_ws_byte = Some(b'/'); | ||
| } else { | ||
| self.escaped = false; | ||
| self.prev_byte = Some(byte); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| // Handle escape sequences: a `\` that is not itself escaped toggles the | ||
| // escape flag for the next character. | ||
| if byte == b'\\' { | ||
| self.escaped = !self.escaped; | ||
| self.prev_byte = Some(byte); | ||
| self.prev_non_ws_byte = Some(byte); | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -1647,24 +1687,60 @@ impl QuotesSeen { | |
|
|
||
| if was_escaped { | ||
| self.prev_byte = Some(byte); | ||
| if !byte.is_ascii_whitespace() { | ||
| self.prev_non_ws_byte = Some(byte); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| // Detect comment openers — only valid outside of open strings. | ||
| // Detect comment openers and regex literals — only valid outside of open strings. | ||
| if self.current_quote.is_none() && byte == b'/' { | ||
| // Check if the previous byte was also `/` → single-line comment. | ||
| if self.prev_byte == Some(b'/') { | ||
| self.comment = QuotesSeenComment::SingleLine; | ||
| self.prev_byte = Some(byte); | ||
| // Don't update prev_non_ws_byte — it was already preserved | ||
| // when we deferred the first `/`. | ||
| return; | ||
| } | ||
|
|
||
| // The `/` might start a comment (if followed by `/` or `*`), a | ||
| // regex literal, or be a division operator. We defer the decision: | ||
| // store it as prev_byte and decide on the *next* byte. | ||
| // Crucially, do NOT update prev_non_ws_byte here — we need to | ||
| // preserve the byte before the `/` for the regex heuristic. | ||
| self.prev_byte = Some(byte); | ||
| return; | ||
| } | ||
|
|
||
| // If the *previous* byte was `/` (outside a string), decide now whether | ||
| // it was a comment opener, a regex opener, or plain division. | ||
| if self.current_quote.is_none() && self.prev_byte == Some(b'/') { | ||
| match byte { | ||
| b'/' => { | ||
| self.comment = QuotesSeenComment::SingleLine; | ||
| self.prev_byte = Some(byte); | ||
| return; | ||
| if byte == b'*' { | ||
| self.comment = QuotesSeenComment::MultiLine; | ||
| self.prev_byte = Some(byte); | ||
| self.prev_non_ws_byte = Some(byte); | ||
| return; | ||
| } | ||
|
|
||
| // Not `//` or `/*`, so the previous `/` was either a regex opener | ||
| // or a division operator. Use the previous non-whitespace byte | ||
| // before the `/` to decide. | ||
| if self.slash_starts_regex() { | ||
| // The `/` opened a regex. The current byte is the first byte | ||
| // inside the regex body. | ||
| self.in_regex = true; | ||
| if byte == b'\\' { | ||
| self.escaped = true; | ||
| } | ||
| b'*' => { | ||
| self.comment = QuotesSeenComment::MultiLine; | ||
| self.prev_byte = Some(byte); | ||
| return; | ||
| self.prev_byte = Some(byte); | ||
| if !byte.is_ascii_whitespace() { | ||
| self.prev_non_ws_byte = Some(byte); | ||
| } | ||
| _ => {} | ||
| return; | ||
| } | ||
| // It was division; update prev_non_ws_byte to `/` now. | ||
| self.prev_non_ws_byte = Some(b'/'); | ||
| } | ||
|
|
||
| // Track string delimiters. | ||
|
|
@@ -1689,13 +1765,42 @@ impl QuotesSeen { | |
| } | ||
|
|
||
| self.prev_byte = Some(byte); | ||
| if !byte.is_ascii_whitespace() { | ||
| self.prev_non_ws_byte = Some(byte); | ||
| } | ||
| } | ||
|
|
||
| /// Returns `true` when the tracker is not currently inside an open string literal | ||
| /// or a comment. Both states must be absent for a `---` fence to be a valid | ||
| /// frontmatter closing delimiter. | ||
| /// Returns whether a deferred `/` starts a regex literal based on | ||
| /// `prev_non_ws_byte`. After an identifier character, closing | ||
| /// paren/bracket, number, or `++`/`--` suffix, `/` is division. In all | ||
| /// other positions `/` starts a regex. | ||
| fn slash_starts_regex(&self) -> bool { | ||
| match self.prev_non_ws_byte { | ||
| None => true, | ||
| Some(b) => !matches!( | ||
| b, | ||
| b'a'..=b'z' | ||
| | b'A'..=b'Z' | ||
| | b'0'..=b'9' | ||
| | b'_' | ||
| | b'$' | ||
| | b')' | ||
| | b']' | ||
| | b'+' | ||
| | b'-' | ||
| ), | ||
| } | ||
| } | ||
|
Comment on lines
+1773
to
+1793
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🌐 Web query:
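💡 Result: Yes. ECMAScript avoids ambiguity by using different lexical goal symbols: in syntactic positions where a regex literal is permitted, the scanner uses the `InputElementRegExp` goal, so a `/` begins a regular expression literal. By contrast, when the grammar expects an operator continuing an expression (e.g., after an identifier or literal), the `InputElementDiv` goal applies and `/` is tokenized as a division punctuator. One-byte look-behind cannot distinguish valid regex starters in expression contexts (for example, a `/` following a keyword such as `return`, whose last byte looks like an identifier character).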
🤖 Prompt for AI Agents |
||
|
|
||
| /// Returns `true` when the tracker is not currently inside an open string | ||
| /// literal, regex literal, or comment, and there is no pending deferred | ||
| /// slash that might open a regex. All conditions must be absent for a | ||
| /// `---` fence to be a valid frontmatter closing delimiter. | ||
| fn is_empty(&self) -> bool { | ||
| self.current_quote.is_none() && self.comment == QuotesSeenComment::None | ||
| self.current_quote.is_none() | ||
| && !self.in_regex | ||
| && self.comment == QuotesSeenComment::None | ||
| && self.prev_byte != Some(b'/') | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1952,4 +2057,45 @@ const f = "something" "#; | |
| "double backslash followed by closing quote must close the string" | ||
| ); | ||
| } | ||
|
|
||
| // --- Tests for issue #9187: regex literals in frontmatter --- | ||
|
|
||
| /// A regex literal containing a single quote must not leave the tracker in a | ||
| /// non-empty state. The quote inside the regex is not a string delimiter. | ||
| #[test] | ||
| fn issue_9187_regex_with_single_quote() { | ||
| let source = "const test = /'/\n"; | ||
| let mut quotes_seen = QuotesSeen::new(); | ||
| track(source, &mut quotes_seen); | ||
| assert!( | ||
| quotes_seen.is_empty(), | ||
| "regex literal containing single quote must not open a string" | ||
| ); | ||
| } | ||
|
|
||
| /// A regex literal containing a double quote must not leave the tracker in a | ||
| /// non-empty state. | ||
| #[test] | ||
| fn issue_9187_regex_with_double_quote() { | ||
| let source = "const test = /\"/\n"; | ||
| let mut quotes_seen = QuotesSeen::new(); | ||
| track(source, &mut quotes_seen); | ||
| assert!( | ||
| quotes_seen.is_empty(), | ||
| "regex literal containing double quote must not open a string" | ||
| ); | ||
| } | ||
|
|
||
| /// A regex literal containing `---` must not cause the tracker to misidentify | ||
| /// the fence. The tracker must remain empty after the regex closes. | ||
| #[test] | ||
| fn issue_9187_regex_with_dashes() { | ||
| let source = "const test = /---/\n"; | ||
| let mut quotes_seen = QuotesSeen::new(); | ||
| track(source, &mut quotes_seen); | ||
| assert!( | ||
| quotes_seen.is_empty(), | ||
| "regex literal containing dashes must not confuse the tracker" | ||
| ); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| --- | ||
| const test1 = /'/ | ||
| const test2 = /"/ | ||
| const test3 = /---/ | ||
| --- | ||
|
|
||
| <div></div> |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| --- | ||
| source: crates/biome_html_parser/tests/spec_test.rs | ||
| expression: snapshot | ||
| --- | ||
|
|
||
| ## Input | ||
|
|
||
| ```astro | ||
| --- | ||
| const test1 = /'/ | ||
| const test2 = /"/ | ||
| const test3 = /---/ | ||
| --- | ||
|
|
||
| <div></div> | ||
|
|
||
| ``` | ||
|
|
||
|
|
||
| ## AST | ||
|
|
||
| ``` | ||
| HtmlRoot { | ||
| bom_token: missing (optional), | ||
| frontmatter: AstroFrontmatterElement { | ||
| l_fence_token: FENCE@0..3 "---" [] [], | ||
| content: AstroEmbeddedContent { | ||
| content_token: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] [], | ||
| }, | ||
| r_fence_token: FENCE@60..63 "---" [] [], | ||
| }, | ||
| directive: missing (optional), | ||
| html: HtmlElementList [ | ||
| HtmlElement { | ||
| opening_element: HtmlOpeningElement { | ||
| l_angle_token: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] [], | ||
| name: HtmlTagName { | ||
| value_token: HTML_LITERAL@66..69 "div" [] [], | ||
| }, | ||
| attributes: HtmlAttributeList [], | ||
| r_angle_token: R_ANGLE@69..70 ">" [] [], | ||
| }, | ||
| children: HtmlElementList [], | ||
| closing_element: HtmlClosingElement { | ||
| l_angle_token: L_ANGLE@70..71 "<" [] [], | ||
| slash_token: SLASH@71..72 "/" [] [], | ||
| name: HtmlTagName { | ||
| value_token: HTML_LITERAL@72..75 "div" [] [], | ||
| }, | ||
| r_angle_token: R_ANGLE@75..76 ">" [] [], | ||
| }, | ||
| }, | ||
| ], | ||
| eof_token: EOF@76..77 "" [Newline("\n")] [], | ||
| } | ||
| ``` | ||
|
|
||
| ## CST | ||
|
|
||
| ``` | ||
| 0: HTML_ROOT@0..77 | ||
| 0: (empty) | ||
| 1: ASTRO_FRONTMATTER_ELEMENT@0..63 | ||
| 0: FENCE@0..3 "---" [] [] | ||
| 1: ASTRO_EMBEDDED_CONTENT@3..60 | ||
| 0: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] [] | ||
| 2: FENCE@60..63 "---" [] [] | ||
| 2: (empty) | ||
| 3: HTML_ELEMENT_LIST@63..76 | ||
| 0: HTML_ELEMENT@63..76 | ||
| 0: HTML_OPENING_ELEMENT@63..70 | ||
| 0: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] [] | ||
| 1: HTML_TAG_NAME@66..69 | ||
| 0: HTML_LITERAL@66..69 "div" [] [] | ||
| 2: HTML_ATTRIBUTE_LIST@69..69 | ||
| 3: R_ANGLE@69..70 ">" [] [] | ||
| 1: HTML_ELEMENT_LIST@70..70 | ||
| 2: HTML_CLOSING_ELEMENT@70..76 | ||
| 0: L_ANGLE@70..71 "<" [] [] | ||
| 1: SLASH@71..72 "/" [] [] | ||
| 2: HTML_TAG_NAME@72..75 | ||
| 0: HTML_LITERAL@72..75 "div" [] [] | ||
| 3: R_ANGLE@75..76 ">" [] [] | ||
| 4: EOF@76..77 "" [Newline("\n")] [] | ||
|
|
||
| ``` |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🌐 Web query:
In ECMAScript regular expression literals, may an unescaped forward slash appear inside a character class `[...]`, for example `/[/']/`, without terminating the regex literal?
💡 Result:
Yes. In an ECMAScript regular expression literal (`/.../`), an unescaped `/` may appear inside a character class `[...]` without terminating the literal, because the lexical grammar for `RegularExpressionClassChar` excludes `]` and `\` — but does not exclude `/`. Therefore `/[/']/` is tokenized as a single regex literal and its character class matches either `/` or `'`. [1][2]
🏁 Script executed:
Repository: biomejs/biome
Length of output: 5071
🏁 Script executed:
Repository: biomejs/biome
Length of output: 444
🏁 Script executed:
Repository: biomejs/biome
Length of output: 100
🏁 Script executed:
Repository: biomejs/biome
Length of output: 1428
🏁 Script executed:
Repository: biomejs/biome
Length of output: 1314
🏁 Script executed:
Repository: biomejs/biome
Length of output: 908
Character classes inside regex literals are not tracked, causing early termination.
The regex closing logic (lines 1656–1661) terminates on any unescaped `/`, but ECMAScript regex literals permit `/` inside character classes without closing the pattern — `/[/']/` is valid. This causes the lexer to exit regex mode too early, leaving subsequent quotes or dashes to confuse frontmatter fence detection. Tracking character classes (detecting `[`, `]`, and escape sequences within them) would prevent this false termination.
🤖 Prompt for AI Agents
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems important