diff --git a/.changeset/fix-astro-regex-frontmatter.md b/.changeset/fix-astro-regex-frontmatter.md new file mode 100644 index 000000000000..eed0916adeb7 --- /dev/null +++ b/.changeset/fix-astro-regex-frontmatter.md @@ -0,0 +1,5 @@ +--- +"@biomejs/biome": patch +--- + +Fixed [#9187](https://github.com/biomejs/biome/issues/9187): Astro frontmatter containing regex literals with quotes (`/'/`, `/"/`) or dashes (`/---/`) no longer causes parse errors. diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs index 73aa766dc36e..b7af4a76391c 100644 --- a/crates/biome_html_parser/src/lexer/mod.rs +++ b/crates/biome_html_parser/src/lexer/mod.rs @@ -1555,9 +1555,10 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { } } -/// Tracks whether the lexer is currently inside an open string literal while -/// scanning Astro frontmatter. Used to determine whether a `---` sequence is -/// a genuine closing fence or merely three dashes that appear inside a string. +/// Tracks whether the lexer is currently inside an open string literal, regex +/// literal, or comment while scanning Astro frontmatter. Used to determine +/// whether a `---` sequence is a genuine closing fence or merely three dashes +/// that appear inside a string or regex. /// /// ## Design /// @@ -1568,6 +1569,11 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { /// already open; it closes the string only when it **matches** the opening /// quote. For example, a `'` inside a `"…"` string is treated as a literal /// character, not as a new string opener. +/// - The **regex flag** (`in_regex`): set when a `/` is encountered in a +/// position where it starts a regex literal (determined by the previous +/// non-whitespace byte). While set, all bytes are consumed until an +/// unescaped `/` closes the regex. Quotes and dashes inside a regex are +/// not treated as string delimiters or fence markers. /// - The **comment state** (`comment`): distinguishes single-line (`//`) from /// multi-line (`/* … */`) comments, so that quote characters inside comments /// are not counted as string delimiters. @@ -1578,6 +1584,8 @@ impl<'src> LexerWithCheckpoint<'src> for HtmlLexer<'src> { struct QuotesSeen { /// The quote character that opened the current string, if any. current_quote: Option, + /// Whether we are currently inside a regex literal (`/…/`). + in_regex: bool, /// Current comment state. comment: QuotesSeenComment, /// Whether the previous byte was an unescaped backslash. @@ -1585,6 +1593,8 @@ struct QuotesSeen { /// The previous byte, needed to detect `//` and `/* */` comment markers /// and the `*/` block-comment terminator. prev_byte: Option, + /// The previous non-whitespace byte, used for the regex-start heuristic. + prev_non_ws_byte: Option, } /// Distinguishes the kind of comment the lexer is currently inside. @@ -1602,9 +1612,11 @@ impl QuotesSeen { fn new() -> Self { Self { current_quote: None, + in_regex: false, comment: QuotesSeenComment::None, escaped: false, prev_byte: None, + prev_non_ws_byte: None, } } @@ -1617,6 +1629,9 @@ impl QuotesSeen { self.comment = QuotesSeenComment::None; } self.prev_byte = Some(byte); + if !byte.is_ascii_whitespace() { + self.prev_non_ws_byte = Some(byte); + } // Quotes inside comments are ignored. return; } @@ -1624,6 +1639,11 @@ impl QuotesSeen { // Multi-line comment ends at `*/`. if self.prev_byte == Some(b'*') && byte == b'/' { self.comment = QuotesSeenComment::None; + // Use a neutral prev_byte so the closing `/` of `*/` is + // not mistaken for a potential regex or comment opener. + self.prev_byte = None; + self.prev_non_ws_byte = Some(b'/'); + return; } self.prev_byte = Some(byte); // Quotes inside comments are ignored. @@ -1632,11 +1652,31 @@ impl QuotesSeen { QuotesSeenComment::None => {} } + // Inside a regex literal: consume bytes until an unescaped `/` closes it. + if self.in_regex { + if byte == b'\\' { + self.escaped = !self.escaped; + self.prev_byte = Some(byte); + } else if byte == b'/' && !self.escaped { + self.in_regex = false; + self.escaped = false; + // Use a neutral prev_byte so the closing `/` of the regex is + // not mistaken for a deferred slash (comment/regex opener). + self.prev_byte = None; + self.prev_non_ws_byte = Some(b'/'); + } else { + self.escaped = false; + self.prev_byte = Some(byte); + } + return; + } + // Handle escape sequences: a `\` that is not itself escaped toggles the // escape flag for the next character. if byte == b'\\' { self.escaped = !self.escaped; self.prev_byte = Some(byte); + self.prev_non_ws_byte = Some(byte); return; } @@ -1647,24 +1687,60 @@ impl QuotesSeen { if was_escaped { self.prev_byte = Some(byte); + if !byte.is_ascii_whitespace() { + self.prev_non_ws_byte = Some(byte); + } return; } - // Detect comment openers — only valid outside of open strings. + // Detect comment openers and regex literals — only valid outside of open strings. + if self.current_quote.is_none() && byte == b'/' { + // Check if the previous byte was also `/` → single-line comment. + if self.prev_byte == Some(b'/') { + self.comment = QuotesSeenComment::SingleLine; + self.prev_byte = Some(byte); + // Don't update prev_non_ws_byte — it was already preserved + // when we deferred the first `/`. + return; + } + + // The `/` might start a comment (if followed by `/` or `*`), a + // regex literal, or be a division operator. We defer the decision: + // store it as prev_byte and decide on the *next* byte. + // Crucially, do NOT update prev_non_ws_byte here — we need to + // preserve the byte before the `/` for the regex heuristic. + self.prev_byte = Some(byte); + return; + } + + // If the *previous* byte was `/` (outside a string), decide now whether + // it was a comment opener, a regex opener, or plain division. if self.current_quote.is_none() && self.prev_byte == Some(b'/') { - match byte { - b'/' => { - self.comment = QuotesSeenComment::SingleLine; - self.prev_byte = Some(byte); - return; + if byte == b'*' { + self.comment = QuotesSeenComment::MultiLine; + self.prev_byte = Some(byte); + self.prev_non_ws_byte = Some(byte); + return; + } + + // Not `//` or `/*`, so the previous `/` was either a regex opener + // or a division operator. Use the previous non-whitespace byte + // before the `/` to decide. + if self.slash_starts_regex() { + // The `/` opened a regex. The current byte is the first byte + // inside the regex body. + self.in_regex = true; + if byte == b'\\' { + self.escaped = true; } - b'*' => { - self.comment = QuotesSeenComment::MultiLine; - self.prev_byte = Some(byte); - return; + self.prev_byte = Some(byte); + if !byte.is_ascii_whitespace() { + self.prev_non_ws_byte = Some(byte); } - _ => {} + return; } + // It was division; update prev_non_ws_byte to `/` now. + self.prev_non_ws_byte = Some(b'/'); } // Track string delimiters. @@ -1689,13 +1765,42 @@ impl QuotesSeen { } self.prev_byte = Some(byte); + if !byte.is_ascii_whitespace() { + self.prev_non_ws_byte = Some(byte); + } } - /// Returns `true` when the tracker is not currently inside an open string literal - /// or a comment. Both states must be absent for a `---` fence to be a valid - /// frontmatter closing delimiter. + /// Returns whether a deferred `/` starts a regex literal based on + /// `prev_non_ws_byte`. After an identifier character, closing + /// paren/bracket, number, or `++`/`--` suffix, `/` is division. In all + /// other positions `/` starts a regex. + fn slash_starts_regex(&self) -> bool { + match self.prev_non_ws_byte { + None => true, + Some(b) => !matches!( + b, + b'a'..=b'z' + | b'A'..=b'Z' + | b'0'..=b'9' + | b'_' + | b'$' + | b')' + | b']' + | b'+' + | b'-' + ), + } + } + + /// Returns `true` when the tracker is not currently inside an open string + /// literal, regex literal, or comment, and there is no pending deferred + /// slash that might open a regex. All conditions must be absent for a + /// `---` fence to be a valid frontmatter closing delimiter. fn is_empty(&self) -> bool { - self.current_quote.is_none() && self.comment == QuotesSeenComment::None + self.current_quote.is_none() + && !self.in_regex + && self.comment == QuotesSeenComment::None + && self.prev_byte != Some(b'/') } } @@ -1952,4 +2057,45 @@ const f = "something" "#; "double backslash followed by closing quote must close the string" ); } + + // --- Tests for issue #9187: regex literals in frontmatter --- + + /// A regex literal containing a single quote must not leave the tracker in a + /// non-empty state. The quote inside the regex is not a string delimiter. + #[test] + fn issue_9187_regex_with_single_quote() { + let source = "const test = /'/\n"; + let mut quotes_seen = QuotesSeen::new(); + track(source, &mut quotes_seen); + assert!( + quotes_seen.is_empty(), + "regex literal containing single quote must not open a string" + ); + } + + /// A regex literal containing a double quote must not leave the tracker in a + /// non-empty state. + #[test] + fn issue_9187_regex_with_double_quote() { + let source = "const test = /\"/\n"; + let mut quotes_seen = QuotesSeen::new(); + track(source, &mut quotes_seen); + assert!( + quotes_seen.is_empty(), + "regex literal containing double quote must not open a string" + ); + } + + /// A regex literal containing `---` must not cause the tracker to misidentify + /// the fence. The tracker must remain empty after the regex closes. + #[test] + fn issue_9187_regex_with_dashes() { + let source = "const test = /---/\n"; + let mut quotes_seen = QuotesSeen::new(); + track(source, &mut quotes_seen); + assert!( + quotes_seen.is_empty(), + "regex literal containing dashes must not confuse the tracker" + ); + } } diff --git a/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro b/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro new file mode 100644 index 000000000000..8dd481ecec49 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro @@ -0,0 +1,7 @@ +--- +const test1 = /'/ +const test2 = /"/ +const test3 = /---/ +--- + +
diff --git a/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro.snap b/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro.snap new file mode 100644 index 000000000000..9d5c26477c14 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/astro/issue_9187.astro.snap @@ -0,0 +1,86 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- + +## Input + +```astro +--- +const test1 = /'/ +const test2 = /"/ +const test3 = /---/ +--- + +
+ +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + frontmatter: AstroFrontmatterElement { + l_fence_token: FENCE@0..3 "---" [] [], + content: AstroEmbeddedContent { + content_token: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] [], + }, + r_fence_token: FENCE@60..63 "---" [] [], + }, + directive: missing (optional), + html: HtmlElementList [ + HtmlElement { + opening_element: HtmlOpeningElement { + l_angle_token: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] [], + name: HtmlTagName { + value_token: HTML_LITERAL@66..69 "div" [] [], + }, + attributes: HtmlAttributeList [], + r_angle_token: R_ANGLE@69..70 ">" [] [], + }, + children: HtmlElementList [], + closing_element: HtmlClosingElement { + l_angle_token: L_ANGLE@70..71 "<" [] [], + slash_token: SLASH@71..72 "/" [] [], + name: HtmlTagName { + value_token: HTML_LITERAL@72..75 "div" [] [], + }, + r_angle_token: R_ANGLE@75..76 ">" [] [], + }, + }, + ], + eof_token: EOF@76..77 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..77 + 0: (empty) + 1: ASTRO_FRONTMATTER_ELEMENT@0..63 + 0: FENCE@0..3 "---" [] [] + 1: ASTRO_EMBEDDED_CONTENT@3..60 + 0: HTML_LITERAL@3..60 "const test1 = /'/\nconst test2 = /\"/\nconst test3 = /---/\n" [Newline("\n")] [] + 2: FENCE@60..63 "---" [] [] + 2: (empty) + 3: HTML_ELEMENT_LIST@63..76 + 0: HTML_ELEMENT@63..76 + 0: HTML_OPENING_ELEMENT@63..70 + 0: L_ANGLE@63..66 "<" [Newline("\n"), Newline("\n")] [] + 1: HTML_TAG_NAME@66..69 + 0: HTML_LITERAL@66..69 "div" [] [] + 2: HTML_ATTRIBUTE_LIST@69..69 + 3: R_ANGLE@69..70 ">" [] [] + 1: HTML_ELEMENT_LIST@70..70 + 2: HTML_CLOSING_ELEMENT@70..76 + 0: L_ANGLE@70..71 "<" [] [] + 1: SLASH@71..72 "/" [] [] + 2: HTML_TAG_NAME@72..75 + 0: HTML_LITERAL@72..75 "div" [] [] + 3: R_ANGLE@75..76 ">" [] [] + 4: EOF@76..77 "" [Newline("\n")] [] + +```