From f5e66b55e97ce3dcd79cc681788b9e90becf1c61 Mon Sep 17 00:00:00 2001 From: Denis Bezrukov <6227442+denbezrukov@users.noreply.github.com> Date: Wed, 19 Jul 2023 11:22:13 +0300 Subject: [PATCH] feat(rome_css_parser): CSS lexer number and ident #4682 --- crates/rome_css_parser/src/lexer/mod.rs | 359 +++++++++++++++++-- crates/rome_css_parser/src/lexer/tests.rs | 181 ++++++++++ crates/rome_css_syntax/src/generated/kind.rs | 9 +- xtask/codegen/src/css_kinds_src.rs | 2 + 4 files changed, 522 insertions(+), 29 deletions(-) diff --git a/crates/rome_css_parser/src/lexer/mod.rs b/crates/rome_css_parser/src/lexer/mod.rs index 9a67f65acc8..55b070050bf 100644 --- a/crates/rome_css_parser/src/lexer/mod.rs +++ b/crates/rome_css_parser/src/lexer/mod.rs @@ -5,7 +5,7 @@ mod tests; use rome_css_syntax::{CssSyntaxKind, CssSyntaxKind::*, TextLen, TextRange, TextSize, T}; -use rome_js_unicode_table::{lookup_byte, Dispatch::*}; +use rome_js_unicode_table::{is_id_continue, is_id_start, lookup_byte, Dispatch::*}; use rome_parser::diagnostic::ParseDiagnostic; use std::char::REPLACEMENT_CHARACTER; use std::iter::FusedIterator; @@ -99,7 +99,7 @@ impl<'src> Lexer<'src> { /// ## Safety /// Must be called at a valid UT8 char boundary fn consume_newline(&mut self) -> bool { - self.assert_at_char_boundary(); + self.assert_current_char_boundary(); match self.current_byte() { Some(b'\n') => { @@ -124,7 +124,7 @@ impl<'src> Lexer<'src> { /// ## Safety /// Must be called at a valid UT8 char boundary fn consume_whitespaces(&mut self) { - self.assert_at_char_boundary(); + self.assert_current_char_boundary(); while let Some(byte) = self.current_byte() { let dispatch = lookup_byte(byte); @@ -172,24 +172,7 @@ impl<'src> Lexer<'src> { /// ## Safety /// Must be called at a valid UT8 char boundary fn current_char_unchecked(&self) -> char { - // Precautionary measure for making sure the unsafe code below does not read over memory boundary - debug_assert!(!self.is_eof()); - self.assert_at_char_boundary(); - - 
// Safety: We know this is safe because we require the input to the lexer to be valid utf8 and we always call this when we are at a char - let string = unsafe { - std::str::from_utf8_unchecked(self.source.as_bytes().get_unchecked(self.position..)) - }; - let chr = if let Some(chr) = string.chars().next() { - chr - } else { - // Safety: we always call this when we are at a valid char, so this branch is completely unreachable - unsafe { - core::hint::unreachable_unchecked(); - } - }; - - chr + self.char_unchecked_at(0) } /// Gets the current byte. @@ -205,9 +188,9 @@ impl<'src> Lexer<'src> { } } - /// Asserts that the lexer is at a UTF8 char boundary + /// Asserts that the lexer is at current a UTF8 char boundary #[inline] - fn assert_at_char_boundary(&self) { + fn assert_current_char_boundary(&self) { debug_assert!(self.source.is_char_boundary(self.position)); } @@ -217,12 +200,63 @@ impl<'src> Lexer<'src> { self.byte_at(1) } + /// Peek the UTF8 char which starts at the current byte + /// + /// ## Safety + /// Must be called at a valid UT8 char boundary + fn peek_char_unchecked(&self) -> char { + self.char_unchecked_at(1) + } + /// Returns the byte at position `self.position + offset` or `None` if it is out of bounds. 
#[inline] fn byte_at(&self, offset: usize) -> Option<u8> { self.source.as_bytes().get(self.position + offset).copied() } + /// Asserts that the lexer is at a UTF8 char boundary + #[inline] + fn assert_at_char_boundary(&self, offset: usize) { + debug_assert!(self.source.is_char_boundary(self.position + offset)); + } + + /// Get the UTF8 char which starts at the current byte + /// + /// ## Safety + /// Must be called at a valid UT8 char boundary + fn char_unchecked_at(&self, offset: usize) -> char { + // Precautionary measure for making sure the unsafe code below does not read over memory boundary + debug_assert!(!self.is_eof()); + self.assert_at_char_boundary(offset); + + // Safety: We know this is safe because we require the input to the lexer to be valid utf8 and we always call this when we are at a char + let string = unsafe { + std::str::from_utf8_unchecked( + self.source + .as_bytes() + .get_unchecked((self.position + offset)..), + ) + }; + let chr = if let Some(chr) = string.chars().next() { + chr + } else { + // Safety: we always call this when we are at a valid char, so this branch is completely unreachable + unsafe { + core::hint::unreachable_unchecked(); + } + }; + + chr + } + + /// Check if the lexer is at a valid escape. U+005C REVERSE SOLIDUS (\) + fn is_valid_escape_at(&self, offset: usize) -> bool { + match self.byte_at(offset) { + Some(b'\n' | b'\r') | None => false, + Some(_) => true, + } + } + /// Advances the current position by `n` bytes. #[inline] fn advance(&mut self, n: usize) { @@ -269,7 +303,70 @@ impl<'src> Lexer<'src> { QOT => self.lex_string_literal(current), SLH => self.lex_slash(), - PRD => self.eat_byte(T![.]), + DIG => self.lex_number(current), + + MIN => { + if self.is_number_start() { + return self.lex_number(current); + } + + // GREATER-THAN SIGN (->), consume them and return a CDC. 
+ if self.peek_byte() == Some(b'-') { + if self.byte_at(2) == Some(b'>') { + self.advance(3); + return CDC; + } + + // --custom-property + if self.is_ident_start() { + self.advance(2); + self.lex_identifier(); + + return CSS_CUSTOM_PROPERTY; + } + } + + // -identifier + if self.is_ident_start() { + self.advance(1); + return self.lex_identifier(); + } + + self.eat_byte(T![-]) + } + + PLS => { + if self.is_number_start() { + self.lex_number(current) + } else { + self.eat_byte(T![+]) + } + } + + PRD => { + if self.is_number_start() { + self.lex_number(current) + } else { + self.eat_byte(T![.]) + } + } + + LSS => { + // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D + // HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), consume them and return a CDO. + if self.peek_byte() == Some(b'!') + && self.byte_at(2) == Some(b'-') + && self.byte_at(3) == Some(b'-') + { + self.advance(4); + return CDO; + } + + self.eat_byte(T![<]) + } + + IDT | UNI | BSL if self.is_ident_start() => self.lex_identifier(), + MUL => self.eat_byte(T![*]), COL => self.eat_byte(T![:]), AT_ => self.eat_byte(T![@]), @@ -280,13 +377,14 @@ impl<'src> Lexer<'src> { BEC => self.eat_byte(T!['}']), BTO => self.eat_byte(T!('[')), BTC => self.eat_byte(T![']']), + COM => self.eat_byte(T![,]), _ => self.eat_unexpected_character(), } } fn lex_string_literal(&mut self, quote: u8) -> CssSyntaxKind { - self.assert_at_char_boundary(); + self.assert_current_char_boundary(); let start = self.text_position(); self.advance(1); // Skip over the quote @@ -331,7 +429,13 @@ impl<'src> Lexer<'src> { // Note that this means 1-6 hex digits have been consumed in total. 
for _ in 0..5 { let Some(digit) = self.current_byte() - .and_then(|c| (c as char).to_digit(16)) else { break; }; + .and_then(|c| { + if c.is_ascii_hexdigit() { + (c as char).to_digit(16) + } else { + None + } + }) else { break; }; self.advance(1); hex = hex * 16 + digit; @@ -377,6 +481,7 @@ impl<'src> Lexer<'src> { return ERROR_TOKEN; } + // we don't need to handle IDT because it's always len 1. UNI => self.advance_char_unchecked(), _ => self.advance(1), @@ -400,6 +505,136 @@ impl<'src> Lexer<'src> { } } + /// Lexes a CSS number literal + fn lex_number(&mut self, current: u8) -> CssSyntaxKind { + debug_assert!(self.is_number_start()); + + if matches!(current, b'+' | b'-') { + self.advance(1); + } + + // While the next input code point is a digit, consume it. + self.consume_number(); + + // If the next 2 input code points are U+002E FULL STOP (.) followed by a digit... + if matches!(self.current_byte(), Some(b'.')) + && self.peek_byte().map_or(false, |byte| byte.is_ascii_digit()) + { + // Consume them. + self.advance(2); + + // While the next input code point is a digit, consume it. + self.consume_number() + } + + // If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or + // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS + // (-) or U+002B PLUS SIGN (+), followed by a digit, then: + if matches!(self.current_byte(), Some(b'e') | Some(b'E')) { + match (self.peek_byte(), self.byte_at(2)) { + (Some(b'-') | Some(b'+'), Some(byte)) if byte.is_ascii_digit() => { + // Consume them. + self.advance(3); + + // While the next input code point is a digit, consume it. + self.consume_number() + } + (Some(byte), _) if byte.is_ascii_digit() => { + // Consume them. + self.advance(2); + + // While the next input code point is a digit, consume it. + self.consume_number() + } + _ => {} + } + } + + CSS_NUMBER_LITERAL + } + + fn consume_number(&mut self) { + // While the next input code point is a digit, consume it. 
+ while let Some(b'0'..=b'9') = self.current_byte() { + self.advance(1); + } + } + + fn lex_identifier(&mut self) -> CssSyntaxKind { + // Note to keep the buffer large enough to fit every possible keyword that + // the lexer can return + let mut buf = [0u8; 20]; + let count = self.consume_ident_sequence(&mut buf); + + match &buf[..count] { + b"media" => MEDIA_KW, + b"keyframes" => KEYFRAMES_KW, + b"not" => NOT_KW, + b"and" => AND_KW, + b"only" => ONLY_KW, + b"or" => OR_KW, + b"i" => I_KW, + b"important" => IMPORTANT_KW, + b"from" => FROM_KW, + b"to" => TO_KW, + b"var" => VAR_KW, + _ => IDENT, + } + } + + /// Consume a ident sequence. + fn consume_ident_sequence(&mut self, buf: &mut [u8]) -> usize { + debug_assert!(self.is_ident_start()); + + let mut idx = 0; + let mut is_first = true; + // Repeatedly consume the next input code point from the stream. + loop { + let Some(current) = self.current_byte() else { + break; + }; + + let dispatched = lookup_byte(current); + + let chr = match dispatched { + // name code point + UNI | IDT => { + // SAFETY: We know that the current byte is a valid unicode code point + let chr = self.current_char_unchecked(); + let is_id = if is_first { + is_first = false; + is_id_start(chr) + } else { + is_id_continue(chr) + }; + + if is_id { + chr + } else { + break; + } + } + // SAFETY: We know that the current byte is a number and we can use cast. + DIG | ZER if !is_first => current as char, + // U+005C REVERSE SOLIDUS (\) + // If the first and second code points are a valid escape, continue consume. + // Otherwise, break. + BSL if self.is_valid_escape_at(1) => '\\', + _ => break, + }; + + let len = chr.len_utf8(); + self.advance(len); + + if let Some(buf) = buf.get_mut(idx..idx + 4) { + let res = chr.encode_utf8(buf); + idx += res.len(); + } + } + + idx + } + /// Lexes a comment. 
fn lex_slash(&mut self) -> CssSyntaxKind { let start = self.text_position(); @@ -462,7 +697,7 @@ impl<'src> Lexer<'src> { #[inline] fn eat_unexpected_character(&mut self) -> CssSyntaxKind { - self.assert_at_char_boundary(); + self.assert_current_char_boundary(); let char = self.current_char_unchecked(); let err = ParseDiagnostic::new( @@ -474,6 +709,76 @@ impl<'src> Lexer<'src> { ERROR_TOKEN } + + /// Check if the lexer starts a number. + fn is_number_start(&self) -> bool { + match self.current_byte() { + Some(b'+') | Some(b'-') => match self.peek_byte() { + // If the second code point is a digit, return true. + Some(byte) if byte.is_ascii_digit() => true, + // Otherwise, if the second code point is a U+002E FULL STOP (.) and the + // third code point is a digit, return true. + Some(b'.') if self.byte_at(2).map_or(false, |byte| byte.is_ascii_digit()) => true, + _ => false, + }, + Some(b'.') => match self.peek_byte() { + // If the second code point is a digit, return true. + Some(byte) if byte.is_ascii_digit() => true, + _ => false, + }, + Some(byte) => byte.is_ascii_digit(), + _ => false, + } + } + + /// Check if the lexer starts an identifier. + fn is_ident_start(&self) -> bool { + let Some(current) = self.current_byte() else { + return false; + }; + + // Look at the first code point: + match lookup_byte(current) { + // U+002D HYPHEN-MINUS + MIN => { + let Some(next) = self.peek_byte() else { + return false; + }; + + match lookup_byte(next) { + MIN => { + let Some(next) = self.byte_at(2) else { + return false; + }; + + match lookup_byte(next) { + // If the third code point is a name-start code point + // return true. + UNI | IDT if is_id_start(self.char_unchecked_at(2)) => true, + // or the third and fourth code points are a valid escape + // return true. + BSL => self.is_valid_escape_at(3), + _ => false, + } + } + // If the second code point is a name-start code point + // return true. 
+ UNI | IDT if is_id_start(self.peek_char_unchecked()) => true, + // or the second and third code points are a valid escape + // return true. + BSL => self.is_valid_escape_at(2), + _ => false, + } + } + UNI | IDT if is_id_start(self.current_char_unchecked()) => true, + // U+005C REVERSE SOLIDUS (\) + // If the first and second code points are a valid escape, return true. Otherwise, + // return false. + BSL => self.is_valid_escape_at(1), + + _ => false, + } + } } impl Iterator for Lexer<'_> { diff --git a/crates/rome_css_parser/src/lexer/tests.rs b/crates/rome_css_parser/src/lexer/tests.rs index 4bc08842f78..8a43a02a277 100644 --- a/crates/rome_css_parser/src/lexer/tests.rs +++ b/crates/rome_css_parser/src/lexer/tests.rs @@ -192,6 +192,187 @@ fn string() { } } +#[test] +fn number() { + assert_lex! { + "5098382", + CSS_NUMBER_LITERAL:7, + EOF:0 + } + + assert_lex! { + "509.382", + CSS_NUMBER_LITERAL:7, + EOF:0 + } + + assert_lex! { + ".382", + CSS_NUMBER_LITERAL:4, + EOF:0 + } + + assert_lex! { + "+123", + CSS_NUMBER_LITERAL:4, + EOF:0 + } + + assert_lex! { + "-123", + CSS_NUMBER_LITERAL:4, + EOF:0 + } + + assert_lex! { + "+123", + CSS_NUMBER_LITERAL:4, + EOF:0 + } + + assert_lex! { + "123e10", + CSS_NUMBER_LITERAL:6, + EOF:0 + } + + assert_lex! { + "123e+10", + CSS_NUMBER_LITERAL:7, + EOF:0 + } + + assert_lex! { + "123e-10", + CSS_NUMBER_LITERAL:7, + EOF:0 + } + + assert_lex! { + "123E10", + CSS_NUMBER_LITERAL:6, + EOF:0 + } + + assert_lex! { + "123E+10", + CSS_NUMBER_LITERAL:7, + EOF:0 + } + + assert_lex! { + "123E-10", + CSS_NUMBER_LITERAL:7, + EOF:0 + } +} + +#[test] +fn cdo_and_cdc() { + assert_lex! { + "<!-- -->", + CDO:4, + WHITESPACE:1, + CDC:3 + EOF:0 + } +} + +#[test] +fn dimension() { + assert_lex! { + "100vh", + CSS_NUMBER_LITERAL:3, + IDENT:2, + EOF:0 + } +} + +#[test] +fn keywords() { + assert_lex! 
{ + "media keyframes important from", + MEDIA_KW:5, + WHITESPACE:1, + KEYFRAMES_KW:9, + WHITESPACE:1, + IMPORTANT_KW:9, + WHITESPACE:1, + FROM_KW:4, + EOF:0 + } +} + +#[test] +fn identifier() { + assert_lex! { + "--", + MINUS:1, + MINUS:1, + EOF:0 + } + + assert_lex! { + "i4f5g7", + IDENT:6, + EOF:0 + } + + assert_lex! { + "class", + IDENT:5, + EOF:0 + } + + assert_lex! { + r#"cl\aass"#, + IDENT:7, + EOF:0 + } + + assert_lex! { + r#"\ccl\aass"#, + IDENT:9, + EOF:0 + } + + assert_lex! { + "-class", + IDENT:6, + EOF:0 + } + + assert_lex! { + r#"-cl\aass"#, + IDENT:8, + EOF:0 + } + + assert_lex! { + r#"-\acl\aass"#, + IDENT:10, + EOF:0 + } + + assert_lex! { + "--property", + CSS_CUSTOM_PROPERTY:10, + EOF:0 + } + + assert_lex! { + r#"--prop\eerty"#, + CSS_CUSTOM_PROPERTY:12, + EOF:0 + } + + assert_lex! { + r#"--\pprop\eerty"#, + CSS_CUSTOM_PROPERTY:14, + EOF:0 + } +} + #[test] fn single_line_comments() { assert_lex! { diff --git a/crates/rome_css_syntax/src/generated/kind.rs b/crates/rome_css_syntax/src/generated/kind.rs index ca29377b187..437d21fd95c 100644 --- a/crates/rome_css_syntax/src/generated/kind.rs +++ b/crates/rome_css_syntax/src/generated/kind.rs @@ -47,6 +47,8 @@ pub enum CssSyntaxKind { AT, DOLLAR_EQ, TILDE_EQ, + CDC, + CDO, ALICEBLUE_KW, ANTIQUEWHITE_KW, AQUA_KW, @@ -271,7 +273,8 @@ impl CssSyntaxKind { SEMICOLON | COMMA | L_PAREN | R_PAREN | L_CURLY | R_CURLY | L_BRACK | R_BRACK | L_ANGLE | R_ANGLE | TILDE | HASH | AMP | PIPE | PLUS | STAR | SLASH | CARET | PERCENT | DOT | COLON | EQ | BANG | NEQ | MINUS | LTEQ | GTEQ | PLUSEQ | PIPEEQ - | AMPEQ | CARETEQ | SLASHEQ | STAREQ | PERCENTEQ | AT | DOLLAR_EQ | TILDE_EQ => true, + | AMPEQ | CARETEQ | SLASHEQ | STAREQ | PERCENTEQ | AT | DOLLAR_EQ | TILDE_EQ | CDC + | CDO => true, _ => false, } } @@ -493,6 +496,8 @@ impl CssSyntaxKind { AT => "@", DOLLAR_EQ => "$=", TILDE_EQ => "~=", + CDC => "-->", + CDO => "] => { $ crate :: CssSyntaxKind :: CDC } ; [", "CDC"), + ("