Skip to content

Commit

Permalink
feat(biome_graphql_parser): implement NthAt for GraphqlLexer (#2677)
Browse files Browse the repository at this point in the history
  • Loading branch information
vohoanglong0107 committed May 2, 2024
1 parent afa5004 commit 1f891f5
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 81 deletions.
28 changes: 25 additions & 3 deletions crates/biome_graphql_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod tests;

use biome_graphql_syntax::{GraphqlSyntaxKind, GraphqlSyntaxKind::*, TextLen, TextSize, T};
use biome_parser::diagnostic::ParseDiagnostic;
use biome_parser::lexer::{Lexer, LexerCheckpoint, TokenFlags};
use biome_parser::lexer::{Lexer, LexerCheckpoint, LexerWithCheckpoint, TokenFlags};
use std::ops::Add;

#[derive(Debug)]
Expand All @@ -15,6 +15,9 @@ pub struct GraphqlLexer<'src> {
/// The start byte position in the source text of the next token.
position: usize,

/// If the source starts with a Unicode BOM, this is the number of bytes for that token.
unicode_bom_length: usize,

/// Byte offset of the current token from the start of the source
/// The range of the current token can be computed by
/// `self.position - self.current_start`.
Expand Down Expand Up @@ -116,6 +119,20 @@ impl<'src> Lexer<'src> for GraphqlLexer<'src> {
}
}

impl<'src> LexerWithCheckpoint<'src> for GraphqlLexer<'src> {
    /// Snapshots the lexer's state so a caller can rewind after speculative lexing.
    ///
    /// Captures the byte position, the current token's start/kind/flags, whether
    /// a line break precedes the current token, the BOM length, and how many
    /// diagnostics have been emitted so far (rewinding truncates back to it).
    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
        // Number of diagnostics recorded up to this point.
        let diagnostics_pos = self.diagnostics.len() as u32;

        LexerCheckpoint {
            after_line_break: self.has_preceding_line_break(),
            current_kind: self.current_kind,
            current_flags: self.current_flags,
            current_start: self.current_start,
            // NOTE(review): `position` is a byte offset; assumes sources never
            // exceed u32::MAX bytes (TextSize is u32-backed) — confirm upstream.
            position: TextSize::from(self.position as u32),
            unicode_bom_length: self.unicode_bom_length,
            diagnostics_pos,
        }
    }
}

impl<'src> GraphqlLexer<'src> {
/// Make a new lexer from a str, this is safe because strs are valid utf8
pub fn from_str(source: &'src str) -> Self {
Expand All @@ -126,6 +143,7 @@ impl<'src> GraphqlLexer<'src> {
current_flags: TokenFlags::empty(),
position: 0,
diagnostics: vec![],
unicode_bom_length: 0,
}
}

Expand Down Expand Up @@ -161,8 +179,12 @@ impl<'src> GraphqlLexer<'src> {
b'#' => self.consume_comment(),
_ if is_name_start(current) => self.consume_name(current),
_ if is_number_start(current) => self.consume_number(current),
_ if self.position == 0 && self.consume_potential_bom(UNICODE_BOM).is_some() => {
UNICODE_BOM
_ if self.position == 0 => {
if let Some((bom, bom_size)) = self.consume_potential_bom(UNICODE_BOM) {
self.unicode_bom_length = bom_size;
return bom;
}
self.consume_unexpected_character()
}
_ => self.consume_unexpected_character(),
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ fn parse_field(p: &mut GraphqlParser) -> ParsedSyntax {

// alias is optional, so if there is a colon, we parse it as an alias
// otherwise we parse it as a normal field name
if p.lookahead_at(T![:]) {
if p.nth_at(1, T![:]) {
let m = p.start();

// name is checked for in `is_at_field`
Expand Down
6 changes: 3 additions & 3 deletions crates/biome_graphql_parser/src/parser/definitions/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ fn parse_root_operation_type_definition(p: &mut GraphqlParser) -> ParsedSyntax {

#[inline]
pub(crate) fn is_at_schema_definition(p: &mut GraphqlParser<'_>) -> bool {
p.at(T![schema]) || (is_at_string(p) && p.lookahead_at(T![schema]))
p.at(T![schema]) || (is_at_string(p) && p.nth_at(1, T![schema]))
}

#[inline]
Expand All @@ -117,7 +117,7 @@ fn is_at_root_operation_type_definition(p: &mut GraphqlParser<'_>) -> bool {
// missing operation type
|| p.at(T![:])
// there is likely a typo in the operation type
|| p.lookahead_at(T![:])
|| p.nth_at(1, T![:])
}

/// To prevent a missing closing brace from causing the parser to include the next definition
Expand All @@ -136,5 +136,5 @@ fn is_at_root_operation_type_definition_end(p: &mut GraphqlParser<'_>) -> bool {
p.at(T!['}'])
|| (!p.at_ts(OPERATION_TYPE) && is_at_definition(p))
// start of a new operation definition
|| (p.at_ts(OPERATION_TYPE) && !p.lookahead_at(T![:]))
|| (p.at_ts(OPERATION_TYPE) && !p.nth_at(1, T![:]))
}
8 changes: 0 additions & 8 deletions crates/biome_graphql_parser/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ impl<'source> GraphqlParser<'source> {
}
}

/// Returns the kind of the next non-trivia token without advancing the parser,
/// by delegating to the underlying token source.
pub fn lookahead(&mut self) -> GraphqlSyntaxKind {
self.source.lookahead()
}

/// Returns `true` if the next non-trivia token is of the given `kind`,
/// without advancing the parser; delegates to the underlying token source.
pub fn lookahead_at(&mut self, kind: GraphqlSyntaxKind) -> bool {
self.source.lookahead_at(kind)
}

pub fn finish(
self,
) -> (
Expand Down
2 changes: 1 addition & 1 deletion crates/biome_graphql_parser/src/parser/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ fn is_at_list(p: &GraphqlParser) -> bool {
fn is_at_list_end(p: &mut GraphqlParser) -> bool {
p.at(T![']'])
// at next argument
|| p.lookahead() == T![:]
|| p.nth_at(1, T![:])
// value is only used in argument
|| is_at_argument_list_end(p)
}
Expand Down
168 changes: 103 additions & 65 deletions crates/biome_graphql_parser/src/token_source.rs
Original file line number Diff line number Diff line change
@@ -1,150 +1,188 @@
use std::collections::VecDeque;

use crate::lexer::GraphqlLexer;
use biome_graphql_syntax::GraphqlSyntaxKind::{EOF, TOMBSTONE};
use biome_graphql_syntax::GraphqlSyntaxKind::EOF;
use biome_graphql_syntax::{GraphqlSyntaxKind, TextRange};
use biome_parser::diagnostic::ParseDiagnostic;
use biome_parser::lexer::Lexer;
use biome_parser::lexer::BufferedLexer;
use biome_parser::prelude::TokenSource;
use biome_parser::token_source::Trivia;
use biome_parser::token_source::{NthToken, Trivia};
use biome_rowan::TriviaPieceKind;

pub(crate) struct GraphqlTokenSource<'source> {
lexer: GraphqlLexer<'source>,
trivia: Vec<Trivia>,
current: NonTriviaToken,
next: Option<NonTriviaToken>,
lexer: BufferedLexer<'source, GraphqlLexer<'source>>,
trivia_list: Vec<Trivia>,

/// Cache for the non-trivia token lookahead. For example for the source `let a = 10;` if the
/// [TokenSource]'s currently positioned at the start of the file (`let`). The `nth(2)` non-trivia token,
/// as returned by the [TokenSource], is the `=` token but retrieving it requires skipping over the
/// two whitespace trivia tokens (first between `let` and `a`, second between `a` and `=`).
/// The [TokenSource] state then is:
///
/// * `non_trivia_lookahead`: [IDENT: 'a', EQ]
/// * `lookahead_offset`: 4 (the `=` is the 4th token after the `let` keyword)
non_trivia_lookahead: VecDeque<Lookahead>,

/// Offset of the last cached lookahead token from the current [BufferedLexer] token.
lookahead_offset: usize,
}

struct NonTriviaToken {
#[derive(Debug, Copy, Clone)]
struct Lookahead {
kind: GraphqlSyntaxKind,
range: TextRange,
preceding_line_break: bool,
after_newline: bool,
}

impl Default for NonTriviaToken {
fn default() -> Self {
impl<'source> GraphqlTokenSource<'source> {
pub(crate) fn new(lexer: BufferedLexer<'source, GraphqlLexer<'source>>) -> Self {
Self {
kind: TOMBSTONE,
range: TextRange::default(),
preceding_line_break: false,
lexer,
trivia_list: Vec::new(),
non_trivia_lookahead: VecDeque::new(),
lookahead_offset: 0,
}
}
}

impl<'source> GraphqlTokenSource<'source> {
pub fn from_str(source: &'source str) -> Self {
let lexer = GraphqlLexer::from_str(source);
let lexer = BufferedLexer::new(lexer);

let mut source = Self {
lexer,
trivia: Vec::new(),
current: NonTriviaToken::default(),
next: None,
};
let mut source = GraphqlTokenSource::new(lexer);

source.advance_to_next_non_trivia_token(true);
source.next_non_trivia_token(true);
source
}

fn advance_to_next_non_trivia_token(&mut self, first_token: bool) {
self.current = match self.next.take() {
Some(next) => next,
None => self.next_non_trivia_token(first_token),
#[inline(always)]
fn lookahead(&mut self, n: usize) -> Option<Lookahead> {
assert_ne!(n, 0);

// Return the cached token if any
if let Some(lookahead) = self.non_trivia_lookahead.get(n - 1) {
return Some(*lookahead);
}
}

pub fn lookahead(&mut self) -> GraphqlSyntaxKind {
match self.next.as_ref() {
Some(next) => next.kind,
None if self.current.kind != EOF => {
let next_token = self.next_non_trivia_token(false);
let next_kind = next_token.kind;
self.next = Some(next_token);
next_kind
// Jump right to where we've left of last time rather than going through all tokens again.
let iter = self.lexer.lookahead().skip(self.lookahead_offset);
let mut remaining = n - self.non_trivia_lookahead.len();

for item in iter {
self.lookahead_offset += 1;

if !item.kind().is_trivia() {
remaining -= 1;

let lookahead = Lookahead {
after_newline: item.has_preceding_line_break(),
kind: item.kind(),
};

self.non_trivia_lookahead.push_back(lookahead);

if remaining == 0 {
return Some(lookahead);
}
}
None => EOF,
}
}

// We mostly look ahead by one token
// so there is no need to implement NthToken to use nth_at
pub fn lookahead_at(&mut self, kind: GraphqlSyntaxKind) -> bool {
self.lookahead() == kind
None
}

#[must_use]
fn next_non_trivia_token(&mut self, first_token: bool) -> NonTriviaToken {
let mut non_trivia_token = NonTriviaToken::default();

fn next_non_trivia_token(&mut self, first_token: bool) {
let mut processed_tokens = 0;
let mut trailing = !first_token;

// Drop the last cached lookahead, we're now moving past it
self.non_trivia_lookahead.pop_front();

loop {
let kind = self.lexer.next_token(());
processed_tokens += 1;

let trivia_kind = TriviaPieceKind::try_from(kind);

match trivia_kind {
Err(_) => {
// Not trivia
non_trivia_token.kind = kind;
non_trivia_token.range = self.lexer.current_range();
break;
}
Ok(trivia_kind) => {
if trivia_kind.is_newline() {
trailing = false;
non_trivia_token.preceding_line_break = true;
}

self.trivia.push(Trivia::new(
trivia_kind,
self.lexer.current_range(),
trailing,
));
self.trivia_list
.push(Trivia::new(trivia_kind, self.current_range(), trailing));
}
}
}

non_trivia_token
if self.lookahead_offset != 0 {
debug_assert!(self.lookahead_offset >= processed_tokens);
self.lookahead_offset -= processed_tokens;
}
}
}

impl<'source> TokenSource for GraphqlTokenSource<'source> {
type Kind = GraphqlSyntaxKind;

fn current(&self) -> Self::Kind {
self.current.kind
self.lexer.current()
}

fn current_range(&self) -> TextRange {
self.current.range
self.lexer.current_range()
}

fn text(&self) -> &str {
self.lexer.source()
}

fn has_preceding_line_break(&self) -> bool {
self.current.preceding_line_break
self.lexer.has_preceding_line_break()
}

fn bump(&mut self) {
if self.current.kind != EOF {
self.advance_to_next_non_trivia_token(false)
if self.current() != EOF {
self.next_non_trivia_token(false)
}
}

fn skip_as_trivia(&mut self) {
if self.current() != EOF {
self.trivia.push(Trivia::new(
self.trivia_list.push(Trivia::new(
TriviaPieceKind::Skipped,
self.current_range(),
false,
));

self.advance_to_next_non_trivia_token(false)
self.next_non_trivia_token(false)
}
}

fn finish(self) -> (Vec<Trivia>, Vec<ParseDiagnostic>) {
(self.trivia, self.lexer.finish())
(self.trivia_list, self.lexer.finish())
}
}

impl<'source> NthToken for GraphqlTokenSource<'source> {
    /// Kind of the nth non-trivia token, where `n == 0` is the current token.
    /// Positions past the end of input yield `EOF`.
    fn nth(&mut self, n: usize) -> GraphqlSyntaxKind {
        if n == 0 {
            return self.current();
        }
        match self.lookahead(n) {
            Some(lookahead) => lookahead.kind,
            None => EOF,
        }
    }

    /// Whether the nth non-trivia token is preceded by a line break;
    /// `n == 0` asks about the current token. Past end of input: `false`.
    #[inline(always)]
    fn has_nth_preceding_line_break(&mut self, n: usize) -> bool {
        if n == 0 {
            return self.has_preceding_line_break();
        }
        match self.lookahead(n) {
            Some(lookahead) => lookahead.after_newline,
            None => false,
        }
    }
}

0 comments on commit 1f891f5

Please sign in to comment.