From 153fd8cceb277283273efedea8961f5cc847a0d7 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Wed, 7 Jun 2023 12:43:27 +0200 Subject: [PATCH] Simple lexer for formatter --- .../src/comments/placement.rs | 80 +- .../src/expression/parentheses.rs | 31 +- ...hon_formatter__trivia__tests__Reverse.snap | 218 ++++++ ...matter__trivia__tests__tokenize_bogus.snap | 126 ++++ ...matter__trivia__tests__tokenize_comma.snap | 22 + ..._trivia__tests__tokenize_continuation.snap | 30 + ...__trivia__tests__tokenize_parentheses.snap | 30 + ...matter__trivia__tests__tokenize_slash.snap | 42 ++ ...er__trivia__tests__tokenize_substring.snap | 18 + ...atter__trivia__tests__tokenize_trivia.snap | 22 + crates/ruff_python_formatter/src/trivia.rs | 708 +++++++++++++++--- 11 files changed, 1145 insertions(+), 182 deletions(-) create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__Reverse.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_bogus.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_comma.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_continuation.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_parentheses.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_slash.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_substring.snap create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_trivia.snap diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs index 8d8acc1b64397..7ba015a2d0032 100644 --- 
a/crates/ruff_python_formatter/src/comments/placement.rs +++ b/crates/ruff_python_formatter/src/comments/placement.rs @@ -1,11 +1,11 @@ use crate::comments::visitor::{CommentPlacement, DecoratedComment}; use crate::comments::CommentTextPosition; -use crate::trivia::find_first_non_trivia_character_in_range; +use crate::trivia::{SimpleTokenizer, TokenKind}; use ruff_newlines::StrExt; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::source_code::Locator; use ruff_python_ast::whitespace; -use ruff_text_size::{TextLen, TextRange, TextSize}; +use ruff_text_size::{TextRange, TextSize}; use rustpython_parser::ast::Ranged; use std::cmp::Ordering; @@ -521,14 +521,16 @@ fn handle_trailing_end_of_line_condition_comment<'a>( // If the preceding is the node before the `colon` // `while true:` The node before the `colon` is the `true` constant. if preceding.ptr_eq(last_before_colon) { - let mut start = preceding.end(); - while let Some((offset, c)) = find_first_non_trivia_character_in_range( - TextRange::new(start, following.start()), + let tokens = SimpleTokenizer::new( locator.contents(), - ) { - match c { - ':' => { - if comment.slice().start() > offset { + TextRange::new(preceding.end(), following.start()), + ) + .skip_trivia(); + + for token in tokens { + match token.kind() { + TokenKind::Colon => { + if comment.slice().start() > token.start() { // Comment comes after the colon // ```python // while a: # comment @@ -546,9 +548,8 @@ fn handle_trailing_end_of_line_condition_comment<'a>( // ``` break; } - ')' => { + TokenKind::RParen => { // Skip over any closing parentheses - start = offset + ')'.text_len(); } _ => { unreachable!("Only ')' or ':' should follow the condition") @@ -652,21 +653,17 @@ fn handle_trailing_binary_expression_left_or_operator_comment<'a>( return CommentPlacement::Default(comment); } - let mut between_operands_range = TextRange::new( + let between_operands_range = TextRange::new( binary_expression.left.end(), binary_expression.right.start(), 
); - let operator_offset = loop { - match find_first_non_trivia_character_in_range(between_operands_range, locator.contents()) { - // Skip over closing parens - Some((offset, ')')) => { - between_operands_range = - TextRange::new(offset + TextSize::new(1), between_operands_range.end()); - } - Some((offset, _)) => break offset, - None => return CommentPlacement::Default(comment), - } + let mut tokens = SimpleTokenizer::new(locator.contents(), between_operands_range).skip_trivia(); + let operator_offset = if let Some(non_r_paren) = tokens.find(|t| t.kind() != TokenKind::RParen) + { + non_r_paren.start() + } else { + return CommentPlacement::Default(comment); }; let comment_range = comment.slice().range(); @@ -805,29 +802,22 @@ fn find_pos_only_slash_offset( between_arguments_range: TextRange, locator: &Locator, ) -> Option { - // First find the comma separating the two arguments - find_first_non_trivia_character_in_range(between_arguments_range, locator.contents()).and_then( - |(comma_offset, comma)| { - debug_assert_eq!(comma, ','); - - // Then find the position of the `/` operator - find_first_non_trivia_character_in_range( - TextRange::new( - comma_offset + TextSize::new(1), - between_arguments_range.end(), - ), - locator.contents(), - ) - .and_then(|(offset, c)| { - if c == '/' { - Some(offset) - } else { - debug_assert_eq!(c, ')'); - None - } - }) - }, - ) + let mut tokens = + SimpleTokenizer::new(locator.contents(), between_arguments_range).skip_trivia(); + + if let Some(comma) = tokens.next() { + debug_assert_eq!(comma.kind(), TokenKind::Comma); + + if let Some(maybe_slash) = tokens.next() { + if maybe_slash.kind() == TokenKind::Slash { + return Some(maybe_slash.start()); + } + + debug_assert_eq!(maybe_slash.kind(), TokenKind::RParen); + } + } + + None } /// Returns `true` if `right` is `Some` and `left` and `right` are referentially equal. 
diff --git a/crates/ruff_python_formatter/src/expression/parentheses.rs b/crates/ruff_python_formatter/src/expression/parentheses.rs index 7cebc3ebbbb80..06d599ad2ccae 100644 --- a/crates/ruff_python_formatter/src/expression/parentheses.rs +++ b/crates/ruff_python_formatter/src/expression/parentheses.rs @@ -1,7 +1,6 @@ -use crate::trivia::{ - find_first_non_trivia_character_after, find_first_non_trivia_character_before, -}; +use crate::trivia::{first_non_trivia_token, first_non_trivia_token_rev, Token, TokenKind}; use ruff_python_ast::node::AnyNodeRef; +use rustpython_parser::ast::Ranged; pub(crate) trait NeedsParentheses { fn needs_parentheses(&self, parenthesize: Parenthesize, source: &str) -> Parentheses; @@ -73,21 +72,17 @@ pub enum Parentheses { } fn is_expression_parenthesized(expr: AnyNodeRef, contents: &str) -> bool { - use rustpython_parser::ast::Ranged; - - debug_assert!( - expr.is_expression(), - "Should only be called for expressions" - ); - - // Search backwards to avoid ambiguity with `(a, )` and because it's faster matches!( - find_first_non_trivia_character_after(expr.end(), contents), - Some((_, ')')) - ) - // Search forwards to confirm that this is not a nested expression `(5 + d * 3)` - && matches!( - find_first_non_trivia_character_before(expr.start(), contents), - Some((_, '(')) + first_non_trivia_token(expr.end(), contents), + Some(Token { + kind: TokenKind::RParen, + .. + }) + ) && matches!( + first_non_trivia_token_rev(expr.start(), contents), + Some(Token { + kind: TokenKind::LParen, + .. 
+ }) ) } diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__Reverse.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__Reverse.snap new file mode 100644 index 0000000000000..ec701539c60a6 --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__Reverse.snap @@ -0,0 +1,218 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: test_case.tokenize_reverse() +--- +[ + Token { + kind: RParen, + range: 52..53, + }, + Token { + kind: Other, + range: 51..52, + }, + Token { + kind: Bogus, + range: 50..51, + }, + Token { + kind: Bogus, + range: 49..50, + }, + Token { + kind: Bogus, + range: 48..49, + }, + Token { + kind: Bogus, + range: 47..48, + }, + Token { + kind: Bogus, + range: 46..47, + }, + Token { + kind: Bogus, + range: 45..46, + }, + Token { + kind: Bogus, + range: 44..45, + }, + Token { + kind: Bogus, + range: 43..44, + }, + Token { + kind: Bogus, + range: 42..43, + }, + Token { + kind: Bogus, + range: 41..42, + }, + Token { + kind: Bogus, + range: 40..41, + }, + Token { + kind: Bogus, + range: 39..40, + }, + Token { + kind: Bogus, + range: 38..39, + }, + Token { + kind: Bogus, + range: 37..38, + }, + Token { + kind: Bogus, + range: 36..37, + }, + Token { + kind: Bogus, + range: 35..36, + }, + Token { + kind: Bogus, + range: 34..35, + }, + Token { + kind: Bogus, + range: 33..34, + }, + Token { + kind: Bogus, + range: 32..33, + }, + Token { + kind: Bogus, + range: 31..32, + }, + Token { + kind: Bogus, + range: 30..31, + }, + Token { + kind: Bogus, + range: 29..30, + }, + Token { + kind: Bogus, + range: 28..29, + }, + Token { + kind: Bogus, + range: 27..28, + }, + Token { + kind: Bogus, + range: 26..27, + }, + Token { + kind: Bogus, + range: 25..26, + }, + Token { + kind: Bogus, + range: 24..25, + }, + Token { + kind: Bogus, + range: 23..24, + }, + Token { + kind: Bogus, + range: 22..23, + }, + Token { + kind: Bogus, + range: 
21..22, + }, + Token { + kind: Bogus, + range: 20..21, + }, + Token { + kind: Bogus, + range: 19..20, + }, + Token { + kind: Bogus, + range: 18..19, + }, + Token { + kind: Bogus, + range: 17..18, + }, + Token { + kind: Bogus, + range: 16..17, + }, + Token { + kind: Bogus, + range: 15..16, + }, + Token { + kind: Bogus, + range: 14..15, + }, + Token { + kind: Bogus, + range: 13..14, + }, + Token { + kind: Bogus, + range: 12..13, + }, + Token { + kind: Bogus, + range: 11..12, + }, + Token { + kind: Bogus, + range: 10..11, + }, + Token { + kind: Bogus, + range: 9..10, + }, + Token { + kind: Bogus, + range: 8..9, + }, + Token { + kind: Bogus, + range: 7..8, + }, + Token { + kind: Bogus, + range: 6..7, + }, + Token { + kind: Bogus, + range: 5..6, + }, + Token { + kind: Bogus, + range: 4..5, + }, + Token { + kind: Bogus, + range: 3..4, + }, + Token { + kind: Bogus, + range: 2..3, + }, + Token { + kind: Bogus, + range: 1..2, + }, + Token { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_bogus.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_bogus.snap new file mode 100644 index 0000000000000..7936816089518 --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_bogus.snap @@ -0,0 +1,126 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: test_case.tokens() +--- +[ + Token { + kind: Comment, + range: 0..17, + }, + Token { + kind: Newline, + range: 17..18, + }, + Token { + kind: Whitespace, + range: 18..26, + }, + Token { + kind: Other, + range: 26..27, + }, + Token { + kind: Bogus, + range: 27..28, + }, + Token { + kind: Bogus, + range: 28..29, + }, + Token { + kind: Bogus, + range: 29..30, + }, + Token { + kind: Bogus, + range: 30..31, + }, + Token { + kind: Bogus, + range: 31..32, + }, + Token { + kind: Bogus, + range: 32..33, + }, + Token { + kind: Bogus, + 
range: 33..34, + }, + Token { + kind: Bogus, + range: 34..35, + }, + Token { + kind: Bogus, + range: 35..36, + }, + Token { + kind: Bogus, + range: 36..37, + }, + Token { + kind: Bogus, + range: 37..38, + }, + Token { + kind: Bogus, + range: 38..39, + }, + Token { + kind: Bogus, + range: 39..40, + }, + Token { + kind: Bogus, + range: 40..41, + }, + Token { + kind: Bogus, + range: 41..42, + }, + Token { + kind: Bogus, + range: 42..43, + }, + Token { + kind: Bogus, + range: 43..44, + }, + Token { + kind: Bogus, + range: 44..45, + }, + Token { + kind: Bogus, + range: 45..46, + }, + Token { + kind: Bogus, + range: 46..47, + }, + Token { + kind: Bogus, + range: 47..48, + }, + Token { + kind: Bogus, + range: 48..49, + }, + Token { + kind: Bogus, + range: 49..50, + }, + Token { + kind: Bogus, + range: 50..51, + }, + Token { + kind: Bogus, + range: 51..52, + }, + Token { + kind: Bogus, + range: 52..53, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_comma.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_comma.snap new file mode 100644 index 0000000000000..aade2db2c95f3 --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_comma.snap @@ -0,0 +1,22 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: tokens +--- +[ + Token { + kind: Comma, + range: 0..1, + }, + Token { + kind: Comma, + range: 1..2, + }, + Token { + kind: Comma, + range: 2..3, + }, + Token { + kind: Comma, + range: 3..4, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_continuation.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_continuation.snap new file mode 100644 index 0000000000000..b537ae611c498 --- /dev/null +++ 
b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_continuation.snap @@ -0,0 +1,30 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: tokens +--- +[ + Token { + kind: LParen, + range: 0..1, + }, + Token { + kind: Whitespace, + range: 1..2, + }, + Token { + kind: Continuation, + range: 2..3, + }, + Token { + kind: Newline, + range: 3..4, + }, + Token { + kind: Whitespace, + range: 4..5, + }, + Token { + kind: RParen, + range: 5..6, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_parentheses.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_parentheses.snap new file mode 100644 index 0000000000000..f9de9526ae747 --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_parentheses.snap @@ -0,0 +1,30 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: tokens +--- +[ + Token { + kind: LParen, + range: 0..1, + }, + Token { + kind: LBracket, + range: 1..2, + }, + Token { + kind: LBrace, + range: 2..3, + }, + Token { + kind: RBrace, + range: 3..4, + }, + Token { + kind: RBracket, + range: 4..5, + }, + Token { + kind: RParen, + range: 5..6, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_slash.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_slash.snap new file mode 100644 index 0000000000000..093715cf17798 --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_slash.snap @@ -0,0 +1,42 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: test_case.tokens() +--- +[ + Token { + kind: Whitespace, + range: 0..1, + }, + Token { + kind: Comment, + range: 1..30, + }, + Token { + kind: Newline, + range: 30..31, + }, + Token { + kind: Whitespace, + range: 
31..39, + }, + Token { + kind: Comment, + range: 39..77, + }, + Token { + kind: Newline, + range: 77..78, + }, + Token { + kind: Whitespace, + range: 78..86, + }, + Token { + kind: Comma, + range: 86..87, + }, + Token { + kind: Slash, + range: 87..88, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_substring.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_substring.snap new file mode 100644 index 0000000000000..747d504c4b27c --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_substring.snap @@ -0,0 +1,18 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: tokens +--- +[ + Token { + kind: RParen, + range: 14..15, + }, + Token { + kind: Whitespace, + range: 15..16, + }, + Token { + kind: Comment, + range: 16..25, + }, +] diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_trivia.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_trivia.snap new file mode 100644 index 0000000000000..685a032be71af --- /dev/null +++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_trivia.snap @@ -0,0 +1,22 @@ +--- +source: crates/ruff_python_formatter/src/trivia.rs +expression: tokens +--- +[ + Token { + kind: Comment, + range: 0..9, + }, + Token { + kind: Newline, + range: 9..10, + }, + Token { + kind: Whitespace, + range: 10..14, + }, + Token { + kind: Comment, + range: 14..23, + }, +] diff --git a/crates/ruff_python_formatter/src/trivia.rs b/crates/ruff_python_formatter/src/trivia.rs index c06caf01357ee..40a767b07e3e0 100644 --- a/crates/ruff_python_formatter/src/trivia.rs +++ b/crates/ruff_python_formatter/src/trivia.rs @@ -1,5 +1,6 @@ use ruff_python_ast::whitespace::is_python_whitespace; use ruff_text_size::{TextLen, TextRange, TextSize}; +use 
std::str::Chars; /// Searches for the first non-trivia character in `range`. /// @@ -9,174 +10,643 @@ use ruff_text_size::{TextLen, TextRange, TextSize}; /// of the character, the second item the non-trivia character. /// /// Returns `None` if the range is empty or only contains trivia (whitespace or comments). -pub(crate) fn find_first_non_trivia_character_in_range( - range: TextRange, - code: &str, -) -> Option<(TextSize, char)> { - let rest = &code[range]; - let mut char_iter = rest.chars(); - - while let Some(c) = char_iter.next() { - match c { - '#' => { - // We're now inside of a comment. Skip all content until the end of the line - for c in char_iter.by_ref() { - if matches!(c, '\n' | '\r') { - break; - } - } +pub(crate) fn first_non_trivia_token(offset: TextSize, code: &str) -> Option { + SimpleTokenizer::starts_at(offset, code) + .skip_trivia() + .next() +} + +/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file +/// or all preceding tokens are trivia tokens. +/// +/// ## Notes +/// +/// Prefer [`first_non_trivia_token`] whenever possible because reverse lookup is expensive because of comments. +pub(crate) fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option { + SimpleTokenizer::up_to(offset, code) + .skip_trivia() + .next_back() +} + +/// Returns the number of newlines between `offset` and the first non whitespace character in the source code. 
+pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 { + let tokens = SimpleTokenizer::up_to(offset, code); + let mut newlines = 0u32; + + for token in tokens.rev() { + match token.kind() { + TokenKind::Newline => { + newlines += 1; } - c => { - if !is_python_whitespace(c) { - let index = range.start() + rest.text_len() - - char_iter.as_str().text_len() - - c.text_len(); - return Some((index, c)); - } + TokenKind::Whitespace => { + // ignore + } + _ => { + break; } } } - None + newlines } -pub(crate) fn find_first_non_trivia_character_after( - offset: TextSize, - code: &str, -) -> Option<(TextSize, char)> { - find_first_non_trivia_character_in_range(TextRange::new(offset, code.text_len()), code) +/// Counts the empty lines between `offset` and the first non-whitespace character. +pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 { + let tokens = SimpleTokenizer::starts_at(offset, code); + let mut newlines = 0u32; + + for token in tokens { + match token.kind() { + TokenKind::Newline => { + newlines += 1; + } + TokenKind::Whitespace => { + // ignore + } + _ => { + break; + } + } + } + + newlines } -pub(crate) fn find_first_non_trivia_character_before( - offset: TextSize, - code: &str, -) -> Option<(TextSize, char)> { - let head = &code[TextRange::up_to(offset)]; - let mut char_iter = head.chars(); +/// Returns the position after skipping any trailing trivia up to, but not including the newline character. 
+pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize { + let tokenizer = SimpleTokenizer::starts_at(offset, code); - while let Some(c) = char_iter.next_back() { - match c { - c if is_python_whitespace(c) => { - continue; + for token in tokenizer { + match token.kind() { + TokenKind::Whitespace | TokenKind::Comment | TokenKind::Continuation => { + // No op } + _ => { + return token.start(); + } + } + } - // Empty comment - '#' => continue, + offset +} - non_trivia_character => { - // Non trivia character but we don't know if it is a comment or not. Consume all characters - // until the start of the line and track if the last non-whitespace character was a `#`. - let mut is_comment = false; +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub(crate) struct Token { + pub(crate) kind: TokenKind, + pub(crate) range: TextRange, +} - let first_non_trivia_offset = char_iter.as_str().text_len(); +impl Token { + pub(crate) const fn kind(&self) -> TokenKind { + self.kind + } - while let Some(c) = char_iter.next_back() { - match c { - '#' => { - is_comment = true; - } - '\n' | '\r' => { - if !is_comment { - return Some((first_non_trivia_offset, non_trivia_character)); - } - } + #[allow(unused)] + pub(crate) const fn range(&self) -> TextRange { + self.range + } - c => { - if !is_python_whitespace(c) { - is_comment = false; - } - } - } - } - } - } + pub(crate) const fn start(&self) -> TextSize { + self.range.start() } - None + #[allow(unused)] + pub(crate) const fn end(&self) -> TextSize { + self.range.end() + } } -/// Returns the number of newlines between `offset` and the first non whitespace character in the source code. -pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 { - let head = &code[TextRange::up_to(offset)]; - let mut newlines = 0u32; +#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)] +pub(crate) enum TokenKind { + /// A comment, not including the trailing new line. 
+ Comment, + + /// Sequence of ' ' or '\t' + Whitespace, + + /// Start or end of the file + EndOfFile, + + /// `\\` + Continuation, + + /// `\n` or `\r` or `\r\n` + Newline, + + /// `(` + LParen, + + /// `)` + RParen, + + /// `{` + LBrace, - for (index, c) in head.char_indices().rev() { + /// `}` + RBrace, + + /// `[` + LBracket, + + /// `]` + RBracket, + + /// `,` + Comma, + + /// `:` + Colon, + + /// '/' + Slash, + + /// Any other non trivia token. Always has a length of 1 + Other, + + /// Returned for each character after [`TokenKind::Other`] has been returned once. + Bogus, +} + +impl TokenKind { + const fn from_non_trivia_char(c: char) -> TokenKind { match c { - '\n' => { - if head.as_bytes()[index.saturating_sub(1)] == b'\r' { - continue; - } - newlines += 1; + '(' => TokenKind::LParen, + ')' => TokenKind::RParen, + '[' => TokenKind::LBracket, + ']' => TokenKind::RBracket, + '{' => TokenKind::LBrace, + '}' => TokenKind::RBrace, + ',' => TokenKind::Comma, + ':' => TokenKind::Colon, + '/' => TokenKind::Slash, + _ => TokenKind::Other, + } + } + + const fn is_trivia(self) -> bool { + matches!( + self, + TokenKind::Whitespace + | TokenKind::Newline + | TokenKind::Comment + | TokenKind::Continuation + ) + } +} + +/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens). +/// +/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string). +/// +/// The tokenizer doesn't guarantee any correctness after it returned a [`TokenKind::Other`]. That's why it +/// will return [`TokenKind::Bogus`] for every character after until it reaches the end of the file. +pub(crate) struct SimpleTokenizer<'a> { + offset: TextSize, + back_offset: TextSize, + /// `true` when it is known that the current `back` line has no comment for sure. 
+ back_line_has_no_comment: bool, + bogus: bool, + cursor: Cursor<'a>, +} + +impl<'a> SimpleTokenizer<'a> { + pub(crate) fn new(source: &'a str, range: TextRange) -> Self { + Self { + offset: range.start(), + back_offset: range.end(), + back_line_has_no_comment: false, + bogus: false, + cursor: Cursor::new(&source[range]), + } + } + + pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self { + let range = TextRange::new(offset, source.text_len()); + Self::new(source, range) + } + + pub(crate) fn up_to(offset: TextSize, source: &'a str) -> Self { + Self::new(source, TextRange::up_to(offset)) + } + + fn next_token(&mut self) -> Token { + self.cursor.start_token(); + + let Some(first) = self.cursor.bump() else { + return Token { + kind: TokenKind::EndOfFile, + range: TextRange::empty(self.offset), } + }; + + if self.bogus { + let token = Token { + kind: TokenKind::Bogus, + range: TextRange::at(self.offset, first.text_len()), + }; + + self.offset += first.text_len(); + return token; + } + + let kind = match first { + ' ' | '\t' => { + self.cursor.eat_while(|c| matches!(c, ' ' | '\t')); + TokenKind::Whitespace + } + + '\n' => TokenKind::Newline, '\r' => { - newlines += 1; + self.cursor.eat_char('\n'); + TokenKind::Newline } - c if is_python_whitespace(c) => continue, + '#' => { + self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); + TokenKind::Comment + } - _ => break, - } - } + '\\' => TokenKind::Continuation, - newlines -} + c => { + let kind = TokenKind::from_non_trivia_char(c); -/// Counts the empty lines between `offset` and the first non-whitespace character. 
-pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 { - let rest = &code[usize::from(offset)..]; - let mut newlines = 0; + if kind == TokenKind::Other { + self.bogus = true; + } - for (index, c) in rest.char_indices() { - match c { - '\n' => { - newlines += 1; + kind } - '\r' if rest.as_bytes().get(index + 1).copied() == Some(b'\n') => { - continue; + }; + + let token_len = self.cursor.token_len(); + + let token = Token { + kind, + range: TextRange::at(self.offset, token_len), + }; + + self.offset += token_len; + + token + } + + /// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive + /// because it needs to check if the line has any comments when encountering any non-trivia token. + pub(crate) fn next_token_back(&mut self) -> Token { + self.cursor.start_token(); + + let Some(last) = self.cursor.bump_back() else { + return Token { + kind: TokenKind::EndOfFile, + range: TextRange::empty(self.back_offset), + } + }; + + if self.bogus { + let token = Token { + kind: TokenKind::Bogus, + range: TextRange::at(self.back_offset - last.text_len(), last.text_len()), + }; + + self.back_offset -= last.text_len(); + return token; + } + + let kind = match last { + // This may not be 100% correct because it will lex-out trailing whitespace from a comment + // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for. + ' ' | '\t' => { + self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t')); + TokenKind::Whitespace } + '\r' => { - newlines += 1; + self.back_line_has_no_comment = false; + TokenKind::Newline + } + + '\n' => { + self.back_line_has_no_comment = false; + self.cursor.eat_char_back('\r'); + TokenKind::Newline + } + + // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for) + '#' => TokenKind::Comment, + + // For all other tokens, test if the character isn't part of a comment. 
+ c => { + let mut comment_offset = None; + + // Skip the test whether there's a preceding comment if it has been performed before. + if !self.back_line_has_no_comment { + let rest = self.cursor.chars.as_str(); + + for (back_index, c) in rest.chars().rev().enumerate() { + match c { + '#' => { + // Potentially a comment + comment_offset = Some(back_index + 1); + } + '\r' | '\n' | '\\' => { + break; + } + c => { + if !is_python_whitespace(c) + && TokenKind::from_non_trivia_char(c) == TokenKind::Other + { + comment_offset = None; + } + } + } + } + } + + // From here on it is guaranteed that this line has no other comment. + self.back_line_has_no_comment = true; + + if let Some(comment_offset) = comment_offset { + // It is a comment, bump all tokens + for _ in 0..comment_offset { + self.cursor.bump_back().unwrap(); + } + + TokenKind::Comment + } else if c == '\\' { + TokenKind::Continuation + } else { + let kind = TokenKind::from_non_trivia_char(c); + + if kind == TokenKind::Other { + self.bogus = true; + } + + kind + } } - c if is_python_whitespace(c) => continue, - _ => break, + }; + + let token_len = self.cursor.token_len(); + + let start = self.back_offset - token_len; + + let token = Token { + kind, + range: TextRange::at(start, token_len), + }; + + self.back_offset = start; + + token + } + + pub(crate) fn skip_trivia(self) -> impl Iterator + DoubleEndedIterator + 'a { + self.filter(|t| !t.kind().is_trivia()) + } +} + +impl Iterator for SimpleTokenizer<'_> { + type Item = Token; + + fn next(&mut self) -> Option { + let token = self.next_token(); + + if token.kind == TokenKind::EndOfFile { + None + } else { + Some(token) } } +} - newlines +impl DoubleEndedIterator for SimpleTokenizer<'_> { + fn next_back(&mut self) -> Option { + let token = self.next_token_back(); + + if token.kind == TokenKind::EndOfFile { + None + } else { + Some(token) + } + } } -/// Returns the position after skipping any trailing trivia up to, but not including the newline character. 
-pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize { - let rest = &code[usize::from(offset)..]; - let mut iter = rest.char_indices(); +const EOF_CHAR: char = '\0'; - while let Some((relative_offset, c)) = iter.next() { - match c { - '\n' | '\r' => return offset + TextSize::try_from(relative_offset).unwrap(), - '#' => { - // Skip the comment - let newline_offset = iter - .as_str() - .find(['\n', '\r']) - .unwrap_or(iter.as_str().len()); +#[derive(Debug, Clone)] +struct Cursor<'a> { + chars: Chars<'a>, + source_length: TextSize, +} - return offset - + TextSize::try_from(relative_offset + '#'.len_utf8() + newline_offset) - .unwrap(); - } - c if is_python_whitespace(c) => continue, - _ => return offset + TextSize::try_from(relative_offset).unwrap(), +impl<'a> Cursor<'a> { + fn new(source: &'a str) -> Self { + Self { + source_length: source.text_len(), + chars: source.chars(), + } + } + + /// Peeks the next character from the input stream without consuming it. + /// Returns [`EOF_CHAR`] if no characters are left. + fn first(&self) -> char { + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Peeks the last character from the input stream without consuming it. + /// Returns [`EOF_CHAR`] if no characters are left. + fn last(&self) -> char { + self.chars.clone().next_back().unwrap_or(EOF_CHAR) + } + + // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`. 
+ #[allow(clippy::cast_possible_truncation)] + fn text_len(&self) -> TextSize { + TextSize::new(self.chars.as_str().len() as u32) + } + + fn token_len(&self) -> TextSize { + self.source_length - self.text_len() + } + + fn start_token(&mut self) { + self.source_length = self.text_len(); + } + + fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Consumes the next character + fn bump(&mut self) -> Option { + self.chars.next() + } + + /// Consumes the next character from the back + fn bump_back(&mut self) -> Option { + self.chars.next_back() + } + + fn eat_char(&mut self, c: char) -> bool { + if self.first() == c { + self.bump(); + true + } else { + false + } + } + + fn eat_char_back(&mut self, c: char) -> bool { + if self.last() == c { + self.bump_back(); + true + } else { + false + } + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); } } - offset + rest.text_len() + /// Eats symbols from the back while predicate returns true or until the beginning of file is reached. + fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. 
+ while predicate(self.last()) && !self.is_eof() { + self.bump_back(); + } + } } #[cfg(test)] mod tests { - use crate::trivia::{lines_after, lines_before}; - use ruff_text_size::TextSize; + use crate::trivia::{lines_after, lines_before, SimpleTokenizer, Token}; + use insta::assert_debug_snapshot; + use ruff_text_size::{TextLen, TextRange, TextSize}; + + struct TokenizationTestCase { + source: &'static str, + range: TextRange, + tokens: Vec, + } + + impl TokenizationTestCase { + fn assert_reverse_tokenization(&self) { + let mut backwards = self.tokenize_reverse(); + + // Re-reverse to get the tokens in forward order. + backwards.reverse(); + + assert_eq!(&backwards, &self.tokens); + } + + fn tokenize_reverse(&self) -> Vec { + SimpleTokenizer::new(self.source, self.range) + .rev() + .collect() + } + + fn tokens(&self) -> &[Token] { + &self.tokens + } + } + + fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase { + let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect(); + + TokenizationTestCase { + source, + range, + tokens, + } + } + + fn tokenize(source: &'static str) -> TokenizationTestCase { + tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len())) + } + + #[test] + fn tokenize_trivia() { + let source = "# comment\n # comment"; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_parentheses() { + let source = "([{}])"; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_comma() { + let source = ",,,,"; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_continuation() { + let source = "( \\\n )"; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + 
test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_substring() { + let source = "('some string') # comment"; + + let test_case = + tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len())); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_slash() { + let source = r#" # trailing positional comment + # Positional arguments only after here + ,/"#; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + + #[test] + fn tokenize_bogus() { + let source = r#"# leading comment + "a string" + a = (10)"#; + + let test_case = tokenize(source); + + assert_debug_snapshot!(test_case.tokens()); + assert_debug_snapshot!("Reverse", test_case.tokenize_reverse()); + } #[test] fn lines_before_empty_string() {