Skip to content

Commit

Permalink
feat(yaml): lexer that can lex very simple examples
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Aug 27, 2024
1 parent 221a74f commit 2462a59
Show file tree
Hide file tree
Showing 4 changed files with 576 additions and 13 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/biome_yaml_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ tracing = { workspace = true }


[dev-dependencies]
quickcheck = { workspace = true }
quickcheck_macros = { workspace = true }
320 changes: 307 additions & 13 deletions crates/biome_yaml_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,31 @@
use std::iter::FusedIterator;

use biome_parser::{
diagnostic::ParseDiagnostic,
lexer::{LexContext, Lexer, TokenFlags},
};
use biome_rowan::{TextRange, TextSize};
use biome_yaml_syntax::YamlSyntaxKind::*;
pub use biome_yaml_syntax::*;
use biome_yaml_syntax::{YamlSyntaxKind, T};

#[rustfmt::skip]
mod tests;

pub struct Token {
kind: YamlSyntaxKind,
range: TextRange,
}

impl Token {
#[allow(dead_code)]
pub fn kind(&self) -> YamlSyntaxKind {
self.kind
}

#[allow(dead_code)]
pub fn range(&self) -> TextRange {
self.range
}
}

pub(crate) struct YamlLexer<'src> {
/// Source text
Expand All @@ -27,20 +48,234 @@ pub(crate) struct YamlLexer<'src> {
/// The kind of the current token
current_kind: YamlSyntaxKind,

/// Byte offset of the current token from the start of the source
/// The range of the current token can be computed by
/// `self.position - self.current_start`.

/// Flags for the current token
current_flags: TokenFlags,

/// diagnostics emitted during the parsing phase
diagnostics: Vec<ParseDiagnostic>,

context: YamlLexContext,
}

impl<'source> YamlLexer<'source> {
/// Creates a new lexer from the given string
#[allow(dead_code)]
pub fn from_str(source: &'source str) -> Self {
Self {
source,
position: 0,
after_newline: false,
unicode_bom_length: 0,
current_start: TextSize::from(0),
current_kind: YamlSyntaxKind::EOF,
current_flags: TokenFlags::empty(),
diagnostics: vec![],
context: YamlLexContext::Regular,
}
}

fn current_char(&self) -> Option<u8> {
self.source.as_bytes().get(self.position).copied()
}

fn peek_next_char(&self) -> Option<u8> {
self.source.as_bytes().get(self.position + 1).copied()
}

fn is_next_char_whitespace(&self) -> bool {
matches!(self.peek_next_char(), Some(b' ' | b'\n'))
}

/// Consumes and returns the next token, if any
fn consume_token(&mut self) -> Option<Token> {
let start = self.text_position();
let char = self.current_char()?;
let kind = self.consume_token_in_context(char, self.context);
self.current_kind = kind;
let end = self.text_position();
Some(Token {
kind,
range: TextRange::new(start, end),
})
}

/// Consume a byte in the given context
fn consume_token_in_context(&mut self, current: u8, context: YamlLexContext) -> YamlSyntaxKind {
if self.position >= self.source.len() {
return YamlSyntaxKind::EOF;
}

let start = self.text_position();

let kind = match current {
b'#' => self.consume_comment(),
b'-' => {
if self.is_next_char_whitespace() {
self.consume_byte(T![-])
} else {
self.consume_identifer_or_value()
}
}
b':' => {
if self.is_next_char_whitespace() {
self.context = YamlLexContext::AfterIdent;
self.consume_byte(T![:])
} else {
self.consume_identifer_or_value()
}
}
b'\'' | b'"' => self.consume_string_literal(current),
b' ' => self.consume_newline_or_whitespaces(),
b'\n' => {
if self.context == YamlLexContext::AfterIdent {
self.context = YamlLexContext::Regular;
}
self.consume_newline_or_whitespaces()
}
b'[' => self.consume_array_inline_start(),
b',' => {
if self.context == YamlLexContext::AfterInlineArray {
self.consume_byte(T![,])
} else {
self.consume_identifer_or_value()
}
}
b']' => {
if self.context == YamlLexContext::AfterInlineArray {
self.consume_array_inline_end()
} else {
self.consume_identifer_or_value()
}
}
_ => match context {
YamlLexContext::Regular => self.consume_identifer_or_value(),
YamlLexContext::AfterIdent => self.consume_value(),
YamlLexContext::AfterInlineArray => self.consume_value(),
},
};

debug_assert!(self.text_position() > start, "Lexer did not advance");
kind
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: YamlSyntaxKind) -> YamlSyntaxKind {
self.advance(1);
tok
}

fn consume_comment(&mut self) -> YamlSyntaxKind {
self.assert_byte(b'#');
self.consume_until_newline();
YamlSyntaxKind::COMMENT
}

fn consume_until_newline(&mut self) {
while let Some(c) = self.current_char() {
if c == b'\n' {
break;
}
self.advance(1);
}
self.context = YamlLexContext::Regular;
}

fn consume_string_literal(&mut self, quote: u8) -> YamlSyntaxKind {
self.assert_current_char_boundary();
self.assert_byte(quote);
self.advance(1);

let mut escape = false;
loop {
match self.current_char() {
Some(b'\\') => {
escape = true;
self.advance(1);
}
Some(c) if c == quote && !escape => {
self.advance(1);
break;
}
Some(_) => {
escape = false;
self.advance(1);
}
None => {
break;
}
}
}
YamlSyntaxKind::YAML_STRING_VALUE
}

/// Consume a line up to the colon or the end of the line
fn consume_identifer_or_value(&mut self) -> YamlSyntaxKind {
let start = self.position;
let mut is_ident = false;
while let Some(c) = self.current_char() {
if c == b'\n' {
break;
}
if c == b':' && self.is_next_char_whitespace() {
is_ident = true;
break;
}
self.advance(1);
}
if is_ident {
YamlSyntaxKind::YAML_IDENTIFIER
} else {
let value = &self.source[start..self.position];
interpret_value(value)
}
}

fn consume_value(&mut self) -> YamlSyntaxKind {
let start = self.position;
while let Some(c) = self.current_char() {
if c == b'\n' {
break;
}
if self.context == YamlLexContext::AfterInlineArray && (c == b',' || c == b']') {
break;
}
self.advance(1);
}
let value = &self.source[start..self.position];
interpret_value(value)
}

fn consume_array_inline_start(&mut self) -> YamlSyntaxKind {
self.assert_byte(b'[');
self.advance(1);
self.context = YamlLexContext::AfterInlineArray;
YamlSyntaxKind::L_BRACK
}

fn consume_array_inline_end(&mut self) -> YamlSyntaxKind {
self.assert_byte(b']');
self.advance(1);
self.context = YamlLexContext::Regular;
YamlSyntaxKind::R_BRACK
}
}

fn interpret_value(value: &str) -> YamlSyntaxKind {
match value {
"true" | "false" => YamlSyntaxKind::YAML_BOOLEAN_VALUE,
"null" => YamlSyntaxKind::YAML_NULL_VALUE,
_ => value
.parse::<f64>()
.map_or(YamlSyntaxKind::YAML_STRING_VALUE, |_| {
YamlSyntaxKind::YAML_NUMBER_VALUE
}),
}
}

impl<'src> Lexer<'src> for YamlLexer<'src> {
const NEWLINE: Self::Kind = NEWLINE;
const WHITESPACE: Self::Kind = WHITESPACE;
const NEWLINE: Self::Kind = YamlSyntaxKind::NEWLINE;
const WHITESPACE: Self::Kind = YamlSyntaxKind::WHITESPACE;

type Kind = YamlSyntaxKind;
type LexContext = YamlLexContext;
Expand Down Expand Up @@ -69,8 +304,10 @@ impl<'src> Lexer<'src> for YamlLexer<'src> {
self.current_start
}

fn next_token(&mut self, _context: Self::LexContext) -> Self::Kind {
todo!()
fn next_token(&mut self, context: Self::LexContext) -> Self::Kind {
self.current_start = TextSize::from(self.position as u32);
self.current_flags = TokenFlags::empty();
self.consume_token_in_context(self.current_char().unwrap_or(b'\0'), context)
}

fn has_preceding_line_break(&self) -> bool {
Expand Down Expand Up @@ -101,6 +338,7 @@ impl<'src> Lexer<'src> for YamlLexer<'src> {
self.position
}

#[inline]
fn advance(&mut self, n: usize) {
self.position += n;
}
Expand All @@ -112,19 +350,59 @@ impl<'src> Lexer<'src> for YamlLexer<'src> {
fn consume_newline_or_whitespaces(&mut self) -> YamlSyntaxKind {
if self.consume_newline() {
self.after_newline = true;
NEWLINE
YamlSyntaxKind::NEWLINE
} else {
self.consume_whitespaces();
WHITESPACE
YamlSyntaxKind::WHITESPACE
}
}
}

impl Iterator for YamlLexer<'_> {
type Item = Token;

fn next(&mut self) -> Option<Self::Item> {
self.consume_token()
}
}

impl FusedIterator for YamlLexer<'_> {}

/// Context in which the lexer should lex the next token
#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
pub enum YamlLexContext {
#[default]
Regular,
/// The lexer has just lexed an identifier and is expecting a value to come next.
///
/// After an identifier, a `: ` is expected to come next, and then the value. If the next token is another identifier, it must come after a newline. Otherwise, values can remain on the same line.
/// However, it is possible to have a nested mapping on the same line, but the nested mapping must be enclosed in `{}`.
///
/// Valid:
/// ```yaml
/// foo:
/// bar: baz
/// ```
/// ```yaml
/// foo: 5
/// ```
/// ```yaml
/// foo: { bar: baz }
/// ```
/// Invalid:
/// ```yaml
/// foo: bar: baz
/// # ^ invalid syntax, mapping values not allowed here
/// ```
AfterIdent,
/// The lexer has lexed an inline array and is expecting a value or the end of the array to come next.
///
/// In this context, commas are allowed to separate values. Normally, commas are simply treated as part of a value.
/// A newline is not allowed in this context. The newline must come after the array has been closed with `]`
/// ```yaml
/// foo: [1, 2, 3]
/// ```
AfterInlineArray,
}

impl LexContext for YamlLexContext {
Expand All @@ -134,6 +412,22 @@ impl LexContext for YamlLexContext {
}
}

/// Context in which the [YamlLexContext]'s current yoken should be re-lexed.
/// Context in which the [YamlLexContext]'s current token should be re-lexed.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum YamlReLexContext {}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn test_interpret_value() {
assert_eq!(interpret_value("true"), YamlSyntaxKind::YAML_BOOLEAN_VALUE);
assert_eq!(interpret_value("false"), YamlSyntaxKind::YAML_BOOLEAN_VALUE);
assert_eq!(interpret_value("null"), YamlSyntaxKind::YAML_NULL_VALUE);
assert_eq!(interpret_value("foo"), YamlSyntaxKind::YAML_STRING_VALUE);
assert_eq!(interpret_value("1"), YamlSyntaxKind::YAML_NUMBER_VALUE);
assert_eq!(interpret_value("1.0"), YamlSyntaxKind::YAML_NUMBER_VALUE);
assert_eq!(interpret_value("1.0.0"), YamlSyntaxKind::YAML_STRING_VALUE);
}
}
Loading

0 comments on commit 2462a59

Please sign in to comment.