|
| 1 | +use once_cell::sync::Lazy; |
| 2 | +use regex::Regex; |
| 3 | + |
| 4 | +use crate::token::Token; |
| 5 | + |
| 6 | +#[derive(Debug, Clone, PartialEq, thiserror::Error)] |
| 7 | +pub enum LexicalError { |
| 8 | + #[error("Unexpected character: {0}")] |
| 9 | + UnexpectedCharacter(char), |
| 10 | + #[error("Unexpected end of input")] |
| 11 | + UnexpectedEndOfInput, |
| 12 | +} |
| 13 | + |
/// Expands to a reference to a lazily compiled [`Regex`] for `$pattern`.
///
/// Every expansion site declares its own `static`, so the pattern is compiled
/// the first time that site is evaluated and reused on later evaluations.
/// Panics on first use if `$pattern` is not a valid regular expression.
macro_rules! regex {
    ($pattern:expr) => {{
        static COMPILED: Lazy<Regex> = Lazy::new(|| {
            let compiled = Regex::new($pattern);
            compiled.unwrap()
        });
        &COMPILED
    }};
}
| 20 | + |
| 21 | +// 成功: Ok(Some((Token, 消費したバイト数))) |
| 22 | +// 失敗: Err(LexicalError) |
| 23 | +// EOF: Ok(None) |
| 24 | +type LexResult = std::result::Result<Option<(Token, usize)>, LexicalError>; |
| 25 | + |
| 26 | +fn ok(token: Token, bytes_consumed: usize) -> LexResult { |
| 27 | + Ok(Some((token, bytes_consumed))) |
| 28 | +} |
| 29 | + |
| 30 | +fn err(e: LexicalError) -> LexResult { |
| 31 | + Err(e) |
| 32 | +} |
| 33 | + |
| 34 | +fn eof() -> LexResult { |
| 35 | + Ok(None) |
| 36 | +} |
| 37 | + |
| 38 | +// input からトークンをひとつ読み取り、トークンと消費したバイト数を返す。 |
| 39 | +pub fn lex(input: &str) -> LexResult { |
| 40 | + if input.is_empty() { |
| 41 | + return eof(); |
| 42 | + } |
| 43 | + |
| 44 | + let re_whitespace = regex!(r"^[ \t\r\n]+"); |
| 45 | + if let Some(m) = re_whitespace.find(input) { |
| 46 | + let r = lex(&input[m.end()..]); |
| 47 | + return match r { |
| 48 | + Ok(Some((token, bytes_consumed))) => ok(token, m.end() + bytes_consumed), |
| 49 | + _ => r, |
| 50 | + }; |
| 51 | + } |
| 52 | + |
| 53 | + let re_identifier_or_keyword = regex!(r"^[a-zA-Z_][a-zA-Z0-9_]*"); |
| 54 | + if let Some(m) = re_identifier_or_keyword.find(input) { |
| 55 | + let s = m.as_str(); |
| 56 | + let token = match s { |
| 57 | + "true" => Token::True, |
| 58 | + "false" => Token::False, |
| 59 | + "null" => Token::Null, |
| 60 | + _ => Token::Identifier(s.to_owned()), |
| 61 | + }; |
| 62 | + return ok(token, m.end()); |
| 63 | + } |
| 64 | + |
| 65 | + let re_digits = regex!(r"^[0-9]+"); |
| 66 | + if let Some(m) = re_digits.find(input) { |
| 67 | + let n = m.as_str().parse::<f64>().unwrap(); |
| 68 | + return ok(Token::Number(n), m.end()); |
| 69 | + } |
| 70 | + |
| 71 | + unimplemented!() |
| 72 | +} |
| 73 | + |
/// Converts a byte offset into a `(line, column)` pair.
///
/// Both the line and the column are zero-based. `line_breaks` holds the byte
/// offsets of the newline characters in the source text, in ascending order.
/// An offset that points at a newline character is reported on the line that
/// the newline terminates.
pub fn to_line_col(line_breaks: &[usize], pos: usize) -> (usize, usize) {
    // The number of newlines strictly before `pos` is the zero-based line index.
    let line = line_breaks.partition_point(|&offset| offset < pos);

    // A line starts one byte after the newline that precedes it; the first
    // line starts at offset 0.
    let line_start = match line.checked_sub(1) {
        Some(previous) => line_breaks[previous] + 1,
        None => 0,
    };

    (line, pos - line_start)
}
| 86 | + |
/// Checks `to_line_col` against a small multi-line snippet, including an
/// empty line and offsets that point at newline characters.
#[test]
fn test_to_line_col() {
    let lines = [
        "#include <stdio.h>",
        "",
        "int main(void) {",
        " return 0;",
        "}",
    ];
    let source_code = lines.join("\n");

    // Byte offsets of every newline in the snippet.
    let mut line_breaks: Vec<usize> = Vec::new();
    for (offset, _) in source_code.match_indices('\n') {
        line_breaks.push(offset);
    }

    // (byte offset, expected (line, column))
    let cases = [
        (0, (0, 0)),
        (1, (0, 1)),
        (18, (0, 18)),
        (19, (1, 0)),
        (20, (2, 0)),
        (21, (2, 1)),
        (36, (2, 16)),
    ];
    for &(pos, expected) in cases.iter() {
        assert_eq!(to_line_col(&line_breaks, pos), expected);
    }
}
0 commit comments