Skip to content

Commit

Permalink
feat(bytecode, lex, parse, vm): enhance bytecode, add token types, and improve parser and VM
Browse files Browse the repository at this point in the history
  • Loading branch information
HsiangNianian committed Nov 2, 2024
1 parent 1fc58a3 commit 1255caf
Show file tree
Hide file tree
Showing 9 changed files with 528 additions and 68 deletions.
9 changes: 8 additions & 1 deletion src/bytecode.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
/// Instruction set of the interpreter.
///
/// Operand meanings are not spelled out in this file; judging by the variant
/// names the first `u8` is presumably a stack/register slot and the second
/// operand an index or immediate value — TODO confirm against the VM.
#[derive(Debug)]
pub enum ByteCode {
    GetGlobal(u8, u8),
    SetGlobal(u8, u8),
    SetGlobalConst(u8, u8), // TODO u8?
    SetGlobalGlobal(u8, u8),
    // Only one `LoadConst` may exist: the earlier `(u8, u8)` form was replaced
    // by this one, whose second operand is 16 bits wide.
    LoadConst(u8, u16),
    LoadNil(u8),
    LoadBool(u8, bool),
    LoadInt(u8, i16),
    Move(u8, u8),
    Call(u8, u8),
}
288 changes: 251 additions & 37 deletions src/lex.rs
Original file line number Diff line number Diff line change
@@ -1,72 +1,286 @@
use std::mem;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};

// ANCHOR: token
/// A lexical token produced by the lexer.
///
/// `PartialEq` is derived so tokens can be compared directly (the lexer
/// compares its look-ahead slot against `Token::Eos`). `Eq` is not derivable
/// because `Float` carries an `f64`.
#[derive(Debug, PartialEq)]
pub enum Token {
    // keywords
    And, Break, Do, Else, Elseif, End,
    False, For, Function, Goto, If, In,
    Local, Nil, Not, Or, Repeat, Return,
    Then, True, Until, While,

    // + - * / % ^ #
    Add, Sub, Mul, Div, Mod, Pow, Len,
    // & ~ | << >> //
    BitAnd, BitXor, BitOr, ShiftL, ShiftR, Idiv,
    // == ~= <= >= < > =
    Equal, NotEq, LesEq, GreEq, Less, Greater, Assign,
    // ( ) { } [ ] ::
    ParL, ParR, CurlyL, CurlyR, SqurL, SqurR, DoubColon,
    // ; : , . .. ...
    SemiColon, Colon, Comma, Dot, Concat, Dots,

    // constant values
    Integer(i64),
    Float(f64),
    String(String),

    // name of variables or table keys
    Name(String),

    // end
    Eos,
}
// ANCHOR_END: token

// ANCHOR: lex
/// Lexer state: the source file plus a one-token look-ahead slot.
#[derive(Debug)]
// ANCHOR: lex
pub struct Lex {
    /// Source being tokenized; read one byte at a time and stepped back with
    /// `seek` when a character has to be un-read.
    input: File,
    /// Token fetched by `peek` but not yet handed out by `next`.
    /// `Token::Eos` doubles as the "slot is empty" marker.
    ahead: Token,
}
// ANCHOR_END: lex

impl Lex {
/// Creates a lexer reading from `input`.
///
/// The look-ahead slot starts out as `Token::Eos`, which `next` and `peek`
/// treat as "nothing buffered yet".
pub fn new(input: File) -> Self {
    Lex {
        input,
        ahead: Token::Eos,
    }
}

// ANCHOR: peek_next
/// Returns the next token and consumes it.
///
/// A token previously buffered by `peek` is handed out first (leaving the
/// look-ahead slot empty, i.e. `Token::Eos`); otherwise a fresh token is
/// lexed from the input.
pub fn next(&mut self) -> Token {
    // Taking the slot's content and resetting it to `Eos` in one step covers
    // both cases: an empty slot yields `Eos`, which means "go and lex one".
    match mem::replace(&mut self.ahead, Token::Eos) {
        Token::Eos => self.do_next(),
        buffered => buffered,
    }
}

/// Returns a reference to the next token without consuming it.
///
/// When the look-ahead slot is empty (`Token::Eos`) a token is lexed and
/// stored there first; the following `next` call will return that token.
pub fn peek(&mut self) -> &Token {
    if let Token::Eos = self.ahead {
        let fetched = self.do_next();
        self.ahead = fetched;
    }
    &self.ahead
}
// ANCHOR_END: peek_next

fn do_next(&mut self) -> Token {
let ch = self.read_char();
match ch {
' ' | '\r' | '\n' | '\t' => self.next(),
'\0' => Token::Eos,

'"' => { // literal String
let mut s = String::new();
loop {
match self.read_char() {
'\0' => panic!("unfinished literal string"),
'"' => break,
ch => s.push(ch),
'\n' | '\r' | '\t' | ' ' => self.do_next(),
'+' => Token::Add,
'*' => Token::Mul,
'%' => Token::Mod,
'^' => Token::Pow,
'#' => Token::Len,
'&' => Token::BitAnd,
'|' => Token::BitOr,
'(' => Token::ParL,
')' => Token::ParR,
'{' => Token::CurlyL,
'}' => Token::CurlyR,
'[' => Token::SqurL,
']' => Token::SqurR,
';' => Token::SemiColon,
',' => Token::Comma,
'/' => self.check_ahead('/', Token::Idiv, Token::Div),
'=' => self.check_ahead('=', Token::Equal, Token::Assign),
'~' => self.check_ahead('=', Token::NotEq, Token::BitXor),
':' => self.check_ahead(':', Token::DoubColon, Token::Colon),
'<' => self.check_ahead2('=', Token::LesEq, '<', Token::ShiftL, Token::Less),
'>' => self.check_ahead2('=', Token::GreEq, '>', Token::ShiftR, Token::Greater),
'\'' | '"' => self.read_string(ch),
'.' => match self.read_char() {
'.' => {
if self.read_char() == '.' {
Token::Dots
} else {
self.putback_char();
Token::Concat
}
},
'0'..='9' => {
self.putback_char();
self.read_number_fraction(0)
},
_ => {
self.putback_char();
Token::Dot
},
},
'-' => {
if self.read_char() == '-' {
self.read_comment();
self.do_next()
} else {
self.putback_char();
Token::Sub
}
Token::String(s)
},
'0'..='9' => self.read_number(ch),
'A'..='Z' | 'a'..='z' | '_' => self.read_name(ch),
'\0' => Token::Eos,
_ => panic!("invalid char {ch}"),
}
}

/// Reads the next byte of the input and returns it as a `char`.
///
/// Returns `'\0'` at the end of the input. Every call moves the file cursor
/// forward by exactly one position — also at the end of the input — so that
/// a following `putback_char` always restores the position from before the
/// call.
///
/// # Panics
///
/// Panics if reading from or seeking in the underlying file fails.
fn read_char(&mut self) -> char {
    let mut buf: [u8; 1] = [0];
    if self.input.read(&mut buf).unwrap() == 1 {
        buf[0] as char
    } else {
        // End of input: nothing was consumed, but callers un-read a rejected
        // character with `putback_char`, which unconditionally steps back one
        // byte. Without this compensating step the cursor would rewind onto
        // the last real byte and that byte would be lexed again and again
        // (e.g. a file ending in a name or number without trailing newline).
        // Seeking past the end of a file is allowed; reads there yield 0 bytes.
        self.input.seek(SeekFrom::Current(1)).unwrap();
        '\0'
    }
}

/// Un-reads the character returned by the most recent `read_char` by moving
/// the file cursor one position back.
///
/// # Panics
///
/// Panics if seeking in the underlying file fails.
fn putback_char(&mut self) {
    self.input.seek(SeekFrom::Current(-1)).unwrap();
}

/// Chooses between a two-character and a one-character operator.
///
/// Reads one more character: if it equals `ahead` the `long` token is
/// returned, otherwise the character is put back and `short` is returned.
fn check_ahead(&mut self, ahead: char, long: Token, short: Token) -> Token {
    if self.read_char() != ahead {
        // not part of this operator; leave it for the next token
        self.putback_char();
        return short;
    }
    long
}
/// Like `check_ahead`, but with two possible second characters.
///
/// Reads one more character: `ahead1` selects `long1`, `ahead2` selects
/// `long2` (in that order of precedence); any other character is put back and
/// `short` is returned.
fn check_ahead2(&mut self, ahead1: char, long1: Token, ahead2: char, long2: Token, short: Token) -> Token {
    match self.read_char() {
        second if second == ahead1 => long1,
        second if second == ahead2 => long2,
        _ => {
            // not part of this operator; leave it for the next token
            self.putback_char();
            short
        }
    }
}

fn read_number(&mut self, first: char) -> Token {
// heximal
if first == '0' {
let second = self.read_char();
if second == 'x' || second == 'X' {
return self.read_heximal();
}
self.putback_char();
}

'A'..='Z' | 'a'..='z' | '_' => { // Name
let mut name = String::new();
name.push(ch);
loop {
match self.read_char() {
'\0' => break,
'_' => name.push('_'),
ch if ch.is_alphanumeric() => name.push(ch),
_ => {
self.input.seek(SeekFrom::Current(-1)).unwrap();
break;
}
}
}
Token::Name(name)
// decimal
let mut n = char::to_digit(first, 10).unwrap() as i64;
loop {
let ch = self.read_char();
if let Some(d) = char::to_digit(ch, 10) {
n = n * 10 + d as i64;
} else if ch == '.' {
return self.read_number_fraction(n);
} else if ch == 'e' || ch == 'E' {
return self.read_number_exp(n as f64);
} else {
self.putback_char();
break;
}
}

_ => panic!("unexpected char: {ch}"),
// check following
let fch = self.read_char();
if fch.is_alphabetic() || fch == '.' {
panic!("malformat number");
} else {
self.putback_char();
}

Token::Integer(n)
}
/// Lexes the fractional digits of a float; the `.` has already been read.
///
/// `i` is the integer part in front of the decimal point. All following
/// decimal digits are consumed, the first non-digit is put back, and
/// `Token::Float(i + 0.digits)` is returned.
fn read_number_fraction(&mut self, i: i64) -> Token {
    let mut n: i64 = 0; // fraction digits read as an integer
    let mut x: f64 = 1.0; // 10^(number of digits stored in `n`)
    let mut exact = true; // false once further digits no longer fit into `n`
    loop {
        let ch = self.read_char();
        match ch.to_digit(10) {
            Some(d) => {
                if exact {
                    // Digits that would overflow `n` are still consumed but
                    // dropped: they lie beyond what an `f64` can represent,
                    // and accumulating them unchecked would overflow the
                    // `i64` on long literals such as 3.14159265358979323846.
                    match n.checked_mul(10).and_then(|v| v.checked_add(i64::from(d))) {
                        Some(next) => {
                            n = next;
                            x *= 10.0;
                        }
                        None => exact = false,
                    }
                }
            }
            None => {
                self.putback_char();
                break;
            }
        }
    }
    Token::Float(i as f64 + n as f64 / x)
}
/// Lexes the exponent part of a number — not implemented yet.
///
/// `read_number` calls this after reading `e` / `E`, passing the digits read
/// so far converted to `f64`; the argument is currently ignored.
///
/// # Panics
///
/// Always panics via `todo!`.
fn read_number_exp(&mut self, _: f64) -> Token {
todo!("lex number exp")
}
/// Lexes a hexadecimal number — not implemented yet.
///
/// `read_number` calls this after the `0x` / `0X` prefix has been read.
///
/// # Panics
///
/// Always panics via `todo!`.
fn read_heximal(&mut self) -> Token {
todo!("lex heximal")
}

fn read_char(&mut self) -> char {
let mut buf: [u8; 1] = [0];
if self.input.read(&mut buf).unwrap() == 1 {
buf[0] as char
} else {
'\0'
fn read_string(&mut self, quote: char) -> Token {
let mut s = String::new();
loop {
match self.read_char() {
'\n' | '\0' => panic!("unfinished string"),
'\\' => todo!("escape"),
ch if ch == quote => break,
ch => s.push(ch),
}
}
Token::String(s)
}

/// Lexes a name or keyword whose first character, `first`, has already been
/// read.
///
/// Consumes all following ASCII letters, ASCII digits and underscores, puts
/// the first other character back, and returns the matching keyword token or
/// `Token::Name` for everything else.
fn read_name(&mut self, first: char) -> Token {
    let mut s = first.to_string();

    loop {
        let ch = self.read_char();
        // ASCII only: `read_char` turns every input byte into a `char`, so
        // the Unicode-aware `is_alphanumeric` would accept bytes >= 0x80
        // (pieces of UTF-8 sequences) as letters and glue them onto the name.
        if ch.is_ascii_alphanumeric() || ch == '_' {
            s.push(ch);
        } else {
            self.putback_char();
            break;
        }
    }

    match s.as_str() { // TODO optimize by hash
        "and" => Token::And,
        "break" => Token::Break,
        "do" => Token::Do,
        "else" => Token::Else,
        "elseif" => Token::Elseif,
        "end" => Token::End,
        "false" => Token::False,
        "for" => Token::For,
        "function" => Token::Function,
        "goto" => Token::Goto,
        "if" => Token::If,
        "in" => Token::In,
        "local" => Token::Local,
        "nil" => Token::Nil,
        "not" => Token::Not,
        "or" => Token::Or,
        "repeat" => Token::Repeat,
        "return" => Token::Return,
        "then" => Token::Then,
        "true" => Token::True,
        "until" => Token::Until,
        "while" => Token::While,
        _ => Token::Name(s),
    }
}

/// Skips a comment; the introducing `--` has already been read.
///
/// A line comment runs up to and including the next newline, or to the end
/// of the input.
///
/// # Panics
///
/// Long comments (`--[`) are not implemented yet and hit `todo!`.
fn read_comment(&mut self) {
    let mut ch = self.read_char();
    if ch == '[' {
        todo!("long comment");
    }
    // line comment: the character just read takes part in the end-of-line
    // test too, so an empty comment (`--` directly followed by a newline)
    // ends right there instead of also swallowing the following line.
    while ch != '\n' && ch != '\0' {
        ch = self.read_char();
    }
}
}
3 changes: 1 addition & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::env;
use std::fs::File;
use std::io::Read;

mod value;
mod bytecode;
Expand All @@ -16,6 +15,6 @@ fn main() {
}
let file = File::open(&args[1]).unwrap();

let proto = parse::load(file);
let proto = parse::ParseProto::load(file);
vm::ExeState::new().execute(&proto);
}
Loading

0 comments on commit 1255caf

Please sign in to comment.