lexer recode
why didn't I do this sooner, wtf
ImShyMike committed Jan 21, 2025
1 parent e2572dd commit 1a8a91a
Showing 6 changed files with 120 additions and 166 deletions.
2 changes: 1 addition & 1 deletion eryx/__init__.py
@@ -1,3 +1,3 @@
"""Version of the package."""

CURRENT_VERSION = "0.3.12"
CURRENT_VERSION = "0.3.13"
225 changes: 82 additions & 143 deletions eryx/frontend/lexer.py
@@ -3,12 +3,10 @@
from enum import Enum, auto
from typing import Any, Union

from colorama import Fore, init
from colorama import Fore

from eryx.utils.errors import syntax_error

init(autoreset=True)


class TokenType(Enum):
"""All token types in the language."""
@@ -61,19 +59,40 @@ class TokenType(Enum):
EOF = auto()


class Token:
"""Token class."""

def __init__(
self, value: Any, token_type: TokenType, position: Union[int, tuple[int, int]]
):
self.value = value
self.type = token_type
self.position = position

def __repr__(self) -> str:
return f'Token("{self.value}", {self.type.name}, {self.position})'
SINGLE_CHAR_TOKENS = {
"(": TokenType.OPEN_PAREN,
")": TokenType.CLOSE_PAREN,
"{": TokenType.OPEN_BRACE,
"}": TokenType.CLOSE_BRACE,
"[": TokenType.OPEN_BRACKET,
"]": TokenType.CLOSE_BRACKET,
"+": TokenType.BINARY_OPERATOR,
"*": TokenType.BINARY_OPERATOR,
"/": TokenType.BINARY_OPERATOR,
"%": TokenType.BINARY_OPERATOR,
"^": TokenType.BINARY_OPERATOR,
";": TokenType.SEMICOLON,
",": TokenType.COMMA,
":": TokenType.COLON,
".": TokenType.DOT,
"=": TokenType.EQUALS,
"<": TokenType.BINARY_OPERATOR,
">": TokenType.BINARY_OPERATOR,
"&": TokenType.BINARY_OPERATOR,
"|": TokenType.BINARY_OPERATOR,
}

DOUBLE_CHAR_TOKENS = {
"==": TokenType.BINARY_OPERATOR,
"!=": TokenType.BINARY_OPERATOR,
"<=": TokenType.BINARY_OPERATOR,
">=": TokenType.BINARY_OPERATOR,
"&&": TokenType.BINARY_OPERATOR,
"||": TokenType.BINARY_OPERATOR,
"<<": TokenType.BINARY_OPERATOR,
">>": TokenType.BINARY_OPERATOR,
"**": TokenType.BINARY_OPERATOR,
}

KEYWORDS = {
"let": TokenType.LET,
@@ -97,6 +116,20 @@ def __repr__(self) -> str:
}


class Token:
"""Token class."""

def __init__(
self, value: Any, token_type: TokenType, position: Union[int, tuple[int, int]]
):
self.value = value
self.type = token_type
self.position = position

def __repr__(self) -> str:
return f'Token("{self.value}", {self.type.name}, {self.position})'


def is_skipable(char: str) -> bool:
"""Check if a character is a skipable character."""
return char in (
@@ -109,110 +142,69 @@ def is_skipable(char: str) -> bool:

def tokenize(source_code: str) -> list[Token]:
"""Tokenize the source code."""
tokens = []
source_size = len(source_code)
tokens = [] # Initialize the tokens list
src = list(source_code)
current_pos = -1
comment = False # Comment flag

while len(src) > 0:
negative_num = False # Negative number flag
current_pos = source_size - len(src) # Current position in the source code
negative_num = False # Reset the negative number flag
current_pos += 1 # Increment the current position

# Skip comments
if comment:
if src[0] in ("\n", "\r", ";"):
comment = False
src.pop(0)
continue

single_char_tokens = {
"(": TokenType.OPEN_PAREN,
")": TokenType.CLOSE_PAREN,
"{": TokenType.OPEN_BRACE,
"}": TokenType.CLOSE_BRACE,
"[": TokenType.OPEN_BRACKET,
"]": TokenType.CLOSE_BRACKET,
"+": TokenType.BINARY_OPERATOR,
"*": TokenType.BINARY_OPERATOR,
"/": TokenType.BINARY_OPERATOR,
"%": TokenType.BINARY_OPERATOR,
"^": TokenType.BINARY_OPERATOR,
";": TokenType.SEMICOLON,
",": TokenType.COMMA,
":": TokenType.COLON,
".": TokenType.DOT,
}

# Check for single character tokens first
if src[0] in single_char_tokens:
token = src.pop(0)

# Power operator
if token == "*" and len(src) > 0 and src[0] == "*":
src.pop(0)
tokens.append(Token("**", TokenType.BINARY_OPERATOR, current_pos))
continue

# Single character token
tokens.append(Token(token, single_char_tokens[token], current_pos))
continue

# Check for comments
if src[0] == "#":
comment = True
# Skip skipable characters
if is_skipable(src[0]): # spaces, newlines, tabs, and carriage returns
src.pop(0)
continue

# Bitwise operators
if src[0] == ">" and len(src) > 1 and src[1] == ">":
src.pop(0)
src.pop(0)
tokens.append(Token(">>", TokenType.BINARY_OPERATOR, current_pos))
# Check for double character tokens first
if len(src) > 1 and src[0] + src[1] in DOUBLE_CHAR_TOKENS:
token = src.pop(0) + src.pop(0)
tokens.append(
Token(token, DOUBLE_CHAR_TOKENS[token], (current_pos, current_pos + 1))
)
continue

if src[0] == "<" and len(src) > 1 and src[1] == "<":
src.pop(0)
src.pop(0)
tokens.append(Token("<<", TokenType.BINARY_OPERATOR, current_pos))
continue
# Check for single character tokens
if src[0] in SINGLE_CHAR_TOKENS:
token = src.pop(0)

if src[0] == "&":
src.pop(0)
if len(src) > 0 and src[0] == "&":
src.pop(0)
tokens.append(Token("&&", TokenType.BINARY_OPERATOR, current_pos))
else:
tokens.append(Token("&", TokenType.BINARY_OPERATOR, current_pos))
# Single character token
tokens.append(Token(token, SINGLE_CHAR_TOKENS[token], current_pos))
continue

if src[0] == "|":
# Check for comments
if src[0] == "#":
comment = True
src.pop(0)
if len(src) > 0 and src[0] == "|":
src.pop(0)
tokens.append(Token("||", TokenType.BINARY_OPERATOR, current_pos))
else:
tokens.append(Token("|", TokenType.BINARY_OPERATOR, current_pos))
continue

# If it's not a single character token, check for negative numbers
# If it's not a single/double character token, check for negative numbers/variables
if src[0] == "-":
if len(src) > 0 and (src[1].isdigit() or src[1].isalpha() or src[1] == "_"):
negative_num = True # Set negative number flag
src.pop(0)
else:
# If it's not a negative number, it's a "-" operator
tokens.append(Token(src.pop(0), TokenType.BINARY_OPERATOR, current_pos))
continue

# If its a negative number, remove the negative sign
if negative_num:
src.pop(0)

# Check for multi character tokens
if src[0].isdigit(): # Number
start_pos = current_pos
end_pos = start_pos + (1 if negative_num else 0)
end_pos = start_pos
number = src.pop(0)

if negative_num:
end_pos += 1
number = "-" + number # Add negative sign to the number

dots = 0
while len(src) > 0 and (src[0].isdigit() or src[0] == "."):
if src[0] == ".":
@@ -233,11 +225,12 @@ def tokenize(source_code: str) -> list[Token]:
end_pos += 1
identifier += src.pop(0)

if identifier in KEYWORDS:
if identifier in KEYWORDS: # Check if the identifier is a keyword
tokens.append(
Token(identifier, KEYWORDS[identifier], (start_pos, end_pos))
)
else:

else: # If it's not a keyword, it's an identifier
if negative_num: # Fake a unary minus operator
tokens.append(
Token("(", TokenType.OPEN_PAREN, (start_pos, end_pos))
@@ -256,72 +249,17 @@ def tokenize(source_code: str) -> list[Token]:
Token(")", TokenType.CLOSE_PAREN, (start_pos, end_pos))
)

elif is_skipable(src[0]): # Skip spaces, newlines, tabs, and carriage returns
src.pop(0)

elif src[0] == '"': # String
start_pos = current_pos
end_pos = start_pos
src.pop(0)
src.pop(0) # Remove the opening quote
string = ""
while len(src) > 0 and src[0] != '"':
end_pos += 1
string += src.pop(0)
src.pop(0)
src.pop(0) # Remove the closing quote
tokens.append(Token(string, TokenType.STRING, (start_pos, end_pos + 1)))

elif src[0] in ("=", "<", ">"): # Binary operator
if len(src) > 1:
if src[0] == "=" and src[1] == "=":
tokens.append(
Token(
"==",
TokenType.BINARY_OPERATOR,
(current_pos, current_pos + 1),
)
)
src.pop(0)
src.pop(0)
continue

if src[0] == "<" and src[1] == "=":
tokens.append(
Token(
"<=",
TokenType.BINARY_OPERATOR,
(current_pos, current_pos + 1),
)
)
src.pop(0)
src.pop(0)
continue

if src[0] == ">" and src[1] == "=":
tokens.append(
Token(
">=",
TokenType.BINARY_OPERATOR,
(current_pos, current_pos + 1),
)
)
src.pop(0)
src.pop(0)
continue

if src[0] in ("<", ">"):
tokens.append(Token(src.pop(0), TokenType.BINARY_OPERATOR, current_pos))
continue

if src[0] == "=":
tokens.append(Token(src.pop(0), TokenType.EQUALS, current_pos))

elif src[0] == "!" and len(src) > 1 and src[1] == "=": # Binary operator
tokens.append(
Token("!=", TokenType.BINARY_OPERATOR, (current_pos, current_pos + 1))
)
src.pop(0)
src.pop(0)

else:
# If this is reached, it's an unknown character
syntax_error(
@@ -330,6 +268,7 @@
f"Unknown character found in source '{Fore.MAGENTA}{src.pop(0)}{Fore.RESET}'",
)

tokens.append(Token("EOF", TokenType.EOF, source_size - len(src)))
# Add the final EOF token
tokens.append(Token("EOF", TokenType.EOF, current_pos + 1))

return tokens
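
A minimal usage sketch of the reworked lexer, for reference. The `tokenize` signature and the `Token` repr format are taken from the diff above; the exact `NUMBER` and `IDENTIFIER` type names and the output shape are assumptions inferred from the surrounding code, so treat the expected output as approximate:

```python
from eryx.frontend.lexer import tokenize

# "**" is matched by the DOUBLE_CHAR_TOKENS lookup before "*" can be
# consumed as a single-character token, so it lexes as one operator.
for token in tokenize("let x = 3 ** 2;"):
    print(token)

# Expected output, roughly (positions elided):
#   Token("let", LET, ...)
#   Token("x", IDENTIFIER, ...)
#   Token("=", EQUALS, ...)
#   Token("3", NUMBER, ...)
#   Token("**", BINARY_OPERATOR, ...)
#   Token("2", NUMBER, ...)
#   Token(";", SEMICOLON, ...)
#   Token("EOF", EOF, ...)
```

The design win is visible in the deleted branches: the old ad-hoc special cases for `**`, `<<`, `>>`, `&&`, `||`, and the comparison operators all collapse into a single DOUBLE_CHAR_TOKENS lookup that is tried before the single-character table.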
2 changes: 1 addition & 1 deletion eryx/frontend/parser.py
@@ -236,7 +236,7 @@ def parse_primary_expression(self) -> Expression:
self.next() # Skip the semicolon
return Expression()
case _:
syntax_error(self.source_code, token.position, "Unexpected token.")
syntax_error(self.source_code, token.position, f"Unexpected token. {token}")
return Expression() # This will never be reached

def parse_assignment_expression(self) -> Expression:
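
The parser tweak is small but handy for debugging: the fallback branch now appends the offending token's repr to the message, so — assuming `syntax_error` renders its third argument verbatim — a stray token would surface as something like `Unexpected token. Token(")", CLOSE_PAREN, 7)` (hypothetical input and position).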
4 changes: 0 additions & 4 deletions eryx/runtime/repl.py
@@ -2,15 +2,11 @@

import sys

from colorama import init

from eryx.__init__ import CURRENT_VERSION
from eryx.frontend.parser import Parser
from eryx.runtime.environment import Environment
from eryx.runtime.runner import run_code

init(autoreset=True)


def start_repl(
log_ast: bool = False, log_result: bool = False, log_tokens: bool = False