2025-10-13 21:05:03 +11:00
|
|
|
from lexer_token import Token, TokenType, lookup_ident
|
2025-10-13 06:55:35 +11:00
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Lexer:
|
|
|
|
|
def __init__(self, source: str) -> None:
|
|
|
|
|
self.source = source
|
|
|
|
|
|
|
|
|
|
self.position: int = -1
|
|
|
|
|
self.read_position: int = 0
|
|
|
|
|
self.line_no: int = 1
|
|
|
|
|
|
|
|
|
|
self.current_char: str | None = None
|
|
|
|
|
|
|
|
|
|
self.__read_char()
|
|
|
|
|
|
|
|
|
|
def __read_char(self) -> None:
|
|
|
|
|
if self.read_position >= len(self.source):
|
|
|
|
|
self.current_char = None
|
|
|
|
|
else:
|
|
|
|
|
self.current_char = self.source[self.read_position]
|
|
|
|
|
|
|
|
|
|
self.position = self.read_position
|
|
|
|
|
self.read_position += 1
|
|
|
|
|
|
2025-10-14 07:14:53 +11:00
|
|
|
def __peek_char(self) -> str | None:
|
|
|
|
|
if self.read_position >= len(self.source):
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
return self.source[self.read_position]
|
|
|
|
|
|
2025-10-13 06:55:35 +11:00
|
|
|
def __skip_whitespace(self) -> None:
|
|
|
|
|
while self.current_char in [' ', '\t', '\n', '\r']:
|
|
|
|
|
if self.current_char == "\n":
|
|
|
|
|
self.line_no += 1
|
|
|
|
|
|
|
|
|
|
self.__read_char()
|
|
|
|
|
|
|
|
|
|
def __new_token(self, tt: TokenType, literal: Any) -> Token:
|
|
|
|
|
return Token(tt, literal, self.line_no, self.position)
|
|
|
|
|
|
|
|
|
|
def __is_digit(self, char: str) -> bool:
|
|
|
|
|
return "0" <= char and char <= "9"
|
|
|
|
|
|
2025-10-13 21:05:03 +11:00
|
|
|
def __is_letter(self, char: str) -> bool:
|
|
|
|
|
return "a" <= char and char <= "z" or "A" <= char and char <= "Z" or char == "_"
|
|
|
|
|
|
2025-10-13 06:55:35 +11:00
|
|
|
def __read_number(self) -> Token:
|
|
|
|
|
start_pos: int = self.position
|
|
|
|
|
dot_count: int = 0
|
|
|
|
|
|
|
|
|
|
output: str = ""
|
|
|
|
|
|
|
|
|
|
while self.__is_digit(self.current_char) or self.current_char == ".":
|
|
|
|
|
if self.current_char == ".":
|
|
|
|
|
dot_count += 1
|
|
|
|
|
|
|
|
|
|
if dot_count > 1:
|
|
|
|
|
# todo: error message
|
|
|
|
|
return self.__new_token(TokenType.ILLEGAL, self.source[start_pos:self.position])
|
|
|
|
|
|
|
|
|
|
output += self.source[self.position]
|
|
|
|
|
self.__read_char()
|
|
|
|
|
|
|
|
|
|
if self.current_char is None:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if dot_count == 0:
|
|
|
|
|
return self.__new_token(TokenType.INT, int(output))
|
|
|
|
|
else:
|
|
|
|
|
return self.__new_token(TokenType.FLOAT, float(output))
|
|
|
|
|
|
2025-10-13 21:05:03 +11:00
|
|
|
def __read_identifier(self) -> str:
|
|
|
|
|
position = self.position
|
|
|
|
|
while self.current_char is not None and (self.__is_letter(self.current_char) or self.current_char.isalnum()):
|
|
|
|
|
self.__read_char()
|
|
|
|
|
|
|
|
|
|
return self.source[position:self.position]
|
|
|
|
|
|
2025-10-13 06:55:35 +11:00
|
|
|
def next_token(self) -> Token:
|
|
|
|
|
tok: Token = None
|
|
|
|
|
|
|
|
|
|
self.__skip_whitespace()
|
|
|
|
|
|
|
|
|
|
match self.current_char:
|
|
|
|
|
case "+":
|
|
|
|
|
tok = self.__new_token(TokenType.PLUS, self.current_char)
|
|
|
|
|
case "-":
|
|
|
|
|
tok = self.__new_token(TokenType.MINUS, self.current_char)
|
|
|
|
|
case "*":
|
|
|
|
|
tok = self.__new_token(TokenType.ASTERISK, self.current_char)
|
|
|
|
|
case "/":
|
|
|
|
|
tok = self.__new_token(TokenType.SLASH, self.current_char)
|
|
|
|
|
case "^":
|
|
|
|
|
tok = self.__new_token(TokenType.POW, self.current_char)
|
|
|
|
|
case "%":
|
|
|
|
|
tok = self.__new_token(TokenType.MODULUS, self.current_char)
|
2025-10-14 21:23:11 +11:00
|
|
|
case "<":
|
|
|
|
|
# Handle <=
|
|
|
|
|
if self.__peek_char() == "=":
|
|
|
|
|
ch = self.current_char
|
|
|
|
|
self.__read_char()
|
|
|
|
|
tok = self.__new_token(TokenType.LT_EQ, ch + self.current_char)
|
|
|
|
|
else:
|
|
|
|
|
tok = self.__new_token(TokenType.LT, self.current_char)
|
|
|
|
|
case ">":
|
|
|
|
|
# Handle >=
|
|
|
|
|
if self.__peek_char() == "=":
|
|
|
|
|
ch = self.current_char
|
|
|
|
|
self.__read_char()
|
|
|
|
|
tok = self.__new_token(TokenType.GT_EQ, ch + self.current_char)
|
|
|
|
|
else:
|
|
|
|
|
tok = self.__new_token(TokenType.GT, self.current_char)
|
2025-10-13 21:05:03 +11:00
|
|
|
case "=":
|
2025-10-14 21:23:11 +11:00
|
|
|
# Handle ==
|
|
|
|
|
if self.__peek_char() == "=":
|
|
|
|
|
ch = self.current_char
|
|
|
|
|
self.__read_char()
|
|
|
|
|
tok = self.__new_token(TokenType.EQ_EQ, ch + self.current_char)
|
|
|
|
|
else:
|
|
|
|
|
tok = self.__new_token(TokenType.EQ, self.current_char)
|
|
|
|
|
case "!":
|
|
|
|
|
# Handle !=
|
|
|
|
|
if self.__peek_char() == "=":
|
|
|
|
|
ch = self.current_char
|
|
|
|
|
self.__read_char()
|
|
|
|
|
tok = self.__new_token(TokenType.NOT_EQ, ch + self.current_char)
|
|
|
|
|
else:
|
|
|
|
|
# TODO: handle BANG
|
|
|
|
|
tok = self.__new_token(TokenType.ILLEGAL, self.current_char)
|
2025-10-13 06:55:35 +11:00
|
|
|
case "(":
|
|
|
|
|
tok = self.__new_token(TokenType.LPAREN, self.current_char)
|
|
|
|
|
case ")":
|
|
|
|
|
tok = self.__new_token(TokenType.RPAREN, self.current_char)
|
|
|
|
|
case "[":
|
|
|
|
|
tok = self.__new_token(TokenType.LBRACKET, self.current_char)
|
|
|
|
|
case "]":
|
|
|
|
|
tok = self.__new_token(TokenType.RBRACKET, self.current_char)
|
|
|
|
|
case "{":
|
2025-10-14 07:14:53 +11:00
|
|
|
tok = self.__new_token(TokenType.LBRACE, self.current_char)
|
2025-10-13 06:55:35 +11:00
|
|
|
case "}":
|
2025-10-14 07:14:53 +11:00
|
|
|
tok = self.__new_token(TokenType.RBRACE, self.current_char)
|
2025-10-13 17:41:07 +11:00
|
|
|
case ";":
|
|
|
|
|
tok = self.__new_token(TokenType.SEMICOLON, self.current_char)
|
2025-10-13 06:55:35 +11:00
|
|
|
case ":":
|
|
|
|
|
tok = self.__new_token(TokenType.COLON, self.current_char)
|
2025-10-15 07:16:15 +11:00
|
|
|
case ",":
|
|
|
|
|
tok = self.__new_token(TokenType.COMMA, self.current_char)
|
2025-10-15 07:48:38 +11:00
|
|
|
case '"':
|
|
|
|
|
tok = self.__new_token(TokenType.STRING, self.__read_string())
|
2025-10-13 06:55:35 +11:00
|
|
|
case None:
|
|
|
|
|
tok = self.__new_token(TokenType.EOF, "")
|
|
|
|
|
case _:
|
2025-10-13 21:05:03 +11:00
|
|
|
if self.__is_letter(self.current_char):
|
|
|
|
|
literal: str = self.__read_identifier()
|
|
|
|
|
tt: TokenType = lookup_ident(literal)
|
|
|
|
|
tok = self.__new_token(tt, literal)
|
|
|
|
|
return tok
|
|
|
|
|
|
2025-10-13 06:55:35 +11:00
|
|
|
if self.__is_digit(self.current_char):
|
|
|
|
|
tok = self.__read_number()
|
|
|
|
|
return tok
|
|
|
|
|
else:
|
|
|
|
|
tok = self.__new_token(TokenType.ILLEGAL, self.current_char)
|
|
|
|
|
|
|
|
|
|
self.__read_char()
|
2025-10-15 07:48:38 +11:00
|
|
|
return tok
|
|
|
|
|
|
|
|
|
|
def __read_string(self):
|
|
|
|
|
position: int = self.position + 1
|
|
|
|
|
while True:
|
|
|
|
|
self.__read_char()
|
|
|
|
|
if self.current_char == '"' or self.current_char is None:
|
|
|
|
|
break
|
2025-10-15 15:32:40 +11:00
|
|
|
return self.source[position:self.position]
|
2025-10-15 07:48:38 +11:00
|
|
|
|