From 07ca0fcc3268accc250bd6b555363f54485ef140 Mon Sep 17 00:00:00 2001 From: SpookyDervish <78246495+SpookyDervish@users.noreply.github.com> Date: Mon, 13 Oct 2025 06:55:35 +1100 Subject: [PATCH] started work on lexer --- lexer.py | 106 +++++++++++++++++++++++++++++++++++++++++++ lexer_token.py | 42 +++++++++++++++++ main.py | 13 ++++++ tests/helloWorld.pla | 2 +- tests/lexer.pla | 6 +++ tests/types.pla | 18 ++++++++ tests/variables.pla | 12 +++-- 7 files changed, 193 insertions(+), 6 deletions(-) create mode 100644 lexer.py create mode 100644 lexer_token.py create mode 100644 main.py create mode 100644 tests/lexer.pla create mode 100644 tests/types.pla diff --git a/lexer.py b/lexer.py new file mode 100644 index 0000000..795b8ef --- /dev/null +++ b/lexer.py @@ -0,0 +1,106 @@ +from lexer_token import Token, TokenType +from typing import Any + + +class Lexer: + def __init__(self, source: str) -> None: + self.source = source + + self.position: int = -1 + self.read_position: int = 0 + self.line_no: int = 1 + + self.current_char: str | None = None + + self.__read_char() + + def __read_char(self) -> None: + if self.read_position >= len(self.source): + self.current_char = None + else: + self.current_char = self.source[self.read_position] + + self.position = self.read_position + self.read_position += 1 + + def __skip_whitespace(self) -> None: + while self.current_char in [' ', '\t', '\n', '\r']: + if self.current_char == "\n": + self.line_no += 1 + + self.__read_char() + + def __new_token(self, tt: TokenType, literal: Any) -> Token: + return Token(tt, literal, self.line_no, self.position) + + def __is_digit(self, char: str) -> bool: + return "0" <= char and char <= "9" + + def __read_number(self) -> Token: + start_pos: int = self.position + dot_count: int = 0 + + output: str = "" + + while self.__is_digit(self.current_char) or self.current_char == ".": + if self.current_char == ".": + dot_count += 1 + + if dot_count > 1: + # todo: error message + return self.__new_token(TokenType.ILLEGAL, self.source[start_pos:self.position]) + + output += self.source[self.position] + self.__read_char() + + if self.current_char is None: + break + + if dot_count == 0: + return self.__new_token(TokenType.INT, int(output)) + else: + return self.__new_token(TokenType.FLOAT, float(output)) + + def next_token(self) -> Token: + tok: Token = None + + self.__skip_whitespace() + + match self.current_char: + case "+": + tok = self.__new_token(TokenType.PLUS, self.current_char) + case "-": + tok = self.__new_token(TokenType.MINUS, self.current_char) + case "*": + tok = self.__new_token(TokenType.ASTERISK, self.current_char) + case "/": + tok = self.__new_token(TokenType.SLASH, self.current_char) + case "^": + tok = self.__new_token(TokenType.POW, self.current_char) + case "%": + tok = self.__new_token(TokenType.MODULUS, self.current_char) + case "(": + tok = self.__new_token(TokenType.LPAREN, self.current_char) + case ")": + tok = self.__new_token(TokenType.RPAREN, self.current_char) + case "[": + tok = self.__new_token(TokenType.LBRACKET, self.current_char) + case "]": + tok = self.__new_token(TokenType.RBRACKET, self.current_char) + case "{": + tok = self.__new_token(TokenType.LCURLY, self.current_char) + case "}": + tok = self.__new_token(TokenType.RCURLY, self.current_char) + case ":": + tok = self.__new_token(TokenType.COLON, self.current_char) + case None: + tok = self.__new_token(TokenType.EOF, "") + case _: + if self.__is_digit(self.current_char): + tok = self.__read_number() + return tok + else: + tok = self.__new_token(TokenType.ILLEGAL, self.current_char) + + self.__read_char() + return tok \ No newline at end of file diff --git a/lexer_token.py b/lexer_token.py new file mode 100644 index 0000000..8d1f9cf --- /dev/null +++ b/lexer_token.py @@ -0,0 +1,42 @@ +from enum import Enum +from typing import Any + + +class TokenType(Enum): + # Special tokens + EOF = "EOF" + ILLEGAL = "ILLEGAL" + + # Data types + INT = "INT" + FLOAT = "FLOAT" + + # Arithmetic symbols + PLUS = "PLUS" + MINUS = "MINUS" + ASTERISK = "ASTERISK" + SLASH = "SLASH" + POW = "POW" + MODULUS = "MODULUS" + + # Symbols + LPAREN = "LPAREN" + RPAREN = "RPAREN" + LBRACKET = "LBRACKET" + RBRACKET = "RBRACKET" + LCURLY = "LCURLY" + RCURLY = "RCURLY" + COLON = "COLON" + +class Token: + def __init__(self, type: TokenType, literal: Any, line_no: int, position: int) -> None: + self.type = type + self.literal = literal + self.line_no = line_no + self.position = position + + def __str__(self) -> str: + return f"token[{self.type} : {self.literal} : Line {self.line_no} : Position {self.position}]" + + def __repr__(self) -> str: + return str(self) \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..6262818 --- /dev/null +++ b/main.py @@ -0,0 +1,13 @@ +from lexer import Lexer + +LEXER_DEBUG: bool = True + + +if __name__ == "__main__": + with open("tests/lexer.pla") as f: + code: str = f.read() + + if LEXER_DEBUG: + debug_lex: Lexer = Lexer(source=code) + while debug_lex.current_char is not None: + print(debug_lex.next_token()) \ No newline at end of file diff --git a/tests/helloWorld.pla b/tests/helloWorld.pla index 2dbfefe..165f105 100644 --- a/tests/helloWorld.pla +++ b/tests/helloWorld.pla @@ -1,2 +1,2 @@ depend "io.pla" -write("Hello, World!") \ No newline at end of file +print("Hello, World!") \ No newline at end of file diff --git a/tests/lexer.pla b/tests/lexer.pla new file mode 100644 index 0000000..11747db --- /dev/null +++ b/tests/lexer.pla @@ -0,0 +1,6 @@ +123 +0.456 +[] +{} +(1 + 3 * 2 ^ 4) % 2 +2 - 1 \ No newline at end of file diff --git a/tests/types.pla b/tests/types.pla new file mode 100644 index 0000000..0b29969 --- /dev/null +++ b/tests/types.pla @@ -0,0 +1,18 @@ +depend "io.pla" + +enum Gender { + male, + female +} + +struct Person { + name: String, + age: Int = 0, + speak: Func +} + +speak = Func(sentence: String): Nil { + print(sentence) +} + +max: Person = {"Max", 17, } \ No newline at end of file diff --git a/tests/variables.pla b/tests/variables.pla index 65c9739..b43cc42 100644 --- a/tests/variables.pla +++ b/tests/variables.pla @@ -6,8 +6,10 @@ myBoolean: Bool = true myString: String = "Hello!\n" myList: List = [1, "hi", true, [1, 2, 3], 0.789] -write(String(myInt)) -write(String(myDecimal)) -write(String(myBoolean)) -write(myString) -write(String(myList)) \ No newline at end of file +MY_CONSTANT: Const(String) = "foo bar" + +print(String(myInt)) +print(String(myDecimal)) +print(String(myBoolean)) +print(myString) +print(String(myList)) \ No newline at end of file