From eee9325ab85561e634dff4911db8c903b36e24aa Mon Sep 17 00:00:00 2001
From: SpookyDervish <78246495+SpookyDervish@users.noreply.github.com>
Date: Mon, 1 Sep 2025 06:44:33 +1000
Subject: [PATCH] working on generating an AST with scope and stuff

---
 error.py        |   3 +-
 ground_ast.py   | 133 ++++++++++++++++++++++++++++++++++++++++++++++++
 ground_types.py |   3 ++
 main.py         |   6 ++-
 test2.grnd      |  16 ++++--
 tokenizer.py    |  69 ++++++++++++-------------
 6 files changed, 187 insertions(+), 43 deletions(-)
 create mode 100644 ground_ast.py
 create mode 100644 ground_types.py

diff --git a/error.py b/error.py
index 959f52a..df8a47f 100644
--- a/error.py
+++ b/error.py
@@ -1,4 +1,5 @@
 from console import console
+from sys import exit
 
 
 def traceback(code: str, error_type: str, error_message: str, line: int | None = None, start_column: int | None = None, end_column: int | None = None):
@@ -19,5 +20,5 @@ def traceback(code: str, error_type: str, error_message: str, line: int | None =
         else:
             console.print(f"[bold red]{error_type}: {error_message}")
 
-    
+    exit(1)
\ No newline at end of file
diff --git a/ground_ast.py b/ground_ast.py
new file mode 100644
index 0000000..fbf8072
--- /dev/null
+++ b/ground_ast.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from tokenizer import Token, TokenType
+from typing import Optional, Any
+from error import traceback
+
+
+@dataclass
+class RootNode:
+    statements: list[Any]
+
+@dataclass
+class InstructionNode:
+    instruction: str
+    parent: FunctionNode | RootNode
+    arguments: list[Any]
+
+@dataclass
+class StringNode:
+    value: str
+
+@dataclass
+class NumberNode:
+    value: float
+
+@dataclass
+class VarRefNode:
+    var_name: str
+
+@dataclass
+class VarPointerNode:
+    var_name: str
+
+@dataclass
+class FunctionCallNode:
+    func_name: str
+
+@dataclass
+class TypeNode:
+    value: str
+
+@dataclass
+class ArgNode:
+    arg_type: str
+    name: str | None
+    parent: FunctionNode
+
+@dataclass
+class FunctionNode:
+    args: list[ArgNode]
+    statements: list[Any]
+    parent: FunctionNode | RootNode
+    return_type: Optional[str] = None
+    name: Optional[str] = None
+
+
+def generate_ast(tokens: list[Token], code: str) -> RootNode:
+    root_node = RootNode([])
+
+    current_node = None
+    last_token = None
+    current_node_type = None
+    scope = root_node
+
+    # todo: this is the absolute WORST way i could do this, but i could not care less lmao
+    # its not even performant......
+    for token in tokens:
+        print(token)
+        if token.type == TokenType.INSTRUCTION:
+            if current_node:
+                scope.statements.append(current_node)
+
+            if token.value != "fun":
+                if current_node_type == "func":
+                    scope = current_node
+
+                current_node = InstructionNode(token.value, scope, [])
+                current_node_type = "inst"
+
+                if current_node.instruction == "endfun":
+                    scope = scope.parent # go up one scope
+                    current_node.parent = scope
+            else:
+                current_node = FunctionNode([], [], scope)
+                current_node_type = "func"
+
+        if current_node:
+            if token.type == TokenType.STRING:
+                if current_node_type == "inst":
+                    current_node.arguments.append(StringNode(token.value))
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction, not string.")
+
+            elif token.type == TokenType.INTEGER or token.type == TokenType.FLOAT:
+                if current_node_type == "inst":
+                    current_node.arguments.append(NumberNode(token.value))
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction, not number.")
+
+            elif token.type == TokenType.VARIABLE_POINTER:
+                if current_node_type == "inst":
+                    current_node.arguments.append(VarPointerNode(token.value))
+                elif last_token and last_token.type == TokenType.TYPE and current_node_type == "func":
+                    print(current_node)
+                    current_node.args[-1].name = token.value
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction, not variable pointer.")
+
+            elif token.type == TokenType.VARIABLE_REFERENCE:
+                if current_node_type == "inst":
+                    current_node.arguments.append(VarRefNode(token.value))
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction, not variable reference.")
+
+            elif token.type == TokenType.TYPE:
+                if current_node_type == "inst":
+                    current_node.arguments.append(TypeNode(token.value))
+                elif current_node_type == "func":
+                    if (last_token and last_token.type == TokenType.FUNCTION_REFERENCE) or current_node.return_type:
+                        current_node.args.append(ArgNode(
+                            arg_type=token.value,
+                            name=None,
+                            parent=current_node
+                        ))
+                    else:
+                        current_node.return_type = token.value
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction, not type.")
+
+            elif token.type == TokenType.FUNCTION_REFERENCE:
+                if last_token and last_token.type == TokenType.TYPE and current_node_type == "func":
+                    current_node.name = token.value
+                elif current_node_type == "inst":
+                    current_node.arguments.append(FunctionCallNode(token.value))
+                else:
+                    traceback(code, "SyntaxError", "Expected instruction or function return type, got function reference.")
+
+            elif token.type == TokenType.EOF:
+                root_node.statements.append(current_node)
+
+        last_token = token
+
+    return root_node
\ No newline at end of file
diff --git a/ground_types.py b/ground_types.py
new file mode 100644
index 0000000..07a1fa0
--- /dev/null
+++ b/ground_types.py
@@ -0,0 +1,3 @@
+class String:
+    def __init__(self, value: str):
+        self.value = value
\ No newline at end of file
diff --git a/main.py b/main.py
index fdbd177..6de7d9f 100644
--- a/main.py
+++ b/main.py
@@ -1,17 +1,19 @@
 from tokenizer import tokenize
+from ground_ast import generate_ast
 from rich import print
 from time import time
 
 def main():
     start = time()
 
-    file = open("test.grnd", "r")
+    file = open("test2.grnd", "r")
     code = file.read()
     file.close()
 
     tokens = tokenize(code)
+    ast = generate_ast(tokens, code)
 
     compile_time = time()-start
 
-    print(tokens)
+    print(ast)
     print(f"Compiled in {compile_time} seconds.")
diff --git a/test2.grnd b/test2.grnd
index 16717a5..79d3f11 100644
--- a/test2.grnd
+++ b/test2.grnd
@@ -1,4 +1,12 @@
-set &x 0
-@loop
-add $x 1 &x
-jump %loop
\ No newline at end of file
+set &myName "Nathaniel"
+set &myAge 10
+
+fun -list !split -string &str -string &determiner
+set &x 2
+set &y 5
+add $x $y &x
+stdlnout $x
+endfun
+
+# should error
+stdlnout $x
\ No newline at end of file
diff --git a/tokenizer.py b/tokenizer.py
index 9f5fe05..1ceac55 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -3,7 +3,6 @@ from typing import Any
 from enum import Enum
 from error import traceback
 from string import ascii_letters, digits, whitespace
-import sys
 
 
 class TokenType(Enum):
@@ -20,6 +19,7 @@ class TokenType(Enum):
     COMMENT = 11 # example: # hi there
     LINE_REFERENCE = 12 # example: %12
     LABEL_REFERENCE = 13 # example: %myLabel
+    EOF = 14
 
 @dataclass
 class Token:
@@ -42,10 +42,6 @@ def tokenize(input_string: str):
         "extern", "jump", "gettype", "exists", "setlist", "setlistat",
         "getlistat", "getlistsize", "listappend", "if"
     ]
 
-    types = [
-        "string", "bool", "list", "char", "int", "double"
-    ]
-
     while pos < len(input_string):
         current_char = input_string[pos]
@@ -73,17 +69,17 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a variable name, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char in digits or current_char == ".":
                 traceback(input_string, "SyntaxError", "Variable names can't start with numbers.", line, column, column)
-                sys.exit(1)
+
             elif current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a variable name", line, column, column)
-                sys.exit(1)
+
             while pos < len(input_string):
                 current_char = input_string[pos]
@@ -113,17 +109,17 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a variable name, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char in digits or current_char == ".":
                 traceback(input_string, "SyntaxError", "Variable names can't start with numbers.", line, column, column)
-                sys.exit(1)
+
             elif current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a variable name", line, column, column)
-                sys.exit(1)
+
             while pos < len(input_string):
                 current_char = input_string[pos]
@@ -152,17 +148,17 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a function name, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char in digits or current_char == ".":
                 traceback(input_string, "SyntaxError", "Function names can't start with numbers.", line, column, column)
-                sys.exit(1)
+
             elif current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a function name.", line, column, column)
-                sys.exit(1)
+
             while pos < len(input_string):
                 current_char = input_string[pos]
@@ -191,17 +187,17 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a list reference, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char in digits or current_char == ".":
                 traceback(input_string, "SyntaxError", "List references can't start with numbers.", line, column, column)
-                sys.exit(1)
+
             elif current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a list reference.", line, column, column)
-                sys.exit(1)
+
             while pos < len(input_string):
                 current_char = input_string[pos]
@@ -230,14 +226,14 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a type name, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a type", line, column, column)
-                sys.exit(1)
+
             start_col = column
 
             while pos < len(input_string):
@@ -255,14 +251,10 @@ def tokenize(input_string: str):
                 pos += 1
                 column += 1
 
-            if current_token in types:
-                tokens.append(Token(
-                    TokenType.TYPE,
-                    value=current_token
-                ))
-            else:
-                traceback(input_string, "SyntaxError", f"\"{current_token}\" is not a valid type.", line, start_col, column)
-                sys.exit(1)
+            tokens.append(Token(
+                TokenType.TYPE,
+                value=current_token
+            ))
 
             current_token = ""
         elif current_char == "@":
@@ -271,14 +263,14 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a label decleration, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a label decleration", line, column, column)
-                sys.exit(1)
+
             start_col = column
 
             while pos < len(input_string):
@@ -308,12 +300,12 @@ def tokenize(input_string: str):
 
             if len(input_string) == pos:
                 traceback(input_string, "SyntaxError", "Expected a label or line reference, got ", line, column, column)
-                sys.exit(1)
+
             current_char = input_string[pos]
 
             if current_char == "\n":
                 traceback(input_string, "SyntaxError", "Expected a label or line reference", line, column, column)
-                sys.exit(1)
+
             start_col = column
 
             if current_char in digits or current_char == ".": # its a line number reference
@@ -329,7 +321,7 @@ def tokenize(input_string: str):
                     break
                 if not current_char in digits: # random ass character in the middle of the line number
                     traceback(input_string, "SyntaxError", "Malformed line number.", line, start_col, column)
-                    sys.exit(1)
+
                 current_token += current_char
 
                 pos += 1
@@ -374,7 +366,7 @@ def tokenize(input_string: str):
                 column += 1
                 if pos > len(input_string)-1:
                     traceback(input_string, "SyntaxError", f"String was never closed.", line, start_col, start_col)
-                    sys.exit(1)
+
                 current_char = input_string[pos]
 
             tokens.append(Token(
@@ -407,7 +399,7 @@ def tokenize(input_string: str):
                 ))
             else:
                 traceback(input_string, "SyntaxError", f"\"{current_token}\" isn't a valid instruction.", line, start_col, column)
-                sys.exit(1)
+
             current_token = ""
         elif current_char in digits:
@@ -427,7 +419,7 @@ def tokenize(input_string: str):
 
                 if not current_char in digits:
                     traceback(input_string, "SyntaxError", "Malformed number.", line, start_col, column)
-                    sys.exit(1)
+
                 current_token += current_char
 
@@ -451,9 +443,14 @@ def tokenize(input_string: str):
             continue
         else:
             traceback(input_string, "SyntaxError", f"Unkown token \"{current_char}\"", line, column, column)
-            sys.exit(1)
+
         column += 1
         pos += 1
 
+    tokens.append(Token(
+        TokenType.EOF,
+        None
+    ))
+
     return tokens
\ No newline at end of file
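
Note (a sketch, not part of the patch): the snippet below wires the new pieces together the same way main.py now does, then peeks at the top-level nodes. It uses only names this patch defines; the loop over ast.statements and the printed shapes are assumptions about how the WIP AST comes out, not a spec.

    # sketch: tokenize, build the AST, then inspect the top-level nodes
    from tokenizer import tokenize
    from ground_ast import generate_ast, FunctionNode, InstructionNode

    with open("test2.grnd", "r") as file:  # the sample program this patch adds
        code = file.read()

    tokens = tokenize(code)           # the token stream now ends with an EOF token
    ast = generate_ast(tokens, code)  # a RootNode whose statements hold the tree

    for node in ast.statements:
        if isinstance(node, FunctionNode):
            # for !split above, this should give roughly: name "split",
            # return_type "list", args [("string", "str"), ("string", "determiner")]
            print(node.name, node.return_type, [(a.arg_type, a.name) for a in node.args])
        elif isinstance(node, InstructionNode):
            print(node.instruction, node.arguments)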
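
A related sketch: the parent links threaded through InstructionNode and FunctionNode form the scope chain this commit is building toward, which is what would let a later pass catch the "# should error" case in test2.grnd by climbing toward RootNode. The helper below is hypothetical, not something the patch defines.

    # sketch: yield a node's enclosing scopes, innermost first, ending at RootNode
    from ground_ast import RootNode

    def enclosing_scopes(node):
        scope = node.parent
        while not isinstance(scope, RootNode):
            yield scope        # a FunctionNode acting as an inner scope
            scope = scope.parent
        yield scope            # the RootNode, i.e. the global scope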