From 8d3ffb7b9c9953b339f38097c1700ca4e9185f15 Mon Sep 17 00:00:00 2001
From: Maxwell Jeffress
Date: Tue, 2 Sep 2025 20:41:17 +1000
Subject: [PATCH] Abstract syntax tree

---
Reviewer notes (text in this area is ignored by `git am`):
- This copy was rebuilt from a version whose line breaks had been collapsed.
  Indent width and the position of blank context lines are a best guess;
  run `git apply --check` against the base commit before applying.
- ast.py: expressions now split at the loosest-binding operator outside
  parentheses and group left to right; a call must be closed by the final
  token; an enclosing `(...)` is unwrapped; `end` matching is nesting-aware;
  `while` is built like `if`; nested syntax errors report absolute lines.
- main.py: the tree is bound to `tree` so the imported `ast` module is not
  rebound.
- The `index` hashes for ast.py and main.py predate these amendments.
- NOTE(review): ast.py and token.py share names with standard-library
  modules; consider renaming them in a follow-up.

 ast.py          | 226 ++++++++++++++++++++++++++++++++++++++++++++++++
 main.py         |   6 ++-
 preprocessor.py |  82 ++++++++++++++++++-----------
 test.high       |   2 +-
 token.py        |  45 ++++++++++------
 5 files changed, 311 insertions(+), 50 deletions(-)
 create mode 100644 ast.py

diff --git a/ast.py b/ast.py
new file mode 100644
index 0000000..8890c52
--- /dev/null
+++ b/ast.py
@@ -0,0 +1,226 @@
+import token
+from enum import Enum
+
+class AstType(Enum):
+    CONSTANT = 0
+    VARIABLE = 1
+    COMPARITOR = 2
+    OPERATOR = 3
+    IF = 4
+    WHILE = 5
+    BODY = 6
+    SET = 7
+    ROOT = 8
+    FUNC_CALL = 9
+
+operators = [token.TokenType.ADD, token.TokenType.SUBTRACT, token.TokenType.MULTIPLY, token.TokenType.DIVIDE]
+comparitors = [token.TokenType.EQUAL, token.TokenType.GREATER, token.TokenType.LESSER]
+constants = [token.TokenType.INTEGER, token.TokenType.DOUBLE, token.TokenType.STRING, token.TokenType.CHAR, token.TokenType.BOOLEAN]
+
+# Binary operator groups, loosest binding first.
+# NOTE(review): assumes the usual precedence (comparison < add/subtract < multiply/divide) - confirm.
+precedence_levels = [
+    comparitors,
+    [token.TokenType.ADD, token.TokenType.SUBTRACT],
+    [token.TokenType.MULTIPLY, token.TokenType.DIVIDE],
+]
+
+# Keywords whose body runs up to a matching lone 'end' line.
+# NOTE(review): assumes 'while' is closed by 'end' the same way 'if' is - confirm.
+block_openers = [token.TokenType.IF, token.TokenType.WHILE]
+
+class AstNode:
+    """Tree node wrapping one token; `type` is the AstType mapped from the token type (None if unmapped)."""
+
+    def __init__(self, intok: token.Token):
+        self.children = []
+        self.tok = intok
+        if intok.type == token.TokenType.ROOT:
+            self.type = AstType.ROOT
+        elif intok.type in operators:
+            self.type = AstType.OPERATOR
+        elif intok.type in comparitors:
+            self.type = AstType.COMPARITOR
+        elif intok.type in constants:
+            self.type = AstType.CONSTANT
+        elif intok.type == token.TokenType.VARIABLE:
+            self.type = AstType.VARIABLE
+        elif intok.type == token.TokenType.IF:
+            self.type = AstType.IF
+        elif intok.type == token.TokenType.WHILE:
+            self.type = AstType.WHILE
+        elif intok.type == token.TokenType.SET:
+            self.type = AstType.SET
+        else:
+            self.type = None
+
+    def __repr__(self, level=0):
+        # One node per line; each child is indented one tab deeper than its parent.
+        ret = "\t" * level + f"AstNode(type={self.type}, token={self.tok})\n"
+        for child in self.children:
+            ret += child.__repr__(level + 1)
+        return ret
+
+def find_matching_close(line_toks: list[token.Token], open_index: int) -> int:
+    """Return the index of the ')' that closes the '(' at open_index, or -1 if it is never closed."""
+    depth = 0
+    for i in range(open_index, len(line_toks)):
+        if line_toks[i].type == token.TokenType.PARAMOPEN:
+            depth += 1
+        elif line_toks[i].type == token.TokenType.PARAMCLOSE:
+            depth -= 1
+            if depth == 0:
+                return i
+    return -1
+
+def find_split_index(line_toks: list[token.Token], level: list[token.TokenType]) -> int:
+    """Return the index of the rightmost binary operator from `level` outside parentheses, or -1."""
+    depth = 0
+    # Scan right to left so equal-precedence operators group left to right.
+    # Index 0 is skipped: an operator there has no left operand.
+    for i in range(len(line_toks) - 1, 0, -1):
+        tok_type = line_toks[i].type
+        if tok_type == token.TokenType.PARAMCLOSE:
+            depth += 1
+        elif tok_type == token.TokenType.PARAMOPEN:
+            depth -= 1
+        elif depth == 0 and tok_type in level:
+            prev_type = line_toks[i - 1].type
+            # Right after another operator or '(' this is a sign, not a binary operator.
+            if prev_type in operators or prev_type in comparitors or prev_type == token.TokenType.PARAMOPEN:
+                continue
+            return i
+    return -1
+
+def build_expression_ast(line_toks: list[token.Token]) -> AstNode:
+    """Build an expression tree from the tokens of one expression.
+
+    Returns None for an empty token list. The expression is split at its
+    loosest-binding operator outside parentheses; a `name(...)` spanning the
+    whole expression becomes a FUNC_CALL node and an enclosing `(...)` is
+    unwrapped.
+    """
+    if not line_toks:
+        return None
+
+    if len(line_toks) == 1:
+        return AstNode(line_toks[0])
+
+    for level in precedence_levels:
+        i = find_split_index(line_toks, level)
+        if i != -1:
+            node = AstNode(line_toks[i])
+            left = build_expression_ast(line_toks[:i])
+            right = build_expression_ast(line_toks[i+1:])
+            if left: node.children.append(left)
+            if right: node.children.append(right)
+            return node
+
+    first_type = line_toks[0].type
+    last_index = len(line_toks) - 1
+
+    # Leading operator with no left operand (e.g. a sign): node with a single child.
+    if first_type in operators or first_type in comparitors:
+        node = AstNode(line_toks[0])
+        operand = build_expression_ast(line_toks[1:])
+        if operand: node.children.append(operand)
+        return node
+
+    # A call's '(' must be closed by the final token; anything else falls through.
+    if first_type == token.TokenType.VARIABLE and line_toks[1].type == token.TokenType.PARAMOPEN and find_matching_close(line_toks, 1) == last_index:
+        func_call_node = AstNode(line_toks[0])
+        func_call_node.type = AstType.FUNC_CALL
+        arg_node = build_expression_ast(line_toks[2:-1])
+        if arg_node:
+            func_call_node.children.append(arg_node)
+        return func_call_node
+
+    # Fully parenthesised expression: parse the inside.
+    if first_type == token.TokenType.PARAMOPEN and find_matching_close(line_toks, 0) == last_index:
+        return build_expression_ast(line_toks[1:-1])
+
+    return AstNode(line_toks[0])
+
+
+def find_block_end(toks: list[list[token.Token]], start_index: int) -> int:
+    """Return the index of the 'end' line closing a body that starts at start_index, or len(toks) if none."""
+    depth = 0
+    for index in range(start_index, len(toks)):
+        line_toks = toks[index]
+        if not line_toks:
+            continue
+        if line_toks[0].type in block_openers:
+            # A nested block owns the next unmatched 'end'.
+            depth += 1
+        elif len(line_toks) == 1 and line_toks[0].type == token.TokenType.END:
+            if depth == 0:
+                return index
+            depth -= 1
+    return len(toks)
+
+def build_ast(toks: list[list[token.Token]], line_offset: int = 0) -> AstNode:
+    """Build the statement tree for a list of tokenized lines.
+
+    line_offset is the number of source lines that precede toks[0]; nested
+    bodies pass it so syntax errors report absolute line numbers.
+    """
+    root = AstNode(token.Token("", True))
+
+    line_index = 0
+    while line_index < len(toks):
+        line_toks = toks[line_index]
+
+        if not line_toks:
+            line_index += 1
+            continue
+
+        first_tok = line_toks[0]
+        line_number = line_offset + line_index + 1
+
+        if first_tok.type == token.TokenType.LET:
+            if len(line_toks) >= 4 and line_toks[1].type == token.TokenType.VARIABLE and line_toks[2].type == token.TokenType.SET:
+                assignment_node = AstNode(line_toks[2])
+                var_node = AstNode(line_toks[1])
+                expr_node = build_expression_ast(line_toks[3:])
+
+                assignment_node.children.append(var_node)
+                if expr_node:
+                    assignment_node.children.append(expr_node)
+                root.children.append(assignment_node)
+            else:
+                print(f"Syntax error on line {line_number}: Invalid assignment.")
+
+        elif first_tok.type in block_openers:
+            block_node = AstNode(first_tok)
+            condition_node = build_expression_ast(line_toks[1:])
+            if condition_node:
+                block_node.children.append(condition_node)
+
+            body_start_index = line_index + 1
+            body_end_index = find_block_end(toks, body_start_index)
+            if body_end_index == len(toks):
+                keyword = first_tok.type.name.lower()
+                print(f"Syntax error on line {line_number}: '{keyword}' statement without matching 'end'.")
+
+            body_toks = toks[body_start_index:body_end_index]
+            if body_toks:
+                body_node = build_ast(body_toks, line_offset + body_start_index)
+                body_node.type = AstType.BODY
+                block_node.children.append(body_node)
+
+            root.children.append(block_node)
+            # Resume at the 'end' line; the increment below steps past it.
+            line_index = body_end_index
+
+        elif first_tok.type == token.TokenType.END:
+            # Stray 'end' with no open block: ignored.
+            pass
+
+        else:
+            expr_node = build_expression_ast(line_toks)
+            if expr_node:
+                root.children.append(expr_node)
+
+        line_index += 1
+
+    return root
\ No newline at end of file
diff --git a/main.py b/main.py
index 5f26564..ae90e71 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 import sys
 import preprocessor
+import ast
 
 if len(sys.argv) < 2:
     print("Usage: hgc (file)")
@@ -8,5 +9,6 @@ if len(sys.argv) < 2:
 with open(sys.argv[1], "r") as file:
     lines = [preprocessor.process_line(line) for line in file]
 
-for line in lines:
-    print(line)
\ No newline at end of file
+tree = ast.build_ast(lines)
+
+print(tree)
\ No newline at end of file
diff --git a/preprocessor.py b/preprocessor.py
index 82da1b0..b63b1d1 100644
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -1,44 +1,66 @@
 import token
 
-delimiters = ["=", ">", "<", "+", "-", "*", "/", " "]
+delimiters = ["=", ">", "<", "+", "-", "*", "/", "(", ")"]
 
-quick_tokens = [">", "<", "+", "-", "*", "/"]
+quick_tokens = [">", "<", "+", "-", "*", "/", "(", ")"]
 
 def doNothing():
     return
 
 
 def process_line(process: str) -> list[token.Token]:
-    buf = ""
     tokens: list[token.Token] = []
-    prevEquals = False
-    for c in process:
-        if c in delimiters and buf != "":
-            tokens.append(token.Token(buf))
-            buf = ""
-        if prevEquals and c != '=':
-            tokens.append(token.Token("="))
-            prevEquals = False
-        if c in quick_tokens:
-            tokens.append(token.Token(c))
-            if buf != "":
+    buf = ""
+    i = 0
+    while i < len(process):
+        char = process[i]
+
+        if char == '"':
+            # End of buffer before string starts
+            if buf:
                 tokens.append(token.Token(buf))
                 buf = ""
-        else:
-            match c:
-                case '\n':
-                    doNothing()
-                case ' ':
-                    doNothing()
-                case '=':
-                    if prevEquals:
-                        prevEquals = False
-                        tokens.append(token.Token("=="))
-                    else:
-                        prevEquals = True
-                case _:
-                    buf += c
-    if buf != "":
+
+            i += 1
+            start = i
+            while i < len(process) and process[i] != '"':
+                i += 1
+
+            string_content = process[start:i]
+            # Create string token, assuming constructor wants quotes
+            tokens.append(token.Token(f'"{string_content}"'))
+
+            if i < len(process) and process[i] == '"':
+                i += 1 # Skip closing quote
+            continue
+
+        if char.isspace():
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            i += 1
+            continue
+
+        # Handle multi-char operators like '=='
+        if char == '=' and i + 1 < len(process) and process[i+1] == '=':
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            tokens.append(token.Token('=='))
+            i += 2
+            continue
+
+        if char in delimiters:
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            tokens.append(token.Token(char))
+            i += 1
+            continue
+
+        buf += char
+        i += 1
+
+    if buf:
         tokens.append(token.Token(buf))
-
     return tokens
\ No newline at end of file
diff --git a/test.high b/test.high
index cf01afc..5650236 100644
--- a/test.high
+++ b/test.high
@@ -1,7 +1,7 @@
 let dingus = 10
 
 if "test" == "test"
-
+    print("Test is the same as test")
 end
 
 print(dingus)
\ No newline at end of file
diff --git a/token.py b/token.py
index 279efe8..e9661bf 100644
--- a/token.py
+++ b/token.py
@@ -7,31 +7,38 @@ class TokenType(Enum):
     CHAR = 4
     BOOLEAN = 5
 
-    FUNCTION = 6
-    VARIABLE = 7
-    IF = 8
-    ELSE = 9
-    WHILE = 10
-    LET = 11
-    END = 12
+    VARIABLE = 8
+    IF = 9
+    ELSE = 10
+    WHILE = 11
+    LET = 12
+    END = 13
 
-    ADD = 13
-    SUBTRACT = 14
-    MULTIPLY = 15
-    DIVIDE = 16
+    ADD = 14
+    SUBTRACT = 15
+    MULTIPLY = 16
+    DIVIDE = 17
 
-    SET = 17
+    SET = 18
 
-    EQUAL = 18
-    GREATER = 19
-    LESSER = 20
+    EQUAL = 19
+    GREATER = 20
+    LESSER = 21
+
+    PARAMOPEN = 22
+    PARAMCLOSE = 23
+
+    ROOT = 24
 
     UNKNOWN = 0
 
 class Token:
-    def __init__(self, tok: str):
+    def __init__(self, tok: str, isroot = False):
         self.value = tok
-        self.type = get_type(tok)
+        if isroot:
+            self.type = TokenType.ROOT
+        else:
+            self.type = get_type(tok)
 
     def __repr__(self) -> str:
         return f"Token(type={self.type.name}, value='{self.value}')"
@@ -69,6 +76,10 @@ def get_type(process: str) -> TokenType:
             return TokenType.GREATER
         case "<":
             return TokenType.LESSER
+        case "(":
+            return TokenType.PARAMOPEN
+        case ")":
+            return TokenType.PARAMCLOSE
 
     # String/Char Literals
     if len(process) >= 2: