Abstract syntax tree

2025-09-02 20:41:17 +10:00
parent 5ebf653342
commit 8d3ffb7b9c
5 changed files with 222 additions and 50 deletions
--- a/ast.py
+++ b/ast.py
@@ -0,0 +1,137 @@
+import token
+from enum import Enum
+
+class AstType(Enum):
+    """Kind of syntax-tree node, stored in AstNode.type."""
+    CONSTANT = 0  # token type listed in `constants`
+    VARIABLE = 1  # TokenType.VARIABLE
+    COMPARITOR = 2  # token type listed in `comparitors`
+    OPERATOR = 3  # token type listed in `operators`
+    IF = 4  # TokenType.IF
+    WHILE = 5  # TokenType.WHILE
+    BODY = 6  # assigned by build_ast to the sub-tree holding a block's statements
+    SET = 7  # TokenType.SET
+    ROOT = 8  # TokenType.ROOT
+    FUNC_CALL = 9  # assigned by build_expression_ast to a `name(...)` call
+
+# Token types that AstNode classifies as AstType.OPERATOR.
+operators = [
+    token.TokenType.ADD,
+    token.TokenType.SUBTRACT,
+    token.TokenType.MULTIPLY,
+    token.TokenType.DIVIDE,
+]
+# Token types that AstNode classifies as AstType.COMPARITOR.
+comparitors = [
+    token.TokenType.EQUAL,
+    token.TokenType.GREATER,
+    token.TokenType.LESSER,
+]
+# Token types that AstNode classifies as AstType.CONSTANT.
+constants = [
+    token.TokenType.INTEGER,
+    token.TokenType.DOUBLE,
+    token.TokenType.STRING,
+    token.TokenType.CHAR,
+    token.TokenType.BOOLEAN,
+]
+
+class AstNode:
+    """One node of the syntax tree: a token plus an ordered list of children.
+
+    `type` is derived from the token's type at construction time and is None
+    when the token type has no AstType counterpart; callers may overwrite it
+    afterwards.
+    """
+
+    def __init__(self, intok: token.Token):
+        self.children = []
+        self.tok = intok
+        self.type = self._classify(intok.type)
+
+    @staticmethod
+    def _classify(tok_type):
+        """Return the AstType matching tok_type, or None if there is none."""
+        if tok_type == token.TokenType.ROOT:
+            return AstType.ROOT
+        if tok_type in operators:
+            return AstType.OPERATOR
+        if tok_type in comparitors:
+            return AstType.COMPARITOR
+        if tok_type in constants:
+            return AstType.CONSTANT
+        # The remaining token types map one-to-one onto a node type; they are
+        # checked in a fixed order.
+        one_to_one = (
+            (token.TokenType.VARIABLE, AstType.VARIABLE),
+            (token.TokenType.IF, AstType.IF),
+            (token.TokenType.WHILE, AstType.WHILE),
+            (token.TokenType.SET, AstType.SET),
+        )
+        for tok_kind, node_kind in one_to_one:
+            if tok_type == tok_kind:
+                return node_kind
+        return None
+
+    def __repr__(self, level=0):
+        # One line per node, indented with one tab per nesting level.
+        header = "\t" * level + f"AstNode(type={self.type}, token={self.tok})\n"
+        return header + "".join(kid.__repr__(level + 1) for kid in self.children)
+
+def _matching_paren(toks, open_index):
+    """Return the index of the PARAMCLOSE closing the PARAMOPEN at open_index, or None."""
+    depth = 0
+    for index in range(open_index, len(toks)):
+        kind = toks[index].type
+        if kind == token.TokenType.PARAMOPEN:
+            depth += 1
+        elif kind == token.TokenType.PARAMCLOSE:
+            depth -= 1
+            if depth == 0:
+                return index
+    return None
+
+
+def _find_split_index(toks):
+    """Return the index of the binary operator an expression should be split on, or None.
+
+    Only operators outside parentheses are considered. The loosest-binding
+    one wins (comparison, then + and -, then every other operator); among
+    equals the rightmost wins, which makes the operators left-associative.
+    """
+    best_index = None
+    best_rank = None
+    depth = 0
+    prev_kind = None
+    for index, tok in enumerate(toks):
+        kind = tok.type
+        if kind == token.TokenType.PARAMOPEN:
+            depth += 1
+        elif kind == token.TokenType.PARAMCLOSE:
+            # Clamp so a stray ')' cannot hide the rest of the line.
+            depth = max(depth - 1, 0)
+        elif depth == 0 and (kind in comparitors or kind in operators):
+            # An operator at the very start, or directly after another
+            # operator, is a prefix (e.g. the '-' in `a * -5`), not a split point.
+            is_prefix = prev_kind is None or prev_kind in comparitors or prev_kind in operators
+            if not is_prefix:
+                if kind in comparitors:
+                    rank = 0
+                elif kind in (token.TokenType.ADD, token.TokenType.SUBTRACT):
+                    rank = 1
+                else:
+                    rank = 2
+                if best_rank is None or rank <= best_rank:
+                    best_index, best_rank = index, rank
+        prev_kind = kind
+    return best_index
+
+
+def build_expression_ast(line_toks: list[token.Token]) -> AstNode | None:
+    """Build the syntax tree for one expression.
+
+    Args:
+        line_toks: the tokens of the expression, in source order.
+
+    Returns:
+        The root node of the expression, or None when line_toks is empty.
+
+    Binary operators are split by precedence and are left-associative, so
+    `1 * 2 + 3` is `(1 * 2) + 3` and `a - b - c` is `(a - b) - c`. Operators
+    inside parentheses never split the surrounding expression. A
+    `name(...)` sequence becomes a FUNC_CALL node whose single child is the
+    expression between the parentheses; tokens after the closing parenthesis
+    that are not joined by an operator are ignored.
+    """
+    if not line_toks:
+        return None
+
+    if len(line_toks) == 1:
+        return AstNode(line_toks[0])
+
+    split = _find_split_index(line_toks)
+    if split is not None:
+        node = AstNode(line_toks[split])
+        left = build_expression_ast(line_toks[:split])
+        right = build_expression_ast(line_toks[split + 1:])
+        if left is not None:
+            node.children.append(left)
+        if right is not None:
+            node.children.append(right)
+        return node
+
+    first = line_toks[0]
+
+    # Prefix operator such as `-5`: a node with the operand as its only child.
+    if first.type in operators or first.type in comparitors:
+        node = AstNode(first)
+        operand = build_expression_ast(line_toks[1:])
+        if operand is not None:
+            node.children.append(operand)
+        return node
+
+    if len(line_toks) > 2 and first.type == token.TokenType.VARIABLE and line_toks[1].type == token.TokenType.PARAMOPEN:
+        func_call_node = AstNode(first)
+        func_call_node.type = AstType.FUNC_CALL
+        # Arguments run up to the parenthesis that closes the call; when it is
+        # missing the slice end is None and the rest of the line is used.
+        close_index = _matching_paren(line_toks, 1)
+        argument = build_expression_ast(line_toks[2:close_index])
+        if argument is not None:
+            func_call_node.children.append(argument)
+        return func_call_node
+
+    # A fully parenthesised expression: parse what is inside the parentheses.
+    if first.type == token.TokenType.PARAMOPEN and _matching_paren(line_toks, 0) == len(line_toks) - 1:
+        inner = build_expression_ast(line_toks[1:-1])
+        if inner is not None:
+            return inner
+
+    # Nothing recognisable: fall back to a node for the first token.
+    return AstNode(first)
+
+
+def _find_block_end(toks, start):
+    """Return the index of the `end` line closing a block whose body starts at `start`, or None.
+
+    Lines starting with `if` or `while` open a nested block, so their own
+    `end` lines are skipped instead of closing the outer block.
+    """
+    depth = 0
+    for index in range(start, len(toks)):
+        line = toks[index]
+        if not line:
+            continue
+        if line[0].type in (token.TokenType.IF, token.TokenType.WHILE):
+            depth += 1
+        elif len(line) == 1 and line[0].type == token.TokenType.END:
+            if depth == 0:
+                return index
+            depth -= 1
+    return None
+
+
+def build_ast(toks: list[list[token.Token]], line_offset: int = 0) -> AstNode:
+    """Build the syntax tree for a list of tokenised source lines.
+
+    Args:
+        toks: one token list per source line; empty lists are skipped.
+        line_offset: number of source lines that precede toks[0]. Used so
+            that errors reported from nested block bodies carry the line
+            number of the whole input rather than of the body slice.
+
+    Returns:
+        A ROOT node whose children are the statements in order. `let`
+        assignments become SET nodes (variable, expression); `if` / `while`
+        lines become nodes holding the condition and, when the body is not
+        empty, a BODY sub-tree; any other line is parsed as an expression.
+
+    Syntax errors are printed and parsing continues.
+    """
+    root = AstNode(token.Token("", True))
+    block_openers = (token.TokenType.IF, token.TokenType.WHILE)
+
+    line_index = 0
+    while line_index < len(toks):
+        line_toks = toks[line_index]
+
+        if not line_toks:
+            line_index += 1
+            continue
+
+        first_tok = line_toks[0]
+        line_number = line_offset + line_index + 1
+
+        if first_tok.type == token.TokenType.LET:
+            if len(line_toks) >= 4 and line_toks[1].type == token.TokenType.VARIABLE and line_toks[2].type == token.TokenType.SET:
+                assignment_node = AstNode(line_toks[2])
+                var_node = AstNode(line_toks[1])
+                expr_node = build_expression_ast(line_toks[3:])
+
+                assignment_node.children.append(var_node)
+                if expr_node is not None:
+                    assignment_node.children.append(expr_node)
+                root.children.append(assignment_node)
+            else:
+                print(f"Syntax error on line {line_number}: Invalid assignment.")
+
+        elif first_tok.type in block_openers:
+            block_node = AstNode(first_tok)
+            condition_node = build_expression_ast(line_toks[1:])
+            if condition_node is not None:
+                block_node.children.append(condition_node)
+
+            body_start_index = line_index + 1
+            body_end_index = _find_block_end(toks, body_start_index)
+            if body_end_index is None:
+                keyword = "if" if first_tok.type == token.TokenType.IF else "while"
+                print(f"Syntax error on line {line_number}: '{keyword}' statement without matching 'end'.")
+                # Treat the rest of the input as the body so parsing can go on.
+                body_end_index = len(toks)
+
+            body_toks = toks[body_start_index:body_end_index]
+            if body_toks:
+                body_node = build_ast(body_toks, line_offset + body_start_index)
+                body_node.type = AstType.BODY
+                block_node.children.append(body_node)
+
+            root.children.append(block_node)
+            # Resume after the closing `end` (the increment below steps past it).
+            line_index = body_end_index
+
+        elif first_tok.type == token.TokenType.END:
+            # A stray `end` with no open block is ignored.
+            pass
+
+        else:
+            expr_node = build_expression_ast(line_toks)
+            if expr_node is not None:
+                root.children.append(expr_node)
+
+        line_index += 1
+
+    return root
--- a/main.py
+++ b/main.py
@@ -1,5 +1,6 @@
 import sys
 import preprocessor
+import ast

 if len(sys.argv) < 2:
    print("Usage: hgc (file)")
@@ -8,5 +9,6 @@ if len(sys.argv) < 2:
 with open(sys.argv[1], "r") as file:
    lines = [preprocessor.process_line(line) for line in file]

-for line in lines:
-    print(line)
+# Bind the tree to its own name so the imported `ast` module stays usable.
+tree = ast.build_ast(lines)
+
+print(tree)
--- a/preprocessor.py
+++ b/preprocessor.py
@@ -1,44 +1,66 @@
 import token

-delimiters = ["=", ">", "<", "+", "-", "*", "/", " "]
+# Characters that end the token being accumulated by process_line and are
+# then emitted as a single-character token of their own.
+delimiters = ["=", ">", "<", "+", "-", "*", "/", "(", ")"]

-quick_tokens = [">", "<", "+", "-", "*", "/"]
+# NOTE(review): the rewritten process_line in this change no longer reads
+# this list; confirm whether anything else still needs it.
+quick_tokens = [">", "<", "+", "-", "*", "/", "(", ")"]

 def doNothing():
+    # No-op placeholder. NOTE(review): its only visible callers are on lines
+    # removed by this change; confirm whether it is still used elsewhere.
     return

 def process_line(process: str) -> list[token.Token]:
+    """Split one line of source text into tokens.
+
+    The line is scanned left to right:
+      * text between double quotes becomes one token that includes the
+        quotes; the scan stops at the first closing quote (no escape
+        handling), and an unterminated string runs to the end of the line
+        and is still emitted with a closing quote;
+      * whitespace ends the current token and is discarded;
+      * `==` is emitted as a single token;
+      * every character listed in `delimiters` is emitted as its own token;
+      * any other character is appended to the token being accumulated.
+    """
-    buf = ""
     tokens: list[token.Token] = []
-    prevEquals = False
-    for c in process:
-        if c in delimiters and buf != "":
-            tokens.append(token.Token(buf))
     buf = ""
-        if prevEquals and c != '=':
-            tokens.append(token.Token("="))
-            prevEquals = False
-        if c in quick_tokens:
-            tokens.append(token.Token(c))
-            if buf != "":
-                tokens.append(token.Token(buf))
-                buf = ""
-        else:
-            match c:
-                case '\n':
-                    doNothing()
-                case ' ':
-                    doNothing()
-                case '=':
-                    if prevEquals:
-                        prevEquals = False
-                        tokens.append(token.Token("=="))
-                    else:
-                        prevEquals = True
-                case _:
-                    buf += c
-    if buf != "":
-        tokens.append(token.Token(buf))
+    i = 0
+    while i < len(process):
+        char = process[i]

+        if char == '"':
+            # Flush whatever was accumulated before the string starts
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+
+            i += 1
+            start = i
+            # Advance to the closing quote, or to the end of the line if there is none
+            while i < len(process) and process[i] != '"':
+                i += 1
+            
+            string_content = process[start:i]
+            # Create string token, assuming constructor wants quotes
+            tokens.append(token.Token(f'"{string_content}"'))
+            
+            if i < len(process) and process[i] == '"':
+                i += 1 # Skip closing quote
+            continue
+
+        # Whitespace only separates tokens; it never becomes one
+        if char.isspace():
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            i += 1
+            continue
+
+        # Handle multi-char operators like '=='
+        # (checked before the single-character delimiters so the pair is not split)
+        if char == '=' and i + 1 < len(process) and process[i+1] == '=':
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            tokens.append(token.Token('=='))
+            i += 2
+            continue
+
+        # Single-character delimiter: flush the buffer, then emit the character itself
+        if char in delimiters:
+            if buf:
+                tokens.append(token.Token(buf))
+                buf = ""
+            tokens.append(token.Token(char))
+            i += 1
+            continue
+
+        buf += char
+        i += 1
+
+    # Emit the token still being accumulated when the line ends
+    if buf:
+        tokens.append(token.Token(buf))

     return tokens
--- a/test.high
+++ b/test.high
@@ -1,7 +1,7 @@
 let dingus = 10

 if "test" == "test"
-
+    print("Test is the same as test")
 end

 print(dingus)
--- a/token.py
+++ b/token.py
@@ -7,30 +7,37 @@ class TokenType(Enum):
    CHAR = 4
    BOOLEAN = 5

-    FUNCTION = 6
-    VARIABLE = 7
-    IF = 8
-    ELSE = 9
-    WHILE = 10
-    LET = 11
-    END = 12
+    VARIABLE = 8
+    IF = 9
+    ELSE = 10
+    WHILE = 11
+    LET = 12
+    END = 13

-    ADD = 13
-    SUBTRACT = 14
-    MULTIPLY = 15
-    DIVIDE = 16
+    ADD = 14
+    SUBTRACT = 15
+    MULTIPLY = 16
+    DIVIDE = 17

-    SET = 17
+    SET = 18

-    EQUAL = 18
-    GREATER = 19
-    LESSER = 20
+    EQUAL = 19
+    GREATER = 20
+    LESSER = 21
+
+    PARAMOPEN = 22
+    PARAMCLOSE = 23
+
+    ROOT = 24

    UNKNOWN = 0

 class Token:
-    def __init__(self, tok: str):
+    def __init__(self, tok: str, isroot = False):
+        """Store the raw text in `value` and set `type`.
+
+        When isroot is true the type is TokenType.ROOT; otherwise it is
+        whatever get_type returns for the text.
+        """
         self.value = tok
+        if isroot:
+            self.type = TokenType.ROOT
+        else:
             self.type = get_type(tok)

    def __repr__(self) -> str:
@@ -69,6 +76,10 @@ def get_type(process: str) -> TokenType:
            return TokenType.GREATER
        case "<":
            return TokenType.LESSER
+        case "(":
+            return TokenType.PARAMOPEN
+        case ")":
+            return TokenType.PARAMCLOSE

    # String/Char Literals
    if len(process) >= 2: