Files
GroundPY/tokenizer.py

510 lines
12 KiB
Python
Raw Permalink Normal View History

2025-08-31 18:28:15 +10:00
from dataclasses import dataclass
from typing import Any
from enum import Enum
from error import traceback
from string import ascii_letters, digits, whitespace
class TokenType(Enum):
    """Kinds of token the tokenizer can emit.

    Each member's trailing comment shows a sample of the source text
    that yields that kind of token.
    """

    # Plain words and literals
    INSTRUCTION = 1         # stdout
    STRING = 2              # "Hello!"

    # Sigil-prefixed names
    LABEL_DECLERATION = 3   # @myLabel
    VARIABLE_REFERENCE = 4  # $myVar
    VARIABLE_POINTER = 5    # &myVar

    # Numeric literals
    INTEGER = 6             # 123
    FLOAT = 7               # 0.123

    # More sigil-prefixed names
    TYPE = 8                # -string
    FUNCTION_REFERENCE = 9  # !myFunc
    LIST_REFERENCE = 10     # *myList

    COMMENT = 11            # # hi there

    # Jump targets
    LINE_REFERENCE = 12     # %12
    LABEL_REFERENCE = 13    # %myLabel

    BOOL = 14               # true

    # Appended once after the last real token
    EOF = 15
2025-08-31 18:28:15 +10:00
@dataclass
class Token:
    """One lexical unit produced by the tokenizer: a kind plus its payload."""

    # Which kind of token this is.
    type: TokenType

    # Payload for the token. The tokenizer in this file stores text for
    # names, strings and comments, an int or float for numbers, a bool for
    # boolean literals, and None for the end-of-file marker.
    value: Any
# Mnemonics accepted as INSTRUCTION tokens (frozenset: O(1) membership tests).
_INSTRUCTIONS = frozenset({
    "stdout", "stdin", "end", "return", "fun", "endfun", "getstrcharat", "if",
    "getstrsize", "pusharg", "call", "set", "add", "subtract", "multiply", "divide",
    "equal", "inequal", "not", "greater", "lesser", "stoi", "stod", "tostring", "use",
    "extern", "jump", "gettype", "exists", "setlist", "setlistat", "getlistat", "getlistsize",
    "listappend",
})

# Characters that terminate a sigil-prefixed token or a bare word.
_WORD_END = " \t\n"

# Characters allowed inside a bare word (instruction name or boolean literal).
_WORD_CHARS = ascii_letters + digits

# Character following a backslash inside a string -> the character it denotes.
_STRING_ESCAPES = {
    '"': '"',
    "n": "\n",
    "t": "\t",
    "a": "\a",
    "r": "\r",
    "\\": "\\",
}


def _scan_until(text: str, start: int, stops: str) -> int:
    """Return the index of the first char of *stops* at or after *start*, else len(text)."""
    end = start
    while end < len(text) and text[end] not in stops:
        end += 1
    return end


def _bad_number_index(word: str, allow_dot: bool = True) -> int:
    """Return the index of the first char that makes *word* an invalid number, or -1.

    Digits are always valid. A single "." is valid only when *allow_dot* is
    true; a second "." is rejected so that float() never sees text like
    "1.2.3".
    """
    seen_dot = False
    for index, char in enumerate(word):
        if char == "." and allow_dot and not seen_dot:
            seen_dot = True
        elif char not in digits:
            return index
    return -1


def _parse_number(word: str):
    """Convert validated number text to a float if it contains a dot, else to an int."""
    return float(word) if "." in word else int(word)


def tokenize(input_string: str) -> "list[Token]":
    """Split source text into a list of tokens, ending with an EOF token.

    Recognised forms:
      * ``# ...``       COMMENT (text after ``#`` up to the end of the line)
      * ``&name``       VARIABLE_POINTER
      * ``$name``       VARIABLE_REFERENCE
      * ``!name``       FUNCTION_REFERENCE
      * ``*name``       LIST_REFERENCE
      * ``@name``       LABEL_DECLERATION
      * ``-name``       TYPE, or a negative INTEGER/FLOAT when a digit follows
      * ``%12``/``%x``  LINE_REFERENCE (int) or LABEL_REFERENCE
      * ``"..."``       STRING, with the escapes ``\\"  \\n  \\t  \\a  \\r  \\\\``
      * bare words      INSTRUCTION, or BOOL for ``true``/``false``
      * digits          INTEGER, or FLOAT when the text contains a dot

    Tokens are separated by spaces, tabs and newlines. ``line`` and
    ``column`` are tracked 1-based and always describe the character at the
    current position; they are only used for error reporting.

    Args:
        input_string: The complete source text to tokenize.

    Returns:
        The tokens in source order, followed by one ``Token(TokenType.EOF, None)``.

    Errors are reported by calling ``traceback(input_string, "SyntaxError",
    message, line, start_col, end_col)``.
    NOTE(review): ``traceback`` comes from the project's ``error`` module,
    which is not visible here; this function assumes it does not return
    (as the original code did) - confirm.
    """
    # sigil -> (token type, message at EOF, message for a leading digit/dot
    #           (None = not checked), message when the name is missing)
    sigils = {
        "&": (TokenType.VARIABLE_POINTER,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "$": (TokenType.VARIABLE_REFERENCE,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "!": (TokenType.FUNCTION_REFERENCE,
              "Expected a function name, got <EOF>",
              "Function names can't start with numbers.",
              "Expected a function name."),
        "*": (TokenType.LIST_REFERENCE,
              "Expected a list reference, got <EOF>",
              "List references can't start with numbers.",
              "Expected a list reference."),
        "@": (TokenType.LABEL_DECLERATION,
              "Expected a label decleration, got <EOF>",
              None,
              "Expected a label decleration"),
    }

    tokens: list[Token] = []
    line = 1
    column = 1
    pos = 0  # index into input_string of the next unread character
    length = len(input_string)

    # Every branch below consumes what it recognises and leaves `pos` on the
    # first unread character; separators are handled only by the first two
    # branches, which keeps line/column bookkeeping in a single place.
    while pos < length:
        char = input_string[pos]

        if char == "\n":
            line += 1
            column = 1
            pos += 1

        elif char in "\t ":
            column += 1
            pos += 1

        elif char == "#":
            end = _scan_until(input_string, pos + 1, "\n")
            tokens.append(Token(TokenType.COMMENT, value=input_string[pos + 1:end]))
            column += end - pos
            pos = end

        elif char in sigils:
            token_type, eof_msg, digit_msg, missing_msg = sigils[char]
            pos += 1
            column += 1
            if pos == length:
                traceback(input_string, "SyntaxError", eof_msg, line, column, column)
            first = input_string[pos]
            if digit_msg is not None and (first in digits or first == "."):
                traceback(input_string, "SyntaxError", digit_msg, line, column, column)
            elif first in _WORD_END:
                # A blank right after the sigil would otherwise yield an
                # empty name; treat it like the newline case.
                traceback(input_string, "SyntaxError", missing_msg, line, column, column)
            end = _scan_until(input_string, pos, _WORD_END)
            tokens.append(Token(token_type, value=input_string[pos:end]))
            column += end - pos
            pos = end

        elif char == "-":
            pos += 1
            column += 1
            if pos == length:
                traceback(input_string, "SyntaxError", "Expected a type name, got <EOF>", line, column, column)
            first = input_string[pos]
            if first in _WORD_END:
                traceback(input_string, "SyntaxError", "Expected a type", line, column, column)
            end = _scan_until(input_string, pos, _WORD_END)
            word = input_string[pos:end]
            if first in digits:
                # A digit after "-" makes this a negative number literal.
                bad = _bad_number_index(word)
                if bad != -1:
                    traceback(input_string, "SyntaxError", "Malformed number.", line, column, column + bad)
                else:
                    tokens.append(Token(
                        TokenType.FLOAT if "." in word else TokenType.INTEGER,
                        value=_parse_number("-" + word),
                    ))
            else:
                tokens.append(Token(TokenType.TYPE, value=word))
            column += end - pos
            pos = end

        elif char == "%":
            pos += 1
            column += 1
            if pos == length:
                traceback(input_string, "SyntaxError", "Expected a label or line reference, got <EOF>", line, column, column)
            first = input_string[pos]
            if first in _WORD_END:
                traceback(input_string, "SyntaxError", "Expected a label or line reference", line, column, column)
            end = _scan_until(input_string, pos, _WORD_END)
            word = input_string[pos:end]
            if first in digits or first == ".":
                # Line-number reference: digits only (a leading "." is
                # routed here so that it is rejected as malformed).
                bad = _bad_number_index(word, allow_dot=False)
                if bad != -1:
                    traceback(input_string, "SyntaxError", "Malformed line number.", line, column, column + bad)
                else:
                    tokens.append(Token(TokenType.LINE_REFERENCE, value=int(word)))
            else:
                tokens.append(Token(TokenType.LABEL_REFERENCE, value=word))
            column += end - pos
            pos = end

        elif char == '"':
            pos += 1
            column += 1
            start_line = line
            start_col = column  # column of the first character after the quote
            pieces = []
            closed = False
            while pos < length:
                char = input_string[pos]
                if char == '"':
                    closed = True
                    pos += 1
                    column += 1
                    break
                if char == "\\":
                    if pos + 1 >= length:
                        # Lone backslash at end of input: reported below as
                        # an unclosed string.
                        pos += 1
                        column += 1
                        break
                    escape = input_string[pos + 1]
                    if escape not in _STRING_ESCAPES:
                        traceback(input_string, "SyntaxError", f"Unknown escape sequence \"\\{escape}\".", line, column, column + 1)
                    pieces.append(_STRING_ESCAPES.get(escape, escape))
                    pos += 2
                    column += 2
                elif char == "\n":
                    # Keep line numbers right for tokens after a
                    # multi-line string.
                    pieces.append(char)
                    pos += 1
                    line += 1
                    column = 1
                else:
                    pieces.append(char)
                    pos += 1
                    column += 1
            if not closed:
                traceback(input_string, "SyntaxError", "String was never closed.", start_line, start_col, start_col)
            tokens.append(Token(TokenType.STRING, value="".join(pieces)))

        elif char in ascii_letters:
            # NOTE(review): the +1 start offset is kept from the original
            # code; traceback's column convention is not visible here.
            start_col = column + 1
            end = pos
            while end < length and input_string[end] not in _WORD_END:
                end += 1
                if input_string[end - 1] not in _WORD_CHARS:
                    # The offending character stays part of the word so it
                    # shows up in the error message below.
                    break
            word = input_string[pos:end]
            column += end - pos
            pos = end
            if word in _INSTRUCTIONS:
                tokens.append(Token(TokenType.INSTRUCTION, value=word))
            elif word in ("true", "false"):
                tokens.append(Token(TokenType.BOOL, value=word == "true"))
            else:
                traceback(input_string, "SyntaxError", f"\"{word}\" isn't a valid instruction.", line, start_col, column)

        elif char in digits:
            # Unsigned numbers end at any whitespace character.
            end = _scan_until(input_string, pos, whitespace)
            word = input_string[pos:end]
            bad = _bad_number_index(word)
            if bad != -1:
                traceback(input_string, "SyntaxError", "Malformed number.", line, column, column + bad)
            else:
                tokens.append(Token(
                    TokenType.FLOAT if "." in word else TokenType.INTEGER,
                    value=_parse_number(word),
                ))
            column += end - pos
            pos = end

        else:
            traceback(input_string, "SyntaxError", f"Unkown token \"{char}\"", line, column, column)
            column += 1
            pos += 1

    tokens.append(Token(TokenType.EOF, None))
    return tokens