2025-08-31 18:28:15 +10:00
|
|
|
from dataclasses import dataclass
|
|
|
|
from typing import Any
|
|
|
|
from enum import Enum
|
|
|
|
from error import traceback
|
|
|
|
from string import ascii_letters, digits, whitespace
|
|
|
|
|
|
|
|
|
|
|
|
class TokenType(Enum):
    """Categories of token produced by the tokenizer.

    Each member is documented with a sample of the source text it
    corresponds to.
    """

    # stdlnout
    INSTRUCTION = 1
    # "Hello!"
    STRING = 2
    # @myLabel
    LABEL_DECLERATION = 3
    # $myVar
    VARIABLE_REFERENCE = 4
    # &myVar
    VARIABLE_POINTER = 5
    # 123
    INTEGER = 6
    # 0.123
    FLOAT = 7
    # -string
    TYPE = 8
    # !myFunc
    FUNCTION_REFERENCE = 9
    # *myList
    LIST_REFERENCE = 10
    # # hi there
    COMMENT = 11
    # %12
    LINE_REFERENCE = 12
    # %myLabel
    LABEL_REFERENCE = 13
    # No source text: tokenize() appends this after the last real token.
    EOF = 14
|
2025-08-31 18:28:15 +10:00
|
|
|
|
|
|
|
@dataclass
class Token:
    """A single lexical unit: its category plus the payload read from the source."""

    # Which kind of token this is.
    type: TokenType
    # The payload; its Python type depends on the token kind (the tokenizer
    # stores str, int, float or None here).
    value: Any
|
|
|
|
|
|
|
|
# Every keyword accepted as an instruction.
_INSTRUCTIONS = frozenset((
    "stdlnout", "stdout", "stdin", "end", "return", "fun", "endfun", "getstrcharat",
    "getstrsize", "pusharg", "call", "set", "add", "subtract", "multiply", "divide",
    "equal", "inequal", "not", "greater", "lesser", "stoi", "stod", "tostring", "use",
    "extern", "jump", "gettype", "exists", "setlist", "setlistat", "getlistat", "getlistsize",
    "listappend", "if",
))

# Characters that terminate a sigil-prefixed word or an instruction.
_WORD_STOPS = " \t\n"


def _locate(source: str, index: int) -> tuple[int, int]:
    """Return the 1-based (line, column) of position ``index`` in ``source``."""
    line_start = source.rfind("\n", 0, index) + 1
    return source.count("\n", 0, index) + 1, index - line_start + 1


def _word_end(source: str, start: int, stops: str = _WORD_STOPS) -> int:
    """Return the index of the first ``stops`` character at or after ``start``.

    Returns ``len(source)`` when no such character exists.
    """
    end = start
    while end < len(source) and source[end] not in stops:
        end += 1
    return end


def _syntax_error(source: str, message: str, start: int, end: int) -> None:
    """Report a SyntaxError covering ``source[start]`` .. ``source[end]``.

    Line and column are derived from the indices at report time, so they
    cannot drift out of step with the scan position.
    """
    line, start_col = _locate(source, start)
    traceback(source, "SyntaxError", message, line, start_col, start_col + (end - start))


def tokenize(input_string: str) -> "list[Token]":
    """Split program text into a list of tokens.

    Recognised forms:

    * ``# ...``       -> COMMENT (text up to the end of the line)
    * ``&name``       -> VARIABLE_POINTER
    * ``$name``       -> VARIABLE_REFERENCE
    * ``!name``       -> FUNCTION_REFERENCE
    * ``*name``       -> LIST_REFERENCE
    * ``-name``       -> TYPE
    * ``@name``       -> LABEL_DECLERATION
    * ``%123``        -> LINE_REFERENCE (value is an ``int``)
    * ``%name``       -> LABEL_REFERENCE
    * ``"text"``      -> STRING (no escape sequences)
    * ``123``         -> INTEGER (value is an ``int``)
    * ``0.123``       -> FLOAT (value is a ``float``)
    * a keyword       -> INSTRUCTION

    Sigil-prefixed words and instructions end at a space, tab, newline or
    the end of the input; numbers end at any ``string.whitespace``
    character. The single character that ends a token is consumed with it.

    Args:
        input_string: The full program source.

    Returns:
        The tokens in source order, always terminated by one EOF token
        whose value is ``None``.

    Malformed input is reported through ``traceback`` with the category
    ``"SyntaxError"`` and 1-based line/column numbers.
    NOTE(review): ``traceback`` is assumed not to return (exit or raise);
    confirm against the ``error`` module. If it does return, scanning
    simply carries on, and a malformed number may raise ``ValueError``
    from ``int``/``float``.
    """
    # sigil -> (token type, message at end of input,
    #           message for a leading digit or "." (None = allowed),
    #           message when a newline follows the sigil)
    sigils = {
        "&": (TokenType.VARIABLE_POINTER,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "$": (TokenType.VARIABLE_REFERENCE,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "!": (TokenType.FUNCTION_REFERENCE,
              "Expected a function name, got <EOF>",
              "Function names can't start with numbers.",
              "Expected a function name."),
        "*": (TokenType.LIST_REFERENCE,
              "Expected a list reference, got <EOF>",
              "List references can't start with numbers.",
              "Expected a list reference."),
        "-": (TokenType.TYPE,
              "Expected a type name, got <EOF>",
              None,
              "Expected a type"),
        "@": (TokenType.LABEL_DECLERATION,
              "Expected a label decleration, got <EOF>",
              None,
              "Expected a label decleration"),
    }
    alphanumerics = ascii_letters + digits

    tokens: list[Token] = []
    length = len(input_string)
    pos = 0  # index of the next unread character

    while pos < length:
        char = input_string[pos]

        if char in " \t\n":
            pos += 1

        elif char == "#":
            # A comment runs to the end of the line; the newline is consumed.
            end = input_string.find("\n", pos + 1)
            if end == -1:
                end = length
            tokens.append(Token(TokenType.COMMENT, value=input_string[pos + 1:end]))
            pos = end + 1

        elif char in sigils:
            token_type, eof_message, digit_message, newline_message = sigils[char]
            start = pos + 1
            if start >= length:
                _syntax_error(input_string, eof_message, start, start)
            else:
                first = input_string[start]
                if digit_message is not None and (first in digits or first == "."):
                    _syntax_error(input_string, digit_message, start, start)
                elif first == "\n":
                    _syntax_error(input_string, newline_message, start, start)
            end = _word_end(input_string, start)
            tokens.append(Token(token_type, value=input_string[start:end]))
            pos = end + 1

        elif char == "%":
            start = pos + 1
            if start >= length:
                _syntax_error(input_string, "Expected a label or line reference, got <EOF>", start, start)
            elif input_string[start] == "\n":
                _syntax_error(input_string, "Expected a label or line reference", start, start)
            end = _word_end(input_string, start)
            word = input_string[start:end]
            if word and (word[0] in digits or word[0] == "."):
                # Line-number reference: every character must be a digit.
                bad = next((i for i, c in enumerate(word) if c not in digits), None)
                if bad is not None:
                    _syntax_error(input_string, "Malformed line number.", start, start + bad)
                tokens.append(Token(TokenType.LINE_REFERENCE, value=int(word)))
            else:
                tokens.append(Token(TokenType.LABEL_REFERENCE, value=word))
            pos = end + 1

        elif char == '"':
            start = pos + 1
            close = input_string.find('"', start)
            if close == -1:
                # Also covers a quote that is the very last character, which
                # used to raise IndexError instead of being reported.
                _syntax_error(input_string, "String was never closed.", start, start)
                pos = length
            else:
                tokens.append(Token(TokenType.STRING, value=input_string[start:close]))
                pos = close + 1

        elif char in ascii_letters:
            # Read up to a word stop, or up to and including the first
            # non-alphanumeric character so it shows up in the error message.
            end = pos
            while end < length and input_string[end] not in _WORD_STOPS:
                end += 1
                if input_string[end - 1] not in alphanumerics:
                    break
            word = input_string[pos:end]
            if word in _INSTRUCTIONS:
                tokens.append(Token(TokenType.INSTRUCTION, value=word))
            else:
                # NOTE(review): the reported range starts one column after the
                # first letter, as in the previous implementation; confirm
                # against what traceback() expects.
                _syntax_error(input_string, f"\"{word}\" isn't a valid instruction.", pos + 1, end)
            pos = end + 1

        elif char in digits:
            end = _word_end(input_string, pos, whitespace)
            word = input_string[pos:end]
            seen_point = False
            for offset, c in enumerate(word):
                if c == "." and not seen_point:
                    # A single decimal point turns the literal into a FLOAT.
                    seen_point = True
                elif c not in digits:
                    _syntax_error(input_string, "Malformed number.", pos, pos + offset)
                    break
            if "." in word:
                tokens.append(Token(TokenType.FLOAT, value=float(word)))
            else:
                tokens.append(Token(TokenType.INTEGER, value=int(word)))
            pos = end + 1

        else:
            _syntax_error(input_string, f"Unkown token \"{char}\"", pos, pos)
            pos += 1

    tokens.append(Token(TokenType.EOF, None))
    return tokens
|