Upload files to "/"
This commit is contained in:
459
tokenizer.py
Normal file
459
tokenizer.py
Normal file
@@ -0,0 +1,459 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
from enum import Enum
|
||||||
|
from error import traceback
|
||||||
|
from string import ascii_letters, digits, whitespace
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
class TokenType(Enum):
    """Categories of lexical tokens; each member notes a sample spelling."""

    # Instruction mnemonic, e.g. stdlnout
    INSTRUCTION = 1
    # Double-quoted string literal, e.g. "Hello!"
    STRING = 2
    # Label declaration, e.g. @myLabel (member name keeps its historical spelling)
    LABEL_DECLERATION = 3
    # Variable reference, e.g. $myVar
    VARIABLE_REFERENCE = 4
    # Variable pointer, e.g. &myVar
    VARIABLE_POINTER = 5
    # Integer literal, e.g. 123
    INTEGER = 6
    # Floating-point literal, e.g. 0.123
    FLOAT = 7
    # Type name, e.g. -string
    TYPE = 8
    # Function reference, e.g. !myFunc
    FUNCTION_REFERENCE = 9
    # List reference, e.g. *myList
    LIST_REFERENCE = 10
    # Comment, e.g. # hi there
    COMMENT = 11
    # Line-number reference, e.g. %12
    LINE_REFERENCE = 12
    # Label reference, e.g. %myLabel
    LABEL_REFERENCE = 13
|
||||||
|
|
||||||
|
@dataclass
class Token:
    """One lexical token: its category paired with its payload."""

    # Category of the token.
    type: TokenType
    # Payload; the tokenizer stores text for most kinds and a number for
    # INTEGER, FLOAT and LINE_REFERENCE tokens.
    value: Any
|
||||||
|
|
||||||
|
# Instruction mnemonics accepted by tokenize().
_INSTRUCTIONS = frozenset((
    "stdlnout", "stdout", "stdin", "end", "return", "fun", "endfun", "getstrcharat",
    "getstrsize", "pusharg", "call", "set", "add", "subtract", "multiply", "divide",
    "equal", "inequal", "not", "greater", "lesser", "stoi", "stod", "tostring", "use",
    "extern", "jump", "gettype", "exists", "setlist", "setlistat", "getlistat", "getlistsize",
    "listappend", "if",
))

# Type names accepted after the "-" sigil.
_TYPES = frozenset(("string", "bool", "list", "char", "int", "double"))


def _fail(source, message, line, start_col, end_col):
    """Report a SyntaxError through ``traceback`` and exit with status 1."""
    traceback(source, "SyntaxError", message, line, start_col, end_col)
    sys.exit(1)


def _word_end(source, start):
    """Return the index just past the run of non-whitespace text at ``start``."""
    end = start
    while end < len(source) and source[end] not in whitespace:
        end += 1
    return end


def _first_bad_number_char(text, allow_fraction):
    """Return the offset of the first character that breaks a number, or None.

    Only ASCII digits are accepted.  When ``allow_fraction`` is true a single
    "." is also accepted, provided at least one more character follows it.
    """
    seen_dot = False
    for offset, char in enumerate(text):
        if char in digits:
            continue
        if allow_fraction and char == "." and not seen_dot and offset + 1 < len(text):
            seen_dot = True
            continue
        return offset
    return None


def tokenize(input_string: str) -> "list[Token]":
    """Split program text into a list of Token objects.

    Tokens are separated by whitespace.  Recognised forms:

    * ``# ...``      COMMENT; value is the text after ``#`` up to the newline
    * ``"..."``      STRING; ends at the next ``"`` (no escape sequences)
    * ``&name``      VARIABLE_POINTER
    * ``$name``      VARIABLE_REFERENCE
    * ``!name``      FUNCTION_REFERENCE
    * ``*name``      LIST_REFERENCE
    * ``-type``      TYPE; must be one of the known type names
    * ``@name``      LABEL_DECLERATION
    * ``%12``        LINE_REFERENCE; value is an int
    * ``%name``      LABEL_REFERENCE
    * bare word      INSTRUCTION; must be a known mnemonic
    * ``123``        INTEGER; value is an int
    * ``0.123``      FLOAT; value is a float

    Args:
        input_string: Complete source text to tokenize.

    Returns:
        The tokens in source order.

    Raises:
        SystemExit: On any syntax error, after the error has been handed to
            ``traceback``; the exit status is 1.
    """
    # sigil -> (token type, message at EOF, message when the name is missing,
    #           message when the name starts with a digit or "." / None if allowed)
    sigils = {
        "&": (TokenType.VARIABLE_POINTER,
              "Expected a variable name, got <EOF>",
              "Expected a variable name",
              "Variable names can't start with numbers."),
        "$": (TokenType.VARIABLE_REFERENCE,
              "Expected a variable name, got <EOF>",
              "Expected a variable name",
              "Variable names can't start with numbers."),
        "!": (TokenType.FUNCTION_REFERENCE,
              "Expected a function name, got <EOF>",
              "Expected a function name.",
              "Function names can't start with numbers."),
        "*": (TokenType.LIST_REFERENCE,
              "Expected a list reference, got <EOF>",
              "Expected a list reference.",
              "List references can't start with numbers."),
        "-": (TokenType.TYPE,
              "Expected a type name, got <EOF>",
              "Expected a type",
              None),
        "@": (TokenType.LABEL_DECLERATION,
              "Expected a label decleration, got <EOF>",
              "Expected a label decleration",
              None),
        "%": (TokenType.LABEL_REFERENCE,
              "Expected a label or line reference, got <EOF>",
              "Expected a label or line reference",
              None),
    }

    tokens: "list[Token]" = []
    length = len(input_string)
    pos = 0          # index of the character being examined
    line = 1         # 1-based line number of that character
    line_start = 0   # index of the first character of the current line

    # NOTE(review): traceback()'s column convention is not visible from this
    # file.  Columns passed below are 1-based and spans are inclusive, matching
    # the single-character errors that pass (column, column) - confirm.
    while pos < length:
        char = input_string[pos]
        # Deriving the column from the index keeps it consistent no matter
        # which kind of token came before on the line.
        column = pos - line_start + 1

        if char == "\n":
            line += 1
            pos += 1
            line_start = pos

        elif char in whitespace:
            # Blank, tab, and also "\r"/"\v"/"\f" so CRLF input is separated
            # the same way everywhere.
            pos += 1

        elif char == "#":
            end = input_string.find("\n", pos)
            if end == -1:
                end = length
            tokens.append(Token(TokenType.COMMENT, value=input_string[pos + 1:end]))
            # Leave the newline (if any) for the branch above to count.
            pos = end

        elif char == '"':
            closing = input_string.find('"', pos + 1)
            if closing == -1:
                # Also covers a quote that is the very last character.
                _fail(input_string, f"String was never closed.", line, column + 1, column + 1)
            tokens.append(Token(TokenType.STRING, value=input_string[pos + 1:closing]))
            # A literal may span lines; keep the position bookkeeping in step.
            last_newline = input_string.rfind("\n", pos + 1, closing)
            if last_newline != -1:
                line += input_string.count("\n", pos + 1, closing)
                line_start = last_newline + 1
            pos = closing + 1

        elif char in sigils:
            token_type, eof_msg, missing_msg, leading_digit_msg = sigils[char]
            start = pos + 1
            start_col = column + 1
            if start >= length:
                _fail(input_string, eof_msg, line, start_col, start_col)

            first = input_string[start]
            if first in whitespace:
                # A sigil with nothing attached has no name at all.
                _fail(input_string, missing_msg, line, start_col, start_col)
            starts_numeric = first in digits or first == "."
            if starts_numeric and leading_digit_msg is not None:
                _fail(input_string, leading_digit_msg, line, start_col, start_col)

            end = _word_end(input_string, start)
            word = input_string[start:end]
            end_col = start_col + len(word) - 1
            value = word

            if char == "-":
                if word not in _TYPES:
                    _fail(input_string, f"\"{word}\" is not a valid type.", line, start_col, end_col)
            elif char == "%" and starts_numeric:
                # "%<digits>" refers to a line number rather than a label.
                bad = _first_bad_number_char(word, allow_fraction=False)
                if bad is not None:
                    _fail(input_string, "Malformed line number.", line, start_col, start_col + bad)
                token_type = TokenType.LINE_REFERENCE
                value = int(word)

            tokens.append(Token(token_type, value=value))
            pos = end

        elif char in ascii_letters:
            end = _word_end(input_string, pos)
            word = input_string[pos:end]
            if word not in _INSTRUCTIONS:
                _fail(input_string, f"\"{word}\" isn't a valid instruction.",
                      line, column, column + len(word) - 1)
            tokens.append(Token(TokenType.INSTRUCTION, value=word))
            pos = end

        elif char in digits:
            end = _word_end(input_string, pos)
            word = input_string[pos:end]
            bad = _first_bad_number_char(word, allow_fraction=True)
            if bad is not None:
                _fail(input_string, "Malformed number.", line, column, column + bad)
            if "." in word:
                tokens.append(Token(TokenType.FLOAT, value=float(word)))
            else:
                tokens.append(Token(TokenType.INTEGER, value=int(word)))
            pos = end

        else:
            _fail(input_string, f"Unkown token \"{char}\"", line, column, column)

    return tokens
|
Reference in New Issue
Block a user