Upload files to "/"
This commit is contained in:
459
tokenizer.py
Normal file
459
tokenizer.py
Normal file
@@ -0,0 +1,459 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
from error import traceback
|
||||
from string import ascii_letters, digits, whitespace
|
||||
import sys
|
||||
|
||||
|
||||
class TokenType(Enum):
    """Categories of token emitted by the tokenizer.

    The comment beside each member shows a sample of the source text
    that the member stands for.
    """

    INSTRUCTION = 1          # e.g.  stdlnout
    STRING = 2               # e.g.  "Hello!"
    LABEL_DECLERATION = 3    # e.g.  @myLabel
    VARIABLE_REFERENCE = 4   # e.g.  $myVar
    VARIABLE_POINTER = 5     # e.g.  &myVar
    INTEGER = 6              # e.g.  123
    FLOAT = 7                # e.g.  0.123
    TYPE = 8                 # e.g.  -string
    FUNCTION_REFERENCE = 9   # e.g.  !myFunc
    LIST_REFERENCE = 10      # e.g.  *myList
    COMMENT = 11             # e.g.  # hi there
    LINE_REFERENCE = 12      # e.g.  %12
    LABEL_REFERENCE = 13     # e.g.  %myLabel
|
||||
|
||||
@dataclass
class Token:
    """One lexical unit: a category paired with the payload read from the source."""

    # Which category of token this is.
    type: TokenType
    # Payload for the token: text for most categories, a number for the numeric ones.
    value: Any
|
||||
|
||||
def tokenize(input_string: str):
    """Split program text into a flat list of ``Token`` objects.

    Recognised forms:

    * ``# ...``            -> COMMENT (text after ``#`` up to the end of the line)
    * ``&name``            -> VARIABLE_POINTER
    * ``$name``            -> VARIABLE_REFERENCE
    * ``!name``            -> FUNCTION_REFERENCE
    * ``*name``            -> LIST_REFERENCE
    * ``-type``            -> TYPE (must be one of the known type names)
    * ``@name``            -> LABEL_DECLERATION
    * ``%123`` / ``%name`` -> LINE_REFERENCE (int) / LABEL_REFERENCE
    * ``"text"``           -> STRING (no escape sequences; may span lines)
    * bare word            -> INSTRUCTION (must be a known instruction)
    * ``123`` / ``0.5``    -> INTEGER (int) / FLOAT (float)

    Names after a sigil run until the next space, tab, newline or end of
    input. Spaces, tabs and newlines between tokens are skipped.

    Args:
        input_string: The complete source text to tokenize.

    Returns:
        The tokens in source order.

    Raises:
        SystemExit: On any syntax error, after the problem has been reported
            through ``traceback`` (exit status 1).
    """
    instructions = {
        "stdlnout", "stdout", "stdin", "end", "return", "fun", "endfun", "getstrcharat",
        "getstrsize", "pusharg", "call", "set", "add", "subtract", "multiply", "divide",
        "equal", "inequal", "not", "greater", "lesser", "stoi", "stod", "tostring", "use",
        "extern", "jump", "gettype", "exists", "setlist", "setlistat", "getlistat", "getlistsize",
        "listappend", "if",
    }
    types = {"string", "bool", "list", "char", "int", "double"}

    word_enders = " \t\n"                     # characters that terminate a sigil name
    identifier_chars = ascii_letters + digits

    # sigil -> (token type,
    #           message when the input ends right after the sigil,
    #           message when the name starts with a digit or "." (None: no such check),
    #           message when no name follows the sigil)
    sigils = {
        "&": (TokenType.VARIABLE_POINTER,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "$": (TokenType.VARIABLE_REFERENCE,
              "Expected a variable name, got <EOF>",
              "Variable names can't start with numbers.",
              "Expected a variable name"),
        "!": (TokenType.FUNCTION_REFERENCE,
              "Expected a function name, got <EOF>",
              "Function names can't start with numbers.",
              "Expected a function name."),
        "*": (TokenType.LIST_REFERENCE,
              "Expected a list reference, got <EOF>",
              "List references can't start with numbers.",
              "Expected a list reference."),
        "-": (TokenType.TYPE,
              "Expected a type name, got <EOF>",
              None,
              "Expected a type"),
        "@": (TokenType.LABEL_DECLERATION,
              "Expected a label decleration, got <EOF>",
              None,
              "Expected a label decleration"),
        "%": (TokenType.LABEL_REFERENCE,
              "Expected a label or line reference, got <EOF>",
              None,
              "Expected a label or line reference"),
    }

    tokens: list[Token] = []
    length = len(input_string)
    pos = 0          # index of the character being examined
    line = 1         # 1-based number of the line containing ``pos``
    line_start = 0   # index of the first character of that line

    def fail(message, first, last):
        """Report a syntax error covering indices ``first``..``last`` and exit."""
        # NOTE(review): columns are passed as 1-based, inclusive start/end.
        # That convention is assumed, not verified - confirm against
        # error.traceback.
        traceback(
            input_string, "SyntaxError", message, line,
            first - line_start + 1, last - line_start + 1,
        )
        sys.exit(1)

    while pos < length:
        char = input_string[pos]

        if char == "\n":
            line += 1
            line_start = pos + 1
            pos += 1
            continue

        if char in " \t":
            pos += 1
            continue

        if char == "#":
            # The comment runs to the end of the line; the newline itself is
            # left for the next iteration so the line counter sees it.
            end = input_string.find("\n", pos)
            if end == -1:
                end = length
            tokens.append(Token(TokenType.COMMENT, value=input_string[pos + 1:end]))
            pos = end
            continue

        if char in sigils:
            kind, eof_message, digit_message, missing_message = sigils[char]
            start = pos + 1
            if start == length:
                fail(eof_message, start, start)
            first = input_string[start]
            if digit_message is not None and (first in digits or first == "."):
                fail(digit_message, start, start)
            if first in word_enders:
                # Nothing follows the sigil on this line (newline, space or
                # tab): reject it instead of emitting an empty name.
                fail(missing_message, start, start)

            end = start
            while end < length and input_string[end] not in word_enders:
                end += 1
            word = input_string[start:end]
            value = word

            if char == "-":
                if word not in types:
                    fail(f'"{word}" is not a valid type.', start, end - 1)
            elif char == "%" and (first in digits or first == "."):
                # A leading digit (or ".") means a line-number reference,
                # which must consist of ASCII digits only.
                for index in range(start, end):
                    if input_string[index] not in digits:
                        fail("Malformed line number.", start, index)
                kind = TokenType.LINE_REFERENCE
                value = int(word)

            tokens.append(Token(kind, value=value))
            pos = end
            continue

        if char == '"':
            content_start = pos + 1
            close = input_string.find('"', content_start)
            if close == -1:
                fail("String was never closed.", content_start, content_start)
            text = input_string[content_start:close]
            tokens.append(Token(TokenType.STRING, value=text))
            # Keep line bookkeeping correct when the literal spans lines.
            newline_count = text.count("\n")
            if newline_count:
                line += newline_count
                line_start = content_start + text.rfind("\n") + 1
            pos = close + 1
            continue

        if char in ascii_letters:
            end = pos
            while end < length and input_string[end] in identifier_chars:
                end += 1
            # A character that is neither alphanumeric nor a word terminator
            # is included in the reported word; such a word can never match a
            # known instruction, so it always ends in the error below.
            if end < length and input_string[end] not in word_enders:
                end += 1
            word = input_string[pos:end]
            if word not in instructions:
                fail(f'"{word}" isn\'t a valid instruction.', pos, end - 1)
            tokens.append(Token(TokenType.INSTRUCTION, value=word))
            pos = end
            continue

        if char in digits:
            end = pos
            while end < length and input_string[end] in digits:
                end += 1
            is_float = False
            # Accept exactly one decimal point, and only when digits follow it.
            if (
                end + 1 < length
                and input_string[end] == "."
                and input_string[end + 1] in digits
            ):
                is_float = True
                end += 1
                while end < length and input_string[end] in digits:
                    end += 1
            if end < length and input_string[end] not in whitespace:
                fail("Malformed number.", pos, end)

            text = input_string[pos:end]
            if is_float:
                tokens.append(Token(TokenType.FLOAT, value=float(text)))
            else:
                tokens.append(Token(TokenType.INTEGER, value=int(text)))

            # Swallow a single whitespace terminator other than "\n": the main
            # loop only accepts space, tab and newline, while a number may be
            # ended by any whitespace character. "\n" is left in place so the
            # line counter sees it.
            if end < length and input_string[end] != "\n":
                end += 1
            pos = end
            continue

        fail(f'Unkown token "{char}"', pos, pos)

    return tokens
|
Reference in New Issue
Block a user