"""Tokenizer: converts program source text into a flat list of ``Token`` objects."""

from dataclasses import dataclass
from typing import Any, NoReturn
from enum import Enum
from error import traceback
from string import ascii_letters, digits, whitespace
import sys


class TokenType(Enum):
    """Kinds of token produced by ``tokenize``."""

    INSTRUCTION = 1  # example: stdlnout
    STRING = 2  # example: "Hello!"
    LABEL_DECLERATION = 3  # example: @myLabel
    VARIABLE_REFERENCE = 4  # example: $myVar
    VARIABLE_POINTER = 5  # example: &myVar
    INTEGER = 6  # example: 123
    FLOAT = 7  # example: 0.123
    TYPE = 8  # example: -string
    FUNCTION_REFERENCE = 9  # example: !myFunc
    LIST_REFERENCE = 10  # example: *myList
    COMMENT = 11  # example: # hi there
    LINE_REFERENCE = 12  # example: %12
    LABEL_REFERENCE = 13  # example: %myLabel


@dataclass
class Token:
    """A single lexical token."""

    type: TokenType  # which kind of token this is
    # str for most kinds; int for INTEGER and LINE_REFERENCE; float for FLOAT
    value: Any


# Words accepted as instructions.
_INSTRUCTIONS = frozenset({
    "stdlnout", "stdout", "stdin", "end", "return", "fun", "endfun", "getstrcharat",
    "getstrsize", "pusharg", "call", "set", "add", "subtract", "multiply", "divide",
    "equal", "inequal", "not", "greater", "lesser", "stoi", "stod", "tostring", "use",
    "extern", "jump", "gettype", "exists", "setlist", "setlistat", "getlistat", "getlistsize",
    "listappend", "if",
})

# Names accepted after the "-" type sigil.
_TYPES = frozenset({"string", "bool", "list", "char", "int", "double"})

# Characters that may appear inside an instruction word.
_WORD_CHARS = ascii_letters + digits

# Sigils whose operand is taken verbatim as the token value.
# sigil -> (token type,
#           message when the input ends right after the sigil,
#           message when no name follows the sigil,
#           message when the name starts with a digit or ".", or None if that is allowed)
_NAME_SIGILS = {
    "&": (
        TokenType.VARIABLE_POINTER,
        "Expected a variable name, got ",
        "Expected a variable name",
        "Variable names can't start with numbers.",
    ),
    "$": (
        TokenType.VARIABLE_REFERENCE,
        "Expected a variable name, got ",
        "Expected a variable name",
        "Variable names can't start with numbers.",
    ),
    "!": (
        TokenType.FUNCTION_REFERENCE,
        "Expected a function name, got ",
        "Expected a function name.",
        "Function names can't start with numbers.",
    ),
    "*": (
        TokenType.LIST_REFERENCE,
        "Expected a list reference, got ",
        "Expected a list reference.",
        "List references can't start with numbers.",
    ),
    "@": (
        TokenType.LABEL_DECLERATION,
        "Expected a label decleration, got ",
        "Expected a label decleration",
        None,
    ),
}


def _locate(text: str, index: int) -> tuple[int, int]:
    """Return the 1-based ``(line, column)`` of offset ``index`` in ``text``."""
    line = text.count("\n", 0, index) + 1
    # rfind yields -1 when there is no earlier newline, which makes the
    # first character of the text column 1.
    column = index - text.rfind("\n", 0, index)
    return line, column


def _fail(text: str, message: str, start: int, end: int | None = None) -> NoReturn:
    """Report a SyntaxError through ``traceback`` and exit with status 1.

    ``start`` and ``end`` are offsets into ``text``; ``end`` defaults to
    ``start`` for errors that point at a single position.  The line is
    always the one containing ``start``.

    NOTE(review): ``traceback`` is defined in another module; columns are
    passed 1-based, with the end column being where scanning stopped, as
    most of the previous call sites did.  Confirm against ``error.traceback``.
    """
    line, start_col = _locate(text, start)
    end_col = start_col if end is None else start_col + (end - start)
    traceback(text, "SyntaxError", message, line, start_col, end_col)
    sys.exit(1)


def _scan_until_whitespace(text: str, start: int) -> int:
    """Return the offset of the first whitespace character at or after ``start``."""
    end = start
    while end < len(text) and text[end] not in whitespace:
        end += 1
    return end


def _read_operand(
    text: str,
    sigil_pos: int,
    eof_message: str,
    missing_message: str,
    digit_message: str | None = None,
) -> tuple[str, int]:
    """Read the non-empty name that follows the sigil at ``sigil_pos``.

    Returns ``(name, end)`` where ``end`` is the offset just past the name.
    Exits through ``_fail`` when the name is missing or, if
    ``digit_message`` is given, when it starts with a digit or ".".
    """
    start = sigil_pos + 1
    if start >= len(text):
        _fail(text, eof_message, start)
    first = text[start]
    if digit_message is not None and (first in digits or first == "."):
        _fail(text, digit_message, start)
    if first in whitespace:
        # Covers newline as well as space/tab, so an empty name is never emitted.
        _fail(text, missing_message, start)
    end = _scan_until_whitespace(text, start)
    return text[start:end], end


def _read_reference(text: str, sigil_pos: int) -> tuple[Token, int]:
    """Read a ``%`` reference: a line number (``%12``) or a label (``%name``)."""
    target, end = _read_operand(
        text,
        sigil_pos,
        "Expected a label or line reference, got ",
        "Expected a label or line reference",
    )
    if target[0] in digits or target[0] == ".":
        # Line number: every character must be an ASCII digit.
        bad = next((i for i, char in enumerate(target) if char not in digits), None)
        if bad is not None:
            start = sigil_pos + 1
            _fail(text, "Malformed line number.", start, start + bad)
        return Token(TokenType.LINE_REFERENCE, value=int(target)), end
    return Token(TokenType.LABEL_REFERENCE, value=target), end


def _read_string(text: str, quote_pos: int) -> tuple[Token, int]:
    """Read the string literal whose opening quote is at ``quote_pos``."""
    start = quote_pos + 1
    close = text.find('"', start)
    if close == -1:
        # Also reached when the opening quote is the last character of the input.
        _fail(text, "String was never closed.", start)
    return Token(TokenType.STRING, value=text[start:close]), close + 1


def _read_instruction(text: str, start: int) -> tuple[Token, int]:
    """Read the instruction word starting at ``start``."""
    end = start
    while end < len(text) and text[end] in _WORD_CHARS:
        end += 1
    if end < len(text) and text[end] not in whitespace:
        # Keep the offending character in the reported word so the message shows it.
        end += 1
    word = text[start:end]
    if word not in _INSTRUCTIONS:
        _fail(text, f"\"{word}\" isn't a valid instruction.", start, end)
    return Token(TokenType.INSTRUCTION, value=word), end


def _read_number(text: str, start: int) -> tuple[Token, int]:
    """Read an INTEGER, or a FLOAT when the literal contains a single "."."""
    end = start
    seen_dot = False
    while end < len(text) and text[end] not in whitespace:
        char = text[end]
        if char == "." and not seen_dot:
            seen_dot = True
        elif char not in digits:
            _fail(text, "Malformed number.", start, end)
        end += 1
    literal = text[start:end]
    if seen_dot:
        return Token(TokenType.FLOAT, value=float(literal)), end
    return Token(TokenType.INTEGER, value=int(literal)), end


def tokenize(input_string: str) -> list[Token]:
    """Split ``input_string`` into tokens.

    Whitespace separates tokens and is otherwise ignored.  The first
    character of a token decides its kind:

    * ``#`` starts a COMMENT running to the end of the line
    * ``&``, ``$``, ``!``, ``*``, ``@`` and ``%`` introduce a name that runs
      to the next whitespace (``%`` followed by digits is a LINE_REFERENCE)
    * ``-`` introduces a TYPE, which must be one of the known type names
    * ``"`` starts a STRING that runs to the next ``"``
    * an ASCII letter starts an INSTRUCTION, which must be a known instruction
    * a digit starts an INTEGER or, if it contains one ``.``, a FLOAT

    Args:
        input_string: the program text to tokenize.

    Returns:
        The tokens in source order.

    Raises:
        SystemExit: on any syntax error, after the error has been reported
            through ``traceback``.
    """
    tokens: list[Token] = []
    length = len(input_string)
    pos = 0  # offset of the next unread character

    while pos < length:
        char = input_string[pos]

        if char in whitespace:
            pos += 1
        elif char == "#":
            end = input_string.find("\n", pos + 1)
            if end == -1:
                end = length
            tokens.append(Token(TokenType.COMMENT, value=input_string[pos + 1:end]))
            pos = end  # the newline itself is skipped as whitespace
        elif char in _NAME_SIGILS:
            token_type, eof_message, missing_message, digit_message = _NAME_SIGILS[char]
            name, pos = _read_operand(
                input_string, pos, eof_message, missing_message, digit_message
            )
            tokens.append(Token(token_type, value=name))
        elif char == "-":
            name, end = _read_operand(
                input_string, pos, "Expected a type name, got ", "Expected a type"
            )
            if name not in _TYPES:
                _fail(input_string, f"\"{name}\" is not a valid type.", pos + 1, end)
            tokens.append(Token(TokenType.TYPE, value=name))
            pos = end
        elif char == "%":
            token, pos = _read_reference(input_string, pos)
            tokens.append(token)
        elif char == '"':
            token, pos = _read_string(input_string, pos)
            tokens.append(token)
        elif char in ascii_letters:
            token, pos = _read_instruction(input_string, pos)
            tokens.append(token)
        elif char in digits:
            token, pos = _read_number(input_string, pos)
            tokens.append(token)
        else:
            _fail(input_string, f"Unkown token \"{char}\"", pos)

    return tokens