from dataclasses import dataclass
from typing import Any
from enum import Enum
from error import traceback
from string import ascii_letters, digits, whitespace


class TokenType(Enum):
    """Every kind of token that ``tokenize`` can emit."""

    INSTRUCTION = 1  # example: stdlnout
    STRING = 2  # example: "Hello!"
    LABEL_DECLERATION = 3  # example: @myLabel
    VARIABLE_REFERENCE = 4  # example: $myVar
    VARIABLE_POINTER = 5  # example: &myVar
    INTEGER = 6  # example: 123
    FLOAT = 7  # example: 0.123
    TYPE = 8  # example: -string
    FUNCTION_REFERENCE = 9  # example: !myFunc
    LIST_REFERENCE = 10  # example: *myList
    COMMENT = 11  # example: # hi there
    LINE_REFERENCE = 12  # example: %12
    LABEL_REFERENCE = 13  # example: %myLabel
    BOOL = 14  # example: true
    EOF = 15  # appended exactly once, after the last real token


@dataclass
class Token:
    """A single lexical token produced by ``tokenize``."""

    type: TokenType  # which kind of token this is
    value: Any  # str, int, float or bool depending on ``type``; None for EOF


# Bare words that are accepted as instructions.
_INSTRUCTIONS = frozenset({
    "stdlnout", "stdout", "stdin", "end", "return", "fun", "endfun",
    "getstrcharat", "getstrsize", "pusharg", "call", "set", "add",
    "subtract", "multiply", "divide", "equal", "inequal", "not", "greater",
    "lesser", "stoi", "stod", "tostring", "use", "extern", "jump", "gettype",
    "exists", "setlist", "setlistat", "getlistat", "getlistsize",
    "listappend", "if",
})

# Characters that end a prefixed name or a bare word.
_WORD_END = "\t \n"
# Characters allowed inside a bare word / inside a numeric literal.
_WORD_CHARS = ascii_letters + digits
_NUMBER_CHARS = digits + "."

# One-character prefixes whose operand is a plain name. Each entry is
# (token type, message at end of input, message when the name starts with a
# digit or "." (None = no such restriction), message when the name is missing).
_NAME_SIGILS = {
    "&": (
        TokenType.VARIABLE_POINTER,
        "Expected a variable name, got ",
        "Variable names can't start with numbers.",
        "Expected a variable name",
    ),
    "$": (
        TokenType.VARIABLE_REFERENCE,
        "Expected a variable name, got ",
        "Variable names can't start with numbers.",
        "Expected a variable name",
    ),
    "!": (
        TokenType.FUNCTION_REFERENCE,
        "Expected a function name, got ",
        "Function names can't start with numbers.",
        "Expected a function name.",
    ),
    "*": (
        TokenType.LIST_REFERENCE,
        "Expected a list reference, got ",
        "List references can't start with numbers.",
        "Expected a list reference.",
    ),
    "@": (
        TokenType.LABEL_DECLERATION,
        "Expected a label decleration, got ",
        None,
        "Expected a label decleration",
    ),
}


def _locate(text: str, index: int) -> tuple[int, int]:
    """Return the 1-based (line, column) of ``text[index]``.

    ``index`` may equal ``len(text)`` to describe the end of the input.
    """
    line = text.count("\n", 0, index) + 1
    # rfind gives -1 on the first line, which makes the column index + 1.
    column = index - text.rfind("\n", 0, index)
    return line, column


def _scan_to(text: str, start: int, stops: str) -> int:
    """Return the first index >= ``start`` holding a char in ``stops`` (or len)."""
    end = start
    while end < len(text) and text[end] not in stops:
        end += 1
    return end


def tokenize(input_string: str) -> list[Token]:
    """Split source text into a list of tokens terminated by an EOF token.

    The first character of each token decides its kind: ``#`` comment (to end
    of line), ``& $ ! * @`` prefixed names, ``-`` type name or negative number,
    ``%`` line or label reference, ``"`` string, a letter for an instruction or
    ``true``/``false``, a digit for a number. Spaces, tabs and newlines
    separate tokens.

    Syntax errors are reported through ``traceback`` from the project's
    ``error`` module with the 1-based line and start/end columns of the
    offending text.
    NOTE(review): ``traceback`` is presumed not to return (exit or raise);
    confirm against the ``error`` module.

    Args:
        input_string: the complete program text.

    Returns:
        The tokens in source order, followed by ``Token(TokenType.EOF, None)``.
    """
    tokens: list[Token] = []
    length = len(input_string)

    def fail(message, start, end=None):
        # Positions are derived from indices on demand so they cannot drift.
        line, start_col = _locate(input_string, start)
        end_col = start_col if end is None else _locate(input_string, end)[1]
        traceback(input_string, "SyntaxError", message, line, start_col, end_col)

    def number_token(start, end, sign=""):
        # Build an INTEGER or FLOAT token from input_string[start:end].
        dots = 0
        for index in range(start, end):
            char = input_string[index]
            if char == ".":
                dots += 1
            # A second "." is rejected here instead of crashing in float().
            if char not in _NUMBER_CHARS or dots > 1:
                fail("Malformed number.", start, index)
                break
        literal = sign + input_string[start:end]
        if "." in literal:
            return Token(TokenType.FLOAT, value=float(literal))
        return Token(TokenType.INTEGER, value=int(literal))

    pos = 0  # index of the character currently being examined
    while pos < length:
        char = input_string[pos]
        operand = pos + 1  # first index after a one-character prefix
        first = input_string[operand:operand + 1]  # "" at end of input

        if char in _WORD_END:
            pos += 1

        elif char == "#":
            # The comment text excludes the "#" and the line break.
            end = input_string.find("\n", operand)
            if end == -1:
                end = length
            tokens.append(Token(TokenType.COMMENT, value=input_string[operand:end]))
            pos = end + 1

        elif char in _NAME_SIGILS:
            kind, eof_message, digit_message, missing_message = _NAME_SIGILS[char]
            if not first:
                fail(eof_message, operand)
            elif digit_message is not None and (first in digits or first == "."):
                fail(digit_message, operand)
            elif first in _WORD_END:
                # A prefix followed directly by whitespace has no name at all.
                fail(missing_message, operand)
            end = _scan_to(input_string, operand, _WORD_END)
            tokens.append(Token(kind, value=input_string[operand:end]))
            pos = end + 1

        elif char == "-":
            if not first:
                fail("Expected a type name, got ", operand)
            elif first in _WORD_END:
                fail("Expected a type", operand)
            end = _scan_to(input_string, operand, _WORD_END)
            if first and first in digits:
                # "-" directly followed by a digit is a negative number.
                tokens.append(number_token(operand, end, sign="-"))
            else:
                tokens.append(Token(TokenType.TYPE, value=input_string[operand:end]))
            pos = end + 1

        elif char == "%":
            if not first:
                fail("Expected a label or line reference, got ", operand)
            elif first in _WORD_END:
                fail("Expected a label or line reference", operand)
            end = _scan_to(input_string, operand, _WORD_END)
            target = input_string[operand:end]
            if first and (first in digits or first == "."):
                # Line references must consist of digits only.
                bad = next(
                    (i for i in range(operand, end) if input_string[i] not in digits),
                    None,
                )
                if bad is not None:
                    fail("Malformed line number.", operand, bad)
                tokens.append(Token(TokenType.LINE_REFERENCE, value=int(target)))
            else:
                tokens.append(Token(TokenType.LABEL_REFERENCE, value=target))
            pos = end + 1

        elif char == '"':
            end = input_string.find('"', operand)
            if end == -1:
                # Also covers an opening quote that is the last character.
                fail("String was never closed.", operand)
                end = length
            tokens.append(Token(TokenType.STRING, value=input_string[operand:end]))
            pos = end + 1  # skip the closing quote

        elif char in ascii_letters:
            # A word runs to whitespace; the first non-alphanumeric character
            # is still included so it shows up in the error message.
            end = pos
            while end < length and input_string[end] not in _WORD_END:
                end += 1
                if input_string[end - 1] not in _WORD_CHARS:
                    break
            word = input_string[pos:end]
            if word in _INSTRUCTIONS:
                tokens.append(Token(TokenType.INSTRUCTION, value=word))
            elif word in ("true", "false"):
                tokens.append(Token(TokenType.BOOL, value=word == "true"))
            else:
                line, column = _locate(input_string, pos)
                # NOTE(review): the start column is one past the word's first
                # character, as before; confirm what traceback expects.
                traceback(
                    input_string,
                    "SyntaxError",
                    f"\"{word}\" isn't a valid instruction.",
                    line,
                    column + 1,
                    _locate(input_string, end)[1],
                )
            pos = end + 1

        elif char in digits:
            # Unlike names, a number is ended by any whitespace character.
            end = _scan_to(input_string, pos, whitespace)
            tokens.append(number_token(pos, end))
            pos = end + 1

        else:
            fail(f"Unkown token \"{char}\"", pos)
            pos += 1

    tokens.append(Token(TokenType.EOF, None))
    return tokens