50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
import re
|
|
import random
|
|
|
|
regex = re.compile("[^a-zA-Z ]")
|
|
|
|
class Token:
    """Holds the words observed to follow one token in the training data."""

    def __init__(self):
        # Words seen immediately after this token; duplicates are kept on
        # purpose so that frequent successors are sampled more often.
        self.completions: list[str] = []

    def __repr__(self):
        return repr(self.completions)

    def add_completion(self, word: str):
        """Record *word* as a possible continuation of this token."""
        self.completions.append(word)
|
|
|
|
class Model:
|
|
def __init__(self):
|
|
self.tokens: dict[str, Token] = {}
|
|
def predict(self, previous: str) -> str | None:
|
|
if previous in self.tokens:
|
|
completion = self.tokens[previous].completions
|
|
return random.choice(completion)
|
|
else:
|
|
return None
|
|
|
|
def tokenise(instr: str) -> list[str]:
    """Lower-case *instr* and split it into purely alphabetic word tokens.

    Fixes two defects in the previous version: the character filter stripped
    newlines *before* the split ran, fusing words across line breaks
    ("a\\nb" -> "ab"); and splitting on single delimiters left empty-string
    tokens wherever spaces ran together or trailed the input.
    """
    # Keep letters, spaces, and newlines; drop digits/punctuation entirely.
    cleaned = re.sub(r"[^a-zA-Z \n]", "", instr).lower()
    # str.split() with no argument collapses whitespace runs and yields no
    # empty strings, so the token list contains only real words.
    return cleaned.split()
|
|
|
|
def train(data: str) -> Model:
    """Build a Model by recording, for each word, the word that follows it.

    Every adjacent pair in the tokenised text contributes one completion, so
    repeated pairs naturally weight prediction toward common successors.
    """
    model = Model()
    words = tokenise(data)
    # Walk adjacent (word, next_word) pairs; the final word has no successor.
    for word, next_word in zip(words, words[1:]):
        if word not in model.tokens:
            model.tokens[word] = Token()
        model.tokens[word].add_completion(next_word)
    return model
|
|
|
|
def prompt(prompt: str, model: Model) -> str:
    """Generate up to 199 words continuing *prompt* using *model*.

    Generation seeds from the last token of the prompt and stops early as
    soon as the model has never seen the current word (predict returns None).
    Returns the generated words, each followed by a single space.
    """
    words = tokenise(prompt)
    if not words:
        # Guard: an empty token list would crash on words[-1] below.
        return ""
    current = words[-1]
    pieces: list[str] = []
    # range(1, 200) -> at most 199 generated words, matching the original cap.
    for _ in range(1, 200):
        current = model.predict(current)
        if current is None:
            break
        pieces.append(current)
    # join instead of repeated += ; output shape is unchanged (trailing space
    # after every word, empty string when nothing was generated).
    return "".join(word + " " for word in pieces)
|
|
|