import numpy as np
import json
from keras.optimizers import Adam, SGD
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU
from keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
from model_settings import *


# Load the training dataset. The rest of this script uses its keys as the
# input texts and its values as indices into the one-hot target vector.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform-dependent default encoding of open().
with open("dataset.json", "r", encoding="utf-8") as f:
    dset = json.load(f)

# The number of lines in responses.txt becomes the width of the output layer
# and of every one-hot target vector built further down.
# NOTE(review): the file is decoded with the platform default encoding; only
# the line count is used here, not the text itself.
with open("responses.txt", "r") as responses_file:
    dset_size = sum(1 for _ in responses_file)

# The tokenizer splits text into words and maps every word to an integer id.
# It is fitted on the dataset keys, i.e. the input texts the model trains on.
tokenizer = Tokenizer()
prompts = list(dset)
tokenizer.fit_on_texts(prompts)

# One extra slot on top of the learned words: id 0 is never assigned to a
# word and is used below as the padding value.
vocab_size = 1 + len(tokenizer.word_index)

# Network: token ids -> embeddings -> self-attention -> flatten -> dense stack
# -> one output unit per response line.
# emb_size and inp_len are not defined in this file; presumably they come from
# the `model_settings` star import -- confirm there.
model = Sequential([
    # Turns each token id into a vector of emb_size numbers.
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len),
    # Self-attention lets every token's vector take the other tokens of the
    # same text into account (presumably keeps one vector per token -- confirm
    # against keras_self_attention).
    SeqSelfAttention(),
    # The layers above produce one vector per token; Dense layers below expect
    # a single flat vector per sample.
    Flatten(),
    Dense(units=1024, activation="relu"),
    # Dropout zeroes a random 50% of the incoming activations during training
    # so the network cannot lean on any single feature (regularisation).
    Dropout(rate=0.5),
    Dense(units=512, activation="relu"),
    Dense(units=512, activation="relu"),
    Dense(units=256, activation="relu"),
    # One raw, unbounded ("linear") score per response line.
    # NOTE(review): softmax is the usual choice for picking one class; it is
    # left as-is because it changes the output scale of the saved model.
    Dense(units=dset_size, activation="linear"),
])

# Build the training arrays: X collects fixed-length rows of token ids, y the
# matching one-hot target rows.
X = []
y = []

for prompt, response_index in dset.items():
    token_ids = list(tokenizer.texts_to_sequences([prompt])[0])
    # Right-pad with zeros, then cut to exactly inp_len ids, so short texts
    # are padded and long texts are truncated at the end.
    token_ids.extend([0] * inp_len)
    X.append(np.array(token_ids[:inp_len]))

    # One-hot target, e.g. 0 0 0 1 0 0: only the position(s) named by the
    # dataset value are set to 1.
    # NOTE(review): presumably the value is the line number of the expected
    # response in responses.txt -- confirm against the dataset builder.
    target = np.zeros(dset_size)
    target[response_index] = 1
    y.append(target)

# Stack the per-sample rows into single numpy arrays for model.fit().
X = np.asarray(X)
y = np.asarray(y)

# Training configuration: Adam with its default hyperparameters, mean squared
# error against the one-hot targets, and accuracy reported while training.
# TODO: change the loss
# NOTE(review): a cross-entropy loss is the usual choice for single-label
# classification, but switching also changes the scale of the scores the
# saved model emits -- check whatever consumes chatbot.keras first.
optimizer = Adam()
model.compile(loss="mse", optimizer=optimizer, metrics=["accuracy"])

# Train the model. `epochs` is how many full passes are made over the data;
# `batch_size` is how many samples are processed per weight update.
# Loss measures how far the outputs are from the targets (lower is better;
# MSE on linear outputs has no fixed upper bound), while accuracy is the
# fraction of samples answered correctly (0 to 1, higher is better).
# NOTE: fit()'s `workers` / `use_multiprocessing` arguments only apply to
# generator or keras.utils.Sequence input, so they would not speed up training
# on these in-memory numpy arrays.
model.fit(x=X, y=y, batch_size=8, epochs=10)

# Print the layer-by-layer overview, including the parameter counts.
model.summary()

# Write the trained model to disk.
model.save("chatbot.keras")