"""Train the chatbot intent classifier and save it to ``chatbot.keras``.

Inputs read from the working directory:

* ``dataset.json`` -- a JSON object whose keys are input texts and whose
  values index into the model's output vector.
* ``responses.txt`` -- only its line count is used here; it sets the size of
  the output layer (presumably one possible response per line).

``emb_size`` and ``inp_len`` come from ``model_settings`` via the star import.
"""
import json

import numpy as np
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU
from keras.models import Sequential
from keras.optimizers import Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

# Kept as the last import so it shadows names exactly as it did before.
from model_settings import *

# Be explicit about the encoding: the platform default differs between
# operating systems, and JSON text is UTF-8 by specification.
with open("dataset.json", "r", encoding="utf-8") as f:
    dset = json.load(f)

# Only the number of lines matters here, so undecodable bytes are replaced
# instead of aborting the run, and the file is streamed rather than being
# loaded into a list first.
with open("responses.txt", "r", encoding="utf-8", errors="replace") as f:
    dset_size = sum(1 for _ in f)

questions = list(dset)

# A tokenizer splits text into words; it may also do other things such as
# lower-casing all the letters.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
# An attention layer makes the model learn the main information in the text,
# and not the text itself.
model.add(SeqSelfAttention())
# The embedding and self-attention layers output a 2D array per sample (a list
# of words with a list of numbers for each word); Flatten makes it 1D for the
# Dense layers.
model.add(Flatten())
# 1024 = 2 ** 10. ReLU is fast both to train and to compute: no division,
# square roots or powers, just (x > 0) * x.
model.add(Dense(1024, activation="relu"))
# Dropout makes the task harder by removing some information: 0.5 means 50% of
# the values are reset to 0, so the model focuses on what is important and
# does not learn from data that is only there by accident.
model.add(Dropout(0.5))
model.add(Dense(512, activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(256, activation="relu"))
# "linear" does nothing at all to the output. The activation does not matter
# much in this particular line (elsewhere it can be a big deal); something
# like softmax is a candidate still to be tested.
model.add(Dense(dset_size, activation="linear"))

# Build the training data: X holds the padded token ids, y the target vectors.
sequences = tokenizer.texts_to_sequences(questions)
X = np.zeros((len(questions), inp_len), dtype=int)
y = np.zeros((len(questions), dset_size))
for row, (question, tokens) in enumerate(zip(questions, sequences)):
    # Keep at most inp_len tokens; shorter inputs stay zero-padded at the end.
    kept = tokens[:inp_len]
    X[row, :len(kept)] = kept
    # 0 0 0 1 0 0 0 0 0 -- the output neuron of the correct response is set to 1.
    y[row, dset[question]] = 1

# compile() holds the settings for the training.
# TODO: change the loss
model.compile(optimizer=Adam(), loss="mse", metrics=["accuracy",])

# epochs is how many times the model reads the whole data set; batch_size is
# an optimization that trains on several messages at the same time.
# Loss and accuracy move in opposite directions: loss is how far the output is
# from the correct one (lower is better), accuracy is how often the model gets
# the answer right, from 0 to 1.
# Original author's tip: pass workers=4, use_multiprocessing=True to fit() if
# you don't have a GPU (NOTE(review): confirm your installed Keras version
# still accepts these arguments).
model.fit(X, y, epochs=10, batch_size=8)

# Prints information about the model; useful for checking the parameter count.
model.summary()
model.save("chatbot.keras")