neural-chatbot / train.py
ierhon's picture
Move inp_len to model_settings.py
f8fed98
raw
history blame
3.45 kB
import numpy as np
import json
from keras.optimizers import Adam, SGD
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU
from keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
from model_settings import *
# Load the training data. The code below calls `.keys()` on the result, so the
# JSON document is expected to be an object whose keys are the input phrases.
# TODO: move the outputs into a separate file, so it would be "key": 0, "key2": 1 etc
with open("dataset.json", "r", encoding="utf-8") as f:  # JSON text is UTF-8; don't rely on the platform default
    dset = json.load(f)

dset_size = len(dset)  # number of dataset entries; also used as the width of the output layer

# A tokenizer splits text into words and maps every word to an integer id
# (it also does some normalisation such as lower-casing).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(dset.keys()))

# keras.preprocessing.text.Tokenizer has no get_vocabulary() method (that one
# belongs to the TextVectorization layer); the learned vocabulary lives in
# `word_index`. Its ids start at 1 and id 0 is reserved for padding, so the
# embedding table needs one extra row.
vocab_size = len(tokenizer.word_index) + 1
# Network definition, declared as one ordered list of layers.
# `emb_size` and `inp_len` are not defined in this file; presumably they are
# provided by the star import from model_settings -- verify there.
model = Sequential([
    # Turns every token id into a vector of `emb_size` numbers; inputs are
    # sequences of exactly `inp_len` ids.
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len),
    # Self-attention over the sequence, so the model can weigh which words
    # carry the main information instead of memorising the text itself.
    SeqSelfAttention(),
    # The layers above emit one vector per word (a 2D array per sample);
    # Dense layers need a single flat vector.
    Flatten(),
    # 1024 (2 ** 10) ReLU units. ReLU is cheap to train and to evaluate:
    # no division, roots or powers, just (x > 0) * x.
    Dense(1024, activation="relu"),
    # Dropout makes the task harder by removing information: a rate of 0.5
    # zeroes half of the activations during training, pushing the model to
    # rely on what matters rather than on accidental patterns in the data.
    Dropout(0.5),
    Dense(512, activation="relu"),
    Dense(512, activation="relu"),
    Dense(256, activation="relu"),
    # One output unit per dataset entry. "linear" leaves the output untouched;
    # something like softmax might be a better fit and is still to be tested.
    Dense(dset_size, activation="linear"),
])
# Build the training data: X holds the fixed-length token-id sequences (inputs),
# y holds one one-hot row per dataset entry (outputs).
X = []
y = []
for line, key in enumerate(dset):
    tokens = tokenizer.texts_to_sequences([key])[0]
    # Hand-rolled padding instead of pad_sequences: append `inp_len` zeros and
    # cut to `inp_len`, i.e. short inputs are right-padded with 0 and long
    # inputs are truncated at the end.
    padded = (list(tokens) + [0] * inp_len)[:inp_len]
    X.append(np.array(padded))  # the module is imported as `np`; the bare name `numpy` is undefined here
    output_array = np.zeros(dset_size)
    output_array[line] = 1  # 0 0 0 1 0 0 0 0 0 -- only the unit belonging to this entry is active
    y.append(output_array)

# Convert the Python lists to numpy arrays before handing them to model.fit().
X = np.array(X)
y = np.array(y)
# Training configuration: Adam with its default settings, mean squared error
# as the loss, and accuracy reported alongside it.
# TODO: change the loss
model.compile(
    optimizer=Adam(),
    loss="mse",
    metrics=["accuracy"],
)

# Train the model. `epochs` is how many times the whole dataset is read and
# `batch_size` is how many messages are processed together in one step.
# Loss measures how far the output is from the correct one (lower is better);
# accuracy is how often the model picks the right answer (higher is better).
# Original author's note: add workers=4, use_multiprocessing=True to the call
# if you don't have a GPU.
model.fit(
    x=X,
    y=y,
    epochs=10,
    batch_size=8,
)

# Print the layer overview; handy for checking the parameter count.
model.summary()

# Persist the trained model to disk.
model.save("chatbot.keras")