ierhon committed on
Commit 4ab880f · 1 Parent(s): b3998a0

Fix vocab_size and a NameError (numpy instead of np)

Files changed (1): train.py (+2 -2)
train.py CHANGED
@@ -15,7 +15,7 @@ dset_size = len(dset)
 tokenizer = Tokenizer() # a tokenizer is a thing to split text into words, it might have some other stuff like making all the letters lowercase, etc.
 tokenizer.fit_on_texts(list(dset.keys()))
 
-vocab_size = len(tokenizer.get_vocabulary())
+vocab_size = len(tokenizer.word_index)
 
 model = Sequential()
 model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
@@ -33,7 +33,7 @@ y = [] # and output y
 
 for line, key in enumerate(dset):
     tokens = tokenizer.texts_to_sequences([key,])[0]
-    X.append(numpy.array((list(tokens)+[0,]*inp_len)[:inp_len])) # refusing to use pad_sequences for an unspecified reason and creating the worst line of code
+    X.append(np.array((list(tokens)+[0,]*inp_len)[:inp_len])) # refusing to use pad_sequences for an unspecified reason and creating the worst line of code
     output_array = np.zeros(dset_size)
     output_array[line] = 1 # 0 0 0 1 0 0 0 0 0, the neuron of each line activates in the correct response
     y.append(output_array)
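For context, a minimal sketch of why the first change works, assuming the Keras Tokenizer from tensorflow.keras.preprocessing.text (the sample texts below are made up for illustration). get_vocabulary() is a method of the TextVectorization layer, not of Tokenizer, so the old line failed; word_index is the mapping that Tokenizer actually exposes after fit_on_texts.

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(["hello world", "hello there"])

# word_index maps each word to an integer id starting at 1
# (0 is reserved for padding), e.g. {"hello": 1, "world": 2, "there": 3}
vocab_size = len(tokenizer.word_index)
print(vocab_size)  # 3

One caveat worth noting: because the ids start at 1, an Embedding layer built with input_dim=vocab_size leaves no slot for the largest id; len(tokenizer.word_index) + 1 is the usual choice.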
 
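Similarly, a small sketch of the second fix, assuming the script imports NumPy as "import numpy as np", so the bare name numpy is undefined and raises the NameError the commit message describes. The padding expression itself is unchanged; it hand-rolls what keras.preprocessing.sequence.pad_sequences does with padding="post" and truncating="post".

import numpy as np

inp_len = 5          # assumed value, for illustration only
tokens = [4, 7, 2]   # token ids as produced by texts_to_sequences

# right-pad with zeros to inp_len, then cut to exactly inp_len entries
x = np.array((list(tokens) + [0] * inp_len)[:inp_len])
print(x)  # [4 7 2 0 0]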