Yhhxhfh committed on
Commit 32d6875
1 parent: 8d1c4bc

Update app.py

Files changed (1)
  1. app.py +78 -102
app.py CHANGED
@@ -11,9 +11,7 @@ import redis
  import uvicorn
  import nltk
  from nltk.stem import WordNetLemmatizer
- from nltk.corpus import wordnet
  from tqdm import tqdm
- from tqdm.keras import TqdmCallback
  from tensorflow.keras import Sequential
  from tensorflow.keras.layers import Dense, Dropout, Input
  from tensorflow.keras.optimizers import SGD
@@ -22,105 +20,92 @@ from fastapi import FastAPI
  from fastapi.responses import HTMLResponse
  from pydantic import BaseModel
  from dotenv import load_dotenv
+ from datetime import datetime
+ from kareas_nlp import TextProcessor

- # Load the environment variables
  load_dotenv()

  app = FastAPI()

- # Initialize the lemmatizer and Redis
  lemmatizer = WordNetLemmatizer()
  redis_password = os.getenv("REDIS_PASSWORD")
  r = redis.Redis(host=os.getenv("REDIS_HOST"), port=int(os.getenv("REDIS_PORT")), password=redis_password)

- # Load data into Redis
- def load_data_to_redis():
-     files_to_load = {
-         'intents.json': 'intents',
-         'classes.pkl': 'classes',
-         'words.pkl': 'words',
-         'chatbot_model.h5': 'chatbot_model'
+ def create_intents_json():
+     intents = {
+         "intents": [
+             {
+                 "tag": "greeting",
+                 "patterns": ["Hola", "¿Cómo estás?", "Buenos días"],
+                 "responses": ["¡Hola!", "¿Cómo puedo ayudarte?"],
+                 "date": "2021-01-01"
+             },
+             {
+                 "tag": "goodbye",
+                 "patterns": ["Adiós", "Hasta luego", "Nos vemos"],
+                 "responses": ["¡Hasta luego!", "Cuídate!"],
+                 "date": "2021-01-01"
+             }
+         ]
      }

-     for file_name, redis_key in files_to_load.items():
-         if os.path.exists(file_name) and not r.exists(redis_key):
-             print(f"Loading {file_name} into Redis...")
-             if file_name.endswith('.json'):
-                 with open(file_name) as f:
-                     data = json.load(f)
-                 r.set(redis_key, json.dumps(data))
-             elif file_name.endswith('.h5'):
-                 with open(file_name, 'rb') as f:
-                     r.set(redis_key, f.read())
-             else:
-                 with open(file_name, 'rb') as f:
-                     r.set(redis_key, pickle.dumps(pickle.load(f)))
-
- # Make sure the folders exist
+     with open('intents.json', 'w') as f:
+         json.dump(intents, f, ensure_ascii=False, indent=4)
+
+ def load_and_filter_data():
+     with open("intents.json") as file:
+         intents = json.load(file)
+
+     filtered_intents = {
+         "intents": []
+     }
+
+     for intent in intents['intents']:
+         if "date" in intent:
+             intent_date = datetime.strptime(intent["date"], "%Y-%m-%d")
+             if intent_date.year >= 2000 and intent_date <= datetime.now():
+                 filtered_intents['intents'].append(intent)
+
+     return filtered_intents
+
  if not os.path.exists('models'):
      os.makedirs('models')

- def initialize_redis():
-     global r
-     try:
-         r.ping()
-         print("Redis connection successful.")
-         load_data_to_redis()
-     except redis.exceptions.ConnectionError:
-         print("Error connecting to Redis. Exiting.")
-         exit(1)
-
  async def train_and_save_model():
      global lemmatizer, r
      while True:
          words, classes, documents = [], [], []
          ignore_words = ['?', '!']

-         intents = json.loads(r.get('intents'))
-
-         print("Loading user questions from Redis...")
-         if not r.exists('user_questions_loaded'):
-             user_questions = r.lrange('user_questions', 0, -1)
-             for question in user_questions:
-                 question = question.decode('utf-8')
-                 try:
-                     existing_tag = r.get(f"tag:{question}").decode('utf-8')
-                     documents.append((nltk.word_tokenize(question), existing_tag))
-                     if existing_tag not in classes:
-                         classes.append(existing_tag)
-                 except AttributeError:
-                     documents.append((nltk.word_tokenize(question), "unknown"))
-                     if "unknown" not in classes:
-                         classes.append("unknown")
-             r.set('user_questions_loaded', 1)
-
-         print("Processing intents from Redis...")
+         intents = load_and_filter_data()
+
+         user_questions = r.lrange('user_questions', 0, -1)
+
+         for question in user_questions:
+             question = question.decode('utf-8')
+             processed_words = TextProcessor().process(question)
+             documents.append((processed_words, "user_question"))
+             words.extend(processed_words)
+
          for intent in intents['intents']:
              for pattern in intent['patterns']:
-                 w = nltk.word_tokenize(pattern)
-                 words.extend(w)
-                 documents.append((w, intent['tag']))
+                 processed_words = TextProcessor().process(pattern)
+                 documents.append((processed_words, intent['tag']))
+                 words.extend(processed_words)
                  if intent['tag'] not in classes:
                      classes.append(intent['tag'])

-             print(f"Generating synonyms for intent '{intent['tag']}'...")
-             with multiprocessing.Pool() as pool:
-                 results = []
-                 for _ in tqdm(range(100000), desc="Generating synonyms", leave=False):
-                     if not intent['patterns']:
-                         break
-                     results.append(pool.apply_async(generate_synonym_pattern, (intent['patterns'],)))
-
-                 for result in results:
-                     new_pattern = result.get()
-                     if new_pattern:
-                         intent['patterns'].append(new_pattern)
+         for intent in intents['intents']:
+             for pattern in intent['patterns']:
+                 synonyms = generate_synonyms(pattern)
+                 for synonym in synonyms:
+                     processed_words = TextProcessor().process(synonym)
+                     documents.append((processed_words, intent['tag']))
+                     words.extend(processed_words)

-         words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
          words = sorted(set(words))
          classes = sorted(set(classes))

-         print("Creating the training data...")
          training = []
          output_empty = [0] * len(classes)
          for doc in documents:
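
Note: TextProcessor comes from the newly imported kareas_nlp package, whose API is not shown in this diff. For comparison, a minimal sketch of the tokenize-and-lemmatize step the removed NLTK lines performed (the helper name process_text is hypothetical, and the NLTK punkt and wordnet data must be downloaded first):

import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def process_text(sentence, ignore_words=('?', '!')):
    # Tokenize, lowercase and lemmatize, skipping punctuation tokens,
    # mirroring the removed nltk.word_tokenize / lemmatizer.lemmatize lines.
    tokens = nltk.word_tokenize(sentence)
    return [lemmatizer.lemmatize(t.lower()) for t in tokens if t not in ignore_words]
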
@@ -134,20 +119,18 @@ async def train_and_save_model():
              training.append([bag, output_row])

          if not training:
-             print("No training data yet. Waiting...")
              await asyncio.sleep(60)
              continue

          train_x = np.array([row[0] for row in training])
          train_y = np.array([row[1] for row in training])

-         print("Loading or creating the model...")
          if r.exists('chatbot_model'):
              with tempfile.NamedTemporaryFile(delete=False, suffix='.h5') as temp_file:
                  temp_file.write(r.get('chatbot_model'))
                  temp_file_name = temp_file.name
              model = load_model(temp_file_name)
-             os.remove(temp_file_name)
+             os.remove(temp_file.name)
          else:
              input_layer = Input(shape=(len(train_x[0]),))
              layer1 = Dense(128, activation='relu')(input_layer)
@@ -160,10 +143,8 @@ async def train_and_save_model():
              sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
              model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

-         print("Training the model...")
-         model.fit(train_x, train_y, epochs=1, batch_size=len(train_x), verbose=0, callbacks=[TqdmCallback(verbose=2)])
+         model.fit(train_x, train_y, epochs=1, batch_size=len(train_x), verbose=0)

-         print("Saving data to Redis...")
          r.set('words', pickle.dumps(words))
          r.set('classes', pickle.dumps(classes))

@@ -173,23 +154,20 @@ async def train_and_save_model():
              r.set('chatbot_model', f.read())
          os.remove(temp_file.name)

-         print("Data and model saved. Restarting training...")
-
- def generate_synonym_pattern(patterns):
-     new_pattern = []
-     for word in random.choice(patterns).split():
-         synonyms = wordnet.synsets(word)
-         if synonyms:
-             synonym = random.choice(synonyms[0].lemmas()).name()
-             new_pattern.append(synonym)
-         else:
-             new_pattern.append(word)
-     return " ".join(new_pattern)
-
- def start_training_loop():
-     loop = asyncio.new_event_loop()
-     asyncio.set_event_loop(loop)
-     loop.run_until_complete(train_and_save_model())
+ def generate_synonyms(pattern):
+     synonyms = []
+     words = nltk.word_tokenize(pattern)
+     for word in words:
+         synsets = nltk.corpus.wordnet.synsets(word)
+         if synsets:
+             for syn in synsets:
+                 for lemma in syn.lemmas():
+                     synonyms.append(lemma.name())
+     return list(set(synonyms))
+
+ async def handle_new_message(message: str):
+     r.rpush('user_questions', message)
+     await train_and_save_model()

  class ChatMessage(BaseModel):
      message: str
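
The new generate_synonyms helper relies on NLTK's WordNet corpus, which has to be downloaded once, and multi-word lemmas come back with underscores. A small usage sketch (the output shown is illustrative only):

import nltk
nltk.download('wordnet')  # one-time corpus download
from nltk.corpus import wordnet

def synonyms_for(word):
    # Collect every lemma name across the word's synsets,
    # replacing the underscores WordNet uses in multi-word lemmas.
    names = {lemma.name().replace('_', ' ')
             for syn in wordnet.synsets(word)
             for lemma in syn.lemmas()}
    return sorted(names)

print(synonyms_for("hello"))  # e.g. ['hello', 'hi', 'how-do-you-do', 'howdy', 'hullo']
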
@@ -205,9 +183,7 @@ async def chat(message: ChatMessage):
      model = load_model(temp_file_name)
      os.remove(temp_file.name)

-     sentence_words = nltk.word_tokenize(message.message)
-     sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
-
+     sentence_words = TextProcessor().process(message.message)
      bag = [0] * len(words)
      for s in sentence_words:
          for i, w in enumerate(words):
@@ -222,9 +198,7 @@ async def chat(message: ChatMessage):
      for i, p in results:
          return_list.append({"intent": classes[i], "probability": str(p)})

-     r.rpush('user_questions', message.message)
-
-     asyncio.create_task(train_and_save_model())
+     await handle_new_message(message.message)

      return return_list

@@ -326,7 +300,9 @@ async def root():
      return html_code

  if __name__ == "__main__":
+     print("Starting the application...")
+     create_intents_json()
      initialize_redis()
-     training_process = multiprocessing.Process(target=start_training_loop)
+     training_process = multiprocessing.Process(target=train_and_save_model)
      training_process.start()
      uvicorn.run(app, host="0.0.0.0", port=7860)
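
Because train_and_save_model is a coroutine function, handing it straight to multiprocessing.Process(target=...) would only create a coroutine object in the child process without running it. A minimal sketch of a wrapper that drives the loop, along the lines of the removed start_training_loop (the name run_training_loop is hypothetical):

import asyncio
import multiprocessing

def run_training_loop():
    # Run the async training loop to completion inside the child process.
    asyncio.run(train_and_save_model())

if __name__ == "__main__":
    training_process = multiprocessing.Process(target=run_training_loop)
    training_process.start()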
 