Update app.py
Browse files
app.py
CHANGED
@@ -1,71 +1,245 @@
|
|
1 |
import os
|
2 |
-
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
5 |
from dotenv import load_dotenv
|
6 |
-
|
7 |
-
import
|
8 |
-
|
9 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
load_dotenv()
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
while True:
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
)
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import json
|
3 |
+
import numpy as np
|
4 |
+
import tensorflow as tf
|
5 |
+
from google.cloud import storage
|
6 |
+
from keras_nlp.models import BERT
|
7 |
+
from keras_nlp.tokenizers import BertTokenizer
|
8 |
+
from keras_nlp.callbacks import EarlyStopping
|
9 |
+
from sklearn.model_selection import train_test_split
|
10 |
from dotenv import load_dotenv
|
11 |
+
from tqdm import tqdm
|
12 |
+
import io
|
13 |
+
import random
|
14 |
+
import nltk
|
15 |
+
from nltk.corpus import wordnet
|
16 |
+
from nltk import pos_tag
|
17 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
18 |
+
from nltk.corpus import brown, stopwords, reuters, genesis
|
19 |
+
|
20 |
+
# Download the NLTK data packages named below, one nltk.download call per
# name, in this order.
for _nltk_package in (
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'brown',
    'stopwords',
    'reuters',
    'genesis',
):
    nltk.download(_nltk_package)
|
27 |
|
28 |
# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# GOOGLE_CREDENTIALS is parsed with json.loads below, so it must contain the
# service-account key as a JSON document; BUCKET_NAME names the target bucket.
google_credentials = os.getenv("GOOGLE_CREDENTIALS")
bucket_name = os.getenv("BUCKET_NAME")

# Fail early with an actionable message. Without these checks a missing
# GOOGLE_CREDENTIALS surfaces as an opaque TypeError from json.loads(None),
# and a missing BUCKET_NAME is only noticed after training has finished.
if not google_credentials:
    raise RuntimeError("GOOGLE_CREDENTIALS environment variable is not set.")
if not bucket_name:
    raise RuntimeError("BUCKET_NAME environment variable is not set.")

storage_client = storage.Client.from_service_account_info(json.loads(google_credentials))
|
32 |
+
|
33 |
+
def generate_intents(num_intents):
    """Return ``num_intents`` strings drawn at random from four fixed pools.

    Every item is produced by two ``random.choice`` calls: the first picks a
    category (question, answer, code or dialogue), the second picks one
    string from that category's pool.
    """
    pools = {
        'question': [
            "What is the capital of France?",
            "How do I create a function in Python?",
            "What is the difference between a list and a tuple?",
            "Can you explain the concept of recursion?",
            "How can I read a file in Python?",
        ],
        'answer': [
            "The capital of France is Paris.",
            "You can create a function in Python using the 'def' keyword.",
            "A list is mutable, while a tuple is immutable.",
            "Recursion is a process in which a function calls itself.",
            "You can read a file in Python using the open() function.",
        ],
        'code': [
            "def my_function(param): return param * 2",
            "print('Hello, World!')",
            "for i in range(5): print(i)",
            "if condition: do_something()",
            "import numpy as np",
        ],
        'dialogue': [
            "Hey! How are you doing today?",
            "Could you help me with a coding problem?",
            "What's your favorite programming language?",
            "I was thinking about learning data science.",
            "Do you know any good resources for learning Python?",
        ],
    }
    categories = ['question', 'answer', 'code', 'dialogue']

    # The inner choice (category) is evaluated before the outer one (string),
    # so the random stream is consumed in category-then-string order.
    return [
        random.choice(pools[random.choice(categories)])
        for _ in range(num_intents)
    ]
|
75 |
+
|
76 |
+
def generate_biblical_texts(num_texts):
    """Return ``num_texts`` quotes sampled with replacement from a fixed pool."""
    pool = [
        "For I know the plans I have for you, declares the Lord.",
        "The Lord is my shepherd; I shall not want.",
        "I can do all things through Christ who strengthens me.",
        "And we know that in all things God works for the good of those who love him.",
        "Trust in the Lord with all your heart and lean not on your own understanding.",
    ]
    sampled = random.choices(pool, k=num_texts)
    return sampled
|
85 |
+
|
86 |
+
def generate_disease_info(num_info):
    """Return ``num_info`` disease descriptions sampled with replacement."""
    pool = [
        "Diabetes is a chronic disease that occurs when the body cannot effectively use insulin.",
        "Hypertension, or high blood pressure, can lead to serious health issues if left untreated.",
        "Asthma is a condition in which your airways narrow and swell and may produce extra mucus.",
        "Heart disease encompasses a range of conditions that affect your heart's structure and function.",
        "COVID-19 is caused by the coronavirus SARS-CoV-2, which can lead to severe respiratory illness.",
    ]
    sampled = random.choices(pool, k=num_info)
    return sampled
|
95 |
+
|
96 |
+
def generate_medication_info(num_info):
    """Return ``num_info`` medication descriptions sampled with replacement."""
    pool = [
        "Aspirin is commonly used to reduce pain, fever, and inflammation.",
        "Ibuprofen is a non-steroidal anti-inflammatory drug (NSAID) that helps alleviate pain.",
        "Metformin is used to manage blood sugar levels in people with type 2 diabetes.",
        "Statins are a class of drugs used to lower cholesterol levels.",
        "Amoxicillin is an antibiotic used to treat bacterial infections.",
    ]
    sampled = random.choices(pool, k=num_info)
    return sampled
|
105 |
+
|
106 |
+
def generate_news_info(num_info):
    """Return ``num_info`` news headlines sampled with replacement."""
    pool = [
        "Scientists have made a breakthrough in renewable energy technology.",
        "The stock market saw significant gains today amid positive economic news.",
        "Global temperatures are on the rise, raising concerns about climate change.",
        "Researchers have discovered a new species of dinosaur in South America.",
        "Innovations in artificial intelligence are transforming industries worldwide.",
    ]
    sampled = random.choices(pool, k=num_info)
    return sampled
|
115 |
+
|
116 |
+
def generate_general_information(num_info):
    """Return ``num_info`` general-knowledge statements sampled with replacement."""
    pool = [
        "Python is an interpreted, high-level programming language.",
        "Machine learning is a field of artificial intelligence that uses statistical techniques.",
        "Cloud computing enables on-demand access to computing resources.",
        "Cybersecurity is crucial for protecting sensitive data from unauthorized access.",
        "Blockchain technology is the backbone of cryptocurrencies like Bitcoin.",
    ]
    sampled = random.choices(pool, k=num_info)
    return sampled
|
125 |
+
|
126 |
+
def generate_historical_texts(num_texts, start_year, end_year):
    """Return ``num_texts`` placeholder sentences for randomly chosen years.

    Years are sampled with replacement from ``start_year`` to ``end_year``
    inclusive; each sampled year is formatted into the same sentence template.
    """
    # Sample the years first and format afterwards: the draws are the same as
    # sampling from a pre-built list of sentences, position for position.
    candidate_years = range(start_year, end_year + 1)
    drawn_years = random.choices(candidate_years, k=num_texts)
    return [
        f"In {year}, significant historical events took place."
        for year in drawn_years
    ]
|
131 |
+
|
132 |
+
def generate_relate_history(num_texts):
    """Return ``num_texts`` historical facts sampled with replacement."""
    pool = [
        "In 1776, the Declaration of Independence was signed in the United States.",
        "The Industrial Revolution began in the late 18th century.",
        "The fall of the Berlin Wall in 1989 marked the end of the Cold War.",
        "World War II ended in 1945.",
        "The Apollo 11 mission landed the first humans on the moon in 1969.",
    ]
    sampled = random.choices(pool, k=num_texts)
    return sampled
|
141 |
+
|
142 |
+
def generate_sentences(num_sentences):
    """Return ``num_sentences`` statements, each picked with ``random.choice``."""
    pool = [
        "Artificial intelligence is transforming the world.",
        "Deep learning is a subset of machine learning.",
        "Python is a popular programming language.",
        "Data science involves statistics and programming.",
        "Natural language processing enables machines to understand human language.",
    ]
    # One random.choice call per requested sentence.
    return [random.choice(pool) for _ in range(num_sentences)]
|
155 |
+
|
156 |
+
def generate_questions(num_questions):
    """Return ``num_questions`` questions, each picked with ``random.choice``."""
    pool = [
        "How does machine learning work?",
        "What are the benefits of using TensorFlow?",
        "Why is Python widely used in data science?",
        "What is the importance of data preprocessing?",
        "How can I improve my programming skills?",
    ]
    # One random.choice call per requested question.
    return [random.choice(pool) for _ in range(num_questions)]
|
169 |
+
|
170 |
+
def expand_intent(intent):
    """Return variants of ``intent`` with one word swapped for a WordNet lemma.

    For each whitespace-separated word that ``wordnet.synsets`` returns
    results for, up to two variants are produced. Each variant uses the first
    lemma name of one synset, taking synsets in the order WordNet returns them.

    Args:
        intent: The sentence to expand.

    Returns:
        A list of strings in word order. It may contain duplicates, and may
        contain ``intent`` itself when a lemma name equals the word. It is
        empty when no word has a synset (including for an empty string).
    """
    import re  # local import: the file does not import re at module level

    expanded_intents = []
    for word in intent.split():
        # Only the first two usable synsets are ever needed, so stop there
        # instead of resolving the lemmas of every synset.
        synonym_words = []
        for synset in wordnet.synsets(word):
            lemmas = synset.lemmas()
            if lemmas:
                synonym_words.append(lemmas[0].name())
                if len(synonym_words) == 2:
                    break
        if not synonym_words:
            continue

        # Replace the word only where it stands as a whole whitespace-delimited
        # token. A plain str.replace also rewrites it inside longer words
        # (e.g. "a" inside "mutable"), corrupting the sentence.
        whole_token = re.compile(r'(?<!\S)' + re.escape(word) + r'(?!\S)')
        for synonym in synonym_words:
            # A callable replacement keeps the synonym literal (no backslash
            # or group-reference interpretation).
            expanded_intents.append(
                whole_token.sub(lambda _match, text=synonym: text, intent)
            )
    return expanded_intents
|
181 |
+
|
182 |
+
# Build the raw corpus: num_intents randomly drawn intents plus smaller samples
# from each of the other generators.
num_intents = 100000
intents = generate_intents(num_intents)
sentences = generate_sentences(2000)
questions = generate_questions(2000)
biblical_texts = generate_biblical_texts(2000)
disease_info = generate_disease_info(2000)
medication_info = generate_medication_info(2000)
news_info = generate_news_info(2000)
general_info = generate_general_information(2000)
historical_texts_1900_2024 = generate_historical_texts(100, 1900, 2024)
historical_texts_1000_2024 = generate_historical_texts(200, 1000, 2024)
historical_relations = generate_relate_history(200)

intents.extend(sentences)
intents.extend(questions)
intents.extend(biblical_texts)
intents.extend(disease_info)
intents.extend(medication_info)
intents.extend(news_info)
intents.extend(general_info)
intents.extend(historical_texts_1900_2024)
intents.extend(historical_texts_1000_2024)
intents.extend(historical_relations)

# Expand each distinct string once. The generators sample with replacement
# from small pools, so the raw corpus is mostly repeats; expanding a repeat
# only regenerates variants that the de-duplication below throws away.
expanded_intents = []
for intent in dict.fromkeys(intents):
    expanded_intents.extend(expand_intent(intent))

# De-duplicate in first-seen order before truncating. Going through a set
# would make both the order and the surviving 100000 items depend on string
# hashing, which is not stable from one interpreter run to the next.
intents = list(dict.fromkeys(expanded_intents))[:100000]

# NOTE(review): every sample receives the same label (1), so the training
# data contains a single class -- confirm this is intended.
labels = [1] * len(intents)
|
213 |
|
214 |
# Train / report / upload cycle.
# NOTE(review): indentation was lost in this view; everything below is assumed
# to belong to the loop body -- confirm against the real file.
# NOTE(review): no break or exit condition is visible, so each pass repeats
# the split, tokenizer/model load, training and upload checks -- confirm the
# loop is meant to run indefinitely.
while True:
    # Hold out 20% of the samples for validation; random_state pins the seed.
    X_train, X_val, y_train, y_val = train_test_split(intents, labels, test_size=0.2, random_state=42)
    # NOTE(review): `from_pretrained` and these call arguments look like the
    # Hugging Face transformers API; confirm that the keras_nlp classes
    # imported by this file accept them.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    val_encodings = tokenizer(X_val, truncation=True, padding=True)

    # NOTE(review): confirm `BERT` exists in keras_nlp.models and that the
    # loaded model has an output suitable for sparse categorical cross-entropy.
    model = BERT.from_pretrained('bert-base-uncased')
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Stop training when val_loss has not improved for 3 epochs (patience=3).
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    history = model.fit(train_encodings, y_train, validation_data=(val_encodings, y_val), epochs=10, callbacks=[early_stopping])

    # Report the training accuracy of the last epoch that ran.
    accuracy = history.history['accuracy'][-1]
    print(f"Accuracy: {accuracy}")

    # Serialize the corpus and fix the object names used in the bucket.
    intents_json = json.dumps(intents, ensure_ascii=False)
    intents_file_path = 'intents.json'
    model_file_path = 'model.h5'

    bucket = storage_client.bucket(bucket_name)

    intents_blob = bucket.blob(intents_file_path)
    model_blob = bucket.blob(model_file_path)

    # Upload only when the object is not already in the bucket; an existing
    # intents.json is left as it is.
    if not intents_blob.exists():
        intents_blob.upload_from_string(intents_json, content_type='application/json')
        print(f"Intents uploaded to {intents_file_path} in bucket {bucket_name}.")

    # Same rule for the model: it is saved locally and uploaded only when no
    # model.h5 object exists yet, so later passes do not replace it.
    if not model_blob.exists():
        model.save(model_file_path)
        model_blob.upload_from_filename(model_file_path)
        print(f"Model uploaded to {model_file_path} in bucket {bucket_name}.")
|