Yhhxhfh committed on
Commit
acb6ed9
1 Parent(s): 86a837e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +239 -65
app.py CHANGED
@@ -1,71 +1,245 @@
1
  import os
2
- from huggingface_hub import login, create_repo, upload_folder
3
- from autotrain import AutoTrainAdvanced
4
- from datasets import load_dataset, Dataset
 
 
 
 
 
5
  from dotenv import load_dotenv
6
- import pandas as pd
7
- import gradio as gr
8
- from transformers import AutoModelForCausalLM, AutoTokenizer
9
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  load_dotenv()
12
-
13
- hf_token = os.getenv('HF_TOKEN')
14
- profile_name = os.getenv('HUGGINGFACE_PROFILE')
15
-
16
- login(token=hf_token)
17
-
18
- dataset1 = load_dataset("daqc/wikipedia-txt-spanish", split='train')
19
- dataset2 = load_dataset("jorgeortizfuentes/universal_spanish_chilean_corpus", split='train')
20
-
21
- df1 = pd.DataFrame(dataset1)
22
- df2 = pd.DataFrame(dataset2)
23
-
24
- combined_df = pd.concat([df1, df2], ignore_index=True)
25
- combined_dataset = Dataset.from_pandas(combined_df)
26
-
27
- task_type = "text-generation"
28
- model_name = "meta-llama/Llama-3.2-1B"
29
-
30
- config = {
31
- "task": task_type,
32
- "model": model_name,
33
- "train_data": combined_dataset,
34
- "output_dir": None,
35
- "epochs": 1,
36
- "learning_rate": 5e-5,
37
- "batch_size": 32,
38
- "fp16": True,
39
- "gradient_accumulation_steps": 4,
40
- "max_steps": 1,
41
- }
42
-
43
- model_repo_name = f"{profile_name}/llama-3-2-1b-text-generation"
44
-
45
- create_repo(repo_id=model_repo_name, exist_ok=True, token=hf_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  while True:
48
- trainer = AutoTrainAdvanced(config=config)
49
- trainer.train()
50
-
51
- upload_folder(
52
- folder_path="./output_model_llama",
53
- repo_id=model_repo_name,
54
- token=hf_token,
55
- repo_type="model"
56
- )
57
-
58
- print(f"Modelo subido correctamente a: https://huggingface.co/{model_repo_name}")
59
- print("Iteración de entrenamiento completada. Continuando con la siguiente...")
60
-
61
- tokenizer = AutoTokenizer.from_pretrained(model_repo_name)
62
- model = AutoModelForCausalLM.from_pretrained(model_repo_name)
63
-
64
- def generate_text(input_text):
65
- inputs = tokenizer.encode(input_text, return_tensors='pt')
66
- with torch.no_grad():
67
- outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
68
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
69
-
70
- iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Interacción con Llama 3.2", description="Escribe un texto y genera una respuesta.")
71
- iface.launch()
 
 
 
 
 
 
 
 
1
  import os
2
+ import json
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from google.cloud import storage
6
+ from keras_nlp.models import BERT
7
+ from keras_nlp.tokenizers import BertTokenizer
8
+ from keras_nlp.callbacks import EarlyStopping
9
+ from sklearn.model_selection import train_test_split
10
  from dotenv import load_dotenv
11
+ from tqdm import tqdm
12
+ import io
13
+ import random
14
+ import nltk
15
+ from nltk.corpus import wordnet
16
+ from nltk import pos_tag
17
+ from nltk.tokenize import word_tokenize, sent_tokenize
18
+ from nltk.corpus import brown, stopwords, reuters, genesis
19
+
20
# NLTK resources requested at import time (wordnet is what expand_intent
# queries for synonyms).
for _resource in (
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'brown',
    'stopwords',
    'reuters',
    'genesis',
):
    nltk.download(_resource)

# Read configuration from the environment (a .env file is honoured).
load_dotenv()
google_credentials = os.getenv("GOOGLE_CREDENTIALS")
bucket_name = os.getenv("BUCKET_NAME")

if not google_credentials:
    # Without this guard json.loads(None) fails with an opaque TypeError.
    raise RuntimeError(
        "GOOGLE_CREDENTIALS is not set; expected the service-account key as a JSON string."
    )

# GOOGLE_CREDENTIALS holds the service-account key as a JSON document.
storage_client = storage.Client.from_service_account_info(json.loads(google_credentials))
32
+
33
def generate_intents(num_intents):
    """Return ``num_intents`` canned utterances drawn at random.

    Every draw first picks one of four categories (question, answer, code
    snippet, dialogue line) uniformly, then picks one entry uniformly from
    that category's fixed pool, so the result contains many repeats.

    Args:
        num_intents: Number of utterances to produce. Zero or a negative
            value yields an empty list.

    Returns:
        A list of ``num_intents`` strings.
    """
    # Insertion order matters: it fixes the order categories are sampled
    # from, so a seeded RNG keeps producing the same sequence.
    pools = {
        'question': [
            "What is the capital of France?",
            "How do I create a function in Python?",
            "What is the difference between a list and a tuple?",
            "Can you explain the concept of recursion?",
            "How can I read a file in Python?",
        ],
        'answer': [
            "The capital of France is Paris.",
            "You can create a function in Python using the 'def' keyword.",
            "A list is mutable, while a tuple is immutable.",
            "Recursion is a process in which a function calls itself.",
            "You can read a file in Python using the open() function.",
        ],
        'code': [
            "def my_function(param): return param * 2",
            "print('Hello, World!')",
            "for i in range(5): print(i)",
            "if condition: do_something()",
            "import numpy as np",
        ],
        'dialogue': [
            "Hey! How are you doing today?",
            "Could you help me with a coding problem?",
            "What's your favorite programming language?",
            "I was thinking about learning data science.",
            "Do you know any good resources for learning Python?",
        ],
    }
    categories = list(pools)

    intents = []
    for _ in range(num_intents):
        # Two draws per item: the category first, then the utterance.
        category = random.choice(categories)
        intents.append(random.choice(pools[category]))
    return intents
75
+
76
def generate_biblical_texts(num_texts):
    """Return ``num_texts`` quotes sampled with replacement from a fixed pool.

    Args:
        num_texts: Number of quotes to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_texts`` strings (repeats are expected, the pool has
        only five entries).
    """
    biblical_quotes = [
        "For I know the plans I have for you, declares the Lord.",
        "The Lord is my shepherd; I shall not want.",
        "I can do all things through Christ who strengthens me.",
        "And we know that in all things God works for the good of those who love him.",
        "Trust in the Lord with all your heart and lean not on your own understanding.",
    ]
    return random.choices(biblical_quotes, k=num_texts)
85
+
86
def generate_disease_info(num_info):
    """Return ``num_info`` disease descriptions sampled with replacement.

    Args:
        num_info: Number of descriptions to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_info`` strings taken from a fixed five-entry pool.
    """
    diseases = [
        "Diabetes is a chronic disease that occurs when the body cannot effectively use insulin.",
        "Hypertension, or high blood pressure, can lead to serious health issues if left untreated.",
        "Asthma is a condition in which your airways narrow and swell and may produce extra mucus.",
        "Heart disease encompasses a range of conditions that affect your heart's structure and function.",
        "COVID-19 is caused by the coronavirus SARS-CoV-2, which can lead to severe respiratory illness.",
    ]
    return random.choices(diseases, k=num_info)
95
+
96
def generate_medication_info(num_info):
    """Return ``num_info`` medication descriptions sampled with replacement.

    Args:
        num_info: Number of descriptions to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_info`` strings taken from a fixed five-entry pool.
    """
    medications = [
        "Aspirin is commonly used to reduce pain, fever, and inflammation.",
        "Ibuprofen is a non-steroidal anti-inflammatory drug (NSAID) that helps alleviate pain.",
        "Metformin is used to manage blood sugar levels in people with type 2 diabetes.",
        "Statins are a class of drugs used to lower cholesterol levels.",
        "Amoxicillin is an antibiotic used to treat bacterial infections.",
    ]
    return random.choices(medications, k=num_info)
105
+
106
def generate_news_info(num_info):
    """Return ``num_info`` news-style sentences sampled with replacement.

    Args:
        num_info: Number of sentences to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_info`` strings taken from a fixed five-entry pool.
    """
    news_articles = [
        "Scientists have made a breakthrough in renewable energy technology.",
        "The stock market saw significant gains today amid positive economic news.",
        "Global temperatures are on the rise, raising concerns about climate change.",
        "Researchers have discovered a new species of dinosaur in South America.",
        "Innovations in artificial intelligence are transforming industries worldwide.",
    ]
    return random.choices(news_articles, k=num_info)
115
+
116
def generate_general_information(num_info):
    """Return ``num_info`` general technology facts sampled with replacement.

    Args:
        num_info: Number of facts to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_info`` strings taken from a fixed five-entry pool.
    """
    general_info = [
        "Python is an interpreted, high-level programming language.",
        "Machine learning is a field of artificial intelligence that uses statistical techniques.",
        "Cloud computing enables on-demand access to computing resources.",
        "Cybersecurity is crucial for protecting sensitive data from unauthorized access.",
        "Blockchain technology is the backbone of cryptocurrencies like Bitcoin.",
    ]
    return random.choices(general_info, k=num_info)
125
+
126
def generate_historical_texts(num_texts, start_year, end_year):
    """Return ``num_texts`` placeholder sentences for random years in a range.

    Years are sampled uniformly with replacement from ``start_year`` to
    ``end_year`` inclusive, and each is rendered as
    ``"In <year>, significant historical events took place."``.

    Args:
        num_texts: Number of sentences to return; ``0`` yields an empty list.
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).

    Returns:
        A list of ``num_texts`` strings.

    Raises:
        IndexError: If the range is empty (``start_year > end_year``) and
            ``num_texts`` is positive.
    """
    # Sample the years first and format only the ones drawn, instead of
    # building one string per year in the range just to pick a few.
    years = random.choices(range(start_year, end_year + 1), k=num_texts)
    return [f"In {year}, significant historical events took place." for year in years]
131
+
132
def generate_relate_history(num_texts):
    """Return ``num_texts`` historical facts sampled with replacement.

    Args:
        num_texts: Number of facts to return; ``0`` yields an empty list.

    Returns:
        A list of ``num_texts`` strings taken from a fixed five-entry pool.
    """
    historical_relations = [
        "In 1776, the Declaration of Independence was signed in the United States.",
        "The Industrial Revolution began in the late 18th century.",
        "The fall of the Berlin Wall in 1989 marked the end of the Cold War.",
        "World War II ended in 1945.",
        "The Apollo 11 mission landed the first humans on the moon in 1969.",
    ]
    return random.choices(historical_relations, k=num_texts)
141
+
142
def generate_sentences(num_sentences):
    """Return ``num_sentences`` statements drawn at random from a fixed pool.

    Args:
        num_sentences: Number of sentences to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of ``num_sentences`` strings (with repeats; the pool has five
        entries).
    """
    base_sentences = [
        "Artificial intelligence is transforming the world.",
        "Deep learning is a subset of machine learning.",
        "Python is a popular programming language.",
        "Data science involves statistics and programming.",
        "Natural language processing enables machines to understand human language.",
    ]
    # One random.choice per item, as before, so seeded runs are unchanged.
    return [random.choice(base_sentences) for _ in range(num_sentences)]
155
+
156
def generate_questions(num_questions):
    """Return ``num_questions`` questions drawn at random from a fixed pool.

    Args:
        num_questions: Number of questions to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of ``num_questions`` strings (with repeats; the pool has five
        entries).
    """
    base_questions = [
        "How does machine learning work?",
        "What are the benefits of using TensorFlow?",
        "Why is Python widely used in data science?",
        "What is the importance of data preprocessing?",
        "How can I improve my programming skills?",
    ]
    # One random.choice per item, as before, so seeded runs are unchanged.
    return [random.choice(base_questions) for _ in range(num_questions)]
169
+
170
def expand_intent(intent, synonym_lookup=None):
    """Produce variants of ``intent`` by swapping single words for synonyms.

    For each whitespace-separated word, up to two synonyms are looked up and
    one variant is produced per synonym. Only whole tokens are replaced:
    substituting "a" no longer rewrites the "a" inside "capital", which the
    earlier ``str.replace`` based version did.

    Args:
        intent: The sentence to expand.
        synonym_lookup: Optional callable mapping a word to a list of
            candidate replacement strings. Defaults to the first lemma name
            of every WordNet synset found for the word.

    Returns:
        A list of variant sentences in word order, then synonym order. It is
        empty when no word has a synonym and may contain duplicates.
    """
    import re  # local import: `re` is not among the module-level imports

    if synonym_lookup is None:
        synonym_lookup = _first_lemma_names

    expanded_intents = []
    for word in intent.split():
        candidates = synonym_lookup(word)
        if not candidates:
            continue
        # Match the word only when it is delimited by whitespace or the
        # string boundaries, i.e. when it is a complete token.
        token_pattern = re.compile(r'(?<!\S)' + re.escape(word) + r'(?!\S)')
        for synonym in candidates[:2]:
            # A callable replacement keeps backslashes in the synonym literal.
            new_intent = token_pattern.sub(lambda _match, repl=synonym: repl, intent)
            expanded_intents.append(new_intent)
    return expanded_intents


def _first_lemma_names(word):
    """Return the first lemma name of each WordNet synset found for ``word``."""
    return [syn.lemmas()[0].name() for syn in wordnet.synsets(word) if syn.lemmas()]
181
+
182
# Size of the base pool; also the cap applied after de-duplication below.
num_intents = 100000
intents = generate_intents(num_intents)

# Smaller themed corpora mixed into the base pool.
sentences = generate_sentences(2000)
questions = generate_questions(2000)
biblical_texts = generate_biblical_texts(2000)
disease_info = generate_disease_info(2000)
medication_info = generate_medication_info(2000)
news_info = generate_news_info(2000)
general_info = generate_general_information(2000)
historical_texts_1900_2024 = generate_historical_texts(100, 1900, 2024)
historical_texts_1000_2024 = generate_historical_texts(200, 1000, 2024)
historical_relations = generate_relate_history(200)

for _corpus in (
    sentences,
    questions,
    biblical_texts,
    disease_info,
    medication_info,
    news_info,
    general_info,
    historical_texts_1900_2024,
    historical_texts_1000_2024,
    historical_relations,
):
    intents.extend(_corpus)

# Only the synonym-expanded variants are kept; the un-expanded sentences are
# not carried over unless an expansion happens to reproduce them.
expanded_intents = []
for intent in intents:
    expanded_intents.extend(expand_intent(intent))

# dict.fromkeys drops duplicates while keeping first-seen order. The former
# list(set(...)) produced an order that changes between interpreter runs
# (string hashing is randomised), which made the truncated subset arbitrary.
intents = list(dict.fromkeys(expanded_intents))[:num_intents]

# NOTE(review): every sample receives the same label, so a classifier trained
# on these labels has nothing to discriminate - confirm this is intended.
labels = [1] * len(intents)
213
 
214
  # Training / upload loop: split the data, tokenize, fit a model, print the
  # final training accuracy, then upload intents.json and model.h5 to the
  # bucket when those blobs do not exist yet.
  # NOTE(review): indentation was lost in this diff rendering, so which of the
  # statements below sit inside the `while True:` body cannot be determined
  # here - confirm against the real app.py. No `break` is visible.
  while True:
215
# Fixed random_state: the 80/20 split is the same whenever the inputs are.
+ X_train, X_val, y_train, y_val = train_test_split(intents, labels, test_size=0.2, random_state=42)
216
# NOTE(review): `from_pretrained` and the 'bert-base-uncased' id look like the
# Hugging Face Transformers API rather than KerasNLP - verify that the
# imported BertTokenizer / BERT actually provide this.
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
217
+ train_encodings = tokenizer(X_train, truncation=True, padding=True)
218
+ val_encodings = tokenizer(X_val, truncation=True, padding=True)
219
+
220
+ model = BERT.from_pretrained('bert-base-uncased')
221
+ model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
222
+
223
# NOTE(review): EarlyStopping is imported from keras_nlp.callbacks at the top
# of the file; it is normally found under keras.callbacks - TODO confirm.
+ early_stopping = EarlyStopping(monitor='val_loss', patience=3)
224
+ history = model.fit(train_encodings, y_train, validation_data=(val_encodings, y_val), epochs=10, callbacks=[early_stopping])
225
+
226
# Training accuracy of the last epoch that ran (not validation accuracy).
+ accuracy = history.history['accuracy'][-1]
227
+ print(f"Accuracy: {accuracy}")
228
+
229
+ intents_json = json.dumps(intents, ensure_ascii=False)
230
+ intents_file_path = 'intents.json'
231
+ model_file_path = 'model.h5'
232
+
233
+ bucket = storage_client.bucket(bucket_name)
234
+
235
+ intents_blob = bucket.blob(intents_file_path)
236
+ model_blob = bucket.blob(model_file_path)
237
+
238
# Uploads are skipped when the blob already exists, so an existing object in
# the bucket is never overwritten by this code.
+ if not intents_blob.exists():
239
+ intents_blob.upload_from_string(intents_json, content_type='application/json')
240
+ print(f"Intents uploaded to {intents_file_path} in bucket {bucket_name}.")
241
+
242
+ if not model_blob.exists():
243
# The model is written to the local working directory before being uploaded.
+ model.save(model_file_path)
244
+ model_blob.upload_from_filename(model_file_path)
245
+ print(f"Model uploaded to {model_file_path} in bucket {bucket_name}.")