Spaces:

Yhhxhfh
/

Ghgg

Build error

App Files Files Community

Yhhxhfh committed 1 day ago

Commit

acb6ed9

•

1 Parent(s): 86a837e

Update app.py

Browse files

Files changed (1) hide show

app.py +239 -65

app.py CHANGED Viewed

@@ -1,71 +1,245 @@
 import os
-from huggingface_hub import login, create_repo, upload_folder
-from autotrain import AutoTrainAdvanced
-from datasets import load_dataset, Dataset
 from dotenv import load_dotenv
-import pandas as pd
-import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
 load_dotenv()
-hf_token = os.getenv('HF_TOKEN')
-profile_name = os.getenv('HUGGINGFACE_PROFILE')
-login(token=hf_token)
-dataset1 = load_dataset("daqc/wikipedia-txt-spanish", split='train')
-dataset2 = load_dataset("jorgeortizfuentes/universal_spanish_chilean_corpus", split='train')
-df1 = pd.DataFrame(dataset1)
-df2 = pd.DataFrame(dataset2)
-combined_df = pd.concat([df1, df2], ignore_index=True)
-combined_dataset = Dataset.from_pandas(combined_df)
-task_type = "text-generation"
-model_name = "meta-llama/Llama-3.2-1B"
-config = {
-    "task": task_type,
-    "model": model_name,
-    "train_data": combined_dataset,
-    "output_dir": None,
-    "epochs": 1,
-    "learning_rate": 5e-5,
-    "batch_size": 32,
-    "fp16": True,
-    "gradient_accumulation_steps": 4,
-    "max_steps": 1,
-}
-model_repo_name = f"{profile_name}/llama-3-2-1b-text-generation"
-create_repo(repo_id=model_repo_name, exist_ok=True, token=hf_token)
 while True:
-    trainer = AutoTrainAdvanced(config=config)
-    trainer.train()
-    upload_folder(
-        folder_path="./output_model_llama",
-        repo_id=model_repo_name,
-        token=hf_token,
-        repo_type="model"
-    )
-    print(f"Modelo subido correctamente a: https://huggingface.co/{model_repo_name}")
-    print("Iteración de entrenamiento completada. Continuando con la siguiente...")
-tokenizer = AutoTokenizer.from_pretrained(model_repo_name)
-model = AutoModelForCausalLM.from_pretrained(model_repo_name)
-def generate_text(input_text):
-    inputs = tokenizer.encode(input_text, return_tensors='pt')
-    with torch.no_grad():
-        outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Interacción con Llama 3.2", description="Escribe un texto y genera una respuesta.")
-iface.launch()

 import os
+import json
+import numpy as np
+import tensorflow as tf
+from google.cloud import storage
+from keras_nlp.models import BERT
+from keras_nlp.tokenizers import BertTokenizer
+from keras_nlp.callbacks import EarlyStopping
+from sklearn.model_selection import train_test_split
 from dotenv import load_dotenv
+from tqdm import tqdm
+import io
+import random
+import nltk
+from nltk.corpus import wordnet
+from nltk import pos_tag
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import brown, stopwords, reuters, genesis
+# Issue one nltk.download call per resource name, in a fixed order.
+for _nltk_resource in (
+    'punkt',
+    'wordnet',
+    'averaged_perceptron_tagger',
+    'brown',
+    'stopwords',
+    'reuters',
+    'genesis',
+):
+    nltk.download(_nltk_resource)
 load_dotenv()
+# The service-account credentials (a JSON document) and the target bucket name
+# are both read from environment variables.
+google_credentials = os.getenv("GOOGLE_CREDENTIALS")
+bucket_name = os.getenv("BUCKET_NAME")
+# os.getenv returns None for an unset variable and json.loads(None) raises an
+# opaque TypeError, so fail early with a message naming the missing variable.
+if not google_credentials:
+    raise RuntimeError("GOOGLE_CREDENTIALS is not set; expected service-account JSON.")
+if not bucket_name:
+    raise RuntimeError("BUCKET_NAME is not set; cannot locate the upload bucket.")
+storage_client = storage.Client.from_service_account_info(json.loads(google_credentials))
+def generate_intents(num_intents):
+    """Return ``num_intents`` strings sampled at random from four fixed pools.
+
+    Every draw first selects a pool kind ('question', 'answer', 'code' or
+    'dialogue') and then one entry of that pool, so repeats are expected.
+    """
+    question_pool = [
+        "What is the capital of France?",
+        "How do I create a function in Python?",
+        "What is the difference between a list and a tuple?",
+        "Can you explain the concept of recursion?",
+        "How can I read a file in Python?"
+    ]
+    answer_pool = [
+        "The capital of France is Paris.",
+        "You can create a function in Python using the 'def' keyword.",
+        "A list is mutable, while a tuple is immutable.",
+        "Recursion is a process in which a function calls itself.",
+        "You can read a file in Python using the open() function."
+    ]
+    snippet_pool = [
+        "def my_function(param): return param * 2",
+        "print('Hello, World!')",
+        "for i in range(5): print(i)",
+        "if condition: do_something()",
+        "import numpy as np"
+    ]
+    dialogue_pool = [
+        "Hey! How are you doing today?",
+        "Could you help me with a coding problem?",
+        "What's your favorite programming language?",
+        "I was thinking about learning data science.",
+        "Do you know any good resources for learning Python?"
+    ]
+    # Any kind that is not one of the three keyed here falls back to dialogue.
+    pool_by_kind = {
+        'question': question_pool,
+        'answer': answer_pool,
+        'code': snippet_pool,
+    }
+    sampled = []
+    for _ in range(num_intents):
+        kind = random.choice(['question', 'answer', 'code', 'dialogue'])
+        sampled.append(random.choice(pool_by_kind.get(kind, dialogue_pool)))
+    return sampled
+def generate_biblical_texts(num_texts):
+    """Draw ``num_texts`` quotes, with replacement, from a fixed five-quote pool."""
+    quote_pool = (
+        "For I know the plans I have for you, declares the Lord.",
+        "The Lord is my shepherd; I shall not want.",
+        "I can do all things through Christ who strengthens me.",
+        "And we know that in all things God works for the good of those who love him.",
+        "Trust in the Lord with all your heart and lean not on your own understanding.",
+    )
+    sampled = random.choices(quote_pool, k=num_texts)
+    return sampled
+def generate_disease_info(num_info):
+    """Draw ``num_info`` disease descriptions, with replacement, from a fixed pool of five."""
+    description_pool = (
+        "Diabetes is a chronic disease that occurs when the body cannot effectively use insulin.",
+        "Hypertension, or high blood pressure, can lead to serious health issues if left untreated.",
+        "Asthma is a condition in which your airways narrow and swell and may produce extra mucus.",
+        "Heart disease encompasses a range of conditions that affect your heart's structure and function.",
+        "COVID-19 is caused by the coronavirus SARS-CoV-2, which can lead to severe respiratory illness.",
+    )
+    sampled = random.choices(description_pool, k=num_info)
+    return sampled
+def generate_medication_info(num_info):
+    """Draw ``num_info`` medication descriptions, with replacement, from a fixed pool of five."""
+    description_pool = (
+        "Aspirin is commonly used to reduce pain, fever, and inflammation.",
+        "Ibuprofen is a non-steroidal anti-inflammatory drug (NSAID) that helps alleviate pain.",
+        "Metformin is used to manage blood sugar levels in people with type 2 diabetes.",
+        "Statins are a class of drugs used to lower cholesterol levels.",
+        "Amoxicillin is an antibiotic used to treat bacterial infections.",
+    )
+    sampled = random.choices(description_pool, k=num_info)
+    return sampled
+def generate_news_info(num_info):
+    """Draw ``num_info`` news headlines, with replacement, from a fixed pool of five."""
+    headline_pool = (
+        "Scientists have made a breakthrough in renewable energy technology.",
+        "The stock market saw significant gains today amid positive economic news.",
+        "Global temperatures are on the rise, raising concerns about climate change.",
+        "Researchers have discovered a new species of dinosaur in South America.",
+        "Innovations in artificial intelligence are transforming industries worldwide.",
+    )
+    sampled = random.choices(headline_pool, k=num_info)
+    return sampled
+def generate_general_information(num_info):
+    """Draw ``num_info`` general-knowledge statements, with replacement, from a fixed pool of five."""
+    statement_pool = (
+        "Python is an interpreted, high-level programming language.",
+        "Machine learning is a field of artificial intelligence that uses statistical techniques.",
+        "Cloud computing enables on-demand access to computing resources.",
+        "Cybersecurity is crucial for protecting sensitive data from unauthorized access.",
+        "Blockchain technology is the backbone of cryptocurrencies like Bitcoin.",
+    )
+    sampled = random.choices(statement_pool, k=num_info)
+    return sampled
+def generate_historical_texts(num_texts, start_year, end_year):
+    """Sample ``num_texts`` placeholder sentences, one per drawn year.
+
+    The candidate years run from ``start_year`` through ``end_year`` inclusive;
+    sampling is with replacement, so a year may repeat.
+    """
+    sentence_per_year = []
+    for year in range(start_year, end_year + 1):
+        sentence_per_year.append(f"In {year}, significant historical events took place.")
+    return random.choices(sentence_per_year, k=num_texts)
+def generate_relate_history(num_texts):
+    """Draw ``num_texts`` historical statements, with replacement, from a fixed pool of five."""
+    event_pool = (
+        "In 1776, the Declaration of Independence was signed in the United States.",
+        "The Industrial Revolution began in the late 18th century.",
+        "The fall of the Berlin Wall in 1989 marked the end of the Cold War.",
+        "World War II ended in 1945.",
+        "The Apollo 11 mission landed the first humans on the moon in 1969.",
+    )
+    sampled = random.choices(event_pool, k=num_texts)
+    return sampled
+def generate_sentences(num_sentences):
+    """Return ``num_sentences`` statements, each picked at random from a fixed pool of five."""
+    statement_pool = [
+        "Artificial intelligence is transforming the world.",
+        "Deep learning is a subset of machine learning.",
+        "Python is a popular programming language.",
+        "Data science involves statistics and programming.",
+        "Natural language processing enables machines to understand human language."
+    ]
+    # One independent pick per requested sentence; repeats are allowed.
+    return [random.choice(statement_pool) for _ in range(num_sentences)]
+def generate_questions(num_questions):
+    """Return ``num_questions`` questions, each picked at random from a fixed pool of five."""
+    question_pool = [
+        "How does machine learning work?",
+        "What are the benefits of using TensorFlow?",
+        "Why is Python widely used in data science?",
+        "What is the importance of data preprocessing?",
+        "How can I improve my programming skills?"
+    ]
+    # One independent pick per requested question; repeats are allowed.
+    return [random.choice(question_pool) for _ in range(num_questions)]
+def expand_intent(intent, max_synonyms=2, synset_lookup=None):
+    """Return variants of ``intent`` with one token swapped for a WordNet lemma.
+
+    Every whitespace-separated token is looked up. For each of the first
+    ``max_synonyms`` synsets that has lemmas, the first lemma name replaces
+    that token (and only that token) to form one variant.
+
+    Args:
+        intent: Source sentence.
+        max_synonyms: Upper bound on replacements tried per token. Defaults to
+            2, the value that used to be hard-coded.
+        synset_lookup: Callable mapping a word to its synsets. Defaults to
+            ``wordnet.synsets``.
+
+    Returns:
+        A list of variant strings in token order. A variant equals ``intent``
+        when the lemma name is the token itself; tokens with no synsets
+        contribute nothing.
+    """
+    import re  # local import: only this function needs it
+    if synset_lookup is None:
+        synset_lookup = wordnet.synsets
+    variants = []
+    for token in re.finditer(r'\S+', intent):
+        lemma_names = [
+            synset.lemmas()[0].name()
+            for synset in synset_lookup(token.group())
+            if synset.lemmas()
+        ]
+        for lemma_name in lemma_names[:max_synonyms]:
+            # Splice by position: str.replace would also rewrite the same
+            # letters inside other words (e.g. "a" inside "data").
+            variants.append(intent[:token.start()] + lemma_name + intent[token.end():])
+    return variants
+# Generate the raw corpus: 100000 mixed intents plus a smaller fixed-size
+# sample from each of the other generators.
+num_intents = 100000
+intents = generate_intents(num_intents)
+sentences = generate_sentences(2000)
+questions = generate_questions(2000)
+biblical_texts = generate_biblical_texts(2000)
+disease_info = generate_disease_info(2000)
+medication_info = generate_medication_info(2000)
+news_info = generate_news_info(2000)
+general_info = generate_general_information(2000)
+historical_texts_1900_2024 = generate_historical_texts(100, 1900, 2024)
+historical_texts_1000_2024 = generate_historical_texts(200, 1000, 2024)
+historical_relations = generate_relate_history(200)
+# Append the extra samples to the intents list in generation order.
+intents += [
+    *sentences,
+    *questions,
+    *biblical_texts,
+    *disease_info,
+    *medication_info,
+    *news_info,
+    *general_info,
+    *historical_texts_1900_2024,
+    *historical_texts_1000_2024,
+    *historical_relations,
+]
+# Only the variants returned by expand_intent are carried forward.
+expanded_intents = []
+for intent in intents:
+    expanded_intents += expand_intent(intent)
+# set() drops duplicates; at most the first 100000 entries of the resulting
+# list (in whatever order the set yields them) are kept.
+intents = list(set(expanded_intents))[:100000]
+# Every example receives the same label value, 1.
+labels = [1] * len(intents)
+# Train-and-upload loop. The body contains no break, so it repeats indefinitely:
+# each pass re-splits the data, builds a fresh tokenizer and model, trains, and
+# then uploads artifacts that are not yet present in the bucket.
+# NOTE(review): BertTokenizer.from_pretrained, BERT.from_pretrained and
+# keras_nlp.callbacks.EarlyStopping are used as written by the author - confirm
+# that the installed keras_nlp actually provides these names.
 while True:
+    # Fixed random_state: the same intents/labels give the same 80/20 split on every pass.
+    X_train, X_val, y_train, y_val = train_test_split(intents, labels, test_size=0.2, random_state=42)
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    train_encodings = tokenizer(X_train, truncation=True, padding=True)
+    val_encodings = tokenizer(X_val, truncation=True, padding=True)
+    model = BERT.from_pretrained('bert-base-uncased')
+    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+    # Monitors 'val_loss' with patience=3, within at most 10 epochs.
+    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
+    history = model.fit(train_encodings, y_train, validation_data=(val_encodings, y_val), epochs=10, callbacks=[early_stopping])
+    # Report the 'accuracy' metric recorded for the last epoch that ran.
+    accuracy = history.history['accuracy'][-1]
+    print(f"Accuracy: {accuracy}")
+    intents_json = json.dumps(intents, ensure_ascii=False)
+    intents_file_path = 'intents.json'
+    model_file_path = 'model.h5'
+    bucket = storage_client.bucket(bucket_name)
+    intents_blob = bucket.blob(intents_file_path)
+    model_blob = bucket.blob(model_file_path)
+    # Upload only when the blob is absent; once it exists, later passes skip this branch.
+    if not intents_blob.exists():
+        intents_blob.upload_from_string(intents_json, content_type='application/json')
+        print(f"Intents uploaded to {intents_file_path} in bucket {bucket_name}.")
+    # Same rule for the model: it is saved locally and uploaded only when no blob exists yet.
+    if not model_blob.exists():
+        model.save(model_file_path)
+        model_blob.upload_from_filename(model_file_path)
+        print(f"Model uploaded to {model_file_path} in bucket {bucket_name}.")