Canstralian's picture
Update app.py
b2d9c06 verified
raw
history blame
1.84 kB
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hugging Face Hub identifiers of the datasets fetched at start-up.
# Each identifier must appear exactly once: the mapping built below is keyed
# by name, so a repeated entry would only trigger a redundant load of the
# same dataset.
# NOTE(review): some of these repositories (e.g. fineweb-2, FlashRAG_datasets)
# are very large and may require an explicit config name -- confirm that a
# bare load_dataset(name) call is really what is intended for each of them.
dataset_names = [
    "b-mc2/sql-create-context",
    "TuneIt/o1-python",
    "HuggingFaceFW/fineweb-2",
    "sentence-transformers/embedding-training-data",
    "prithivMLmods/Deepthink-Reasoning",
    "O1-OPEN/OpenO1-SFT",
    "Clinton/Text-to-sql-v1",
    "RUC-NLPIR/FlashRAG_datasets",
]
# Eagerly load every dataset, keyed by its Hub identifier.
datasets = {name: load_dataset(name) for name in dataset_names}
# Load the sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Example sentences: four about security followed by four about Python.
# Every entry needs its own trailing comma -- adjacent string literals without
# a comma are silently concatenated into a single list element.
sentences = [
    "The firewall successfully blocked unauthorized access attempts.",
    "The system detected a potential phishing attack targeting users.",
    "Regular software updates are essential to patch known vulnerabilities.",
    "Implementing multi-factor authentication enhances account security.",
    "The function returns the sum of two numbers.",
    "A list comprehension provides a concise way to create lists.",
    "The 'try' block is used to handle exceptions in Python.",
    "Using 'lambda' allows for the creation of anonymous functions.",
]
# Compute one embedding per sentence.
embeddings = model.encode(sentences)
# Pairwise cosine similarity of the embeddings against themselves, which
# yields a square matrix with one row and one column per sentence.
similarities = cosine_similarity(embeddings)
# Print similarity matrix shape and values.
print(similarities.shape)  # Expected output: (8, 8)
print(similarities)
# Load the tokenizer and the Seq2Seq model from the same checkpoint.
# Note: this rebinds `model`, so from this point on the name refers to the
# Seq2Seq model rather than the SentenceTransformer loaded earlier.
_seq2seq_checkpoint = "cssupport/t5-small-awesome-text-to-sql"
tokenizer = AutoTokenizer.from_pretrained(_seq2seq_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(_seq2seq_checkpoint)