File size: 1,861 Bytes
b2d9c06 27745fe b2d9c06 a106517 b2d9c06 c0aa793 95e6354 b2d9c06 27745fe b2d9c06 c0aa793 b2d9c06 27745fe b2d9c06 c0aa793 27745fe b2d9c06 27745fe b2d9c06 c0aa793 27745fe b2d9c06 c0aa793 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Hugging Face Hub repositories to load. Each repository is listed once; a
# repeated entry would make load_dataset() fetch it again only to overwrite
# the same dict key.
# NOTE(review): some of these repositories (e.g. "HuggingFaceFW/fineweb-2")
# look like large, multi-configuration datasets; calling load_dataset(name)
# without a config or split may fail or download far more than intended --
# confirm before running.
dataset_names = [
    "b-mc2/sql-create-context",
    "TuneIt/o1-python",
    "HuggingFaceFW/fineweb-2",
    "sentence-transformers/embedding-training-data",
    "prithivMLmods/Deepthink-Reasoning",
    "O1-OPEN/OpenO1-SFT",
    "Clinton/Text-to-sql-v1",
    "RUC-NLPIR/FlashRAG_datasets",
]

# Map each repository name to whatever load_dataset() returns for it.
# dict.fromkeys() drops any repeated name (keeping first-seen order), so a
# duplicate added to the list later cannot trigger a redundant load.
datasets = {name: load_dataset(name) for name in dict.fromkeys(dataset_names)}
# Example sentences, grouped by topic: security operations, then Python.
_security_sentences = [
    "The firewall successfully blocked unauthorized access attempts.",
    "The system detected a potential phishing attack targeting users.",
    "Regular software updates are essential to patch known vulnerabilities.",
    "Implementing multi-factor authentication enhances account security.",
]
_python_sentences = [
    "The function returns the sum of two numbers.",
    "A list comprehension provides a concise way to create lists.",
    "The 'try' block is used to handle exceptions in Python.",
    "Using 'lambda' allows for the creation of anonymous functions.",
]
sentences = [*_security_sentences, *_python_sentences]

# Embed every sentence with the MiniLM sentence-embedding model.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = sentence_model.encode(sentences)

# Cosine similarity of the embeddings against themselves: one row and one
# column per sentence.
similarities = cosine_similarity(embeddings)

# Show the matrix shape first (expected: (8, 8)), then the matrix itself,
# each on its own line.
print(similarities.shape, similarities, sep="\n")
# Seq2Seq setup: the tokenizer and the model are both loaded from the same
# Hub checkpoint, so the id is spelled out once.
_SEQ2SEQ_CHECKPOINT = "cssupport/t5-small-awesome-text-to-sql"
tokenizer = AutoTokenizer.from_pretrained(_SEQ2SEQ_CHECKPOINT)
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(_SEQ2SEQ_CHECKPOINT)
|