from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Dataset identifiers that are handed to `load_dataset` one by one.
# Each identifier appears exactly once: a repeated entry would only make the
# loader fetch the same dataset a second time for the same dictionary key.
dataset_names = [
    "b-mc2/sql-create-context",
    "TuneIt/o1-python",
    "HuggingFaceFW/fineweb-2",
    "sentence-transformers/embedding-training-data",
    "prithivMLmods/Deepthink-Reasoning",
    "O1-OPEN/OpenO1-SFT",
    "Clinton/Text-to-sql-v1",
    "RUC-NLPIR/FlashRAG_datasets",
]


# Load every dataset eagerly and keep the results keyed by identifier.
# NOTE(review): some of these repositories may require an explicit config
# name or `data_files` argument, and some are presumably very large -- confirm
# that a bare `load_dataset(name)` is really what is wanted for each entry.
datasets = {name: load_dataset(name) for name in dataset_names}


# Sentence-embedding model used below to encode `sentences`.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Sample sentences to embed: four about security, four about Python.
# Every item ends with a comma (including the last) so that adjacent string
# literals can never be silently concatenated into a single element.
sentences = [
    "The firewall successfully blocked unauthorized access attempts.",
    "The system detected a potential phishing attack targeting users.",
    "Regular software updates are essential to patch known vulnerabilities.",
    "Implementing multi-factor authentication enhances account security.",
    "The function returns the sum of two numbers.",
    "A list comprehension provides a concise way to create lists.",
    "The 'try' block is used to handle exceptions in Python.",
    "Using 'lambda' allows for the creation of anonymous functions.",
]


# Encode all sample sentences with the sentence-embedding model.
embeddings = model.encode(sentences)

# Called with a single argument, `cosine_similarity` compares the rows of the
# input against each other, giving one score per pair of sentences.
similarities = cosine_similarity(embeddings)

# Show the dimensions of the similarity matrix, then the matrix itself.
print(similarities.shape)
print(similarities)


# Checkpoint shared by the tokenizer and the seq2seq model, so the two can
# never drift apart.
_TEXT_TO_SQL_CHECKPOINT = "cssupport/t5-small-awesome-text-to-sql"

tokenizer = AutoTokenizer.from_pretrained(_TEXT_TO_SQL_CHECKPOINT)
# NOTE: this rebinds `model`, which earlier in the script refers to the
# SentenceTransformer; from here on `model` is the seq2seq model.
model = AutoModelForSeq2SeqLM.from_pretrained(_TEXT_TO_SQL_CHECKPOINT)