from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Dataset names to pull from the Hugging Face Hub
dataset_names = [
    "b-mc2/sql-create-context",
    "TuneIt/o1-python",
    "HuggingFaceFW/fineweb-2",
    "HuggingFaceFW/fineweb-2",
    "sentence-transformers/embedding-training-data",
    "prithivMLmods/Deepthink-Reasoning",
    "O1-OPEN/OpenO1-SFT",
    "Clinton/Text-to-sql-v1",
    "RUC-NLPIR/FlashRAG_datasets"
]

# Load all datasets in one pass (large corpora such as fineweb-2 may require a
# specific config name and/or streaming=True to be practical)
datasets = {name: load_dataset(name) for name in dataset_names}
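
# Sketch (not in the original script): inspect what was loaded so downstream
# steps know which splits exist and how large they are. Only generic
# DatasetDict operations are used here, no dataset-specific field names.
for name, ds in datasets.items():
    print(name, {split: len(ds[split]) for split in ds})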

# Load SentenceTransformer model
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Define sentences
sentences = [
    "The firewall successfully blocked unauthorized access attempts.",
    "The system detected a potential phishing attack targeting users.",
    "Regular software updates are essential to patch known vulnerabilities.",
    "Implementing multi-factor authentication enhances account security.",
    "The function returns the sum of two numbers.",
    "A list comprehension provides a concise way to create lists.",
    "The 'try' block is used to handle exceptions in Python.",
    "Using 'lambda' allows for the creation of anonymous functions."
]

# Compute sentence embeddings
embeddings = sentence_model.encode(sentences)

# Calculate cosine similarity between sentence embeddings
similarities = cosine_similarity(embeddings)

# Print similarity matrix shape and values
print(similarities.shape)  # Expected output: (8, 8)
print(similarities)
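
# Sketch: rank the most similar sentence pair from the matrix above.
# numpy is assumed available (it is a dependency of the libraries imported above).
import numpy as np

pair_scores = np.triu(similarities, k=1)  # keep each pair once, drop the diagonal
i, j = np.unravel_index(np.argmax(pair_scores), pair_scores.shape)
print(f"Most similar pair ({similarities[i, j]:.3f}):")
print(f"  - {sentences[i]}")
print(f"  - {sentences[j]}")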

# Load transformer model for Seq2Seq tasks
tokenizer = AutoTokenizer.from_pretrained("cssupport/t5-small-awesome-text-to-sql")
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("cssupport/t5-small-awesome-text-to-sql")
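
# Sketch: one way to run the text-to-SQL model loaded above. The prompt format
# (schema followed by "query for:") is an assumption based on typical usage of
# this model family, and the table/question are illustrative placeholders.
prompt = (
    "tables:\n"
    "CREATE TABLE students (id INT, name TEXT, grade INT)\n"
    "query for: names of students with a grade above 90"
)
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = seq2seq_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))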