"""Load several Hugging Face datasets, compare sentence embeddings, and load a text-to-SQL model.

The script runs top to bottom at import time:

1. Loads every dataset listed in ``dataset_names`` into the ``datasets`` dict.
2. Encodes a fixed list of sentences with a SentenceTransformer model and
   prints the pairwise cosine-similarity matrix.
3. Loads the tokenizer and weights of a T5 text-to-SQL checkpoint.
"""

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub identifiers of the datasets to load. Each identifier is listed once:
# the loop below keys its result by name, so a repeated entry would only
# trigger a second, redundant load of the same dataset.
dataset_names = [
    "b-mc2/sql-create-context",
    "TuneIt/o1-python",
    "HuggingFaceFW/fineweb-2",
    "sentence-transformers/embedding-training-data",
    "prithivMLmods/Deepthink-Reasoning",
    "O1-OPEN/OpenO1-SFT",
    "Clinton/Text-to-sql-v1",
    "RUC-NLPIR/FlashRAG_datasets",
]

# Load all datasets in one go, keyed by their hub identifier.
# NOTE(review): load_dataset(name) is called with no config or split here;
# some hub repositories may be very large or may require an explicit config
# name -- confirm that each entry loads as written.
datasets = {name: load_dataset(name) for name in dataset_names}

# Load SentenceTransformer model
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Define sentences
sentences = [
    "The firewall successfully blocked unauthorized access attempts.",
    "The system detected a potential phishing attack targeting users.",
    "Regular software updates are essential to patch known vulnerabilities.",
    "Implementing multi-factor authentication enhances account security.",
    "The function returns the sum of two numbers.",
    "A list comprehension provides a concise way to create lists.",
    "The 'try' block is used to handle exceptions in Python.",
    "Using 'lambda' allows for the creation of anonymous functions.",
]

# Compute sentence embeddings (one embedding per entry in ``sentences``)
embeddings = sentence_model.encode(sentences)

# Calculate cosine similarity between every pair of sentence embeddings
similarities = cosine_similarity(embeddings)

# Print similarity matrix shape and values
print(similarities.shape)  # Expected output: (8, 8) -- one row/column per sentence
print(similarities)

# Load transformer model for Seq2Seq tasks
tokenizer = AutoTokenizer.from_pretrained("cssupport/t5-small-awesome-text-to-sql")
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("cssupport/t5-small-awesome-text-to-sql")