Canstralian committed on
Commit
b2d9c06
·
verified ·
1 Parent(s): 193caaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -13
app.py CHANGED
@@ -1,26 +1,50 @@
 
1
  from sentence_transformers import SentenceTransformer
2
  from sklearn.metrics.pairwise import cosine_similarity
3
- import numpy as np
4
 
5
- # Load the model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
7
 
8
- # Define your sentences
9
  sentences = [
10
- "That is a happy person",
11
- "That is a happy dog",
12
- "That is a very happy person",
13
- "Today is a sunny day"
 
 
 
 
14
  ]
15
 
16
- # Encode the sentences to get their embeddings
 
17
  embeddings = model.encode(sentences)
18
 
19
- # Compute the cosine similarity matrix
20
  similarities = cosine_similarity(embeddings)
21
 
22
- # Print the shape of the similarity matrix
23
- print(similarities.shape) # Output: (4, 4)
24
-
25
- # Optionally, print the similarity matrix
26
  print(similarities)
 
 
 
 
 
1
+ from datasets import load_dataset
2
  from sentence_transformers import SentenceTransformer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
 
6
+ # Load datasets
7
+ dataset_names = [
8
+ "b-mc2/sql-create-context",
9
+ "TuneIt/o1-python",
10
+ "HuggingFaceFW/fineweb-2",
11
+ "HuggingFaceFW/fineweb-2",
12
+ "sentence-transformers/embedding-training-data",
13
+ "prithivMLmods/Deepthink-Reasoning",
14
+ "O1-OPEN/OpenO1-SFT",
15
+ "Clinton/Text-to-sql-v1",
16
+ "RUC-NLPIR/FlashRAG_datasets"
17
+ ]
18
+
19
+ # Loading all datasets in one go
20
+ datasets = {name: load_dataset(name) for name in dataset_names}
21
+
22
+ # Load SentenceTransformer model
23
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
24
 
25
+ # Define sentences
26
  sentences = [
27
+ "The firewall successfully blocked unauthorized access attempts.",
28
+ "The system detected a potential phishing attack targeting users.",
29
+ "Regular software updates are essential to patch known vulnerabilities.",
30
+ "Implementing multi-factor authentication enhances account security.",
31
+ "The function returns the sum of two numbers.",
32
+ "A list comprehension provides a concise way to create lists.",
33
+ "The 'try' block is used to handle exceptions in Python.",
34
+ "Using 'lambda' allows for the creation of anonymous functions."
35
  ]
36
 
37
+
38
+ # Compute sentence embeddings
39
  embeddings = model.encode(sentences)
40
 
41
+ # Calculate cosine similarity between sentence embeddings
42
  similarities = cosine_similarity(embeddings)
43
 
44
+ # Print similarity matrix shape and values
45
+ print(similarities.shape) # Shape is (n, n), where n is the number of sentences
 
 
46
  print(similarities)
47
+
48
+ # Load transformer model for Seq2Seq tasks
49
+ tokenizer = AutoTokenizer.from_pretrained("cssupport/t5-small-awesome-text-to-sql")
50
+ model = AutoModelForSeq2SeqLM.from_pretrained("cssupport/t5-small-awesome-text-to-sql")