taskswithcode committed on
Commit
02d9efc
·
1 Parent(s): 1feb2b0
Files changed (4) hide show
  1. app.py +23 -15
  2. sim_app_models.json +61 -1
  3. twc_embeddings.py +6 -6
  4. twc_openai_embeddings.py +94 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from io import StringIO
6
  import pdb
7
  import json
8
  from twc_embeddings import HFModel,SimCSEModel,SGPTModel
 
9
  import torch
10
  import requests
11
  import socket
@@ -59,7 +60,7 @@ def get_views(action):
59
 
60
  def construct_model_info_for_display(model_names):
61
  options_arr = []
62
- markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>These are either state-of-the-art or the most downloaded models on Huggingface</i></div>"
63
  markdown_str += f"<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>"
64
  for node in model_names:
65
  options_arr .append(node["name"])
@@ -101,15 +102,15 @@ def load_model(model_name,model_class,load_model_name):
101
 
102
 
103
  @st.experimental_memo
104
- def cached_compute_similarity(sentences,_model,model_name,main_index):
105
- texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
106
  results = _model.output_results(None,texts,embeddings,main_index)
107
  return results
108
 
109
 
110
- def uncached_compute_similarity(sentences,_model,model_name,main_index):
111
  with st.spinner('Computing vectors for sentences'):
112
- texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
113
  results = _model.output_results(None,texts,embeddings,main_index)
114
  #st.success("Similarity computation complete")
115
  return results
@@ -121,7 +122,7 @@ def get_model_info(model_names,model_name):
121
  return node,model_name
122
  return get_model_info(model_names,DEFAULT_HF_MODEL)
123
 
124
- def run_test(model_names,model_name,sentences,display_area,main_index,user_uploaded,custom_model):
125
  display_area.text("Loading model:" + model_name)
126
  #Note. model_name may get mapped to new name in the call below for custom models
127
  orig_model_name = model_name
@@ -133,14 +134,18 @@ def run_test(model_names,model_name,sentences,display_area,main_index,user_uploa
133
  if ("Note" in model_info):
134
  fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
135
  display_area.write(fail_link)
 
 
 
 
136
  model = load_model(model_name,model_info["class"],load_model_name)
137
  display_area.text("Model " + model_name + " load complete")
138
  try:
139
  if (user_uploaded):
140
- results = uncached_compute_similarity(sentences,model,model_name,main_index)
141
  else:
142
  display_area.text("Computing vectors for sentences")
143
- results = cached_compute_similarity(sentences,model,model_name,main_index)
144
  display_area.text("Similarity computation complete")
145
  return results
146
 
@@ -250,15 +255,18 @@ def app_main(app_mode,example_files,model_name_files):
250
  st.session_state["model_name"] = run_model
251
  st.session_state["main_index"] = main_index
252
 
253
- results = run_test(model_names,run_model,sentences,display_area,main_index - 1,(uploaded_file is not None),(len(custom_model_selection) != 0))
254
  display_area.empty()
255
  with display_area.container():
256
- device = 'GPU' if torch.cuda.is_available() else 'CPU'
257
- response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
258
- if (len(custom_model_selection) != 0):
259
- st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
260
- display_results(sentences,main_index - 1,results,response_info,app_mode,run_model)
261
- #st.json(results)
 
 
 
262
  st.download_button(
263
  label="Download results as json",
264
  data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
 
6
  import pdb
7
  import json
8
  from twc_embeddings import HFModel,SimCSEModel,SGPTModel
9
+ from twc_openai_embeddings import OpenAIModel
10
  import torch
11
  import requests
12
  import socket
 
60
 
61
  def construct_model_info_for_display(model_names):
62
  options_arr = []
63
+ markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>The selected models satisfy one or more of the following (1) state-of-the-art (2) the most downloaded models on Huggingface (3) Large Language Models (e.g. GPT-3)</i></div>"
64
  markdown_str += f"<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>"
65
  for node in model_names:
66
  options_arr .append(node["name"])
 
102
 
103
 
104
  @st.experimental_memo
105
+ def cached_compute_similarity(input_file_name,sentences,_model,model_name,main_index):
106
+ texts,embeddings = _model.compute_embeddings(input_file_name,sentences,is_file=False)
107
  results = _model.output_results(None,texts,embeddings,main_index)
108
  return results
109
 
110
 
111
def uncached_compute_similarity(input_file_name,sentences,_model,model_name,main_index):
    """Compute similarity results without memoization, showing a spinner.

    run_test calls this for user-uploaded input.

    Args:
        input_file_name: Name of the input file, forwarded to the model.
        sentences: The sentences to embed (passed as data, not as a path).
        _model: Object exposing compute_embeddings() and output_results().
        model_name: Name of the model; not used in the body.
        main_index: Index of the sentence all others are compared against.

    Returns:
        Whatever ``_model.output_results`` returns for the embeddings.
    """
    with st.spinner('Computing vectors for sentences'):
        sentence_texts, vectors = _model.compute_embeddings(
            input_file_name, sentences, is_file=False
        )
        similarity = _model.output_results(None, sentence_texts, vectors, main_index)
    return similarity
 
122
  return node,model_name
123
  return get_model_info(model_names,DEFAULT_HF_MODEL)
124
 
125
+ def run_test(model_names,model_name,input_file_name,sentences,display_area,main_index,user_uploaded,custom_model):
126
  display_area.text("Loading model:" + model_name)
127
  #Note. model_name may get mapped to new name in the call below for custom models
128
  orig_model_name = model_name
 
134
  if ("Note" in model_info):
135
  fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
136
  display_area.write(fail_link)
137
+ if (user_uploaded and "custom_load" in model_info and model_info["custom_load"] == "False"):
138
+ fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
139
+ display_area.write(fail_link)
140
+ return {"error":fail_link}
141
  model = load_model(model_name,model_info["class"],load_model_name)
142
  display_area.text("Model " + model_name + " load complete")
143
  try:
144
  if (user_uploaded):
145
+ results = uncached_compute_similarity(input_file_name,sentences,model,model_name,main_index)
146
  else:
147
  display_area.text("Computing vectors for sentences")
148
+ results = cached_compute_similarity(input_file_name,sentences,model,model_name,main_index)
149
  display_area.text("Similarity computation complete")
150
  return results
151
 
 
255
  st.session_state["model_name"] = run_model
256
  st.session_state["main_index"] = main_index
257
 
258
+ results = run_test(model_names,run_model,st.session_state["file_name"],sentences,display_area,main_index - 1,(uploaded_file is not None),(len(custom_model_selection) != 0))
259
  display_area.empty()
260
  with display_area.container():
261
+ if ("error" in results):
262
+ st.error(results["error"])
263
+ else:
264
+ device = 'GPU' if torch.cuda.is_available() else 'CPU'
265
+ response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
266
+ if (len(custom_model_selection) != 0):
267
+ st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
268
+ display_results(sentences,main_index - 1,results,response_info,app_mode,run_model)
269
+ #st.json(results)
270
  st.download_button(
271
  label="Download results as json",
272
  data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
sim_app_models.json CHANGED
@@ -128,7 +128,67 @@
128
  },
129
  "paper_url":"https://arxiv.org/abs/2104.08821v4",
130
  "mark":"True",
131
- "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
 
134
  ]
 
128
  },
129
  "paper_url":"https://arxiv.org/abs/2104.08821v4",
130
  "mark":"True",
131
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
132
+ { "name":"GPT-3-175B (text-similarity-davinci-001)" ,
133
+ "model":"text-similarity-davinci-001",
134
+ "fork_url":"https://openai.com/api/",
135
+ "orig_author_url":"https://openai.com/api/",
136
+ "orig_author":"OpenAI",
137
+ "sota_info": {
138
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
139
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
140
+ },
141
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
142
+ "mark":"True",
143
+ "custom_load":"False",
144
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
145
+ "alt_url":"https://openai.com/api/",
146
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
147
+ { "name":"GPT-3-6.7B (text-similarity-curie-001)" ,
148
+ "model":"text-similarity-curie-001",
149
+ "fork_url":"https://openai.com/api/",
150
+ "orig_author_url":"https://openai.com/api/",
151
+ "orig_author":"OpenAI",
152
+ "sota_info": {
153
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
154
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
155
+ },
156
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
157
+ "mark":"True",
158
+ "custom_load":"False",
159
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
160
+ "alt_url":"https://openai.com/api/",
161
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
162
+ { "name":"GPT-3-1.3B (text-similarity-babbage-001)" ,
163
+ "model":"text-similarity-babbage-001",
164
+ "fork_url":"https://openai.com/api/",
165
+ "orig_author_url":"https://openai.com/api/",
166
+ "orig_author":"OpenAI",
167
+ "sota_info": {
168
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
169
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
170
+ },
171
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
172
+ "mark":"True",
173
+ "custom_load":"False",
174
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
175
+ "alt_url":"https://openai.com/api/",
176
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
177
+ { "name":"GPT-3-350M (text-similarity-ada-001)" ,
178
+ "model":"text-similarity-ada-001",
179
+ "fork_url":"https://openai.com/api/",
180
+ "orig_author_url":"https://openai.com/api/",
181
+ "orig_author":"OpenAI",
182
+ "sota_info": {
183
+ "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
184
+ "sota_link":"https://paperswithcode.com/method/gpt-3"
185
+ },
186
+ "paper_url":"https://arxiv.org/abs/2005.14165v4",
187
+ "mark":"True",
188
+ "custom_load":"False",
189
+ "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
190
+ "alt_url":"https://openai.com/api/",
191
+ "class":"OpenAIModel","sota_link":"https://arxiv.org/abs/2005.14165v4"}
192
 
193
 
194
  ]
twc_embeddings.py CHANGED
@@ -32,7 +32,7 @@ class CausalLMModel:
32
  self.model.eval()
33
  self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
34
 
35
- def compute_embeddings(self,input_data,is_file):
36
  if (self.debug):
37
  print("Computing embeddings for:", input_data[:20])
38
  model = self.model
@@ -160,7 +160,7 @@ class SGPTQnAModel:
160
 
161
  return embeddings
162
 
163
- def compute_embeddings(self,input_data,is_file):
164
  if (self.debug):
165
  print("Computing embeddings for:", input_data[:20])
166
  model = self.model
@@ -215,7 +215,7 @@ class SimCSEModel:
215
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
216
  self.model = AutoModel.from_pretrained(model_name)
217
 
218
- def compute_embeddings(self,input_data,is_file):
219
  texts = read_text(input_data) if is_file == True else input_data
220
  inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
221
  with torch.no_grad():
@@ -266,7 +266,7 @@ class SGPTModel:
266
  # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
267
  self.model.eval()
268
 
269
- def compute_embeddings(self,input_data,is_file):
270
  if (self.debug):
271
  print("Computing embeddings for:", input_data[:20])
272
  model = self.model
@@ -353,7 +353,7 @@ class HFModel:
353
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
354
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
355
 
356
- def compute_embeddings(self,input_data,is_file):
357
  #print("Computing embeddings for:", input_data[:20])
358
  model = self.model
359
  tokenizer = self.tokenizer
@@ -403,5 +403,5 @@ if __name__ == '__main__':
403
  results = parser.parse_args()
404
  obj = HFModel()
405
  obj.init_model(results.model)
406
- texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
407
  results = obj.output_results(results.output,texts,embeddings)
 
32
  self.model.eval()
33
  self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
34
 
35
+ def compute_embeddings(self,input_file_name,input_data,is_file):
36
  if (self.debug):
37
  print("Computing embeddings for:", input_data[:20])
38
  model = self.model
 
160
 
161
  return embeddings
162
 
163
+ def compute_embeddings(self,input_file_name,input_data,is_file):
164
  if (self.debug):
165
  print("Computing embeddings for:", input_data[:20])
166
  model = self.model
 
215
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
216
  self.model = AutoModel.from_pretrained(model_name)
217
 
218
+ def compute_embeddings(self,input_file_name,input_data,is_file):
219
  texts = read_text(input_data) if is_file == True else input_data
220
  inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
221
  with torch.no_grad():
 
266
  # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
267
  self.model.eval()
268
 
269
+ def compute_embeddings(self,input_file_name,input_data,is_file):
270
  if (self.debug):
271
  print("Computing embeddings for:", input_data[:20])
272
  model = self.model
 
353
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
354
  return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
355
 
356
+ def compute_embeddings(self,input_file_name,input_data,is_file):
357
  #print("Computing embeddings for:", input_data[:20])
358
  model = self.model
359
  tokenizer = self.tokenizer
 
403
  results = parser.parse_args()
404
  obj = HFModel()
405
  obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
407
  results = obj.output_results(results.output,texts,embeddings)
twc_openai_embeddings.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.spatial.distance import cosine
2
+ import argparse
3
+ import json
4
+ import os
5
+ import openai
6
+ import pdb
7
+
8
def read_text(input_file):
    """Read a text file and return its lines as a list of strings.

    The content is split on newline characters. Only the empty remainder
    that follows a final newline is dropped, so a file that does not end
    with a newline keeps its last line instead of silently losing it.

    Args:
        input_file: Path of the text file to read.

    Returns:
        The lines of the file without terminators; [] for an empty file.
    """
    # The context manager closes the handle even if reading fails.
    with open(input_file) as fp:
        lines = fp.read().split("\n")
    # Discard the last element only when it is the empty piece after a
    # trailing newline; otherwise it is a real line of input.
    if lines and lines[-1] == "":
        lines.pop()
    return lines
11
+
12
+
13
class OpenAIModel:
    """Sentence-embedding backend that calls the OpenAI embeddings API.

    Embeddings are cached on disk as JSON files named after the model and
    the input file, so a repeated run over the same input can skip the API
    call.
    """

    def __init__(self):
        self.debug = False
        self.model_name = None
        print("In OpenAI API constructor")

    def init_model(self,model_name = None):
        """Read the API key from the environment and select the model.

        Args:
            model_name: OpenAI embedding model to use; falls back to
                "text-similarity-ada-001" when None.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")
        # os.getenv returns None when the variable is unset, so test
        # truthiness instead of calling len() on a possible None.
        if not openai.api_key:
            print("Open API key not set")

        self.model_name = "text-similarity-ada-001" if model_name is None else model_name

    def _cache_path(self,input_file_name):
        """Return the embeddings cache file name for an input file name."""
        parts = input_file_name.split('.')
        # Drop the extension when there is one. A name without any dot is
        # used whole; otherwise every extensionless input would share a
        # single cache file per model.
        stem = '.'.join(parts[:-1]) if len(parts) > 1 else input_file_name
        return self.model_name + stem + "_embed.json"

    @staticmethod
    def _load_cached_embeddings(cache_file):
        """Return the embeddings stored in cache_file, or None if unusable."""
        try:
            with open(cache_file) as fp:
                return json.load(fp)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON: treat as a cache miss.
            return None

    def compute_embeddings(self,input_file_name,input_data,is_file):
        """Return the input texts and one embedding per text.

        Args:
            input_file_name: Name used to derive the cache file name.
            input_data: Path of a text file when is_file is True, otherwise
                the list of sentences itself.
            is_file: Whether input_data is a path to read with read_text.

        Returns:
            A (texts, embeddings) tuple. Both are empty lists when no API
            key is configured.
        """
        if not openai.api_key:
            print("Open API key not set")
            return [],[]

        texts = read_text(input_data) if is_file == True else input_data
        cache_file = self._cache_path(input_file_name)
        embeddings = self._load_cached_embeddings(cache_file)
        # Use the cache only when it holds exactly one vector per text; a
        # stale cache would otherwise pair sentences with wrong vectors or
        # fail later with an IndexError.
        if isinstance(embeddings, list) and len(embeddings) == len(texts):
            print("Using cached embeddings")
            return texts,embeddings

        print(f"Computing embeddings for {input_file_name} and model {self.model_name}")
        response = openai.Embedding.create(
            input=texts,
            model=self.model_name
        )
        embeddings = [item['embedding'] for item in response['data']]
        with open(cache_file,"w") as fp:
            json.dump(embeddings,fp)
        return texts,embeddings

    def output_results(self,output_file,texts,embeddings,main_index = 0):
        """Rank all texts by cosine similarity to the text at main_index.

        Args:
            output_file: Path to write the ranking to as JSON, or None to
                skip writing.
            texts: The sentences, in the same order as embeddings.
            embeddings: One vector per sentence.
            main_index: Index of the reference sentence.

        Returns:
            A dict mapping each text to its similarity with the reference
            text, ordered from most to least similar. Duplicate texts share
            one entry. Returns {} when no API key is configured.
        """
        if not openai.api_key:
            print("Open API key not set")
            return {}
        # Cosine similarities are in [-1, 1]. Higher means more similar.
        # scipy's cosine() is a distance, hence the "1 -".
        cosine_dict = {
            text: 1 - cosine(embeddings[main_index], embeddings[i])
            for i, text in enumerate(texts)
        }

        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
        if (self.debug):
            for key, score in sorted_dict.items():
                print("Cosine similarity with \"%s\" is: %.3f" % (key, score))
        if (output_file is not None):
            with open(output_file,"w") as fp:
                fp.write(json.dumps(sorted_dict,indent=0))
        return sorted_dict
81
+
82
+
83
+
84
if __name__ == '__main__':
    # Command-line entry point: embed the sentences of an input file and
    # write their similarity ranking to the output file.
    parser = argparse.ArgumentParser(description='OpenAI model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
    parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
    parser.add_argument('-model', action="store", dest="model",default="text-similarity-ada-001",help="model name")

    args = parser.parse_args()
    obj = OpenAIModel()
    obj.init_model(args.model)
    # compute_embeddings takes (input_file_name, input_data, is_file). On the
    # command line the input path serves both as the cache-key file name and
    # as the file to read, so it is passed twice.
    texts, embeddings = obj.compute_embeddings(args.input,args.input,is_file = True)
    results = obj.output_results(args.output,texts,embeddings)