zmbfeng committed on
Commit b91bc9f · 1 Parent(s): 1481eaa

Commonality score added; highest combined-score sentences moved to the front of each paragraph
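For context, a minimal self-contained sketch of the scoring idea this commit wires in: the combined score is the cosine similarity plus a "commonality" term, the fraction of query words that also appear in the sentence. The word-splitting below is an assumption; the diff only shows the final formula and the new three-value return.

# Hypothetical standalone version of combined_similarity, assuming common_words
# and query_words are simple lowercase word sets (the diff does not show how
# app.py builds them).
def combined_similarity(similarity, sentence, query):
    sentence_words = set(sentence.lower().split())
    query_words = set(query.lower().split())
    common_words = len(sentence_words & query_words)
    # Normalize by the query length so the commonality term stays in [0, 1]
    commonality = common_words / max(len(query_words), 1)
    combined_score = similarity + commonality
    return combined_score, similarity, commonality

# Example: cosine similarity 0.62 plus a 2/3 word overlap gives roughly 1.29
print(combined_similarity(0.62, "The cat sat on the mat", "cat on sofa"))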

Files changed (1)
  1. app.py +36 -27
app.py CHANGED
@@ -38,7 +38,7 @@ def combined_similarity(similarity, sentence, query):
 
     # Adjust the similarity score with the common words count
     combined_score = similarity + (common_words / max(len(query_words), 1))  # Normalize by the length of the query to keep the score between -1 and 1
-    return combined_score
+    return combined_score, similarity, (common_words / max(len(query_words), 1))
 
 big_text = """
 <div style='text-align: center;'>
@@ -124,53 +124,62 @@ if 'list_count' in st.session_state:
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")
+
     if query:
-        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
+        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
+            'cuda')
         with torch.no_grad():  # Disable gradient calculation for inference
             # Perform the forward pass on the GPU
             query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0,
                              :].cpu().numpy()  # Move the result to CPU and convert to NumPy
+
         paragraph_scores = []
         sentence_scores = []
-        sentence_encoding = []
-        total_count=len(st.session_state.paragraph_sentence_encodings)
+        total_count = len(st.session_state.paragraph_sentence_encodings)
         processing_progress_bar = st.progress(0)
-        for index,paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
-            progress_percentage = index / (total_count- 1)
+
+        for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
+            progress_percentage = index / (total_count - 1)
             processing_progress_bar.progress(progress_percentage)
-            best_similarity = -1
+
             sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                 if sentence_encoding:
                     similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
-                    # adjusted_similarity = similarity*len(sentence_encoding[0].split())**0.5
-                    combined_score = combined_similarity(similarity, sentence_encoding[0], query)
-
-                    # print("sentence="+sentence_encoding[0] + " len="+str())
-
-                    sentence_similarities.append(combined_score)
+                    combined_score, similarity_score, commonality_score = combined_similarity(similarity,
+                                                                                              sentence_encoding[0],
+                                                                                              query)
+                    sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                     sentence_scores.append((combined_score, sentence_encoding[0]))
-            # best_similarity = max(best_similarity, similarity)
-            sentence_similarities.sort(reverse=True)
+
+            sentence_similarities.sort(reverse=True, key=lambda x: x[0])
 
             # Calculate the average of the top three sentence similarities
             if len(sentence_similarities) >= 3:
-                top_three_avg_similarity = np.mean(sentence_similarities[:3])
+                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
+                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
+                top_three_sentences = sentence_similarities[:3]
             elif sentence_similarities:
-                top_three_avg_similarity = np.mean(sentence_similarities)
+                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities])
+                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities])
+                top_three_sentences = sentence_similarities
             else:
                 top_three_avg_similarity = 0
-            paragraph_scores.append((top_three_avg_similarity, paragraph_sentence_encoding[0]))
+                top_three_avg_commonality = 0
+                top_three_sentences = []
+
+            # Move top 3 sentences to the beginning of the paragraph
+            top_three_texts = [s[1] for s in top_three_sentences]
+            remaining_texts = [s[0] for s in paragraph_sentence_encoding[1] if s and s[0] not in top_three_texts]
+            reordered_paragraph = top_three_texts + remaining_texts
+
+            paragraph_scores.append(
+                (top_three_avg_similarity, top_three_avg_commonality, {'text': ' '.join(reordered_paragraph)}))
+
         sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
-        # Display the scores and sentences
-        # print("Top scored sentences and their scores:")
-        # for score, sentence in sentence_scores:  # Print top 10 for demonstration
-        #     print(f"Score: {score:.4f}, Sentence: {sentence}")
-        # Sort the paragraphs by their best similarity score
         paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
 
-        # Debug prints to understand the scores and paragraphs
         st.write("Top scored paragraphs and their scores:")
-        for score, paragraph in paragraph_scores[:5]:  # Print top 5 for debugging
-
-            st.write(f"Score: {score}, Paragraph: {paragraph['text']}")
+        for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
+            st.write(
+                f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}, Paragraph: {paragraph['text']}")