Spaces:
Sleeping
Sleeping
Commonality score added; sentences with the highest combined scores are moved to the front
Browse files
app.py
CHANGED
@@ -38,7 +38,7 @@ def combined_similarity(similarity, sentence, query):
|
|
38 |
|
39 |
# Adjust the similarity score with the common words count
|
40 |
combined_score = similarity + (common_words / max(len(query_words), 1)) # Normalize the common-word count by the query length; the sum can exceed 1, so the combined score is not confined to [-1, 1]
|
41 |
-
return combined_score
|
42 |
|
43 |
big_text = """
|
44 |
<div style='text-align: center;'>
|
@@ -124,53 +124,62 @@ if 'list_count' in st.session_state:
|
|
124 |
st.rerun()
|
125 |
if 'paragraph_sentence_encodings' in st.session_state:
|
126 |
query = st.text_input("Enter your query")
|
|
|
127 |
if query:
|
128 |
-
query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
|
|
|
129 |
with torch.no_grad(): # Disable gradient calculation for inference
|
130 |
# Perform the forward pass on the GPU
|
131 |
query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0,
|
132 |
:].cpu().numpy() # Move the result to CPU and convert to NumPy
|
|
|
133 |
paragraph_scores = []
|
134 |
sentence_scores = []
|
135 |
-
|
136 |
-
total_count=len(st.session_state.paragraph_sentence_encodings)
|
137 |
processing_progress_bar = st.progress(0)
|
138 |
-
|
139 |
-
|
|
|
140 |
processing_progress_bar.progress(progress_percentage)
|
141 |
-
|
142 |
sentence_similarities = []
|
143 |
for sentence_encoding in paragraph_sentence_encoding[1]:
|
144 |
if sentence_encoding:
|
145 |
similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
sentence_similarities.append(combined_score)
|
152 |
sentence_scores.append((combined_score, sentence_encoding[0]))
|
153 |
-
|
154 |
-
sentence_similarities.sort(reverse=True)
|
155 |
|
156 |
# Calculate the average of the top three sentence similarities
|
157 |
if len(sentence_similarities) >= 3:
|
158 |
-
top_three_avg_similarity = np.mean(sentence_similarities[:3])
|
|
|
|
|
159 |
elif sentence_similarities:
|
160 |
-
top_three_avg_similarity = np.mean(sentence_similarities)
|
|
|
|
|
161 |
else:
|
162 |
top_three_avg_similarity = 0
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
|
165 |
-
# Display the scores and sentences
|
166 |
-
# print("Top scored sentences and their scores:")
|
167 |
-
# for score, sentence in sentence_scores: # Print top 10 for demonstration
|
168 |
-
# print(f"Score: {score:.4f}, Sentence: {sentence}")
|
169 |
-
# Sort the paragraphs by their best similarity score
|
170 |
paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
|
171 |
|
172 |
-
# Debug prints to understand the scores and paragraphs
|
173 |
st.write("Top scored paragraphs and their scores:")
|
174 |
-
for
|
175 |
-
|
176 |
-
|
|
|
38 |
|
39 |
# Adjust the similarity score with the common words count
|
40 |
combined_score = similarity + (common_words / max(len(query_words), 1)) # Normalize the common-word count by the query length; the sum can exceed 1, so the combined score is not confined to [-1, 1]
|
41 |
+
return combined_score,similarity,(common_words / max(len(query_words), 1))
|
42 |
|
43 |
big_text = """
|
44 |
<div style='text-align: center;'>
|
|
|
124 |
st.rerun()
|
125 |
# Query UI: once paragraph/sentence encodings are stored in the session, read a
# free-text query, score every stored sentence against it, and show the five
# best paragraphs with their top-scoring sentences moved to the front.
if 'paragraph_sentence_encodings' in st.session_state:
    query = st.text_input("Enter your query")

    if query:
        # Run the query on whatever device the model already lives on rather
        # than hard-coding 'cuda', so the app also works on CPU-only hosts.
        # NOTE(review): assumes bert_model is a torch.nn.Module -- confirm.
        model_device = next(st.session_state.bert_model.parameters()).device
        query_tokens = st.session_state.bert_tokenizer(
            query, return_tensors="pt", padding=True, truncation=True
        ).to(model_device)

        with torch.no_grad():  # Inference only: no gradients needed
            # Take the first-token hidden state as the query embedding, then
            # move it to the CPU as a NumPy array for cosine_similarity.
            query_encoding = (
                st.session_state.bert_model(**query_tokens)
                .last_hidden_state[:, 0, :]
                .cpu()
                .numpy()
            )

        paragraph_scores = []
        sentence_scores = []
        total_count = len(st.session_state.paragraph_sentence_encodings)
        processing_progress_bar = st.progress(0)

        for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
            # (index + 1) / total_count reaches 1.0 on the last paragraph and,
            # unlike index / (total_count - 1), cannot divide by zero when
            # there is exactly one paragraph.
            processing_progress_bar.progress((index + 1) / total_count)

            # Entries are (combined_score, sentence_text, commonality_score).
            sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                if not sentence_encoding:
                    continue  # Skip empty placeholders
                sentence_text = sentence_encoding[0]
                similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
                combined_score, similarity_score, commonality_score = combined_similarity(
                    similarity, sentence_text, query
                )
                sentence_similarities.append((combined_score, sentence_text, commonality_score))
                sentence_scores.append((combined_score, sentence_text))

            sentence_similarities.sort(reverse=True, key=lambda x: x[0])

            # Average over the best three sentences, or over all of them when
            # the paragraph has fewer than three; an empty paragraph scores 0.
            top_three_sentences = sentence_similarities[:3]
            if top_three_sentences:
                top_three_avg_similarity = np.mean([s[0] for s in top_three_sentences])
                top_three_avg_commonality = np.mean([s[2] for s in top_three_sentences])
            else:
                top_three_avg_similarity = 0
                top_three_avg_commonality = 0

            # Move the top sentences to the beginning of the paragraph and keep
            # the remaining sentences in their original order.
            top_three_texts = [s[1] for s in top_three_sentences]
            remaining_texts = [
                s[0] for s in paragraph_sentence_encoding[1]
                if s and s[0] not in top_three_texts
            ]
            reordered_paragraph = top_three_texts + remaining_texts

            paragraph_scores.append(
                (top_three_avg_similarity, top_three_avg_commonality, {'text': ' '.join(reordered_paragraph)})
            )

        sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
        paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)

        st.write("Top scored paragraphs and their scores:")
        # The first tuple element is the average *combined* score of the top
        # sentences; it is shown under the "Similarity Score" label.
        for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
            st.write(
                f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}, Paragraph: {paragraph['text']}")
|