zmbfeng committed on
Commit b91bc9f · 1 Parent(s): 1481eaa

Commonality score added; highest combined-score sentences moved to the front of each paragraph
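For context, a minimal self-contained sketch of the scoring idea this commit wires in: the combined score is the cosine similarity plus a "commonality" term, the fraction of query words that also appear in the sentence. The word-splitting below is an assumption; the diff only shows the final formula and the new three-value return.

# Hypothetical standalone version of combined_similarity, assuming common_words
# and query_words are simple lowercase word sets (the diff does not show how
# app.py builds them).
def combined_similarity(similarity, sentence, query):
    sentence_words = set(sentence.lower().split())
    query_words = set(query.lower().split())
    common_words = len(sentence_words & query_words)
    # Normalize by the query length so the commonality term stays in [0, 1]
    commonality = common_words / max(len(query_words), 1)
    combined_score = similarity + commonality
    return combined_score, similarity, commonality

# Example: cosine similarity 0.62 plus a 2/3 word overlap gives roughly 1.29
print(combined_similarity(0.62, "The cat sat on the mat", "cat on sofa"))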

Files changed (1)
  1. app.py +36 -27
app.py CHANGED
@@ -38,7 +38,7 @@ def combined_similarity(similarity, sentence, query):
 
     # Adjust the similarity score with the common words count
     combined_score = similarity + (common_words / max(len(query_words), 1))  # Normalize by the length of the query to keep the score between -1 and 1
-    return combined_score
+    return combined_score, similarity, (common_words / max(len(query_words), 1))
 
 big_text = """
 <div style='text-align: center;'>
@@ -124,53 +124,62 @@ if 'list_count' in st.session_state:
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")
+
     if query:
-        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
+        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
+            'cuda')
         with torch.no_grad():  # Disable gradient calculation for inference
             # Perform the forward pass on the GPU
             query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0,
                              :].cpu().numpy()  # Move the result to CPU and convert to NumPy
+
         paragraph_scores = []
         sentence_scores = []
-        sentence_encoding = []
-        total_count=len(st.session_state.paragraph_sentence_encodings)
+        total_count = len(st.session_state.paragraph_sentence_encodings)
         processing_progress_bar = st.progress(0)
-        for index,paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
-            progress_percentage = index / (total_count- 1)
+
+        for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
+            progress_percentage = index / (total_count - 1)
             processing_progress_bar.progress(progress_percentage)
-            best_similarity = -1
+
             sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                 if sentence_encoding:
                     similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
-                    # adjusted_similarity = similarity*len(sentence_encoding[0].split())**0.5
-                    combined_score = combined_similarity(similarity, sentence_encoding[0], query)
-
-                    # print("sentence="+sentence_encoding[0] + " len="+str())
-
-                    sentence_similarities.append(combined_score)
+                    combined_score, similarity_score, commonality_score = combined_similarity(similarity,
+                                                                                              sentence_encoding[0],
+                                                                                              query)
+                    sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                     sentence_scores.append((combined_score, sentence_encoding[0]))
-            # best_similarity = max(best_similarity, similarity)
-            sentence_similarities.sort(reverse=True)
+
+            sentence_similarities.sort(reverse=True, key=lambda x: x[0])
 
             # Calculate the average of the top three sentence similarities
             if len(sentence_similarities) >= 3:
-                top_three_avg_similarity = np.mean(sentence_similarities[:3])
+                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
+                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
+                top_three_sentences = sentence_similarities[:3]
             elif sentence_similarities:
-                top_three_avg_similarity = np.mean(sentence_similarities)
+                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities])
+                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities])
+                top_three_sentences = sentence_similarities
             else:
                 top_three_avg_similarity = 0
-            paragraph_scores.append((top_three_avg_similarity, paragraph_sentence_encoding[0]))
+                top_three_avg_commonality = 0
+                top_three_sentences = []
+
+            # Move top 3 sentences to the beginning of the paragraph
+            top_three_texts = [s[1] for s in top_three_sentences]
+            remaining_texts = [s[0] for s in paragraph_sentence_encoding[1] if s and s[0] not in top_three_texts]
+            reordered_paragraph = top_three_texts + remaining_texts
+
+            paragraph_scores.append(
+                (top_three_avg_similarity, top_three_avg_commonality, {'text': ' '.join(reordered_paragraph)}))
+
         sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
-        # Display the scores and sentences
-        # print("Top scored sentences and their scores:")
-        # for score, sentence in sentence_scores:  # Print top 10 for demonstration
-        #     print(f"Score: {score:.4f}, Sentence: {sentence}")
-        # Sort the paragraphs by their best similarity score
         paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
 
-        # Debug prints to understand the scores and paragraphs
         st.write("Top scored paragraphs and their scores:")
-        for score, paragraph in paragraph_scores[:5]:  # Print top 5 for debugging
-
-            st.write(f"Score: {score}, Paragraph: {paragraph['text']}")
+        for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
+            st.write(
+                f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}, Paragraph: {paragraph['text']}")