Spaces:

a-guy-from-burma
/

text-similarity-in-percentage

Running

App Files Files Community

a-guy-from-burma committed on Apr 20, 2024

Commit

84949b2

verified ·

1 Parent(s): f4e65c8

Update app.py

Browse files

chunk matching, new way

Files changed (1) hide show

app.py +29 -6

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 import nltk
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
 import requests
 nltk.download('averaged_perceptron_tagger')
@@ -33,10 +34,7 @@ def calculate_similarity(text1, text2):
     return f"{similarity.item():.2%} Similarity"
 def report_issue(text1, text2, similarity):
-    # Replace '[FORM_ID]' with the actual ID of your Google Form.
     url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
-    #https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/viewform?usp=sf_link
-    # Replace 'entry.XXXXX' with the actual entry IDs from your Google Form.
     data = {
         'entry.1041881480': text1,
         'entry.1520964719': text2,
@@ -48,6 +46,23 @@ def report_issue(text1, text2, similarity):
     else:
         return "Failed to send report."
 with gr.Blocks() as app:
     with gr.Row():
         text1 = gr.Textbox(label="Input Text 1")
@@ -55,14 +70,22 @@ with gr.Blocks() as app:
     with gr.Row():
         button = gr.Button("Calculate Similarity")
     output = gr.Text(label="Similarity")
     button.click(
-        fn=calculate_similarity,
         inputs=[text1, text2],
-        outputs=output
     )
-    report_button = gr.Button("Report to Developer")
     report_button.click(
         fn=report_issue,
         inputs=[text1, text2, output],

 import nltk
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
+from nltk.chunk import RegexpParser
 import requests
 nltk.download('averaged_perceptron_tagger')
     return f"{similarity.item():.2%} Similarity"
 def report_issue(text1, text2, similarity):
     url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
     data = {
         'entry.1041881480': text1,
         'entry.1520964719': text2,
     else:
         return "Failed to send report."
+def extract_chunks(text):
+    """Return the NP, PP and VP phrases found in ``text`` as a list of strings.
+
+    The text is tokenized, POS-tagged, and parsed with a regular-expression
+    chunk grammar; every subtree labelled NP, PP or VP is flattened to its
+    words joined by single spaces.
+    """
+    # Define grammar for chunking
+    # NOTE(review): <NN> presumably matches only the exact tag NN, so plural
+    # or proper nouns (NNS/NNP) would not be chunked - confirm against the
+    # NLTK tag-pattern docs whether <NN.*> was intended.
+    grammar = r"""
+    NP: {<DT>?<JJ>*<NN>+}   # Chunk sequences of DT, JJ, NN
+    PP: {<IN><NP>}          # Chunk prepositions followed by NP
+    VP: {<VB.*><NP|PP>*}    # Chunk verbs and their arguments
+    """
+    # Tokenize and POS-tag
+    # NOTE(review): only the 'averaged_perceptron_tagger' download is visible
+    # in this diff; word_tokenize presumably needs tokenizer data as well -
+    # verify it is downloaded elsewhere in the file.
+    words = word_tokenize(text)
+    tagged_words = pos_tag(words)
+    chunk_parser = RegexpParser(grammar)
+    tree = chunk_parser.parse(tagged_words)
+    # Extract phrases
+    # NOTE(review): subtrees() is expected to yield nested chunks too, so an
+    # NP inside a PP or VP likely appears both on its own and as part of the
+    # larger phrase - confirm whether that duplication is wanted.
+    phrases = [" ".join(word for word, tag in subtree.leaves()) for subtree in tree.subtrees() if subtree.label() in ['NP', 'PP', 'VP']]
+    return phrases
 with gr.Blocks() as app:
     with gr.Row():
         text1 = gr.Textbox(label="Input Text 1")
     with gr.Row():
         button = gr.Button("Calculate Similarity")
     output = gr.Text(label="Similarity")
+    chunks_output = gr.Text(label="Extracted Chunks")
+    def combined_function(text1, text2):
+        """Compute the similarity result and the extracted chunks for both texts.
+
+        Returns a ``(similarity, chunks_text)`` tuple: the value returned by
+        ``calculate_similarity`` and a two-line summary of the chunks found
+        in each input.
+        """
+        similarity = calculate_similarity(text1, text2)
+        chunks1 = extract_chunks(text1)
+        chunks2 = extract_chunks(text2)
+        # The chunk lists are interpolated directly, so the displayed text
+        # uses their Python list repr (brackets and quotes included).
+        chunks_text = f"Chunks in Text 1: {chunks1}\nChunks in Text 2: {chunks2}"
+        return similarity, chunks_text
     button.click(
+        fn=combined_function,
         inputs=[text1, text2],
+        outputs=[output, chunks_output]
     )
+    report_button = gr.Button("Send result for better training")
     report_button.click(
         fn=report_issue,
         inputs=[text1, text2, output],