Spaces:

Sourikta
/

Min-Hash-Based-Document-Similarity-Checker

Sleeping

App Files Files Community

Sourikta committed on Dec 12, 2024

Commit

41abfdd

verified ·

1 Parent(s): 48a65ac

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -56

app.py CHANGED Viewed

@@ -1,74 +1,98 @@
-import os
-import fitz  # PyMuPDF for PDF extraction
-import docx  # python-docx for DOCX extraction
-from sentence_transformers import SentenceTransformer, util
-from fastapi import FastAPI, UploadFile
-import gradio as gr
-app = FastAPI()
-# Initialize the SentenceTransformer model
-model = SentenceTransformer('all-MiniLM-L6-v2')
-def extract_text_from_pdf(pdf_path):
-    try:
-        doc = fitz.open(pdf_path)
-        text = ""
-        for page in doc:
-            text += page.get_text()
-        return text
-    except Exception as e:
-        print(f"Error extracting text from PDF: {str(e)}")
-        return ""
 def extract_text_from_docx(docx_path):
     try:
-        doc = docx.Document(docx_path)
-        text = "\n".join([para.text for para in doc.paragraphs])
         return text
     except Exception as e:
         print(f"Error extracting text from DOCX: {str(e)}")
         return ""
-def calculate_cosine_similarity(doc1: str, doc2: str) -> float:
-    # Combine paragraphs into larger chunks (every 5 sentences for example)
-    def chunk_text(text, chunk_size=5):
-        sentences = text.split('.')
-        chunks = ['.'.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
-        return chunks
-    chunks1 = chunk_text(doc1)
-    chunks2 = chunk_text(doc2)
-    # Get embeddings for chunks
-    embeddings1 = model.encode(chunks1, convert_to_tensor=True)
-    embeddings2 = model.encode(chunks2, convert_to_tensor=True)
-    # Calculate cosine similarities between all chunk pairs
-    cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
-    # Calculate the mean of the max similarities for each chunk
-    max_similarities1 = cosine_similarities.max(dim=1)[0]
-    max_similarities2 = cosine_similarities.max(dim=0)[0]
-    mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
-    return mean_similarity.item()
-def similarity(file1, file2):
-    text1 = extract_text_from_pdf(file1.name) if file1.name.endswith('.pdf') else extract_text_from_docx(file1.name)
-    text2 = extract_text_from_pdf(file2.name) if file2.name.endswith('.pdf') else extract_text_from_docx(file2.name)
-    return calculate_cosine_similarity(text1, text2)
 # Create a Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Document Similarity Checker")
-    file1 = gr.File(label="Upload Document 1")
-    file2 = gr.File(label="Upload Document 2")
-    output = gr.Textbox(label="Similarity Score")
-    submit = gr.Button("Submit")
-    submit.click(fn=similarity, inputs=[file1, file2], outputs=output)
-# Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
-port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
-demo.launch(server_name="0.0.0.0", server_port=port)

+import docx  # Importing the required module for DOCX extraction
+from datasketch import MinHash, MinHashLSH  # Importing MinHash and LSH from datasketch
+import gradio as gr  # Importing Gradio for creating the web interface
+# Function to extract text from DOCX files
 def extract_text_from_docx(docx_path):
+    """Return the text of a DOCX file, one paragraph per line.
+
+    Any exception raised while opening or reading the document is printed
+    and an empty string is returned instead of propagating the error.
+    """
     try:
+        paragraphs = docx.Document(docx_path).paragraphs
+        return "\n".join(paragraph.text for paragraph in paragraphs)
     except Exception as e:
         print(f"Error extracting text from DOCX: {str(e)}")
         return ""
+# Function to calculate MinHash-based similarity between two texts
+def calculate_similarity(doc1, doc2, k=5, num_perm=128):
+    """Estimate the Jaccard similarity of two texts with MinHash.
+
+    Each text is broken into overlapping character k-grams (shingles), a
+    MinHash signature is built from each shingle set, and the two
+    signatures are compared.
+
+    Args:
+        doc1: First text.
+        doc2: Second text.
+        k: Shingle length in characters (must be at least 1).
+        num_perm: Number of permutations used for each MinHash signature.
+
+    Returns:
+        The estimated Jaccard similarity as a float. 0.0 is returned when
+        either text is empty, because there is nothing to compare.
+
+    Raises:
+        ValueError: If k is smaller than 1.
+    """
+    if k < 1:
+        raise ValueError("k must be at least 1")
+    def text_to_shingles(text):
+        # A non-empty text shorter than k would otherwise yield no shingles
+        # at all, so it is kept whole as a single shingle.
+        if 0 < len(text) < k:
+            return {text}
+        return {text[i:i + k] for i in range(len(text) - k + 1)}
+    shingles1 = text_to_shingles(doc1)
+    shingles2 = text_to_shingles(doc2)
+    # Signatures that were never updated compare as equal, which would
+    # report missing texts as a perfect match; bail out explicitly instead.
+    if not shingles1 or not shingles2:
+        return 0.0
+    minhash1 = MinHash(num_perm=num_perm)
+    minhash2 = MinHash(num_perm=num_perm)
+    for signature, shingles in ((minhash1, shingles1), (minhash2, shingles2)):
+        for shingle in shingles:
+            signature.update(shingle.encode('utf8'))
+    # Estimated Jaccard similarity of the two shingle sets
+    return minhash1.jaccard(minhash2)
+# Function to interpret similarity scores
+def interpret_similarity(score):
+    """Translate a numeric similarity score into a descriptive message.
+
+    A score of exactly 1.0 is reported as an exact match. Scores in
+    [0.8, 1.0), [0.5, 0.8) and [0.2, 0.5) map to high, moderate and low
+    similarity respectively; every other value is reported as very low.
+    """
+    if score == 1.0:
+        return "Exact Match! The documents are identical."
+    # (inclusive lower bound, exclusive upper bound, message)
+    bands = (
+        (0.8, 1.0, "High Similarity: The documents are very similar."),
+        (0.5, 0.8, "Moderate Similarity: The documents share some content."),
+        (0.2, 0.5, "Low Similarity: The documents have limited overlap."),
+    )
+    for lower, upper, verdict in bands:
+        if lower <= score < upper:
+            return verdict
+    return "Very Low Similarity: The documents are mostly different."
+# Function to handle the similarity calculation
+def similarity(doc1, doc2, file1=None, file2=None):
+    """Compare two documents and return a human-readable result.
+
+    For each document an uploaded DOCX file takes precedence; the pasted
+    text is used when no DOCX file was uploaded or when no text could be
+    extracted from it.
+
+    Args:
+        doc1: Pasted text for document 1.
+        doc2: Pasted text for document 2.
+        file1: Optional upload for document 1.
+        file2: Optional upload for document 2.
+
+    Returns:
+        A string with the similarity score and its interpretation, or a
+        prompt asking for the missing document.
+    """
+    text1 = _resolve_document_text(doc1, file1)
+    if not text1:
+        return "Please provide either a DOCX file or paste the text for Document 1."
+    text2 = _resolve_document_text(doc2, file2)
+    if not text2:
+        return "Please provide either a DOCX file or paste the text for Document 2."
+    score = calculate_similarity(text1, text2)
+    return f"Similarity Score: {score:.2f}\n{interpret_similarity(score)}"
+def _resolve_document_text(pasted_text, uploaded_file):
+    """Return the DOCX text if usable, else the pasted text, else ""."""
+    if uploaded_file is not None:
+        # Read the path from `.name` when present; tolerate a plain path
+        # string as well.
+        path = getattr(uploaded_file, "name", uploaded_file)
+        # Case-insensitive so that "REPORT.DOCX" is accepted too.
+        if str(path).lower().endswith('.docx'):
+            extracted = extract_text_from_docx(path)
+            if extracted:
+                return extracted
+            # Extraction failed or produced nothing: fall back to the
+            # pasted text rather than comparing against an empty document.
+    return pasted_text or ""
 # Create a Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("## 📄 Document Similarity Checker")
+    gr.Markdown(
+        "Compare two documents by uploading DOCX files or pasting text. The app calculates similarity using MinHash and provides an interpretative score.")
+    # The two document panels differ only in their heading, so build them in a loop.
+    uploads = []
+    pasted = []
+    with gr.Row():
+        for heading in ("### Document 1", "### Document 2"):
+            with gr.Column():
+                gr.Markdown(heading)
+                uploads.append(gr.File(label="Upload DOCX File"))
+                pasted.append(gr.Textbox(label="Or Paste Text Here", lines=10, placeholder="Paste document text..."))
+    file1, file2 = uploads
+    doc1, doc2 = pasted
+    output = gr.Textbox(label="Result", lines=3)
+    submit = gr.Button("Check Similarity", variant="primary")
+    # Input order must match the parameter order of similarity()
+    submit.click(fn=similarity, inputs=[doc1, doc2, file1, file2], outputs=output)
+# Start the Gradio app
+demo.launch()