Update app.py
Browse files
app.py
CHANGED
@@ -1,74 +1,98 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
import
|
4 |
-
from sentence_transformers import SentenceTransformer, util
|
5 |
-
from fastapi import FastAPI, UploadFile
|
6 |
-
import gradio as gr
|
7 |
-
|
8 |
-
app = FastAPI()
|
9 |
-
|
10 |
-
# Initialize the SentenceTransformer model
|
11 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
12 |
-
|
13 |
-
def extract_text_from_pdf(pdf_path):
|
14 |
-
try:
|
15 |
-
doc = fitz.open(pdf_path)
|
16 |
-
text = ""
|
17 |
-
for page in doc:
|
18 |
-
text += page.get_text()
|
19 |
-
return text
|
20 |
-
except Exception as e:
|
21 |
-
print(f"Error extracting text from PDF: {str(e)}")
|
22 |
-
return ""
|
23 |
|
|
|
24 |
def extract_text_from_docx(docx_path):
|
25 |
try:
|
26 |
-
doc = docx.Document(docx_path)
|
27 |
-
text = "\n".join([para.text for para in doc.paragraphs])
|
28 |
return text
|
29 |
except Exception as e:
|
30 |
print(f"Error extracting text from DOCX: {str(e)}")
|
31 |
return ""
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
def
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
embeddings2 = model.encode(chunks2, convert_to_tensor=True)
|
46 |
|
47 |
-
#
|
48 |
-
|
|
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
|
|
|
|
|
|
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
# Create a Gradio interface
|
63 |
with gr.Blocks() as demo:
|
64 |
-
gr.Markdown("## Document Similarity Checker")
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
-
#
|
73 |
-
|
74 |
-
demo.launch(server_name="0.0.0.0", server_port=port)
|
|
|
1 |
+
import docx # Importing the required module for DOCX extraction
|
2 |
+
from datasketch import MinHash, MinHashLSH # Importing MinHash and LSH from datasketch
|
3 |
+
import gradio as gr # Importing Gradio for creating the web interface
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
# Function to extract text from DOCX files
|
6 |
def extract_text_from_docx(docx_path):
|
7 |
try:
|
8 |
+
doc = docx.Document(docx_path) # Open the DOCX file
|
9 |
+
text = "\n".join([para.text for para in doc.paragraphs]) # Extract the text from paragraphs
|
10 |
return text
|
11 |
except Exception as e:
|
12 |
print(f"Error extracting text from DOCX: {str(e)}")
|
13 |
return ""
|
14 |
|
15 |
+
# Function to calculate MinHash-based similarity between two texts
|
16 |
+
def calculate_similarity(doc1, doc2):
|
17 |
+
def text_to_shingles(text, k=5):
|
18 |
+
# Split the text into k-grams (shingles) of length k
|
19 |
+
shingles = set()
|
20 |
+
for i in range(len(text) - k + 1):
|
21 |
+
shingles.add(text[i:i + k])
|
22 |
+
return shingles
|
23 |
+
|
24 |
+
# Generate shingles for both documents
|
25 |
+
shingles1 = text_to_shingles(doc1)
|
26 |
+
shingles2 = text_to_shingles(doc2)
|
27 |
+
|
28 |
+
# Compute MinHash signatures
|
29 |
+
minhash1 = MinHash(num_perm=128)
|
30 |
+
minhash2 = MinHash(num_perm=128)
|
31 |
|
32 |
+
for shingle in shingles1:
|
33 |
+
minhash1.update(shingle.encode('utf8'))
|
34 |
|
35 |
+
for shingle in shingles2:
|
36 |
+
minhash2.update(shingle.encode('utf8'))
|
|
|
37 |
|
38 |
+
# Compute Jaccard similarity using MinHash
|
39 |
+
similarity_score = minhash1.jaccard(minhash2)
|
40 |
+
return similarity_score
|
41 |
|
42 |
+
# Function to interpret similarity scores
|
43 |
+
def interpret_similarity(score):
|
44 |
+
if score == 1.0:
|
45 |
+
return "Exact Match! The documents are identical."
|
46 |
+
elif 0.8 <= score < 1.0:
|
47 |
+
return "High Similarity: The documents are very similar."
|
48 |
+
elif 0.5 <= score < 0.8:
|
49 |
+
return "Moderate Similarity: The documents share some content."
|
50 |
+
elif 0.2 <= score < 0.5:
|
51 |
+
return "Low Similarity: The documents have limited overlap."
|
52 |
+
else:
|
53 |
+
return "Very Low Similarity: The documents are mostly different."
|
54 |
|
55 |
+
# Function to handle the similarity calculation
|
56 |
+
def similarity(doc1, doc2, file1=None, file2=None):
|
57 |
+
text1 = ""
|
58 |
+
text2 = ""
|
59 |
|
60 |
+
# Check for file uploads
|
61 |
+
if file1 is not None and file1.name.endswith('.docx'):
|
62 |
+
text1 = extract_text_from_docx(file1.name)
|
63 |
+
elif doc1:
|
64 |
+
text1 = doc1
|
65 |
+
else:
|
66 |
+
return "Please provide either a DOCX file or paste the text for Document 1."
|
67 |
+
|
68 |
+
if file2 is not None and file2.name.endswith('.docx'):
|
69 |
+
text2 = extract_text_from_docx(file2.name)
|
70 |
+
elif doc2:
|
71 |
+
text2 = doc2
|
72 |
+
else:
|
73 |
+
return "Please provide either a DOCX file or paste the text for Document 2."
|
74 |
+
|
75 |
+
score = calculate_similarity(text1, text2)
|
76 |
+
return f"Similarity Score: {score:.2f}\n{interpret_similarity(score)}"
|
77 |
|
# Create a Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Document Similarity Checker")
    gr.Markdown(
        "Compare two documents by uploading DOCX files or pasting text. The app calculates similarity using MinHash and provides an interpretative score.")
    # Two side-by-side columns, one per document; each accepts either a
    # DOCX upload or pasted text (the handler prefers the upload).
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Document 1")
            file1 = gr.File(label="Upload DOCX File")
            doc1 = gr.Textbox(label="Or Paste Text Here", lines=10, placeholder="Paste document text...")
        with gr.Column():
            gr.Markdown("### Document 2")
            file2 = gr.File(label="Upload DOCX File")
            doc2 = gr.Textbox(label="Or Paste Text Here", lines=10, placeholder="Paste document text...")
    # Single output textbox for the score + interpretation message.
    output = gr.Textbox(label="Result", lines=3)
    submit = gr.Button("Check Similarity", variant="primary")

    # Wire the button to the handler; input order must match the
    # similarity(doc1, doc2, file1, file2) signature.
    submit.click(fn=similarity, inputs=[doc1, doc2, file1, file2], outputs=output)

# Launch the Gradio app
demo.launch()
|