Sourikta committed on
Commit
41abfdd
·
verified ·
1 Parent(s): 48a65ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -56
app.py CHANGED
@@ -1,74 +1,98 @@
1
- import os
2
- import fitz # PyMuPDF for PDF extraction
3
- import docx # python-docx for DOCX extraction
4
- from sentence_transformers import SentenceTransformer, util
5
- from fastapi import FastAPI, UploadFile
6
- import gradio as gr
7
-
8
- app = FastAPI()
9
-
10
- # Initialize the SentenceTransformer model
11
- model = SentenceTransformer('all-MiniLM-L6-v2')
12
-
13
- def extract_text_from_pdf(pdf_path):
14
- try:
15
- doc = fitz.open(pdf_path)
16
- text = ""
17
- for page in doc:
18
- text += page.get_text()
19
- return text
20
- except Exception as e:
21
- print(f"Error extracting text from PDF: {str(e)}")
22
- return ""
23
 
 
24
  def extract_text_from_docx(docx_path):
25
  try:
26
- doc = docx.Document(docx_path)
27
- text = "\n".join([para.text for para in doc.paragraphs])
28
  return text
29
  except Exception as e:
30
  print(f"Error extracting text from DOCX: {str(e)}")
31
  return ""
32
 
33
- def calculate_cosine_similarity(doc1: str, doc2: str) -> float:
34
- # Combine paragraphs into larger chunks (every 5 sentences for example)
35
- def chunk_text(text, chunk_size=5):
36
- sentences = text.split('.')
37
- chunks = ['.'.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
38
- return chunks
 
 
 
 
 
 
 
 
 
 
39
 
40
- chunks1 = chunk_text(doc1)
41
- chunks2 = chunk_text(doc2)
42
 
43
- # Get embeddings for chunks
44
- embeddings1 = model.encode(chunks1, convert_to_tensor=True)
45
- embeddings2 = model.encode(chunks2, convert_to_tensor=True)
46
 
47
- # Calculate cosine similarities between all chunk pairs
48
- cosine_similarities = util.pytorch_cos_sim(embeddings1, embeddings2)
 
49
 
50
- # Calculate the mean of the max similarities for each chunk
51
- max_similarities1 = cosine_similarities.max(dim=1)[0]
52
- max_similarities2 = cosine_similarities.max(dim=0)[0]
53
- mean_similarity = (max_similarities1.mean() + max_similarities2.mean()) / 2.0
 
 
 
 
 
 
 
 
54
 
55
- return mean_similarity.item()
 
 
 
56
 
57
- def similarity(file1, file2):
58
- text1 = extract_text_from_pdf(file1.name) if file1.name.endswith('.pdf') else extract_text_from_docx(file1.name)
59
- text2 = extract_text_from_pdf(file2.name) if file2.name.endswith('.pdf') else extract_text_from_docx(file2.name)
60
- return calculate_cosine_similarity(text1, text2)
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # Create a Gradio interface
63
  with gr.Blocks() as demo:
64
- gr.Markdown("## Document Similarity Checker")
65
- file1 = gr.File(label="Upload Document 1")
66
- file2 = gr.File(label="Upload Document 2")
67
- output = gr.Textbox(label="Similarity Score")
68
- submit = gr.Button("Submit")
69
-
70
- submit.click(fn=similarity, inputs=[file1, file2], outputs=output)
 
 
 
 
 
 
 
 
 
71
 
72
- # Use the GRADIO_SERVER_PORT environment variable, default to 7860 if not set
73
- port = int(os.getenv('GRADIO_SERVER_PORT', 7860))
74
- demo.launch(server_name="0.0.0.0", server_port=port)
 
1
+ import docx # Importing the required module for DOCX extraction
2
+ from datasketch import MinHash, MinHashLSH # Importing MinHash and LSH from datasketch
3
+ import gradio as gr # Importing Gradio for creating the web interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
# Read a DOCX file and return the text of its paragraphs
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file as one newline-joined string.

    Args:
        docx_path: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        The text of every paragraph joined with ``"\\n"``. If anything goes
        wrong while opening or reading the document, the error is printed
        and an empty string is returned instead of raising.
    """
    try:
        document = docx.Document(docx_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        # Best-effort extraction: report the problem and let the caller
        # deal with an empty result.
        print(f"Error extracting text from DOCX: {str(e)}")
        return ""
14
 
15
+ # Function to calculate MinHash-based similarity between two texts
16
def calculate_similarity(doc1, doc2, k=5, num_perm=128):
    """Estimate the Jaccard similarity of two texts with MinHash.

    Each text is broken into overlapping character shingles of length ``k``;
    the shingle sets are summarised as MinHash signatures and compared.

    Args:
        doc1: First text.
        doc2: Second text.
        k: Shingle length in characters (must be >= 1).
        num_perm: Number of permutations used for each MinHash signature.

    Returns:
        A float in [0.0, 1.0]; 1.0 means the shingle sets are identical.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def text_to_shingles(text):
        # Empty input has no shingles at all.
        if not text:
            return set()
        # A text shorter than k would otherwise produce an empty set, and two
        # empty signatures compare as identical; treat the whole text as a
        # single shingle so short, different texts are not reported as equal.
        if len(text) < k:
            return {text}
        return {text[i:i + k] for i in range(len(text) - k + 1)}

    def build_signature(shingles):
        signature = MinHash(num_perm=num_perm)
        for shingle in shingles:
            signature.update(shingle.encode('utf8'))
        return signature

    shingles1 = text_to_shingles(doc1)
    shingles2 = text_to_shingles(doc2)

    # Identical shingle sets always yield identical signatures, so the
    # estimate would be exactly 1.0; skip the hashing work.
    if shingles1 == shingles2:
        return 1.0
    # Exactly one side is empty: nothing can overlap.
    if not shingles1 or not shingles2:
        return 0.0

    return float(build_signature(shingles1).jaccard(build_signature(shingles2)))
41
 
42
+ # Function to interpret similarity scores
43
def interpret_similarity(score):
    """Translate a similarity score into a human-readable verdict.

    Args:
        score: Similarity value, nominally in the range [0.0, 1.0].

    Returns:
        One of five fixed messages: exact match (score >= 1.0), high
        (>= 0.8), moderate (>= 0.5), low (>= 0.2) or very low (anything else,
        including NaN).
    """
    # Use >= rather than == so a score that lands a hair above 1.0 through
    # floating-point rounding is still reported as an exact match instead of
    # falling through to the "very low" bucket.
    if score >= 1.0:
        return "Exact Match! The documents are identical."

    # Lower bounds in descending order; the first one the score reaches wins.
    bands = (
        (0.8, "High Similarity: The documents are very similar."),
        (0.5, "Moderate Similarity: The documents share some content."),
        (0.2, "Low Similarity: The documents have limited overlap."),
    )
    for lower_bound, message in bands:
        if score >= lower_bound:
            return message
    return "Very Low Similarity: The documents are mostly different."
54
 
55
+ # Function to handle the similarity calculation
56
def _resolve_document_text(pasted_text, upload):
    """Pick the text for one document: an uploaded DOCX first, pasted text second.

    Returns an empty string when neither source yields usable text.
    """
    if upload is not None:
        # The upload may be an object exposing ``.name`` (what the original
        # code assumed) or a plain path string; accept both.
        path = str(getattr(upload, "name", upload))
        # Compare case-insensitively so "REPORT.DOCX" is recognised too.
        if path.lower().endswith(".docx"):
            extracted = extract_text_from_docx(path)
            # extract_text_from_docx returns "" on failure; do not score an
            # unreadable file as an empty document, fall back to pasted text.
            if extracted.strip():
                return extracted
    if pasted_text and pasted_text.strip():
        return pasted_text
    return ""


def similarity(doc1, doc2, file1=None, file2=None):
    """Compare two documents and return a formatted result string.

    For each document an uploaded ``.docx`` file takes precedence over the
    pasted text; if the upload is missing, is not a DOCX, or yields no text,
    the pasted text is used instead.

    Args:
        doc1: Pasted text for document 1 (may be empty or None).
        doc2: Pasted text for document 2 (may be empty or None).
        file1: Optional upload for document 1.
        file2: Optional upload for document 2.

    Returns:
        The similarity score (two decimals) plus its interpretation, or a
        prompt asking for input when a document has no usable text.
    """
    text1 = _resolve_document_text(doc1, file1)
    if not text1:
        return "Please provide either a DOCX file or paste the text for Document 1."

    text2 = _resolve_document_text(doc2, file2)
    if not text2:
        return "Please provide either a DOCX file or paste the text for Document 2."

    score = calculate_similarity(text1, text2)
    return f"Similarity Score: {score:.2f}\n{interpret_similarity(score)}"
77
 
78
  # Create a Gradio interface
79
with gr.Blocks() as demo:
    # NOTE(review): the nesting below is reconstructed from a flattened
    # source; confirm the result box and button belong under the row.
    gr.Markdown("## 📄 Document Similarity Checker")
    gr.Markdown(
        "Compare two documents by uploading DOCX files or pasting text. The app calculates similarity using MinHash and provides an interpretative score.")

    # Two side-by-side columns, one per document; each accepts either an
    # uploaded file or pasted text.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Document 1")
            file1 = gr.File(label="Upload DOCX File")
            doc1 = gr.Textbox(label="Or Paste Text Here", lines=10, placeholder="Paste document text...")
        with gr.Column():
            gr.Markdown("### Document 2")
            file2 = gr.File(label="Upload DOCX File")
            doc2 = gr.Textbox(label="Or Paste Text Here", lines=10, placeholder="Paste document text...")

    output = gr.Textbox(label="Result", lines=3)
    submit = gr.Button("Check Similarity", variant="primary")

    # Input order must match the parameter order of similarity():
    # pasted texts first, then the uploads.
    submit.click(fn=similarity, inputs=[doc1, doc2, file1, file2], outputs=output)

# Launch the Gradio app only when run as a script, so importing this module
# (e.g. to reuse the helper functions) does not start a web server.
if __name__ == "__main__":
    demo.launch()