RenAzum committed on
Commit
7ec8f89
·
1 Parent(s): 8c2b15e

upload file fix

Browse files
Files changed (1) hide show
  1. app.py +33 -19
app.py CHANGED
@@ -3,28 +3,38 @@ import fitz # PyMuPDF
3
  import docx
4
  from difflib import HtmlDiff, SequenceMatcher
5
  import os
6
- import re
7
 
8
- # Functions to extract text and metadata
9
- def extract_text_pdf(file):
10
- doc = fitz.open(file)
 
 
 
 
 
 
 
 
 
 
 
11
  text = ""
12
  for page in doc:
13
  text += page.get_text()
14
  return text
15
 
16
- def extract_text_word(file):
17
- doc = docx.Document(file)
18
  text = "\n".join([para.text for para in doc.paragraphs])
19
  return text
20
 
21
- def extract_metadata_pdf(file):
22
- doc = fitz.open(file)
23
  metadata = doc.metadata
24
  return metadata
25
 
26
- def extract_metadata_word(file):
27
- doc = docx.Document(file)
28
  core_props = doc.core_properties
29
  metadata = {
30
  "author": core_props.author,
@@ -33,7 +43,7 @@ def extract_metadata_word(file):
33
  }
34
  return metadata
35
 
36
- # Function to compare text using difflib and return highlighted HTML differences
37
  def compare_texts(text1, text2):
38
  differ = HtmlDiff()
39
  return differ.make_file(text1.splitlines(), text2.splitlines(), context=True, numlines=2)
@@ -54,6 +64,10 @@ edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])
54
 
55
  # Process if both files are uploaded
56
  if original_file and edited_file:
 
 
 
 
57
  # Identify file types
58
  original_ext = os.path.splitext(original_file.name)[1]
59
  edited_ext = os.path.splitext(edited_file.name)[1]
@@ -64,15 +78,15 @@ if original_file and edited_file:
64
  else:
65
  # Extract text and metadata
66
  if original_ext == ".pdf":
67
- original_text = extract_text_pdf(original_file)
68
- edited_text = extract_text_pdf(edited_file)
69
- original_metadata = extract_metadata_pdf(original_file)
70
- edited_metadata = extract_metadata_pdf(edited_file)
71
  else:
72
- original_text = extract_text_word(original_file)
73
- edited_text = extract_text_word(edited_file)
74
- original_metadata = extract_metadata_word(original_file)
75
- edited_metadata = extract_metadata_word(edited_file)
76
 
77
  # Display Metadata
78
  st.subheader("Metadata Comparison")
 
3
  import docx
4
  from difflib import HtmlDiff, SequenceMatcher
5
  import os
 
6
 
7
+ # Directory to save uploaded files
8
+ UPLOAD_DIR = "uploaded_files"
9
+ if not os.path.exists(UPLOAD_DIR):
10
+ os.makedirs(UPLOAD_DIR)
11
+
12
+ # Functions to save, extract text, and metadata
13
+ def save_uploaded_file(uploaded_file):
14
+ file_path = os.path.join(UPLOAD_DIR, uploaded_file.name)
15
+ with open(file_path, "wb") as f:
16
+ f.write(uploaded_file.getbuffer())
17
+ return file_path
18
+
19
+ def extract_text_pdf(file_path):
20
+ doc = fitz.open(file_path)
21
  text = ""
22
  for page in doc:
23
  text += page.get_text()
24
  return text
25
 
26
+ def extract_text_word(file_path):
27
+ doc = docx.Document(file_path)
28
  text = "\n".join([para.text for para in doc.paragraphs])
29
  return text
30
 
31
+ def extract_metadata_pdf(file_path):
32
+ doc = fitz.open(file_path)
33
  metadata = doc.metadata
34
  return metadata
35
 
36
+ def extract_metadata_word(file_path):
37
+ doc = docx.Document(file_path)
38
  core_props = doc.core_properties
39
  metadata = {
40
  "author": core_props.author,
 
43
  }
44
  return metadata
45
 
46
+ # Function to compare text and return highlighted HTML differences
47
  def compare_texts(text1, text2):
48
  differ = HtmlDiff()
49
  return differ.make_file(text1.splitlines(), text2.splitlines(), context=True, numlines=2)
 
64
 
65
  # Process if both files are uploaded
66
  if original_file and edited_file:
67
+ # Save uploaded files
68
+ original_file_path = save_uploaded_file(original_file)
69
+ edited_file_path = save_uploaded_file(edited_file)
70
+
71
  # Identify file types
72
  original_ext = os.path.splitext(original_file.name)[1]
73
  edited_ext = os.path.splitext(edited_file.name)[1]
 
78
  else:
79
  # Extract text and metadata
80
  if original_ext == ".pdf":
81
+ original_text = extract_text_pdf(original_file_path)
82
+ edited_text = extract_text_pdf(edited_file_path)
83
+ original_metadata = extract_metadata_pdf(original_file_path)
84
+ edited_metadata = extract_metadata_pdf(edited_file_path)
85
  else:
86
+ original_text = extract_text_word(original_file_path)
87
+ edited_text = extract_text_word(edited_file_path)
88
+ original_metadata = extract_metadata_word(original_file_path)
89
+ edited_metadata = extract_metadata_word(edited_file_path)
90
 
91
  # Display Metadata
92
  st.subheader("Metadata Comparison")