Spaces: Update pdf_processor.py

pdf_processor.py CHANGED (+57 -39)
@@ -7,10 +7,11 @@ from transformers import pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import io
 import os
+import re
+import torch
 from typing import List, Dict
 from agents import create_judge_agent, create_advocate_agent
 from crewai import Task, Crew
-import torch
 
 class PDFProcessor:
     def __init__(self):
@@ -23,18 +24,17 @@ class PDFProcessor:
         # Initialize models with better memory management
         self.summarizer = pipeline(
             "summarization",
-            model="
-
-            torch_dtype=torch.float32,
-            batch_size=1
-            trust_remote_code=True
+            model="sshleifer/distilbart-cnn-6-6",  # Using a smaller, faster model
+            device="cpu",  # Use CPU for better compatibility
+            torch_dtype=torch.float32,
+            batch_size=1
         )
         self.progress_callback = None
 
         # Configure torch for memory efficiency
-        if torch.backends.mps.is_available(): # For Mac M1/M2
-
-
+        #if torch.backends.mps.is_available(): # For Mac M1/M2
+        #    torch.backends.mps.set_per_process_memory_fraction(0.7) # Use only 70% of available memory
+        if torch.cuda.is_available(): # For CUDA devices
             torch.cuda.empty_cache()
             torch.cuda.set_per_process_memory_fraction(0.7)
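The replacement pins the summarizer to sshleifer/distilbart-cnn-6-6, a small distilled BART checkpoint, on CPU in float32, and drops trust_remote_code=True, which this stock architecture does not need. A minimal standalone sketch of the same configuration (the sample text is hypothetical):

import torch
from transformers import pipeline

# Same settings as the __init__ above: a small distilled model,
# forced onto CPU with full-precision float32 weights.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-6-6",
    device="cpu",
    torch_dtype=torch.float32,
)

sample = (
    "The committee reviewed the submitted evidence, heard statements "
    "from both sides, and postponed its final decision to next month."
)
result = summarizer(sample, max_length=50, min_length=10,
                    do_sample=False, truncation=True)
print(result[0]["summary_text"])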
@@ -112,15 +112,8 @@
     def _process_arabic_text(self, text: str) -> str:
         """Process Arabic text with improved handling."""
         try:
-            # Configure arabic-reshaper for better text handling
-            configuration = {
-                'delete_harakat': False,
-                'support_ligatures': True,
-                'RIAL SIGN': True
-            }
-
             # Reshape Arabic text
-            reshaped_text = arabic_reshaper.reshape(text
+            reshaped_text = arabic_reshaper.reshape(text)
 
             # Apply bidirectional algorithm
             text = get_display(reshaped_text)
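The fix completes the previously truncated reshape call and drops the custom reshaper configuration in favor of the library defaults. For reference, the two-step rendering used here, reshaping letters into contextual forms and then applying the bidirectional algorithm, can be exercised on its own:

import arabic_reshaper
from bidi.algorithm import get_display

def render_arabic(text: str) -> str:
    # Join Arabic letters into their contextual (connected) forms.
    reshaped = arabic_reshaper.reshape(text)
    # Reorder characters for display in left-to-right environments.
    return get_display(reshaped)

# Arabic for "Summarizing the document" (the progress message used later).
print(render_arabic("جاري تلخيص المستند"))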
@@ -135,59 +128,84 @@
         return text  # Return original text if processing fails
 
     def summarize_document(self, text: str) -> str:
-        """Generate a summary of the document with improved memory management."""
+        """Generate a summary of the document with improved memory management and handling of Arabic text."""
         try:
-            # Split text into smaller chunks
+            # Split text into smaller chunks with consideration for Arabic text
             chunks = self.text_splitter.split_text(text)
+            if not chunks:
+                return self._create_extractive_summary(text)
+
             summaries = []
+            total_chunks = len(chunks)
 
-            # Process chunks in batches
-            batch_size =
-            for i in range(0,
-                # Clear
+            # Process chunks in batches with improved memory management
+            batch_size = 2  # Reduced batch size for better stability
+            for i in range(0, total_chunks, batch_size):
+                # Clear memory before processing new batch
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 elif torch.backends.mps.is_available():
-                    # Force garbage collection for MPS
                     import gc
                     gc.collect()
+                    torch.mps.empty_cache()
 
                 batch = chunks[i:i + batch_size]
                 for chunk in batch:
+                    if not chunk.strip():
+                        continue
+
                     try:
-                        #
+                        # Determine if chunk is primarily Arabic
+                        is_arabic = any(ord(c) >= 0x0600 and ord(c) <= 0x06FF for c in chunk)
+
+                        # Adjust summary parameters based on text type
+                        max_length = 150 if is_arabic else 130
+                        min_length = 40 if is_arabic else 30
+
+                        # Generate summary with optimized parameters
                         summary = self.summarizer(
                             chunk,
-                            max_length=
-                            min_length=
+                            max_length=max_length,
+                            min_length=min_length,
                             do_sample=False,
-                            num_beams=
-                            early_stopping=True
+                            num_beams=1,  # Single beam for efficiency
+                            early_stopping=True,
+                            truncation=True
                         )
-
+
+                        summary_text = summary[0]['summary_text'].strip()
+                        if summary_text:
+                            summaries.append(summary_text)
+
                     except Exception as e:
                         print(f"Warning: Error summarizing chunk: {str(e)}")
-                        #
-
+                        # Fallback to extractive summary for this chunk
+                        chunk_summary = self._create_extractive_summary(chunk, sentences_count=2)
+                        if chunk_summary:
+                            summaries.append(chunk_summary)
 
                 # Update progress
-
-
-
-
+                progress = min(0.3 + (i / total_chunks) * 0.4, 0.7)
+                self.update_progress("جاري تلخيص المستند...", progress)
+
+            if not summaries:
+                return self._create_extractive_summary(text)
 
-            # Combine summaries
+            # Combine summaries with improved formatting
             final_summary = " ".join(summaries)
 
-            # Clean
+            # Clean and process the final summary
             final_summary = self._clean_text(final_summary)
             final_summary = self._process_arabic_text(final_summary)
 
+            # Ensure reasonable length
+            if len(final_summary) > 2000:
+                final_summary = self._create_extractive_summary(final_summary, sentences_count=10)
+
             return final_summary
 
         except Exception as e:
             print(f"Error in summarization: {str(e)}")
-            # Fallback to a simple extractive summary
             return self._create_extractive_summary(text)
 
     def _create_extractive_summary(self, text: str, sentences_count: int = 5) -> str:
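Two details in the rewritten loop are worth spelling out. The is_arabic check fires when a chunk contains any code point in the basic Arabic block (U+0600 to U+06FF), and the expression min(0.3 + (i / total_chunks) * 0.4, 0.7) maps the batch position into a fixed 30%-70% band of overall progress. A standalone sketch of the detection heuristic, together with a stricter ratio-based variant (hypothetical, not part of this commit) that matches the "primarily Arabic" wording more closely:

ARABIC_START, ARABIC_END = 0x0600, 0x06FF

def contains_arabic(text: str) -> bool:
    # The commit's heuristic: any character in the basic Arabic block.
    return any(ARABIC_START <= ord(c) <= ARABIC_END for c in text)

def mostly_arabic(text: str, threshold: float = 0.5) -> bool:
    # Hypothetical stricter check: a majority of letters are Arabic.
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return False
    arabic = sum(1 for c in letters if ARABIC_START <= ord(c) <= ARABIC_END)
    return arabic / len(letters) >= threshold

print(contains_arabic("جلسة المحكمة"))              # True
print(mostly_arabic("Hearing scheduled for الغد"))  # False: mostly English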
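Finally, the per-batch cleanup (CUDA cache first, then a garbage-collection pass plus torch.mps.empty_cache() on Apple silicon) can be factored into a small device-aware helper. A sketch of that pattern, assuming PyTorch 2.x where torch.mps.empty_cache() is available:

import gc
import torch

def free_accelerator_memory() -> None:
    # Mirrors the per-batch cleanup in summarize_document: clear the CUDA
    # cache on NVIDIA GPUs, or collect garbage and empty the MPS cache on
    # Apple silicon; fall back to a plain gc pass on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        gc.collect()
        torch.mps.empty_cache()
    else:
        gc.collect()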