Hassankhwileh committed on
Commit
6e30b5a
·
verified ·
1 Parent(s): 638e211

Update pdf_processor.py

Browse files
Files changed (1) hide show
  1. pdf_processor.py +57 -39
pdf_processor.py CHANGED
@@ -7,10 +7,11 @@ from transformers import pipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  import io
9
  import os
 
 
10
  from typing import List, Dict
11
  from agents import create_judge_agent, create_advocate_agent
12
  from crewai import Task, Crew
13
- import torch
14
 
15
  class PDFProcessor:
16
  def __init__(self):
@@ -23,18 +24,17 @@ class PDFProcessor:
23
  # Initialize models with better memory management
24
  self.summarizer = pipeline(
25
  "summarization",
26
- model="facebook/bart-large-cnn",
27
- device_map="auto", # Automatically choose best device
28
- torch_dtype=torch.float32, # Use float32 for better memory efficiency
29
- batch_size=1 , # Process one chunk at a time
30
- trust_remote_code=True
31
  )
32
  self.progress_callback = None
33
 
34
  # Configure torch for memory efficiency
35
- if torch.backends.mps.is_available(): # For Mac M1/M2
36
- torch.backends.mps.set_per_process_memory_fraction(0.7) # Use only 70% of available memory
37
- elif torch.cuda.is_available(): # For CUDA devices
38
  torch.cuda.empty_cache()
39
  torch.cuda.set_per_process_memory_fraction(0.7)
40
 
@@ -112,15 +112,8 @@ class PDFProcessor:
112
  def _process_arabic_text(self, text: str) -> str:
113
  """Process Arabic text with improved handling."""
114
  try:
115
- # Configure arabic-reshaper for better text handling
116
- configuration = {
117
- 'delete_harakat': False,
118
- 'support_ligatures': True,
119
- 'RIAL SIGN': True
120
- }
121
-
122
  # Reshape Arabic text
123
- reshaped_text = arabic_reshaper.reshape(text, configuration=configuration)
124
 
125
  # Apply bidirectional algorithm
126
  text = get_display(reshaped_text)
@@ -135,59 +128,84 @@ class PDFProcessor:
135
  return text # Return original text if processing fails
136
 
137
  def summarize_document(self, text: str) -> str:
138
- """Generate a summary of the document with improved memory management."""
139
  try:
140
- # Split text into smaller chunks
141
  chunks = self.text_splitter.split_text(text)
 
 
 
142
  summaries = []
 
143
 
144
- # Process chunks in batches to manage memory
145
- batch_size = 3 # Process 3 chunks at a time
146
- for i in range(0, len(chunks), batch_size):
147
- # Clear GPU/MPS memory before processing new batch
148
  if torch.cuda.is_available():
149
  torch.cuda.empty_cache()
150
  elif torch.backends.mps.is_available():
151
- # Force garbage collection for MPS
152
  import gc
153
  gc.collect()
 
154
 
155
  batch = chunks[i:i + batch_size]
156
  for chunk in batch:
 
 
 
157
  try:
158
- # Generate summary with controlled length and parameters
 
 
 
 
 
 
 
159
  summary = self.summarizer(
160
  chunk,
161
- max_length=130,
162
- min_length=30,
163
  do_sample=False,
164
- num_beams=2, # Reduced beam search for memory efficiency
165
- early_stopping=True
 
166
  )
167
- summaries.append(summary[0]['summary_text'])
 
 
 
 
168
  except Exception as e:
169
  print(f"Warning: Error summarizing chunk: {str(e)}")
170
- # If summarization fails, include a portion of the original text
171
- summaries.append(chunk[:200] + "...")
 
 
172
 
173
  # Update progress
174
- self.update_progress(
175
- "جاري تلخيص المستند...",
176
- min(0.3 + (i / len(chunks)) * 0.4, 0.7)
177
- )
 
178
 
179
- # Combine summaries intelligently
180
  final_summary = " ".join(summaries)
181
 
182
- # Clean up the final summary
183
  final_summary = self._clean_text(final_summary)
184
  final_summary = self._process_arabic_text(final_summary)
185
 
 
 
 
 
186
  return final_summary
187
 
188
  except Exception as e:
189
  print(f"Error in summarization: {str(e)}")
190
- # Fallback to a simple extractive summary
191
  return self._create_extractive_summary(text)
192
 
193
  def _create_extractive_summary(self, text: str, sentences_count: int = 5) -> str:
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  import io
9
  import os
10
+ import re
11
+ import torch
12
  from typing import List, Dict
13
  from agents import create_judge_agent, create_advocate_agent
14
  from crewai import Task, Crew
 
15
 
16
  class PDFProcessor:
17
  def __init__(self):
 
24
  # Initialize models with better memory management
25
  self.summarizer = pipeline(
26
  "summarization",
27
+ model="sshleifer/distilbart-cnn-6-6", # Using a smaller, faster model
28
+ device="cpu", # Use CPU for better compatibility
29
+ torch_dtype=torch.float32,
30
+ batch_size=1
 
31
  )
32
  self.progress_callback = None
33
 
34
  # Configure torch for memory efficiency
35
+ #if torch.backends.mps.is_available(): # For Mac M1/M2
36
+ # torch.backends.mps.set_per_process_memory_fraction(0.7) # Use only 70% of available memory
37
+ if torch.cuda.is_available(): # For CUDA devices
38
  torch.cuda.empty_cache()
39
  torch.cuda.set_per_process_memory_fraction(0.7)
40
 
 
112
  def _process_arabic_text(self, text: str) -> str:
113
  """Process Arabic text with improved handling."""
114
  try:
 
 
 
 
 
 
 
115
  # Reshape Arabic text
116
+ reshaped_text = arabic_reshaper.reshape(text)
117
 
118
  # Apply bidirectional algorithm
119
  text = get_display(reshaped_text)
 
128
  return text # Return original text if processing fails
129
 
130
  def summarize_document(self, text: str) -> str:
131
+ """Generate a summary of the document with improved memory management and handling of Arabic text."""
132
  try:
133
+ # Split text into smaller chunks with consideration for Arabic text
134
  chunks = self.text_splitter.split_text(text)
135
+ if not chunks:
136
+ return self._create_extractive_summary(text)
137
+
138
  summaries = []
139
+ total_chunks = len(chunks)
140
 
141
+ # Process chunks in batches with improved memory management
142
+ batch_size = 2 # Reduced batch size for better stability
143
+ for i in range(0, total_chunks, batch_size):
144
+ # Clear memory before processing new batch
145
  if torch.cuda.is_available():
146
  torch.cuda.empty_cache()
147
  elif torch.backends.mps.is_available():
 
148
  import gc
149
  gc.collect()
150
+ torch.mps.empty_cache()
151
 
152
  batch = chunks[i:i + batch_size]
153
  for chunk in batch:
154
+ if not chunk.strip():
155
+ continue
156
+
157
  try:
158
+ # Determine whether the chunk contains any Arabic character (U+0600–U+06FF)
159
+ is_arabic = any(ord(c) >= 0x0600 and ord(c) <= 0x06FF for c in chunk)
160
+
161
+ # Adjust summary parameters based on text type
162
+ max_length = 150 if is_arabic else 130
163
+ min_length = 40 if is_arabic else 30
164
+
165
+ # Generate summary with optimized parameters
166
  summary = self.summarizer(
167
  chunk,
168
+ max_length=max_length,
169
+ min_length=min_length,
170
  do_sample=False,
171
+ num_beams=1, # Single beam for efficiency
172
+ early_stopping=True,
173
+ truncation=True
174
  )
175
+
176
+ summary_text = summary[0]['summary_text'].strip()
177
+ if summary_text:
178
+ summaries.append(summary_text)
179
+
180
  except Exception as e:
181
  print(f"Warning: Error summarizing chunk: {str(e)}")
182
+ # Fallback to extractive summary for this chunk
183
+ chunk_summary = self._create_extractive_summary(chunk, sentences_count=2)
184
+ if chunk_summary:
185
+ summaries.append(chunk_summary)
186
 
187
  # Update progress
188
+ progress = min(0.3 + (i / total_chunks) * 0.4, 0.7)
189
+ self.update_progress("جاري تلخيص المستند...", progress)
190
+
191
+ if not summaries:
192
+ return self._create_extractive_summary(text)
193
 
194
+ # Combine summaries with improved formatting
195
  final_summary = " ".join(summaries)
196
 
197
+ # Clean and process the final summary
198
  final_summary = self._clean_text(final_summary)
199
  final_summary = self._process_arabic_text(final_summary)
200
 
201
+ # Ensure reasonable length
202
+ if len(final_summary) > 2000:
203
+ final_summary = self._create_extractive_summary(final_summary, sentences_count=10)
204
+
205
  return final_summary
206
 
207
  except Exception as e:
208
  print(f"Error in summarization: {str(e)}")
 
209
  return self._create_extractive_summary(text)
210
 
211
  def _create_extractive_summary(self, text: str, sentences_count: int = 5) -> str: