Spaces: Update pdf_processor.py

pdf_processor.py CHANGED (+57 -39)
@@ -7,10 +7,11 @@ from transformers import pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import io
 import os
+import re
+import torch
 from typing import List, Dict
 from agents import create_judge_agent, create_advocate_agent
 from crewai import Task, Crew
-import torch
 
 class PDFProcessor:
     def __init__(self):
@@ -23,18 +24,17 @@ class PDFProcessor:
         # Initialize models with better memory management
         self.summarizer = pipeline(
             "summarization",
-            model="
-
-            torch_dtype=torch.float32,
-            batch_size=1
-            trust_remote_code=True
+            model="sshleifer/distilbart-cnn-6-6",  # Using a smaller, faster model
+            device="cpu",  # Use CPU for better compatibility
+            torch_dtype=torch.float32,
+            batch_size=1
         )
         self.progress_callback = None
 
         # Configure torch for memory efficiency
-        if torch.backends.mps.is_available(): # For Mac M1/M2
-
-
+        #if torch.backends.mps.is_available(): # For Mac M1/M2
+        #    torch.backends.mps.set_per_process_memory_fraction(0.7) # Use only 70% of available memory
+        if torch.cuda.is_available(): # For CUDA devices
             torch.cuda.empty_cache()
             torch.cuda.set_per_process_memory_fraction(0.7)
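The replacement pins the summarizer to sshleifer/distilbart-cnn-6-6, a small distilled BART checkpoint, on CPU in float32, and drops trust_remote_code=True, which this stock architecture does not need. A minimal standalone sketch of the same configuration (the sample text is hypothetical):

import torch
from transformers import pipeline

# Same settings as the __init__ above: a small distilled model,
# forced onto CPU with full-precision float32 weights.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-6-6",
    device="cpu",
    torch_dtype=torch.float32,
)

sample = (
    "The committee reviewed the submitted evidence, heard statements "
    "from both sides, and postponed its final decision to next month."
)
result = summarizer(sample, max_length=50, min_length=10,
                    do_sample=False, truncation=True)
print(result[0]["summary_text"])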
@@ -112,15 +112,8 @@
     def _process_arabic_text(self, text: str) -> str:
         """Process Arabic text with improved handling."""
         try:
-            # Configure arabic-reshaper for better text handling
-            configuration = {
-                'delete_harakat': False,
-                'support_ligatures': True,
-                'RIAL SIGN': True
-            }
-
             # Reshape Arabic text
-            reshaped_text = arabic_reshaper.reshape(text
+            reshaped_text = arabic_reshaper.reshape(text)
 
             # Apply bidirectional algorithm
             text = get_display(reshaped_text)
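The fix completes the previously truncated reshape call and drops the custom reshaper configuration in favor of the library defaults. For reference, the two-step rendering used here, reshaping letters into contextual forms and then applying the bidirectional algorithm, can be exercised on its own:

import arabic_reshaper
from bidi.algorithm import get_display

def render_arabic(text: str) -> str:
    # Join Arabic letters into their contextual (connected) forms.
    reshaped = arabic_reshaper.reshape(text)
    # Reorder characters for display in left-to-right environments.
    return get_display(reshaped)

# Arabic for "Summarizing the document" (the progress message used later).
print(render_arabic("جاري تلخيص المستند"))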
@@ -135,59 +128,84 @@
         return text  # Return original text if processing fails
 
     def summarize_document(self, text: str) -> str:
-        """Generate a summary of the document with improved memory management."""
+        """Generate a summary of the document with improved memory management and handling of Arabic text."""
         try:
-            # Split text into smaller chunks
+            # Split text into smaller chunks with consideration for Arabic text
             chunks = self.text_splitter.split_text(text)
+            if not chunks:
+                return self._create_extractive_summary(text)
+
             summaries = []
+            total_chunks = len(chunks)
 
-            # Process chunks in batches
-            batch_size =
-            for i in range(0,
-                # Clear
+            # Process chunks in batches with improved memory management
+            batch_size = 2  # Reduced batch size for better stability
+            for i in range(0, total_chunks, batch_size):
+                # Clear memory before processing new batch
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 elif torch.backends.mps.is_available():
-                    # Force garbage collection for MPS
                     import gc
                     gc.collect()
+                    torch.mps.empty_cache()
 
                 batch = chunks[i:i + batch_size]
                 for chunk in batch:
+                    if not chunk.strip():
+                        continue
+
                     try:
-                        #
+                        # Determine if chunk is primarily Arabic
+                        is_arabic = any(ord(c) >= 0x0600 and ord(c) <= 0x06FF for c in chunk)
+
+                        # Adjust summary parameters based on text type
+                        max_length = 150 if is_arabic else 130
+                        min_length = 40 if is_arabic else 30
+
+                        # Generate summary with optimized parameters
                         summary = self.summarizer(
                             chunk,
-                            max_length=
-                            min_length=
+                            max_length=max_length,
+                            min_length=min_length,
                             do_sample=False,
-                            num_beams=
-                            early_stopping=True
+                            num_beams=1,  # Single beam for efficiency
+                            early_stopping=True,
+                            truncation=True
                         )
-
+
+                        summary_text = summary[0]['summary_text'].strip()
+                        if summary_text:
+                            summaries.append(summary_text)
+
                     except Exception as e:
                         print(f"Warning: Error summarizing chunk: {str(e)}")
-                        #
-
+                        # Fallback to extractive summary for this chunk
+                        chunk_summary = self._create_extractive_summary(chunk, sentences_count=2)
+                        if chunk_summary:
+                            summaries.append(chunk_summary)
 
                 # Update progress
-
-
-
-
+                progress = min(0.3 + (i / total_chunks) * 0.4, 0.7)
+                self.update_progress("جاري تلخيص المستند...", progress)
+
+            if not summaries:
+                return self._create_extractive_summary(text)
 
-            # Combine summaries
+            # Combine summaries with improved formatting
             final_summary = " ".join(summaries)
 
-            # Clean
+            # Clean and process the final summary
             final_summary = self._clean_text(final_summary)
             final_summary = self._process_arabic_text(final_summary)
 
+            # Ensure reasonable length
+            if len(final_summary) > 2000:
+                final_summary = self._create_extractive_summary(final_summary, sentences_count=10)
+
             return final_summary
 
         except Exception as e:
             print(f"Error in summarization: {str(e)}")
-            # Fallback to a simple extractive summary
             return self._create_extractive_summary(text)
 
     def _create_extractive_summary(self, text: str, sentences_count: int = 5) -> str:
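Two details in the rewritten loop are worth spelling out. The is_arabic check fires when a chunk contains any code point in the basic Arabic block (U+0600 to U+06FF), and the expression min(0.3 + (i / total_chunks) * 0.4, 0.7) maps the batch position into a fixed 30%-70% band of overall progress. A standalone sketch of the detection heuristic, together with a stricter ratio-based variant (hypothetical, not part of this commit) that matches the "primarily Arabic" wording more closely:

ARABIC_START, ARABIC_END = 0x0600, 0x06FF

def contains_arabic(text: str) -> bool:
    # The commit's heuristic: any character in the basic Arabic block.
    return any(ARABIC_START <= ord(c) <= ARABIC_END for c in text)

def mostly_arabic(text: str, threshold: float = 0.5) -> bool:
    # Hypothetical stricter check: a majority of letters are Arabic.
    letters = [c for c in text if c.isalpha()]
    if not letters:
        return False
    arabic = sum(1 for c in letters if ARABIC_START <= ord(c) <= ARABIC_END)
    return arabic / len(letters) >= threshold

print(contains_arabic("جلسة المحكمة"))              # True
print(mostly_arabic("Hearing scheduled for الغد"))  # False: mostly English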
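Finally, the per-batch cleanup (CUDA cache first, then a garbage-collection pass plus torch.mps.empty_cache() on Apple silicon) can be factored into a small device-aware helper. A sketch of that pattern, assuming PyTorch 2.x where torch.mps.empty_cache() is available:

import gc
import torch

def free_accelerator_memory() -> None:
    # Mirrors the per-batch cleanup in summarize_document: clear the CUDA
    # cache on NVIDIA GPUs, or collect garbage and empty the MPS cache on
    # Apple silicon; fall back to a plain gc pass on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        gc.collect()
        torch.mps.empty_cache()
    else:
        gc.collect()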