Redaction_PDF_advanced

Sleeping

App Files Files Community

edithram23 committed on Jul 7, 2024

Commit

748048a

verified ·

1 Parent(s): 2c79e12

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -21

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
-from transformers import AutoTokenizer
-from transformers import AutoModelForSeq2SeqLM
 import streamlit as st
 import fitz  # PyMuPDF
 from docx import Document
@@ -16,6 +15,7 @@ def sentence_tokenize(text):
 model_dir_large = 'edithram23/Redaction_Personal_info_v1'
 tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
 model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
 # model_dir_small = 'edithram23/Redaction'
 # tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
@@ -42,6 +42,50 @@ address_recognizer = PatternRecognizer(supported_entity="ADDRESS", patterns=[add
 analyzer.registry.add_recognizer(address_recognizer)
 analyzer.get_recognizers
 # Define a function to extract entities
 def extract_entities(text):
     entities = {
         "NAME": [],
@@ -132,25 +176,22 @@ if uploaded_file is not None:
     if pdf_document:
         redacted_text = []
         for pg in pdf_document:
-              text = pg.get_text('text')
-              sentences = sentence_tokenize(text)
-              for sent in sentences:
-                entities,words_out = extract_entities(sent)
-                avai_red = pg.search_for(sent)
-                new=[]
-                for w in words_out:
-                  new+=w.split('\n')
-                words_out = [i for i in new if len(i)>2]
-                print(words_out)
-                for i in avai_red:
-                        b = pg.get_text("text", clip=i)
-                        # result = [item for item in output if item in b]  # Get elements of 'a' that are in 'b'
-                        for j in words_out:
-                            new_n = pg.search_for(j, clip=i)
-                            for all in new_n:
-                              pg.add_redact_annot(all,fill=(0, 0, 0))
-              pg.apply_redactions()
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)

+from transformers import pipeline
 import streamlit as st
 import fitz  # PyMuPDF
 from docx import Document
 model_dir_large = 'edithram23/Redaction_Personal_info_v1'
 tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
 model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
+pipe1 = pipeline("token-classification", model="edithram23/new-bert-v2")
 # model_dir_small = 'edithram23/Redaction'
 # tokenizer_small = AutoTokenizer.from_pretrained(model_dir_small)
 analyzer.registry.add_recognizer(address_recognizer)
 analyzer.get_recognizers
 # Define a function to extract entities
def combine_words(entities):
    """Merge neighbouring token-level entities into multi-token entities.

    ``entities`` is an iterable of dicts that carry at least a ``'word'`` key
    and normally ``'start'``/``'end'`` character offsets plus an ``'entity'``
    (or ``'entity_group'``) label.  The input dicts are never mutated; copies
    are returned.

    Merge rules, applied to each token against the entity being built:

    * offsets touch (``end == start``): joined without a space;
    * offsets are one character apart (``end + 1 == start``): joined with a
      single space;
    * anything else, including a missing/``None`` offset, starts a new entity.

    A whole token is only merged when its label matches the current entity's
    label (a leading ``B-``/``I-`` is ignored for the comparison).  Without
    that check the first token's label silently wins for the whole run, so a
    flagged token following an unflagged one is lost and vice versa.
    Word-piece continuations (``'##...'``) are always merged into the word
    they belong to, whatever their label, so a word is never cut in half.

    Returns a list of merged entity dicts; every ``'word'`` has its ``'##'``
    markers removed and ``'end'`` is the end offset of the last merged token.
    """

    def _label(entity):
        # Compare labels without the BIO prefix so 'B-X' and 'I-X' are the same.
        label = entity.get('entity', entity.get('entity_group'))
        if isinstance(label, str) and label[:2] in ('B-', 'I-'):
            return label[2:]
        return label

    def _separator(previous, following):
        # '' when the tokens touch, ' ' when one character apart, else None.
        prev_end = previous.get('end')
        next_start = following.get('start')
        if prev_end is None or next_start is None:
            return None
        if prev_end == next_start:
            return ''
        if prev_end + 1 == next_start:
            return ' '
        return None

    combined_entities = []
    current_entity = None
    for entity in entities:
        raw_word = entity['word']
        clean_word = raw_word.replace('##', '')

        separator = None
        if current_entity is not None:
            separator = _separator(current_entity, entity)

        mergeable = separator is not None and (
            raw_word.startswith('##')
            or _label(current_entity) == _label(entity)
        )
        if mergeable:
            current_entity['word'] += separator + clean_word
            current_entity['end'] = entity['end']
            continue

        # Close the entity in progress (if any) and start a new one.
        if current_entity is not None:
            combined_entities.append(current_entity)
        current_entity = entity.copy()
        current_entity['word'] = clean_word

    if current_entity is not None:
        combined_entities.append(current_entity)
    return combined_entities
def words_red_bert(text):
    """Return the words in ``text`` that the token classifier flags.

    The text is split with ``sentence_tokenize``; each sentence is run through
    the ``pipe1`` token-classification pipeline and the resulting tokens are
    merged with ``combine_words``.  A merged word is kept when its label is
    not ``'none'``, it is longer than one character, and it is not the bare
    separator ``', '``.
    """
    flagged = []
    for sentence in sentence_tokenize(text):
        for candidate in combine_words(pipe1(sentence)):
            if candidate['entity'] == 'none':
                continue
            word = candidate['word']
            if len(word) > 1 and word != ', ':
                flagged.append(word)
    return flagged
 def extract_entities(text):
     entities = {
         "NAME": [],
     if pdf_document:
         redacted_text = []
         for pg in pdf_document:
             # Redact every page sentence by sentence: collect the strings
             # flagged by extract_entities() and by the BERT classifier
             # (words_red_bert), then pass each one to redact_text().
             text = pg.get_text('text')
             sentences = sentence_tokenize(text)
             for sent in sentences:
                 entities, words_out = extract_entities(sent)
                 bert_words = words_red_bert(sent)
                 # Merge both detectors' hits BEFORE the newline split and the
                 # length filter.  Previously the BERT words were appended to
                 # words_out only after `new` had been built, and words_out was
                 # then rebuilt from `new`, so every BERT hit was thrown away.
                 new = []
                 for w in list(words_out) + list(bert_words):
                     # A hit may span a line break in the extracted page text;
                     # split it so each piece is handled on its own.
                     new += w.split('\n')
                 # Drop fragments of one or two characters.
                 words_out = [i for i in new if len(i) > 2]
                 # Longest first -- presumably so a phrase is redacted before
                 # any shorter string contained in it (confirm in redact_text).
                 words_out = sorted(words_out, key=len, reverse=True)
                 # NOTE(review): this prints the detected sensitive strings to
                 # stdout; consider removing it outside of debugging.
                 print(words_out)
                 for i in words_out:
                     redact_text(pg, i)
         output_pdf = "output_redacted.pdf"
         pdf_document.save(output_pdf)