edithram23 committed (verified)
Commit: 06135da
Parent(s): eb1cea3

Update app.py

Files changed (1):
  1. app.py +15 -10
app.py CHANGED

@@ -6,6 +6,7 @@ import fitz # PyMuPDF
 from docx import Document
 import re
 import nltk
+from nltk import word_tokenize
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
 nltk.download('punkt')
 
@@ -182,26 +183,30 @@ if uploaded_file is not None:
     sentences = sentence_tokenize(text)
     for sent in sentences:
         x = mask_generation(sent)
+        sent_words = word_tokenize(sent.lower())
+        t5_words = word_tokenize(x.lower())
+
+        t5_words=list(set(sent_words).difference(set(t5_words)))
         # sent_out = re.sub("\n","|",sent)
         # sent_out = re.sub(".","|",sent_out)
         # sent_out = re.sub(",","|",sent_out)
         # sent_out = re.sub(" ","|",sent_out)
         # sent_n_q_c=sent_out.split("|")
-        sent_n_q_c=[]
-        sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
-        for i in sent_n:
-            for j in i.split(" "):
-                sent_n_q_c+=j.split(',')
-        x_q = x.lower().replace('.',' ').split(' ')
-        e=[]
-        for i in x_q:
-            e+=i.split(',')
+        # sent_n_q_c=[]
+        # sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
+        # for i in sent_n:
+        #     for j in i.split(" "):
+        #         sent_n_q_c+=j.split(',')
+        # x_q = x.lower().replace('.',' ').split(' ')
+        # e=[]
+        # for i in x_q:
+        #     e+=i.split(',')
         # sent_out = re.sub("\n","|",x)
         # sent_out = re.sub(".","|",sent_out)
         # sent_out = re.sub(",","|",sent_out)
         # sent_out = re.sub(" ","|",sent_out)
         # e = sent_out.split("|")
-        t5_words=list(set(sent_n_q_c).difference(set(e)))
+        # t5_words=list(set(sent_n_q_c).difference(set(e)))
         entities,words_out = extract_entities(sent)
         words_out+=t5_words
         # print("\nwords_out:",words_out)