Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ import fitz # PyMuPDF
|
|
6 |
from docx import Document
|
7 |
import re
|
8 |
import nltk
|
|
|
9 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
|
10 |
nltk.download('punkt')
|
11 |
|
@@ -182,26 +183,30 @@ if uploaded_file is not None:
|
|
182 |
sentences = sentence_tokenize(text)
|
183 |
for sent in sentences:
|
184 |
x = mask_generation(sent)
|
|
|
|
|
|
|
|
|
185 |
# sent_out = re.sub("\n","|",sent)
|
186 |
# sent_out = re.sub(".","|",sent_out)
|
187 |
# sent_out = re.sub(",","|",sent_out)
|
188 |
# sent_out = re.sub(" ","|",sent_out)
|
189 |
# sent_n_q_c=sent_out.split("|")
|
190 |
-
sent_n_q_c=[]
|
191 |
-
sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
|
192 |
-
for i in sent_n:
|
193 |
-
|
194 |
-
|
195 |
-
x_q = x.lower().replace('.',' ').split(' ')
|
196 |
-
e=[]
|
197 |
-
for i in x_q:
|
198 |
-
|
199 |
# sent_out = re.sub("\n","|",x)
|
200 |
# sent_out = re.sub(".","|",sent_out)
|
201 |
# sent_out = re.sub(",","|",sent_out)
|
202 |
# sent_out = re.sub(" ","|",sent_out)
|
203 |
# e = sent_out.split("|")
|
204 |
-
t5_words=list(set(sent_n_q_c).difference(set(e)))
|
205 |
entities,words_out = extract_entities(sent)
|
206 |
words_out+=t5_words
|
207 |
# print("\nwords_out:",words_out)
|
|
|
6 |
from docx import Document
|
7 |
import re
|
8 |
import nltk
|
9 |
+
from nltk import word_tokenize
|
10 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, RecognizerResult, Pattern
|
11 |
nltk.download('punkt')
|
12 |
|
|
|
183 |
sentences = sentence_tokenize(text)
|
184 |
for sent in sentences:
|
185 |
x = mask_generation(sent)
|
186 |
+
sent_words = word_tokenize(sent.lower())
|
187 |
+
t5_words = word_tokenize(x.lower())
|
188 |
+
|
189 |
+
t5_words=list(set(sent_words).difference(set(t5_words)))
|
190 |
# sent_out = re.sub("\n","|",sent)
|
191 |
# sent_out = re.sub(".","|",sent_out)
|
192 |
# sent_out = re.sub(",","|",sent_out)
|
193 |
# sent_out = re.sub(" ","|",sent_out)
|
194 |
# sent_n_q_c=sent_out.split("|")
|
195 |
+
# sent_n_q_c=[]
|
196 |
+
# sent_n = list(set(sent.lower().replace('.',' ').split("\n")))
|
197 |
+
# for i in sent_n:
|
198 |
+
# for j in i.split(" "):
|
199 |
+
# sent_n_q_c+=j.split(',')
|
200 |
+
# x_q = x.lower().replace('.',' ').split(' ')
|
201 |
+
# e=[]
|
202 |
+
# for i in x_q:
|
203 |
+
# e+=i.split(',')
|
204 |
# sent_out = re.sub("\n","|",x)
|
205 |
# sent_out = re.sub(".","|",sent_out)
|
206 |
# sent_out = re.sub(",","|",sent_out)
|
207 |
# sent_out = re.sub(" ","|",sent_out)
|
208 |
# e = sent_out.split("|")
|
209 |
+
# t5_words=list(set(sent_n_q_c).difference(set(e)))
|
210 |
entities,words_out = extract_entities(sent)
|
211 |
words_out+=t5_words
|
212 |
# print("\nwords_out:",words_out)
|