Spaces:
Sleeping
Sleeping
Commit
·
ca81ab8
1
Parent(s):
7de58f8
feat: adding regex and transformer
Browse files
- .gitignore +1 -0
- app.py +5 -4
- lib/masker/masker.py +18 -0
- lib/masker/ner.py +25 -0
- lib/masker/regex.py +16 -0
- lib/masker/transformer.py +18 -0
- requirements.txt +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Pipfile
|
app.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
|
3 |
-
def
|
4 |
-
return
|
5 |
|
6 |
-
demo = gr.Interface(fn=
|
7 |
-
demo.launch()
|
|
|
import gradio as gr
from lib.masker.masker import run_mask

def mask(text, options):
    """Gradio callback: mask PII in *text* according to *options*.

    Gradio "text" inputs arrive as plain strings, but run_mask expects a
    list of option names and silently falls back to its defaults for any
    non-list — so the user's choice was being ignored.  Parse the field
    as a comma-separated list; an empty field keeps the default maskers.
    """
    if options:
        selected = [opt.strip() for opt in options.split(",") if opt.strip()]
    else:
        selected = []
    return run_mask(text, selected)

demo = gr.Interface(fn=mask, inputs=["text", "text"], outputs="text")
demo.launch(debug=True)
lib/masker/masker.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from lib.masker.regex import REGEX_FUNCTIONS
from lib.masker.transformer import TRANSFORMER_FUNCTIONS

# Maskers applied when the caller supplies no usable option list.
DEFAULT_OPTIONS = [
    "name", "email", "phone", "credit_card", "local"
]

def run_mask(text, options):
    """Apply the requested maskers to *text* and return the masked string.

    *options* should be a non-empty list of option names; anything else
    falls back to DEFAULT_OPTIONS.  An option may be backed by a
    transformer (NER) masker, a regex masker, or both — each matching
    masker is applied in turn.
    """
    chosen = options if isinstance(options, list) and options else DEFAULT_OPTIONS

    for name in chosen:
        ner_masker = TRANSFORMER_FUNCTIONS.get(name)
        if ner_masker is not None:
            text = ner_masker(text)

        regex_masker = REGEX_FUNCTIONS.get(name)
        if regex_masker is not None:
            text = regex_masker(text)

    return text
lib/masker/ner.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# HAREM (Portuguese NER) tags -> masker option names.  Tags outside this
# map (the model also emits organization/time/value classes) are ignored.
entity_types = {
    'B-PESSOA': 'name',
    'I-PESSOA': 'name',
    'B-LOCAL': 'local',
    'I-LOCAL': 'local',
}

# Lazily-built singleton pipeline: loading the model and tokenizer from
# disk is expensive, so do it once instead of on every ner_exec() call.
_ner_pipeline = None

def ner_exec(text):
    """Run Portuguese NER over *text* and return the raw pipeline results."""
    global _ner_pipeline
    if _ner_pipeline is None:
        model_name = "liaad/NER_harem_bert-base-portuguese-cased"
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    return _ner_pipeline(text)

def ner_filter(ner_results, text, mask_type):
    """Replace every entity of *mask_type* in *text* with asterisks.

    Entities whose tag is not in entity_types are skipped rather than
    raising KeyError (previously any unmapped HAREM tag crashed the call).
    NOTE(review): BERT sub-word tokens arrive prefixed with '##' and will
    not match the original text verbatim — TODO: merge sub-word pieces.
    """
    for entity in ner_results:
        type_name = entity_types.get(entity['entity'])
        if type_name != mask_type:
            continue

        entity_word = entity['word']
        text = text.replace(entity_word, "*" * len(entity_word))

    return text
lib/masker/regex.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re

def mask_credit_card(text):
    """Mask a 16-digit card number, keeping only the last four digits.

    The previous pattern did the opposite — it left the first 12 digits
    visible and hid the last 4, which defeats PII masking (convention is
    to expose only the last 4).
    """
    return re.sub(r'\d{12}(\d{4})', r'************\1', text)

def mask_phone(text):
    """Mask a Brazilian (+55) phone number, keeping only the area code."""
    return re.sub(r'\+55\s?(\d{2})\s?(\d{5})[-\s]?(\d{4})', r'(\1) XXX-XXXX', text)

def mask_email(text):
    """Mask an e-mail's local part, keeping its first character and domain.

    The local-part tail uses '*' (not '+') so single-character local
    parts like "a@b.com" also match; existing matches are unchanged.
    """
    return re.sub(r'([a-zA-Z0-9._%+-])([a-zA-Z0-9._%+-]*)@([a-zA-Z0-9.-]+)', r'\1*****@\3', text)

# Dispatch table used by run_mask: option name -> masking function.
REGEX_FUNCTIONS = {
    'credit_card': mask_credit_card,
    'phone': mask_phone,
    'email': mask_email,
}
lib/masker/transformer.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from lib.masker.ner import ner_exec, ner_filter

def _mask_entities(text, mask_type):
    """Run NER over *text* once and mask every entity of *mask_type*."""
    ner_results = ner_exec(text)
    return ner_filter(ner_results, text, mask_type)

def mask_name(text):
    """Mask person names (PESSOA entities) in *text* with asterisks."""
    return _mask_entities(text, "name")

def mask_local(text):
    """Mask location names (LOCAL entities) in *text* with asterisks."""
    return _mask_entities(text, "local")

# Dispatch table used by run_mask: option name -> NER-based masker.
TRANSFORMER_FUNCTIONS = {
    'name': mask_name,
    'local': mask_local,
}
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
transformers
|
3 |
+
torch
|