marcelo3macedo committed on
Commit
ca81ab8
·
1 Parent(s): 7de58f8

feat: adding regex and transformer

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ Pipfile
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
  import gradio as gr
2
+ from lib.masker.masker import run_mask
3
 
4
def mask(text, options):
    """Gradio callback: mask sensitive data in *text*.

    Args:
        text: Raw text typed into the first input box.
        options: Raw content of the second input box; forwarded unchanged
            to ``run_mask``, which decides which maskers to apply.

    Returns:
        The text returned by ``run_mask``.
    """
    return run_mask(text, options)


# Two plain text inputs (text to mask, masking options) and one text output.
demo = gr.Interface(fn=mask, inputs=["text", "text"], outputs="text")

if __name__ == "__main__":
    # Only start the server when executed as a script, so importing this
    # module (tests, tooling) does not launch a blocking web server.
    demo.launch(debug=True)
lib/masker/masker.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.masker.regex import REGEX_FUNCTIONS
2
+ from lib.masker.transformer import TRANSFORMER_FUNCTIONS
3
+
4
# Maskers applied when the caller does not select any.
DEFAULT_OPTIONS = [
    "name", "email", "phone", "credit_card", "local"
]


def run_mask(text, options):
    """Apply the selected maskers to *text* and return the masked text.

    Args:
        text: The text to mask.
        options: Which maskers to run. Either a list of option names or a
            string of names separated by commas and/or whitespace (the form
            a plain text input field delivers). Anything else, or an empty
            selection, falls back to ``DEFAULT_OPTIONS``. Names that match
            no registered masker are ignored.

    Returns:
        The text after every selected masker has been applied, in the order
        the options were given.
    """
    if isinstance(options, str):
        # A text box hands over one string; split it into option names so the
        # user's selection is honoured instead of silently using the defaults.
        options = options.replace(",", " ").split()

    if not isinstance(options, list) or not options:
        options = DEFAULT_OPTIONS

    for option in options:
        if option in TRANSFORMER_FUNCTIONS:
            text = TRANSFORMER_FUNCTIONS[option](text)
        if option in REGEX_FUNCTIONS:
            text = REGEX_FUNCTIONS[option](text)

    return text
lib/masker/ner.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
2
+
3
# Maps the model's token labels to the mask categories used by this package.
# Labels that are not listed here are never masked.
entity_types = {
    'B-PESSOA': 'name',
    'I-PESSOA': 'name',
    'B-LOCAL': 'local',
    'I-LOCAL': 'local',
}

_NER_MODEL_NAME = "liaad/NER_harem_bert-base-portuguese-cased"

# Lazily created pipeline, shared by all calls so the model and tokenizer are
# loaded once per process instead of once per request.
_ner_pipeline = None


def ner_exec(text):
    """Run the NER pipeline over *text* and return its raw predictions.

    The pipeline is built on first use and reused afterwards.
    """
    global _ner_pipeline
    if _ner_pipeline is None:
        model = AutoModelForTokenClassification.from_pretrained(_NER_MODEL_NAME)
        tokenizer = AutoTokenizer.from_pretrained(_NER_MODEL_NAME)
        _ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    return _ner_pipeline(text)


def ner_filter(ner_results, text, mask_type):
    """Replace every entity of category *mask_type* in *text* with asterisks.

    Args:
        ner_results: Iterable of prediction dicts. Each needs an ``'entity'``
            label; ``'start'``/``'end'`` character offsets are used when
            present, otherwise ``'word'`` is required.
        text: The text the predictions refer to.
        mask_type: Category to mask (a value of ``entity_types``).

    Returns:
        The text with matching entities replaced by ``*``; the length of the
        text is preserved.
    """
    for entity in ner_results:
        type_name = entity_types.get(entity.get('entity'))
        # Skip labels this package does not know instead of raising KeyError.
        if type_name is None or type_name != mask_type:
            continue

        start = entity.get('start')
        end = entity.get('end')
        if start is not None and end is not None and 0 <= start < end <= len(text):
            # Mask the exact span. The replacement has the same length, so the
            # offsets of the remaining entities stay valid. This also covers
            # sub-word tokens whose 'word' does not occur verbatim in the text.
            text = text[:start] + "*" * (end - start) + text[end:]
        else:
            # No usable offsets: fall back to replacing the token text.
            entity_word = entity['word']
            text = text.replace(entity_word, "*" * len(entity_word))

    return text
lib/masker/regex.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def mask_credit_card(text):
    """Mask the last four digits of 16-digit card numbers found in *text*.

    Matches 16 digits written contiguously or in groups of four separated by
    a single space or dash; separators are kept in the output.

    NOTE(review): the first 12 digits stay visible and only the last four are
    hidden. Common practice is the opposite; confirm this is intended.
    """
    return re.sub(r'((?:\d{4}[ -]?){3})\d{4}', r'\1****', text)
5
+
6
def mask_phone(text):
    """Mask phone numbers prefixed with ``+55`` found in *text*.

    Accepts a two-digit area code, optionally in parentheses, followed by a
    4- or 5-digit block and a 4-digit block, with optional space or dash
    separators. Each match is replaced by ``(<area code>) XXX-XXXX``.
    """
    return re.sub(
        r'\+55\s?\(?(\d{2})\)?\s?\d{4,5}[-\s]?\d{4}',
        r'(\1) XXX-XXXX',
        text,
    )
8
+
9
def mask_email(text):
    """Mask the local part of e-mail addresses found in *text*.

    The first character of the local part and the domain are kept; the rest
    of the local part is replaced by ``*****``.
    """
    # The remainder of the local part uses the same character class as its
    # first character (and may be empty). A narrower class would make the
    # match start mid-address and leave the leading characters unmasked.
    return re.sub(
        r'([a-zA-Z0-9._%+-])[a-zA-Z0-9._%+-]*@([a-zA-Z0-9.-]+)',
        r'\1*****@\2',
        text,
    )
11
+
12
# Registry of regex-based maskers, keyed by the option name that selects them.
REGEX_FUNCTIONS = {
    "credit_card": mask_credit_card,
    "phone": mask_phone,
    "email": mask_email,
}
lib/masker/transformer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lib.masker.ner import ner_exec, ner_filter
2
+
3
def mask_name(text):
    """Return *text* with entities of category "name" masked.

    Runs the NER model over the text and hands its predictions to
    ``ner_filter``.
    """
    predictions = ner_exec(text)
    return ner_filter(predictions, text, "name")
8
+
9
def mask_local(text):
    """Return *text* with entities of category "local" masked.

    Runs the NER model over the text and hands its predictions to
    ``ner_filter``.
    """
    predictions = ner_exec(text)
    return ner_filter(predictions, text, "local")
14
+
15
# Registry of model-based maskers, keyed by the option name that selects them.
TRANSFORMER_FUNCTIONS = {
    "name": mask_name,
    "local": mask_local,
}
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch