Spaces:
Sleeping
Sleeping
Merge branch 'refactor'
Browse files- __pycache__/spacy_analyzer.cpython-310.pyc +0 -0
- anonymize.py +0 -44
- anonymizer.py +37 -0
- app.py +10 -29
- match_replace.py +156 -0
- spacy_recognizer.py → spacy_analyzer.py +44 -12
__pycache__/spacy_analyzer.cpython-310.pyc
ADDED
Binary file (4.17 kB). View file
|
|
anonymize.py
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
from spacy_recognizer import CustomSpacyRecognizer
|
2 |
-
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
3 |
-
from presidio_anonymizer import AnonymizerEngine
|
4 |
-
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
5 |
-
from presidio_anonymizer.entities import OperatorConfig
|
6 |
-
import pandas as pd
|
7 |
-
from json import JSONEncoder
|
8 |
-
import json
|
9 |
-
import warnings
|
10 |
-
import os
|
11 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
12 |
-
warnings.filterwarnings('ignore')
|
13 |
-
|
14 |
-
def prepare_analyzer(configuration):
|
15 |
-
"""Return AnalyzerEngine."""
|
16 |
-
|
17 |
-
spacy_recognizer = CustomSpacyRecognizer()
|
18 |
-
|
19 |
-
print('Hallej')
|
20 |
-
|
21 |
-
# Create NLP engine based on configuration
|
22 |
-
provider = NlpEngineProvider(nlp_configuration=configuration)
|
23 |
-
nlp_engine = provider.create_engine()
|
24 |
-
|
25 |
-
# add rule-based recognizers
|
26 |
-
registry = RecognizerRegistry()
|
27 |
-
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
28 |
-
registry.add_recognizer(spacy_recognizer)
|
29 |
-
|
30 |
-
# remove the nlp engine we passed, to use custom label mappings
|
31 |
-
registry.remove_recognizer("SpacyRecognizer")
|
32 |
-
|
33 |
-
analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
|
34 |
-
registry=registry,
|
35 |
-
supported_languages=["en"])
|
36 |
-
|
37 |
-
return analyzer
|
38 |
-
|
39 |
-
def generate_surrogate(name):
|
40 |
-
"""Return appropriate surrogate name from text string"""
|
41 |
-
if "John" in name:
|
42 |
-
return "Jill"
|
43 |
-
else:
|
44 |
-
return "SURROGATE_NAME"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
anonymizer.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from presidio_anonymizer import AnonymizerEngine
|
2 |
+
from presidio_anonymizer.entities import OperatorConfig
|
3 |
+
from presidio_analyzer import RecognizerResult
|
4 |
+
|
5 |
+
def retrieve_name_records():
    """Load the table of names with gender and country code fields.

    Placeholder: nothing is read yet, so the call simply yields ``None``.
    """
    return None
|
8 |
+
|
9 |
+
def generate_surrogate(name):
    """Choose a surrogate for a detected name.

    Text containing "John" is mapped to "Jill"; anything else is mapped to
    the generic "SURROGATE_NAME" placeholder.
    """
    return "Jill" if "John" in name else "SURROGATE_NAME"
|
15 |
+
|
16 |
+
def anonymize(
    anonymizer: AnonymizerEngine,
    text: str,
    analyze_results: list[RecognizerResult]
):
    """Replace the detected entities in ``text`` with surrogate values.

    :param anonymizer: engine whose ``anonymize`` method performs the
        replacement.
    :param text: the input text; when empty, nothing is done.
    :param analyze_results: recognizer results describing what to replace.
    :return: the anonymized text, or ``None`` when ``text`` is empty.
    """
    if text:
        # Per-entity replacement rules: student names go through the
        # surrogate generator, the other entity types get fixed values.
        replacement_rules = {
            "STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate}),
            "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[email protected]"}),
            "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "888-888-8888"}),
            "URL": OperatorConfig("replace", {"new_value": "aol.com"}),
        }
        outcome = anonymizer.anonymize(
            text,
            analyze_results,
            operators=replacement_rules,
        )
        return outcome.text

    return None
|
app.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
|
2 |
"""Streamlit app for Student Name Detection models."""
|
3 |
|
4 |
-
from
|
|
|
5 |
from presidio_anonymizer import AnonymizerEngine
|
6 |
-
from presidio_anonymizer.entities import OperatorConfig
|
7 |
import pandas as pd
|
8 |
from annotated_text import annotated_text
|
9 |
from json import JSONEncoder
|
@@ -17,7 +17,7 @@ warnings.filterwarnings('ignore')
|
|
17 |
# Helper methods
|
18 |
@st.cache(allow_output_mutation=True)
|
19 |
def analyzer_engine():
|
20 |
-
"""Return AnalyzerEngine."""
|
21 |
|
22 |
configuration = {
|
23 |
"nlp_engine_name": "spacy",
|
@@ -34,27 +34,6 @@ def anonymizer_engine():
|
|
34 |
"""Return AnonymizerEngine."""
|
35 |
return AnonymizerEngine()
|
36 |
|
37 |
-
def get_supported_entities():
|
38 |
-
"""Return supported entities from the Analyzer Engine."""
|
39 |
-
return analyzer_engine().get_supported_entities()
|
40 |
-
|
41 |
-
def analyze(**kwargs):
|
42 |
-
"""Analyze input using Analyzer engine and input arguments (kwargs)."""
|
43 |
-
if "entities" not in kwargs or "All" in kwargs["entities"]:
|
44 |
-
kwargs["entities"] = None
|
45 |
-
return analyzer_engine().analyze(**kwargs)
|
46 |
-
|
47 |
-
def anonymize(text, analyze_results):
|
48 |
-
"""Anonymize identified input using Presidio Anonymizer."""
|
49 |
-
if not text:
|
50 |
-
return
|
51 |
-
res = anonymizer_engine().anonymize(
|
52 |
-
text,
|
53 |
-
analyze_results,
|
54 |
-
operators={"STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate})}
|
55 |
-
)
|
56 |
-
return res.text
|
57 |
-
|
58 |
def annotate(text, st_analyze_results, st_entities):
|
59 |
tokens = []
|
60 |
# sort by start index
|
@@ -85,8 +64,8 @@ st.sidebar.markdown(
|
|
85 |
|
86 |
st_entities = st.sidebar.multiselect(
|
87 |
label="Which entities to look for?",
|
88 |
-
options=get_supported_entities(),
|
89 |
-
default=list(get_supported_entities()),
|
90 |
)
|
91 |
|
92 |
st_threshold = st.sidebar.slider(
|
@@ -122,7 +101,7 @@ if 'first_load' not in st.session_state:
|
|
122 |
st.subheader("Analyzed")
|
123 |
with st.spinner("Analyzing..."):
|
124 |
if button or st.session_state.first_load:
|
125 |
-
st_analyze_results = analyze(
|
126 |
text=st_text,
|
127 |
entities=st_entities,
|
128 |
language="en",
|
@@ -140,9 +119,11 @@ st.subheader("Anonymized")
|
|
140 |
|
141 |
with st.spinner("Anonymizing..."):
|
142 |
if button or st.session_state.first_load:
|
143 |
-
st_anonymize_results = anonymize(
|
|
|
|
|
144 |
st_anonymize_results
|
145 |
-
|
146 |
# table result
|
147 |
st.subheader("Detailed Findings")
|
148 |
if st_analyze_results:
|
|
|
1 |
|
2 |
"""Streamlit app for Student Name Detection models."""
|
3 |
|
4 |
+
from spacy_analyzer import prepare_analyzer
|
5 |
+
from anonymizer import anonymize
|
6 |
from presidio_anonymizer import AnonymizerEngine
|
|
|
7 |
import pandas as pd
|
8 |
from annotated_text import annotated_text
|
9 |
from json import JSONEncoder
|
|
|
17 |
# Helper methods
|
18 |
@st.cache(allow_output_mutation=True)
|
19 |
def analyzer_engine():
|
20 |
+
"""Return AnalyzerEngine and cache with Streamlit."""
|
21 |
|
22 |
configuration = {
|
23 |
"nlp_engine_name": "spacy",
|
|
|
34 |
"""Return AnonymizerEngine."""
|
35 |
return AnonymizerEngine()
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def annotate(text, st_analyze_results, st_entities):
|
38 |
tokens = []
|
39 |
# sort by start index
|
|
|
64 |
|
65 |
st_entities = st.sidebar.multiselect(
|
66 |
label="Which entities to look for?",
|
67 |
+
options=analyzer_engine().get_supported_entities(),
|
68 |
+
default=list(analyzer_engine().get_supported_entities()),
|
69 |
)
|
70 |
|
71 |
st_threshold = st.sidebar.slider(
|
|
|
101 |
st.subheader("Analyzed")
|
102 |
with st.spinner("Analyzing..."):
|
103 |
if button or st.session_state.first_load:
|
104 |
+
st_analyze_results = analyzer_engine().analyze(
|
105 |
text=st_text,
|
106 |
entities=st_entities,
|
107 |
language="en",
|
|
|
119 |
|
120 |
with st.spinner("Anonymizing..."):
|
121 |
if button or st.session_state.first_load:
|
122 |
+
st_anonymize_results = anonymize(anonymizer_engine(),
|
123 |
+
st_text,
|
124 |
+
st_analyze_results)
|
125 |
st_anonymize_results
|
126 |
+
|
127 |
# table result
|
128 |
st.subheader("Detailed Findings")
|
129 |
if st_analyze_results:
|
match_replace.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
def replace_name_old(country_code, gender, f_l, original_name, fb_df):
    """Return a random surrogate name that differs from ``original_name``.

    Candidates are drawn from ``fb_df[f_l]`` using the most specific
    non-empty pool, in this order (gender is prioritised over country):

    1. same gender and same country   (only when ``gender`` is given)
    2. same gender                    (only when ``gender`` is given)
    3. same country
    4. the whole dataset

    :param country_code: value compared against the ``country`` column.
    :param gender: value compared against the ``gender`` column; a falsy
        value (e.g. ``None``) means the gender is unknown.
    :param f_l: name of the column to sample from
        (presumably 'first' or 'last' -- confirm against the dataset).
    :param original_name: the name being replaced; never returned.
    :param fb_df: data frame with ``gender``, ``country`` and ``f_l`` columns.
    :return: the surrogate name as a plain string (no index prefix).
    :raises ValueError: if no name other than ``original_name`` exists.
    """
    same_country = fb_df["country"] == country_code
    masks = []
    if gender:
        same_gender = fb_df["gender"] == gender
        masks.extend([same_gender & same_country, same_gender])
    masks.append(same_country)

    # Excluding the original up front replaces the old retry loop, which
    # could spin forever when the only candidate was the original name.
    usable = fb_df[f_l].notna() & (fb_df[f_l] != original_name)

    pools = [fb_df.loc[mask & usable, f_l] for mask in masks]
    pools.append(fb_df.loc[usable, f_l])  # last resort: whole dataset

    for pool in pools:
        if not pool.empty:
            # .iloc[0] yields the bare value; Series.to_string() would
            # prepend the row index to the returned name.
            return str(pool.sample(n=1).iloc[0])

    raise ValueError(
        "replace_name_old: no surrogate different from the original name is available"
    )
|
52 |
+
|
53 |
+
def match_entity(original_info, entity):
    """Return the surrogate text for one detected entity.

    :param original_info: the original text of the entity.
    :param entity: the entity label, e.g. 'STUDENT' or 'EMAIL_ADDRESS'.
    :return: a surrogate string. Unknown labels yield ``'<LABEL>'`` instead
        of ``None``, so callers that concatenate the result (such as
        ``replace_text``) do not fail with a TypeError.
    """
    # TODO: need refinement for each kind of entity
    if entity == 'STUDENT':
        # TODO: here, change between match_name and match_name_2
        return match_name_2(original_info)

    fixed_surrogates = {
        'EMAIL_ADDRESS': '[email protected]',
        # TODO: specific form of number will be returned for consistency
        'PHONE_NUMBER': '000-000-0000',
        'URL': 'google.com',
    }
    # Fall back to a generic placeholder rather than leaking the original
    # text or returning None.
    return fixed_surrogates.get(entity, f'<{entity}>')
|
67 |
+
|
68 |
+
def match_name(original_name):
    """Return a surrogate full name for ``original_name``.

    The first word of ``original_name`` is looked up in the ``first`` column
    of the names dataset. One matching record is sampled and handed to
    ``replace_name`` together with the dataset minus that first name.

    :param original_name: the detected name; only its first word is used.
    :return: a surrogate name, or 'Jane Doe' when the name is empty or its
        first word is not found in the dataset.
    """
    # FIXME: here we only keep the first name for now
    # TODO: how to match both first and last? -- first name match gender,
    #       last name match country? (gender is not applied to last name)
    # FIXME: since it is completely random, the same original name may map
    #        to different surrogates across calls.
    global fb_df

    name_parts = original_name.split()
    if not name_parts:
        # Empty / whitespace-only input used to raise IndexError.
        return 'Jane Doe'
    first_name = name_parts[0]

    # Load the dataset once and reuse it; re-reading the parquet file on
    # every call was the main cost of this function.
    if globals().get('fb_df') is None:
        fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    names = fb_df[fb_df['first'] == first_name]
    if names.empty:
        return 'Jane Doe'

    name_df = names.sample(n=1)
    # prevent returning the same name: drop it from the candidate pool
    new_df = fb_df[fb_df['first'] != first_name]
    return replace_name(name_df, new_df)
|
87 |
+
|
88 |
+
def replace_name(name_df, new_df):
    """Build a surrogate full name matching a sampled record.

    :param name_df: data frame whose first row is the record matched on the
        original first name; its ``gender`` and ``country`` are used.
    :param new_df: candidate data frame that does not contain the original
        first name.
    :return: ``'<first> <last>'``. First and last name are sampled
        independently from the same candidate pool.
    :raises ValueError: if ``new_df`` has no rows at all.
    """
    gender = name_df['gender'].iloc[0]
    country = name_df['country'].iloc[0]

    same_gender = new_df['gender'] == gender
    same_country = new_df['country'] == country

    # Prefer country + gender; widen the pool step by step instead of
    # letting sample() fail on an empty selection.
    for mask in (same_country & same_gender, same_gender, same_country, None):
        pool = new_df if mask is None else new_df[mask]
        if not pool.empty:
            break
    else:
        raise ValueError("replace_name: no candidate names available")

    first = str(pool['first'].sample(n=1).iloc[0])
    last = str(pool['last'].sample(n=1).iloc[0])
    return first + ' ' + last
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
def match_name_2(original_name):
    """Return a surrogate full name for ``original_name``.

    The gender is looked up from the first word of the name and the country
    from the last word; a replacement is then drawn for that pair.

    :param original_name: the detected name. A single-word name is used as
        both first and last name.
    :return: a surrogate name, or 'Jane Doe' when the input is empty.
    """
    global fb_df

    name_parts = original_name.split()
    if not name_parts:
        # Empty / whitespace-only input used to raise IndexError.
        return 'Jane Doe'

    # Load the dataset once and reuse it on later calls.
    if globals().get('fb_df') is None:
        fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    # FIXME: works for a full name; may need a branch for first-only or
    #        last-only names.
    gender = name_match_gender(name_parts[0])
    country = name_match_country(name_parts[-1])
    return replace_name_2(gender, country)
|
120 |
+
|
121 |
+
|
122 |
+
def name_match_country(last_name):
    """Look up a country for ``last_name`` in the module-level ``fb_df``.

    One ``country`` value is sampled from the rows whose ``last`` column
    equals ``last_name``.

    :param last_name: the last name to look up.
    :return: the sampled country as a string, or 'US' when no row matches.
    """
    matching_rows = fb_df[fb_df['last'] == last_name]
    if matching_rows.empty:
        return 'US'
    return matching_rows['country'].sample(n=1).to_string(index=False)
|
129 |
+
|
130 |
+
def name_match_gender(first_name):
    """Look up a gender for ``first_name`` in the module-level ``fb_df``.

    One ``gender`` value is sampled from the rows whose ``first`` column
    equals ``first_name``. When the name is not in the dataset, the gender
    is sampled from the whole dataset instead, so an unknown name no longer
    makes ``sample`` fail on an empty selection.

    :param first_name: the first name to look up.
    :return: the sampled gender as a string.
    """
    genders = fb_df.loc[fb_df['first'] == first_name, 'gender']
    if genders.empty:
        # Unknown first name: fall back to any gender present in the data.
        genders = fb_df['gender']
    return genders.sample(n=1).to_string(index=False)
|
134 |
+
|
135 |
+
def replace_name_2(gender, country):
    """Draw a surrogate full name from the module-level ``fb_df``.

    :param gender: value compared against the ``gender`` column.
    :param country: value compared against the ``country`` column.
    :return: ``'<first> <last>'``. First and last name are sampled
        independently from the same candidate pool.
    :raises ValueError: if ``fb_df`` has no rows at all.
    """
    # TODO: prevent same name
    same_gender = fb_df['gender'] == gender
    same_country = fb_df['country'] == country

    # Prefer country + gender; widen the pool step by step instead of
    # letting sample() fail on an empty selection.
    for mask in (same_country & same_gender, same_gender, same_country, None):
        pool = fb_df if mask is None else fb_df[mask]
        if not pool.empty:
            break
    else:
        raise ValueError("replace_name_2: no candidate names available")

    first = str(pool['first'].sample(n=1).iloc[0])
    last = str(pool['last'].sample(n=1).iloc[0])
    full_name = first + ' ' + last
    return full_name
|
144 |
+
|
145 |
+
def replace_text(str_list):
    """Reassemble a text from its pieces, swapping entities for surrogates.

    :param str_list: iterable of pieces. A tuple is treated as
        ``(original_text, entity_label)`` and replaced by the result of
        ``match_entity``; any other item is used as-is.
    :return: the concatenation of all resulting pieces.
    """
    pieces = [
        match_entity(piece[0], piece[1]) if isinstance(piece, tuple) else piece
        for piece in str_list
    ]
    return ''.join(pieces)
|
152 |
+
|
153 |
+
if __name__ == "__main__":
    # Manual smoke test: load the names dataset, then print the surrogate
    # produced for one sample student name.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    sample_surrogate = match_entity('Nora Wang', 'STUDENT')
    print(sample_surrogate)
|
spacy_recognizer.py → spacy_analyzer.py
RENAMED
@@ -1,18 +1,19 @@
|
|
1 |
-
import logging
|
2 |
-
from typing import Optional, List, Tuple, Set
|
3 |
|
4 |
from presidio_analyzer import (
|
|
|
5 |
RecognizerResult,
|
|
|
6 |
LocalRecognizer,
|
7 |
AnalysisExplanation,
|
8 |
)
|
9 |
-
from presidio_analyzer.nlp_engine import NlpArtifacts
|
10 |
-
from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
|
11 |
|
|
|
|
|
|
|
|
|
12 |
logger = logging.getLogger("presidio-analyzer")
|
13 |
|
14 |
class CustomSpacyRecognizer(LocalRecognizer):
|
15 |
-
|
16 |
ENTITIES = [
|
17 |
"STUDENT",
|
18 |
]
|
@@ -30,9 +31,8 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
30 |
def __init__(
|
31 |
self,
|
32 |
supported_language: str = "en",
|
33 |
-
supported_entities: Optional[
|
34 |
-
check_label_groups: Optional[
|
35 |
-
context: Optional[List[str]] = None,
|
36 |
ner_strength: float = 0.85,
|
37 |
):
|
38 |
self.ner_strength = ner_strength
|
@@ -49,7 +49,7 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
49 |
"""Load the model, not used. Model is loaded during initialization."""
|
50 |
pass
|
51 |
|
52 |
-
def get_supported_entities(self) ->
|
53 |
"""
|
54 |
Return supported entities by this model.
|
55 |
:return: List of the supported entities.
|
@@ -72,8 +72,17 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
72 |
)
|
73 |
return explanation
|
74 |
|
75 |
-
def analyze(self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
results = []
|
|
|
77 |
if not nlp_artifacts:
|
78 |
logger.warning("Skipping SpaCy, nlp artifacts not provided...")
|
79 |
return results
|
@@ -107,8 +116,31 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
107 |
|
108 |
@staticmethod
|
109 |
def __check_label(
|
110 |
-
entity: str, label: str, check_label_groups:
|
111 |
) -> bool:
|
112 |
return any(
|
113 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
114 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
from presidio_analyzer import (
|
3 |
+
AnalyzerEngine,
|
4 |
RecognizerResult,
|
5 |
+
RecognizerRegistry,
|
6 |
LocalRecognizer,
|
7 |
AnalysisExplanation,
|
8 |
)
|
|
|
|
|
9 |
|
10 |
+
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
|
11 |
+
from typing import Optional
|
12 |
+
|
13 |
+
import logging
|
14 |
logger = logging.getLogger("presidio-analyzer")
|
15 |
|
16 |
class CustomSpacyRecognizer(LocalRecognizer):
|
|
|
17 |
ENTITIES = [
|
18 |
"STUDENT",
|
19 |
]
|
|
|
31 |
def __init__(
|
32 |
self,
|
33 |
supported_language: str = "en",
|
34 |
+
supported_entities: Optional[list[str]] = None,
|
35 |
+
check_label_groups: Optional[tuple[set, set]] = None,
|
|
|
36 |
ner_strength: float = 0.85,
|
37 |
):
|
38 |
self.ner_strength = ner_strength
|
|
|
49 |
"""Load the model, not used. Model is loaded during initialization."""
|
50 |
pass
|
51 |
|
52 |
+
def get_supported_entities(self) -> list[str]:
|
53 |
"""
|
54 |
Return supported entities by this model.
|
55 |
:return: List of the supported entities.
|
|
|
72 |
)
|
73 |
return explanation
|
74 |
|
75 |
+
def analyze(self,
|
76 |
+
text: str,
|
77 |
+
entities: list[str] = None,
|
78 |
+
nlp_artifacts: NlpArtifacts = None):
|
79 |
+
"""Analyze input using Analyzer engine and input arguments (kwargs)."""
|
80 |
+
|
81 |
+
if not entities or "All" in entities:
|
82 |
+
entities = None
|
83 |
+
|
84 |
results = []
|
85 |
+
|
86 |
if not nlp_artifacts:
|
87 |
logger.warning("Skipping SpaCy, nlp artifacts not provided...")
|
88 |
return results
|
|
|
116 |
|
117 |
@staticmethod
|
118 |
def __check_label(
|
119 |
+
entity: str, label: str, check_label_groups: tuple[set, set]
|
120 |
) -> bool:
|
121 |
return any(
|
122 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
123 |
+
)
|
124 |
+
|
125 |
+
def prepare_analyzer(configuration):
    """Build the Presidio AnalyzerEngine from an NLP configuration.

    :param configuration: NLP engine configuration passed to
        ``NlpEngineProvider``.
    :return: an ``AnalyzerEngine`` for English whose registry holds the
        predefined recognizers plus ``CustomSpacyRecognizer``, with the
        recognizer named "SpacyRecognizer" removed.
    """
    # Create NLP engine based on configuration
    nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()

    # Rule-based recognizers first, then the custom spaCy recognizer.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(CustomSpacyRecognizer())

    # Drop the stock spaCy recognizer so the custom label mappings are used.
    registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        supported_languages=["en"],
    )
|