langdonholmes committed on
Commit
c30df3e
β€’
2 Parent(s): d6c1f97 287a33f

Merge branch 'refactor'

Browse files
__pycache__/spacy_analyzer.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
anonymize.py DELETED
@@ -1,44 +0,0 @@
1
- from spacy_recognizer import CustomSpacyRecognizer
2
- from presidio_analyzer.nlp_engine import NlpEngineProvider
3
- from presidio_anonymizer import AnonymizerEngine
4
- from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
- from presidio_anonymizer.entities import OperatorConfig
6
- import pandas as pd
7
- from json import JSONEncoder
8
- import json
9
- import warnings
10
- import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
- warnings.filterwarnings('ignore')
13
-
14
def prepare_analyzer(configuration):
    """Build a Presidio AnalyzerEngine wired up with the custom spaCy recognizer.

    :param configuration: nlp_configuration dict for NlpEngineProvider
        (e.g. {"nlp_engine_name": "spacy", "models": [...]}).
    :return: AnalyzerEngine configured for English.
    """
    spacy_recognizer = CustomSpacyRecognizer()

    # Create NLP engine based on configuration.
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Start from the predefined rule-based recognizers, then register
    # our custom student-name recognizer on top.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(spacy_recognizer)

    # Remove the stock spaCy recognizer so the custom label mappings win.
    registry.remove_recognizer("SpacyRecognizer")

    # BUG FIX: removed leftover debug statement print('Hallej').
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                              registry=registry,
                              supported_languages=["en"])

    return analyzer
38
-
39
def generate_surrogate(name):
    """Return an appropriate surrogate name for a detected name string."""
    # Substring match: anything containing "John" maps to "Jill";
    # every other name gets a generic placeholder.
    return "Jill" if "John" in name else "SURROGATE_NAME"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
anonymizer.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from presidio_anonymizer import AnonymizerEngine
2
+ from presidio_anonymizer.entities import OperatorConfig
3
+ from presidio_analyzer import RecognizerResult
4
+
5
def retrieve_name_records():
    """Read in a table of names with gender and country code fields."""
    # Placeholder: data loading is not implemented yet; returns None,
    # exactly as the bare `pass` stub did.
    return None
8
+
9
def generate_surrogate(name):
    """Return appropriate surrogate name from a text string."""
    # A name containing "John" is swapped for "Jill"; all others get
    # a fixed placeholder value.
    if "John" not in name:
        return "SURROGATE_NAME"
    return "Jill"
15
+
16
def anonymize(
    anonymizer: AnonymizerEngine,
    text: str,
    analyze_results: list[RecognizerResult]
):
    """Anonymize identified input using Presidio Anonymizer.

    :param anonymizer: engine that performs the replacements
    :param text: raw text to scrub
    :param analyze_results: entity spans found by the analyzer
    :return: anonymized text, or None when *text* is empty
    """
    # Nothing to anonymize in empty input; implicitly return None.
    if not text:
        return

    # Per-entity replacement policy: student names get a generated
    # surrogate, contact details are swapped for fixed placeholders.
    replacement_operators = {
        "STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate}),
        "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[email protected]"}),
        "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "888-888-8888"}),
        "URL": OperatorConfig("replace", {"new_value": "aol.com"}),
    }

    result = anonymizer.anonymize(
        text,
        analyze_results,
        operators=replacement_operators,
    )

    return result.text
app.py CHANGED
@@ -1,9 +1,9 @@
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
- from anonymize import prepare_analyzer, generate_surrogate
 
5
  from presidio_anonymizer import AnonymizerEngine
6
- from presidio_anonymizer.entities import OperatorConfig
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
@@ -17,7 +17,7 @@ warnings.filterwarnings('ignore')
17
  # Helper methods
18
  @st.cache(allow_output_mutation=True)
19
  def analyzer_engine():
20
- """Return AnalyzerEngine."""
21
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
@@ -34,27 +34,6 @@ def anonymizer_engine():
34
  """Return AnonymizerEngine."""
35
  return AnonymizerEngine()
36
 
37
- def get_supported_entities():
38
- """Return supported entities from the Analyzer Engine."""
39
- return analyzer_engine().get_supported_entities()
40
-
41
- def analyze(**kwargs):
42
- """Analyze input using Analyzer engine and input arguments (kwargs)."""
43
- if "entities" not in kwargs or "All" in kwargs["entities"]:
44
- kwargs["entities"] = None
45
- return analyzer_engine().analyze(**kwargs)
46
-
47
- def anonymize(text, analyze_results):
48
- """Anonymize identified input using Presidio Anonymizer."""
49
- if not text:
50
- return
51
- res = anonymizer_engine().anonymize(
52
- text,
53
- analyze_results,
54
- operators={"STUDENT": OperatorConfig("custom", {"lambda": generate_surrogate})}
55
- )
56
- return res.text
57
-
58
  def annotate(text, st_analyze_results, st_entities):
59
  tokens = []
60
  # sort by start index
@@ -85,8 +64,8 @@ st.sidebar.markdown(
85
 
86
  st_entities = st.sidebar.multiselect(
87
  label="Which entities to look for?",
88
- options=get_supported_entities(),
89
- default=list(get_supported_entities()),
90
  )
91
 
92
  st_threshold = st.sidebar.slider(
@@ -122,7 +101,7 @@ if 'first_load' not in st.session_state:
122
  st.subheader("Analyzed")
123
  with st.spinner("Analyzing..."):
124
  if button or st.session_state.first_load:
125
- st_analyze_results = analyze(
126
  text=st_text,
127
  entities=st_entities,
128
  language="en",
@@ -140,9 +119,11 @@ st.subheader("Anonymized")
140
 
141
  with st.spinner("Anonymizing..."):
142
  if button or st.session_state.first_load:
143
- st_anonymize_results = anonymize(st_text, st_analyze_results)
 
 
144
  st_anonymize_results
145
-
146
  # table result
147
  st.subheader("Detailed Findings")
148
  if st_analyze_results:
 
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
+ from spacy_analyzer import prepare_analyzer
5
+ from anonymizer import anonymize
6
  from presidio_anonymizer import AnonymizerEngine
 
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
 
17
  # Helper methods
18
  @st.cache(allow_output_mutation=True)
19
  def analyzer_engine():
20
+ """Return AnalyzerEngine and cache with Streamlit."""
21
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
 
34
  """Return AnonymizerEngine."""
35
  return AnonymizerEngine()
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def annotate(text, st_analyze_results, st_entities):
38
  tokens = []
39
  # sort by start index
 
64
 
65
  st_entities = st.sidebar.multiselect(
66
  label="Which entities to look for?",
67
+ options=analyzer_engine().get_supported_entities(),
68
+ default=list(analyzer_engine().get_supported_entities()),
69
  )
70
 
71
  st_threshold = st.sidebar.slider(
 
101
  st.subheader("Analyzed")
102
  with st.spinner("Analyzing..."):
103
  if button or st.session_state.first_load:
104
+ st_analyze_results = analyzer_engine().analyze(
105
  text=st_text,
106
  entities=st_entities,
107
  language="en",
 
119
 
120
  with st.spinner("Anonymizing..."):
121
  if button or st.session_state.first_load:
122
+ st_anonymize_results = anonymize(anonymizer_engine(),
123
+ st_text,
124
+ st_analyze_results)
125
  st_anonymize_results
126
+
127
  # table result
128
  st.subheader("Detailed Findings")
129
  if st_analyze_results:
match_replace.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def replace_name_old(country_code, gender, f_l, original_name, fb_df):
    """
    Receiving country, gender, first_last name, and the original name.
    Match with a name that matches gender and country, randomly retrieved
    from the facebook dataset, falling back to gender-only, country-only,
    or fully random matches when narrower filters find nothing.
    Compare the surrogate name with the original name to make sure they
    are different.
    :param f_l: column to sample from, 'first' or 'last' -> str
    :return: the surrogate name as rendered by Series.to_string()
    """
    surrogate_name = original_name
    # Keep sampling until the surrogate differs from the original name.
    while surrogate_name == original_name:
        # BUG FIX: was `if not gender:`, which sent a known gender into the
        # gender-filter branch compared against None (matching nothing) and
        # skipped the gender filter whenever gender WAS provided. The code
        # comments make the intent clear: this branch is for a matchable
        # (truthy) gender.
        if gender:
            gender_df = fb_df[fb_df["gender"] == gender]
            gender_c_df = gender_df[gender_df["country"] == country_code]
            if gender_c_df.shape[0] > 0:
                # Gender and country both match.
                surrogate_name = gender_c_df[f_l].sample(n=1).to_string()
            else:
                # Gender matches, country doesn't: sample from gender only.
                surrogate_name = gender_df[f_l].sample(n=1).to_string()
        else:
            # Gender unavailable: try to match on country alone.
            country_df = fb_df[fb_df["country"] == country_code]
            if country_df.shape[0] > 0:
                surrogate_name = country_df[f_l].sample(n=1).to_string()
            else:
                # Neither gender nor country matched: sample from all rows.
                surrogate_name = fb_df[f_l].sample(n=1).to_string()

    return surrogate_name
52
+
53
def match_entity(original_info, entity):
    """Map a detected entity to its surrogate replacement value.

    Student names are matched against the name table; contact details
    get fixed placeholder values; unknown entity labels yield None.
    """
    # TODO: need refinement for each kind of entity
    if entity == 'STUDENT':
        # TODO: here, switch between match_name (1) and match_name_2 (2)
        return match_name_2(original_info)
    if entity == 'EMAIL_ADDRESS':
        return '[email protected]'
    if entity == 'PHONE_NUMBER':
        # TODO: specific form of number will be returned for consistency
        return '000-000-0000'
    if entity == 'URL':
        return 'google.com'
    # Unrecognized entity: nothing to substitute.
    return None
67
+
68
def match_name(original_name):
    """Surrogate *original_name* by matching its first name in the
    Facebook name table; fall back to 'Jane Doe' for unknown names.

    Only the first token of the name drives the match.
    """
    # FIXME: the parquet file is re-read on every call (large df used
    # multiple times) — slow; how to improve?
    # FIXME: replacement is random, so the same original name can map to
    # different surrogates across calls.
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    first_name = original_name.split()[0]
    candidates = fb_df[fb_df['first'] == first_name]
    if candidates.empty:
        return 'Jane Doe'

    # Sample one matching record, then draw the surrogate from rows that
    # do NOT share the original first name (avoids echoing it back).
    name_df = candidates.sample(n=1)
    remaining = fb_df[fb_df['first'] != first_name]
    return replace_name(name_df, remaining)
87
+
88
def replace_name(name_df, new_df):
    """
    :param name_df: single-row frame matching the original first name
    :param new_df: frame with the original first name excluded
    :return: 'first last' string matching the record's country & gender
    """
    target_gender = name_df['gender'].to_string(index=False)
    target_country = name_df['country'].to_string(index=False)

    # Narrow to rows sharing both country and gender with the original;
    # country first, then gender.
    pool = new_df[new_df['country'] == target_country]
    pool = pool[pool['gender'] == target_gender]

    # First and last names are sampled independently of each other.
    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
104
+
105
+
106
+
107
def match_name_2(original_name):
    """
    Surrogate a full name by matching gender from the first name and
    country from the last name, then sampling a replacement.
    :param original_name: full name string, e.g. 'Nora Wang'
    :return: surrogate 'first last' string
    """
    global fb_df
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # FIXME: works when given a full name; may need a branch for
    # first-only or last-only inputs.
    # BUG FIX: removed leftover debug print(original_name.split()[1]),
    # which also raised IndexError for single-token names even though the
    # real logic safely uses [-1] for the last name.
    tokens = original_name.split()
    gender = name_match_gender(tokens[0])
    country = name_match_country(tokens[-1])
    return replace_name_2(gender, country)
120
+
121
+
122
def name_match_country(last_name):
    """Return a country code sampled from records sharing *last_name*;
    default to 'US' when the last name is not in the table."""
    # NOTE: relies on the module-level global fb_df being loaded.
    matches = fb_df[fb_df['last'] == last_name]
    if matches.empty:
        return 'US'
    return matches['country'].sample(n=1).to_string(index=False)
129
+
130
def name_match_gender(first_name):
    """Return a gender code sampled from records sharing *first_name*.

    NOTE(review): unlike name_match_country there is no fallback here —
    an unknown first name leaves an empty frame and sample() will raise;
    confirm whether callers guarantee known names.
    """
    # Relies on the module-level global fb_df being loaded.
    matches = fb_df[fb_df['first'] == first_name]
    return matches['gender'].sample(n=1).to_string(index=False)
134
+
135
def replace_name_2(gender, country):
    """Sample a surrogate 'first last' name matching *gender* and *country*."""
    # TODO: prevent returning a name identical to the original.
    # Narrow the global table by country, then by gender.
    pool = fb_df[fb_df['country'] == country]
    pool = pool[pool['gender'] == gender]

    # First and last name are drawn independently.
    first = pool['first'].sample(n=1).to_string(index=False)
    last = pool['last'].sample(n=1).to_string(index=False)
    return first + ' ' + last
144
+
145
def replace_text(str_list):
    """Reassemble annotated text, substituting surrogates for entities.

    :param str_list: mix of plain strings and (text, entity_label) tuples
    :return: concatenation with every tuple replaced by its surrogate
    """
    pieces = []
    for item in str_list:
        if isinstance(item, tuple):
            # (original_text, entity_label) -> surrogate value
            item = match_entity(item[0], item[1])
        pieces.append(item)
    return ''.join(pieces)
152
+
153
if __name__ == "__main__":
    # Manual smoke test: load the name table into the module-level global
    # used by the match_* helpers, then surrogate one sample student name.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    print(match_entity('Nora Wang', 'STUDENT'))
spacy_recognizer.py β†’ spacy_analyzer.py RENAMED
@@ -1,18 +1,19 @@
1
- import logging
2
- from typing import Optional, List, Tuple, Set
3
 
4
  from presidio_analyzer import (
 
5
  RecognizerResult,
 
6
  LocalRecognizer,
7
  AnalysisExplanation,
8
  )
9
- from presidio_analyzer.nlp_engine import NlpArtifacts
10
- from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
11
 
 
 
 
 
12
  logger = logging.getLogger("presidio-analyzer")
13
 
14
  class CustomSpacyRecognizer(LocalRecognizer):
15
-
16
  ENTITIES = [
17
  "STUDENT",
18
  ]
@@ -30,9 +31,8 @@ class CustomSpacyRecognizer(LocalRecognizer):
30
  def __init__(
31
  self,
32
  supported_language: str = "en",
33
- supported_entities: Optional[List[str]] = None,
34
- check_label_groups: Optional[Tuple[Set, Set]] = None,
35
- context: Optional[List[str]] = None,
36
  ner_strength: float = 0.85,
37
  ):
38
  self.ner_strength = ner_strength
@@ -49,7 +49,7 @@ class CustomSpacyRecognizer(LocalRecognizer):
49
  """Load the model, not used. Model is loaded during initialization."""
50
  pass
51
 
52
- def get_supported_entities(self) -> List[str]:
53
  """
54
  Return supported entities by this model.
55
  :return: List of the supported entities.
@@ -72,8 +72,17 @@ class CustomSpacyRecognizer(LocalRecognizer):
72
  )
73
  return explanation
74
 
75
- def analyze(self, text, entities, nlp_artifacts=None): # noqa D102
 
 
 
 
 
 
 
 
76
  results = []
 
77
  if not nlp_artifacts:
78
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
79
  return results
@@ -107,8 +116,31 @@ class CustomSpacyRecognizer(LocalRecognizer):
107
 
108
  @staticmethod
109
  def __check_label(
110
- entity: str, label: str, check_label_groups: Tuple[Set, Set]
111
  ) -> bool:
112
  return any(
113
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
114
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  from presidio_analyzer import (
3
+ AnalyzerEngine,
4
  RecognizerResult,
5
+ RecognizerRegistry,
6
  LocalRecognizer,
7
  AnalysisExplanation,
8
  )
 
 
9
 
10
+ from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
11
+ from typing import Optional
12
+
13
+ import logging
14
  logger = logging.getLogger("presidio-analyzer")
15
 
16
  class CustomSpacyRecognizer(LocalRecognizer):
 
17
  ENTITIES = [
18
  "STUDENT",
19
  ]
 
31
  def __init__(
32
  self,
33
  supported_language: str = "en",
34
+ supported_entities: Optional[list[str]] = None,
35
+ check_label_groups: Optional[tuple[set, set]] = None,
 
36
  ner_strength: float = 0.85,
37
  ):
38
  self.ner_strength = ner_strength
 
49
  """Load the model, not used. Model is loaded during initialization."""
50
  pass
51
 
52
+ def get_supported_entities(self) -> list[str]:
53
  """
54
  Return supported entities by this model.
55
  :return: List of the supported entities.
 
72
  )
73
  return explanation
74
 
75
+ def analyze(self,
76
+ text: str,
77
+ entities: list[str] = None,
78
+ nlp_artifacts: NlpArtifacts = None):
79
+ """Analyze input using Analyzer engine and input arguments (kwargs)."""
80
+
81
+ if not entities or "All" in entities:
82
+ entities = None
83
+
84
  results = []
85
+
86
  if not nlp_artifacts:
87
  logger.warning("Skipping SpaCy, nlp artifacts not provided...")
88
  return results
 
116
 
117
  @staticmethod
118
  def __check_label(
119
+ entity: str, label: str, check_label_groups: tuple[set, set]
120
  ) -> bool:
121
  return any(
122
  [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
123
+ )
124
+
125
def prepare_analyzer(configuration):
    """Handle Preparation of Analyzer Engine for Presidio.

    :param configuration: nlp_configuration dict for NlpEngineProvider
    :return: AnalyzerEngine configured for English
    """
    # Build the NLP engine from the supplied configuration.
    engine_provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = engine_provider.create_engine()

    # Load the predefined rule-based recognizers, then register our
    # custom student-name recognizer on top of them.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(CustomSpacyRecognizer())

    # Drop the stock spaCy recognizer so the custom label mappings apply.
    registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(nlp_engine=nlp_engine,
                          registry=registry,
                          supported_languages=["en"])