langdonholmes committed on
Commit
0c29fae
•
1 Parent(s): 30b20e5

refactor gender country matching

Browse files
app.py CHANGED
@@ -1,14 +1,17 @@
1
 
2
  '''Streamlit app for Student Name Detection models.'''
3
- from piilo.analyzer import prepare_analyzer
4
- from piilo.anonymizer import surrogate_anonymizer
5
- import pandas as pd
6
- from annotated_text import annotated_text
7
- from json import JSONEncoder
8
  import json
 
9
  import warnings
 
 
 
10
  import streamlit as st
11
- import os
 
 
 
 
12
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
13
  warnings.filterwarnings('ignore')
14
 
@@ -56,7 +59,7 @@ st.set_page_config(page_title='Student Name Detector (English)', layout='wide')
56
 
57
  # Side bar
58
  st.sidebar.markdown(
59
- '''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected by Coursera.
60
  '''
61
  )
62
 
@@ -74,7 +77,7 @@ st_return_decision_process = st.sidebar.checkbox(
74
  'Add analysis explanations in json')
75
 
76
  st.sidebar.info(
77
- 'This is part of a deidentification project for student-generated text.'
78
  )
79
 
80
  # Main panel
 
1
 
2
  '''Streamlit app for Student Name Detection models.'''
 
 
 
 
 
3
  import json
4
+ import os
5
  import warnings
6
+ from json import JSONEncoder
7
+
8
+ import pandas as pd
9
  import streamlit as st
10
+ from annotated_text import annotated_text
11
+
12
+ from piilo.engines.analyzer import prepare_analyzer
13
+ from piilo.engines.anonymizer import surrogate_anonymizer
14
+
15
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
16
  warnings.filterwarnings('ignore')
17
 
 
59
 
60
  # Side bar
61
  st.sidebar.markdown(
62
+ '''Detect and anonymize PII in text using an [NLP model](https://huggingface.co/langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected from a massive online open-enrollment course.
63
  '''
64
  )
65
 
 
77
  'Add analysis explanations in json')
78
 
79
  st.sidebar.info(
80
+ 'This is part of a project to develop new anonymization systems that are appropriate for student-generated text.'
81
  )
82
 
83
  # Main panel
piilo/engines/__init__.py ADDED
File without changes
piilo/{analyzer.py → engines/analyzer.py} RENAMED
File without changes
piilo/{anonymizer.py → engines/anonymizer.py} RENAMED
@@ -10,11 +10,14 @@ from presidio_anonymizer import AnonymizerEngine
10
  from presidio_anonymizer.entities import OperatorConfig
11
  from presidio_anonymizer.operators import OperatorType
12
 
13
- name_table = Path(__file__).parent.parent / 'data' / 'ascii_names.parquet'
 
14
 
15
  logger = logging.getLogger('anonymizer')
16
-
17
  class NameDatabase(NameDataset):
 
 
 
18
  def __init__(self) -> None:
19
  super().__init__()
20
 
@@ -29,27 +32,31 @@ class NameDatabase(NameDataset):
29
  return {'first_name': fn, 'last_name': ln}
30
 
31
  def get_gender(self, first_names: str) -> str:
32
- '''Return the most frequent gender code for a specific last name,
33
  or None if a match cannot be found.
34
  '''
35
  gender = NameWrapper(self.search(first_names)).gender
36
  return gender if gender else None
37
 
38
  def get_country(self, last_names: str) -> str:
39
- '''Return the most frequent country code for a specific last name,
40
  or None if a match cannot be found.
41
  '''
42
  country = NameWrapper(self.search(last_names)).country
43
  return country if country else None
44
 
45
  class surrogate_anonymizer(AnonymizerEngine):
 
 
 
46
  def __init__(self):
47
  super().__init__()
48
  self.names_db = NameDatabase()
49
  self.names_df = pd.read_parquet(name_table)
50
 
51
  # keep track of names we have seen
52
- self.seen_names = dict()
 
53
 
54
  def get_random_name(
55
  self,
@@ -63,58 +70,74 @@ class surrogate_anonymizer(AnonymizerEngine):
63
  returns two rows of the names dataframe
64
  '''
65
  names_view = self.names_df
 
66
  if country:
67
  names_view = names_view[names_view['country'] == country]
 
68
  if gender:
69
  names_view = names_view[names_view['gender'] == gender]
 
70
  if names_view.size < 25:
71
- return self.names_df.sample(n=2, weights=self.names_df['count'])
72
- return names_view.sample(n=2, weights=names_view['count'])
 
 
73
 
74
  def generate_surrogate(self, original_name: str) -> str:
75
  '''Generate a surrogate name.
76
  '''
 
77
  if original_name == 'PII':
78
  # Every time we call this function, Presidio will validate it
79
  # by testing that the function returns a str when the input is
80
- # 'PII'. Bypass this test.
81
  return 'PII'
82
 
83
- # If we have seen this name before, return the same surrogate
84
- if original_name in self.seen_names:
85
- return self.seen_names[original_name]
86
-
87
  # Use nameparser to split the name
88
  name = HumanName(original_name)
 
 
89
 
90
- gender = self.names_db.get_gender(name.first) if name.first else None
91
- logger.info(f'Gender set to {gender}')
92
- country = self.names_db.get_country(name.last) if name.last else None
93
- logger.info(f'Country set to {country}')
94
-
95
- surrogate_name = ''
96
-
97
- name_candidates = self.get_random_name(gender=gender, country=country)
98
-
99
- surrogate_name += name_candidates.iloc[0]['first']
100
- logger.info(f'First name surrogate is {surrogate_name}')
101
-
102
  if name.last:
103
- logger.info(f'Last name surrogate is {name_candidates.iloc[1]["last"]}')
104
- surrogate_name += ' ' + name_candidates.iloc[1]['last']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- logger.info(f'Returning surrogate name {surrogate_name}')
107
 
108
- self.seen_names[original_name] = surrogate_name
 
109
 
110
- return surrogate_name
111
 
112
  def anonymize(
113
  self,
114
  text: str,
115
  analyzer_results: List[RecognizerResult]
116
  ):
117
- '''Anonymize identified input using Presidio Anonymizer.'''
 
118
 
119
  if not text:
120
  return
@@ -149,6 +172,8 @@ if __name__ == '__main__':
149
  anonymizer = surrogate_anonymizer()
150
 
151
  test_names = ['Nora Wang',
 
 
152
  'MJ',
153
  '',
154
  '(',
 
10
  from presidio_anonymizer.entities import OperatorConfig
11
  from presidio_anonymizer.operators import OperatorType
12
 
13
+ data = Path(__file__).parent.parent.parent / 'data'
14
+ name_table = data / 'ascii_names.parquet'
15
 
16
  logger = logging.getLogger('anonymizer')
 
17
  class NameDatabase(NameDataset):
18
+ '''A wrapper around the names_dataset.NameDataset class.
19
+ '''
20
+
21
  def __init__(self) -> None:
22
  super().__init__()
23
 
 
32
  return {'first_name': fn, 'last_name': ln}
33
 
34
  def get_gender(self, first_names: str) -> str:
35
+ '''Return the most frequent gender code for the provided first name,
36
  or None if a match cannot be found.
37
  '''
38
  gender = NameWrapper(self.search(first_names)).gender
39
  return gender if gender else None
40
 
41
  def get_country(self, last_names: str) -> str:
42
+ '''Return the most frequent country code for a the provided last name,
43
  or None if a match cannot be found.
44
  '''
45
  country = NameWrapper(self.search(last_names)).country
46
  return country if country else None
47
 
48
  class surrogate_anonymizer(AnonymizerEngine):
49
+ '''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
50
+ '''
51
+
52
  def __init__(self):
53
  super().__init__()
54
  self.names_db = NameDatabase()
55
  self.names_df = pd.read_parquet(name_table)
56
 
57
  # keep track of names we have seen
58
+ self.seen_first_names = dict()
59
+ self.seen_last_names = dict()
60
 
61
  def get_random_name(
62
  self,
 
70
  returns two rows of the names dataframe
71
  '''
72
  names_view = self.names_df
73
+
74
  if country:
75
  names_view = names_view[names_view['country'] == country]
76
+
77
  if gender:
78
  names_view = names_view[names_view['gender'] == gender]
79
+
80
  if names_view.size < 25:
81
+ # If we don't have enough names, just return a random sample
82
+ return self.names_df.sample(n=1, weights=self.names_df['count'])
83
+
84
+ return names_view.sample(n=1, weights=names_view['count'])
85
 
86
  def generate_surrogate(self, original_name: str) -> str:
87
  '''Generate a surrogate name.
88
  '''
89
+
90
  if original_name == 'PII':
91
  # Every time we call this function, Presidio will validate it
92
  # by testing that the function returns a str when the input is
93
+ # 'PII'. We don't need to run below code in this case.
94
  return 'PII'
95
 
 
 
 
 
96
  # Use nameparser to split the name
97
  name = HumanName(original_name)
98
+ new_name = HumanName()
99
+ gender, country = None, None
100
 
101
+ # First check if we have seen this name before
 
 
 
 
 
 
 
 
 
 
 
102
  if name.last:
103
+ if name.last in self.seen_last_names:
104
+ new_name.last = self.seen_last_names[name.last]
105
+ else:
106
+ # Sample last name, matching country
107
+ country = self.names_db.get_country(name.last)
108
+ logger.info(f'Country set to {country}')
109
+ new_name.last = self.get_random_name(
110
+ country=country,
111
+ )['last'].iloc[0]
112
+ logger.info(f'Last name surrogate is {new_name.last}')
113
+
114
+ if name.first:
115
+ if name.first in self.seen_first_names:
116
+ new_name.first = self.seen_first_names[name.first]
117
+ else:
118
+ # Sample first name matching gender and country, if available.
119
+ gender = self.names_db.get_gender(name.first)
120
+ logger.info(f'Gender set to {gender}')
121
+ new_name.first = self.get_random_name(
122
+ gender=gender,
123
+ country=country,
124
+ )['first'].iloc[0]
125
+ logger.info(f'First name surrogate is {new_name.first}')
126
 
127
+ logger.info(f'Returning surrogate name {new_name}')
128
 
129
+ self.seen_first_names[name.first] = new_name.first
130
+ self.seen_last_names[name.last] = new_name.last
131
 
132
+ return str(new_name)
133
 
134
  def anonymize(
135
  self,
136
  text: str,
137
  analyzer_results: List[RecognizerResult]
138
  ):
139
+ '''Anonymize identified input using Presidio Anonymizer.
140
+ '''
141
 
142
  if not text:
143
  return
 
172
  anonymizer = surrogate_anonymizer()
173
 
174
  test_names = ['Nora Wang',
175
+ 'John Williams',
176
+ 'John H. Williams',
177
  'MJ',
178
  '',
179
  '(',
main.py → piilo/main.py RENAMED
@@ -5,31 +5,26 @@ import logging
5
  from fastapi import FastAPI
6
  from fastapi.middleware.cors import CORSMiddleware
7
 
8
- from piilo.analyzer import prepare_analyzer
9
- from piilo.anonymizer import surrogate_anonymizer
10
- from piilo.models.anonymize import AnonymizeRequest, AnonymizeResponse
11
 
12
- # Define Student Name Detection Model
13
  configuration = {
14
  'nlp_engine_name': 'spacy',
15
  'models': [
16
  {'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
17
  }
18
 
19
- # set up logger for this module
20
  logger = logging.getLogger('api')
21
  logging.basicConfig(level=logging.INFO)
22
 
23
- # Load Custom Presidio Analyzer and Anonymizer
24
- logger.info("Loading Presidio Analyzer and Anonymizer")
25
  analyzer = prepare_analyzer(configuration)
26
  anonymizer = surrogate_anonymizer()
27
- logger.info("Loaded Presidio Analyzer and Anonymizer")
28
 
29
- # Initialize FastAPI
30
  app = FastAPI()
31
 
32
- # Enable CORS
33
  app.add_middleware(
34
  CORSMiddleware,
35
  allow_origins=["*"],
@@ -44,8 +39,7 @@ def hello():
44
 
45
  @app.post("/anonymize")
46
  def anonymize(anon_req: AnonymizeRequest) -> AnonymizeResponse:
47
- '''Anonymize PII in text using a custom Presidio Analyzer and Anonymizer
48
- '''
49
  analyzer_result = analyzer.analyze(anon_req.raw_text,
50
  entities=anon_req.entities,
51
  language=anon_req.language,
@@ -62,7 +56,6 @@ def anonymize(anon_req: AnonymizeRequest) -> AnonymizeResponse:
62
 
63
  if __name__ == "__main__":
64
  import os
65
-
66
  import uvicorn
67
 
68
  uvicorn.run(
 
5
  from fastapi import FastAPI
6
  from fastapi.middleware.cors import CORSMiddleware
7
 
8
+ from engines.analyzer import prepare_analyzer
9
+ from engines.anonymizer import surrogate_anonymizer
10
+ from models.anonymize import AnonymizeRequest, AnonymizeResponse
11
 
 
12
  configuration = {
13
  'nlp_engine_name': 'spacy',
14
  'models': [
15
  {'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
16
  }
17
 
 
18
  logger = logging.getLogger('api')
19
  logging.basicConfig(level=logging.INFO)
20
 
21
+ logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
 
22
  analyzer = prepare_analyzer(configuration)
23
  anonymizer = surrogate_anonymizer()
24
+ logger.info("Loading Successful!")
25
 
 
26
  app = FastAPI()
27
 
 
28
  app.add_middleware(
29
  CORSMiddleware,
30
  allow_origins=["*"],
 
39
 
40
  @app.post("/anonymize")
41
  def anonymize(anon_req: AnonymizeRequest) -> AnonymizeResponse:
42
+
 
43
  analyzer_result = analyzer.analyze(anon_req.raw_text,
44
  entities=anon_req.entities,
45
  language=anon_req.language,
 
56
 
57
  if __name__ == "__main__":
58
  import os
 
59
  import uvicorn
60
 
61
  uvicorn.run(
test_main.py → piilo/test_main.py RENAMED
File without changes