langdonholmes committed on
Commit
22bf201
1 Parent(s): a002c38

add __init__.py

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. piilo/anonymizer.py +132 -0
app.py CHANGED
@@ -1,8 +1,8 @@
1
 
2
  '''Streamlit app for Student Name Detection models.'''
3
 
4
- from analyzer import prepare_analyzer
5
- from anonymizer import surrogate_anonymizer
6
  import pandas as pd
7
  from annotated_text import annotated_text
8
  from json import JSONEncoder
 
1
 
2
  '''Streamlit app for Student Name Detection models.'''
3
 
4
+ from piilo.analyzer import prepare_analyzer
5
+ from piilo.anonymizer import surrogate_anonymizer
6
  import pandas as pd
7
  from annotated_text import annotated_text
8
  from json import JSONEncoder
piilo/anonymizer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import List, Optional
4
+
5
+ import pandas as pd
6
+ from presidio_analyzer import RecognizerResult
7
+ from presidio_anonymizer import AnonymizerEngine
8
+ from presidio_anonymizer.entities import OperatorConfig
9
+ from presidio_anonymizer.operators import OperatorType
10
+ from nameparser import HumanName
11
+ from names_database import NameDatabase
12
+
13
# Relative path to the parquet table that surrogate names are sampled from.
name_table = Path('data') / 'ascii_names.parquet'

# Logger shared by everything in this module.
logger = logging.getLogger('anonymizer')
16
+
17
class surrogate_anonymizer(AnonymizerEngine):
    '''Presidio anonymizer that replaces detected entities with surrogates.

    STUDENT entities are replaced with a randomly sampled surrogate name
    (see generate_surrogate); EMAIL_ADDRESS, PHONE_NUMBER and URL entities
    are replaced with fixed placeholder strings (see anonymize).
    '''

    # Minimum number of rows a filtered name pool must contain before we
    # sample from it instead of falling back to the full names table.
    _MIN_CANDIDATE_ROWS = 25

    def __init__(self):
        super().__init__()
        self.names_db = NameDatabase()
        self.names_df = pd.read_parquet(name_table)

        # Maps each original name to the surrogate already issued for it,
        # so that repeated mentions receive the same replacement.
        self.seen_names = dict()

    def get_random_name(
        self,
        country: Optional[str] = None,
        gender: Optional[str] = None
    ) -> pd.DataFrame:
        '''Returns two random names from the database as a DataFrame.

        Both rows match gender and country, if provided. If the filtered
        pool has fewer than _MIN_CANDIDATE_ROWS rows, the two rows are
        sampled from the whole names table instead.

        :country: ISO country code e.g. "CO" for Colombia
        :gender: 'M' or 'F'
        returns two rows of the names dataframe, sampled with the
        'count' column as weights
        '''
        names_view = self.names_df
        if country:
            names_view = names_view[names_view['country'] == country]
        if gender:
            names_view = names_view[names_view['gender'] == gender]

        # Compare the number of ROWS: DataFrame.size is rows x columns,
        # which would make the threshold depend on the table's width.
        if len(names_view) < self._MIN_CANDIDATE_ROWS:
            names_view = self.names_df

        return names_view.sample(n=2, weights=names_view['count'])

    def generate_surrogate(self, original_name: str) -> str:
        '''Generate a surrogate name for original_name.

        The same original name always maps to the same surrogate for the
        lifetime of this instance. The surrogate has a first name, plus a
        last name only when the parsed original has a last name.

        :original_name: the detected name to replace
        returns the surrogate name
        '''
        if original_name == 'PII':
            # Every time we call this function, Presidio will validate it
            # by testing that the function returns a str when the input is
            # 'PII'. Bypass this test.
            return 'PII'

        # If we have seen this name before, return the same surrogate
        if original_name in self.seen_names:
            return self.seen_names[original_name]

        # Use nameparser to split the name
        name = HumanName(original_name)

        gender = self.names_db.get_gender(name.first) if name.first else None
        logger.info('Gender set to %s', gender)
        country = self.names_db.get_country(name.last) if name.last else None
        logger.info('Country set to %s', country)

        # Two rows are drawn: the first supplies the first name and the
        # second supplies the last name.
        name_candidates = self.get_random_name(gender=gender, country=country)

        surrogate_name = '' + name_candidates.iloc[0]['first']
        logger.info('First name surrogate is %s', surrogate_name)

        if name.last:
            last_surrogate = name_candidates.iloc[1]['last']
            logger.info('Last name surrogate is %s', last_surrogate)
            surrogate_name += ' ' + last_surrogate

        logger.info('Returning surrogate name %s', surrogate_name)

        self.seen_names[original_name] = surrogate_name

        return surrogate_name

    def anonymize(
        self,
        text: str,
        analyzer_results: List[RecognizerResult]
    ) -> Optional[str]:
        '''Anonymize identified input using Presidio Anonymizer.

        :text: the text to anonymize
        :analyzer_results: recognizer results locating the entities in text
        returns the anonymized text, or None when text is empty
        '''
        if not text:
            return None

        analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
            analyzer_results
        )

        # The helper is a double-underscore (name-mangled) member of
        # AnonymizerEngine; writing self.__check... here would be mangled
        # with this subclass's name, so the mangled name is spelled out.
        operators = self._AnonymizerEngine__check_or_add_default_operator(
            {
                'STUDENT': OperatorConfig(
                    'custom', {'lambda': self.generate_surrogate}),
                # NOTE(review): this placeholder looks like it was mangled
                # by e-mail obfuscation -- confirm the intended value.
                'EMAIL_ADDRESS': OperatorConfig(
                    'replace', {'new_value': '[email protected]'}),
                'PHONE_NUMBER': OperatorConfig(
                    'replace', {'new_value': '888-888-8888'}),
                'URL': OperatorConfig(
                    'replace', {'new_value': 'aol.com'}),
            }
        )

        res = self._operate(text,
                            analyzer_results,
                            operators,
                            OperatorType.Anonymize)

        return res.text
117
+
118
if __name__ == '__main__':
    # Manual smoke test: run a handful of awkward name strings through the
    # surrogate generator and inspect the log output.
    logging.basicConfig(level=logging.DEBUG)

    engine = surrogate_anonymizer()

    sample_inputs = (
        'Nora Wang',
        'MJ',
        '',
        '(',
        'Mario Escobar Sanchez',
        'Jane Fonda Michelle Rousseau',
        'Sir Phillipe Ricardo de la Sota Mayor',
    )

    for sample in sample_inputs:
        engine.generate_surrogate(sample)