piilo / anonymizer.py
langdonholmes
typo in last commit
ad6caec
raw
history blame
2.36 kB
from typing import List
from presidio_analyzer import RecognizerResult
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from names_database import NameDatabase
names_db = NameDatabase()
def split_name(original_name: str):
'''Splits name into parts.
If one token, assume it is a first name.
If two tokens, first and last name.
If three tokens, one first name and two last names.
If four tokens, two first names and two last names.'''
names = original_name.split()
if len(names) == 1:
return names[0], None
elif len(names) == 2:
return names[0], names[1]
elif len(names) == 3:
return names[0], ' '.join(names[1:])
elif len(names) == 4:
return ' '.join(names[:2]), ' '.join(names[2:])
else:
return None, None
def generate_surrogate(original_name: str):
'''Generate a surrogate name.
'''
first_names, last_names = split_name(original_name)
gender = names_db.get_gender(first_names) if first_names else None
country = names_db.get_country(last_names) if last_names else None
surrogate_name = ''
name_candidates = names_db.get_random_name(
gender=gender,
country=country)
surrogate_name += name_candidates.iloc[0]['first']
if last_names:
surrogate_name += ' ' + name_candidates.iloc[1]['last']
return surrogate_name
def anonymize(
anonymizer: AnonymizerEngine,
text: str,
analyze_results: List[RecognizerResult]
):
'''Anonymize identified input using Presidio Anonymizer.'''
if not text:
return
res = anonymizer.anonymize(
text,
analyze_results,
operators={
'STUDENT': OperatorConfig('custom',
{'lambda': generate_surrogate}),
'EMAIL_ADDRESS': OperatorConfig('replace',
{'new_value': '[email protected]'}),
'PHONE_NUMBER': OperatorConfig('replace',
{'new_value': '888-888-8888'}),
'URL': OperatorConfig('replace',
{'new_value': 'aol.com'}),
}
)
return res.text
if __name__ == '__main__':
print(generate_surrogate('Nora Wang'))