Spaces:
Sleeping
Sleeping
from typing import List | |
from presidio_analyzer import RecognizerResult | |
from presidio_anonymizer import AnonymizerEngine | |
from presidio_anonymizer.entities import OperatorConfig | |
from names_database import NameDatabase | |
names_db = NameDatabase() | |
def split_name(original_name: str): | |
'''Splits name into parts. | |
If one token, assume it is a first name. | |
If two tokens, first and last name. | |
If three tokens, one first name and two last names. | |
If four tokens, two first names and two last names.''' | |
names = original_name.split() | |
if len(names) == 1: | |
return names[0], None | |
elif len(names) == 2: | |
return names[0], names[1] | |
elif len(names) == 3: | |
return names[0], ' '.join(names[1:]) | |
elif len(names) == 4: | |
return ' '.join(names[:2]), ' '.join(names[2:]) | |
else: | |
return None, None | |
def generate_surrogate(original_name: str): | |
'''Generate a surrogate name. | |
''' | |
first_names, last_names = split_name(original_name) | |
gender = names_db.get_gender(first_names) if first_names else None | |
country = names_db.get_country(last_names) if last_names else None | |
surrogate_name = '' | |
name_candidates = names_db.get_random_name( | |
gender=gender, | |
country=country) | |
surrogate_name += name_candidates.iloc[0]['first'] | |
if last_names: | |
surrogate_name += ' ' + name_candidates.iloc[1]['last'] | |
return surrogate_name | |
def anonymize( | |
anonymizer: AnonymizerEngine, | |
text: str, | |
analyze_results: List[RecognizerResult] | |
): | |
'''Anonymize identified input using Presidio Anonymizer.''' | |
if not text: | |
return | |
res = anonymizer.anonymize( | |
text, | |
analyze_results, | |
operators={ | |
'STUDENT': OperatorConfig('custom', | |
{'lambda': generate_surrogate}), | |
'EMAIL_ADDRESS': OperatorConfig('replace', | |
{'new_value': '[email protected]'}), | |
'PHONE_NUMBER': OperatorConfig('replace', | |
{'new_value': '888-888-8888'}), | |
'URL': OperatorConfig('replace', | |
{'new_value': 'aol.com'}), | |
} | |
) | |
return res.text | |
if __name__ == '__main__': | |
print(generate_surrogate('Nora Wang')) |