piilo / match_replace.py
langdonholmes
inherited from names_database
b0e291c
raw
history blame
4.21 kB
import pandas as pd
from names_database import NameDatabase
# Module-level handle that describe_name uses for gender/country lookups.
# NOTE(review): this binds the NameDatabase class itself, not an instance --
# confirm get_gender/get_country are callable on the class, or whether
# ``NameDatabase()`` was intended.
names_db = NameDatabase
def describe_name(first_names, last_names):
    """Return a ``(gender, country)`` pair for the given name parts.

    Gender is looked up only when ``first_names`` is truthy and country only
    when ``last_names`` is truthy; the corresponding slot is ``None``
    otherwise.

    NOTE(review): neither argument is forwarded to ``names_db.get_gender()``
    or ``names_db.get_country()`` -- confirm against the NameDatabase API
    whether the names should be passed in.
    """
    gender = None
    country = None
    if first_names:
        gender = names_db.get_gender()
    if last_names:
        country = names_db.get_country()
    return gender, country
def split_name(all_names):
    """Split a whitespace-separated name into ``(first, last)`` parts.

    The number of tokens decides the split:

    * 1 token  -> ``(first, None)``
    * 2 tokens -> ``(first, last)``
    * 3 tokens -> one first name and two last names
    * 4 tokens -> two first names and two last names

    Any other token count (including zero) yields ``(None, None)``.
    Multi-token parts are re-joined with a single space.
    """
    tokens = all_names.split()
    count = len(tokens)
    if count == 1:
        return tokens[0], None
    if count == 2:
        return tokens[0], tokens[1]
    if count == 3:
        return tokens[0], ' '.join(tokens[1:])
    if count == 4:
        return ' '.join(tokens[:2]), ' '.join(tokens[2:])
    return None, None
def match_name(original_name):
    """Return a randomly chosen surrogate full name for ``original_name``.

    Only the first whitespace-separated token of ``original_name`` is used.
    One row of the module-level ``fb_df`` table whose ``first`` column equals
    that token is sampled, and ``replace_name`` then draws a replacement from
    the rows whose first name differs from the original.

    :param original_name: the name to replace -> str
    :return: a surrogate ``"<first> <last>"`` string, or ``'Jane Doe'`` when
        the name is empty/whitespace-only or its first token is not in the
        table
    """
    # FIXME: slow -- the large data frame is scanned on every call.
    # FIXME: only the first name is used for now.
    # TODO: match on both parts? e.g. first name -> gender, last name ->
    #       country (gender does not apply to last names).
    # FIXME: sampling is random, so the same original name can map to
    #        different surrogates; there is no way to tell that two
    #        replaced names refer to the same person.
    tokens = original_name.split()
    if not tokens:
        # Nothing to look up; use the same fallback as an unknown name
        # instead of failing on tokens[0].
        return 'Jane Doe'
    first_name = tokens[0]

    # Compute the comparison once and reuse it for both selections.
    same_first = fb_df['first'] == first_name
    names = fb_df[same_first]
    if names.empty:
        return 'Jane Doe'

    name_df = names.sample(n=1)
    # Exclude rows with the original first name so it cannot be drawn again.
    new_df = fb_df[~same_first]
    return replace_name(name_df, new_df)
def replace_name(name_df, new_df):
    """Build a surrogate full name that shares gender and country with a row.

    :param name_df: data frame matching the original first name; the
        ``gender`` and ``country`` of its first row select the candidates
        (``match_name`` passes a single sampled row)
    :param new_df: candidate rows to draw from (``match_name`` passes the
        table with the original first name removed)
    :return: ``"<first> <last>"`` -> str; first and last name are sampled
        independently, so they may come from different rows
    :raises ValueError: propagated from ``sample`` when no candidate row
        matches both country and gender
    """
    # Take the stored values with .iloc[0]. to_string() returns rendered
    # text, which does not compare equal to non-string column values
    # (e.g. integer gender codes) and would leave the candidate pool empty.
    gender = name_df['gender'].iloc[0]
    country = name_df['country'].iloc[0]

    # Keep only candidates with the same country and the same gender.
    same_country = new_df['country'] == country
    same_gender = new_df['gender'] == gender
    candidates = new_df[same_country & same_gender]

    first = candidates['first'].sample(n=1).iloc[0]
    last = candidates['last'].sample(n=1).iloc[0]
    return f'{first} {last}'
def match_name_2(original_name):
    """Return a surrogate name matched on gender and country.

    Gender is inferred from the first token of ``original_name`` and country
    from its last token; a single-token name therefore serves as both.

    :param original_name: the name to replace -> str
    :return: surrogate ``"<first> <last>"`` produced by ``replace_name_2``
    :raises IndexError: if ``original_name`` has no tokens
    """
    global fb_df
    # Load the names table only when it is not already available; re-reading
    # the parquet file on every call is needlessly slow.
    if globals().get('fb_df') is None:
        fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    # FIXME: designed for a full name; a first-only or last-only name may
    #        need its own branch.
    tokens = original_name.split()
    gender = name_match_gender(tokens[0])
    country = name_match_country(tokens[-1])
    return replace_name_2(gender, country)
def name_match_country(last_name):
    """Pick a country for ``last_name`` from the module-level ``fb_df`` table.

    One row whose ``last`` column equals ``last_name`` is sampled at random
    and its ``country`` is returned as a string. When no row matches, the
    fallback ``'US'`` is returned.
    """
    hits = fb_df[fb_df['last'] == last_name]
    if hits.empty:
        return 'US'
    return hits['country'].sample(n=1).to_string(index=False)
def name_match_gender(first_name):
    """Pick a gender for ``first_name`` from the module-level ``fb_df`` table.

    One row whose ``first`` column equals ``first_name`` is sampled at random
    and its ``gender`` is returned as a string. When the first name is not in
    the table, the gender is sampled from the whole table instead, so an
    unknown name no longer makes ``sample`` fail on an empty selection.

    :param first_name: first name to look up -> str
    :return: gender value rendered as a string
    :raises ValueError: propagated from ``sample`` if the table has no rows
    """
    genders = fb_df.loc[fb_df['first'] == first_name, 'gender']
    if genders.empty:
        # Unknown first name: no gender can be inferred, so draw one from the
        # overall table rather than crashing.
        genders = fb_df['gender']
    return genders.sample(n=1).to_string(index=False)
def replace_name_2(gender, country):
    """Draw a random full name for the given gender and country.

    Candidates are the rows of the module-level ``fb_df`` table whose
    ``country`` and ``gender`` both match. The first and last name are
    sampled independently from those candidates and joined with a space.
    """
    # TODO: prevent returning the same name as the original
    wanted = (fb_df['country'] == country) & (fb_df['gender'] == gender)
    pool = fb_df[wanted]
    given = pool['first'].sample(n=1).to_string(index=False)
    family = pool['last'].sample(n=1).to_string(index=False)
    return ' '.join((given, family))
def replace_text(str_list):
    """Concatenate text pieces, substituting a surrogate for each tuple item.

    :param str_list: iterable of pieces; plain strings are kept as they are,
        while a tuple item is replaced by ``match_entity(item[0], item[1])``
        (presumably ``(text, entity label)`` -- confirm against the caller)
    :return: all pieces joined into one string, in order
    """
    pieces = []
    for item in str_list:
        if isinstance(item, tuple):
            item = match_entity(item[0], item[1])
        pieces.append(item)
    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(pieces)
if __name__ == "__main__":
    # Manual smoke test: load the names table into the module-level ``fb_df``
    # that the matching helpers read.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    # NOTE(review): match_entity is not defined in the visible part of this
    # file -- confirm it exists before running this script.
    print(match_entity('Nora Wang', 'STUDENT'))