File size: 4,209 Bytes
5806da1
 
b0e291c
5806da1
b0e291c
5806da1
b0e291c
 
 
 
5806da1
b0e291c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5806da1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd

from names_database import NameDatabase

# Shared handle that describe_name uses for its gender/country lookups.
# NOTE(review): this binds the NameDatabase class itself, not an instance
# (there are no call parentheses) -- confirm that get_gender()/get_country()
# are callable on the class; otherwise this probably should be NameDatabase().
names_db = NameDatabase

def describe_name(first_names, last_names):
    """Look up a gender and a country through the module-level ``names_db``.

    The arguments only act as switches: a lookup is performed when the
    corresponding argument is truthy, and the values themselves are not
    passed on to ``names_db``.
    NOTE(review): calling get_gender()/get_country() without the names looks
    unintended -- confirm against the NameDatabase API.

    :param first_names: truthy to request the gender lookup
    :param last_names: truthy to request the country lookup
    :return: (gender, country); each entry is None when its lookup was skipped
    """
    gender = None
    if first_names:
        gender = names_db.get_gender()

    country = None
    if last_names:
        country = names_db.get_country()

    return gender, country

def split_name(all_names):
    '''Split a whitespace-separated name into (first, last) parts.

    The number of tokens decides how they are grouped:
      1 token  -> first name only, last is None
      2 tokens -> first name, last name
      3 tokens -> one first name, two last names
      4 tokens -> two first names, two last names
    Any other count (including zero tokens) gives (None, None).
    Multi-token parts are re-joined with a single space.'''
    tokens = all_names.split()
    count = len(tokens)

    if count == 1:
        return tokens[0], None
    if count == 2:
        return tokens[0], tokens[1]
    if count == 3:
        return tokens[0], ' '.join(tokens[1:])
    if count == 4:
        return ' '.join(tokens[:2]), ' '.join(tokens[2:])
    return None, None

def match_name(original_name):
    """
    Return a random surrogate full name for ``original_name``.

    Only the first whitespace-separated token (treated as the first name) is
    used. One row of the module-level ``fb_df`` whose 'first' column equals
    that token is sampled, and ``replace_name`` then draws the surrogate from
    the rows that do NOT share that first name.

    :param original_name: name to replace -> str
    :return: surrogate name, or 'Jane Doe' when ``original_name`` contains no
        token or its first token is not present in ``fb_df`` -> str
    """
    # FIXME: slow on a large fb_df -- every call scans the whole frame.
    # FIXME: only the first name is used for now.
    # TODO: match on both parts? e.g. first name -> gender, last name -> country
    #       (gender does not apply to last names).
    # FIXME: the draw is random, so the same original name can be replaced by
    #        different surrogates; there is no way yet to tell that two
    #        surrogates stand for the same person.
    tokens = original_name.split()
    if not tokens:
        # Empty / whitespace-only input has no first name; indexing [0] here
        # would raise IndexError, so use the same fallback as "not found".
        return 'Jane Doe'
    first_name = tokens[0]

    # Compute the comparison once and reuse it for both the matching rows and
    # the "everything else" pool.
    same_first = fb_df['first'] == first_name
    names = fb_df[same_first]
    if names.empty:
        return 'Jane Doe'

    name_df = names.sample(n=1)
    # Exclude every row with the original first name so the surrogate cannot
    # reuse it.
    return replace_name(name_df, fb_df[~same_first])

def replace_name(name_df, new_df):
    """
    Build a surrogate full name sharing country and gender with ``name_df``.

    The first and last names are sampled independently, so they may come from
    two different rows of the candidate pool.

    :param name_df: df that match the original first name; the 'gender' and
        'country' of its first row are used -> data frame
    :param new_df: df that does not repeat with original name -> data frame
    :return: whole name '<first> <last>' that match country & gender, or
        'Jane Doe' when ``name_df`` is empty or no row of ``new_df`` has that
        country and gender -> str
    """
    if name_df.empty:
        return 'Jane Doe'

    # Read the values directly instead of going through display formatting
    # (to_string), so they compare cleanly against the column values.
    gender = name_df['gender'].iloc[0]
    country = name_df['country'].iloc[0]

    # match country, then match gender
    same_profile = (new_df['country'] == country) & (new_df['gender'] == gender)
    country_g_df = new_df[same_profile]
    if country_g_df.empty:
        # sample(n=1) raises ValueError on an empty pool; fall back to the
        # placeholder name instead of crashing.
        return 'Jane Doe'

    first = country_g_df['first'].sample(n=1).iloc[0]
    last = country_g_df['last'].sample(n=1).iloc[0]
    return f'{first} {last}'



def match_name_2(original_name):
    """
    Build a surrogate name by matching gender from the first name and country
    from the last name.

    The first whitespace-separated token is used as the first name and the
    final token as the last name (for a single-token name they are the same
    token). The module-level ``fb_df`` is loaded from
    'ascii_fb_names_small.parquet' the first time it is needed and reused on
    later calls.

    :param original_name: name to replace -> str
    :return: surrogate full name from ``replace_name_2``, or 'Jane Doe' when
        ``original_name`` contains no token -> str
    """
    global fb_df
    tokens = original_name.split()
    if not tokens:
        # Nothing to match on; indexing the token list would raise IndexError.
        return 'Jane Doe'

    # Re-reading the parquet file on every call is expensive; only load it
    # when no table has been loaded yet.
    if globals().get('fb_df') is None:
        fb_df = pd.read_parquet('ascii_fb_names_small.parquet')

    # FIXME: work when get a full name, may need branch to only first or last name....
    gender = name_match_gender(tokens[0])
    country = name_match_country(tokens[-1])
    return replace_name_2(gender, country)


def name_match_country(last_name):
    """
    Pick a country for ``last_name`` from the module-level ``fb_df``.

    :param last_name: value compared against the 'last' column
    :return: the 'country' of one randomly sampled matching row, rendered
        with ``to_string(index=False)``; 'US' when no row matches -> str
    """
    matches = fb_df[fb_df['last'] == last_name]
    if matches.empty:
        return 'US'
    picked = matches['country'].sample(n=1)
    return picked.to_string(index=False)

def name_match_gender(first_name):
    """
    Pick a gender for ``first_name`` from the module-level ``fb_df``.

    One row whose 'first' column equals ``first_name`` is sampled at random.
    When no row matches, the gender is sampled from the whole table instead,
    because ``sample(n=1)`` on an empty selection raises ValueError.

    :param first_name: value compared against the 'first' column
    :return: the sampled 'gender' value rendered with
        ``to_string(index=False)`` -> str
    :raises ValueError: if ``fb_df`` itself has no rows to sample from
    """
    names = fb_df[fb_df['first'] == first_name]
    # Unknown first name: fall back to the full table so a value in the
    # table's own gender encoding is still returned.
    pool = fb_df if names.empty else names
    gender = pool['gender'].sample(n=1).to_string(index=False)
    return gender

def replace_name_2(gender, country):
    """
    Draw a surrogate full name from the rows of the module-level ``fb_df``
    that have the given country and gender.

    The first and last names are sampled independently, so they may come from
    two different rows.

    :param gender: value compared against the 'gender' column
    :param country: value compared against the 'country' column
    :return: '<first> <last>', or 'Jane Doe' when no row has that country and
        gender -> str
    """
    # TODO: prevent same name
    same_profile = (fb_df['country'] == country) & (fb_df['gender'] == gender)
    country_g_df = fb_df[same_profile]
    if country_g_df.empty:
        # sample(n=1) raises ValueError on an empty pool; fall back to the
        # placeholder name instead of crashing.
        return 'Jane Doe'

    first = country_g_df['first'].sample(n=1).iloc[0]
    last = country_g_df['last'].sample(n=1).iloc[0]
    full_name = f'{first} {last}'
    return full_name

def replace_text(str_list):
    """
    Rebuild a text from its pieces, swapping tuple pieces for surrogates.

    Plain string pieces are kept as they are. A tuple piece is replaced by
    ``match_entity(piece[0], piece[1])``.
    NOTE(review): match_entity is not defined in the part of the file shown
    here -- confirm it is available at module level. The meaning of the two
    tuple fields is presumably (text, entity label); verify against the caller.

    :param str_list: iterable of str and tuple pieces, in output order
    :return: all resulting pieces concatenated in order ('' for no pieces) -> str
    """
    # Collect the pieces and join once rather than growing a string with +=.
    pieces = [
        match_entity(piece[0], piece[1]) if isinstance(piece, tuple) else piece
        for piece in str_list
    ]
    return ''.join(pieces)

if __name__ == "__main__":
    # Manual smoke run: load the names table into the module-level fb_df that
    # the matching functions read, then print one surrogate.
    fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
    # print(matching("PH", 'female', 'first', 'Momo', fb_df))
    # NOTE(review): match_entity is not defined in the part of the file shown
    # here -- confirm it exists before relying on this entry point.
    surrogate = match_entity('Nora Wang', 'STUDENT')
    print(surrogate)