langdonholmes committed on
Commit
b0e291c
1 Parent(s): 6449ca4

inherited from names_database

Browse files
.gitignore CHANGED
@@ -1,3 +1 @@
1
- __pycache__\*
2
- __pycache__/spacy_analyzer.cpython-310.pyc
3
- __pycache__/spacy_recognizer.cpython-310.pyc
 
1
+ __pycache__/*
 
 
Pipfile CHANGED
@@ -14,6 +14,7 @@ streamlit = "==1.17.0"
14
  tokenizers = "==0.12.1"
15
  torch = "==1.12.0"
16
  en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
 
17
 
18
  [dev-packages]
19
 
 
14
  tokenizers = "==0.12.1"
15
  torch = "==1.12.0"
16
  en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
17
+ names-dataset = "*"
18
 
19
  [dev-packages]
20
 
Pipfile.lock CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_meta": {
3
  "hash": {
4
- "sha256": "95f6932d0b58210f580f1426df3f909ce879d49cb04320ceff02998ac500d614"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
@@ -463,39 +463,46 @@
463
  "markers": "python_version >= '3.6'",
464
  "version": "==1.0.9"
465
  },
 
 
 
 
 
 
 
466
  "numpy": {
467
  "hashes": [
468
- "sha256:0044f7d944ee882400890f9ae955220d29b33d809a038923d88e4e01d652acd9",
469
- "sha256:0e3463e6ac25313462e04aea3fb8a0a30fb906d5d300f58b3bc2c23da6a15398",
470
- "sha256:179a7ef0889ab769cc03573b6217f54c8bd8e16cef80aad369e1e8185f994cd7",
471
- "sha256:2386da9a471cc00a1f47845e27d916d5ec5346ae9696e01a8a34760858fe9dd2",
472
- "sha256:26089487086f2648944f17adaa1a97ca6aee57f513ba5f1c0b7ebdabbe2b9954",
473
- "sha256:28bc9750ae1f75264ee0f10561709b1462d450a4808cd97c013046073ae64ab6",
474
- "sha256:28e418681372520c992805bb723e29d69d6b7aa411065f48216d8329d02ba032",
475
- "sha256:442feb5e5bada8408e8fcd43f3360b78683ff12a4444670a7d9e9824c1817d36",
476
- "sha256:6ec0c021cd9fe732e5bab6401adea5a409214ca5592cd92a114f7067febcba0c",
477
- "sha256:7094891dcf79ccc6bc2a1f30428fa5edb1e6fb955411ffff3401fb4ea93780a8",
478
- "sha256:84e789a085aabef2f36c0515f45e459f02f570c4b4c4c108ac1179c34d475ed7",
479
- "sha256:87a118968fba001b248aac90e502c0b13606721b1343cdaddbc6e552e8dfb56f",
480
- "sha256:8e669fbdcdd1e945691079c2cae335f3e3a56554e06bbd45d7609a6cf568c700",
481
- "sha256:ad2925567f43643f51255220424c23d204024ed428afc5aad0f86f3ffc080086",
482
- "sha256:b0677a52f5d896e84414761531947c7a330d1adc07c3a4372262f25d84af7bf7",
483
- "sha256:b07b40f5fb4fa034120a5796288f24c1fe0e0580bbfff99897ba6267af42def2",
484
- "sha256:b09804ff570b907da323b3d762e74432fb07955701b17b08ff1b5ebaa8cfe6a9",
485
- "sha256:b162ac10ca38850510caf8ea33f89edcb7b0bb0dfa5592d59909419986b72407",
486
- "sha256:b31da69ed0c18be8b77bfce48d234e55d040793cebb25398e2a7d84199fbc7e2",
487
- "sha256:caf65a396c0d1f9809596be2e444e3bd4190d86d5c1ce21f5fc4be60a3bc5b36",
488
- "sha256:cfa1161c6ac8f92dea03d625c2d0c05e084668f4a06568b77a25a89111621566",
489
- "sha256:dae46bed2cb79a58d6496ff6d8da1e3b95ba09afeca2e277628171ca99b99db1",
490
- "sha256:ddc7ab52b322eb1e40521eb422c4e0a20716c271a306860979d450decbb51b8e",
491
- "sha256:de92efa737875329b052982e37bd4371d52cabf469f83e7b8be9bb7752d67e51",
492
- "sha256:e274f0f6c7efd0d577744f52032fdd24344f11c5ae668fe8d01aac0422611df1",
493
- "sha256:ed5fb71d79e771ec930566fae9c02626b939e37271ec285e9efaf1b5d4370e7d",
494
- "sha256:ef85cf1f693c88c1fd229ccd1055570cb41cdf4875873b7728b6301f12cd05bf",
495
- "sha256:f1b739841821968798947d3afcefd386fa56da0caf97722a5de53e07c4ccedc7"
496
  ],
497
  "markers": "python_version >= '3.10'",
498
- "version": "==1.24.1"
499
  },
500
  "packaging": {
501
  "hashes": [
@@ -542,10 +549,10 @@
542
  },
543
  "phonenumbers": {
544
  "hashes": [
545
- "sha256:2e3fd1f3fde226b289489275517c76edf223eafd9f43a2c2c36498a44b73d4b0",
546
- "sha256:6eb2faf29c19f946baf10f1c977a1f856cab90819fe7735b8e141d5407420c4a"
547
  ],
548
- "version": "==8.13.5"
549
  },
550
  "pillow": {
551
  "hashes": [
@@ -737,6 +744,13 @@
737
  "markers": "python_version >= '3.7'",
738
  "version": "==11.0.0"
739
  },
 
 
 
 
 
 
 
740
  "pycryptodome": {
741
  "hashes": [
742
  "sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
@@ -1070,11 +1084,11 @@
1070
  },
1071
  "setuptools": {
1072
  "hashes": [
1073
- "sha256:a7687c12b444eaac951ea87a9627c4f904ac757e7abdc5aac32833234af90378",
1074
- "sha256:e261cdf010c11a41cb5cb5f1bf3338a7433832029f559a6a7614bd42a967c300"
1075
  ],
1076
  "markers": "python_version >= '3.7'",
1077
- "version": "==67.1.0"
1078
  },
1079
  "six": {
1080
  "hashes": [
@@ -1480,11 +1494,11 @@
1480
  },
1481
  "zipp": {
1482
  "hashes": [
1483
- "sha256:73efd63936398aac78fd92b6f4865190119d6c91b531532e798977ea8dd402eb",
1484
- "sha256:9eb0a4c5feab9b08871db0d672745b53450d7f26992fd1e4653aa43345e97b86"
1485
  ],
1486
  "markers": "python_version >= '3.7'",
1487
- "version": "==3.12.0"
1488
  }
1489
  },
1490
  "develop": {}
 
1
  {
2
  "_meta": {
3
  "hash": {
4
+ "sha256": "6a4aa8c782c5b5fd8f5f0b3d7ba6cb6541f37295823bdee26d3fd575533c5999"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
 
463
  "markers": "python_version >= '3.6'",
464
  "version": "==1.0.9"
465
  },
466
+ "names-dataset": {
467
+ "hashes": [
468
+ "sha256:69eea12c9d97e1ae32b6db955bb9b39f7816eb2727d3c6abc726cb475ad160ac"
469
+ ],
470
+ "index": "pypi",
471
+ "version": "==3.1.0"
472
+ },
473
  "numpy": {
474
  "hashes": [
475
+ "sha256:003a9f530e880cb2cd177cba1af7220b9aa42def9c4afc2a2fc3ee6be7eb2b22",
476
+ "sha256:150947adbdfeceec4e5926d956a06865c1c690f2fd902efede4ca6fe2e657c3f",
477
+ "sha256:2620e8592136e073bd12ee4536149380695fbe9ebeae845b81237f986479ffc9",
478
+ "sha256:2eabd64ddb96a1239791da78fa5f4e1693ae2dadc82a76bc76a14cbb2b966e96",
479
+ "sha256:4173bde9fa2a005c2c6e2ea8ac1618e2ed2c1c6ec8a7657237854d42094123a0",
480
+ "sha256:4199e7cfc307a778f72d293372736223e39ec9ac096ff0a2e64853b866a8e18a",
481
+ "sha256:4cecaed30dc14123020f77b03601559fff3e6cd0c048f8b5289f4eeabb0eb281",
482
+ "sha256:557d42778a6869c2162deb40ad82612645e21d79e11c1dc62c6e82a2220ffb04",
483
+ "sha256:63e45511ee4d9d976637d11e6c9864eae50e12dc9598f531c035265991910468",
484
+ "sha256:6524630f71631be2dabe0c541e7675db82651eb998496bbe16bc4f77f0772253",
485
+ "sha256:76807b4063f0002c8532cfeac47a3068a69561e9c8715efdad3c642eb27c0756",
486
+ "sha256:7de8fdde0003f4294655aa5d5f0a89c26b9f22c0a58790c38fae1ed392d44a5a",
487
+ "sha256:889b2cc88b837d86eda1b17008ebeb679d82875022200c6e8e4ce6cf549b7acb",
488
+ "sha256:92011118955724465fb6853def593cf397b4a1367495e0b59a7e69d40c4eb71d",
489
+ "sha256:97cf27e51fa078078c649a51d7ade3c92d9e709ba2bfb97493007103c741f1d0",
490
+ "sha256:9a23f8440561a633204a67fb44617ce2a299beecf3295f0d13c495518908e910",
491
+ "sha256:a51725a815a6188c662fb66fb32077709a9ca38053f0274640293a14fdd22978",
492
+ "sha256:a77d3e1163a7770164404607b7ba3967fb49b24782a6ef85d9b5f54126cc39e5",
493
+ "sha256:adbdce121896fd3a17a77ab0b0b5eedf05a9834a18699db6829a64e1dfccca7f",
494
+ "sha256:c29e6bd0ec49a44d7690ecb623a8eac5ab8a923bce0bea6293953992edf3a76a",
495
+ "sha256:c72a6b2f4af1adfe193f7beb91ddf708ff867a3f977ef2ec53c0ffb8283ab9f5",
496
+ "sha256:d0a2db9d20117bf523dde15858398e7c0858aadca7c0f088ac0d6edd360e9ad2",
497
+ "sha256:e3ab5d32784e843fc0dd3ab6dcafc67ef806e6b6828dc6af2f689be0eb4d781d",
498
+ "sha256:e428c4fbfa085f947b536706a2fc349245d7baa8334f0c5723c56a10595f9b95",
499
+ "sha256:e8d2859428712785e8a8b7d2b3ef0a1d1565892367b32f915c4a4df44d0e64f5",
500
+ "sha256:eef70b4fc1e872ebddc38cddacc87c19a3709c0e3e5d20bf3954c147b1dd941d",
501
+ "sha256:f64bb98ac59b3ea3bf74b02f13836eb2e24e48e0ab0145bbda646295769bd780",
502
+ "sha256:f9006288bcf4895917d02583cf3411f98631275bc67cce355a7f39f8c14338fa"
503
  ],
504
  "markers": "python_version >= '3.10'",
505
+ "version": "==1.24.2"
506
  },
507
  "packaging": {
508
  "hashes": [
 
549
  },
550
  "phonenumbers": {
551
  "hashes": [
552
+ "sha256:1531b42c8c49a1f06b08598441bf1f11fe2618f707c6fc96b581b44aa4f2b0e3",
553
+ "sha256:f8bd92975ba7463b7828ae2f95e1037b7e0ab8f023e9e8ffb7c560fd7f5d66d7"
554
  ],
555
+ "version": "==8.13.6"
556
  },
557
  "pillow": {
558
  "hashes": [
 
744
  "markers": "python_version >= '3.7'",
745
  "version": "==11.0.0"
746
  },
747
+ "pycountry": {
748
+ "hashes": [
749
+ "sha256:b2163a246c585894d808f18783e19137cb70a0c18fb36748dc01fc6f109c1646"
750
+ ],
751
+ "markers": "python_version >= '3.6' and python_version < '4'",
752
+ "version": "==22.3.5"
753
+ },
754
  "pycryptodome": {
755
  "hashes": [
756
  "sha256:04779cc588ad8f13c80a060b0b1c9d1c203d051d8a43879117fe6b8aaf1cd3fa",
 
1084
  },
1085
  "setuptools": {
1086
  "hashes": [
1087
+ "sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c",
1088
+ "sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"
1089
  ],
1090
  "markers": "python_version >= '3.7'",
1091
+ "version": "==67.2.0"
1092
  },
1093
  "six": {
1094
  "hashes": [
 
1494
  },
1495
  "zipp": {
1496
  "hashes": [
1497
+ "sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6",
1498
+ "sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b"
1499
  ],
1500
  "markers": "python_version >= '3.7'",
1501
+ "version": "==3.13.0"
1502
  }
1503
  },
1504
  "develop": {}
data/ascii_fb_names_small.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baf5cf2fa43dc172c613f72793641f668e33c30b4e23932616de36cc0ce3447d
3
+ size 33601747
match_replace.py CHANGED
@@ -1,69 +1,31 @@
1
  import pandas as pd
2
 
3
def replace_name_old(country_code, gender, f_l, original_name, fb_df):
    """Pick a random surrogate name that differs from ``original_name``.

    Candidate pools are tried from most to least specific:

    * gender given: rows matching gender and country, then rows matching
      gender only;
    * gender not given: rows matching country, then the whole table.

    The first pool that holds at least one name other than
    ``original_name`` is sampled uniformly at random.

    Args:
        country_code: Value compared against the ``"country"`` column.
        gender: Value compared against the ``"gender"`` column; a falsy
            value (``None``, ``""``) means the gender is unknown.
        f_l: Key of the ``fb_df`` column to draw the surrogate from (the
            original note describes it as selecting first vs. last name).
        original_name: Name that must not be returned.
        fb_df: DataFrame with ``"gender"``, ``"country"`` and ``f_l``
            columns.

    Returns:
        The sampled cell value from column ``f_l``.

    Raises:
        ValueError: If no pool contains a name other than
            ``original_name``.
    """
    # TODO: [Old version] the priority of gender vs. country is still open.
    in_country = fb_df["country"] == country_code
    if gender:
        has_gender = fb_df["gender"] == gender
        pools = (fb_df[has_gender & in_country], fb_df[has_gender])
    else:
        pools = (fb_df[in_country], fb_df)

    for pool in pools:
        # Drop the original up front so one draw always suffices; the old
        # retry loop could spin forever on a pool holding only that name.
        candidates = pool.loc[pool[f_l] != original_name, f_l]
        if not candidates.empty:
            # .iloc[0] yields the bare value; Series.to_string() would
            # prepend the row's index label to the name.
            return candidates.sample(n=1).iloc[0]

    raise ValueError(
        f"no surrogate different from {original_name!r} found in column {f_l!r}"
    )
52
-
53
- def match_entity(original_info, entity):
54
- # TODO: need refinement for each kind of entity
55
- if entity == 'STUDENT':
56
- # TODO: here, change between 1 and 2
57
- return match_name_2(original_info)
58
- elif entity == 'EMAIL_ADDRESS':
59
- return '[email protected]'
60
- elif entity == 'PHONE_NUMBER':
61
- #TODO: specific form of number will be returned for consistency
62
- return '000-000-0000'
63
- elif entity == 'URL':
64
- return 'google.com'
65
- else:
66
- pass
67
 
68
  def match_name(original_name):
69
  # FIXME: take too LONG time to run (large df used multi-times), how to improve
@@ -74,7 +36,6 @@ def match_name(original_name):
74
  # FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
75
  first_name = original_name.split()[0]
76
  global fb_df
77
- fb_df = pd.read_parquet('ascii_fb_names_small.parquet')
78
  names = fb_df[fb_df['first']==first_name]
79
  if not names.empty:
80
  name_df = names.sample(n=1)
 
1
  import pandas as pd
2
 
3
+ from names_database import NameDatabase
4
+
5
# Shared lookup object. It must be an instance: get_gender/get_country are
# instance methods, so binding the bare class here would break every call.
names_db = NameDatabase()


def describe_name(first_names, last_names):
    '''Look up the gender and country associated with a name.

    first_names: given-name string, passed to names_db.get_gender; a falsy
        value (None or '') skips the lookup.
    last_names: family-name string, passed to names_db.get_country; a falsy
        value (None or '') skips the lookup.

    Returns a (gender, country) tuple; each element is None when the
    corresponding input was falsy.
    '''
    gender = names_db.get_gender(first_names) if first_names else None
    country = names_db.get_country(last_names) if last_names else None
    return gender, country
11
+
12
def split_name(all_names):
    '''Split a whitespace-separated full name into (first, last) parts.

    The token count decides how the parts are grouped:
      1 token  -> (first, None)
      2 tokens -> (first, last)
      3 tokens -> one first name, two last names
      4 tokens -> two first names, two last names
    Any other count (including an empty string) gives (None, None).
    '''
    tokens = all_names.split()
    token_count = len(tokens)

    if token_count == 1:
        return tokens[0], None
    if token_count == 2:
        return tokens[0], tokens[1]
    if token_count == 3:
        return tokens[0], ' '.join(tokens[1:])
    if token_count == 4:
        return ' '.join(tokens[:2]), ' '.join(tokens[2:])
    return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def match_name(original_name):
31
  # FIXME: take too LONG time to run (large df used multi-times), how to improve
 
36
  # FIXME: since it is completely random, the same original name may be diff after replacing. How to know whether the two names is the same person?
37
  first_name = original_name.split()[0]
38
  global fb_df
 
39
  names = fb_df[fb_df['first']==first_name]
40
  if not names.empty:
41
  name_df = names.sample(n=1)
names_database.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Optional

import pandas as pd
from names_dataset import NameDataset, NameWrapper
3
+
4
class NameDatabase(NameDataset):
    '''NameDataset extended with a parquet-backed table for sampling names.'''

    def __init__(
        self,
        names_path: str = 'ascii_fb_names_small.parquet'
    ) -> None:
        '''Load the base dataset and the table of names to sample from.

        names_path: location of the parquet file read into self.names.
        '''
        super().__init__()
        # NOTE(review): this commit adds the file under
        # data/ascii_fb_names_small.parquet, while the default here is
        # relative to the working directory - confirm which one is intended.
        self.names = pd.read_parquet(names_path)

    def get_random_name(
        self,
        country: Optional[str] = None,
        gender: Optional[str] = None
    ) -> pd.DataFrame:
        '''Sample one row from the names table, optionally filtered.

        country: ISO country code in 'alpha 2' format
        gender: "M" or "F"

        Returns the single-row DataFrame produced by DataFrame.sample.
        Raises ValueError when no row matches the given filters.
        '''
        names_view = self.names
        if country:
            names_view = names_view[names_view['country'] == country]
        if gender:
            names_view = names_view[names_view['gender'] == gender]
        if names_view.empty:
            raise ValueError(
                f'no names match country={country!r}, gender={gender!r}'
            )
        # Index the column explicitly: attribute access (names_view.count)
        # resolves to the DataFrame.count method, not to the data.
        # Assumes the table has a numeric 'count' column - TODO confirm.
        return names_view.sample(weights=names_view['count'])

    def get_gender(self, first_names: str):
        '''Return the gender NameWrapper reports for a first-name search.'''
        return NameWrapper(self.search(first_names)).gender

    def get_country(self, last_names: str):
        '''Return the country NameWrapper reports for a last-name search.'''
        return NameWrapper(self.search(last_names)).country