langdonholmes committed on
Commit
a002c38
1 Parent(s): 2db3ee4
Files changed (8) hide show
  1. .gitignore +1 -0
  2. Pipfile +3 -0
  3. Pipfile.lock +122 -48
  4. anonymizer.py +13 -0
  5. app.py +0 -1
  6. main.py +64 -4
  7. models/anonymize.py +10 -0
  8. test_main.py +35 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
  __pycache__/*
 
 
1
  __pycache__/*
2
+ data/ascii_names.parquet
Pipfile CHANGED
@@ -15,6 +15,9 @@ tokenizers = "==0.12.1"
15
  torch = "==1.12.0"
16
  en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
17
  names-dataset = "*"
 
 
 
18
 
19
  [dev-packages]
20
 
 
15
  torch = "==1.12.0"
16
  en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
17
  names-dataset = "*"
18
+ fastapi = "*"
19
+ httpx = "*"
20
+ uvicorn = "*"
21
 
22
  [dev-packages]
23
 
Pipfile.lock CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_meta": {
3
  "hash": {
4
- "sha256": "6a4aa8c782c5b5fd8f5f0b3d7ba6cb6541f37295823bdee26d3fd575533c5999"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
@@ -24,6 +24,14 @@
24
  "markers": "python_version >= '3.7'",
25
  "version": "==4.2.2"
26
  },
 
 
 
 
 
 
 
 
27
  "attrs": {
28
  "hashes": [
29
  "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
@@ -270,6 +278,14 @@
270
  "markers": "python_version >= '3.6'",
271
  "version": "==0.4"
272
  },
 
 
 
 
 
 
 
 
273
  "filelock": {
274
  "hashes": [
275
  "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de",
@@ -288,11 +304,19 @@
288
  },
289
  "gitpython": {
290
  "hashes": [
291
- "sha256:769c2d83e13f5d938b7688479da374c4e3d49f71549aaf462b646db9602ea6f8",
292
- "sha256:cd455b0000615c60e286208ba540271af9fe531fa6a87cc590a7298785ab2882"
293
  ],
294
  "markers": "python_version >= '3.7'",
295
- "version": "==3.1.30"
 
 
 
 
 
 
 
 
296
  },
297
  "htbuilder": {
298
  "hashes": [
@@ -301,13 +325,29 @@
301
  "markers": "python_version >= '3.5'",
302
  "version": "==0.6.1"
303
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  "huggingface-hub": {
305
  "hashes": [
306
- "sha256:93809eabbfb2058a808bddf8b2a70f645de3f9df73ce87ddf5163d4c74b71c0c",
307
- "sha256:da82c9ec8f9d8f976ffd3fd8249d20bb35c2dd3145a9f7ca1106f0ebefd9afa0"
308
  ],
309
  "markers": "python_full_version >= '3.7.0'",
310
- "version": "==0.12.0"
311
  },
312
  "idna": {
313
  "hashes": [
@@ -351,11 +391,11 @@
351
  },
352
  "markdown-it-py": {
353
  "hashes": [
354
- "sha256:93de681e5c021a432c63147656fe21790bc01231e0cd2da73626f1aa3ac0fe27",
355
- "sha256:cf7e59fed14b5ae17c0006eff14a2d9a00ed5f3a846148153899a0224e2c07da"
356
  ],
357
  "markers": "python_version >= '3.7'",
358
- "version": "==2.1.0"
359
  },
360
  "markupsafe": {
361
  "hashes": [
@@ -1066,6 +1106,16 @@
1066
  ],
1067
  "version": "==1.5.1"
1068
  },
 
 
 
 
 
 
 
 
 
 
1069
  "rich": {
1070
  "hashes": [
1071
  "sha256:125d96d20c92b946b983d0d392b84ff945461e5a06d3867e9f9e575f8697b67f",
@@ -1084,11 +1134,11 @@
1084
  },
1085
  "setuptools": {
1086
  "hashes": [
1087
- "sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c",
1088
- "sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"
1089
  ],
1090
  "markers": "python_version >= '3.7'",
1091
- "version": "==67.2.0"
1092
  },
1093
  "six": {
1094
  "hashes": [
@@ -1114,6 +1164,14 @@
1114
  "markers": "python_version >= '3.6'",
1115
  "version": "==5.0.0"
1116
  },
 
 
 
 
 
 
 
 
1117
  "spacy": {
1118
  "hashes": [
1119
  "sha256:0a93797b9fea6ec1ecf3b95d86b8228d364470afac7278b23c13fd4305ad4ec2",
@@ -1240,6 +1298,14 @@
1240
  "index": "pypi",
1241
  "version": "==3.0.0"
1242
  },
 
 
 
 
 
 
 
 
1243
  "streamlit": {
1244
  "hashes": [
1245
  "sha256:0b3a9539e6ebcb8e5d57d16a846c3488143e1954174d7f1b2b40bf3e919302cc",
@@ -1414,11 +1480,11 @@
1414
  },
1415
  "typing-extensions": {
1416
  "hashes": [
1417
- "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
1418
- "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
1419
  ],
1420
  "markers": "python_version >= '3.7'",
1421
- "version": "==4.4.0"
1422
  },
1423
  "tzdata": {
1424
  "hashes": [
@@ -1444,6 +1510,14 @@
1444
  "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
1445
  "version": "==1.26.14"
1446
  },
 
 
 
 
 
 
 
 
1447
  "validators": {
1448
  "hashes": [
1449
  "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"
@@ -1460,45 +1534,45 @@
1460
  },
1461
  "watchdog": {
1462
  "hashes": [
1463
- "sha256:102a60093090fc3ff76c983367b19849b7cc24ec414a43c0333680106e62aae1",
1464
- "sha256:17f1708f7410af92ddf591e94ae71a27a13974559e72f7e9fde3ec174b26ba2e",
1465
- "sha256:195ab1d9d611a4c1e5311cbf42273bc541e18ea8c32712f2fb703cfc6ff006f9",
1466
- "sha256:4cb5ecc332112017fbdb19ede78d92e29a8165c46b68a0b8ccbd0a154f196d5e",
1467
- "sha256:5100eae58133355d3ca6c1083a33b81355c4f452afa474c2633bd2fbbba398b3",
1468
- "sha256:61fdb8e9c57baf625e27e1420e7ca17f7d2023929cd0065eb79c83da1dfbeacd",
1469
- "sha256:6ccd8d84b9490a82b51b230740468116b8205822ea5fdc700a553d92661253a3",
1470
- "sha256:6e01d699cd260d59b84da6bda019dce0a3353e3fcc774408ae767fe88ee096b7",
1471
- "sha256:748ca797ff59962e83cc8e4b233f87113f3cf247c23e6be58b8a2885c7337aa3",
1472
- "sha256:83a7cead445008e880dbde833cb9e5cc7b9a0958edb697a96b936621975f15b9",
1473
- "sha256:8586d98c494690482c963ffb24c49bf9c8c2fe0589cec4dc2f753b78d1ec301d",
1474
- "sha256:8b5cde14e5c72b2df5d074774bdff69e9b55da77e102a91f36ef26ca35f9819c",
1475
- "sha256:8c28c23972ec9c524967895ccb1954bc6f6d4a557d36e681a36e84368660c4ce",
1476
- "sha256:967636031fa4c4955f0f3f22da3c5c418aa65d50908d31b73b3b3ffd66d60640",
1477
- "sha256:96cbeb494e6cbe3ae6aacc430e678ce4b4dd3ae5125035f72b6eb4e5e9eb4f4e",
1478
- "sha256:978a1aed55de0b807913b7482d09943b23a2d634040b112bdf31811a422f6344",
1479
- "sha256:a09483249d25cbdb4c268e020cb861c51baab2d1affd9a6affc68ffe6a231260",
1480
- "sha256:a480d122740debf0afac4ddd583c6c0bb519c24f817b42ed6f850e2f6f9d64a8",
1481
- "sha256:adaf2ece15f3afa33a6b45f76b333a7da9256e1360003032524d61bdb4c422ae",
1482
- "sha256:bc43c1b24d2f86b6e1cc15f68635a959388219426109233e606517ff7d0a5a73",
1483
- "sha256:c27d8c1535fd4474e40a4b5e01f4ba6720bac58e6751c667895cbc5c8a7af33c",
1484
- "sha256:cdcc23c9528601a8a293eb4369cbd14f6b4f34f07ae8769421252e9c22718b6f",
1485
- "sha256:cece1aa596027ff56369f0b50a9de209920e1df9ac6d02c7f9e5d8162eb4f02b",
1486
- "sha256:d0f29fd9f3f149a5277929de33b4f121a04cf84bb494634707cfa8ea8ae106a8",
1487
- "sha256:d6b87477752bd86ac5392ecb9eeed92b416898c30bd40c7e2dd03c3146105646",
1488
- "sha256:e038be858425c4f621900b8ff1a3a1330d9edcfeaa1c0468aeb7e330fb87693e",
1489
- "sha256:e618a4863726bc7a3c64f95c218437f3349fb9d909eb9ea3a1ed3b567417c661",
1490
- "sha256:f8ac23ff2c2df4471a61af6490f847633024e5aa120567e08d07af5718c9d092"
1491
  ],
1492
  "markers": "platform_system != 'Darwin'",
1493
- "version": "==2.2.1"
1494
  },
1495
  "zipp": {
1496
  "hashes": [
1497
- "sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6",
1498
- "sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b"
1499
  ],
1500
  "markers": "python_version >= '3.7'",
1501
- "version": "==3.13.0"
1502
  }
1503
  },
1504
  "develop": {}
 
1
  {
2
  "_meta": {
3
  "hash": {
4
+ "sha256": "9cf8ce38b07b8e9be412869628fa94aa5e8444cfda715ed26c2dc73d547e2d9a"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
 
24
  "markers": "python_version >= '3.7'",
25
  "version": "==4.2.2"
26
  },
27
+ "anyio": {
28
+ "hashes": [
29
+ "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421",
30
+ "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"
31
+ ],
32
+ "markers": "python_full_version >= '3.6.2'",
33
+ "version": "==3.6.2"
34
+ },
35
  "attrs": {
36
  "hashes": [
37
  "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836",
 
278
  "markers": "python_version >= '3.6'",
279
  "version": "==0.4"
280
  },
281
+ "fastapi": {
282
+ "hashes": [
283
+ "sha256:023a0f5bd2c8b2609014d3bba1e14a1d7df96c6abea0a73070621c9862b9a4de",
284
+ "sha256:ae7b97c778e2f2ec3fb3cb4fb14162129411d99907fb71920f6d69a524340ebf"
285
+ ],
286
+ "index": "pypi",
287
+ "version": "==0.92.0"
288
+ },
289
  "filelock": {
290
  "hashes": [
291
  "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de",
 
304
  },
305
  "gitpython": {
306
  "hashes": [
307
+ "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573",
308
+ "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"
309
  ],
310
  "markers": "python_version >= '3.7'",
311
+ "version": "==3.1.31"
312
+ },
313
+ "h11": {
314
+ "hashes": [
315
+ "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d",
316
+ "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"
317
+ ],
318
+ "markers": "python_version >= '3.7'",
319
+ "version": "==0.14.0"
320
  },
321
  "htbuilder": {
322
  "hashes": [
 
325
  "markers": "python_version >= '3.5'",
326
  "version": "==0.6.1"
327
  },
328
+ "httpcore": {
329
+ "hashes": [
330
+ "sha256:c5d6f04e2fc530f39e0c077e6a30caa53f1451096120f1f38b954afd0b17c0cb",
331
+ "sha256:da1fb708784a938aa084bde4feb8317056c55037247c787bd7e19eb2c2949dc0"
332
+ ],
333
+ "markers": "python_version >= '3.7'",
334
+ "version": "==0.16.3"
335
+ },
336
+ "httpx": {
337
+ "hashes": [
338
+ "sha256:9818458eb565bb54898ccb9b8b251a28785dd4a55afbc23d0eb410754fe7d0f9",
339
+ "sha256:a211fcce9b1254ea24f0cd6af9869b3d29aba40154e947d2a07bb499b3e310d6"
340
+ ],
341
+ "index": "pypi",
342
+ "version": "==0.23.3"
343
+ },
344
  "huggingface-hub": {
345
  "hashes": [
346
+ "sha256:6f960f6246ef9c3446d0d6275e853485515682c350917fdaf2a59705f8b9ebb3",
347
+ "sha256:867586cc8543fe1bd43a219fedbea7d71690021ad80f0c46f35c4751069278d7"
348
  ],
349
  "markers": "python_full_version >= '3.7.0'",
350
+ "version": "==0.12.1"
351
  },
352
  "idna": {
353
  "hashes": [
 
391
  },
392
  "markdown-it-py": {
393
  "hashes": [
394
+ "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30",
395
+ "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"
396
  ],
397
  "markers": "python_version >= '3.7'",
398
+ "version": "==2.2.0"
399
  },
400
  "markupsafe": {
401
  "hashes": [
 
1106
  ],
1107
  "version": "==1.5.1"
1108
  },
1109
+ "rfc3986": {
1110
+ "extras": [
1111
+ "idna2008"
1112
+ ],
1113
+ "hashes": [
1114
+ "sha256:270aaf10d87d0d4e095063c65bf3ddbc6ee3d0b226328ce21e036f946e421835",
1115
+ "sha256:a86d6e1f5b1dc238b218b012df0aa79409667bb209e58da56d0b94704e712a97"
1116
+ ],
1117
+ "version": "==1.5.0"
1118
+ },
1119
  "rich": {
1120
  "hashes": [
1121
  "sha256:125d96d20c92b946b983d0d392b84ff945461e5a06d3867e9f9e575f8697b67f",
 
1134
  },
1135
  "setuptools": {
1136
  "hashes": [
1137
+ "sha256:e5fd0a713141a4a105412233c63dc4e17ba0090c8e8334594ac790ec97792330",
1138
+ "sha256:f106dee1b506dee5102cc3f3e9e68137bbad6d47b616be7991714b0c62204251"
1139
  ],
1140
  "markers": "python_version >= '3.7'",
1141
+ "version": "==67.4.0"
1142
  },
1143
  "six": {
1144
  "hashes": [
 
1164
  "markers": "python_version >= '3.6'",
1165
  "version": "==5.0.0"
1166
  },
1167
+ "sniffio": {
1168
+ "hashes": [
1169
+ "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101",
1170
+ "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"
1171
+ ],
1172
+ "markers": "python_version >= '3.7'",
1173
+ "version": "==1.3.0"
1174
+ },
1175
  "spacy": {
1176
  "hashes": [
1177
  "sha256:0a93797b9fea6ec1ecf3b95d86b8228d364470afac7278b23c13fd4305ad4ec2",
 
1298
  "index": "pypi",
1299
  "version": "==3.0.0"
1300
  },
1301
+ "starlette": {
1302
+ "hashes": [
1303
+ "sha256:774f1df1983fd594b9b6fb3ded39c2aa1979d10ac45caac0f4255cbe2acb8628",
1304
+ "sha256:854c71e73736c429c2bdb07801f2c76c9cba497e7c3cf4988fde5e95fe4cdb3c"
1305
+ ],
1306
+ "markers": "python_version >= '3.7'",
1307
+ "version": "==0.25.0"
1308
+ },
1309
  "streamlit": {
1310
  "hashes": [
1311
  "sha256:0b3a9539e6ebcb8e5d57d16a846c3488143e1954174d7f1b2b40bf3e919302cc",
 
1480
  },
1481
  "typing-extensions": {
1482
  "hashes": [
1483
+ "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb",
1484
+ "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"
1485
  ],
1486
  "markers": "python_version >= '3.7'",
1487
+ "version": "==4.5.0"
1488
  },
1489
  "tzdata": {
1490
  "hashes": [
 
1510
  "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
1511
  "version": "==1.26.14"
1512
  },
1513
+ "uvicorn": {
1514
+ "hashes": [
1515
+ "sha256:a4e12017b940247f836bc90b72e725d7dfd0c8ed1c51eb365f5ba30d9f5127d8",
1516
+ "sha256:c3ed1598a5668208723f2bb49336f4509424ad198d6ab2615b7783db58d919fd"
1517
+ ],
1518
+ "index": "pypi",
1519
+ "version": "==0.20.0"
1520
+ },
1521
  "validators": {
1522
  "hashes": [
1523
  "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"
 
1534
  },
1535
  "watchdog": {
1536
  "hashes": [
1537
+ "sha256:00f93782c67042d9525ec51628330b5faf5fb84bcb7ebaac05ea8528cfb20bba",
1538
+ "sha256:0f7d759299ce21a3d2a77e18d430c24811369c3432453701790acc6ff45a7101",
1539
+ "sha256:139262f678b4e6a7013261c772059bca358441de04fb0e0087489a34db9e3db0",
1540
+ "sha256:15bf5b165d7a6b48265411dad74fb0d33053f8270eb6575faad0e016035cf9f7",
1541
+ "sha256:1d9c656495172873bf1ddc7e39e80055fcdd21c4608cf68f23a28116dcba0b43",
1542
+ "sha256:242e57253e84a736e6777ba756c48cf6a68d3d90cb9e01bd6bfd371a949ace3a",
1543
+ "sha256:3fa1572f5a2f6d17d4d860edbc04488fef31b007c25c2f3b11203fb8179b7c67",
1544
+ "sha256:3fa74b0ef4825f9112932675a002296cb2d3d3e400d7a44c32fafd1ecc83ada0",
1545
+ "sha256:43d76d7888b26850b908208bb82383a193e8b0f25d0abaa84452f191b4acdea4",
1546
+ "sha256:45c13e7e6eea1013da419bf9aa9a8f5df7bbf3e5edce40bc6df84130febf39d5",
1547
+ "sha256:473164a2de473f708ca194a992466eeefff73b58273bbb88e089c5a5a98fcda1",
1548
+ "sha256:4e648df44a4c6ea6da4d9eb6722745c986b9d70268f25ae60f140082d7c8908e",
1549
+ "sha256:5ddbbe87f9ed726940d174076da030cd01ec45433ef2b1b2e6094c84f2af17f1",
1550
+ "sha256:6d79b5954db8f41d6a7f5763042b988f7a4afd40b7d141456061fa7c5b7f2159",
1551
+ "sha256:7767a3da3307d9cf597832f692702441a97c259e5d0d560f2e57c43ad0d191d2",
1552
+ "sha256:8863913ea2c3f256d18c33d84546518636e391cd8f50d209b9a31221e0f7d3fd",
1553
+ "sha256:8a214955769d2ef0f7aaa82f31863e3bdf6b083ce1b5f1c2e85cab0f66fba024",
1554
+ "sha256:982f5416a2817003172994d865285dd6a2b3836f033cd3fa87d1a62096a162cc",
1555
+ "sha256:9d39effe6909be898ba3e7286a9e9b17a6a9f734fb1ef9dde3e9bb68715fca39",
1556
+ "sha256:9e651b4874477c1bf239417d43818bbfd047aaf641b029fa60d6f5109ede0db0",
1557
+ "sha256:a3559ee82a10976de1ec544b6ebe3b4aa398d491860a283d80ec0f550076d068",
1558
+ "sha256:a4b9bece40d46bf6fb8621817ea7d903eae2b9b3ebac55a51ed50354a79061a8",
1559
+ "sha256:a623de186477e9e05f8461087f856412eae5cd005cc4bcb232ed5c6f9a8709f5",
1560
+ "sha256:aa4773160b9cb21ba369cb42d59a947087330b3a02480173033a6a6cc137a510",
1561
+ "sha256:c1b3962e5463a848ba2a342cb66c80251dca27a102933b8f38d231d2a9e5a543",
1562
+ "sha256:d04662017efd00a014cff9068708e085d67f2fac43f48bbbb95a7f97490487f3",
1563
+ "sha256:ebe756f788cb130fdc5c150ea8a4fda39cb4ee3a5873a345607c8b84fecf018b",
1564
+ "sha256:f1a655f4a49f9232311b9967f42cc2eaf43fd4903f3bed850dd4570fda5d5eff"
1565
  ],
1566
  "markers": "platform_system != 'Darwin'",
1567
+ "version": "==2.3.0"
1568
  },
1569
  "zipp": {
1570
  "hashes": [
1571
+ "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b",
1572
+ "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"
1573
  ],
1574
  "markers": "python_version >= '3.7'",
1575
+ "version": "==3.15.0"
1576
  }
1577
  },
1578
  "develop": {}
anonymizer.py CHANGED
@@ -20,6 +20,9 @@ class surrogate_anonymizer(AnonymizerEngine):
20
  self.names_db = NameDatabase()
21
  self.names_df = pd.read_parquet(name_table)
22
 
 
 
 
23
  def get_random_name(
24
  self,
25
  country: Optional[str] = None,
@@ -72,6 +75,10 @@ class surrogate_anonymizer(AnonymizerEngine):
72
  # 'PII'. Bypass this test.
73
  return 'PII'
74
 
 
 
 
 
75
  first_names, last_names = self.split_name(original_name)
76
  gender = self.names_db.get_gender(first_names) if first_names else None
77
  logger.debug(f'Gender set to {gender}')
@@ -90,6 +97,9 @@ class surrogate_anonymizer(AnonymizerEngine):
90
  surrogate_name += ' ' + name_candidates.iloc[1]['last']
91
 
92
  logger.info(f'Returning surrogate name {surrogate_name}')
 
 
 
93
  return surrogate_name
94
 
95
  def anonymize(
@@ -128,7 +138,9 @@ class surrogate_anonymizer(AnonymizerEngine):
128
 
129
  if __name__ == '__main__':
130
  logging.basicConfig(level=logging.DEBUG)
 
131
  anonymizer = surrogate_anonymizer()
 
132
  test_names = ['Nora Wang',
133
  'MJ',
134
  '',
@@ -136,5 +148,6 @@ if __name__ == '__main__':
136
  'Mario Escobar Sanchez',
137
  'Jane Fonda Michelle Rousseau',
138
  'Sir Phillipe Ricardo de la Sota Mayor']
 
139
  for name in test_names:
140
  anonymizer.generate_surrogate(name)
 
20
  self.names_db = NameDatabase()
21
  self.names_df = pd.read_parquet(name_table)
22
 
23
+ # keep track of names we have seen
24
+ self.seen_names = dict()
25
+
26
  def get_random_name(
27
  self,
28
  country: Optional[str] = None,
 
75
  # 'PII'. Bypass this test.
76
  return 'PII'
77
 
78
+ # If we have seen this name before, return the same surrogate
79
+ if original_name in self.seen_names:
80
+ return self.seen_names[original_name]
81
+
82
  first_names, last_names = self.split_name(original_name)
83
  gender = self.names_db.get_gender(first_names) if first_names else None
84
  logger.debug(f'Gender set to {gender}')
 
97
  surrogate_name += ' ' + name_candidates.iloc[1]['last']
98
 
99
  logger.info(f'Returning surrogate name {surrogate_name}')
100
+
101
+ self.seen_names[original_name] = surrogate_name
102
+
103
  return surrogate_name
104
 
105
  def anonymize(
 
138
 
139
  if __name__ == '__main__':
140
  logging.basicConfig(level=logging.DEBUG)
141
+
142
  anonymizer = surrogate_anonymizer()
143
+
144
  test_names = ['Nora Wang',
145
  'MJ',
146
  '',
 
148
  'Mario Escobar Sanchez',
149
  'Jane Fonda Michelle Rousseau',
150
  'Sir Phillipe Ricardo de la Sota Mayor']
151
+
152
  for name in test_names:
153
  anonymizer.generate_surrogate(name)
app.py CHANGED
@@ -3,7 +3,6 @@
3
 
4
  from analyzer import prepare_analyzer
5
  from anonymizer import surrogate_anonymizer
6
- from presidio_anonymizer import AnonymizerEngine
7
  import pandas as pd
8
  from annotated_text import annotated_text
9
  from json import JSONEncoder
 
3
 
4
  from analyzer import prepare_analyzer
5
  from anonymizer import surrogate_anonymizer
 
6
  import pandas as pd
7
  from annotated_text import annotated_text
8
  from json import JSONEncoder
main.py CHANGED
@@ -1,9 +1,69 @@
1
- import typer
2
 
 
 
 
 
 
 
3
 
4
- def main():
5
- typer.echo("Hello World")
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  if __name__ == "__main__":
9
- typer.run(main)
 
 
 
 
 
 
 
 
1
+ '''API for PIILO'''
2
 
3
+ from analyzer import prepare_analyzer
4
+ from anonymizer import surrogate_anonymizer
5
+ from fastapi import FastAPI
6
+ import logging
7
+ from models.anonymize import AnonymizeRequest, AnonymizeResponse
8
+ from fastapi.middleware.cors import CORSMiddleware
9
 
10
# Logging: root handler at INFO; this module logs under the 'api' name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('api')

# Settings passed to prepare_analyzer: a spaCy NLP engine that uses the
# 'en_student_name_detector' model for English ('en') text.
configuration = {
    'nlp_engine_name': 'spacy',
    'models': [{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
}

# The custom Presidio analyzer and anonymizer are created once at import
# time; the /anonymize route reuses these module-level instances.
logger.info("Loading Presidio Analyzer and Anonymizer")
analyzer = prepare_analyzer(configuration)
anonymizer = surrogate_anonymizer()
logger.info("Loaded Presidio Analyzer and Anonymizer")

# FastAPI application object (imported by test_main.py and by uvicorn).
app = FastAPI()

# CORS middleware configured with wildcard origins, methods and headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
37
+
38
+ # Define FastAPI routes
39
@app.get("/")
def hello():
    """Return the fixed greeting payload served at the root route."""
    greeting = {"message": "Hello World"}
    return greeting
42
+
43
@app.post("/anonymize")
def anonymize(anon_req: AnonymizeRequest) -> AnonymizeResponse:
    """Detect PII in the request text and return the anonymized text.

    The module-level analyzer is run over ``anon_req.raw_text`` with the
    request's ``entities`` and ``language``; its results are then handed,
    together with the raw text, to the module-level anonymizer.

    Args:
        anon_req: Request body carrying the raw text and analyzer options.

    Returns:
        An ``AnonymizeResponse`` whose ``anonymized_text`` is the
        anonymizer's output.
    """
    source_text = anon_req.raw_text

    # Step 1: locate PII spans in the submitted text.
    detections = analyzer.analyze(
        source_text,
        entities=anon_req.entities,
        language=anon_req.language,
    )

    # Step 2: rewrite the text using the detected spans.
    rewritten = anonymizer.anonymize(source_text, detections)

    return AnonymizeResponse(anonymized_text=rewritten)
60
 
61
  if __name__ == "__main__":
62
+ import uvicorn
63
+ import os
64
+
65
+ uvicorn.run(
66
+ "main:app", host="0.0.0.0",
67
+ port=int(os.environ.get("PORT", 8000)),
68
+ reload=True,
69
+ )
models/anonymize.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import Optional
3
+
4
class AnonymizeRequest(BaseModel):
    """Request body accepted by the /anonymize route."""

    # Text to be analyzed and anonymized (required).
    raw_text: str
    # Passed to the analyzer as its `entities` argument; defaults to None.
    entities: Optional[list] = None
    # Passed to the analyzer as its `language` argument; defaults to 'en'.
    language: Optional[str] = 'en'
8
+
9
class AnonymizeResponse(BaseModel):
    """Response body returned by the /anonymize route."""

    # Output of the anonymizer for the submitted text.
    anonymized_text: str
test_main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+
3
+ from main import app
4
+ import logging
5
+
6
# Log at INFO under the same 'api' logger name that main.py uses.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('api')

# Test client bound to the application under test; shared by all tests.
client = TestClient(app)
10
+
11
def test_read_main():
    """GET / responds with 200 and the greeting payload."""
    expected_body = {"message": "Hello World"}

    reply = client.get("/")

    assert reply.status_code == 200
    assert reply.json() == expected_body
15
+
16
def test_email():
    """POST /anonymize with an email-like token and compare the reply.

    NOTE(review): both string literals below appear as the placeholder
    '[email protected]' in this view of the file; confirm the real values
    against the repository before relying on this expectation.
    """
    request_body = {"raw_text": "My name is [email protected]"}
    expected_body = {
        "anonymized_text": "My name is [email protected]"
    }

    reply = client.post("/anonymize", json=request_body)

    assert reply.status_code == 200
    assert reply.json() == expected_body
24
+
25
def test_name():
    """POST /anonymize with a personal name and check the response shape.

    The request must succeed and the JSON body must carry a string
    ``anonymized_text`` field, matching the AnonymizeResponse model.
    The surrogate text itself is only logged, not asserted.
    NOTE(review): presumably the replacement depends on the detector model
    and on name sampling, so no exact value is pinned here; confirm.
    """
    response = client.post("/anonymize",
                           json={"raw_text": "My name is Nora Wang"},
                           )
    assert response.status_code == 200

    payload = response.json()
    logger.info(payload)

    # Previously only the status code was checked; also verify the body
    # has the field the endpoint's response model declares.
    assert isinstance(payload.get("anonymized_text"), str)
31
+
32
if __name__ == "__main__":
    # Run every check in declaration order when executed as a script.
    for check in (test_read_main, test_email, test_name):
        check()