langdonholmes committed on
Commit
af52489
1 Parent(s): 14475a4

add functionality for surrogate name replacement

Browse files
Files changed (3) hide show
  1. Pipfile +21 -0
  2. Pipfile.lock +0 -0
  3. app.py +14 -2
Pipfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ pandas = "==1.4.3"
8
+ presidio-analyzer = "==2.2.28"
9
+ presidio-anonymizer = "==2.2.28"
10
+ spacy = "==3.4.1"
11
+ spacy-transformers = "==1.1.7"
12
+ st-annotated-text = "==3.0.0"
13
+ streamlit = "==1.17.0"
14
+ tokenizers = "==0.12.1"
15
+ torch = "==1.12.0"
16
+ en-student-name-detector = {file = "https://huggingface.co/langdonholmes/en_student_name_detector/resolve/main/en_student_name_detector-any-py3-none-any.whl"}
17
+
18
+ [dev-packages]
19
+
20
+ [requires]
21
+ python_version = "3.10"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -6,6 +6,7 @@ from spacy_recognizer import CustomSpacyRecognizer
6
  from presidio_analyzer.nlp_engine import NlpEngineProvider
7
  from presidio_anonymizer import AnonymizerEngine
8
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
 
9
  import pandas as pd
10
  from annotated_text import annotated_text
11
  from json import JSONEncoder
@@ -63,12 +64,23 @@ def analyze(**kwargs):
63
  kwargs["entities"] = None
64
  return analyzer_engine().analyze(**kwargs)
65
 
 
 
 
 
 
 
66
 
67
  def anonymize(text, analyze_results):
68
  """Anonymize identified input using Presidio Anonymizer."""
69
  if not text:
70
  return
71
- res = anonymizer_engine().anonymize(text, analyze_results)
 
 
 
 
 
72
  return res.text
73
 
74
 
@@ -127,7 +139,7 @@ analyzer_load_state.empty()
127
 
128
  st_text = st.text_area(
129
  label="Type in some text",
130
- value="Learning Reflection\n\nJohn Williams\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]",
131
  height=200,
132
  )
133
 
 
6
  from presidio_analyzer.nlp_engine import NlpEngineProvider
7
  from presidio_anonymizer import AnonymizerEngine
8
  from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
9
+ from presidio_anonymizer.entities import OperatorConfig
10
  import pandas as pd
11
  from annotated_text import annotated_text
12
  from json import JSONEncoder
 
64
  kwargs["entities"] = None
65
  return analyzer_engine().analyze(**kwargs)
66
 
67
def generate_surrogate(name):
    """Return a surrogate (replacement) name for a detected name string.

    The mapping is a fixed placeholder: any input containing the given
    name "John" as a whole word is replaced by "Jill"; everything else is
    replaced by the generic marker "SURROGATE_NAME".

    Matching is done on whole alphabetic tokens rather than raw
    substrings, so "John Williams" and "John H. Williams" map to "Jill"
    while unrelated names that merely contain those letters ("Johnson",
    "Johnny") fall through to the generic marker.

    Args:
        name: The text of the detected entity. Non-string values
            (including ``None``) are tolerated and yield the generic
            marker instead of raising.

    Returns:
        str: The surrogate to substitute for ``name``. A string is always
        returned, whatever the input.
    """
    # Imported locally so the block does not depend on a module-level import.
    import re

    if not isinstance(name, str):
        return "SURROGATE_NAME"

    # Runs of letters only: punctuation, digits and underscores act as
    # separators, so "John's" and "John-Paul" still expose the token "John".
    tokens = re.findall(r"[^\W\d_]+", name)
    if "John" in tokens:
        return "Jill"
    return "SURROGATE_NAME"
73
 
def anonymize(text, analyze_results):
    """Anonymize ``text`` with the Presidio anonymizer and return the result.

    Entities labelled "STUDENT" are handled by a "custom" operator whose
    lambda is ``generate_surrogate``. No operator is configured here for
    any other entity type.

    Args:
        text: The input text. If it is falsy (empty string, ``None``)
            nothing is anonymized.
        analyze_results: Analyzer results for ``text``, forwarded
            unchanged to the anonymizer engine.

    Returns:
        The ``text`` attribute of the engine's result, or ``None`` when
        ``text`` is falsy.
    """
    if not text:
        return None

    engine = anonymizer_engine()
    student_operator = OperatorConfig("custom", {"lambda": generate_surrogate})
    outcome = engine.anonymize(
        text,
        analyze_results,
        operators={"STUDENT": student_operator},
    )
    return outcome.text
85
 
86
 
 
139
 
140
  st_text = st.text_area(
141
  label="Type in some text",
142
+ value="Learning Reflection\n\nJohn Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]",
143
  height=200,
144
  )
145