langdonholmes committed on
Commit
e9abb72
1 Parent(s): f0664d7
Files changed (3)
  1. app.py +199 -2
  2. requirements.txt +7 -0
  3. spacy_recognizer.py +131 -0
app.py CHANGED
@@ -1,4 +1,201 @@
+
+"""Streamlit app for Student Name Detection models."""
+
+import spacy
+from spacy_recognizer import CustomSpacyRecognizer
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+import pandas as pd
+from annotated_text import annotated_text
+from json import JSONEncoder
+import json
+import warnings
 import streamlit as st
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings('ignore')
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+# Helper methods
+@st.cache(allow_output_mutation=True)
+def analyzer_engine():
+    """Return AnalyzerEngine."""
+
+    spacy_recognizer = CustomSpacyRecognizer()
+
+    configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [
+            {"lang_code": "en", "model_name": "INSERT MODEL NAME"}],
+    }
+
+    # Create NLP engine based on configuration
+    provider = NlpEngineProvider(nlp_configuration=configuration)
+    nlp_engine = provider.create_engine()
+
+    registry = RecognizerRegistry()
+    # add rule-based recognizers
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+    registry.add_recognizer(spacy_recognizer)
+    # remove the nlp engine we passed, to use custom label mappings
+    registry.remove_recognizer("SpacyRecognizer")
+
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
+                              registry=registry, supported_languages=["en"])
+
+    return analyzer
+
+
+@st.cache(allow_output_mutation=True)
+def anonymizer_engine():
+    """Return AnonymizerEngine."""
+    return AnonymizerEngine()
+
+
+def get_supported_entities():
+    """Return supported entities from the Analyzer Engine."""
+    return analyzer_engine().get_supported_entities()
+
+
+def analyze(**kwargs):
+    """Analyze input using Analyzer engine and input arguments (kwargs)."""
+    if "entities" not in kwargs or "All" in kwargs["entities"]:
+        kwargs["entities"] = None
+    return analyzer_engine().analyze(**kwargs)
+
+
+def anonymize(text, analyze_results):
+    """Anonymize identified input using Presidio Anonymizer."""
+    if not text:
+        return
+    res = anonymizer_engine().anonymize(text, analyze_results)
+    return res.text
+
+
+def annotate(text, st_analyze_results, st_entities):
+    tokens = []
+    # sort by start index
+    results = sorted(st_analyze_results, key=lambda x: x.start)
+    for i, res in enumerate(results):
+        if i == 0:
+            tokens.append(text[:res.start])
+
+        # append entity text and entity type
+        tokens.append((text[res.start: res.end], res.entity_type))
+
+        # if another entity is coming, i.e. we're not at the last results element, add text up to the next entity
+        if i != len(results) - 1:
+            tokens.append(text[res.end:results[i+1].start])
+        # if no more entities are coming, add all remaining text
+        else:
+            tokens.append(text[res.end:])
+    return tokens
+
+
+st.set_page_config(page_title="Student Name Detector (English)", layout="wide")
+
+# Side bar
+st.sidebar.markdown(
+    """Detect and anonymize PII in text using an [NLP model](https://huggingface.co/MY_MODEL_NAME) [trained](https://github.com/aialoe/deidentification-pipeline/tree/8bea38040d36ef62e0638fec8cca3ec652539cbe) on student-generated text collected by Coursera.
+"""
+)
+
+st_entities = st.sidebar.multiselect(
+    label="Which entities to look for?",
+    options=get_supported_entities(),
+    default=list(get_supported_entities()),
+)
+
+st_threshold = st.sidebar.slider(
+    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
+)
+
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations in json")
+
+st.sidebar.info(
+    "This is part of a deidentification project for student-generated text."
+)
+
+
+# Main panel
+analyzer_load_state = st.info(
+    "Starting Presidio analyzer and loading Longformer-based model...")
+engine = analyzer_engine()
+analyzer_load_state.empty()
+
+
+st_text = st.text_area(
+    label="Type in some text",
+    value="Learning Reflection\n\nJohn Williams\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" \n\nBy John H. Williams",
+    height=200,
+)
+
+button = st.button("Detect Student Names")
+
+if 'first_load' not in st.session_state:
+    st.session_state['first_load'] = True
+
+# After
+st.subheader("Analyzed")
+with st.spinner("Analyzing..."):
+    if button or st.session_state.first_load:
+        st_analyze_results = analyze(
+            text=st_text,
+            entities=st_entities,
+            language="en",
+            score_threshold=st_threshold,
+            return_decision_process=st_return_decision_process,
+        )
+        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
+# vertical space
+st.text("")
+
+st.subheader("Anonymized")
+
+with st.spinner("Anonymizing..."):
+    if button or st.session_state.first_load:
+        st_anonymize_results = anonymize(st_text, st_analyze_results)
+        st_anonymize_results
+
+
+# table result
+st.subheader("Detailed Findings")
+if st_analyze_results:
+    res_dicts = [r.to_dict() for r in st_analyze_results]
+    for d in res_dicts:
+        d['Value'] = st_text[d['start']:d['end']]
+    df = pd.DataFrame.from_records(res_dicts)
+    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
+        {
+            "entity_type": "Entity type",
+            "start": "Start",
+            "end": "End",
+            "score": "Confidence",
+        },
+        axis=1,
+    )
+
+    st.dataframe(df, width=1000)
+else:
+    st.text("No findings")
+
+st.session_state['first_load'] = True
+
+# json result
+
+
+class ToDictListEncoder(JSONEncoder):
+    """Encode dict to json."""
+
+    def default(self, o):
+        """Encode to JSON using to_dict."""
+        if o:
+            return o.to_dict()
+        return []
+
+
+if st_return_decision_process:
+    st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))
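Taken together, app.py wires a custom recognizer into the standard Presidio flow: build an `AnalyzerEngine` over a spaCy NLP engine, `analyze`, then `anonymize`. For reference, a minimal sketch of that flow outside Streamlit, assuming Presidio's stock configuration (with `en_core_web_lg` installed) in place of the "INSERT MODEL NAME" placeholder:

```python
# Minimal sketch of the analyze -> anonymize flow from app.py, outside
# Streamlit. Uses presidio-analyzer's default spaCy engine rather than
# the custom student-name model, which is still a placeholder above.
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

text = "Learning Reflection by John Williams"
results = analyzer.analyze(text=text, language="en", score_threshold=0.35)
print(anonymizer.anonymize(text=text, analyzer_results=results).text)
# Expected output along the lines of: "Learning Reflection by <PERSON>"
```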
requirements.txt ADDED
@@ -0,0 +1,7 @@
+pandas
+streamlit
+presidio-anonymizer
+presidio-analyzer
+torch
+st-annotated-text
+#https://huggingface.co/my_model.whl
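The commented last line is presumably a placeholder for the fine-tuned model, distributed as a pip-installable wheel. A spaCy model packaged this way becomes loadable by its package name, which is what the `model_name` field in app.py's NLP-engine configuration expects; a hedged sketch with a hypothetical package name:

```python
import spacy

# "en_student_name_detector" is a hypothetical package name standing in
# for the wheel commented out in requirements.txt; once installed, a
# packaged spaCy pipeline loads by name like any other model.
nlp = spacy.load("en_student_name_detector")
doc = nlp("John Williams wrote this reflection.")
print([(ent.text, ent.label_) for ent in doc.ents])
```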
spacy_recognizer.py ADDED
@@ -0,0 +1,131 @@
+import logging
+from typing import Optional, List, Tuple, Set
+
+from presidio_analyzer import (
+    RecognizerResult,
+    LocalRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
+
+logger = logging.getLogger("presidio-analyzer")
+
+
+class CustomSpacyRecognizer(LocalRecognizer):
+
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "NRP",
+        "ORGANIZATION",
+        "DATE_TIME",
+    ]
+
+    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"
+
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"NRP"}, {"NORP", "NRP"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+        ({"DATE_TIME"}, {"DATE_TIME"}),
+    ]
+
+    MODEL_LANGUAGES = {
+        "en": "beki/en_spacy_pii_distilbert",
+    }
+
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        "NORP": "NRP",
+        "DATE_TIME": "DATE_TIME",
+    }
+
+    def __init__(
+        self,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[Set, Set]] = None,
+        context: Optional[List[str]] = None,
+        ner_strength: float = 0.85,
+    ):
+        self.ner_strength = ner_strength
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+        )
+
+    def load(self) -> None:
+        """Load the model, not used. Model is loaded during initialization."""
+        pass
+
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+
+    def build_spacy_explanation(
+        self, original_score: float, explanation: str
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :return:
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+
+    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
+        results = []
+        if not nlp_artifacts:
+            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
+            return results
+
+        ner_entities = nlp_artifacts.entities
+
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+            for ent in ner_entities:
+                if not self.__check_label(entity, ent.label_, self.check_label_groups):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.label_)
+                explanation = self.build_spacy_explanation(
+                    self.ner_strength, textual_explanation
+                )
+                spacy_result = RecognizerResult(
+                    entity_type=entity,
+                    start=ent.start_char,
+                    end=ent.end_char,
+                    score=self.ner_strength,
+                    analysis_explanation=explanation,
+                    recognition_metadata={
+                        RecognizerResult.RECOGNIZER_NAME_KEY: self.name
+                    },
+                )
+                results.append(spacy_result)
+
+        return results
+
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[Set, Set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+ )