File size: 4,996 Bytes
7dda936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30b6161
 
 
 
 
 
 
7dda936
cd2b4b4
7dda936
 
 
fcb82c4
7dda936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db04149
7dda936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

'''Streamlit app for Student Name Detection models.'''

from analyzer import prepare_analyzer
from anonymizer import surrogate_anonymizer
from presidio_anonymizer import AnonymizerEngine
import pandas as pd
from annotated_text import annotated_text
from json import JSONEncoder
import json
import warnings
import streamlit as st
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings('ignore')

# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    '''Build the analyzer and let Streamlit cache the instance.

    The configuration selects the spaCy NLP engine with the English
    ``en_student_name_detector`` model and is passed to
    ``prepare_analyzer``; whatever that helper returns is returned here.
    '''
    return prepare_analyzer({
        'nlp_engine_name': 'spacy',
        'models': [
            {'lang_code': 'en', 'model_name': 'en_student_name_detector'},
        ],
    })

@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    '''Create the surrogate anonymizer; the instance is cached by Streamlit.'''
    anonymizer = surrogate_anonymizer()
    return anonymizer

def annotate(text, st_analyze_results, st_entities):
    '''Split ``text`` into plain and labelled segments for ``annotated_text``.

    Args:
        text: The string that was analyzed.
        st_analyze_results: Iterable of findings; each item must expose
            ``start``, ``end`` and ``entity_type`` attributes.
        st_entities: Unused; kept so existing callers keep working.

    Returns:
        A list alternating between plain ``str`` segments and
        ``(matched_text, entity_type)`` tuples, ordered by ``start``.
        When there are no findings the list is ``[text]`` so the caller
        still renders the original text instead of nothing.
    '''
    tokens = []
    cursor = 0  # index just past the previously emitted entity
    for res in sorted(st_analyze_results, key=lambda x: x.start):
        # Plain text between the previous entity (or the start) and this one.
        # If findings overlap, this slice is simply empty; they are not merged.
        tokens.append(text[cursor:res.start])
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end
    # Remaining text after the last entity, or the whole text if none matched.
    tokens.append(text[cursor:])
    return tokens

# CSS injected into the page: sets the #MainMenu element and the footer
# to visibility: hidden.
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            </style>
            """
# unsafe_allow_html=True is required so the <style> tag is emitted as HTML.
st.markdown(hide_streamlit_style, unsafe_allow_html=True) 

# st.set_page_config(page_title='Student Name Detector (English)', layout='wide')

# Side bar
# Contact / API-key links shown at the top of the sidebar.
st.sidebar.markdown(
    '''Contact us to get the private [Docker image](https://privacy-ai.com/) for your environment and Get an [API key](https://privacy-ai.com/) 
'''
)

# Entity types to detect; every entity the analyzer reports as supported
# is offered and pre-selected.
st_entities = st.sidebar.multiselect(
    label='Which entities to look for?',
    options=analyzer_engine().get_supported_entities(),
    default=list(analyzer_engine().get_supported_entities()),
)

# Score threshold passed to the analyzer as score_threshold (0.0-1.0, default 0.35).
st_threshold = st.sidebar.slider(
    label='Acceptance threshold', min_value=0.0, max_value=1.0, value=0.35
)

# When ticked, the analyzer is asked for its decision process and the
# findings are additionally dumped as JSON at the bottom of the page.
st_return_decision_process = st.sidebar.checkbox(
    'Add analysis explanations in json')

# NOTE(review): called with an empty message — presumably a placeholder; confirm.
st.sidebar.info(
    ''
)

# Main panel
# Show a temporary notice while the analyzer is created, then clear it.
analyzer_load_state = st.info(
    'Starting Presidio analyzer and loading Longformer-based model...')
engine = analyzer_engine()
analyzer_load_state.empty()


# Free-text input, pre-filled with a sample document.
# NOTE(review): the sample ends with the literal "[email protected]", which looks
# like an obfuscated e-mail address — confirm the intended sample value.
st_text = st.text_area(
    label='Type in some text',
    value='Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]',
    height=200,
)

button = st.button('Detect PII')

# Initialise the session flag that makes the analysis run without a button press.
if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True

# After
# Run the analyzer over the text-area contents and render the highlighted result.
st.subheader('Analyzed')
with st.spinner('Analyzing...'):
    # Runs when the button was pressed or the session's first_load flag is truthy.
    if button or st.session_state.first_load:
        st_analyze_results = analyzer_engine().analyze(
            text=st_text,
            entities=st_entities,
            language='en',
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        # Turn the findings into the plain/labelled token list for annotated_text.
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        # annotated_tokens
        annotated_text(*annotated_tokens)

# vertical space
st.text('')

# Anonymize the analyzed text and display the result.
st.subheader('Anonymized')
with st.spinner('Anonymizing...'):
    # Same gate as the analysis step, so st_analyze_results is bound here.
    if button or st.session_state.first_load:
        st_anonymize_results = anonymizer_engine().anonymize(
            st_text,
            st_analyze_results,
        )
        # Render explicitly rather than through a bare expression: Streamlit
        # "magic" displays bare expressions via st.write, but it can be
        # disabled in configuration and reads like a no-op statement.
        st.write(st_anonymize_results)
        
# Tabular view of every finding, including the matched substring.
st.subheader('Detailed Findings')
if not st_analyze_results:
    st.text('No findings')
else:
    findings = []
    for result in st_analyze_results:
        record = result.to_dict()
        # Add the matched text; the dict returned by to_dict() is updated in place.
        record['Value'] = st_text[record['start']:record['end']]
        findings.append(record)

    column_labels = {
        'entity_type': 'Entity type',
        'start': 'Start',
        'end': 'End',
        'score': 'Confidence',
    }
    table = pd.DataFrame.from_records(findings)
    table = table[['entity_type', 'Value', 'score', 'start', 'end']]
    table = table.rename(columns=column_labels)

    st.dataframe(table, width=1000)

# NOTE(review): the flag is written back as True, so every
# `button or st.session_state.first_load` check stays truthy on each rerun and
# the analysis always runs. Presumably this was meant to become False after the
# first run — confirm. Changing it alone would leave st_analyze_results unbound
# in the findings section whenever the button is not pressed.
st.session_state['first_load'] = True

# json result
class ToDictListEncoder(JSONEncoder):
    '''JSON encoder that serialises unsupported objects via ``to_dict``.'''

    def default(self, o):
        '''Return ``o.to_dict()`` for truthy objects, otherwise an empty list.'''
        return o.to_dict() if o else []

# Optional raw JSON dump of the findings, shown when the sidebar checkbox is ticked.
if st_return_decision_process:
    findings_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
    st.json(findings_json)