Spaces:
Sleeping
Sleeping
File size: 4,996 Bytes
7dda936 30b6161 7dda936 cd2b4b4 7dda936 fcb82c4 7dda936 db04149 7dda936 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
'''Streamlit app for Student Name Detection models.'''
from analyzer import prepare_analyzer
from anonymizer import surrogate_anonymizer
from presidio_anonymizer import AnonymizerEngine
import pandas as pd
from annotated_text import annotated_text
from json import JSONEncoder
import json
import warnings
import streamlit as st
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings('ignore')
# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    '''Build the analyzer via prepare_analyzer; the result is cached by Streamlit.

    The configuration selects the spaCy NLP engine with the English
    ``en_student_name_detector`` model.
    '''
    english_model = {'lang_code': 'en', 'model_name': 'en_student_name_detector'}
    return prepare_analyzer({
        'nlp_engine_name': 'spacy',
        'models': [english_model],
    })
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    '''Return the surrogate anonymizer; the result is cached by Streamlit.'''
    surrogate = surrogate_anonymizer()
    return surrogate
def annotate(text, st_analyze_results, st_entities):
    '''Split ``text`` into the token list expected by ``annotated_text``.

    Args:
        text: The string that was analyzed.
        st_analyze_results: Iterable of results exposing ``start``, ``end``
            and ``entity_type`` attributes; it may be empty and need not be
            sorted.
        st_entities: Unused; kept so existing callers keep working.

    Returns:
        A list that alternates plain ``str`` segments with
        ``(entity_text, entity_type)`` tuples, ordered by ``start``. When
        there are no results the list is ``[text]``, so the unannotated text
        is still displayed instead of nothing.
    '''
    tokens = []
    cursor = 0  # end offset of the most recently emitted entity
    for res in sorted(st_analyze_results, key=lambda r: r.start):
        # plain text between the previous entity (or the start) and this one
        tokens.append(text[cursor:res.start])
        # entity text together with its entity type
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end
    # whatever follows the last entity -- the whole text if there was none
    tokens.append(text[cursor:])
    return tokens
# Hide Streamlit's main menu and footer with injected CSS.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
# st.set_page_config(page_title='Student Name Detector (English)', layout='wide')

# Side bar
st.sidebar.markdown(
    '''Contact us to get the private [Docker image](https://privacy-ai.com/) for your environment and Get an [API key](https://privacy-ai.com/)
'''
)

# Entity picker: every entity the analyzer supports is pre-selected.
sidebar_analyzer = analyzer_engine()
st_entities = st.sidebar.multiselect(
    label='Which entities to look for?',
    options=sidebar_analyzer.get_supported_entities(),
    default=list(sidebar_analyzer.get_supported_entities()),
)

# Minimum score passed to the analyzer as score_threshold.
st_threshold = st.sidebar.slider(
    label='Acceptance threshold',
    min_value=0.0,
    max_value=1.0,
    value=0.35,
)

st_return_decision_process = st.sidebar.checkbox(
    'Add analysis explanations in json')

st.sidebar.info('')
# Main panel
# Show a temporary notice while the analyzer is fetched, then clear it.
# NOTE(review): the sidebar section already calls analyzer_engine() earlier in
# the script, so the model is probably loaded before this notice is shown.
analyzer_load_state = st.info(
    'Starting Presidio analyzer and loading Longformer-based model...')
engine = analyzer_engine()
analyzer_load_state.empty()

# Text pre-filled into the input box.
sample_text = 'Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]'
st_text = st.text_area(label='Type in some text', value=sample_text, height=200)

button = st.button('Detect PII')

# Seed the flag the analysis section reads alongside the button state.
if 'first_load' not in st.session_state:
    st.session_state.first_load = True
# Results: annotated text, anonymized text, then a table of findings.
# NOTE(review): 'first_load' is set back to True at the end of this section,
# so this gate is open on every rerun and the button never restricts analysis.
# Clearing the flag alone would leave st_analyze_results unbound on reruns
# where the button was not pressed -- confirm the intended behavior.
run_detection = button or st.session_state.first_load

st.subheader('Analyzed')
with st.spinner('Analyzing...'):
    if run_detection:
        st_analyze_results = analyzer_engine().analyze(
            text=st_text,
            entities=st_entities,
            language='en',
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        annotated_text(*annotated_tokens)

# vertical space
st.text('')

st.subheader('Anonymized')
with st.spinner('Anonymizing...'):
    if run_detection:
        st_anonymize_results = anonymizer_engine().anonymize(
            st_text, st_analyze_results)
        # bare expression on purpose: rendered by Streamlit "magic"
        st_anonymize_results

# table result
st.subheader('Detailed Findings')
if not st_analyze_results:
    st.text('No findings')
else:
    findings = []
    for result in st_analyze_results:
        # the dict returned by to_dict() is extended in place, as before
        record = result.to_dict()
        record['Value'] = st_text[record['start']:record['end']]
        findings.append(record)
    column_labels = {
        'entity_type': 'Entity type',
        'start': 'Start',
        'end': 'End',
        'score': 'Confidence',
    }
    df = pd.DataFrame.from_records(findings)
    df = df[['entity_type', 'Value', 'score', 'start', 'end']]
    df = df.rename(columns=column_labels)
    st.dataframe(df, width=1000)

st.session_state['first_load'] = True
# json result
class ToDictListEncoder(JSONEncoder):
    '''JSON encoder for objects that expose a ``to_dict()`` method.'''

    def default(self, o):
        '''Return a JSON-serializable stand-in for ``o``.

        Called by the encoder only for objects it cannot serialize natively.
        Falsy objects encode as an empty list and objects with a callable
        ``to_dict`` encode as its return value. Anything else is handed to
        ``JSONEncoder.default``, which raises ``TypeError``, instead of
        failing with an ``AttributeError`` on the missing method.
        '''
        if not o:
            return []
        to_dict = getattr(o, 'to_dict', None)
        if callable(to_dict):
            return to_dict()
        return super().default(o)
# When the sidebar checkbox is ticked, also show the analyzer results as JSON.
if st_return_decision_process:
    decision_json = json.dumps(st_analyze_results, cls=ToDictListEncoder)
    st.json(decision_json)