piilo/app.py
"""Streamlit app for Student Name Detection models."""
from spacy_analyzer import prepare_analyzer
from anonymizer import anonymize
from presidio_anonymizer import AnonymizerEngine
import pandas as pd
from annotated_text import annotated_text
from json import JSONEncoder
import json
import warnings
import streamlit as st
import os
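
# Silence the Hugging Face tokenizers parallelism warning and other library warnings.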
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')


# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Return AnalyzerEngine and cache with Streamlit."""
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": "en", "model_name": "en_student_name_detector"}],
    }
    analyzer = prepare_analyzer(configuration)
    return analyzer


@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Return AnonymizerEngine."""
    return AnonymizerEngine()
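

# Build the sequence of plain strings and (text, entity_type) tuples that
# annotated_text() expects for highlighting.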
def annotate(text, st_analyze_results, st_entities):
    tokens = []
    # sort by start index
    results = sorted(st_analyze_results, key=lambda x: x.start)
    for i, res in enumerate(results):
        if i == 0:
            tokens.append(text[:res.start])
        # append entity text and entity type
        tokens.append((text[res.start: res.end], res.entity_type))
        # if another entity is coming (i.e. this is not the last result),
        # add the text up to the next entity
        if i != len(results) - 1:
            tokens.append(text[res.end:results[i + 1].start])
        # if no more entities are coming, add all remaining text
        else:
            tokens.append(text[res.end:])
    return tokens


st.set_page_config(page_title="Student Name Detector (English)", layout="wide")
# Side bar
st.sidebar.markdown(
"""Detect and anonymize PII in text using an [NLP model](https://huggingface.co./langdonholmes/en_student_name_detector) [trained](https://github.com/aialoe/deidentification-pipeline) on student-generated text collected by Coursera.
"""
)

st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=analyzer_engine().get_supported_entities(),
    default=list(analyzer_engine().get_supported_entities()),
)
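
# Only detections with a confidence score at or above this threshold are reported.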
st_threshold = st.sidebar.slider(
    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
)

st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations in json")

st.sidebar.info(
    "This is part of a deidentification project for student-generated text."
)

# Main panel
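# Show a loading message while the cached analyzer (and its model) is instantiated, then clear it.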
analyzer_load_state = st.info(
    "Starting Presidio analyzer and loading Longformer-based model...")
engine = analyzer_engine()
analyzer_load_state.empty()

st_text = st.text_area(
    label="Type in some text",
    value="Learning Reflection\n\nWritten by John Williams and Samantha Morales\n\nIn this course I learned many things. As Liedtke (2004) said, \"Students grow when they learn\" (Erickson et al. 1998).\n\nBy John H. Williams -- (714) 328-9989 -- [email protected]",
    height=200,
)

button = st.button("Detect PII")
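
# The 'first_load' flag makes the analysis run once when the page first loads,
# before the button has been clicked.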
if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True

st.subheader("Analyzed")

with st.spinner("Analyzing..."):
    if button or st.session_state.first_load:
        st_analyze_results = analyzer_engine().analyze(
            text=st_text,
            entities=st_entities,
            language="en",
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        # annotated_tokens
        annotated_text(*annotated_tokens)
# vertical space
st.text("")
st.subheader("Anonymized")
with st.spinner("Anonymizing..."):
if button or st.session_state.first_load:
st_anonymize_results = anonymize(anonymizer_engine(),
st_text,
st_analyze_results)
st_anonymize_results

# table result
st.subheader("Detailed Findings")
if st_analyze_results:
    res_dicts = [r.to_dict() for r in st_analyze_results]
    for d in res_dicts:
        d['Value'] = st_text[d['start']:d['end']]
    df = pd.DataFrame.from_records(res_dicts)
    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
        {
            "entity_type": "Entity type",
            "start": "Start",
            "end": "End",
            "score": "Confidence",
        },
        axis=1,
    )
    st.dataframe(df, width=1000)
else:
    st.text("No findings")
st.session_state['first_load'] = True


# json result
class ToDictListEncoder(JSONEncoder):
    """Serialize analyzer results to JSON via their to_dict() method."""

    def default(self, o):
        """Fall back to to_dict() for objects JSON cannot serialize natively."""
        if o:
            return o.to_dict()
        return []


if st_return_decision_process:
    st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))