import gradio as gr
from lettucedetect.models.inference import HallucinationDetector
title = """# 🙋🏻♂️Welcome to 🌟Tonic's 🥬 LettuceDetect - 🤯🧠 Hallucination Tester 🟢🔴 | |
Powered by `lettucedect-large-modernbert-en-v1` from KRLabsOrg. Detect hallucinations in answers based on context and questions using ModernBERT with 8192-token context support! | |
""" | |
description2 = """
### Model Details
- **Model Name**: [lettucedect-large-modernbert-en-v1](https://huggingface.co./KRLabsOrg/lettucedect-large-modernbert-en-v1)
- **Organization**: [KRLabsOrg](https://huggingface.co./KRLabsOrg)
- **GitHub**: [https://github.com/KRLabsOrg/LettuceDetect](https://github.com/KRLabsOrg/LettuceDetect)
- **Architecture**: ModernBERT (Large) with extended context support up to 8192 tokens
- **Task**: Token Classification / Hallucination Detection
- **Training Dataset**: [RAGTruth](https://huggingface.co./datasets/wandb/RAGTruth-processed)
- **Language**: English
- **Capabilities**: Detects hallucinated spans in answers, provides per-span confidence scores, and reports the average confidence across detected spans.
LettuceDetect excels at processing long documents to determine whether an answer aligns with the provided context, making it a powerful tool for ensuring factual accuracy.
"""
howto1 = """
### How to Use the LettuceDetect Tester
1. **Enter a Context**: Provide the source text or document (e.g., "France is a country in Europe..."). This is the factual basis for evaluation.
2. **Enter a Question**: Ask something related to the context (e.g., "What is the capital of France?").
3. **Enter an Answer**: Input the response you want to check (e.g., "The capital of France is Paris. The population is 69 million.").
4. **Press Submit**: Analyze the answer for hallucinations!
"""
howto2 = """
### Understanding the Output
- **Status**:
  - 🟢 = No hallucinations detected
  - 🔴 = Hallucinations detected
  - ⚪ = Error occurred
- **Explanation**: A brief summary of the result.
- **Highlighted Answer**: Shows the answer with hallucinated spans highlighted and labeled with confidence scores (e.g., "hallucination (conf: 0.9944)").
- **Hallucinated Spans & Confidence**: Lists each hallucinated segment with its confidence score.
- **Average Confidence**: Displays the average confidence of all detected hallucinations (e.g., "Average Confidence: 0.9944").
Use this tool to ensure your answers are grounded in reality!
"""
join_us = """
## Join us:
🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️ community 👻
[Join us on Discord](https://discord.gg/n8ytYeh25n)
On 🤗 Hugging Face: [MultiTransformer](https://huggingface.co./MultiTransformer)
On 🌐 GitHub: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [Data Tonic](https://github.com/multiTonic/thinking-dataset/)
🤗 Big thanks to Yuvi Sharma and all the folks at Hugging Face for the community grant 🤗
"""
# Initialize the LettuceDetect model
detector = HallucinationDetector(
    method="transformer",
    model_path="KRLabsOrg/lettucedect-large-modernbert-en-v1",
)
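
# Each item in the spans-format predictions consumed below is expected to look
# like the dict sketched here (keys inferred from how they are used in
# evaluate_hallucination; values are illustrative only):
#
#     {"start": 33, "end": 62, "text": "The population is 69 million.",
#      "confidence": 0.9944}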
# Function to evaluate hallucination with LettuceDetect
def evaluate_hallucination(context, question, answer):
    try:
        # Get span-level predictions from LettuceDetect
        predictions = detector.predict(
            context=[context],
            question=question,
            answer=answer,
            output_format="spans",
        )
        # Process predictions for HighlightedText
        if not predictions:
            return "🟢", "No hallucinations detected", [(answer, None)], "Confidence: N/A", "N/A"
        highlighted_segments = []
        confidence_scores = []
        last_end = 0
        total_confidence = 0.0
        for pred in predictions:
            start, end = pred['start'], pred['end']
            confidence = pred['confidence']
            text = pred['text']
            # Add non-hallucinated text before this span
            if last_end < start:
                highlighted_segments.append((answer[last_end:start], None))
            # Add hallucinated span with confidence as label
            label_with_confidence = f"hallucination (conf: {confidence:.4f})"
            highlighted_segments.append((text, label_with_confidence))
            confidence_scores.append(f"'{text}' - Confidence: {confidence:.4f}")
            total_confidence += confidence
            last_end = end
        # Add any remaining text after the last hallucination
        if last_end < len(answer):
            highlighted_segments.append((answer[last_end:], None))
        # Calculate average confidence; predictions is guaranteed non-empty here
        # because the empty case returned early above
        avg_confidence = total_confidence / len(predictions)
        return (
            "🔴",
            "Hallucinations detected",
            highlighted_segments,
            "\n".join(confidence_scores),
            f"Average Confidence: {avg_confidence:.4f}",
        )
    except Exception as e:
        return "⚪", f"Error: {str(e)}", [(answer, None)], "N/A", "N/A"
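
# Illustrative sanity check outside the UI, using the same example wired up
# below; this input should yield a 🔴 status because the population figure
# contradicts the context (commented out so the Space only runs the Gradio app):
#
#     status, explanation, segments, spans, avg_conf = evaluate_hallucination(
#         "France is a country in Europe. The capital of France is Paris. "
#         "The population of France is 67 million.",
#         "What is the capital of France? What is the population of France?",
#         "The capital of France is Paris. The population of France is 69 million.",
#     )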
# Gradio Blocks interface
with gr.Blocks(
    title="🥬 LettuceDetect Hallucination Tester 🟢🔴"
) as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description2)
        with gr.Group():
            gr.Markdown(howto2)
    with gr.Row():
        with gr.Group():
            gr.Markdown(howto1)
        with gr.Group():
            gr.Markdown(join_us)
    with gr.Row():
        with gr.Column(scale=2):
            # Inputs
            context_input = gr.Textbox(
                label="Context",
                lines=5,
                placeholder="Enter the context (e.g., a document or source text)...",
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="Enter the question...",
            )
            answer_input = gr.Textbox(
                label="Answer",
                lines=3,
                placeholder="Enter the answer to evaluate...",
            )
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=3):
            with gr.Row():
                with gr.Column():
                    status_output = gr.Label(label="Status")
                with gr.Column():
                    explanation_output = gr.Textbox(label="Explanation", interactive=False)
            highlighted_answer_output = gr.HighlightedText(
                label="Answer with Hallucinations Highlighted",
                show_legend=False,
                # Note: span labels carry the confidence (e.g., "hallucination
                # (conf: 0.9944)") and so don't match this key exactly; Gradio
                # auto-assigns colors for those, while a bare "hallucination"
                # category stays red.
                color_map={"hallucination": "red"},
                combine_adjacent=True,
            )
            spans_output = gr.Textbox(label="Hallucinated Spans & Confidence", lines=5, interactive=False)
            avg_confidence_output = gr.Textbox(label="Average Confidence", interactive=False)
    # Connect inputs to outputs via the evaluation function
    submit_btn.click(
        fn=evaluate_hallucination,
        inputs=[context_input, question_input, answer_input],
        outputs=[status_output, explanation_output, highlighted_answer_output, spans_output, avg_confidence_output],
    )
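    # The five outputs map positionally to evaluate_hallucination's return tuple:
    # (status, explanation, highlighted segments, span confidences, average
    # confidence); keep the two lists in sync if the layout changes.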
    # Example
    gr.Markdown("### Example")
    with gr.Row():
        gr.Examples(
            examples=[
                [
                    "France is a country in Europe. The capital of France is Paris. The population of France is 67 million.",
                    "What is the capital of France? What is the population of France?",
                    "The capital of France is Paris. The population of France is 69 million.",
                ]
            ],
            inputs=[context_input, question_input, answer_input],
        )
# Launch the demo
demo.launch()