Spaces:
Running
Running
add new hallucination testing
Browse files- app.py +137 -40
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,48 +1,145 @@
|
|
1 |
-
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
import os
|
5 |
-
|
6 |
-
# Define the API parameters
|
7 |
-
API_URL = "https://api-inference.huggingface.co/models/vectara/hallucination_evaluation_model"
|
8 |
-
API_TOKEN = os.getenv("HF_AUTH_TOKEN")
|
9 |
-
if not API_TOKEN:
|
10 |
-
raise ValueError("Please set the HF_AUTH_TOKEN environment variable.")
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
|
14 |
-
# Function to
|
15 |
-
def
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
#
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
#
|
28 |
-
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
description="How To Use 🌈Hallucination tester: 🗣️📝add any assertion from an LLM or a human 🗣️😷 add any citation from a RAG retriever or a source 👇🏻📩 Press send 🔴red means a 🌈hallucination, 🟢 green means a 🧠credible assertion. Check out the model [vectara/hallucination_evaluation_model](https://huggingface.co/vectara/hallucination_evaluation_model) You can also use 🥒🍆🫑Vectara - Hallucination Tester 🗣️😷 via API below or way by cloning this space. 🧬🔬🔍 Simply click here: Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [DataTonic](https://github.com/Tonic-AI/DataTonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗",
|
44 |
-
theme='ParityError/Anime',
|
45 |
-
)
|
46 |
|
47 |
-
# Launch the
|
48 |
-
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from lettucedetect.models.inference import HallucinationDetector
|
3 |
+
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
# Identifier of the LettuceDetect checkpoint handed to the detector below.
MODEL_PATH = "KRLabsOrg/lettucedect-large-modernbert-en-v1"

# Build the detector once at import time so every request reuses the same
# instance instead of constructing a new one per call.
detector = HallucinationDetector(method="transformer", model_path=MODEL_PATH)
|
10 |
|
11 |
+
# Function to evaluate hallucination with LettuceDetect
|
12 |
+
def evaluate_hallucination(context, question, answer):
    """Check an answer for hallucinated spans and format the result for the UI.

    The module-level ``detector`` is asked for span-level predictions. Each
    prediction is read as a mapping with the keys ``start``, ``end``,
    ``confidence`` and ``text``; ``start``/``end`` are used as character
    offsets into ``answer``.

    Args:
        context: Source text the answer is checked against. It is passed to
            the detector wrapped in a single-element list.
        question: The question the answer responds to.
        answer: The answer text to evaluate.

    Returns:
        A 5-tuple ``(status, explanation, highlighted_segments, spans_text,
        average_text)``:

        * ``status`` -- "🟢" (no spans), "🔴" (spans found) or "⚪" (error).
        * ``explanation`` -- short human-readable summary or error message.
        * ``highlighted_segments`` -- list of ``(text, label)`` pairs where
          ``label`` is ``"hallucination"`` for flagged text and ``None``
          otherwise.
        * ``spans_text`` -- one line per flagged span with its confidence.
        * ``average_text`` -- mean confidence over all flagged spans.

    Any exception raised while predicting or formatting is reported through
    the returned tuple rather than propagated, so the UI always gets a value
    for every output component.
    """
    # The label must equal the key of the HighlightedText ``color_map``
    # ({"hallucination": "red"}); a label with the confidence baked in never
    # matches that key, so the span would not be coloured red. Per-span
    # confidences are still reported in ``spans_text``.
    hallucination_label = "hallucination"

    try:
        predictions = detector.predict(
            context=[context],
            question=question,
            answer=answer,
            output_format="spans",
        )

        if not predictions:
            return "🟢", "No hallucinations detected", [(answer, None)], "Confidence: N/A", "N/A"

        # Walk the spans left to right regardless of the order they arrive in.
        ordered = sorted(predictions, key=lambda pred: pred["start"])

        highlighted_segments = []
        confidence_scores = []
        cursor = 0  # index of the first character of ``answer`` not yet emitted

        for pred in ordered:
            start, end = pred["start"], pred["end"]
            text = pred["text"]
            confidence_scores.append(f"'{text}' - Confidence: {pred['confidence']:.4f}")

            if end <= cursor:
                # Entirely inside text that is already highlighted.
                continue

            if start > cursor:
                # Unflagged text between the previous span and this one.
                highlighted_segments.append((answer[cursor:start], None))

            # On a partial overlap only emit the part not shown yet, so no
            # character of the answer is rendered twice.
            shown = text if start >= cursor else answer[cursor:end]
            highlighted_segments.append((shown, hallucination_label))
            cursor = end

        # Trailing unflagged text after the last span.
        if cursor < len(answer):
            highlighted_segments.append((answer[cursor:], None))

        avg_confidence = sum(pred["confidence"] for pred in ordered) / len(ordered)

        return (
            "🔴",
            "Hallucinations detected",
            highlighted_segments,
            "\n".join(confidence_scores),
            f"Average Confidence: {avg_confidence:.4f}",
        )

    except Exception as e:
        # UI boundary: surface the failure in the output widgets instead of
        # crashing the request.
        return "⚪", f"Error: {str(e)}", [(answer, None)], "N/A", "N/A"
|
68 |
|
69 |
+
# Gradio Blocks interface
|
70 |
+
# Introductory copy rendered at the top of the page.
INTRO_MARKDOWN = """
# 🥬 LettuceDetect Hallucination Tester 🟢🔴
Powered by `lettucedect-large-modernbert-en-v1` from KRLabsOrg. Detect hallucinations in answers based on context and questions using ModernBERT with 8192-token context support!

### How to Use:
1. Enter a **Context** (source document or info).
2. Enter a **Question** related to the context.
3. Enter an **Answer** to evaluate.
4. Press **Submit** to see if the answer hallucinates!

- 🟢 = No hallucinations
- 🔴 = Hallucinations detected
- Highlighted text shows hallucinated spans in **red** with confidence scores.
"""

# One ready-made (context, question, answer) triple offered to the user.
EXAMPLE_ROWS = [
    [
        "France is a country in Europe. The capital of France is Paris. The population of France is 67 million.",
        "What is the capital of France? What is the population of France?",
        "The capital of France is Paris. The population of France is 69 million.",
    ],
]

with gr.Blocks(title="🥬 LettuceDetect Hallucination Tester 🟢🔴", theme="ParityError/Anime") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: the three text inputs and the trigger button.
        with gr.Column(scale=2):
            context_input = gr.Textbox(
                label="Context",
                lines=5,
                placeholder="Enter the context (e.g., a document or source text)...",
            )
            question_input = gr.Textbox(label="Question", placeholder="Enter the question...")
            answer_input = gr.Textbox(
                label="Answer",
                lines=3,
                placeholder="Enter the answer to evaluate...",
            )
            submit_btn = gr.Button("Submit")

        # Right column: one widget per element of the tuple returned by
        # evaluate_hallucination, in the same order.
        with gr.Column(scale=3):
            status_output = gr.Label(label="Status")
            explanation_output = gr.Textbox(label="Explanation", interactive=False)
            highlighted_answer_output = gr.HighlightedText(
                label="Answer with Hallucinations Highlighted",
                show_legend=True,
                color_map={"hallucination": "red"},
                combine_adjacent=True,
            )
            spans_output = gr.Textbox(
                label="Hallucinated Spans & Confidence",
                lines=5,
                interactive=False,
            )
            avg_confidence_output = gr.Textbox(label="Average Confidence", interactive=False)

    # The same three inputs feed both the button handler and the example loader.
    evaluation_inputs = [context_input, question_input, answer_input]

    submit_btn.click(
        fn=evaluate_hallucination,
        inputs=evaluation_inputs,
        outputs=[
            status_output,
            explanation_output,
            highlighted_answer_output,
            spans_output,
            avg_confidence_output,
        ],
    )

    gr.Markdown("### Example")
    with gr.Row():
        gr.Examples(examples=EXAMPLE_ROWS, inputs=evaluation_inputs)

# Start the Gradio server for the demo.
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
lettucedetect
|