Spaces:
Synced repo using 'sync_with_huggingface' Github Action
- .env.example +3 -0
- app.py +57 -0
- common.py +155 -0
- criteria_handler.py +222 -0
- data/models.jsonl +17 -0
- data_handler.py +77 -0
- eval_criteria_library.py +61 -0
- get_llm_answer.py +137 -0
- model_handler.py +220 -0
- models.jsonl +19 -0
- random_sample/__init__.py +1 -0
- random_sample/arena_interface.py +378 -0
- random_sample/common.py +126 -0
- random_sample/gen_api_answer.py +77 -0
- random_sample/prompts.py +94 -0
- random_sample/random_sample_generation.py +183 -0
- random_sample_tab.py +6 -0
- requirements.txt +11 -0
- score_handler.py +242 -0
- ui_components.py +4 -0
- utils.py +37 -0
.env.example
ADDED
@@ -0,0 +1,3 @@
TOGETHER_API_KEY=your_together_api_key_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
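
For context, these keys are read at runtime via python-dotenv (get_llm_answer.py calls load_dotenv() before initializing the provider clients). A minimal sketch, not part of the diff, of how one of these values would be picked up, assuming the standard os/dotenv behaviour:

# Sketch only: load .env into the process environment and read one key.
import os
from dotenv import load_dotenv

load_dotenv()  # reads a .env file (copied from .env.example) in the working directory
together_key = os.getenv("TOGETHER_API_KEY")
print(together_key is not None)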
app.py
ADDED
@@ -0,0 +1,57 @@
# sandbox_runner.py

import gradio as gr
from data_handler import upload_test_data
from criteria_handler import select_evaluation_criteria
from model_handler import select_evaluators
from score_handler import handle_analysis
from random_sample_tab import random_sample_tab

def run_sandbox():
    with gr.Blocks(css="""
        .truncate_cells table {
            table-layout: fixed !important;
            width: 100% !important;
        }
        .truncate_cells table td,
        .truncate_cells table th {
            white-space: nowrap !important;
            overflow: hidden !important;
            text-overflow: ellipsis !important;
            max-width: 200px !important;
            text-align: left !important;
            vertical-align: top !important;
        }
    """) as demo:
        gr.Markdown("# Atla Testing Sandbox")
        with gr.Tabs():
            # Random samples tab
            random_sample_tab()

            # Sandbox tab
            with gr.TabItem("Custom Dataset"):
                # Initialize state object to track the DataFrame
                df_state = gr.State(value=None)
                # Initialize state object to track the prompt
                prompt_state = gr.State(value=None)
                # Initialize the evaluation_complete flag
                evaluation_complete = gr.State(value=None)

                # Data upload
                data_upload_group, df_state = upload_test_data(df_state)

                # Criteria selection
                criteria_group, df_state, prompt_state, save_prompt_button = \
                    select_evaluation_criteria(data_upload_group, df_state, prompt_state)

                # Models selection
                model_selection_group, df_state, analyze_results_button = \
                    select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button)

                # Result analysis
                handle_analysis(df_state, model_selection_group, analyze_results_button)

    demo.launch()

if __name__ == "__main__":
    run_sandbox()
common.py
ADDED
@@ -0,0 +1,155 @@
# Page Headers
MAIN_TITLE = "# Judge Arena - Free LLM Evals to test your GenAI application"

# How it works section
HOW_IT_WORKS = """
- **Run any form of evaluation:** from simple hallucination detection to qualitative interpretations
- **Evaluate anything:** coding, analysis, creative writing, math, or general knowledge
"""

BATTLE_RULES = """
## 🤺 Battle Rules:
- Both AIs stay anonymous - if either reveals its identity, the duel is void
- Choose the LLM judge that most aligns with your judgement
- If both score the same - choose the critique that you prefer!
<br><br>
"""

# CSS Styles
CSS_STYLES = """
.prompt-row {
    align-items: flex-start !important;
}
.send-button-row {
    display: flex;
    justify-content: flex-end;
    margin-top: 8px;
}
/* Style for metric buttons */
.metric-button-active {
    background-color: #2B3A55 !important;
    color: white !important;
}
/* Add this to ensure proper button spacing */
.metric-buttons-row {
    gap: 8px;
}
"""

# Default Eval Prompt
EVAL_DESCRIPTION = """
## 📝 Instructions
**Precise evaluation criteria lead to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
- Evaluation criteria
- Scoring rubric
- (Optional) Examples\n

**Any variables you define in your prompt using {{double curly braces}} will automatically map to the corresponding input fields under the "Sample to evaluate" section on the right.**

<br><br>
"""

DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on [INSERT CRITERIA]

Score:
A score of 1 means that the response's answer meets all of the evaluation criteria.
A score of 0 means that the response's answer does not meet all of the evaluation criteria.

Here is the data:
[BEGIN DATA]
***
[User Query]: {{input}}
***
[Response]: {{response}}
***
[END DATA]"""

# Default Variable Values
DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?
A) Jaguar
B) Toucan
C) Polar Bear
D) Sloth"""
DEFAULT_RESPONSE = "C) Polar Bear"

# Voting Section Header
VOTING_HEADER = """
# Start Voting Now
"""

# Acknowledgements
ACKNOWLEDGEMENTS = """
<br><br><br>
# Acknowledgements

We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.

We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
"""

# Policy Content
POLICY_CONTENT = """
# About Atla

Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
<br><br>
# Our Mission

By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
<br><br>
# Judge Arena Policy

## Overview

Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)

## Transparency

- **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community, and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
- **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
- **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.

## Model Inclusion Criteria

Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a. judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:

- **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
- **Adaptable:** The model must be promptable to evaluate in different scoring formats, for different criteria.
- **Accessibility**:
    - **Public API Access**: Models accessible through public APIs without restrictive barriers.
    - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.

## Leaderboard Management

- **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
- **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
- **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.

This policy might be updated to reflect changes in our practices or in response to community feedback.

# FAQ

**Isn't this the same as Chatbot Arena?**

We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals, to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.

**What are the Evaluator Prompt Templates based on?**

As a quick start, we've set up templates that cover the most popular evaluation metrics out there on LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular datasets from academia - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).

These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.

**Why should I trust this leaderboard?**

We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).

**Who funds this effort?**

Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.

**What is Atla working on?**

We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
<br><br>
# Get in touch
Feel free to email us at [[email protected]](mailto:[email protected]) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
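
For reference, the "Leaderboard Management" paragraph above describes an Elo scheme with an initial rating of 1500 and a K-factor of 32. A minimal sketch, not part of the diff, of the standard Elo update those numbers imply (the ratings and outcome below are hypothetical):

# Sketch only: standard Elo update with the parameters named in POLICY_CONTENT.
def expected_score(rating_a: float, rating_b: float) -> float:
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def elo_update(rating_a: float, rating_b: float, score_a: float, k: float = 32):
    """score_a is 1.0 if judge A wins the vote, 0.0 if it loses, 0.5 for a tie."""
    e_a = expected_score(rating_a, rating_b)
    new_a = rating_a + k * (score_a - e_a)
    new_b = rating_b + k * ((1 - score_a) - (1 - e_a))
    return new_a, new_b

# Both judges start at 1500; judge A wins one vote.
print(elo_update(1500, 1500, 1.0))  # -> (1516.0, 1484.0)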
criteria_handler.py
ADDED
@@ -0,0 +1,222 @@
# criteria_handler.py

import gradio as gr
import re
from eval_criteria_library import EXAMPLE_METRICS

def select_evaluation_criteria(data_upload_group, df_state, prompt_state):
    with gr.Group(visible=True) as criteria_group:
        select_eval_criteria_button = gr.Button("Select Evaluation Criteria", visible=False)

        criteria_dropdown = gr.Dropdown(
            choices=list(EXAMPLE_METRICS.keys()),
            label="Choose Evaluation Criteria",
            value=list(EXAMPLE_METRICS.keys())[0],
            visible=False
        )

        with gr.Row(visible=False) as mapping_row:
            with gr.Column():
                # Left column - Evaluation Criteria Editor
                prompt_editor = gr.Textbox(
                    label="Evaluation Criteria",
                    lines=15,
                    visible=False,
                    placeholder="Enter the evaluation criteria/rubric here..."
                )
            with gr.Column():
                # Right column - Required and Optional Variable Mapping
                # Required mappings
                input_mapping = gr.Dropdown(
                    choices=[],
                    label="Map 'model_input' to column (Required)",
                    interactive=True,
                    visible=False
                )
                output_mapping = gr.Dropdown(
                    choices=[],
                    label="Map 'model_output' to column (Required)",
                    interactive=True,
                    visible=False
                )
                # Optional mappings
                context_mapping = gr.Dropdown(
                    choices=[],
                    label="Map 'model_context' to column (Optional)",
                    interactive=True,
                    visible=False
                )
                expected_output_mapping = gr.Dropdown(
                    choices=[],
                    label="Map 'expected_model_output' to column (Optional)",
                    interactive=True,
                    visible=False
                )
        # We'll place the "Back to Data" and "Select Evaluators" within the same row:
        with gr.Row(visible=False) as nav_row:
            back_to_data_button = gr.Button("← Back to Data", visible=False)
            save_prompt_button = gr.Button("Select Evaluators", visible=False)

        def update_column_choices(df_state):
            df = df_state.value
            columns = df.columns.tolist() if df is not None else []
            return {
                input_mapping: gr.update(choices=columns, visible=True),
                output_mapping: gr.update(choices=columns, visible=True),
                context_mapping: gr.update(choices=['None'] + columns, visible=True),
                expected_output_mapping: gr.update(choices=['None'] + columns, visible=True)
            }

        def update_prompt(selected_criteria, df_state):
            if selected_criteria in EXAMPLE_METRICS:
                evaluation_criteria = EXAMPLE_METRICS[selected_criteria]['prompt']
            else:
                evaluation_criteria = ""
            updates = {prompt_editor: gr.update(value=evaluation_criteria, visible=True)}
            updates.update(update_column_choices(df_state))
            return updates

        def show_criteria_selection():
            default_criterion = list(EXAMPLE_METRICS.keys())[0]
            evaluation_criteria = EXAMPLE_METRICS[default_criterion]['prompt']
            updates = {
                select_eval_criteria_button: gr.update(visible=False),
                criteria_dropdown: gr.update(visible=True),
                prompt_editor: gr.update(value=evaluation_criteria, visible=True),
                data_upload_group: gr.update(visible=False),
                mapping_row: gr.update(visible=True),
                # Show the nav row and buttons
                nav_row: gr.update(visible=True),
                back_to_data_button: gr.update(visible=True),
                save_prompt_button: gr.update(visible=True),
            }
            updates.update(update_column_choices(df_state))
            return updates

        def save_prompt(evaluation_criteria, input_col, output_col, context_col, expected_output_col):
            # Use the actual Jinja template with proper Jinja syntax and raw JSON
            template = '''You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.

Here are some rules of the evaluation:
(1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.

Your reply should strictly follow this format:
Your output format should strictly adhere to JSON as follows: {% raw %}{"feedback": "<write feedback>", "result": <numerical score>}{% endraw %}. Ensure the output is valid JSON, without additional formatting or explanations.

Here is the data.

{% if model_context is defined and model_context %}Context:
```
{{ model_context }}
```

{% endif %}Instruction:
```
{{ model_input }}
```

Response:
```
{{ model_output }}
```

Score Rubrics:
{{ evaluation_criteria }}

{% if expected_model_output is defined and expected_model_output %}Reference answer:
{{ expected_model_output }}{% endif %}'''

            # Create mapping dictionary
            mapping_dict = {
                'model_input': input_col,
                'model_output': output_col,
                'evaluation_criteria': evaluation_criteria
            }

            # Add optional mappings if selected
            if context_col != 'None':
                mapping_dict['model_context'] = context_col
            if expected_output_col != 'None':
                mapping_dict['expected_model_output'] = expected_output_col

            prompt_state.value = {
                'template': template,
                'mappings': mapping_dict
            }

        # Update event handlers
        select_eval_criteria_button.click(
            fn=show_criteria_selection,
            inputs=[],
            outputs=[
                select_eval_criteria_button,
                criteria_dropdown,
                prompt_editor,
                data_upload_group,
                mapping_row,
                nav_row,
                back_to_data_button,
                save_prompt_button,
                input_mapping, output_mapping, context_mapping, expected_output_mapping
            ]
        )

        criteria_dropdown.change(
            fn=update_prompt,
            inputs=[criteria_dropdown, df_state],
            outputs=[prompt_editor, input_mapping, output_mapping, context_mapping, expected_output_mapping]
        )

        def make_select_button_visible(df_value):
            if df_value is not None:
                return gr.update(visible=True)
            else:
                return gr.update(visible=False)

        df_state.change(
            fn=make_select_button_visible,
            inputs=df_state,
            outputs=select_eval_criteria_button
        )

        save_prompt_button.click(
            fn=save_prompt,
            inputs=[
                prompt_editor, input_mapping, output_mapping,
                context_mapping, expected_output_mapping
            ],
            outputs=[]
        )

        # BACK BUTTON: Hide the criteria UI, show the data upload UI
        def back_to_data():
            return {
                # show data upload group again
                data_upload_group: gr.update(visible=True),
                # hide the criteria group
                criteria_dropdown: gr.update(visible=False),
                prompt_editor: gr.update(visible=False),
                mapping_row: gr.update(visible=False),
                nav_row: gr.update(visible=False),
                # make "Select Evaluation Criteria" button visible again
                select_eval_criteria_button: gr.update(visible=True),
            }

        back_to_data_button.click(
            fn=back_to_data,
            inputs=[],
            outputs=[
                data_upload_group,
                criteria_dropdown,
                prompt_editor,
                mapping_row,
                nav_row,
                select_eval_criteria_button
            ]
        )

    # Return the criteria group, the df_state, prompt_state, save_prompt_button
    return criteria_group, df_state, prompt_state, save_prompt_button
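
For context, save_prompt only stores a template string and a column mapping in prompt_state; the rendering itself happens later in model_handler.py. A minimal, self-contained sketch of that rendering step, not part of the diff, using a hypothetical shortened template and hypothetical column names:

# Sketch only: how a stored template + mapping pair becomes a judge prompt.
import pandas as pd
from jinja2 import Template

stored = {
    "template": "Instruction:\n{{ model_input }}\n\nResponse:\n{{ model_output }}\n\nScore Rubrics:\n{{ evaluation_criteria }}",
    "mappings": {
        "model_input": "question",        # hypothetical column names
        "model_output": "answer",
        "evaluation_criteria": "Score 1 if the answer is correct, 0 otherwise.",
    },
}

row = pd.Series({"question": "What is 2 + 2?", "answer": "4"})
context = {
    key: str(row[col])
    for key, col in stored["mappings"].items()
    if key != "evaluation_criteria"
}
context["evaluation_criteria"] = stored["mappings"]["evaluation_criteria"]
print(Template(stored["template"]).render(**context))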
data/models.jsonl
ADDED
@@ -0,0 +1,17 @@
{"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
{"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
{"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
{"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
{"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
{"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
{"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
{"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
{"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
{"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
{"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
{"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
{"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
{"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
{"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
{"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
data_handler.py
ADDED
@@ -0,0 +1,77 @@
# data_handler.py

import gradio as gr
import pandas as pd
import json

def upload_test_data(df_state):
    with gr.Group() as data_upload_group:
        file_upload = gr.File(
            label="Upload JSON with test data incl. true labels as integers or floats",
            file_types=[".json"],
        )
        import_button = gr.Button("Import Data", visible=False)
        # Show exactly 5 rows, no scrolling
        df_display = gr.Dataframe(
            visible=False,
            elem_classes=["truncate_cells"],
            label="Uploaded Data"
        )
        error_display = gr.Textbox(visible=False)

        def display_file_info(file):
            if file is not None:
                return {
                    import_button: gr.update(visible=True),
                    error_display: gr.update(visible=False)  # Hide previous errors
                }
            else:
                return {
                    import_button: gr.update(visible=False),
                    df_display: gr.update(visible=False),
                    error_display: gr.update(visible=False)  # Hide previous errors
                }

        def import_data(file):
            if file is not None:
                try:
                    df_state.value = pd.json_normalize(json.load(open(file.name)))

                    return {
                        df_display: gr.update(value=df_state.value, visible=True),
                        import_button: gr.update(visible=False),
                        df_state: df_state,
                        error_display: gr.update(visible=False)  # Hide previous errors
                    }
                except json.JSONDecodeError as e:
                    return {
                        df_display: gr.update(visible=False),
                        error_display: gr.update(value="**Error:** Invalid JSON file. Please upload a valid JSON file.", visible=True),
                        import_button: gr.update(visible=True),
                        df_state: None
                    }
                except Exception as e:
                    return {
                        df_display: gr.update(visible=False),
                        error_display: gr.update(value=f"**Error:** {str(e)}", visible=True),
                        import_button: gr.update(visible=True),
                        df_state: None
                    }
            else:
                return {
                    df_display: gr.update(visible=False),
                    import_button: gr.update(visible=True),
                    df_state: None
                }

        file_upload.change(
            fn=display_file_info,
            inputs=file_upload,
            outputs=[import_button, df_display, error_display]
        )
        import_button.click(
            fn=import_data,
            inputs=file_upload,
            outputs=[df_display, import_button, df_state, error_display]
        )

    return data_upload_group, df_state
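
For reference, a minimal sketch, not part of the diff, of the kind of JSON file this uploader accepts: the field names below are hypothetical, and any structure that pd.json_normalize can flatten into columns (which are then mapped to model_input, model_output, etc. in the criteria step) should work.

# Sketch only: write and re-load a small hypothetical test-data file.
import json
import pandas as pd

records = [
    {"question": "What is the capital of France?", "answer": "Paris", "label": 1},
    {"question": "What is 2 + 2?", "answer": "5", "label": 0},
]
with open("test_data.json", "w") as f:
    json.dump(records, f)

df = pd.json_normalize(json.load(open("test_data.json")))
print(df.columns.tolist())  # ['question', 'answer', 'label']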
eval_criteria_library.py
ADDED
@@ -0,0 +1,61 @@
EXAMPLE_METRICS = {
    "Custom": {
        "prompt":
        """Evaluate a chat bot response to a user's input based on [INSERT CRITERIA OR SELECT EXAMPLE FROM LIST]

0: The response's answer does not meet all of the evaluation criteria.
1: The response's answer meets all of the evaluation criteria.""",
    },
    "Relevance": {
        "prompt": """Evaluate how well the response fulfills the requirements of the instruction by providing relevant information. This includes responding in accordance with the explicit and implicit purpose of the given instruction.

1: The response is completely unrelated to the instruction, or the model entirely misunderstands the instruction.
2: Most of the key points in the response are irrelevant to the instruction, and the response misses major requirements of the instruction.
3: Some major points in the response contain irrelevant information or miss some requirements of the instruction.
4: The response is relevant to the instruction but misses minor requirements of the instruction.
5: The response is perfectly relevant to the instruction, and the model fulfills all of the requirements of the instruction.""",
    },
    "Correctness": {
        "prompt": """Evaluate whether the information provided in the response is correct given the reference response. Ignore differences in punctuation and phrasing between the student answer and true answer. It is okay if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements.

0: The response is not factually accurate when compared against the reference response or includes conflicting statements.
1: The response is supported by the reference response and does not contain conflicting statements.""",
    },
    "Helpfulness": {
        "prompt": """Evaluate how helpful the response is in addressing the user query.

1: The response is not at all useful, failing to address the instruction or provide any valuable information.
2: The response has minimal usefulness, addressing the instruction only superficially or providing mostly irrelevant information.
3: The response is moderately useful, addressing some aspects of the instruction effectively but lacking in others.
4: The response is very useful, effectively addressing most aspects of the instruction and providing valuable information.
5: The response is exceptionally useful, fully addressing the instruction and providing highly valuable information.""",
    },
    "Faithfulness": {
        "prompt": """Evaluate how well the statements in the response are directly supported by the context given in the related passages.

1: The response contains statements that directly contradict the context or are entirely unsupported by it.
2: The response includes some information from the context, but contains significant ungrounded claims or misinterpretations.
3: The response is mostly grounded in the context, with only minor unsupported claims or misinterpretations.
4: The response closely aligns with the context, with only rare and minor deviations.
5: The response is fully grounded in the context, with all statements accurately reflecting the provided information.""",
    },
    "Logical coherence": {
        "prompt": """Evaluate how logically accurate and correct the response is for the instruction given.

1: The logic of the model’s response is completely incoherent.
2: The model’s response contains major logical inconsistencies or errors.
3: The model’s response contains some logical inconsistencies or errors, but they are not significant.
4: The model’s response is logically sound, but it is slightly flawed in some aspect.
5: The model’s response is logically flawless.""",
    },
    "Conciseness": {
        "prompt": """Evaluate how concisely the response is presented to the user, without any unnecessary information.

1: The response is highly redundant or contains a lot of unnecessary information, requiring a complete rewrite for optimal clarity and efficiency.
2: The response lacks conciseness and needs a substantial rewrite for better optimization.
3: The response is somewhat concise but includes unnecessary information, requiring some edits for improved optimization.
4: The response is mostly concise but could benefit from minor edits for better optimization.
5: The response is optimally concise and does not contain any unnecessary information, requiring no further optimization.""",
    },
}
get_llm_answer.py
ADDED
@@ -0,0 +1,137 @@
# get_llm_answer.py

from openai import OpenAI
import anthropic
from together import Together
import json
import re
import atla

from dotenv import load_dotenv
load_dotenv()

# Initialize clients
anthropic_client = anthropic.Anthropic()
openai_client = OpenAI()
together_client = Together()
atla_client = atla.Atla()

SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""

def get_openai_response(model_name, prompt):
    """Get response from OpenAI API"""
    try:
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error with OpenAI model {model_name}: {str(e)}"


def get_anthropic_response(model_name, prompt):
    """Get response from Anthropic API"""
    try:
        response = anthropic_client.messages.create(
            model=model_name,
            max_tokens=1000,
            temperature=0,
            system=SYSTEM_PROMPT,
            messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
        )
        return response.content[0].text
    except Exception as e:
        return f"Error with Anthropic model {model_name}: {str(e)}"


def get_together_response(model_name, prompt):
    """Get response from Together API"""
    try:
        response = together_client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error with Together model {model_name}: {str(e)}"


def get_atla_response(model_name, model_input, model_output, model_context, expected_output, evaluation_criteria):
    """Get response from Atla API"""
    try:
        response = atla_client.evaluation.create(
            model_id=model_name,
            model_input=model_input,
            model_output=model_output,
            model_context=model_context,
            expected_model_output=expected_output,
            evaluation_criteria=evaluation_criteria,
        )
        # Return the score and critique directly from the evaluation result
        return {
            "score": response.result.evaluation.score,
            "critique": response.result.evaluation.critique
        }
    except Exception as e:
        return f"Error with Atla model {model_name}: {str(e)}"


def get_model_response(model_name, model_info, prompt=None, **kwargs):
    """Get response from appropriate API based on model organization"""
    if not model_info:
        return "Model not found or unsupported."

    api_model = model_info["api_model"]
    organization = model_info["organization"]

    try:
        if organization == "Atla":
            return get_atla_response(
                api_model,
                kwargs.get('model_input'),
                kwargs.get('model_output'),
                kwargs.get('model_context'),
                kwargs.get('expected_output'),
                kwargs.get('evaluation_criteria')
            )
        elif organization == "OpenAI":
            return get_openai_response(api_model, prompt)
        elif organization == "Anthropic":
            return get_anthropic_response(api_model, prompt)
        else:
            # All other organizations use Together API
            return get_together_response(api_model, prompt)
    except Exception as e:
        return f"Error with {organization} model {model_name}: {str(e)}"


def parse_model_response(response):
    try:
        # Debug print
        print(f"Raw model response: {response}")

        # First try to parse the entire response as JSON
        try:
            data = json.loads(response)
            return str(data.get("result", "N/A")), data.get("feedback", "N/A")
        except json.JSONDecodeError:
            # If that fails (typically for smaller models), try to find JSON within the response
            json_match = re.search(r"{.*}", response)
            if json_match:
                data = json.loads(json_match.group(0))
                return str(data.get("result", "N/A")), data.get("feedback", "N/A")
            else:
                return "Error", f"Failed to parse response: {response}"

    except Exception as e:
        # Debug print for error case
        print(f"Failed to parse response: {str(e)}")
        return "Error", f"Failed to parse response: {response}"
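
For reference, a minimal sketch, not part of the diff, of the parsing fallback: parse_model_response first tries the whole string as JSON, then falls back to the first {...} span it can find. The judge outputs below are hypothetical, and importing the module assumes the API keys from .env are set, since the provider clients are initialized at import time.

# Sketch only: hypothetical judge outputs run through parse_model_response.
from get_llm_answer import parse_model_response

clean = '{"feedback": "Meets the rubric.", "result": 1}'
noisy = 'Here is my evaluation: {"feedback": "Misses the rubric.", "result": 0}'

print(parse_model_response(clean))   # ('1', 'Meets the rubric.')
print(parse_model_response(noisy))   # ('0', 'Misses the rubric.')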
model_handler.py
ADDED
@@ -0,0 +1,220 @@
# model_handler.py

import gradio as gr
import json
import os
import re
from get_llm_answer import get_model_response, parse_model_response, get_atla_response
from jinja2 import Template

def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button):
    with gr.Group(visible=True) as model_selection_group:
        select_evaluators_button = gr.Button("Select Evaluators", visible=False)

        # Load the model_data from JSONL
        def load_model_data():
            model_data = {}
            try:
                script_dir = os.path.dirname(__file__)
                file_path = os.path.join(script_dir, "models.jsonl")
                with open(file_path, "r") as f:
                    for line in f:
                        model = json.loads(line)
                        model_data[model["name"]] = {
                            "organization": model["organization"],
                            "license": model["license"],
                            "api_model": model["api_model"],
                        }
            except FileNotFoundError:
                print("Warning: models.jsonl not found")
                return {}
            return model_data

        model_data = load_model_data()
        model_choices = list(model_data.keys())

        # Define dropdowns using model choices
        with gr.Row(visible=False) as evaluator_row:
            judge_a_dropdown = gr.Dropdown(
                choices=["Selene"], label="Judge A", value="Selene", interactive=False
            )
            judge_b_dropdown = gr.Dropdown(
                choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
            )

        # A Markdown for "Evaluation in progress..." and final heading
        loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)

        # NEW: define a Dataframe to show final evaluation results, like in data_handler
        evaluation_result_df = gr.Dataframe(
            visible=False,
            label="Evaluation Results",
            elem_classes=["truncate_cells"]
        )

        # Define the three-button row AFTER the markdown,
        # so it appears *below* the "Evaluation Complete" message.
        with gr.Row(visible=False) as evaluation_nav_row:
            back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
            run_evaluation_button = gr.Button("Run Evaluation", visible=False)
            analyze_results_button = gr.Button("Analyze Results", visible=False)

        # Show evaluator selection UI
        def show_evaluator_selection(current_df):
            # Hide Criteria UI and show Evaluator UI
            updates = {
                criteria_group: gr.update(visible=False),
                save_prompt_button: gr.update(visible=False),
                evaluator_row: gr.update(visible=True),
                evaluation_nav_row: gr.update(visible=True),
                run_evaluation_button: gr.update(visible=True),
                back_to_criteria_button: gr.update(visible=True),
                # By default, hide "Analyze Results" and the result dataframe
                analyze_results_button: gr.update(visible=False),
                evaluation_result_df: gr.update(visible=False),
            }
            if (
                current_df.value is not None
                and hasattr(current_df.value, "attrs")
                and current_df.value.attrs.get("eval_done")
            ):
                # If a previous evaluation was completed, show the heading + dataframe
                updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
                updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
                updates[analyze_results_button] = gr.update(visible=True)

            return updates

        # Note that we pass df_state to show_evaluator_selection
        save_prompt_button.click(
            fn=show_evaluator_selection,
            inputs=[df_state],
            outputs=[
                save_prompt_button,
                criteria_group,
                evaluator_row,
                evaluation_nav_row,
                run_evaluation_button,
                back_to_criteria_button,
                loading_spinner,
                analyze_results_button,
                evaluation_result_df,
            ],
        )

        # Back to Criteria
        def back_to_criteria():
            return {
                save_prompt_button: gr.update(visible=True),
                criteria_group: gr.update(visible=True),
                evaluator_row: gr.update(visible=False),
                evaluation_nav_row: gr.update(visible=False),
                run_evaluation_button: gr.update(visible=False),
                # Hide the "Evaluation Complete" markdown
                loading_spinner: gr.update(visible=False),
                analyze_results_button: gr.update(visible=False),
                evaluation_result_df: gr.update(visible=False),
            }

        back_to_criteria_button.click(
            fn=back_to_criteria,
            inputs=[],
            outputs=[
                save_prompt_button,
                criteria_group,
                evaluator_row,
                evaluation_nav_row,
                run_evaluation_button,
                loading_spinner,
                analyze_results_button,
                evaluation_result_df
            ],
        )

        # Run evaluation
        def run_evaluation(judge_a, judge_b):
            # Show loading spinner
            yield {loading_spinner: gr.update(visible=True)}

            # Get template and mappings from prompt state
            template_str = prompt_state.value['template']
            mappings = prompt_state.value['mappings']
            evaluation_criteria = mappings.get('evaluation_criteria')

            # Create Jinja template for Judge B only
            template = Template(template_str)

            # Submit prompt to chosen models
            for index, row in df_state.value.iterrows():
                # Create a context dictionary for this row
                context = {}
                model_context = None
                expected_output = None

                for key, column in mappings.items():
                    if key == 'evaluation_criteria':
                        continue  # Skip as we handle it separately
                    elif column and column != 'None':
                        context[key] = str(row[column])
                        if column == 'model_context':
                            model_context = str(row[column])
                        elif column == 'expected_model_output':
                            expected_output = str(row[column])

                # For Judge B, render the template using Jinja
                current_prompt = template.render(**context)
                # For Judge A (Atla Selene), call get_atla_response directly
                response_a = get_atla_response(
                    "atla-selene",
                    model_input=context.get('model_input'),
                    model_output=context.get('model_output'),
                    model_context=model_context,
                    expected_output=expected_output,
                    evaluation_criteria=evaluation_criteria
                )
                response_b = get_model_response(
                    judge_b,
                    model_data.get(judge_b),
                    current_prompt
                )

                # Parse the responses - handle Atla response differently
                if isinstance(response_a, dict):  # Atla response
                    score_a, critique_a = response_a['score'], response_a['critique']
                else:  # Error case
                    score_a, critique_a = "Error", response_a

                score_b, critique_b = parse_model_response(response_b)

                df_state.value.loc[index, 'score_a'] = score_a
                df_state.value.loc[index, 'critique_a'] = critique_a
                df_state.value.loc[index, 'score_b'] = score_b
                df_state.value.loc[index, 'critique_b'] = critique_b

            import time
            time.sleep(2)

            # Hide loading spinner
            yield {loading_spinner: gr.update(visible=False)}

            # Show "Evaluation Complete" heading and the final DataFrame
            yield {
                loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
                evaluation_result_df: gr.update(value=df_state.value, visible=True),
                analyze_results_button: gr.update(visible=True),
            }

            # Store the "already run evaluation" flag safely in .attrs
            if hasattr(df_state.value, "attrs"):
                df_state.value.attrs["eval_done"] = True

        run_evaluation_button.click(
            fn=run_evaluation,
            inputs=[judge_a_dropdown, judge_b_dropdown],
            outputs=[loading_spinner, evaluation_result_df, analyze_results_button],
        )

    return model_selection_group, df_state, analyze_results_button
models.jsonl
ADDED
@@ -0,0 +1,19 @@
{"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
{"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
{"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
{"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
{"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
{"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
{"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
{"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
{"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
{"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
{"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest"}
{"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
{"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
{"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
{"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
{"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest"}
{"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
{"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
random_sample/__init__.py
ADDED
@@ -0,0 +1 @@
# This file can be empty - it just marks the directory as a Python package
random_sample/arena_interface.py
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import re
import gradio as gr

from dotenv import load_dotenv
load_dotenv()

from .gen_api_answer import (
    get_atla_response
)

from .prompts import (
    DEFAULT_EVAL_CRITERIA,
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX
)

from .random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response
)

from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS

def parse_variables(prompt):
    # Extract variables enclosed in double curly braces
    variables = re.findall(r"{{(.*?)}}", prompt)
    # Remove duplicates while preserving order
    seen = set()
    variables = [
        x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
    ]
    return variables


def get_final_prompt(eval_prompt, variable_values):
    # Replace variables in the eval prompt with their values
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt


def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),  # Clear score
        gr.update(value=""),  # Clear critique
        gr.update(value=ground_truth_msg, visible=compatible_mode),  # Set ground truth and visibility
    ]


def create_arena_interface():
    with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
        # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
        eval_prompt = gr.Textbox(
            value=DEFAULT_EVAL_PROMPT,
            visible=False
        )
        with gr.Row():
            # Left side - Input section
            with gr.Column(scale=1):
                with gr.Group():
                    human_input = gr.TextArea(
                        label="👩 User Input",
                        lines=5,
                        placeholder="Enter the human message here..."
                    )
                    with gr.Row():
                        generate_btn = gr.Button(
                            "Generate AI Response",
                            size="sm",
                            interactive=False
                        )

                    ai_response = gr.TextArea(
                        label="🤖 AI Response",
                        lines=10,
                        placeholder="Enter the AI response here..."
                    )

                    # Ground truth response (initially hidden)
                    ground_truth = gr.TextArea(
                        label="🎯 Ground truth response",
                        lines=10,
                        placeholder="Enter the ground truth response here...",
                        visible=False
                    )

                with gr.Row():
                    random_btn = gr.Button("🎲", scale=2)
                    send_btn = gr.Button(
                        value="Run evaluation",
                        variant="primary",
                        size="lg",
                        scale=8
                    )

            # Right side - Model outputs
            with gr.Column(scale=1):
                gr.Markdown("## 👩‍⚖️ Selene-Mini Evaluation")
                with gr.Group():
                    with gr.Row():
                        score = gr.Textbox(label="Score", lines=1, interactive=False)
                    critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                gr.Markdown("<br>")

        # Replace the "Edit Judge Prompt" Accordion section with:
        with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
            gr.Markdown("<br>")
            use_reference_toggle = gr.Checkbox(
                label="Use a reference response",
                value=False
            )

            # Hide the default prompt editor
            with gr.Column(visible=False) as default_prompt_editor:
                eval_prompt_editable = gr.TextArea(
                    value=DEFAULT_EVAL_PROMPT_EDITABLE,
                    label="Evaluation Criteria",
                    lines=12
                )

                with gr.Row(visible=False) as edit_buttons_row:
                    cancel_prompt_btn = gr.Button("Cancel")
                    save_prompt_btn = gr.Button("Save", variant="primary")

            # Show the compatible mode editor
            with gr.Column(visible=True) as compatible_prompt_editor:
                eval_criteria_text = gr.TextArea(
                    label="Evaluation Criteria",
                    lines=12,
                    value=DEFAULT_EVAL_CRITERIA,
                    placeholder="Enter the complete evaluation criteria and scoring rubric..."
                )
                with gr.Row(visible=False) as compatible_edit_buttons_row:
                    compatible_cancel_btn = gr.Button("Cancel")
                    compatible_save_btn = gr.Button("Save", variant="primary")

        eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
        is_editing = gr.State(False)  # Track editing state
        compatible_mode_state = gr.State(False)  # Track compatible mode state

        # Update model names after responses are generated
        def update_model_names(model_a, model_b):
            return gr.update(value=f"*Model: {model_a}*"), gr.update(
                value=f"*Model: {model_b}*"
            )

        # Store the last submitted prompt and variables for comparison
        last_submission = gr.State({})

        # Update the save/cancel buttons section in the compatible prompt editor
        def save_criteria(new_criteria, previous_criteria):
            return [
                gr.update(value=new_criteria),  # Update the criteria
                new_criteria,  # Update the previous criteria state
                gr.update(visible=False)  # Hide the buttons
            ]

        def cancel_criteria(previous_criteria):
            return [
                gr.update(value=previous_criteria),  # Revert to previous criteria
                previous_criteria,  # Keep the previous criteria state
                gr.update(visible=False)  # Hide the buttons
            ]

        def show_criteria_edit_buttons(current_value, previous_value):
            # Show buttons only if the current value differs from the previous value
            return gr.update(visible=current_value != previous_value)

        # Add handlers for save/cancel buttons and criteria changes
        compatible_save_btn.click(
            fn=save_criteria,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        compatible_cancel_btn.click(
            fn=cancel_criteria,
            inputs=[eval_prompt_previous],
            outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
        )

        eval_criteria_text.change(
            fn=show_criteria_edit_buttons,
            inputs=[eval_criteria_text, eval_prompt_previous],
            outputs=compatible_edit_buttons_row
        )

        # Function to toggle visibility based on compatible mode
        def toggle_use_reference(checked):
            if checked:
                human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
                return {
                    ground_truth: gr.update(visible=True, value=ground_truth_msg),
                    human_input: gr.update(value=human_msg),
                    ai_response: gr.update(value=ai_msg),
                    score: gr.update(value=""),
                    critique: gr.update(value=""),
                    random_btn: gr.update(value="🎲", variant="secondary"),
                }
            else:
                return {
                    ground_truth: gr.update(visible=False)
                }

        # Update the change handler to include all necessary outputs
        use_reference_toggle.change(
            fn=toggle_use_reference,
            inputs=[use_reference_toggle],
            outputs=[
                ground_truth,
                human_input,
                ai_response,
                score,
                critique,
                random_btn,
            ]
        )

        # Add a new state variable to track first game
        first_game_state = gr.State(True)  # Initialize as True

        # Update the submit function to parse the evaluation criteria
        def submit_and_store(
            use_reference,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth_input,
        ):
            # Build prompt data dictionary
            prompt_data = {
                'human_input': human_input,
                'ai_response': ai_response,
                'ground_truth_input': ground_truth_input if use_reference else None,
                'eval_criteria': eval_criteria_text,
            }

            # Get response from Atla
            response = get_atla_response(
                model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
                prompt=prompt_data,
                max_tokens=500,
                temperature=0.01
            )

            # Response now contains score and critique directly
            if isinstance(response, dict) and 'score' in response and 'critique' in response:
                score = str(response['score'])
                critique = response['critique']
            else:
                # Handle error case
                score = "Error"
                critique = str(response)

            return [
                score,
                critique,
                gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
                gr.update(value="🎲"),
            ]

        # Update the click handler to use False for is_first_game after first submission
        def create_submit_handler():
            first_game = True

            def handler(*args):
                nonlocal first_game
                result = submit_and_store(*args)
                first_game = False  # Set to False after first submission
                return result

            return handler

        # Update the send_btn click handler
        send_btn.click(
            fn=submit_and_store,
            inputs=[
                use_reference_toggle,
                eval_criteria_text,
                human_input,
                ai_response,
                ground_truth,
            ],
            outputs=[
                score,
                critique,
                send_btn,
                random_btn,
            ],
        )

        # Add random button handler
        random_btn.click(
            fn=populate_random_example,
            inputs=[use_reference_toggle],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

        # Add input change handlers
        def handle_input_change():
            """Reset UI state when inputs are changed"""
            return [
                gr.update(value="Run evaluation", variant="primary"),  # send_btn
                gr.update(value="🎲", variant="secondary"),  # random_btn
            ]

        # Update the change handlers for inputs
        human_input.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        ai_response.change(
            fn=handle_input_change,
            inputs=[],
            outputs=[send_btn, random_btn]
        )

        generate_btn.click(
            fn=lambda msg: (
                generate_ai_response(msg)[0],  # Only take the response text
                gr.update(
                    value="Generate AI Response",  # Keep the label
                    interactive=False  # Disable the button
                )
            ),
            inputs=[human_input],
            outputs=[ai_response, generate_btn]
        )

        human_input.change(
            fn=lambda x: gr.update(interactive=bool(x.strip())),
            inputs=[human_input],
            outputs=[generate_btn]
        )

        # Update the demo.load to include the random example population
        interface.load(
            fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
            inputs=[],
            outputs=[
                human_input,
                ai_response,
                random_btn,
                score,
                critique,
                ground_truth,
            ]
        )

    return interface

if __name__ == "__main__":
    demo = create_arena_interface()
    demo.launch()
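
The {{variable}} templating handled by parse_variables and get_final_prompt above is easiest to see in isolation. The following is a minimal, standalone sketch (a re-implementation for illustration, not an import from this repo), showing how a prompt template's variables are extracted and then substituted:

import re

def parse_variables(prompt):
    # Collect unique {{variable}} names in order of first appearance
    seen = set()
    return [
        m.strip()
        for m in re.findall(r"{{(.*?)}}", prompt)
        if not (m.strip() in seen or seen.add(m.strip()))
    ]

def get_final_prompt(eval_prompt, variable_values):
    # Substitute each {{variable}} with its value
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt

template = "Rate the answer.\n[User Query]: {{human_input}}\n[AI Response]: {{ai_response}}"
print(parse_variables(template))   # ['human_input', 'ai_response']
print(get_final_prompt(template, {
    "human_input": "How do muscles grow?",
    "ai_response": "Through hypertrophy driven by resistance training.",
}))
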
random_sample/common.py
ADDED
@@ -0,0 +1,126 @@
# Page Headers
MAIN_TITLE = "# Selene-Mini"

# How it works section
HOW_IT_WORKS = """
Try running evals with Selene-Mini in this playground! Our HF model card can be found [here](https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B).
"""

BATTLE_RULES = """
## 🤺 Choose the winner
1. Define your scoring criteria in the **Evaluator Prompt**
2. Add a test case to the **Sample to evaluate**
3. Test the evaluators & vote for the model that best aligns with your judgement!
\n
Variables defined in your prompt with {{double curly braces}} map to input fields under **Sample to evaluate**.

<br>
"""

# CSS Styles
CSS_STYLES = """
.prompt-row {
    align-items: flex-start !important;
}
.send-button-row {
    display: flex;
    justify-content: flex-end;
    margin-top: 8px;
}
/* Style for metric buttons */
.metric-button-active {
    background-color: #2B3A55 !important;
    color: white !important;
}
/* Add this to ensure proper button spacing */
.metric-buttons-row {
    gap: 8px;
}
"""

# Default Eval Prompt
EVAL_DESCRIPTION = """
## 📝 Tips
**Precise evaluation criteria lead to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
- Evaluation criteria
- Scoring rubric
- Examples (Optional)
"""

# Voting Section Header
VOTING_HEADER = """
# Start Voting Now
"""

# Acknowledgements
ACKNOWLEDGEMENTS = """
<br><br>
# Acknowledgements

We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.

We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
"""

# Policy Content
POLICY_CONTENT = """
# About Atla

Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
<br><br>
# [Our Mission](https://www.atla-ai.com/company)

By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
<br><br>
# Judge Arena Policy

## Overview

Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)

## Transparency

- **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
- **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
- **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.

## Model Inclusion Criteria

Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a. judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:

- **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
- **Promptable:** The model must be promptable to evaluate in different scoring formats, for different criteria.
- **Accessibility**:
    - **Public API Access**: Models accessible through public APIs without restrictive barriers.
    - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.

## Leaderboard Management

- **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
- **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
- **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.

*This policy might be updated to reflect changes in our practices or in response to community feedback.*
<br><br>
# FAQ

**Isn't this the same as Chatbot Arena?**

We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.

**Why should I trust this leaderboard?**

We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.

**Who funds this effort?**

Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.

**What is Atla working on?**

We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
<br><br>
# Get in touch
We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.gg/yNpUAMqs).
\nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
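
The leaderboard policy above fixes an initial rating of 1200 and a K-factor of 32. The Arena's actual rating code is not part of this repo, so the following is only an illustrative sketch of a standard Elo update under those two parameters:

def elo_update(rating_a, rating_b, score_a, k=32):
    """Standard Elo update; score_a is 1 for an A win, 0 for a loss, 0.5 for a tie."""
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    delta = k * (score_a - expected_a)
    return rating_a + delta, rating_b - delta

# Two judges starting at the initial rating of 1200; judge A wins one vote.
print(elo_update(1200, 1200, 1))  # (1216.0, 1184.0) -- K=32 caps the swing at 32 points
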
random_sample/gen_api_answer.py
ADDED
@@ -0,0 +1,77 @@
from openai import OpenAI
import anthropic
from together import Together
import os
from atla import Atla
from dotenv import load_dotenv
from .prompts import (
    JUDGE_SYSTEM_PROMPT,
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE
)

load_dotenv()

# Initialize clients
anthropic_client = anthropic.Anthropic()
openai_client = OpenAI()
together_client = Together()
hf_api_key = os.getenv("HF_API_KEY")

atla_client = Atla()

def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
    """Get response from OpenAI API"""
    try:
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_completion_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error with OpenAI model {model_name}: {str(e)}"

def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
    """Get response from Anthropic API"""
    try:
        response = anthropic_client.messages.create(
            model=model_name,
            max_tokens=max_tokens,
            temperature=temperature,
            system=system_prompt,
            messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
        )
        return response.content[0].text
    except Exception as e:
        return f"Error with Anthropic model {model_name}: {str(e)}"


def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
    """Get response from Atla API"""
    try:
        # Extract components from the prompt data
        model_input = prompt.get('human_input', '')
        model_output = prompt.get('ai_response', '')
        expected_output = prompt.get('ground_truth_input')
        evaluation_criteria = prompt.get('eval_criteria', '')

        response = atla_client.evaluation.create(
            model_id="atla-selene",
            model_input=model_input,
            model_output=model_output,
            expected_model_output=expected_output if expected_output else None,
            evaluation_criteria=evaluation_criteria,
        )

        # Return the score and critique directly
        return {
            "score": response.result.evaluation.score,
            "critique": response.result.evaluation.critique
        }
    except Exception as e:
        return f"Error with Atla model {model_name}: {str(e)}"
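
For reference, get_atla_response expects its prompt argument to be the dictionary that arena_interface.submit_and_store builds, and it returns either a {"score", "critique"} dict or an error string. A hedged usage sketch, assuming the API keys listed in .env.example (plus an Atla key) are set so the module-level clients can initialise:

from random_sample.gen_api_answer import get_atla_response

result = get_atla_response(
    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
    prompt={
        "human_input": "How do muscles grow?",
        "ai_response": "Muscles grow because they absorb sunlight like plants do.",
        "ground_truth_input": None,  # optional reference answer
        "eval_criteria": "Is the response factually accurate? Score 1-5.",
    },
)
# On success this is a dict like {"score": 1, "critique": "..."};
# on failure it is an error string, so callers should check the type.
if isinstance(result, dict):
    print(result["score"], result["critique"])
else:
    print(result)
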
random_sample/prompts.py
ADDED
@@ -0,0 +1,94 @@
# Default values for compatible mode
DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?

Scoring Rubric:
Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""

# Default Eval Prompt
DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?

Scoring Rubric:
Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.

[User Query]: {{input}}

[AI Response]: {{response}}"""

# Split the eval prompt into editable and fixed parts
DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?

Scoring Rubric:
Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""

# Fixed suffix that will always be appended
FIXED_EVAL_SUFFIX = """
[User Query]: {{human_input}}

[AI Response]: {{ai_response}}"""

ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score integer, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
Here are some rules of the evaluation:
(1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.

Your reply should strictly follow this format:
**Reasoning:** <Your feedback>

**Result:** <Your score>

Here is the data:

Instruction:
```
{human_input}
```

Response:
```
{ai_response}
```

Score Rubrics:
{eval_criteria}"""

ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.

Here are some rules of the evaluation:
(1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.

Your reply should strictly follow this format:
**Reasoning:** <Your feedback>

**Result:** <Your score>

Here is the data:

Instruction:
```
{human_input}
```

Response:
```
{ai_response}
```

Score Rubrics:
{eval_criteria}

Reference answer:
{ground_truth_input}"""

# Judge system prompt for non-Prometheus models
JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
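
JUDGE_SYSTEM_PROMPT asks non-Prometheus judges for a strict JSON object with "feedback" and "result" fields. A small, illustrative helper (not part of this repo) showing how such a reply could be parsed defensively, with a regex fallback for slightly malformed output:

import json
import re

def parse_judge_reply(reply: str):
    """Parse {"feedback": "...", "result": <score>} with a regex fallback."""
    try:
        data = json.loads(reply)
        return data.get("feedback", ""), data.get("result")
    except json.JSONDecodeError:
        # Fall back to pulling the fields out of a non-strict reply
        feedback = re.search(r'"feedback"\s*:\s*"(.*?)"', reply, re.DOTALL)
        result = re.search(r'"result"\s*:\s*([0-9.]+)', reply)
        return (
            feedback.group(1) if feedback else "",
            float(result.group(1)) if result else None,
        )

print(parse_judge_reply('{"feedback": "Clear and accurate.", "result": 5}'))
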
random_sample/random_sample_generation.py
ADDED
@@ -0,0 +1,183 @@
from openai import OpenAI
import anthropic
import json
import re
import random
import os
from .gen_api_answer import get_openai_response, get_anthropic_response

# Initialize clients
anthropic_client = anthropic.Anthropic()
openai_client = OpenAI()

GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""

GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""

GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
1. A random human message (not a simple one-word answer question)
2. An AI response (a few sentences long)
3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)

Format as JSON with "human", "ai", and "ground_truth" fields."""

RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."

def get_random_human_ai_pair():
    # Select system prompt with specified probabilities
    system_prompt = random.choices(
        [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
        weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
    )[0]

    # Log which type of response is being generated
    prompt_type = {
        GOOD_SYSTEM_PROMPT: "good",
        BAD_SYSTEM_PROMPT: "bad",
        AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
    }[system_prompt]
    print(f"Generating {prompt_type} response")

    # Randomly choose between GPT-3.5 and Claude with 50/50 weights
    model_choice = random.choices([
        ("gpt-3.5-turbo", get_openai_response),
        ("claude-3-5-haiku-latest", get_anthropic_response)
    ], weights=[0.5, 0.5])[0]
    model_name, api_func = model_choice

    # Generate response using selected model
    response = api_func(
        model_name=model_name,
        prompt=GENERATION_PROMPT,
        system_prompt=system_prompt,
        max_tokens=500,
        temperature=1
    )

    # Define default messages
    default_human = "How do muscles grow?"
    default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""

    try:
        # Clean the response by replacing newlines with spaces
        cleaned_response = response.replace('\n', ' ').replace('\r', '')
        data = json.loads(cleaned_response)

        # Extract messages with fallbacks
        human_message = data.get("human", default_human)
        ai_message = data.get("ai", default_ai)

        # Debug logging
        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")

    except Exception as e:
        print(f"Failed to parse response: {str(e)}\n {response}")
        human_message = default_human
        ai_message = default_ai

    return human_message, ai_message

def get_random_human_ai_ground_truth_pair():
    # Select system prompt with specified probabilities
    system_prompts = {
        "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
        "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
        "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
    }

    prompt_type = random.choices(
        ["good", "bad", "ambiguous"],
        weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
    )[0]

    system_prompt = system_prompts[prompt_type]
    print(f"Generating {prompt_type} response with ground truth")

    # Randomly choose between GPT-3.5 and Claude with 50/50 weights
    model_choice = random.choices([
        ("gpt-3.5-turbo", get_openai_response),
        ("claude-3-5-haiku-latest", get_anthropic_response)
    ], weights=[0.5, 0.5])[0]
    model_name, api_func = model_choice

    # Define default messages
    defaults = {
        "human": "How do muscles grow?",
        "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
        "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:

1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.

2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.

3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.

4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.

5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.

6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).

This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
    }

    # Generate response using selected model
    response = api_func(
        model_name=model_name,
        prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
        system_prompt=system_prompt,
        max_tokens=1000,  # Increased token limit to accommodate ground truth
        temperature=1
    )

    # Parse the response to get all three components
    try:
        # Clean the response by replacing newlines with spaces
        cleaned_response = response.replace('\n', ' ').replace('\r', '')
        data = json.loads(cleaned_response)

        # Extract messages with fallbacks
        human_message = data.get("human", defaults["human"])
        ai_message = data.get("ai", defaults["ai"])
        ground_truth = data.get("ground_truth", defaults["ground_truth"])

        # Debug logging
        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")

    except Exception as e:
        print(f"Failed to parse response: {str(e)}\n {response}")
        human_message = defaults["human"]
        ai_message = defaults["ai"]
        ground_truth = defaults["ground_truth"]

    return human_message, ai_message, ground_truth

def generate_ai_response(human_msg):
    """Generate AI response using GPT-3.5-turbo"""
    if not human_msg.strip():
        return "", False

    try:
        response = get_openai_response(
            "gpt-3.5-turbo",
            human_msg,
            system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
            max_tokens=1000,
            temperature=1
        )
        # Extract just the response content since we don't need JSON format here
        if isinstance(response, str):
            # Clean up any JSON formatting if present
            try:
                data = json.loads(response)
                response = data.get("content", response)
            except json.JSONDecodeError:
                pass
        return response, False  # Return response and button interactive state
    except Exception as e:
        return f"Error generating response: {str(e)}", False
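
A hedged usage sketch for these generators: it assumes live OpenAI and Anthropic API keys are loaded, and the output varies from run to run because the prompts above sample good/bad/ambiguous responses at 20/20/60 weights.

from random_sample.random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response,
)

human, ai = get_random_human_ai_pair()
human_gt, ai_gt, ground_truth = get_random_human_ai_ground_truth_pair()
regenerated, _ = generate_ai_response(human)  # second value is the button-state flag

print(human)
print(ai)
print(ground_truth[:80])
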
random_sample_tab.py
ADDED
@@ -0,0 +1,6 @@
import gradio as gr
from random_sample.arena_interface import create_arena_interface

def random_sample_tab():
    with gr.TabItem("Random samples"):
        return create_arena_interface()
requirements.txt
ADDED
@@ -0,0 +1,11 @@
# Core dependencies
pymongo
gradio
python-dotenv
openai
anthropic
together

# Development dependencies
black
pytest
score_handler.py
ADDED
@@ -0,0 +1,242 @@
import gradio as gr
import pandas as pd
import numpy as np
import tempfile
import os

def handle_analysis(df_state, model_selection_group, analyze_results_button):
    with gr.Group(visible=False) as analysis_group:
        gr.Markdown("## Analysis")

        # Dropdown to select the accuracy measurement
        accuracy_measurement_dropdown = gr.Dropdown(
            choices=['Accuracy', 'Pearson Correlation'],
            label='Select Evaluation Metric'
        )

        # We remove the two compare dropdowns and only keep ground truth
        with gr.Row():
            ground_truth_dropdown = gr.Dropdown(
                choices=[],
                label='Select True Label Column'
            )

        # Define two side-by-side boxes for results
        with gr.Row():
            judge_a_result = gr.Textbox(
                label="Judge A Results",
                lines=10,
                interactive=False,
                visible=False
            )
            judge_b_result = gr.Textbox(
                label="Judge B Results",
                lines=10,
                interactive=False,
                visible=False
            )

        # Move the JSON output below those textboxes and buttons
        json_output = gr.File(label="Results .json", interactive=False, visible=False)

        # Now place the row of buttons AFTER the json_output
        with gr.Row():
            back_to_results_button = gr.Button("← Back to Results")
            calculate_button = gr.Button("Calculate")
            download_button = gr.Button("Download Results as JSON")

    # Show analysis group
    def show_analysis_group():
        df = df_state.value
        if df is not None:
            columns = df.columns.tolist()
        else:
            columns = []
        # Now we only update ground_truth_dropdown
        return (
            gr.update(visible=True),     # analysis_group
            gr.update(visible=False),    # model_selection_group
            gr.update(choices=columns),  # ground_truth_dropdown
        )

    analyze_results_button.click(
        fn=show_analysis_group,
        inputs=[],
        outputs=[
            analysis_group,
            model_selection_group,
            ground_truth_dropdown  # only this one
        ]
    )

    def back_to_results():
        return (
            gr.update(visible=False),  # Hide analysis_group
            gr.update(visible=True),   # Show model_selection_group
        )

    back_to_results_button.click(
        fn=back_to_results,
        inputs=[],
        outputs=[analysis_group, model_selection_group]
    )

    def calculate_multiple_accuracies(measurement, ground_truth_col, df_state):
        # Hard-code 'score_a' and 'score_b' as the columns to compare
        col2_name = "score_a"
        col3_name = "score_b"
        df = df_state.value
        if df is None:
            # Return two "No DataFrame" messages
            return (
                gr.update(value="No DataFrame available.", visible=True),
                gr.update(value="No DataFrame available.", visible=True)
            )

        # Check if user-supplied ground_truth_col is valid
        missing_columns = [col for col in [ground_truth_col, col2_name, col3_name] if col not in df.columns]
        if missing_columns:
            msg = f"Selected columns not found in DataFrame: {', '.join(missing_columns)}."
            # Return same message in both boxes
            return (
                gr.update(value=msg, visible=True),
                gr.update(value=msg, visible=True)
            )

        # Compare ground_truth_col with score_a
        result1 = calculate_accuracy(
            measurement, ground_truth_col, col2_name,
            df_state, compare_to_ground_truth=True
        )
        text_a = f"Comparison: '{ground_truth_col}' vs. 'Judge A'\n{result1}"

        # Compare ground_truth_col with score_b
        result2 = calculate_accuracy(
            measurement, ground_truth_col, col3_name,
            df_state, compare_to_ground_truth=True
        )
        text_b = f"Comparison: '{ground_truth_col}' vs. 'Judge B'\n{result2}"

        # Return them separately, each is for a different Textbox
        return (
            gr.update(value=text_a, visible=True),
            gr.update(value=text_b, visible=True)
        )

    # Now the calculate_button only expects measurement, ground_truth_col, df_state
    calculate_button.click(
        fn=calculate_multiple_accuracies,
        inputs=[
            accuracy_measurement_dropdown,
            ground_truth_dropdown,
            df_state
        ],
        outputs=[judge_a_result, judge_b_result]
    )

    def create_json_download(df_state):
        if df_state.value is None:
            return gr.update(value=None, visible=True)

        json_str = df_state.value.to_json(orient='records', indent=2)
        temp_dir = tempfile.gettempdir()
        file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json_str)
        return gr.update(value=file_path, visible=True)

    download_button.click(
        fn=create_json_download,
        inputs=[df_state],
        outputs=[json_output]
    )

# Helper functions

def calculate_accuracy(measurement, col1, col2, df_state, compare_to_ground_truth=False):
    df = df_state.value
    # No changes here (function remains sacred as per your request)
    if df is None:
        return "No DataFrame available."
    if col1 not in df.columns or col2 not in df.columns:
        return "Selected columns not found in DataFrame."

    results_df = pd.DataFrame()
    if compare_to_ground_truth:
        results_df['ground_truth'] = df[col1]
        results_df['predicted'] = df[col2]
    else:
        results_df['extracted_winner'] = df[col1]
        results_df['truth_result'] = df[col2]

    if measurement == 'Accuracy':
        result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
        output_text = (
            f"Overall Accuracy: {result['overall_accuracy']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    elif measurement == 'Pearson Correlation':
        result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
        output_text = (
            f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
            f"Number of NaNs: {result['num_extracted_nan']}"
        )
    else:
        output_text = "Unknown measurement selected."

    return output_text

def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
    # Compute 'results' column based on whether comparing to ground truth
    if compare_to_ground_truth:
        # NEW: convert both columns to float
        results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
        results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)

        results_df['results'] = results_df['ground_truth'] == results_df['predicted']
        num_extracted_nan = int(results_df['predicted'].isna().sum())
    else:
        results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
        num_extracted_nan = int(results_df['extracted_winner'].isna().sum())

    overall_accuracy = results_df['results'].mean()

    return {
        "overall_accuracy": overall_accuracy,
        "num_extracted_nan": num_extracted_nan,
    }

def process_single_rating_pearson_correlation(
    results_df: pd.DataFrame, compare_to_ground_truth=False
) -> dict:
    if compare_to_ground_truth:
        pred_col = 'predicted'
        truth_col = 'ground_truth'
    else:
        pred_col = 'extracted_winner'
        truth_col = 'truth_result'

    results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
    results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)

    numerical_results = results_df.dropna(subset=[pred_col, truth_col])

    if len(numerical_results) == 0:
        pearson_corr = np.nan
    else:
        pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])

    num_extracted_nan = int(results_df[pred_col].isna().sum())

    return {
        "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
        "num_extracted_nan": num_extracted_nan,
    }

def convert_to_float_or_nan(extracted_input):
    if extracted_input is None or pd.isna(extracted_input):
        return np.nan
    try:
        return float(extracted_input)
    except ValueError:
        return np.nan
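
The Accuracy and Pearson Correlation options boil down to the pandas operations in the helpers above: exact-match rate after numeric coercion, and correlation over the rows where both scores are numeric. A toy illustration of the same computations (made-up data, independent of the app):

import numpy as np
import pandas as pd

# Toy data mirroring the ground-truth vs. 'score_a' comparison done above.
df = pd.DataFrame({
    "human_score": [5, 3, 1, 4],
    "score_a":     [5, 2, 1, np.nan],  # a missing/unparseable judge score becomes NaN
})

# Accuracy: exact match rate (NaNs never match, so they count as misses).
accuracy = (df["human_score"] == df["score_a"]).mean()

# Pearson correlation on the rows where both scores are numeric.
valid = df.dropna(subset=["human_score", "score_a"])
pearson = valid["score_a"].corr(valid["human_score"])

print(f"Accuracy: {accuracy}, Pearson: {pearson:.3f}, NaNs: {int(df['score_a'].isna().sum())}")
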
ui_components.py
ADDED
@@ -0,0 +1,4 @@
# ui_components.py
import gradio as gr

save_prompt_button = gr.Button("Save Prompt", visible=False)
utils.py
ADDED
@@ -0,0 +1,37 @@
from dataclasses import dataclass
from datetime import datetime
import logging

def parse_variables(prompt):
    import re
    # Extract variables enclosed in double curly braces
    variables = re.findall(r"{{(.*?)}}", prompt)
    # Remove duplicates while preserving order
    seen = set()
    variables = [
        x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
    ]
    return variables

def get_logger(sink_name: str = "core_utils") -> logging.Logger:
    logging.basicConfig(
        format="%(asctime)s,%(msecs)03d %(levelname)-8s "
        "[%(filename)s:%(lineno)d] %(message)s",
        datefmt="%Y-%m-%d:%H:%M:%S",
        level=logging.INFO,
        force=True,
    )
    logger = logging.getLogger(sink_name)
    return logger


@dataclass
class Vote:
    timestamp: str
    prompt: str
    response_a: str
    response_b: str
    model_a: str
    model_b: str
    winner: str
    judge_id: str
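
A small usage sketch for these utilities; the field values below are made up for illustration only.

from datetime import datetime
from utils import Vote, get_logger

logger = get_logger("sandbox")
vote = Vote(
    timestamp=datetime.now().isoformat(),
    prompt="Is the response helpful?",
    response_a="Feedback and score from judge A...",
    response_b="Feedback and score from judge B...",
    model_a="judge-a",
    model_b="judge-b",
    winner="model_a",
    judge_id="anonymous-user-1",
)
logger.info("Recorded vote: %s", vote)
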