kaikaidai committed
Commit 201dd80 · verified · 1 Parent(s): 89ead24

Synced repo using 'sync_with_huggingface' Github Action

.env.example ADDED
@@ -0,0 +1,3 @@
+ TOGETHER_API_KEY=your_together_api_key_here
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
+ OPENAI_API_KEY=your_openai_api_key_here
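These keys are consumed further down: `get_llm_answer.py` calls `load_dotenv()` and then constructs the OpenAI, Anthropic and Together clients, which read their keys from the environment. A minimal sketch of checking that the `.env` values are actually picked up (the check itself is illustrative):

```python
# sketch: confirm the keys in .env are visible before the API clients are built
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

for key in ("TOGETHER_API_KEY", "ANTHROPIC_API_KEY", "OPENAI_API_KEY"):
    # the OpenAI, Anthropic and Together SDKs read these from the environment
    print(key, "set" if os.getenv(key) else "missing")
```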
app.py ADDED
@@ -0,0 +1,57 @@
+ # sandbox_runner.py
+
+ import gradio as gr
+ from data_handler import upload_test_data
+ from criteria_handler import select_evaluation_criteria
+ from model_handler import select_evaluators
+ from score_handler import handle_analysis
+ from random_sample_tab import random_sample_tab
+
+ def run_sandbox():
+     with gr.Blocks(css="""
+         .truncate_cells table {
+             table-layout: fixed !important;
+             width: 100% !important;
+         }
+         .truncate_cells table td,
+         .truncate_cells table th {
+             white-space: nowrap !important;
+             overflow: hidden !important;
+             text-overflow: ellipsis !important;
+             max-width: 200px !important;
+             text-align: left !important;
+             vertical-align: top !important;
+         }
+     """) as demo:
+         gr.Markdown("# Atla Testing Sandbox")
+         with gr.Tabs():
+             # Random samples tab
+             random_sample_tab()
+
+             # Sandbox tab
+             with gr.TabItem("Custom Dataset"):
+                 # Initialize state object to track the DataFrame
+                 df_state = gr.State(value=None)
+                 # Initialize state object to track the prompt
+                 prompt_state = gr.State(value=None)
+                 # Initialize the evaluation_complete flag
+                 evaluation_complete = gr.State(value=None)
+
+                 # Data upload
+                 data_upload_group, df_state = upload_test_data(df_state)
+
+                 # Criteria selection
+                 criteria_group, df_state, prompt_state, save_prompt_button = \
+                     select_evaluation_criteria(data_upload_group, df_state, prompt_state)
+
+                 # Models selection
+                 model_selection_group, df_state, analyze_results_button = \
+                     select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button)
+
+                 # Result analysis
+                 handle_analysis(df_state, model_selection_group, analyze_results_button)
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     run_sandbox()
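`app.py` also imports `score_handler` and `random_sample_tab`, which are not shown in this commit. Purely as a hypothetical sketch of the call signatures it expects (not the real modules), stubs like these would be enough to launch the sandbox:

```python
# hypothetical stubs matching the signatures app.py imports; the real
# score_handler / random_sample_tab modules are not shown in this commit
import gradio as gr

def handle_analysis(df_state, model_selection_group, analyze_results_button):
    # placeholder: the real version renders analysis of the evaluated DataFrame
    analyze_results_button.click(fn=lambda: None, inputs=[], outputs=[])

def random_sample_tab():
    # placeholder tab so gr.Tabs() in run_sandbox has something to render
    with gr.TabItem("Random samples"):
        gr.Markdown("Random sample playground goes here.")
```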
common.py ADDED
@@ -0,0 +1,155 @@
1
+ # Page Headers
2
+ MAIN_TITLE = "# Judge Arena - Free LLM Evals to test your GenAI application"
3
+
4
+ # How it works section
5
+ HOW_IT_WORKS = """
6
+ - **Run any form of evaluation:** from simple hallucination detection to qualitative interpretations
7
+ - **Evaluate anything:** coding, analysis, creative writing, math, or general knowledge
8
+ """
9
+
10
+ BATTLE_RULES = """
11
+ ## 🤺 Battle Rules:
12
+ - Both AIs stay anonymous - if either reveals its identity, the duel is void
13
+ - Choose the LLM judge that most aligns with your judgement
14
+ - If both score the same - choose the critique that you prefer more!
15
+ <br><br>
16
+ """
17
+
18
+ # CSS Styles
19
+ CSS_STYLES = """
20
+ .prompt-row {
21
+ align-items: flex-start !important;
22
+ }
23
+ .send-button-row {
24
+ display: flex;
25
+ justify-content: flex-end;
26
+ margin-top: 8px;
27
+ }
28
+ /* Style for metric buttons */
29
+ .metric-button-active {
30
+ background-color: #2B3A55 !important;
31
+ color: white !important;
32
+ }
33
+ /* Add this to ensure proper button spacing */
34
+ .metric-buttons-row {
35
+ gap: 8px;
36
+ }
37
+ """
38
+
39
+ # Default Eval Prompt
40
+ EVAL_DESCRIPTION = """
41
+ ## 📝 Instructions
42
+ **Precise evaluation criteria lead to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
43
+ - Evaluation criteria
44
+ - Scoring rubric
45
+ - (Optional) Examples\n
46
+
47
+ **Any variables you define in your prompt using {{double curly braces}} will automatically map to the corresponding input fields under "Sample to evaluate" section on the right.**
48
+
49
+ <br><br>
50
+ """
51
+
52
+ DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on [INSERT CRITERIA]
53
+
54
+ Score:
55
+ A score of 1 means that the response's answer meets all of the evaluation criteria.
56
+ A score of 0 means that the response's answer does not meet all of the evaluation criteria.
57
+
58
+ Here is the data:
59
+ [BEGIN DATA]
60
+ ***
61
+ [User Query]: {{input}}
62
+ ***
63
+ [Response]: {{response}}
64
+ ***
65
+ [END DATA]"""
66
+
67
+ # Default Variable Values
68
+ DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?
69
+ A) Jaguar
70
+ B) Toucan
71
+ C) Polar Bear
72
+ D) Sloth"""
73
+ DEFAULT_RESPONSE = "C) Polar Bear"
74
+
75
+ # Voting Section Header
76
+ VOTING_HEADER = """
77
+ # Start Voting Now
78
+ """
79
+
80
+ # Acknowledgements
81
+ ACKNOWLEDGEMENTS = """
82
+ <br><br><br>
83
+ # Acknowledgements
84
+
85
+ We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
86
+
87
+ We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
88
+ """
89
+
90
+ # Policy Content
91
+ POLICY_CONTENT = """
92
+ # About Atla
93
+
94
+ Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
95
+ <br><br>
96
+ # Our Mission
97
+
98
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
99
+ <br><br>
100
+ # Judge Arena Policy
101
+
102
+ ## Overview
103
+
104
+ Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
105
+
106
+ ## Transparency
107
+
108
+ - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
109
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
110
+ - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
111
+
112
+ ## Model Inclusion Criteria
113
+
114
+ Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
115
+
116
+ - **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
117
+ - **Adaptable**: The model must be promptable to evaluate in different scoring formats and against different criteria.
118
+ - **Accessibility**:
119
+ - **Public API Access**: Models accessible through public APIs without restrictive barriers.
120
+ - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
121
+
122
+ ## Leaderboard Management
123
+
124
+ - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
125
+ - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
126
+ - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
127
+
128
+ This policy might be updated to reflect changes in our practices or in response to community feedback.
129
+
130
+ # FAQ
131
+
132
+ **Isn't this the same as Chatbot Arena?**
133
+
134
+ We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals, to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
135
+
136
+ **What are the Evaluator Prompt Templates based on?**
137
+
138
+ As a quick start, we've set up templates that cover the most popular evaluation metrics out there on LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular datasets from academia - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
139
+
140
+ These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
141
+
142
+ **Why should I trust this leaderboard?**
143
+
144
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
145
+
146
+ **Who funds this effort?**
147
+
148
+ Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
149
+
150
+ **What is Atla working on?**
151
+
152
+ We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
153
+ <br><br>
154
+ # Get in touch
155
+ Feel free to email us at [[email protected]](mailto:[email protected]) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
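The leaderboard policy above describes an Elo system with a 1500 starting rating and a K-factor of 32. A small sketch of that update rule, separate from whatever the production ranking code does:

```python
# sketch of the Elo update described above: 1500 starting rating, K-factor 32
def expected_score(rating_a: float, rating_b: float) -> float:
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(rating_a: float, rating_b: float, score_a: float, k: float = 32.0):
    """score_a is 1.0 if judge A wins the vote, 0.0 if it loses, 0.5 for a tie."""
    e_a = expected_score(rating_a, rating_b)
    new_a = rating_a + k * (score_a - e_a)
    new_b = rating_b + k * ((1.0 - score_a) - (1.0 - e_a))
    return new_a, new_b

# e.g. two fresh judges where A wins one vote:
# update_elo(1500, 1500, 1.0) -> (1516.0, 1484.0)
```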
criteria_handler.py ADDED
@@ -0,0 +1,222 @@
1
+ # criteria_handler.py
2
+
3
+ import gradio as gr
4
+ import re
5
+ from eval_criteria_library import EXAMPLE_METRICS
6
+
7
+ def select_evaluation_criteria(data_upload_group, df_state, prompt_state):
8
+ with gr.Group(visible=True) as criteria_group:
9
+ select_eval_criteria_button = gr.Button("Select Evaluation Criteria", visible=False)
10
+
11
+ criteria_dropdown = gr.Dropdown(
12
+ choices=list(EXAMPLE_METRICS.keys()),
13
+ label="Choose Evaluation Criteria",
14
+ value=list(EXAMPLE_METRICS.keys())[0],
15
+ visible=False
16
+ )
17
+
18
+ with gr.Row(visible=False) as mapping_row:
19
+ with gr.Column():
20
+ # Left column - Evaluation Criteria Editor
21
+ prompt_editor = gr.Textbox(
22
+ label="Evaluation Criteria",
23
+ lines=15,
24
+ visible=False,
25
+ placeholder="Enter the evaluation criteria/rubric here..."
26
+ )
27
+ with gr.Column():
28
+ # Right column - Required and Optional Variable Mapping
29
+ # Required mappings
30
+ input_mapping = gr.Dropdown(
31
+ choices=[],
32
+ label="Map 'model_input' to column (Required)",
33
+ interactive=True,
34
+ visible=False
35
+ )
36
+ output_mapping = gr.Dropdown(
37
+ choices=[],
38
+ label="Map 'model_output' to column (Required)",
39
+ interactive=True,
40
+ visible=False
41
+ )
42
+ # Optional mappings
43
+ context_mapping = gr.Dropdown(
44
+ choices=[],
45
+ label="Map 'model_context' to column (Optional)",
46
+ interactive=True,
47
+ visible=False
48
+ )
49
+ expected_output_mapping = gr.Dropdown(
50
+ choices=[],
51
+ label="Map 'expected_model_output' to column (Optional)",
52
+ interactive=True,
53
+ visible=False
54
+ )
55
+ # We'll place the "Back to Data" and "Select Evaluators" within the same row:
56
+ with gr.Row(visible=False) as nav_row:
57
+ back_to_data_button = gr.Button("← Back to Data", visible=False)
58
+ save_prompt_button = gr.Button("Select Evaluators", visible=False)
59
+
60
+ def update_column_choices(df_state):
61
+ df = df_state.value
62
+ columns = df.columns.tolist() if df is not None else []
63
+ return {
64
+ input_mapping: gr.update(choices=columns, visible=True),
65
+ output_mapping: gr.update(choices=columns, visible=True),
66
+ context_mapping: gr.update(choices=['None'] + columns, visible=True),
67
+ expected_output_mapping: gr.update(choices=['None'] + columns, visible=True)
68
+ }
69
+
70
+ def update_prompt(selected_criteria, df_state):
71
+ if selected_criteria in EXAMPLE_METRICS:
72
+ evaluation_criteria = EXAMPLE_METRICS[selected_criteria]['prompt']
73
+ else:
74
+ evaluation_criteria = ""
75
+ updates = {prompt_editor: gr.update(value=evaluation_criteria, visible=True)}
76
+ updates.update(update_column_choices(df_state))
77
+ return updates
78
+
79
+ def show_criteria_selection():
80
+ default_criterion = list(EXAMPLE_METRICS.keys())[0]
81
+ evaluation_criteria = EXAMPLE_METRICS[default_criterion]['prompt']
82
+ updates = {
83
+ select_eval_criteria_button: gr.update(visible=False),
84
+ criteria_dropdown: gr.update(visible=True),
85
+ prompt_editor: gr.update(value=evaluation_criteria, visible=True),
86
+ data_upload_group: gr.update(visible=False),
87
+ mapping_row: gr.update(visible=True),
88
+ # Show the nav row and buttons
89
+ nav_row: gr.update(visible=True),
90
+ back_to_data_button: gr.update(visible=True),
91
+ save_prompt_button: gr.update(visible=True),
92
+ }
93
+ updates.update(update_column_choices(df_state))
94
+ return updates
95
+
96
+ def save_prompt(evaluation_criteria, input_col, output_col, context_col, expected_output_col):
97
+ # Use the actual Jinja template with proper Jinja syntax and raw JSON
98
+ template = '''You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
99
+
100
+ Here are some rules of the evaluation:
101
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
102
+
103
+ Your reply should strictly follow this format:
104
+ Your output format should strictly adhere to JSON as follows: {% raw %}{"feedback": "<write feedback>", "result": <numerical score>}{% endraw %}. Ensure the output is valid JSON, without additional formatting or explanations.
105
+
106
+ Here is the data.
107
+
108
+ {% if model_context is defined and model_context %}Context:
109
+ ```
110
+ {{ model_context }}
111
+ ```
112
+
113
+ {% endif %}Instruction:
114
+ ```
115
+ {{ model_input }}
116
+ ```
117
+
118
+ Response:
119
+ ```
120
+ {{ model_output }}
121
+ ```
122
+
123
+ Score Rubrics:
124
+ {{ evaluation_criteria }}
125
+
126
+ {% if expected_model_output is defined and expected_model_output %}Reference answer:
127
+ {{ expected_model_output }}{% endif %}'''
128
+
129
+ # Create mapping dictionary
130
+ mapping_dict = {
131
+ 'model_input': input_col,
132
+ 'model_output': output_col,
133
+ 'evaluation_criteria': evaluation_criteria
134
+ }
135
+
136
+ # Add optional mappings if selected
137
+ if context_col != 'None':
138
+ mapping_dict['model_context'] = context_col
139
+ if expected_output_col != 'None':
140
+ mapping_dict['expected_model_output'] = expected_output_col
141
+
142
+ prompt_state.value = {
143
+ 'template': template,
144
+ 'mappings': mapping_dict
145
+ }
146
+
147
+ # Update event handlers
148
+ select_eval_criteria_button.click(
149
+ fn=show_criteria_selection,
150
+ inputs=[],
151
+ outputs=[
+ select_eval_criteria_button,
+ criteria_dropdown,
+ prompt_editor,
+ data_upload_group,
+ mapping_row,
+ nav_row,
+ back_to_data_button,
+ save_prompt_button,
+ input_mapping, output_mapping, context_mapping, expected_output_mapping
+ ]
165
+ )
166
+
167
+ criteria_dropdown.change(
168
+ fn=update_prompt,
169
+ inputs=[criteria_dropdown, df_state],
170
+ outputs=[prompt_editor, input_mapping, output_mapping, context_mapping, expected_output_mapping]
171
+ )
172
+
173
+ def make_select_button_visible(df_value):
174
+ if df_value is not None:
175
+ return gr.update(visible=True)
176
+ else:
177
+ return gr.update(visible=False)
178
+
179
+ df_state.change(
180
+ fn=make_select_button_visible,
181
+ inputs=df_state,
182
+ outputs=select_eval_criteria_button
183
+ )
184
+
185
+ save_prompt_button.click(
186
+ fn=save_prompt,
187
+ inputs=[
188
+ prompt_editor, input_mapping, output_mapping,
189
+ context_mapping, expected_output_mapping
190
+ ],
191
+ outputs=[]
192
+ )
193
+
194
+ # BACK BUTTON: Hide the criteria UI, show the data upload UI
195
+ def back_to_data():
196
+ return {
197
+ # show data upload group again
198
+ data_upload_group: gr.update(visible=True),
199
+ # hide the criteria group
200
+ criteria_dropdown: gr.update(visible=False),
201
+ prompt_editor: gr.update(visible=False),
202
+ mapping_row: gr.update(visible=False),
203
+ nav_row: gr.update(visible=False),
204
+ # make "Select Evaluation Criteria" button visible again
205
+ select_eval_criteria_button: gr.update(visible=True),
206
+ }
207
+
208
+ back_to_data_button.click(
209
+ fn=back_to_data,
210
+ inputs=[],
211
+ outputs=[
212
+ data_upload_group,
213
+ criteria_dropdown,
214
+ prompt_editor,
215
+ mapping_row,
216
+ nav_row,
217
+ select_eval_criteria_button
218
+ ]
219
+ )
220
+
221
+ # Return both the criteria rule group, the df_state, prompt_state, save_prompt_button
222
+ return criteria_group, df_state, prompt_state, save_prompt_button
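`save_prompt` stores a Jinja template plus column mappings in `prompt_state`; `model_handler.py` later renders that template once per dataset row. A short sketch of that hand-off with hypothetical column names (`question`, `answer`):

```python
# sketch: render the saved template for one row; 'question' and 'answer' are
# hypothetical column names picked in the mapping dropdowns
from jinja2 import Template

row = {"question": "What is 2 + 2?", "answer": "4"}
saved = {
    "template": "Instruction:\n{{ model_input }}\n\nResponse:\n{{ model_output }}\n\nScore Rubrics:\n{{ evaluation_criteria }}",
    "mappings": {
        "model_input": "question",
        "model_output": "answer",
        "evaluation_criteria": "1: the answer is correct, 0: it is not",
    },
}

context = {"evaluation_criteria": saved["mappings"]["evaluation_criteria"]}
for key, column in saved["mappings"].items():
    if key != "evaluation_criteria":
        context[key] = str(row[column])

print(Template(saved["template"]).render(**context))
```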
data/models.jsonl ADDED
@@ -0,0 +1,17 @@
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
+ {"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
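A quick way to sanity-check this registry before the app reads it; the snippet assumes it is run from the repo root and mirrors the line-by-line parsing used in `model_handler.py`:

```python
# sketch: check that every registry line parses and carries the expected keys
import json

required = {"name", "organization", "license", "api_model"}
with open("data/models.jsonl") as f:
    for i, line in enumerate(f, start=1):
        entry = json.loads(line)
        missing = required - entry.keys()
        assert not missing, f"line {i} is missing {missing}"
print("all entries look well-formed")
```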
data_handler.py ADDED
@@ -0,0 +1,77 @@
1
+ # data_handler.py
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import json
6
+ def upload_test_data(df_state):
7
+ with gr.Group() as data_upload_group:
8
+ file_upload = gr.File(
9
+ label="Upload JSON with test data incl. true labels as integers or floats",
10
+ file_types=[".json"],
11
+ )
12
+ import_button = gr.Button("Import Data", visible=False)
13
+ # Show exactly 5 rows, no scrolling
14
+ df_display = gr.Dataframe(
15
+ visible=False,
16
+ elem_classes=["truncate_cells"],
17
+ label="Uploaded Data"
18
+ )
19
+ error_display = gr.Textbox(visible=False)
20
+
21
+ def display_file_info(file):
22
+ if file is not None:
23
+ return {
24
+ import_button: gr.update(visible=True),
25
+ error_display: gr.update(visible=False) # Hide previous errors
26
+ }
27
+ else:
28
+ return {
29
+ import_button: gr.update(visible=False),
30
+ df_display: gr.update(visible=False),
31
+ error_display: gr.update(visible=False) # Hide previous errors
32
+ }
33
+
34
+ def import_data(file):
35
+ if file is not None:
36
+ try:
37
+ df_state.value = pd.json_normalize(json.load(open(file.name)))
38
+
39
+ return {
40
+ df_display: gr.update(value=df_state.value, visible=True),
41
+ import_button: gr.update(visible=False),
42
+ df_state: df_state,
43
+ error_display: gr.update(visible=False) # Hide previous errors
44
+ }
45
+ except json.JSONDecodeError as e:
46
+ return {
47
+ df_display: gr.update(visible=False),
48
+ error_display: gr.update(value="**Error:** Invalid JSON file. Please upload a valid JSON file.", visible=True),
49
+ import_button: gr.update(visible=True),
50
+ df_state: None
51
+ }
52
+ except Exception as e:
53
+ return {
54
+ df_display: gr.update(visible=False),
55
+ error_display: gr.update(value=f"**Error:** {str(e)}", visible=True),
56
+ import_button: gr.update(visible=True),
57
+ df_state: None
58
+ }
59
+ else:
60
+ return {
61
+ df_display: gr.update(visible=False),
62
+ import_button: gr.update(visible=True),
63
+ df_state: None
64
+ }
65
+
66
+ file_upload.change(
67
+ fn=display_file_info,
68
+ inputs=file_upload,
69
+ outputs=[import_button, df_display, error_display]
70
+ )
71
+ import_button.click(
72
+ fn=import_data,
73
+ inputs=file_upload,
74
+ outputs=[df_display, import_button, df_state, error_display]
75
+ )
76
+
77
+ return data_upload_group, df_state
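`upload_test_data` accepts a JSON file that `json.load` plus `pd.json_normalize` can flatten into a table, with true labels as numbers per the upload hint. An example file in that shape; the column names are only an illustration of what could later be mapped to `model_input`, `model_output` and friends:

```python
# sketch: write a tiny test-data file in the shape the uploader accepts
import json

samples = [
    {
        "question": "Which planet is known as the Red Planet?",
        "response": "Mars",
        "reference": "Mars",
        "label": 1,
    },
    {
        "question": "Which planet is known as the Red Planet?",
        "response": "Venus",
        "reference": "Mars",
        "label": 0,
    },
]

with open("test_data.json", "w") as f:
    json.dump(samples, f, indent=2)

# pd.json_normalize(samples) then yields one row per record with those columns
```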
eval_criteria_library.py ADDED
@@ -0,0 +1,61 @@
1
+ EXAMPLE_METRICS = {
2
+ "Custom": {
3
+ "prompt":
4
+ """Evaluate a chat bot response to a user's input based on [INSERT CRITERIA OR SELECT EXAMPLE FROM LIST]
5
+
6
+ 0: The response's answer does not meet all of the evaluation criteria.
7
+ 1: The response's answer meets all of the evaluation criteria.""",
8
+ },
9
+ "Relevance": {
10
+ "prompt": """Evaluate how well the response fulfills the requirements of the instruction by providing relevant information. This includes responding in accordance with the explicit and implicit purpose of the given instruction.
11
+
12
+ 1: The response is completely unrelated to the instruction, or the model entirely misunderstands the instruction.
13
+ 2: Most of the key points in the response are irrelevant to the instruction, and the response misses major requirements of the instruction.
14
+ 3: Some major points in the response contain irrelevant information or miss some requirements of the instruction.
15
+ 4: The response is relevant to the instruction but misses minor requirements of the instruction.
16
+ 5: The response is perfectly relevant to the instruction, and the model fulfills all of the requirements of the instruction.""",
17
+ },
18
+ "Correctness": {
19
+ "prompt": """Evaluate whether the information provided in the response is correct given the reference response. Ignore differences in punctuation and phrasing between the student answer and true answer. It is okay if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements.
20
+
21
+ 0: The response is not factually accurate when compared against the reference response or includes conflicting statements.
22
+ 1: The response is supported by the reference response and does not contain conflicting statements.""",
23
+ },
24
+ "Helpfulness": {
25
+ "prompt": """Evaluate how helpful the response is to address the user query.
26
+
27
+ 1: The response is not at all useful, failing to address the instruction or provide any valuable information.
28
+ 2: The response has minimal usefulness, addressing the instruction only superficially or providing mostly irrelevant information.
29
+ 3: The response is moderately useful, addressing some aspects of the instruction effectively but lacking in others.
30
+ 4: The response is very useful, effectively addressing most aspects of the instruction and providing valuable information.
31
+ 5: The response is exceptionally useful, fully addressing the instruction and providing highly valuable information.""",
32
+ },
33
+ "Faithfulness": {
34
+ "prompt": """Evaluate how well the statements in the response are directly supported by the context given in the related passages.
35
+
36
+ 1: The response contains statements that directly contradict the context or are entirely unsupported by it.
37
+ 2: The response includes some information from the context, but contains significant ungrounded claims or misinterpretations.
38
+ 3: The response is mostly grounded in the context, with only minor unsupported claims or misinterpretations.
39
+ 4: The response closely aligns with the context, with only rare and minor deviations.
40
+ 5: The response is fully grounded in the context, with all statements accurately reflecting the provided information.""",
41
+ },
42
+ "Logical coherence": {
43
+ "prompt": """Evaluate how logically accurate and correct the response is for the instruction given.
44
+
45
+ 1: The logic of the model’s response is completely incoherent.
46
+ 2: The model’s response contains major logical inconsistencies or errors.
47
+ 3: The model’s response contains some logical inconsistencies or errors, but they are not significant.
48
+ 4: The model’s response is logically sound, but it is slightly flawed in some aspect.
49
+ 5: The model’s response is logically flawless.""",
50
+ },
51
+ "Conciseness": {
52
+ "prompt": """Evaluate how concisely the response is presented to the user, without any unnecessary information.
53
+
54
+ 1: The response is highly redundant or contains a lot of unnecessary information, requiring a complete rewrite for optimal clarity and efficiency.
55
+ 2: The response lacks conciseness and needs a substantial rewrite for better optimization.
56
+ 3: The response is somewhat concise but includes unnecessary information, requiring some edits for improved optimization.
58
+ 4: The response is mostly concise but could benefit from minor edits for better optimization.
59
+ 5: The response is optimally concise and does not contain any unnecessary information, requiring no further optimization.""",
60
+ },
61
+ }
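The dropdown in `criteria_handler.py` is built from `EXAMPLE_METRICS.keys()`, so adding a criterion is just another dict entry of the same shape. A sketch with an illustrative "Safety" rubric:

```python
# sketch: registering an extra criterion; the "Safety" rubric text is illustrative
from eval_criteria_library import EXAMPLE_METRICS

EXAMPLE_METRICS["Safety"] = {
    "prompt": """Evaluate whether the response avoids harmful or unsafe content.

0: The response contains harmful or unsafe content.
1: The response is safe and appropriate.""",
}

print(list(EXAMPLE_METRICS.keys()))  # the criteria dropdown now includes "Safety"
```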
get_llm_answer.py ADDED
@@ -0,0 +1,137 @@
1
+ # get_llm_answer.py
2
+
3
+ from openai import OpenAI
4
+ import anthropic
5
+ from together import Together
6
+ import json
7
+ import re
8
+ import atla
9
+
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+
13
+ # Initialize clients
14
+ anthropic_client = anthropic.Anthropic()
15
+ openai_client = OpenAI()
16
+ together_client = Together()
17
+ atla_client = atla.Atla()
18
+
19
+ SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
20
+
21
+ def get_openai_response(model_name, prompt):
22
+ """Get response from OpenAI API"""
23
+ try:
24
+ response = openai_client.chat.completions.create(
25
+ model=model_name,
26
+ messages=[
27
+ {"role": "system", "content": SYSTEM_PROMPT},
28
+ {"role": "user", "content": prompt},
29
+ ],
30
+ )
31
+ return response.choices[0].message.content
32
+ except Exception as e:
33
+ return f"Error with OpenAI model {model_name}: {str(e)}"
34
+
35
+
36
+ def get_anthropic_response(model_name, prompt):
37
+ """Get response from Anthropic API"""
38
+ try:
39
+ response = anthropic_client.messages.create(
40
+ model=model_name,
41
+ max_tokens=1000,
42
+ temperature=0,
43
+ system=SYSTEM_PROMPT,
44
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
45
+ )
46
+ return response.content[0].text
47
+ except Exception as e:
48
+ return f"Error with Anthropic model {model_name}: {str(e)}"
49
+
50
+
51
+ def get_together_response(model_name, prompt):
52
+ """Get response from Together API"""
53
+ try:
54
+ response = together_client.chat.completions.create(
55
+ model=model_name,
56
+ messages=[
57
+ {"role": "system", "content": SYSTEM_PROMPT},
58
+ {"role": "user", "content": prompt},
59
+ ],
60
+ stream=False,
61
+ )
62
+ return response.choices[0].message.content
63
+ except Exception as e:
64
+ return f"Error with Together model {model_name}: {str(e)}"
65
+
66
+
67
+ def get_atla_response(model_name, model_input, model_output, model_context, expected_output, evaluation_criteria):
68
+ """Get response from Atla API"""
69
+ try:
70
+ response = atla_client.evaluation.create(
71
+ model_id=model_name,
72
+ model_input=model_input,
73
+ model_output=model_output,
74
+ model_context=model_context,
75
+ expected_model_output=expected_output,
76
+ evaluation_criteria=evaluation_criteria,
77
+ )
78
+ # Return the score and critique directly from the evaluation result
79
+ return {
80
+ "score": response.result.evaluation.score,
81
+ "critique": response.result.evaluation.critique
82
+ }
83
+ except Exception as e:
84
+ return f"Error with Atla model {model_name}: {str(e)}"
85
+
86
+
87
+ def get_model_response(model_name, model_info, prompt=None, **kwargs):
88
+ """Get response from appropriate API based on model organization"""
89
+ if not model_info:
90
+ return "Model not found or unsupported."
91
+
92
+ api_model = model_info["api_model"]
93
+ organization = model_info["organization"]
94
+
95
+ try:
96
+ if organization == "Atla":
97
+ return get_atla_response(
98
+ api_model,
99
+ kwargs.get('model_input'),
100
+ kwargs.get('model_output'),
101
+ kwargs.get('model_context'),
102
+ kwargs.get('expected_output'),
103
+ kwargs.get('evaluation_criteria')
104
+ )
105
+ elif organization == "OpenAI":
106
+ return get_openai_response(api_model, prompt)
107
+ elif organization == "Anthropic":
108
+ return get_anthropic_response(api_model, prompt)
109
+ else:
110
+ # All other organizations use Together API
111
+ return get_together_response(api_model, prompt)
112
+ except Exception as e:
113
+ return f"Error with {organization} model {model_name}: {str(e)}"
114
+
115
+
116
+ def parse_model_response(response):
117
+ try:
118
+ # Debug print
119
+ print(f"Raw model response: {response}")
120
+
121
+ # First try to parse the entire response as JSON
122
+ try:
123
+ data = json.loads(response)
124
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
125
+ except json.JSONDecodeError:
126
+ # If that fails (typically for smaller models), try to find JSON within the response
127
+ json_match = re.search(r"{.*}", response, re.DOTALL)  # DOTALL so multi-line feedback still matches
128
+ if json_match:
129
+ data = json.loads(json_match.group(0))
130
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
131
+ else:
132
+ return "Error", f"Failed to parse response: {response}"
133
+
134
+ except Exception as e:
135
+ # Debug print for error case
136
+ print(f"Failed to parse response: {str(e)}")
137
+ return "Error", f"Failed to parse response: {response}"
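`parse_model_response` accepts either a bare JSON object or JSON buried in surrounding text. A quick usage sketch covering both paths:

```python
# sketch: the two response shapes parse_model_response handles
from get_llm_answer import parse_model_response

clean = '{"feedback": "Clear and correct.", "result": 1}'
wrapped = 'Sure! Here is my verdict: {"feedback": "Misses the rubric.", "result": 0}'

print(parse_model_response(clean))    # -> ('1', 'Clear and correct.')
print(parse_model_response(wrapped))  # -> ('0', 'Misses the rubric.')
```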
model_handler.py ADDED
@@ -0,0 +1,220 @@
1
+ # model_handler.py
2
+
3
+ import gradio as gr
4
+ import json
5
+ import os
6
+ import re
7
+ from get_llm_answer import get_model_response, parse_model_response, get_atla_response
8
+ from jinja2 import Template
9
+
10
+ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button):
11
+ with gr.Group(visible=True) as model_selection_group:
12
+ select_evaluators_button = gr.Button("Select Evaluators", visible=False)
13
+
14
+ # Load the model_data from JSONL
15
+ def load_model_data():
16
+ model_data = {}
17
+ try:
18
+ script_dir = os.path.dirname(__file__)
19
+ file_path = os.path.join(script_dir, "models.jsonl")
20
+ with open(file_path, "r") as f:
21
+ for line in f:
22
+ model = json.loads(line)
23
+ model_data[model["name"]] = {
24
+ "organization": model["organization"],
25
+ "license": model["license"],
26
+ "api_model": model["api_model"],
27
+ }
28
+ except FileNotFoundError:
29
+ print("Warning: models.jsonl not found")
30
+ return {}
31
+ return model_data
32
+
33
+
34
+ model_data = load_model_data()
35
+ model_choices = list(model_data.keys())
36
+
37
+ # Define dropdowns using model choices
38
+ with gr.Row(visible=False) as evaluator_row:
39
+ judge_a_dropdown = gr.Dropdown(
40
+ choices=["Selene"], label="Judge A", value="Selene", interactive=False
41
+ )
42
+ judge_b_dropdown = gr.Dropdown(
43
+ choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
44
+ )
45
+
46
+ # A Markdown for "Evaluation in progress..." and final heading
47
+ loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)
48
+
49
+ # NEW: define a Dataframe to show final evaluation results, like in data_handler
50
+ evaluation_result_df = gr.Dataframe(
51
+ visible=False,
52
+ label="Evaluation Results",
53
+ elem_classes=["truncate_cells"]
54
+ )
55
+
56
+ # Define the three-button row AFTER the markdown,
57
+ # so it appears *below* the "Evaluation Complete" message.
58
+ with gr.Row(visible=False) as evaluation_nav_row:
59
+ back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
60
+ run_evaluation_button = gr.Button("Run Evaluation", visible=False)
61
+ analyze_results_button = gr.Button("Analyze Results", visible=False)
62
+
63
+ # Show evaluator selection UI
64
+ def show_evaluator_selection(current_df):
65
+ # Hide Criteria UI and show Evaluator UI
66
+ updates = {
67
+ criteria_group: gr.update(visible=False),
68
+ save_prompt_button: gr.update(visible=False),
69
+ evaluator_row: gr.update(visible=True),
70
+ evaluation_nav_row: gr.update(visible=True),
71
+ run_evaluation_button: gr.update(visible=True),
72
+ back_to_criteria_button: gr.update(visible=True),
73
+ # By default, hide "Analyze Results" and the result dataframe
74
+ analyze_results_button: gr.update(visible=False),
75
+ evaluation_result_df: gr.update(visible=False),
76
+ }
77
+ if (
78
+ current_df.value is not None
79
+ and hasattr(current_df.value, "attrs")
80
+ and current_df.value.attrs.get("eval_done")
81
+ ):
82
+ # If a previous evaluation was completed, show the heading + dataframe
83
+ updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
84
+ updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
85
+ updates[analyze_results_button] = gr.update(visible=True)
86
+
87
+ return updates
88
+
89
+ # Note that we pass df_state to show_evaluator_selection
90
+ save_prompt_button.click(
91
+ fn=show_evaluator_selection,
92
+ inputs=[df_state],
93
+ outputs=[
94
+ save_prompt_button,
95
+ criteria_group,
96
+ evaluator_row,
97
+ evaluation_nav_row,
98
+ run_evaluation_button,
99
+ back_to_criteria_button,
100
+ loading_spinner,
101
+ analyze_results_button,
102
+ evaluation_result_df,
103
+ ],
104
+ )
105
+
106
+ # Back to Criteria
107
+ def back_to_criteria():
108
+ return {
109
+ save_prompt_button: gr.update(visible=True),
110
+ criteria_group: gr.update(visible=True),
111
+ evaluator_row: gr.update(visible=False),
112
+ evaluation_nav_row: gr.update(visible=False),
113
+ run_evaluation_button: gr.update(visible=False),
114
+ # Hide the "Evaluation Complete" markdown
115
+ loading_spinner: gr.update(visible=False),
116
+ analyze_results_button: gr.update(visible=False),
117
+ evaluation_result_df: gr.update(visible=False),
118
+ }
119
+
120
+ back_to_criteria_button.click(
121
+ fn=back_to_criteria,
122
+ inputs=[],
123
+ outputs=[
124
+ save_prompt_button,
125
+ criteria_group,
126
+ evaluator_row,
127
+ evaluation_nav_row,
128
+ run_evaluation_button,
129
+ loading_spinner,
130
+ analyze_results_button,
131
+ evaluation_result_df
132
+ ],
133
+ )
134
+
135
+ # Run evaluation
136
+ def run_evaluation(judge_a, judge_b):
137
+ # Show loading spinner
138
+ yield {loading_spinner: gr.update(visible=True)}
139
+
140
+ # Get template and mappings from prompt state
141
+ template_str = prompt_state.value['template']
142
+ mappings = prompt_state.value['mappings']
143
+ evaluation_criteria = mappings.get('evaluation_criteria')
144
+
145
+ # Create Jinja template for Judge B only
146
+ template = Template(template_str)
147
+
148
+ # Submit prompt to chosen models
149
+ for index, row in df_state.value.iterrows():
150
+ # Create a context dictionary for this row
151
+ context = {}
152
+ model_context = None
153
+ expected_output = None
154
+
155
+ for key, column in mappings.items():
156
+ if key == 'evaluation_criteria':
157
+ continue # Skip as we handle it separately
158
+ elif column and column != 'None':
159
+ context[key] = str(row[column])
160
+ if column == 'model_context':
161
+ model_context = str(row[column])
162
+ elif column == 'expected_model_output':
163
+ expected_output = str(row[column])
164
+
165
+ # For Judge B, render the template using Jinja
166
+ # Make sure the rubric text reaches the {{ evaluation_criteria }} placeholder for Judge B
+ context['evaluation_criteria'] = evaluation_criteria
+ current_prompt = template.render(**context)
167
+ # For Judge A (Atla Selene), call get_atla_response directly
168
+ response_a = get_atla_response(
169
+ "atla-selene",
170
+ model_input=context.get('model_input'),
171
+ model_output=context.get('model_output'),
172
+ model_context=model_context,
173
+ expected_output=expected_output,
174
+ evaluation_criteria=evaluation_criteria
175
+ )
176
+ response_b = get_model_response(
177
+ judge_b,
178
+ model_data.get(judge_b),
179
+ current_prompt
180
+ )
181
+
182
+ # Parse the responses - handle Atla response differently
183
+ if isinstance(response_a, dict): # Atla response
184
+ score_a, critique_a = response_a['score'], response_a['critique']
185
+ else: # Error case
186
+ score_a, critique_a = "Error", response_a
187
+
188
+ score_b, critique_b = parse_model_response(response_b)
189
+
190
+ df_state.value.loc[index, 'score_a'] = score_a
191
+ df_state.value.loc[index, 'critique_a'] = critique_a
192
+ df_state.value.loc[index, 'score_b'] = score_b
193
+ df_state.value.loc[index, 'critique_b'] = critique_b
194
+
195
+ import time
196
+ time.sleep(2)
197
+
198
+ # Hide loading spinner
199
+ yield {loading_spinner: gr.update(visible=False)}
200
+
201
+ # Show "Evaluation Complete" heading and the final DataFrame
202
+ yield {
203
+ loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
204
+ evaluation_result_df: gr.update(value=df_state.value, visible=True),
205
+ analyze_results_button: gr.update(visible=True),
206
+ }
207
+
208
+ # Store the "already run evaluation" flag safely in .attrs
209
+ if hasattr(df_state.value, "attrs"):
210
+ df_state.value.attrs["eval_done"] = True
211
+
212
+ run_evaluation_button.click(
213
+ fn=run_evaluation,
214
+ inputs=[judge_a_dropdown, judge_b_dropdown],
215
+ outputs=[loading_spinner, evaluation_result_df, analyze_results_button],
216
+ )
217
+
218
+
219
+
220
+ return model_selection_group, df_state, analyze_results_button
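After `run_evaluation` fills `score_a`/`critique_a` and `score_b`/`critique_b`, the Analyze Results step is handled by `score_handler`, which is not shown in this commit. As a hypothetical illustration only, a summary of judge agreement could be computed from `df_state.value` like this:

```python
# hypothetical post-evaluation summary; the real analysis lives in
# score_handler.handle_analysis, which is not shown in this commit
import pandas as pd

def agreement_summary(df: pd.DataFrame) -> pd.Series:
    scores_a = pd.to_numeric(df["score_a"], errors="coerce")
    scores_b = pd.to_numeric(df["score_b"], errors="coerce")
    both = scores_a.notna() & scores_b.notna()
    return pd.Series({
        "rows_evaluated": int(both.sum()),
        "judges_agree": int((scores_a[both] == scores_b[both]).sum()),
        "mean_score_a": scores_a[both].mean(),
        "mean_score_b": scores_b[both].mean(),
    })
```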
models.jsonl ADDED
@@ -0,0 +1,19 @@
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest"}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
+ {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest"}
18
+ {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
19
+ {"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
random_sample/__init__.py ADDED
@@ -0,0 +1 @@
+ # This file can be empty - it just marks the directory as a Python package
random_sample/arena_interface.py ADDED
@@ -0,0 +1,378 @@
1
+ import json
2
+ import re
3
+ import gradio as gr
4
+
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+ from .gen_api_answer import (
9
+ get_atla_response
10
+ )
11
+
12
+ from .prompts import (
13
+ DEFAULT_EVAL_CRITERIA,
14
+ DEFAULT_EVAL_PROMPT,
15
+ DEFAULT_EVAL_PROMPT_EDITABLE,
16
+ FIXED_EVAL_SUFFIX
17
+ )
18
+
19
+ from .random_sample_generation import (
20
+ get_random_human_ai_pair,
21
+ get_random_human_ai_ground_truth_pair,
22
+ generate_ai_response
23
+ )
24
+
25
+ from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
26
+
27
+ def parse_variables(prompt):
28
+ # Extract variables enclosed in double curly braces
29
+ variables = re.findall(r"{{(.*?)}}", prompt)
30
+ # Remove duplicates while preserving order
31
+ seen = set()
32
+ variables = [
33
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
34
+ ]
35
+ return variables
36
+
37
+
38
+ def get_final_prompt(eval_prompt, variable_values):
39
+ # Replace variables in the eval prompt with their values
40
+ for var, val in variable_values.items():
41
+ eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
42
+ return eval_prompt
43
+
44
+
45
+ def populate_random_example(request: gr.Request, compatible_mode: bool):
46
+ """Generate a random human-AI conversation example and reset judge outputs."""
47
+ if compatible_mode:
48
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
49
+ else:
50
+ human_msg, ai_msg = get_random_human_ai_pair()
51
+ ground_truth_msg = ""
52
+
53
+ return [
54
+ gr.update(value=human_msg),
55
+ gr.update(value=ai_msg),
56
+ gr.update(value="🎲", variant="secondary"),
57
+ gr.update(value=""), # Clear score
58
+ gr.update(value=""), # Clear critique
59
+ gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
60
+ ]
61
+
62
+
63
+ def create_arena_interface():
64
+ with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
65
+ # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
66
+ eval_prompt = gr.Textbox(
67
+ value=DEFAULT_EVAL_PROMPT,
68
+ visible=False
69
+ )
70
+ with gr.Row():
71
+ # Left side - Input section
72
+ with gr.Column(scale=1):
73
+ with gr.Group():
74
+ human_input = gr.TextArea(
75
+ label="👩 User Input",
76
+ lines=5,
77
+ placeholder="Enter the human message here..."
78
+ )
79
+ with gr.Row():
80
+ generate_btn = gr.Button(
81
+ "Generate AI Response",
82
+ size="sm",
83
+ interactive=False
84
+ )
85
+
86
+ ai_response = gr.TextArea(
87
+ label="🤖 AI Response",
88
+ lines=10,
89
+ placeholder="Enter the AI response here..."
90
+ )
91
+
92
+ # Ground truth response (initially hidden)
93
+ ground_truth = gr.TextArea(
94
+ label="🎯 Ground truth response",
95
+ lines=10,
96
+ placeholder="Enter the ground truth response here...",
97
+ visible=False
98
+ )
99
+
100
+ with gr.Row():
101
+ random_btn = gr.Button("🎲", scale=2)
102
+ send_btn = gr.Button(
103
+ value="Run evaluation",
104
+ variant="primary",
105
+ size="lg",
106
+ scale=8
107
+ )
108
+
109
+ # Right side - Model outputs
110
+ with gr.Column(scale=1):
111
+ gr.Markdown("## 👩‍⚖️ Selene-Mini Evaluation")
112
+ with gr.Group():
113
+ with gr.Row():
114
+ score = gr.Textbox(label="Score", lines=1, interactive=False)
115
+ critique = gr.TextArea(label="Critique", lines=12, interactive=False)
116
+
117
+ gr.Markdown("<br>")
118
+
119
+
120
+ # Replace the "Edit Judge Prompt" Accordion section with:
121
+ with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
122
+ gr.Markdown("<br>")
123
+ use_reference_toggle = gr.Checkbox(
124
+ label="Use a reference response",
125
+ value=False
126
+ )
127
+
128
+ # Hide the default prompt editor
129
+ with gr.Column(visible=False) as default_prompt_editor:
130
+ eval_prompt_editable = gr.TextArea(
131
+ value=DEFAULT_EVAL_PROMPT_EDITABLE,
132
+ label="Evaluation Criteria",
133
+ lines=12
134
+ )
135
+
136
+ with gr.Row(visible=False) as edit_buttons_row:
137
+ cancel_prompt_btn = gr.Button("Cancel")
138
+ save_prompt_btn = gr.Button("Save", variant="primary")
139
+
140
+ # Show the compatible mode editor
141
+ with gr.Column(visible=True) as compatible_prompt_editor:
142
+ eval_criteria_text = gr.TextArea(
143
+ label="Evaluation Criteria",
144
+ lines=12,
145
+ value=DEFAULT_EVAL_CRITERIA,
146
+ placeholder="Enter the complete evaluation criteria and scoring rubric..."
147
+ )
148
+ with gr.Row(visible=False) as compatible_edit_buttons_row:
149
+ compatible_cancel_btn = gr.Button("Cancel")
150
+ compatible_save_btn = gr.Button("Save", variant="primary")
151
+
152
+ eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
153
+ is_editing = gr.State(False) # Track editing state
154
+ compatible_mode_state = gr.State(False) # Track compatible mode state
155
+
156
+ # Update model names after responses are generated
157
+ def update_model_names(model_a, model_b):
158
+ return gr.update(value=f"*Model: {model_a}*"), gr.update(
159
+ value=f"*Model: {model_b}*"
160
+ )
161
+
162
+ # Store the last submitted prompt and variables for comparison
163
+ last_submission = gr.State({})
164
+
165
+ # Update the save/cancel buttons section in the compatible prompt editor
166
+ def save_criteria(new_criteria, previous_criteria):
167
+ return [
168
+ gr.update(value=new_criteria), # Update the criteria
169
+ new_criteria, # Update the previous criteria state
170
+ gr.update(visible=False) # Hide the buttons
171
+ ]
172
+
173
+ def cancel_criteria(previous_criteria):
174
+ return [
175
+ gr.update(value=previous_criteria), # Revert to previous criteria
176
+ previous_criteria, # Keep the previous criteria state
177
+ gr.update(visible=False) # Hide the buttons
178
+ ]
179
+
180
+ def show_criteria_edit_buttons(current_value, previous_value):
181
+ # Show buttons only if the current value differs from the previous value
182
+ return gr.update(visible=current_value != previous_value)
183
+
184
+ # Add handlers for save/cancel buttons and criteria changes
185
+ compatible_save_btn.click(
186
+ fn=save_criteria,
187
+ inputs=[eval_criteria_text, eval_prompt_previous],
188
+ outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
189
+ )
190
+
191
+ compatible_cancel_btn.click(
192
+ fn=cancel_criteria,
193
+ inputs=[eval_prompt_previous],
194
+ outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
195
+ )
196
+
197
+ eval_criteria_text.change(
198
+ fn=show_criteria_edit_buttons,
199
+ inputs=[eval_criteria_text, eval_prompt_previous],
200
+ outputs=compatible_edit_buttons_row
201
+ )
202
+
203
+ # Function to toggle visibility based on compatible mode
204
+ def toggle_use_reference(checked):
205
+ if checked:
206
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
207
+ return {
208
+ ground_truth: gr.update(visible=True, value=ground_truth_msg),
209
+ human_input: gr.update(value=human_msg),
210
+ ai_response: gr.update(value=ai_msg),
211
+ score: gr.update(value=""),
212
+ critique: gr.update(value=""),
213
+ random_btn: gr.update(value="🎲", variant="secondary"),
214
+ }
215
+ else:
216
+ return {
217
+ ground_truth: gr.update(visible=False)
218
+ }
219
+
220
+ # Update the change handler to include all necessary outputs
221
+ use_reference_toggle.change(
222
+ fn=toggle_use_reference,
223
+ inputs=[use_reference_toggle],
224
+ outputs=[
225
+ ground_truth,
226
+ human_input,
227
+ ai_response,
228
+ score,
229
+ critique,
230
+ random_btn,
231
+ ]
232
+ )
233
+
234
+ # Add a new state variable to track first game
235
+ first_game_state = gr.State(True) # Initialize as True
236
+
237
+ # Update the submit function to parse the evaluation criteria
238
+ def submit_and_store(
239
+ use_reference,
240
+ eval_criteria_text,
241
+ human_input,
242
+ ai_response,
243
+ ground_truth_input,
244
+ ):
245
+ # Build prompt data dictionary
246
+ prompt_data = {
247
+ 'human_input': human_input,
248
+ 'ai_response': ai_response,
249
+ 'ground_truth_input': ground_truth_input if use_reference else None,
250
+ 'eval_criteria': eval_criteria_text,
251
+ }
252
+
253
+ # Get response from Atla
254
+ response = get_atla_response(
255
+ model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
256
+ prompt=prompt_data,
257
+ max_tokens=500,
258
+ temperature=0.01
259
+ )
260
+
261
+ # Response now contains score and critique directly
262
+ if isinstance(response, dict) and 'score' in response and 'critique' in response:
263
+ score = str(response['score'])
264
+ critique = response['critique']
265
+ else:
266
+ # Handle error case
267
+ score = "Error"
268
+ critique = str(response)
269
+
270
+ return [
271
+ score,
272
+ critique,
273
+ gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
274
+ gr.update(value="🎲"),
275
+ ]
276
+
277
+ # Update the click handler to use False for is_first_game after first submission
278
+ def create_submit_handler():
279
+ first_game = True
280
+
281
+ def handler(*args):
282
+ nonlocal first_game
283
+ result = submit_and_store(*args)
284
+ first_game = False # Set to False after first submission
285
+ return result
286
+
287
+ return handler
288
+
289
+ # Update the send_btn click handler
290
+ send_btn.click(
291
+ fn=submit_and_store,
292
+ inputs=[
293
+ use_reference_toggle,
294
+ eval_criteria_text,
295
+ human_input,
296
+ ai_response,
297
+ ground_truth,
298
+ ],
299
+ outputs=[
300
+ score,
301
+ critique,
302
+ send_btn,
303
+ random_btn,
304
+ ],
305
+ )
306
+
307
+ # Add random button handler
308
+ random_btn.click(
309
+ fn=populate_random_example,
310
+ inputs=[use_reference_toggle],
311
+ outputs=[
312
+ human_input,
313
+ ai_response,
314
+ random_btn,
315
+ score,
316
+ critique,
317
+ ground_truth,
318
+ ]
319
+ )
320
+
321
+ # Add input change handlers
322
+ def handle_input_change():
323
+ """Reset UI state when inputs are changed"""
324
+ return [
325
+ gr.update(value="Run evaluation", variant="primary"), # send_btn
326
+ gr.update(value="🎲", variant="secondary"), # random_btn
327
+ ]
328
+
329
+ # Update the change handlers for inputs
330
+ human_input.change(
331
+ fn=handle_input_change,
332
+ inputs=[],
333
+ outputs=[send_btn, random_btn]
334
+ )
335
+
336
+ ai_response.change(
337
+ fn=handle_input_change,
338
+ inputs=[],
339
+ outputs=[send_btn, random_btn]
340
+ )
341
+
342
+ generate_btn.click(
343
+ fn=lambda msg: (
344
+ generate_ai_response(msg)[0], # Only take the response text
345
+ gr.update(
346
+ value="Generate AI Response", # Keep the label
347
+ interactive=False # Disable the button
348
+ )
349
+ ),
350
+ inputs=[human_input],
351
+ outputs=[ai_response, generate_btn]
352
+ )
353
+
354
+ human_input.change(
355
+ fn=lambda x: gr.update(interactive=bool(x.strip())),
356
+ inputs=[human_input],
357
+ outputs=[generate_btn]
358
+ )
359
+
360
+ # Populate a random example when the interface first loads
361
+ interface.load(
362
+ fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
363
+ inputs=[],
364
+ outputs=[
365
+ human_input,
366
+ ai_response,
367
+ random_btn,
368
+ score,
369
+ critique,
370
+ ground_truth,
371
+ ]
372
+ )
373
+
374
+ return interface
375
+
376
+ if __name__ == "__main__":
377
+ demo = create_arena_interface()
378
+ demo.launch()
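A note on the handler pattern used above: `toggle_use_reference` returns a dict keyed by components, so Gradio only updates the components present in that dict even though six outputs are listed on the `.change()` call. A minimal, self-contained sketch of the same idiom (component names here are illustrative and not part of this commit):

```python
import gradio as gr

with gr.Blocks() as sketch:
    show_ref = gr.Checkbox(label="Use reference answer")
    ground_truth = gr.Textbox(label="Ground truth", visible=False)
    score = gr.Textbox(label="Score")

    def toggle(checked):
        if checked:
            # Update both listed outputs
            return {
                ground_truth: gr.update(visible=True, value=""),
                score: gr.update(value=""),
            }
        # Update only one output; `score` is left as-is even though it is listed
        return {ground_truth: gr.update(visible=False)}

    show_ref.change(fn=toggle, inputs=[show_ref], outputs=[ground_truth, score])

if __name__ == "__main__":
    sketch.launch()
```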
random_sample/common.py ADDED
@@ -0,0 +1,126 @@
1
+ # Page Headers
2
+ MAIN_TITLE = "# Selene-Mini"
3
+
4
+ # How it works section
5
+ HOW_IT_WORKS = """
6
+ Try running evals with Selene-Mini in this playground! Our HF model card can be found [here](https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B).
7
+ """
8
+
9
+ BATTLE_RULES = """
10
+ ## 🤺 Choose the winner
11
+ 1. Define your scoring criteria in the **Evaluator Prompt**
12
+ 2. Add a test case to the **Sample to evaluate**
13
+ 3. Test the evaluators & vote for the model that best aligns with your judgement!
14
+ \n
15
+ Variables defined in your prompt with {{double curly braces}} map to input fields under **Sample to evaluate**.
16
+
17
+ <br>
18
+ """
19
+
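The `{{double curly braces}}` convention described in `BATTLE_RULES` is plain template substitution: each placeholder name corresponds to an input field under **Sample to evaluate**. A rough sketch of how such placeholders can be filled (an illustrative helper, not the repo's own substitution code):

```python
import re

def fill_template(template: str, values: dict) -> str:
    """Replace {{name}} placeholders with the matching sample fields, leaving unknown names untouched."""
    return re.sub(
        r"{{(.*?)}}",
        lambda m: str(values.get(m.group(1).strip(), m.group(0))),
        template,
    )

prompt = "[User Query]: {{human_input}}\n\n[AI Response]: {{ai_response}}"
print(fill_template(prompt, {
    "human_input": "How do muscles grow?",
    "ai_response": "Mainly through hypertrophy driven by resistance training.",
}))
```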
20
+ # CSS Styles
21
+ CSS_STYLES = """
22
+ .prompt-row {
23
+ align-items: flex-start !important;
24
+ }
25
+ .send-button-row {
26
+ display: flex;
27
+ justify-content: flex-end;
28
+ margin-top: 8px;
29
+ }
30
+ /* Style for metric buttons */
31
+ .metric-button-active {
32
+ background-color: #2B3A55 !important;
33
+ color: white !important;
34
+ }
35
+ /* Add this to ensure proper button spacing */
36
+ .metric-buttons-row {
37
+ gap: 8px;
38
+ }
39
+ """
40
+
41
+ # Default Eval Prompt
42
+ EVAL_DESCRIPTION = """
43
+ ## 📝 Tips
44
+ **Precise evaluation criteria lead to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
45
+ - Evaluation criteria
46
+ - Scoring rubric
47
+ - Examples (Optional)
48
+ """
49
+
50
+ # Voting Section Header
51
+ VOTING_HEADER = """
52
+ # Start Voting Now
53
+ """
54
+
55
+ # Acknowledgements
56
+ ACKNOWLEDGEMENTS = """
57
+ <br><br>
58
+ # Acknowledgements
59
+
60
+ We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
61
+
62
+ We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
63
+ """
64
+
65
+ # Policy Content
66
+ POLICY_CONTENT = """
67
+ # About Atla
68
+
69
+ Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
70
+ <br><br>
71
+ # [Our Mission](https://www.atla-ai.com/company)
72
+
73
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
74
+ Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
75
+ <br><br>
76
+ # Judge Arena Policy
77
+
78
+ ## Overview
79
+
80
+ Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)
81
+
82
+ ## Transparency
83
+
84
+ - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
85
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
86
+ - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
87
+
88
+ ## Model Inclusion Criteria
89
+
90
+ Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
91
+
92
+ - **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
93
+ - **Promptable:** The model must be promptable to evaluate in different scoring formats, for different criteria.
94
+ - **Accessibility**:
95
+ - **Public API Access**: Models accessible through public APIs without restrictive barriers.
96
+ - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
97
+
98
+ ## Leaderboard Management
99
+
100
+ - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
101
+ - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
102
+ - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
103
+
104
+ *This policy might be updated to reflect changes in our practices or in response to community feedback.*
105
+ <br><br>
106
+ # FAQ
107
+
108
+ **Isn't this the same as Chatbot Arena?**
109
+
110
+ We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals, matching the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
111
+
112
+ **Why should I trust this leaderboard?**
113
+
114
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.
115
+
116
+ **Who funds this effort?**
117
+
118
+ Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
119
+
120
+ **What is Atla working on?**
121
+
122
+ We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
123
+ <br><br>
124
+ # Get in touch
125
+ We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.gg/yNpUAMqs).
126
+ \nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
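The leaderboard section of `POLICY_CONTENT` above describes an Elo system with an initial rating of 1200 and a K-factor of 32. For readers unfamiliar with the arithmetic, here is a generic Elo update sketch; it is illustrative only and not the leaderboard's actual implementation:

```python
def expected_score(rating_a: float, rating_b: float) -> float:
    """Probability that judge A beats judge B under the Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

def elo_update(rating_a: float, rating_b: float, score_a: float, k: float = 32.0):
    """Return updated ratings; score_a is 1.0 for an A win, 0.5 for a tie, 0.0 for a loss."""
    e_a = expected_score(rating_a, rating_b)
    new_a = rating_a + k * (score_a - e_a)
    new_b = rating_b + k * ((1.0 - score_a) - (1.0 - e_a))
    return new_a, new_b

# Both judges start at 1200; judge A wins the first vote.
print(elo_update(1200, 1200, 1.0))  # (1216.0, 1184.0)
```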
random_sample/gen_api_answer.py ADDED
@@ -0,0 +1,77 @@
1
+ from openai import OpenAI
2
+ import anthropic
3
+ from together import Together
4
+ import os
5
+ from atla import Atla
6
+ from dotenv import load_dotenv
7
+ from .prompts import (
8
+ JUDGE_SYSTEM_PROMPT,
9
+ ATLA_PROMPT,
10
+ ATLA_PROMPT_WITH_REFERENCE
11
+ )
12
+
13
+ load_dotenv()
14
+
15
+ # Initialize clients
16
+ anthropic_client = anthropic.Anthropic()
17
+ openai_client = OpenAI()
18
+ together_client = Together()
19
+ hf_api_key = os.getenv("HF_API_KEY")
20
+
21
+ atla_client = Atla()
22
+
23
+ def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
24
+ """Get response from OpenAI API"""
25
+ try:
26
+ response = openai_client.chat.completions.create(
27
+ model=model_name,
28
+ messages=[
29
+ {"role": "system", "content": system_prompt},
30
+ {"role": "user", "content": prompt},
31
+ ],
32
+ max_completion_tokens=max_tokens,
33
+ temperature=temperature,
34
+ )
35
+ return response.choices[0].message.content
36
+ except Exception as e:
37
+ return f"Error with OpenAI model {model_name}: {str(e)}"
38
+
39
+ def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
40
+ """Get response from Anthropic API"""
41
+ try:
42
+ response = anthropic_client.messages.create(
43
+ model=model_name,
44
+ max_tokens=max_tokens,
45
+ temperature=temperature,
46
+ system=system_prompt,
47
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
48
+ )
49
+ return response.content[0].text
50
+ except Exception as e:
51
+ return f"Error with Anthropic model {model_name}: {str(e)}"
52
+
53
+
54
+ def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
55
+ """Get response from Atla API"""
56
+ try:
57
+ # Extract components from the prompt data
58
+ model_input = prompt.get('human_input', '')
59
+ model_output = prompt.get('ai_response', '')
60
+ expected_output = prompt.get('ground_truth_input')
61
+ evaluation_criteria = prompt.get('eval_criteria', '')
62
+
63
+ response = atla_client.evaluation.create(
64
+ model_id="atla-selene",
65
+ model_input=model_input,
66
+ model_output=model_output,
67
+ expected_model_output=expected_output if expected_output else None,
68
+ evaluation_criteria=evaluation_criteria,
69
+ )
70
+
71
+ # Return the score and critique directly
72
+ return {
73
+ "score": response.result.evaluation.score,
74
+ "critique": response.result.evaluation.critique
75
+ }
76
+ except Exception as e:
77
+ return f"Error with Atla model {model_name}: {str(e)}"
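`get_atla_response` expects `prompt` to be the dict assembled by `submit_and_store` in the arena interface. A hedged usage sketch, assuming the `random_sample` directory is importable as a package from the repo root and an Atla API key is configured in the environment:

```python
# Sketch: calling the Atla judge on a single sample.
from random_sample.gen_api_answer import get_atla_response

prompt_data = {
    "human_input": "How do muscles grow?",
    "ai_response": "Muscles grow through hypertrophy driven by resistance training.",
    "ground_truth_input": None,          # or a reference answer string
    "eval_criteria": "Score 1-5 for relevance and accuracy.",
}

result = get_atla_response(
    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
    prompt=prompt_data,
)

# On success the function returns {"score": ..., "critique": ...};
# on failure it returns an error string.
if isinstance(result, dict):
    print(result["score"], result["critique"])
else:
    print(result)
```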
random_sample/prompts.py ADDED
@@ -0,0 +1,94 @@
1
+ # Default values for compatible mode
2
+ DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?
3
+
4
+ Scoring Rubric:
5
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
6
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
7
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
8
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
9
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
10
+
11
+ # Default Eval Prompt
12
+ DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?
13
+
14
+ Scoring Rubric:
15
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
16
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
17
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
18
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
19
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.
20
+
21
+ [User Query]: {{input}}
22
+
23
+ [AI Response]: {{response}}"""
24
+
25
+ # Split the eval prompt into editable and fixed parts
26
+ DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?
27
+
28
+ Scoring Rubric:
29
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
30
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
31
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
32
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
33
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
34
+
35
+ # Fixed suffix that will always be appended
36
+ FIXED_EVAL_SUFFIX = """
37
+ [User Query]: {{human_input}}
38
+
39
+ [AI Response]: {{ai_response}}"""
40
+
41
+ ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score integer, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
42
+ Here are some rules of the evaluation:
43
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
44
+
45
+ Your reply should strictly follow this format:
46
+ **Reasoning:** <Your feedback>
47
+
48
+ **Result:** <Your score>
49
+
50
+ Here is the data:
51
+
52
+ Instruction:
53
+ ```
54
+ {human_input}
55
+ ```
56
+
57
+ Response:
58
+ ```
59
+ {ai_response}
60
+ ```
61
+
62
+ Score Rubrics:
63
+ {eval_criteria}"""
64
+
65
+ ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
66
+
67
+ Here are some rules of the evaluation:
68
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
69
+
70
+ Your reply should strictly follow this format:
71
+ **Reasoning:** <Your feedback>
72
+
73
+ **Result:** <Your score>
74
+
75
+ Here is the data:
76
+
77
+ Instruction:
78
+ ```
79
+ {human_input}
80
+ ```
81
+
82
+ Response:
83
+ ```
84
+ {ai_response}
85
+ ```
86
+
87
+ Score Rubrics:
88
+ {eval_criteria}
89
+
90
+ Reference answer:
91
+ {ground_truth_input}"""
92
+
93
+ # Judge system prompt for non-Prometheus models
94
+ JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
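`JUDGE_SYSTEM_PROMPT` instructs non-Atla judges to reply with strict JSON of the form `{"feedback": "...", "result": <score>}`. A small defensive parser for that shape (an illustrative helper, not part of this commit):

```python
import json
import re

def parse_judge_reply(reply: str):
    """Parse '{"feedback": "...", "result": <score>}' replies, tolerating extra surrounding text."""
    match = re.search(r"\{.*\}", reply, re.DOTALL)
    if not match:
        return None, None
    try:
        data = json.loads(match.group(0))
    except json.JSONDecodeError:
        return None, None
    return data.get("feedback"), data.get("result")

feedback, score = parse_judge_reply('{"feedback": "Clear and accurate.", "result": 5}')
print(score, feedback)  # 5 Clear and accurate.
```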
random_sample/random_sample_generation.py ADDED
@@ -0,0 +1,183 @@
1
+ from openai import OpenAI
2
+ import anthropic
3
+ import json
4
+ import re
5
+ import random
6
+ import os
7
+ from .gen_api_answer import get_openai_response, get_anthropic_response
8
+
9
+ # Initialize clients
10
+ anthropic_client = anthropic.Anthropic()
11
+ openai_client = OpenAI()
12
+
13
+ GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
14
+ BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
15
+ AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
16
+
17
+ GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
18
+ BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
19
+ AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
20
+
21
+ GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
22
+ GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
23
+ 1. A random human message (not a simple one-word answer question)
24
+ 2. An AI response (a few sentences long)
25
+ 3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)
26
+
27
+ Format as JSON with "human", "ai", and "ground_truth" fields."""
28
+
29
+ RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
30
+
31
+ def get_random_human_ai_pair():
32
+ # Select system prompt with specified probabilities
33
+ system_prompt = random.choices(
34
+ [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
35
+ weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
36
+ )[0]
37
+
38
+ # Log which type of response is being generated
39
+ prompt_type = {
40
+ GOOD_SYSTEM_PROMPT: "good",
41
+ BAD_SYSTEM_PROMPT: "bad",
42
+ AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
43
+ }[system_prompt]
44
+ print(f"Generating {prompt_type} response")
45
+
46
+ # Randomly choose between GPT-3.5 and Claude with 50/50 weights
47
+ model_choice = random.choices([
48
+ ("gpt-3.5-turbo", get_openai_response),
49
+ ("claude-3-5-haiku-latest", get_anthropic_response)
50
+ ], weights=[0.5, 0.5])[0]
51
+ model_name, api_func = model_choice
52
+
53
+ # Generate response using selected model
54
+ response = api_func(
55
+ model_name=model_name,
56
+ prompt=GENERATION_PROMPT,
57
+ system_prompt=system_prompt,
58
+ max_tokens=500,
59
+ temperature=1
60
+ )
61
+
62
+ # Define default messages
63
+ default_human = "How do muscles grow?"
64
+ default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
65
+
66
+ try:
67
+ # Clean the response by replacing newlines with spaces
68
+ cleaned_response = response.replace('\n', ' ').replace('\r', '')
69
+ data = json.loads(cleaned_response)
70
+
71
+ # Extract messages with fallbacks
72
+ human_message = data.get("human", default_human)
73
+ ai_message = data.get("ai", default_ai)
74
+
75
+ # Debug logging
76
+ print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
77
+
78
+ except Exception as e:
79
+ print(f"Failed to parse response: {str(e)}\n {response}")
80
+ human_message = default_human
81
+ ai_message = default_ai
82
+
83
+ return human_message, ai_message
84
+
85
+ def get_random_human_ai_ground_truth_pair():
86
+ # Select system prompt with specified probabilities
87
+ system_prompts = {
88
+ "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
89
+ "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
90
+ "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
91
+ }
92
+
93
+ prompt_type = random.choices(
94
+ ["good", "bad", "ambiguous"],
95
+ weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
96
+ )[0]
97
+
98
+ system_prompt = system_prompts[prompt_type]
99
+ print(f"Generating {prompt_type} response with ground truth")
100
+
101
+ # Randomly choose between GPT-3.5 and Claude with 50/50 weights
102
+ model_choice = random.choices([
103
+ ("gpt-3.5-turbo", get_openai_response),
104
+ ("claude-3-5-haiku-latest", get_anthropic_response)
105
+ ], weights=[0.5, 0.5])[0]
106
+ model_name, api_func = model_choice
107
+
108
+ # Define default messages
109
+ defaults = {
110
+ "human": "How do muscles grow?",
111
+ "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
112
+ "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:
113
+
114
+ 1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.
115
+
116
+ 2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.
117
+
118
+ 3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.
119
+
120
+ 4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.
121
+
122
+ 5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.
123
+
124
+ 6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).
125
+
126
+ This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
127
+ }
128
+
129
+ # Generate response using selected model
130
+ response = api_func(
131
+ model_name=model_name,
132
+ prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
133
+ system_prompt=system_prompt,
134
+ max_tokens=1000, # Increased token limit to accommodate ground truth
135
+ temperature=1
136
+ )
137
+
138
+ # Parse the response to get all three components
139
+ try:
140
+ # Clean the response by replacing newlines with spaces
141
+ cleaned_response = response.replace('\n', ' ').replace('\r', '')
142
+ data = json.loads(cleaned_response)
143
+
144
+ # Extract messages with fallbacks
145
+ human_message = data.get("human", defaults["human"])
146
+ ai_message = data.get("ai", defaults["ai"])
147
+ ground_truth = data.get("ground_truth", defaults["ground_truth"])
148
+
149
+ # Debug logging
150
+ print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")
151
+
152
+ except Exception as e:
153
+ print(f"Failed to parse response: {str(e)}\n {response}")
154
+ human_message = defaults["human"]
155
+ ai_message = defaults["ai"]
156
+ ground_truth = defaults["ground_truth"]
157
+
158
+ return human_message, ai_message, ground_truth
159
+
160
+ def generate_ai_response(human_msg):
161
+ """Generate AI response using GPT-3.5-turbo"""
162
+ if not human_msg.strip():
163
+ return "", False
164
+
165
+ try:
166
+ response = get_openai_response(
167
+ "gpt-3.5-turbo",
168
+ human_msg,
169
+ system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
170
+ max_tokens=1000,
171
+ temperature=1
172
+ )
173
+ # Extract just the response content since we don't need JSON format here
174
+ if isinstance(response, str):
175
+ # Clean up any JSON formatting if present
176
+ try:
177
+ data = json.loads(response)
178
+ response = data.get("content", response)
179
+ except json.JSONDecodeError:
180
+ pass
181
+ return response, False # Return response and button interactive state
182
+ except Exception as e:
183
+ return f"Error generating response: {str(e)}", False
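Both generators above rely on `random.choices` with explicit weights to control the good/bad/ambiguous mix (20/20/60) and the 50/50 model choice. A tiny standalone check of that sampling behaviour, for illustration only:

```python
import random
from collections import Counter

# Same weighting as get_random_human_ai_pair: 20% good, 20% bad, 60% ambiguous.
counts = Counter(
    random.choices(["good", "bad", "ambiguous"], weights=[0.2, 0.2, 0.6])[0]
    for _ in range(10_000)
)
print(counts)  # roughly 2000 good, 2000 bad, 6000 ambiguous
```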
random_sample_tab.py ADDED
@@ -0,0 +1,6 @@
1
+ import gradio as gr
2
+ from random_sample.arena_interface import create_arena_interface
3
+
4
+ def random_sample_tab():
5
+ with gr.TabItem("Random samples"):
6
+ return create_arena_interface()
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ # Core dependencies
2
+ pymongo
3
+ gradio
4
+ python-dotenv
5
+ openai
6
+ anthropic
7
+ together
8
+
9
+ # Development dependencies
10
+ black
11
+ pytest
score_handler.py ADDED
@@ -0,0 +1,242 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import tempfile
5
+ import os
6
+
7
+ def handle_analysis(df_state, model_selection_group, analyze_results_button):
8
+ with gr.Group(visible=False) as analysis_group:
9
+ gr.Markdown("## Analysis")
10
+
11
+ # Dropdown to select the accuracy measurement
12
+ accuracy_measurement_dropdown = gr.Dropdown(
13
+ choices=['Accuracy', 'Pearson Correlation'],
14
+ label='Select Evaluation Metric'
15
+ )
16
+
17
+ # Only a ground-truth (true label) column selector is needed; judge scores are compared against it directly
18
+ with gr.Row():
19
+ ground_truth_dropdown = gr.Dropdown(
20
+ choices=[],
21
+ label='Select True Label Column'
22
+ )
23
+
24
+ # Define two side-by-side boxes for results
25
+ with gr.Row():
26
+ judge_a_result = gr.Textbox(
27
+ label="Judge A Results",
28
+ lines=10,
29
+ interactive=False,
30
+ visible=False
31
+ )
32
+ judge_b_result = gr.Textbox(
33
+ label="Judge B Results",
34
+ lines=10,
35
+ interactive=False,
36
+ visible=False
37
+ )
38
+
39
+ # Move the JSON output below those textboxes and buttons
40
+ json_output = gr.File(label="Results .json", interactive=False, visible=False)
41
+
42
+ # Now place the row of buttons AFTER the json_output
43
+ with gr.Row():
44
+ back_to_results_button = gr.Button("← Back to Results")
45
+ calculate_button = gr.Button("Calculate")
46
+ download_button = gr.Button("Download Results as JSON")
47
+
48
+ # Show analysis group
49
+ def show_analysis_group():
50
+ df = df_state.value
51
+ if df is not None:
52
+ columns = df.columns.tolist()
53
+ else:
54
+ columns = []
55
+ # Now we only update ground_truth_dropdown
56
+ return (
57
+ gr.update(visible=True), # analysis_group
58
+ gr.update(visible=False), # model_selection_group
59
+ gr.update(choices=columns), # ground_truth_dropdown
60
+ )
61
+
62
+ analyze_results_button.click(
63
+ fn=show_analysis_group,
64
+ inputs=[],
65
+ outputs=[
66
+ analysis_group,
67
+ model_selection_group,
68
+ ground_truth_dropdown # only this one
69
+ ]
70
+ )
71
+
72
+ def back_to_results():
73
+ return (
74
+ gr.update(visible=False), # Hide analysis_group
75
+ gr.update(visible=True), # Show model_selection_group
76
+ )
77
+
78
+ back_to_results_button.click(
79
+ fn=back_to_results,
80
+ inputs=[],
81
+ outputs=[analysis_group, model_selection_group]
82
+ )
83
+
84
+ def calculate_multiple_accuracies(measurement, ground_truth_col, df_state):
85
+ # Hard-code 'score_a' and 'score_b' as the columns to compare
86
+ col2_name = "score_a"
87
+ col3_name = "score_b"
88
+ df = df_state.value
89
+ if df is None:
90
+ # Return two "No DataFrame" messages
91
+ return (
92
+ gr.update(value="No DataFrame available.", visible=True),
93
+ gr.update(value="No DataFrame available.", visible=True)
94
+ )
95
+
96
+ # Check if user-supplied ground_truth_col is valid
97
+ missing_columns = [col for col in [ground_truth_col, col2_name, col3_name] if col not in df.columns]
98
+ if missing_columns:
99
+ msg = f"Selected columns not found in DataFrame: {', '.join(missing_columns)}."
100
+ # Return same message in both boxes
101
+ return (
102
+ gr.update(value=msg, visible=True),
103
+ gr.update(value=msg, visible=True)
104
+ )
105
+
106
+ # Compare ground_truth_col with score_a
107
+ result1 = calculate_accuracy(
108
+ measurement, ground_truth_col, col2_name,
109
+ df_state, compare_to_ground_truth=True
110
+ )
111
+ text_a = f"Comparison: '{ground_truth_col}' vs. 'Judge A'\n{result1}"
112
+
113
+ # Compare ground_truth_col with score_b
114
+ result2 = calculate_accuracy(
115
+ measurement, ground_truth_col, col3_name,
116
+ df_state, compare_to_ground_truth=True
117
+ )
118
+ text_b = f"Comparison: '{ground_truth_col}' vs. 'Judge B'\n{result2}"
119
+
120
+ # Return them separately, each is for a different Textbox
121
+ return (
122
+ gr.update(value=text_a, visible=True),
123
+ gr.update(value=text_b, visible=True)
124
+ )
125
+
126
+ # Now the calculate_button only expects measurement, ground_truth_col, df_state
127
+ calculate_button.click(
128
+ fn=calculate_multiple_accuracies,
129
+ inputs=[
130
+ accuracy_measurement_dropdown,
131
+ ground_truth_dropdown,
132
+ df_state
133
+ ],
134
+ outputs=[judge_a_result, judge_b_result]
135
+ )
136
+
137
+ def create_json_download(df_state):
138
+ if df_state.value is None:
139
+ return gr.update(value=None, visible=True)
140
+
141
+ json_str = df_state.value.to_json(orient='records', indent=2)
142
+ temp_dir = tempfile.gettempdir()
143
+ file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
144
+ with open(file_path, 'w', encoding='utf-8') as f:
145
+ f.write(json_str)
146
+ return gr.update(value=file_path, visible=True)
147
+
148
+ download_button.click(
149
+ fn=create_json_download,
150
+ inputs=[df_state],
151
+ outputs=[json_output]
152
+ )
153
+
154
+ # Helper functions
155
+
156
+ def calculate_accuracy(measurement, col1, col2, df_state, compare_to_ground_truth=False):
157
+ df = df_state.value
158
+ # Validate the DataFrame and the selected columns before computing the chosen metric
159
+ if df is None:
160
+ return "No DataFrame available."
161
+ if col1 not in df.columns or col2 not in df.columns:
162
+ return "Selected columns not found in DataFrame."
163
+
164
+ results_df = pd.DataFrame()
165
+ if compare_to_ground_truth:
166
+ results_df['ground_truth'] = df[col1]
167
+ results_df['predicted'] = df[col2]
168
+ else:
169
+ results_df['extracted_winner'] = df[col1]
170
+ results_df['truth_result'] = df[col2]
171
+
172
+ if measurement == 'Accuracy':
173
+ result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
174
+ output_text = (
175
+ f"Overall Accuracy: {result['overall_accuracy']}\n"
176
+ f"Number of NaNs: {result['num_extracted_nan']}"
177
+ )
178
+ elif measurement == 'Pearson Correlation':
179
+ result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
180
+ output_text = (
181
+ f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
182
+ f"Number of NaNs: {result['num_extracted_nan']}"
183
+ )
184
+ else:
185
+ output_text = "Unknown measurement selected."
186
+
187
+ return output_text
188
+
189
+ def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
190
+ # Compute 'results' column based on whether comparing to ground truth
191
+ if compare_to_ground_truth:
192
+ # Convert both columns to float (non-numeric values become NaN)
193
+ results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
194
+ results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)
195
+
196
+ results_df['results'] = results_df['ground_truth'] == results_df['predicted']
197
+ num_extracted_nan = int(results_df['predicted'].isna().sum())
198
+ else:
199
+ results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
200
+ num_extracted_nan = int(results_df['extracted_winner'].isna().sum())
201
+
202
+ overall_accuracy = results_df['results'].mean()
203
+
204
+ return {
205
+ "overall_accuracy": overall_accuracy,
206
+ "num_extracted_nan": num_extracted_nan,
207
+ }
208
+
209
+ def process_single_rating_pearson_correlation(
210
+ results_df: pd.DataFrame, compare_to_ground_truth=False
211
+ ) -> dict:
212
+ if compare_to_ground_truth:
213
+ pred_col = 'predicted'
214
+ truth_col = 'ground_truth'
215
+ else:
216
+ pred_col = 'extracted_winner'
217
+ truth_col = 'truth_result'
218
+
219
+ results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
220
+ results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)
221
+
222
+ numerical_results = results_df.dropna(subset=[pred_col, truth_col])
223
+
224
+ if len(numerical_results) == 0:
225
+ pearson_corr = np.nan
226
+ else:
227
+ pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])
228
+
229
+ num_extracted_nan = int(results_df[pred_col].isna().sum())
230
+
231
+ return {
232
+ "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
233
+ "num_extracted_nan": num_extracted_nan,
234
+ }
235
+
236
+ def convert_to_float_or_nan(extracted_input):
237
+ if extracted_input is None or pd.isna(extracted_input):
238
+ return np.nan
239
+ try:
240
+ return float(extracted_input)
241
+ except ValueError:
242
+ return np.nan
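The metric helpers in `score_handler.py` operate on a DataFrame with `ground_truth`/`predicted` columns when comparing against a true-label column. A quick sketch of exercising them on toy data, assuming the module is importable from the repo root:

```python
import pandas as pd
from score_handler import (
    process_pairwise_accuracy,
    process_single_rating_pearson_correlation,
)

df = pd.DataFrame({
    "ground_truth": [5, 3, "4", None],
    "predicted": ["5", 2, 4, 1],
})

# Exact-match accuracy after coercing both columns to floats.
print(process_pairwise_accuracy(df.copy(), compare_to_ground_truth=True))
# e.g. {'overall_accuracy': 0.5, 'num_extracted_nan': 0}

# Pearson correlation over the rows where both values are numeric.
print(process_single_rating_pearson_correlation(df.copy(), compare_to_ground_truth=True))
```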
ui_components.py ADDED
@@ -0,0 +1,4 @@
1
+ # ui_components.py
2
+ import gradio as gr
3
+
4
+ save_prompt_button = gr.Button("Save Prompt", visible=False)
utils.py ADDED
@@ -0,0 +1,37 @@
1
+ from dataclasses import dataclass
2
+ from datetime import datetime
3
+ import logging
4
+
5
+ def parse_variables(prompt):
6
+ import re
7
+ # Extract variables enclosed in double curly braces
8
+ variables = re.findall(r"{{(.*?)}}", prompt)
9
+ # Remove duplicates while preserving order
10
+ seen = set()
11
+ variables = [
12
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
13
+ ]
14
+ return variables
15
+
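`parse_variables` is what turns the `{{placeholder}}` names in an evaluator prompt into the list of input fields, deduplicated in order of first appearance. For illustration (assumes the repo root is on `sys.path`):

```python
from utils import parse_variables

template = "[User Query]: {{human_input}}\n[AI Response]: {{ai_response}}\nReference: {{human_input}}"
print(parse_variables(template))  # ['human_input', 'ai_response']
```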
16
+ def get_logger(sink_name: str = "core_utils") -> logging.Logger:
17
+ logging.basicConfig(
18
+ format="%(asctime)s,%(msecs)03d %(levelname)-8s "
19
+ "[%(filename)s:%(lineno)d] %(message)s",
20
+ datefmt="%Y-%m-%d:%H:%M:%S",
21
+ level=logging.INFO,
22
+ force=True,
23
+ )
24
+ logger = logging.getLogger(sink_name)
25
+ return logger
26
+
27
+
28
+ @dataclass
29
+ class Vote:
30
+ timestamp: str
31
+ prompt: str
32
+ response_a: str
33
+ response_b: str
34
+ model_a: str
35
+ model_b: str
36
+ winner: str
37
+ judge_id: str