scottsuk0306 committed on
Commit
b1b6ed6
·
1 Parent(s): 9172f10
Files changed (12) hide show
  1. app.py +157 -0
  2. requirements.txt +5 -0
  3. src/.gitignore +1 -0
  4. src/__init__.py +0 -0
  5. src/assets.py +61 -0
  6. src/content.py +31 -0
  7. src/leaderboard.py +218 -0
  8. src/llm_perf.py +220 -0
  9. src/model_card.py +160 -0
  10. src/model_list.py +529 -0
  11. src/panel.py +60 -0
  12. src/utils.py +99 -0
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
3
+
4
+ from src.assets import custom_css
5
+ from src.content import ABOUT, BGB_LOGO, BGB_TITLE, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
6
+ from src.leaderboard import (
7
+ BGB_COLUMN_MAPPING,
8
+ BGB_COLUMN_TO_DATATYPE,
9
+ CAPABILITY_COLUMNS,
10
+ create_bgb_leaderboard_table,
11
+ create_leaderboard_table,
12
+ get_bgb_leaderboard_df,
13
+ )
14
+ from src.llm_perf import get_eval_df, get_llm_perf_df
15
+ from src.panel import create_select_callback
16
+
17
+ BGB = True
18
+
19
+ # prometheus-eval/prometheus-bgb-8x7b-v2.0
20
+
21
+ # def init_leaderboard():
22
+ # machine = "1xA10"
23
+ # open_llm_perf_df = get_llm_perf_df(machine=machine)
24
+ # search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(open_llm_perf_df)
25
+ # return machine, search_bar, columns_checkboxes, leaderboard_table
26
+
27
+
28
+ EVAL_MODELS = [
29
+ "gpt-4-turbo-2024-04-09",
30
+ "prometheus-bgb-8x7b-v2.0",
31
+ ]
32
+
33
+ EVAL_MODEL_TABS = {
34
+ "gpt-4-turbo-2024-04-09": "GPT-4 as a Judge πŸ…",
35
+ "prometheus-bgb-8x7b-v2.0": "Prometheus as a Judge πŸ…",
36
+ }
37
+
38
+
39
+ demo = gr.Blocks(css=custom_css)
40
+ with demo:
41
+ gr.HTML(BGB_LOGO, elem_classes="logo")
42
+ gr.HTML(BGB_TITLE, elem_classes="title")
43
+ # gr.HTML(BGB_LOGO_AND_TITLE, elem_classes="title")
44
+
45
+ with gr.Tabs(elem_classes="tabs"):
46
+
47
+ for idx, eval_model in enumerate(EVAL_MODELS):
48
+ tab_name = EVAL_MODEL_TABS[eval_model]
49
+
50
+ # Previous code without gradio_leaderboard
51
+
52
+ # machine = eval_model
53
+ # machine_textbox = gr.Textbox(value=eval_model, visible=False)
54
+
55
+ # if BGB:
56
+ # eval_df = get_eval_df(eval_model_name=eval_model)
57
+ # else:
58
+ # eval_df = get_llm_perf_df(machine=machine)
59
+ # # Leaderboard
60
+ # with gr.TabItem(tab_name, id=idx):
61
+ # if BGB:
62
+ # search_bar, columns_checkboxes, type_checkboxes, param_slider, leaderboard_table = create_bgb_leaderboard_table(eval_df)
63
+ # else:
64
+ # search_bar, columns_checkboxes, type_checkboxes, param_slider, leaderboard_table = (
65
+ # create_leaderboard_table(eval_df)
66
+ # )
67
+
68
+ # create_select_callback(
69
+ # # inputs
70
+ # machine_textbox,
71
+ # # interactive
72
+ # columns_checkboxes,
73
+ # search_bar,
74
+ # type_checkboxes,
75
+ # param_slider,
76
+ # # outputs
77
+ # leaderboard_table,
78
+ # )
79
+ with gr.TabItem(tab_name, id=idx):
80
+
81
+ eval_df = get_eval_df(eval_model_name=eval_model)
82
+ eval_df = get_bgb_leaderboard_df(eval_df)
83
+
84
+ ordered_columns = [
85
+ "Model πŸ€—",
86
+ "Average",
87
+ "Grounding ⚑️",
88
+ "Instruction Following πŸ“",
89
+ "Planning πŸ“…",
90
+ "Reasoning πŸ’‘",
91
+ "Refinement πŸ”©",
92
+ "Safety ⚠️",
93
+ "Theory of Mind πŸ€”",
94
+ "Tool Usage πŸ› οΈ",
95
+ "Multilingual πŸ‡¬πŸ‡«",
96
+ "Model Type",
97
+ "Model Params (B)",
98
+ ]
99
+
100
+ ordered_columns_types = [
101
+ "markdown",
102
+ "number",
103
+ "number",
104
+ "number",
105
+ "number",
106
+ "number",
107
+ "number",
108
+ "number",
109
+ "number",
110
+ "number",
111
+ "number",
112
+ "text",
113
+ "number",
114
+ ]
115
+
116
+ eval_df = eval_df[ordered_columns]
117
+
118
+ Leaderboard(
119
+ value=eval_df,
120
+ datatype=ordered_columns_types,
121
+ select_columns=SelectColumns(
122
+ default_selection=ordered_columns,
123
+ cant_deselect=["Model πŸ€—", "Model Type", "Model Params (B)"],
124
+ label="Select Columns to Display:",
125
+ ),
126
+ search_columns=["Model πŸ€—"],
127
+ # hide_columns=["model_name_for_query", "Model Size"],
128
+ filter_columns=[
129
+ ColumnFilter("Model Type", type="checkboxgroup", label="Model types"),
130
+ ColumnFilter(
131
+ "Model Params (B)",
132
+ min=0,
133
+ max=150,
134
+ default=[0, 150],
135
+ type="slider",
136
+ label="Model Params (B)",
137
+ ),
138
+ ],
139
+ )
140
+
141
+ ####################### ABOUT TAB #######################
142
+ with gr.TabItem("About πŸ“–", id=3):
143
+ gr.Markdown(ABOUT, elem_classes="descriptive-text")
144
+
145
+ ####################### CITATION
146
+ with gr.Row():
147
+ with gr.Accordion("πŸ“™ Citation", open=False):
148
+ citation_button = gr.Textbox(
149
+ value=CITATION_BUTTON,
150
+ label=CITATION_BUTTON_LABEL,
151
+ elem_id="citation-button",
152
+ show_copy_button=True,
153
+ )
154
+
155
+ if __name__ == "__main__":
156
+ # Launch demo
157
+ demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ huggingface_hub
2
+ transformers
3
+ gradio
4
+ plotly
5
+ pandas
src/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
src/__init__.py ADDED
File without changes
src/assets.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the leaderboard UI, exported as a plain CSS string.
# Fix: `max-width: 100%` was missing its terminating semicolon, which fused it
# with the following `object-fit` declaration into one invalid declaration.
custom_css = """
.logo {
    width: 300px;
    height: auto;
    margin: 0 auto;
    max-width: 100%;
    object-fit: contain;
}
.text {
    font-size: 16px !important;
}

.tabs button {
    font-size: 20px;
}
.subtabs button {
    font-size: 20px;
}

.descriptive-text span {
    font-size: 16px !important;
}

#control-panel span {
    font-size: 20px !important;
}
#search-bar span {
    font-size: 16px !important;
}
#threshold-slider span {
    font-size: 16px !important;
}
#memory-slider span {
    font-size: 16px !important;
}
#columns-checkboxes span {
    font-size: 16px !important;
}
#backend-checkboxes span {
    font-size: 16px !important;
}
#dtype-checkboxes span {
    font-size: 16px !important;
}
#optimization-checkboxes span {
    font-size: 16px !important;
}
#quantization-checkboxes span {
    font-size: 16px !important;
}
#kernel-checkboxes span {
    font-size: 16px !important;
}

#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    max-width: 300px;
    overflow: auto;
    white-space: nowrap;
}
"""
src/content.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LOGO = '<img src="https://raw.githubusercontent.com/prometheus-eval/leaderboard/main/logo.png">'
2
+
3
+ TITLE = """<h1 align="center" id="space-title">πŸ€— BiGGen-Bench Leaderboard πŸ‹οΈ</h1>"""
4
+
5
+ BGB_LOGO = '<img src="https://raw.githubusercontent.com/prometheus-eval/leaderboard/main/logo.png" alt="Logo" style="width: 30%; display: block; margin: auto;">'
6
+ BGB_TITLE = """<h1 align="center">BiGGen-Bench Leaderboard</h1>"""
7
+
8
+
9
+ ABOUT = """
10
+ ## πŸ“ About
11
+ ### BiGGen-Bench Leaderboard
12
+
13
+ Welcome to the 🌟 BiGGen-Bench Leaderboard πŸš€, a dedicated benchmarking platform designed to evaluate the nuanced capabilities of Generative Language Models (GLMs) across a variety of complex and diverse tasks. Leveraging the refined methodologies of [BiGGen-Bench](https://github.com/prometheus-eval/prometheus-eval), our leaderboard offers a comprehensive assessment framework that mirrors human-like discernment and precision in evaluating language models.
14
+
15
+ #### Evaluation Details
16
+
17
+ - **Evaluation Scope**: Covers nine key capabilities of GLMs across 77 tasks, with 765 unique instances tailored to test specific aspects of model performance.
18
+ - **Scoring System**: Utilizes a detailed scoring rubric from 1 to 5, reflecting a range of outcomes based on instance-specific criteria closely aligned with the nuanced requirements of each task.
19
+ - **Hardware and Setup**: Benchmarks are conducted using a controlled setup to ensure consistent and fair comparison across different models.
20
+ - **Transparency and Openness**: All codes, data, and detailed evaluation results are publicly available to foster transparency and enable community-driven enhancements and verifications.
21
+
22
+ #### Benchmarking Script
23
+
24
+ All benchmarks are executed using the provided [code](https://github.com/prometheus-eval/prometheus-eval/blob/main/BiGGen-Bench) within the BiGGen-Bench repository. This script ensures that all models are evaluated under identical conditions, guaranteeing reliability and reproducibility of results.
25
+
26
+ """
27
+
28
+
29
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
30
+ CITATION_BUTTON = r"""TBA
31
+ """
src/leaderboard.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.utils import model_hyperlink, process_score
4
+
5
+ LEADERBOARD_COLUMN_TO_DATATYPE = {
6
+ # open llm
7
+ "Model πŸ€—": "markdown",
8
+ "Experiment πŸ§ͺ": "str",
9
+ # primary measurements
10
+ "Prefill (s)": "number",
11
+ "Decode (tokens/s)": "number",
12
+ "Memory (MB)": "number",
13
+ "Energy (tokens/kWh)": "number",
14
+ # deployment settings
15
+ "Backend 🏭": "str",
16
+ "Precision πŸ“₯": "str",
17
+ "Quantization πŸ—œοΈ": "str",
18
+ "Attention πŸ‘οΈ": "str",
19
+ "Kernel βš›οΈ": "str",
20
+ # additional measurements
21
+ # "Reserved Memory (MB)": "number",
22
+ # "Used Memory (MB)": "number",
23
+ "Open LLM Score (%)": "number",
24
+ "End-to-End (s)": "number",
25
+ "Architecture πŸ›οΈ": "str",
26
+ "Params (B)": "number",
27
+ }
28
+
29
+
30
+ PRIMARY_COLUMNS = [
31
+ "Model πŸ€—",
32
+ "Experiment πŸ§ͺ",
33
+ "Prefill (s)",
34
+ "Decode (tokens/s)",
35
+ "Memory (MB)",
36
+ "Energy (tokens/kWh)",
37
+ "Open LLM Score (%)",
38
+ ]
39
+
40
+
41
+ CAPABILITY_COLUMNS = [
42
+ "Grounding ⚑️",
43
+ "Instruction Following πŸ“",
44
+ "Planning πŸ“…",
45
+ "Reasoning πŸ’‘",
46
+ "Refinement πŸ”©",
47
+ "Safety ⚠️",
48
+ "Theory of Mind πŸ€”",
49
+ "Tool Usage πŸ› οΈ",
50
+ "Multilingual πŸ‡¬πŸ‡«",
51
+ ]
52
+
53
+
54
+ BGB_COLUMN_MAPPING = {
55
+ "model_name_or_path": "Model πŸ€—",
56
+ "average": "Average",
57
+ "grounding": "Grounding ⚑️",
58
+ "instruction_following": "Instruction Following πŸ“",
59
+ "planning": "Planning πŸ“…",
60
+ "reasoning": "Reasoning πŸ’‘",
61
+ "refinement": "Refinement πŸ”©",
62
+ "safety": "Safety ⚠️",
63
+ "theory_of_mind": "Theory of Mind πŸ€”",
64
+ "tool_usage": "Tool Usage πŸ› οΈ",
65
+ "multilingual": "Multilingual πŸ‡¬πŸ‡«",
66
+ "model_params": "Model Params (B)",
67
+ "model_type": "Model Type",
68
+ }
69
+
70
+
71
+ BGB_COLUMN_TO_DATATYPE = {
72
+ "Model πŸ€—": "markdown",
73
+ "Average": "number",
74
+ "Grounding ⚑️": "number",
75
+ "Instruction Following πŸ“": "number",
76
+ "Planning πŸ“…": "number",
77
+ "Reasoning πŸ’‘": "number",
78
+ "Refinement πŸ”©": "number",
79
+ "Safety ⚠️": "number",
80
+ "Theory of Mind πŸ€”": "number",
81
+ "Tool Usage πŸ› οΈ": "number",
82
+ "Multilingual πŸ‡¬πŸ‡«": "number",
83
+ "Model Params (B)": "number",
84
+ "Model Type": "str",
85
+ }
86
+
87
+
88
def process_model(model_name):
    """Return ``model_hyperlink`` applied to the model's huggingface.co URL and its name."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
91
+
92
+
93
+ # TODO: Process base, chat, proprietary models differently
94
# Reference pages for API-only models, which have no huggingface.co repository.
# Kept at module level so the table is built once instead of once per row.
_API_MODEL_LINKS = {
    "gpt-3.5-turbo-1106": "https://platform.openai.com/docs/models/gpt-3-5",
    "gpt-3.5-turbo-0125": "https://platform.openai.com/docs/models/gpt-3-5",
    "gpt-4-0125-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
    "gpt-4-1106-preview": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday",
    "gpt-4-turbo-2024-04-09": "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
    "gpt-4o-2024-05-13": "https://openai.com/index/hello-gpt-4o/",
    "claude-3-haiku-20240307": "https://www.anthropic.com/news/claude-3-family",
    "claude-3-opus-20240229": "https://www.anthropic.com/news/claude-3-family",
    "claude-3-sonnet-20240229": "https://www.anthropic.com/news/claude-3-family",
    "mistral-large": "https://mistral.ai/news/mistral-large/",
    "mistral-medium": "https://mistral.ai/news/la-plateforme/",
    "gemini-1.0-pro": "https://deepmind.google/technologies/gemini/pro/",
    "gemini-pro-1.5": "https://deepmind.google/technologies/gemini/pro/",
    "google/gemini-flash-1.5": "https://deepmind.google/technologies/gemini/flash/",
}


def process_bgb_model(row):
    """Build the leaderboard "Model" cell for one BiGGen-Bench row.

    Args:
        row: A pandas Series read positionally: element 0 is the model name,
            element 1 is the model type.

    Returns:
        The result of ``model_hyperlink(link, model_name)``. For "Base" and
        "Chat" models the link is the model's huggingface.co page; for
        "Proprietary" models it comes from ``_API_MODEL_LINKS``. A proprietary
        model without a registered link is returned as its plain name instead
        of raising ``KeyError``.

    Raises:
        NotImplementedError: If the model type is none of "Base", "Chat" or
            "Proprietary".
    """
    model_name = row.iloc[0]
    model_type = row.iloc[1]

    if model_type in ("Base", "Chat"):
        return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)

    if model_type == "Proprietary":
        link = _API_MODEL_LINKS.get(model_name)
        if link is None:
            # A newly added API model must not take the whole table down.
            return model_name
        return model_hyperlink(link, model_name)

    raise NotImplementedError(f"Model type {model_type} not implemented")
125
+
126
+
127
def get_leaderboard_df(llm_perf_df):
    """Return a display-ready copy of the LLM-perf dataframe.

    The input is not modified. In the copy, each model name is passed through
    ``process_model`` and each score through ``process_score`` together with
    the row's quantization value.
    """
    df = llm_perf_df.copy()
    # The column holds plain model-name strings, so the single-value helper is
    # required here; process_bgb_model expects a (name, type) row and calls
    # .iloc on its argument, which fails on a str.
    df["Model πŸ€—"] = df["Model πŸ€—"].apply(process_model)
    # process quantization for leaderboard
    df["Open LLM Score (%)"] = df.apply(
        lambda x: process_score(x["Open LLM Score (%)"], x["Quantization πŸ—œοΈ"]),
        axis=1,
    )
    return df
134
+
135
+
136
def get_bgb_leaderboard_df(eval_df):
    """Return a copy of ``eval_df`` whose model column went through ``process_bgb_model``.

    The helper receives, row by row, the model name and model type columns
    (in that order).
    """
    leaderboard_df = eval_df.copy()
    name_and_type = leaderboard_df[["Model πŸ€—", "Model Type"]]
    leaderboard_df["Model πŸ€—"] = name_and_type.apply(process_bgb_model, axis=1)
    return leaderboard_df
141
+
142
+
143
def create_leaderboard_table(llm_perf_df):
    """Create the search box, column selector and table for the LLM-perf view.

    Returns:
        A ``(search_bar, columns_checkboxes, leaderboard_table)`` tuple of
        Gradio components.
    """
    table_df = get_leaderboard_df(llm_perf_df)
    all_columns = list(LEADERBOARD_COLUMN_TO_DATATYPE.keys())
    all_datatypes = list(LEADERBOARD_COLUMN_TO_DATATYPE.values())

    # free-text model search
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model πŸ€—",
            info="πŸ” Search for a model name",
            elem_id="search-bar",
        )

    # column selector; only the primary columns start out ticked
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Columns πŸ“Š",
            value=PRIMARY_COLUMNS,
            choices=all_columns,
            info="β˜‘οΈ Select the columns to display",
            elem_id="columns-checkboxes",
        )

    # results table
    leaderboard_table = gr.components.Dataframe(
        value=table_df[PRIMARY_COLUMNS],
        datatype=all_datatypes,
        headers=all_columns,
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, leaderboard_table
172
+
173
+
174
def create_bgb_leaderboard_table(eval_df):
    """Create the search box, filters and table for the BiGGen-Bench view.

    Args:
        eval_df: Evaluation dataframe whose columns already carry the display
            names listed in ``BGB_COLUMN_MAPPING.values()``.

    Returns:
        A ``(search_bar, columns_checkboxes, type_checkboxes, param_slider,
        bgb_leaderboard_table)`` tuple of Gradio components.
    """
    bgb_leaderboard_df = get_bgb_leaderboard_df(eval_df)
    display_columns = list(BGB_COLUMN_MAPPING.values())
    model_types = ["Base", "Chat", "Proprietary"]

    # create search bar
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model πŸ€—",
            info="πŸ” Search for a model name",
            elem_id="search-bar",
        )

    # model-type filter (the hint used to be copy-pasted from the capabilities selector)
    with gr.Row():
        type_checkboxes = gr.CheckboxGroup(
            label="Model Type",
            value=list(model_types),
            choices=list(model_types),
            info="β˜‘οΈ Select the model types to display",
            elem_id="type-checkboxes",
        )

    # parameter-count filter
    with gr.Row():
        param_slider = gr.Slider(
            minimum=0, maximum=150, value=7, step=1, interactive=True, label="Model Params (B)", elem_id="param-slider"
        )

    # create checkboxes
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Capabilities πŸ“Š",
            value=CAPABILITY_COLUMNS,
            choices=CAPABILITY_COLUMNS,
            info="β˜‘οΈ Select the capabilities to display",
            elem_id="columns-checkboxes",
        )

    # create table; headers are the display names held by the dataframe,
    # not the raw CSV keys of BGB_COLUMN_MAPPING
    bgb_leaderboard_table = gr.components.Dataframe(
        value=bgb_leaderboard_df[display_columns],
        datatype=list(BGB_COLUMN_TO_DATATYPE.values()),
        headers=display_columns,
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, type_checkboxes, param_slider, bgb_leaderboard_table
src/llm_perf.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+
6
+ from src.model_list import MODEL_MAPPING, MODEL_SHORT_TO_LONG, get_all_model_list
7
+ from src.utils import process_kernels, process_quantizations
8
+
9
+ COLUMNS_MAPPING = {
10
+ "config.name": "Experiment πŸ§ͺ",
11
+ "config.backend.model": "Model πŸ€—",
12
+ # primary measurements
13
+ "report.prefill.latency.p50": "Prefill (s)",
14
+ "report.per_token.latency.p50": "Per Token (s)",
15
+ "report.decode.throughput.value": "Decode (tokens/s)",
16
+ "report.decode.efficiency.value": "Energy (tokens/kWh)",
17
+ "report.decode.memory.max_allocated": "Memory (MB)",
18
+ # deployment settings
19
+ "config.backend.name": "Backend 🏭",
20
+ "config.backend.torch_dtype": "Precision πŸ“₯",
21
+ "quantization": "Quantization πŸ—œοΈ",
22
+ "attention": "Attention πŸ‘οΈ",
23
+ "kernel": "Kernel βš›οΈ",
24
+ # additional information
25
+ "architecture": "Architecture πŸ›οΈ",
26
+ "prefill+decode": "End-to-End (s)",
27
+ "Average ⬆️": "Open LLM Score (%)",
28
+ "#Params (B)": "Params (B)",
29
+ }
30
+ SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
31
+ SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
32
+ SORTING_ASCENDING = [False, True, False]
33
+
34
+ BGB_SORTING_COLUMNS = ["Average"]
35
+
36
+ # Use the above capabilities to create the columns
37
+ BGB_COLUMNS_MAPPING = {
38
+ "model_name_or_path": "Model πŸ€—",
39
+ "model_params": "Model Params (B)",
40
+ "model_type": "Model Type",
41
+ "average": "Average",
42
+ "grounding": "Grounding ⚑️",
43
+ "instruction_following": "Instruction Following πŸ“",
44
+ "planning": "Planning πŸ“…",
45
+ "reasoning": "Reasoning πŸ’‘",
46
+ "refinement": "Refinement πŸ”©",
47
+ "safety": "Safety ⚠️",
48
+ "theory_of_mind": "Theory of Mind πŸ€”",
49
+ "tool_usage": "Tool Usage πŸ› οΈ",
50
+ "multilingual": "Multilingual πŸ‡¬πŸ‡«",
51
+ }
52
+
53
+
54
def get_raw_llm_perf_df(machine: str = "1xA10"):
    """Download and join the raw LLM-perf benchmark data for one machine.

    One CSV per entry of ``SUBSETS`` is read from the
    ``optimum-benchmark/llm-perf-leaderboard`` dataset. Subsets that cannot be
    loaded are skipped (best effort). The loaded subsets are concatenated and
    merged with ``llm-df.csv`` on the model name.

    Args:
        machine: Machine identifier used in the per-subset file names.

    Returns:
        The merged dataframe.

    Raises:
        ValueError: If no subset could be loaded for ``machine``.
    """
    dfs = []
    for subset in SUBSETS:
        try:
            dfs.append(
                pd.read_csv(f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{subset}-{machine}.csv")
            )
        except Exception as error:
            # Deliberately broad: one unavailable subset must not abort the
            # others. Report the reason, since it is not always "not found".
            print(f"Subset {subset} for machine {machine} not found ({error})")

    if not dfs:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(f"No benchmark subset could be loaded for machine {machine}")

    perf_df = pd.concat(dfs)
    llm_df = pd.read_csv("hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-df.csv")

    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="config.backend.model")

    return llm_perf_df
70
+
71
+
72
def processed_llm_perf_df(llm_perf_df):
    """Turn the raw merged benchmark dataframe into the leaderboard dataframe.

    Steps: sanity-check that all runs share one scenario, drop rows without a
    decode latency, derive the helper columns, round numeric columns to three
    decimals, keep and rename the columns of ``COLUMNS_MAPPING``, and sort by
    ``SORTING_COLUMNS``.

    The caller's dataframe is left untouched; a new dataframe is returned.
    (Previously the input was mutated in place and ``rename(inplace=True)``
    was applied to a column slice.)
    """
    # every benchmark must have been run with the same scenario settings
    scenario_columns = (
        "config.scenario.input_shapes.batch_size",
        "config.scenario.input_shapes.sequence_length",
        "config.scenario.generate_kwargs.max_new_tokens",
        "config.scenario.generate_kwargs.min_new_tokens",
    )
    for column in scenario_columns:
        assert llm_perf_df[column].nunique() == 1

    # dropna returns a new frame, so all later writes stay off the input
    df = llm_perf_df.dropna(subset=["report.decode.latency.p50"])
    df["config.name"] = df["config.name"].str.replace("flash_attention_2", "fa2")
    df["prefill+decode"] = df["report.prefill.latency.p50"] + df["report.decode.latency.p50"]
    df["architecture"] = df["Architecture"]
    df["attention"] = (
        df["config.backend.attn_implementation"]
        .str.replace("flash_attention_2", "FAv2")
        .str.replace("eager", "Eager")
        .str.replace("sdpa", "SDPA")
    )
    df["quantization"] = df.apply(process_quantizations, axis=1)
    df["kernel"] = df.apply(process_kernels, axis=1)

    # round numerical columns
    rounded_columns = [
        "report.prefill.latency.p50",
        "report.decode.latency.p50",
        "report.decode.throughput.value",
        "report.decode.efficiency.value",
        "report.decode.memory.max_allocated",
        "Average ⬆️",
        "prefill+decode",
        "#Params (B)",
    ]
    df = df.round({column: 3 for column in rounded_columns})

    # filter and rename columns, then sort by metric
    df = df[list(COLUMNS_MAPPING.keys())].rename(columns=COLUMNS_MAPPING)
    df = df.sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)

    return df
121
+
122
+
123
def get_llm_perf_df(machine: str = "1xA10"):
    """Return the processed LLM-perf dataframe for ``machine``, using a CSV cache.

    If ``llm-perf-leaderboard-{machine}.csv`` exists in the working directory
    it is read and returned. Otherwise the data is fetched, processed, written
    to that file and returned.
    """
    cache_file = f"llm-perf-leaderboard-{machine}.csv"

    if not os.path.exists(cache_file):
        fresh_df = processed_llm_perf_df(get_raw_llm_perf_df(machine))
        fresh_df.to_csv(cache_file, index=False)
        return fresh_df

    return pd.read_csv(cache_file)
132
+
133
+
134
def get_eval_df(eval_model_name: str):
    """Build the BiGGen-Bench leaderboard dataframe for one judge model.

    Reads ``data/eval_by_{eval_model_name}.csv`` (relative to the repository
    root), normalises model names, attaches parameter count and model type
    from ``MODEL_MAPPING``, averages the nine capability scores, rounds to
    three decimals, renames columns via ``BGB_COLUMNS_MAPPING`` and sorts by
    ``BGB_SORTING_COLUMNS`` in descending order. The result is also written
    next to the input as a ``.pkl`` and a ``.csv`` file.

    The dataframe is recomputed on every call; the former pickle-cache branch
    was guarded by ``and False`` and could never run, so it has been removed.

    Args:
        eval_model_name: Name of the judge model whose scores are loaded.

    Returns:
        The processed dataframe.

    Raises:
        ValueError: If ``eval_model_name`` is not a supported judge model.
    """
    supported_judges = ("gpt-4-turbo-2024-04-09", "prometheus-bgb-8x7b-v2.0")
    if eval_model_name not in supported_judges:
        # Explicit raise instead of assert, which is stripped under ``python -O``.
        raise ValueError(f"Unsupported eval model {eval_model_name!r}; expected one of {supported_judges}")

    base_dir = Path(__file__).parent.parent / "data"
    filepath = base_dir / f"bgb-leaderboard-{eval_model_name}.pkl"
    # For debugging
    csv_filepath = base_dir / f"bgb-leaderboard-{eval_model_name}.csv"
    raw_filepath = base_dir / f"eval_by_{eval_model_name}.csv"

    def change_model_name(model_name: str):
        # TODO: Hard code models with different names
        model_name_or_path = MODEL_SHORT_TO_LONG.get(model_name, model_name)
        if model_name == "qwen/qwen-110b-chat":
            model_name_or_path = "Qwen/Qwen1.5-110B-Chat"

        if model_name_or_path.endswith("-hjpark"):
            model_name_or_path = model_name_or_path.replace("-hjpark", "")

        return model_name_or_path

    eval_df = pd.read_csv(raw_filepath)

    eval_df["model_name_or_path"] = eval_df["model_name"].apply(change_model_name)
    eval_df = eval_df.drop(columns=["model_name"])

    # MODEL_MAPPING values are indexed as [params, type]; unmapped models get "Unknown"
    unknown_entry = ["Unknown", "Unknown"]
    eval_df["model_params"] = eval_df["model_name_or_path"].apply(lambda x: MODEL_MAPPING.get(x, unknown_entry)[0])
    eval_df["model_type"] = eval_df["model_name_or_path"].apply(lambda x: MODEL_MAPPING.get(x, unknown_entry)[1])

    capabilities = [
        "grounding",
        "instruction_following",
        "planning",
        "reasoning",
        "refinement",
        "safety",
        "theory_of_mind",
        "tool_usage",
        "multilingual",
    ]

    # Make the average of the capabilities
    eval_df["average"] = eval_df[capabilities].mean(axis=1)

    # Round to 3 decimal places for capabilities and average
    eval_df = eval_df.round({column: 3 for column in ["average", *capabilities]})

    eval_df = eval_df.rename(columns=BGB_COLUMNS_MAPPING)
    eval_df = eval_df.sort_values(by=BGB_SORTING_COLUMNS, ascending=False)

    eval_df.to_pickle(str(filepath))
    eval_df.to_csv(str(csv_filepath), index=False)

    return eval_df
216
+
217
+
218
+ if __name__ == "__main__":
219
+ get_eval_df("gpt-4-turbo-2024-04-09")
220
+ get_eval_df("prometheus-bgb-8x7b-v2.0")
src/model_card.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
10
+ from transformers import AutoConfig, AutoTokenizer
11
+
12
+
13
+ # ht to @Wauplin, thank you for the snippet!
14
+ # See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
15
def check_model_card(repo_id: str) -> "tuple[bool, str, ModelCard | None]":
    """Check that a repository has a usable model card.

    Args:
        repo_id: Repository identifier passed to ``ModelCard.load``.

    Returns:
        ``(ok, error_message, card)``. On success ``ok`` is True, the message
        is empty and ``card`` is the loaded card. On failure ``ok`` is False,
        the message explains the problem and ``card`` is None. (The annotation
        previously declared a 2-tuple although every path returns three
        values.)
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it.", None

    # Enforce license metadata: either `license` or a name/link pair must be present
    if card.data.license is None and not ("license_name" in card.data and "license_link" in card.data):
        return (
            False,
            (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            ),
            None,
        )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short.", None

    return True, "", card
39
+
40
+
41
def is_model_on_hub(
    model_name: str, revision: str, token: "str | None" = None, trust_remote_code=False, test_tokenizer=False
) -> "tuple[bool, str | None, AutoConfig | None]":
    """Check that a model's config (and optionally its tokenizer) can be loaded.

    Args:
        model_name: Model identifier passed to ``AutoConfig.from_pretrained``.
        revision: Revision to load.
        token: Optional access token forwarded to the loaders.
        trust_remote_code: Forwarded to the loaders.
        test_tokenizer: When true, also try to load the tokenizer.

    Returns:
        ``(ok, message, config)``. On success ``(True, None, config)``. For a
        gated repository ``(True, "uses a gated model.", None)``. Otherwise
        ``(False, reason, None)``; the reason strings are sentence fragments.
    """
    try:
        config = AutoConfig.from_pretrained(
            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
        )  # , force_download=True)
        if test_tokenizer:
            try:
                # Only loadability matters; the tokenizer object itself is not needed.
                AutoTokenizer.from_pretrained(
                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
                )
            except ValueError as e:
                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
            except Exception:
                return (
                    False,
                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
                    None,
                )
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )

    except Exception as e:
        if "You are trying to access a gated repo." in str(e):
            return True, "uses a gated model.", None
        # e.args may be empty; indexing it blindly would raise IndexError here.
        detail = e.args[0] if e.args else e
        return False, f"was not found or misconfigured on the hub! Error raised was {detail}", None
74
+
75
+
76
def get_model_size(model_info: ModelInfo, precision: str):
    """Estimate a model's size in billions of parameters.

    The safetensors parameter count is used when it can be fetched. Otherwise
    the first ``<number>b`` / ``<number>m`` token in the lower-cased model id
    is parsed (``m`` values are divided by 1000). The result is multiplied by
    8 when ``precision`` is "GPTQ" or the id contains "gptq".

    Returns:
        The size rounded to three decimals (before the GPTQ factor), or 0 when
        neither source yields a size.
    """
    try:
        safetensors = get_safetensors_metadata(model_info.id)
    except Exception as e:
        print(e)
        safetensors = None

    if safetensors is None:
        # fall back to parsing the size out of the repository id
        try:
            size_token = re.search(r"(\d+\.)?\d+(b|m)", model_info.id.lower()).group(0)
        except AttributeError:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
        number, unit = float(size_token[:-1]), size_token[-1]
        model_size = round(number if unit == "b" else number / 1e3, 3)
    else:
        model_size = round(sum(safetensors.parameter_count.values()) / 1e9, 3)

    is_gptq = precision == "GPTQ" or "gptq" in model_info.id.lower()
    return (8 if is_gptq else 1) * model_size
97
+
98
+
99
def get_model_arch(model_info: ModelInfo):
    """Return the ``architectures`` entry of the model's config, or "Unknown".

    "Unknown" is returned when the entry is absent and also when
    ``model_info.config`` itself is None or empty, which previously raised
    ``AttributeError``.
    """
    config = model_info.config or {}
    return config.get("architectures", "Unknown")
101
+
102
+
103
def get_model_tags(model_card, model: str):
    """Derive the "merge" / "moe" tags for a model.

    Evidence is taken from the card's metadata tags, keywords in the
    lower-cased card text, and (for "moe") a dash-separated token of the model
    name. Returns an empty list when ``model_card`` is None; otherwise a list
    holding "merge" and/or "moe", in that order.
    """
    if model_card is None:
        return []

    card_tags = model_card.data.tags
    merge_in_metadata = False
    moe_in_metadata = False
    if card_tags:
        merge_in_metadata = any(tag in card_tags for tag in ("merge", "moerge", "mergekit", "lazymergekit"))
        moe_in_metadata = any(tag in card_tags for tag in ("moe", "moerge"))

    card_text = model_card.text.lower()
    found = []

    merge_in_text = any(keyword in card_text for keyword in ("merged model", "merge model", "moerge"))
    if merge_in_text or merge_in_metadata:
        found.append("merge")

    moe_in_text = any(keyword in card_text for keyword in ("moe", "mixtral"))
    # Hardcoding because of gating problem
    if "Qwen/Qwen1.5-32B" in model:
        moe_in_text = False

    name_tokens = model.lower().replace("/", "-").replace("_", "-").split("-")
    if moe_in_text or "moe" in name_tokens or moe_in_metadata:
        found.append("moe")

    return found
130
+
131
+
132
def test():
    """Manual smoke test for the helpers in this module.

    Runs ``check_model_card``, ``is_model_on_hub`` and ``get_model_size``
    against one fixed model id and prints the results. Presumably requires
    access to the Hugging Face Hub.

    The leftover ``pdb.set_trace()`` breakpoint was removed so that running
    the module as a script no longer blocks on an interactive debugger.
    """
    model = "meta-llama/Meta-Llama-3-8B-Instruct"

    # Test check_model_card
    status, error, card = check_model_card(model)

    # Test is_model_on_hub
    status2, error2, config2 = is_model_on_hub(model, "main")
    assert status, error
    print(status2, error2, config2)

    # Test get_model_size
    model_info = ModelInfo(id=model)
    precision = "GPTQ"
    model_size = get_model_size(model_info, precision)
    print(model_size)

    # Test get_model_arch
    # model_arch = get_model_arch(model_info)
157
+
158
+
159
+ if __name__ == "__main__":
160
+ test()
src/model_list.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODELS = {
2
+ "pretrained": {
3
+ "<=4B": [
4
+ "microsoft/phi-1",
5
+ "microsoft/phi-1_5",
6
+ "microsoft/phi-2",
7
+ "Qwen/Qwen1.5-0.5B",
8
+ "Qwen/Qwen1.5-1.8B",
9
+ "Qwen/Qwen1.5-4B",
10
+ "google/gemma-2b",
11
+ "allenai/OLMo-1B",
12
+ ],
13
+ "<=7B": [
14
+ "google/gemma-7b",
15
+ "mistralai/Mistral-7B-v0.1",
16
+ "Qwen/Qwen1.5-7B",
17
+ "01-ai/Yi-6B",
18
+ "meta-llama/Llama-2-7b-hf",
19
+ "codellama/CodeLlama-7b-hf",
20
+ "EleutherAI/llemma_7b",
21
+ "allenai/OLMo-7B",
22
+ "mistral-community/Mistral-7B-v0.2",
23
+ ],
24
+ "<=14B": [
25
+ "Qwen/Qwen1.5-14B",
26
+ "meta-llama/Llama-2-13b-hf",
27
+ "codellama/CodeLlama-13b-hf",
28
+ "upstage/SOLAR-10.7B-v1.0",
29
+ "meta-llama/Meta-Llama-3-8B",
30
+ ],
31
+ "<=50B": [
32
+ "01-ai/Yi-34B",
33
+ "EleutherAI/llemma_34b",
34
+ "codellama/CodeLlama-34b-hf",
35
+ "mistralai/Mixtral-8x7B-v0.1",
36
+ "Qwen/Qwen1.5-32B",
37
+ ],
38
+ "<=75B": [
39
+ "meta-llama/Llama-2-70b-hf",
40
+ "codellama/CodeLlama-70b-hf",
41
+ "meta-llama/Meta-Llama-3-70B",
42
+ "Qwen/Qwen1.5-72B",
43
+ ],
44
+ "<=175B": [
45
+ "mistral-community/Mixtral-8x22B-v0.1-AWQ",
46
+ ],
47
+ },
48
+ "instruction_tuned": {
49
+ "<=4B": [
50
+ "Qwen/Qwen1.5-0.5B-Chat",
51
+ "Qwen/Qwen1.5-1.8B-Chat",
52
+ "Qwen/Qwen1.5-4B-Chat",
53
+ "google/gemma-2b-it",
54
+ "google/gemma-1.1-2b-it",
55
+ "microsoft/Phi-3-mini-4k-instruct",
56
+ "microsoft/Phi-3-mini-128k-instruct",
57
+ ],
58
+ "<=7B": [
59
+ "google/gemma-7b-it",
60
+ "mistralai/Mistral-7B-Instruct-v0.2",
61
+ "Qwen/Qwen1.5-7B-Chat",
62
+ "01-ai/Yi-6B-Chat",
63
+ "meta-llama/Llama-2-7b-chat-hf",
64
+ "codellama/CodeLlama-7b-Instruct-hf",
65
+ "allenai/OLMo-7B-SFT",
66
+ "allenai/OLMo-7B-Instruct",
67
+ "allenai/tulu-2-7b",
68
+ "allenai/tulu-2-dpo-7b",
69
+ "allenai/codetulu-2-7b",
70
+ "microsoft/Orca-2-7b",
71
+ "openchat/openchat-3.5-0106",
72
+ "teknium/OpenHermes-2-Mistral-7B",
73
+ "teknium/OpenHermes-2.5-Mistral-7B",
74
+ "NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
75
+ "HuggingFaceH4/zephyr-7b-beta",
76
+ "berkeley-nest/Starling-LM-7B-alpha",
77
+ "Nexusflow/Starling-LM-7B-beta",
78
+ "kaist-ai/mistral-orpo-alpha",
79
+ "kaist-ai/mistral-orpo-beta",
80
+ "google/gemma-1.1-7b-it",
81
+ ],
82
+ "<=14B": [
83
+ "Qwen/Qwen1.5-14B-Chat",
84
+ "meta-llama/Llama-2-13b-chat-hf",
85
+ "codellama/CodeLlama-13b-Instruct-hf",
86
+ "allenai/tulu-2-13b",
87
+ "allenai/tulu-2-dpo-13b",
88
+ "allenai/codetulu-2-13b",
89
+ "microsoft/Orca-2-13b",
90
+ "upstage/SOLAR-10.7B-Instruct-v1.0",
91
+ "meta-llama/Meta-Llama-3-8B-Instruct",
92
+ "CohereForAI/aya-101",
93
+ ],
94
+ "<=50B": [
95
+ "01-ai/Yi-34B-Chat",
96
+ "codellama/CodeLlama-34b-Instruct-hf",
97
+ "allenai/codetulu-2-34b",
98
+ "mistralai/Mixtral-8x7B-Instruct-v0.1",
99
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
100
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
101
+ "NousResearch/Nous-Hermes-2-Yi-34B",
102
+ "CohereForAI/c4ai-command-r-v01",
103
+ "Qwen/Qwen1.5-32B-Chat",
104
+ ],
105
+ "<=75B": [
106
+ "meta-llama/Llama-2-70b-chat-hf",
107
+ "codellama/CodeLlama-70b-Instruct-hf",
108
+ "Qwen/Qwen1.5-72B-Chat",
109
+ "allenai/tulu-2-dpo-70b",
110
+ "meta-llama/Meta-Llama-3-70B-Instruct",
111
+ ],
112
+ "<=175B": [
113
+ "alpindale/c4ai-command-r-plus-GPTQ",
114
+ "MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ",
115
+ "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ",
116
+ ],
117
+ },
118
+ }
119
+
120
+ API_MODELS = [
121
+ "gpt-3.5-turbo-0125",
122
+ "gpt-3.5-turbo-1106",
123
+ "gpt-4-0125-preview",
124
+ "gpt-4-1106-preview",
125
+ "gpt-4-turbo-2024-04-09",
126
+ "gpt-4o-2024-05-13",
127
+ "claude-3-haiku-20240307",
128
+ "claude-3-opus-20240229",
129
+ "claude-3-sonnet-20240229",
130
+ "mistral-large",
131
+ "mistral-medium",
132
+ "gemini-1.0-pro",
133
+ "gemini-pro-1.5",
134
+ "google/gemini-flash-1.5",
135
+ "qwen/qwen-110b-chat",
136
+ ]
137
+
138
+
139
+ ORDERED_MODELS = [
140
+ "microsoft/phi-1",
141
+ "microsoft/phi-1_5",
142
+ "microsoft/phi-2",
143
+ "Qwen/Qwen1.5-0.5B",
144
+ "Qwen/Qwen1.5-1.8B",
145
+ "Qwen/Qwen1.5-4B",
146
+ "google/gemma-2b",
147
+ "allenai/OLMo-1B",
148
+ "Qwen/Qwen1.5-0.5B-Chat",
149
+ "Qwen/Qwen1.5-1.8B-Chat",
150
+ "Qwen/Qwen1.5-4B-Chat",
151
+ "microsoft/Phi-3-mini-4k-instruct",
152
+ "microsoft/Phi-3-mini-128k-instruct",
153
+ "google/gemma-2b-it",
154
+ "google/gemma-1.1-2b-it",
155
+ "google/gemma-7b",
156
+ "mistralai/Mistral-7B-v0.1",
157
+ "mistral-community/Mistral-7B-v0.2",
158
+ "Qwen/Qwen1.5-7B",
159
+ "01-ai/Yi-6B",
160
+ "meta-llama/Llama-2-7b-hf",
161
+ "codellama/CodeLlama-7b-hf",
162
+ "meta-llama/Meta-Llama-3-8B",
163
+ "EleutherAI/llemma_7b",
164
+ "allenai/OLMo-7B",
165
+ "google/gemma-7b-it",
166
+ "google/gemma-1.1-7b-it",
167
+ "mistralai/Mistral-7B-Instruct-v0.2",
168
+ "Qwen/Qwen1.5-7B-Chat",
169
+ "01-ai/Yi-6B-Chat",
170
+ "meta-llama/Llama-2-7b-chat-hf",
171
+ "codellama/CodeLlama-7b-Instruct-hf",
172
+ "meta-llama/Meta-Llama-3-8B-Instruct",
173
+ "allenai/OLMo-7B-SFT",
174
+ "allenai/OLMo-7B-Instruct",
175
+ "allenai/tulu-2-7b",
176
+ "allenai/tulu-2-dpo-7b",
177
+ "allenai/codetulu-2-7b",
178
+ "microsoft/Orca-2-7b",
179
+ "openchat/openchat-3.5-0106",
180
+ "teknium/OpenHermes-2-Mistral-7B",
181
+ "teknium/OpenHermes-2.5-Mistral-7B",
182
+ "NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
183
+ "Starling-LM-7B-alpha",
184
+ "Starling-LM-7B-beta",
185
+ "kaist-ai/mistral-orpo-alpha",
186
+ "kaist-ai/mistral-orpo-beta",
187
+ "HuggingFaceH4/zephyr-7b-beta",
188
+ "Qwen/Qwen1.5-14B",
189
+ "meta-llama/Llama-2-13b-hf",
190
+ "codellama/CodeLlama-13b-hf",
191
+ "upstage/SOLAR-10.7B-v1.0",
192
+ "Qwen/Qwen1.5-14B-Chat",
193
+ "upstage/SOLAR-10.7B-Instruct-v1.0",
194
+ "CohereForAI/aya-101",
195
+ "meta-llama/Llama-2-13b-chat-hf",
196
+ "codellama/CodeLlama-13b-Instruct-hf",
197
+ "allenai/tulu-2-13b",
198
+ "allenai/tulu-2-dpo-13b",
199
+ "allenai/codetulu-2-13b",
200
+ "microsoft/Orca-2-13b",
201
+ "01-ai/Yi-34B",
202
+ "EleutherAI/llemma_34b",
203
+ "Qwen/Qwen1.5-32B",
204
+ "codellama/CodeLlama-34b-hf",
205
+ "mistralai/Mixtral-8x7B-v0.1",
206
+ "01-ai/Yi-34B-Chat",
207
+ "NousResearch/Nous-Hermes-2-Yi-34B",
208
+ "codellama/CodeLlama-34b-Instruct-hf",
209
+ "allenai/codetulu-2-34b",
210
+ "Qwen/Qwen1.5-32B-Chat",
211
+ "mistralai/Mixtral-8x7B-Instruct-v0.1",
212
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
213
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
214
+ "CohereForAI/c4ai-command-r-v01",
215
+ "meta-llama/Llama-2-70b-hf",
216
+ "codellama/CodeLlama-70b-hf",
217
+ "mistral-community/Mixtral-8x22B-v0.1-AWQ",
218
+ "meta-llama/Meta-Llama-3-70B",
219
+ "Qwen/Qwen1.5-72B",
220
+ "meta-llama/Llama-2-70b-chat-hf",
221
+ "codellama/CodeLlama-70b-Instruct-hf",
222
+ "allenai/tulu-2-dpo-70b",
223
+ "alpindale/c4ai-command-r-plus-GPTQ",
224
+ "meta-llama/Meta-Llama-3-70B-Instruct",
225
+ "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ",
226
+ "MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ",
227
+ "Qwen/Qwen1.5-72B-Chat",
228
+ "qwen/qwen-110b-chat",
229
+ "gpt-3.5-turbo-1106",
230
+ "gpt-3.5-turbo-0125",
231
+ "gpt-4-1106-preview",
232
+ "gpt-4-0125-preview",
233
+ "gpt-4-turbo-2024-04-09",
234
+ "gpt-4o-2024-05-13",
235
+ "mistral-medium",
236
+ "mistral-large",
237
+ "gemini-1.0-pro",
238
+ "gemini-pro-1.5",
239
+ "google/gemini-flash-1.5",
240
+ "claude-3-haiku-20240307",
241
+ "claude-3-sonnet-20240229",
242
+ "claude-3-opus-20240229",
243
+ ]
244
+
245
+
246
# Model ids consulted by ``is_trained_model`` / ``get_not_trained_models``.
# Ids must be spelled exactly as in ``MODELS``; three entries previously were
# not ("...Mistral-8x7B-SFT/DPO" instead of "Mixtral", and "tulu-2-dpo-72b"
# instead of "70b") and therefore never matched any listed model.
bgb_trained_models = [
    "microsoft/phi-1",
    "microsoft/phi-1_5",
    "microsoft/phi-2",
    "Qwen/Qwen1.5-0.5B",
    "Qwen/Qwen1.5-1.8B",
    "Qwen/Qwen1.5-4B",
    "google/gemma-2b",
    "allenai/OLMo-1B",
    "google/gemma-7b",
    "mistralai/Mistral-7B-v0.1",
    "Qwen/Qwen1.5-7B",
    "01-ai/Yi-6B",
    "meta-llama/Llama-2-7b-hf",
    "codellama/CodeLlama-7b-hf",
    "EleutherAI/llemma_7b",
    "allenai/OLMo-7B",
    "Qwen/Qwen1.5-14B",
    "meta-llama/Llama-2-13b-hf",
    "codellama/CodeLlama-13b-hf",
    "upstage/SOLAR-10.7B-v1.0",
    "01-ai/Yi-34B",
    "EleutherAI/llemma_34b",
    "codellama/CodeLlama-34b-hf",
    "mistralai/Mixtral-8x7B-v0.1",
    "meta-llama/Llama-2-70b-hf",
    "codellama/CodeLlama-70b-hf",
    "Qwen/Qwen1.5-72B",
    "Qwen/Qwen1.5-0.5B-Chat",
    "Qwen/Qwen1.5-1.8B-Chat",
    "Qwen/Qwen1.5-4B-Chat",
    "google/gemma-2b-it",
    "google/gemma-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "Qwen/Qwen1.5-7B-Chat",
    "01-ai/Yi-6B-Chat",
    "meta-llama/Llama-2-7b-chat-hf",
    "codellama/CodeLlama-7b-Instruct-hf",
    "allenai/OLMo-7B-SFT",
    "allenai/OLMo-7B-Instruct",
    "allenai/tulu-2-7b",
    "allenai/tulu-2-dpo-7b",
    "allenai/codetulu-2-7b",
    "microsoft/Orca-2-7b",
    "openchat/openchat-3.5-0106",
    "teknium/OpenHermes-2-Mistral-7B",
    "teknium/OpenHermes-2.5-Mistral-7B",
    "NousResearch/Nous-Hermes-2-Mistral-7B-DPO",
    "HuggingFaceH4/zephyr-7b-beta",
    "Qwen/Qwen1.5-14B-Chat",
    "meta-llama/Llama-2-13b-chat-hf",
    "codellama/CodeLlama-13b-Instruct-hf",
    "allenai/tulu-2-13b",
    "allenai/tulu-2-dpo-13b",
    "allenai/codetulu-2-13b",
    "microsoft/Orca-2-13b",
    "01-ai/Yi-34B-Chat",
    "codellama/CodeLlama-34b-Instruct-hf",
    "allenai/codetulu-2-34b",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "meta-llama/Llama-2-70b-chat-hf",
    "codellama/CodeLlama-70b-Instruct-hf",
    "Qwen/Qwen1.5-72B-Chat",
    "allenai/tulu-2-dpo-70b",
]
314
+
315
+
316
+ MODEL_MAPPING = {
317
+ "microsoft/phi-1": [1.3, "Base"],
318
+ "microsoft/phi-1_5": [1.3, "Base"],
319
+ "microsoft/phi-2": [2.7, "Base"],
320
+ "Qwen/Qwen1.5-0.5B": [0.5, "Base"],
321
+ "Qwen/Qwen1.5-1.8B": [1.8, "Base"],
322
+ "Qwen/Qwen1.5-4B": [4.0, "Base"],
323
+ "google/gemma-2b": [2.0, "Base"],
324
+ "allenai/OLMo-1B": [1.0, "Base"],
325
+ "Qwen/Qwen1.5-0.5B-Chat": [0.5, "Chat", "Qwen/Qwen1.5-0.5B"],
326
+ "Qwen/Qwen1.5-1.8B-Chat": [1.8, "Chat", "Qwen/Qwen1.5-1.8B"],
327
+ "Qwen/Qwen1.5-4B-Chat": [4.0, "Chat", "Qwen/Qwen1.5-4B"],
328
+ "microsoft/Phi-3-mini-4k-instruct": [3.8, "Chat"],
329
+ "microsoft/Phi-3-mini-128k-instruct": [3.8, "Chat"],
330
+ "google/gemma-2b-it": [2.0, "Chat", "google/gemma-2b"],
331
+ "google/gemma-1.1-2b-it": [2.0, "Chat"],
332
+ "google/gemma-7b": [7.0, "Base"],
333
+ "mistralai/Mistral-7B-v0.1": [7.0, "Base"],
334
+ "mistral-community/Mistral-7B-v0.2": [7.0, "Base"],
335
+ "Qwen/Qwen1.5-7B": [7.0, "Base"],
336
+ "01-ai/Yi-6B": [6.0, "Base"],
337
+ "meta-llama/Llama-2-7b-hf": [7.0, "Base"],
338
+ "codellama/CodeLlama-7b-hf": [7.0, "Base"],
339
+ "meta-llama/Meta-Llama-3-8B": [8.0, "Base"],
340
+ "EleutherAI/llemma_7b": [7.0, "Base"],
341
+ "allenai/OLMo-7B": [7.0, "Base"],
342
+ "google/gemma-7b-it": [7.0, "Chat", "google/gemma-7b"],
343
+ "google/gemma-1.1-7b-it": [7.0, "Chat"],
344
+ "mistralai/Mistral-7B-Instruct-v0.2": [7.0, "Chat", "mistral-community/Mistral-7B-v0.2"],
345
+ "Qwen/Qwen1.5-7B-Chat": [7.0, "Chat", "Qwen/Qwen1.5-7B"],
346
+ "01-ai/Yi-6B-Chat": [6.0, "Chat", "01-ai/Yi-6B"],
347
+ "meta-llama/Llama-2-7b-chat-hf": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
348
+ "codellama/CodeLlama-7b-Instruct-hf": [7.0, "Chat", "codellama/CodeLlama-7b-hf"],
349
+ "meta-llama/Meta-Llama-3-8B-Instruct": [8.0, "Chat", "meta-llama/Meta-Llama-3-8B"],
350
+ "allenai/OLMo-7B-SFT": [7.0, "Chat", "allenai/OLMo-7B"],
351
+ "allenai/OLMo-7B-Instruct": [7.0, "Chat", "allenai/OLMo-7B"],
352
+ "allenai/tulu-2-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
353
+ "allenai/tulu-2-dpo-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
354
+ "allenai/codetulu-2-7b": [7.0, "Chat", "codellama/CodeLlama-7b-hf"],
355
+ "microsoft/Orca-2-7b": [7.0, "Chat", "meta-llama/Llama-2-7b-hf"],
356
+ "openchat/openchat-3.5-0106": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
357
+ "teknium/OpenHermes-2-Mistral-7B": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
358
+ "teknium/OpenHermes-2.5-Mistral-7B": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
359
+ "NousResearch/Nous-Hermes-2-Mistral-7B-DPO": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
360
+ "Starling-LM-7B-alpha": [7.0, "Chat"],
361
+ "Starling-LM-7B-beta": [7.0, "Chat"],
362
+ "kaist-ai/mistral-orpo-alpha": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
363
+ "kaist-ai/mistral-orpo-beta": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
364
+ "HuggingFaceH4/zephyr-7b-beta": [7.0, "Chat", "mistralai/Mistral-7B-v0.1"],
365
+ "Qwen/Qwen1.5-14B": [14.0, "Base"],
366
+ "meta-llama/Llama-2-13b-hf": [13.0, "Base"],
367
+ "codellama/CodeLlama-13b-hf": [13.0, "Base"],
368
+ "upstage/SOLAR-10.7B-v1.0": [10.7, "Base"],
369
+ "Qwen/Qwen1.5-14B-Chat": [14.0, "Chat", "Qwen/Qwen1.5-14B"],
370
+ "upstage/SOLAR-10.7B-Instruct-v1.0": [10.7, "Chat", "upstage/SOLAR-10.7B-v1.0"],
371
+ "CohereForAI/aya-101": [13.0, "Chat"],
372
+ "meta-llama/Llama-2-13b-chat-hf": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
373
+ "codellama/CodeLlama-13b-Instruct-hf": [13.0, "Chat", "codellama/CodeLlama-13b-hf"],
374
+ "allenai/tulu-2-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
375
+ "allenai/tulu-2-dpo-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
376
+ "allenai/codetulu-2-13b": [13.0, "Chat", "codellama/CodeLlama-13b-hf"],
377
+ "microsoft/Orca-2-13b": [13.0, "Chat", "meta-llama/Llama-2-13b-hf"],
378
+ "01-ai/Yi-34B": [34.0, "Base"],
379
+ "EleutherAI/llemma_34b": [34.0, "Base"],
380
+ "Qwen/Qwen1.5-32B": [32.0, "Base"],
381
+ "codellama/CodeLlama-34b-hf": [34.0, "Base"],
382
+ "mistralai/Mixtral-8x7B-v0.1": [46.7, "Base"],
383
+ "01-ai/Yi-34B-Chat": [34.0, "Chat", "01-ai/Yi-34B"],
384
+ "NousResearch/Nous-Hermes-2-Yi-34B": [34.0, "Chat", "01-ai/Yi-34B"],
385
+ "codellama/CodeLlama-34b-Instruct-hf": [34.0, "Chat", "codellama/CodeLlama-34b-hf"],
386
+ "allenai/codetulu-2-34b": [34.0, "Chat", "codellama/CodeLlama-34b-hf"],
387
+ "Qwen/Qwen1.5-32B-Chat": [32.0, "Chat", "Qwen/Qwen1.5-32B"],
388
+ "mistralai/Mixtral-8x7B-Instruct-v0.1": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
389
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
390
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": [46.7, "Chat", "mistralai/Mixtral-8x7B-v0.1"],
391
+ "CohereForAI/c4ai-command-r-v01": [35.0, "Chat"],
392
+ "meta-llama/Llama-2-70b-hf": [70.0, "Base"],
393
+ "codellama/CodeLlama-70b-hf": [70.0, "Base"],
394
+ "mistral-community/Mixtral-8x22B-v0.1-AWQ": ["AWQ", "Base"],
395
+ "meta-llama/Meta-Llama-3-70B": [70.0, "Base"],
396
+ "Qwen/Qwen1.5-72B": [72.0, "Base"],
397
+ "meta-llama/Llama-2-70b-chat-hf": [70.0, "Chat", "meta-llama/Llama-2-70b-hf"],
398
+ "codellama/CodeLlama-70b-Instruct-hf": [70.0, "Chat", "codellama/CodeLlama-70b-hf"],
399
+ "allenai/tulu-2-dpo-70b": [70.0, "Chat", "meta-llama/Llama-2-70b-hf"],
400
+ "alpindale/c4ai-command-r-plus-GPTQ": ["GPTQ", "Chat"],
401
+ "meta-llama/Meta-Llama-3-70B-Instruct": [70.0, "Chat", "meta-llama/Meta-Llama-3-70B"],
402
+ "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ": ["AWQ", "Chat", "mistral-community/Mixtral-8x22B-v0.1-AWQ"],
403
+ "MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ": ["AWQ", "Chat", "mistral-community/Mixtral-8x22B-v0.1-AWQ"],
404
+ "Qwen/Qwen1.5-72B-Chat": [72.0, "Chat", "Qwen/Qwen1.5-72B"],
405
+ "qwen/qwen-110b-chat": [110.0, "Chat", None],
406
+ "gpt-3.5-turbo-1106": ["Proprietary", "Proprietary"],
407
+ "gpt-3.5-turbo-0125": ["Proprietary", "Proprietary"],
408
+ "gpt-4-1106-preview": ["Proprietary", "Proprietary"],
409
+ "gpt-4-0125-preview": ["Proprietary", "Proprietary"],
410
+ "gpt-4-turbo-2024-04-09": ["Proprietary", "Proprietary"],
411
+ "gpt-4o-2024-05-13": ["Proprietary", "Proprietary"],
412
+ "mistral-medium": ["Proprietary", "Proprietary"],
413
+ "mistral-large": ["Proprietary", "Proprietary"],
414
+ "gemini-1.0-pro": ["Proprietary", "Proprietary"],
415
+ "gemini-pro-1.5": ["Proprietary", "Proprietary"],
416
+ "google/gemini-flash-1.5": ["Proprietary", "Proprietary"],
417
+ "claude-3-haiku-20240307": ["Proprietary", "Proprietary"],
418
+ "claude-3-sonnet-20240229": ["Proprietary", "Proprietary"],
419
+ "claude-3-opus-20240229": ["Proprietary", "Proprietary"],
420
+ }
421
+
422
+
423
+ MODEL_SHORT_TO_LONG = {model.split("/")[-1]: model for model in ORDERED_MODELS}
424
+
425
+
426
def get_model_type(model_name: str) -> str:
    """Classify a model id by the table it is listed in.

    Args:
        model_name: Model id exactly as written in ``MODELS`` / ``API_MODELS``.

    Returns:
        ``"base"`` if listed under ``MODELS["pretrained"]``, ``"instruct"`` if
        listed under ``MODELS["instruction_tuned"]``, ``"api"`` if listed in
        ``API_MODELS`` (checked in that order).

    Raises:
        ValueError: If the id is in none of the tables.
    """
    if any(model_name in bucket for bucket in MODELS["pretrained"].values()):
        return "base"

    if any(model_name in bucket for bucket in MODELS["instruction_tuned"].values()):
        return "instruct"

    if model_name in API_MODELS:
        return "api"

    # The former trailing ``return None`` was unreachable and has been dropped.
    raise ValueError(f"Model {model_name} not found in model_list.py")
440
+
441
+
442
def get_open_model_list() -> list:
    """Return every id in ``MODELS``: pretrained buckets first, then instruction-tuned.

    Bucket order and in-bucket order follow the ``MODELS`` literal; a new list
    is built on each call.
    """
    return [
        name
        for category in ("pretrained", "instruction_tuned")
        for bucket in MODELS[category].values()
        for name in bucket
    ]
451
+
452
+
453
def get_all_model_list() -> list:
    """Return all ids from ``MODELS`` followed by those in ``API_MODELS``.

    Order is: pretrained buckets, instruction-tuned buckets, API models. A new
    list is built on each call.
    """
    collected = []
    for category in ("pretrained", "instruction_tuned"):
        for bucket in MODELS[category].values():
            collected += bucket
    collected += API_MODELS
    return collected
464
+
465
+
466
def get_pretrained_models() -> list:
    """Return the ids of every bucket under ``MODELS["pretrained"]`` as one flat list."""
    return [name for bucket in MODELS["pretrained"].values() for name in bucket]
471
+
472
+
473
def get_instruct_models() -> list:
    """Return the ids of every bucket under ``MODELS["instruction_tuned"]`` as one flat list."""
    flattened = []
    for bucket in MODELS["instruction_tuned"].values():
        flattened += bucket
    return flattened
478
+
479
+
480
def get_model_params(model_name: str) -> int:
    """Return the numeric upper bound of the size bucket a model is listed in.

    The bucket keys of ``MODELS`` look like ``"<=7B"``; the integer part
    (``7``) is returned. Pretrained buckets are searched before
    instruction-tuned ones.

    Raises:
        ValueError: If the id is in no bucket of ``MODELS``.
    """
    for category in ("pretrained", "instruction_tuned"):
        for bucket_label, bucket in MODELS[category].items():
            if model_name in bucket:
                # "<=14B" -> "<=14" -> "14"
                upper_bound = bucket_label.split("B")[0].replace("<=", "")
                return int(upper_bound)

    raise ValueError(f"Model {model_name} not found in model_list.py")
490
+
491
+
492
def get_model_num_gpus(model_name: str) -> int:
    """Return the GPU count configured for the model's size bucket.

    The bucket is resolved with ``get_model_params`` (which raises
    ``ValueError`` for unknown ids); a bucket missing from the table below
    raises ``KeyError``.
    """
    bucket_bound = get_model_params(model_name)
    # size-bucket upper bound -> number of GPUs
    gpus_by_bucket = {4: 1, 7: 1, 14: 2, 50: 4, 75: 8, 175: 4}
    return gpus_by_bucket[bucket_bound]
503
+
504
+
505
def get_not_trained_models() -> list:
    """Return the ids from ``get_all_model_list()`` that are absent from ``bgb_trained_models``.

    The order of ``get_all_model_list()`` is preserved.
    """
    return [candidate for candidate in get_all_model_list() if candidate not in bgb_trained_models]
510
+
511
+
512
def is_trained_model(model_name: str) -> bool:
    """Return whether ``model_name`` is listed (exact match) in ``bgb_trained_models``."""
    listed = model_name in bgb_trained_models
    return listed
514
+
515
+
516
if __name__ == "__main__":
    # Sanity checks. These must compare with ``==``: in ``assert expr, other``
    # the second operand is only the failure message, so the previous form
    # passed for any truthy return value.
    assert get_model_type("microsoft/phi-1") == "base"
    assert get_model_params("microsoft/phi-2") == 4

    models = get_all_model_list()

    # One quoted model id per line.
    model_list_str = "".join(f'"{model}"\n' for model in models)
    print(model_list_str)

    print(f"{len(models)} models found in src/model_list.py")

    print(get_not_trained_models())
src/panel.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.leaderboard import BGB_COLUMN_MAPPING, get_bgb_leaderboard_df, get_leaderboard_df
4
+ from src.llm_perf import get_eval_df, get_llm_perf_df
5
+
6
+
7
def select_columns_fn(machine, columns, search, llm_perf_df=None):
    """Build the leaderboard view filtered by model name and restricted to ``columns``.

    Args:
        machine: Passed to ``get_llm_perf_df`` when no dataframe is supplied.
        columns: Column labels to keep in the result.
        search: Pattern matched case-insensitively against the model column
            via ``str.contains``.
        llm_perf_df: Optional pre-loaded dataframe; loaded on demand when ``None``.

    Returns:
        The filtered leaderboard dataframe with only ``columns``.
    """
    perf_df = get_llm_perf_df(machine=machine) if llm_perf_df is None else llm_perf_df

    leaderboard_df = get_leaderboard_df(perf_df)
    name_matches = leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
    return leaderboard_df[name_matches][columns]
18
+
19
+
20
def select_columns_bgb_fn(machine, columns, search, type_checkboxes, param_slider, eval_df=None):
    """Build the BGB leaderboard view filtered by model name and selected columns.

    Args:
        machine: Passed to ``get_eval_df`` when no dataframe is supplied.
        columns: Column labels selected by the user.
        search: Pattern matched case-insensitively against the model column
            via ``str.contains``.
        type_checkboxes: Additional column labels appended after ``columns``.
        param_slider: Accepted so the callback signature matches its wired
            inputs; not used for filtering here.
        eval_df: Optional pre-loaded dataframe; loaded on demand when ``None``.

    Returns:
        The filtered dataframe with the model column first, then ``columns``,
        then ``type_checkboxes``.
    """
    if eval_df is None:
        eval_df = get_eval_df(machine)

    selected_leaderboard_df = get_bgb_leaderboard_df(eval_df)
    selected_leaderboard_df = selected_leaderboard_df[
        selected_leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
    ]

    # NOTE: a leftover debug ``print(param_slider)`` and ``pdb.set_trace()``
    # were removed -- a breakpoint inside a UI callback blocks every
    # invocation waiting for a debugger prompt.

    columns = ["Model πŸ€—"] + columns + type_checkboxes

    return selected_leaderboard_df[columns]
38
+
39
+
40
def create_select_callback(
    # fixed
    machine_textbox,
    # interactive
    columns_checkboxes,
    search_bar,
    type_checkboxes,
    param_slider,
    # outputs
    leaderboard_table,
):
    """Register ``select_columns_bgb_fn`` as the ``change`` handler of the column selector and the search bar.

    Both handlers receive the same five inputs (machine, columns, search,
    type checkboxes, parameter slider) and write to ``leaderboard_table``.
    Only ``columns_checkboxes`` and ``search_bar`` trigger a refresh.
    """
    for trigger in (columns_checkboxes, search_bar):
        trigger.change(
            fn=select_columns_bgb_fn,
            # Fresh lists per registration, as in the two explicit calls this replaces.
            inputs=[machine_textbox, columns_checkboxes, search_bar, type_checkboxes, param_slider],
            outputs=[leaderboard_table],
        )
src/utils.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoConfig
2
+
3
+ LLM_MODEL_ARCHS = {
4
+ "stablelm_epoch": "πŸ”΄ StableLM-Epoch",
5
+ "stablelm_alpha": "πŸ”΄ StableLM-Alpha",
6
+ "mixformer-sequential": "πŸ§‘β€πŸ’» Phi Ο†",
7
+ "RefinedWebModel": "πŸ¦… Falcon",
8
+ "gpt_bigcode": "⭐ StarCoder",
9
+ "RefinedWeb": "πŸ¦… Falcon",
10
+ "baichuan": "🌊 Baichuan 百川", # river
11
+ "internlm": "πŸ§‘β€πŸŽ“ InternLM δΉ¦η”Ÿ", # scholar
12
+ "mistral": "Ⓜ️ Mistral",
13
+ "mixtral": "Ⓜ️ Mixtral",
14
+ "codegen": "♾️ CodeGen",
15
+ "chatglm": "πŸ’¬ ChatGLM",
16
+ "falcon": "πŸ¦… Falcon",
17
+ "bloom": "🌸 Bloom",
18
+ "llama": "πŸ¦™ LLaMA",
19
+ "rwkv": "πŸ¦β€β¬› RWKV",
20
+ "deci": "πŸ”΅ deci",
21
+ "Yi": "πŸ«‚ Yi δΊΊ", # people
22
+ "mpt": "🧱 MPT",
23
+ # suggest something
24
+ "gpt_neox": "GPT-NeoX",
25
+ "gpt_neo": "GPT-Neo",
26
+ "gpt2": "GPT-2",
27
+ "gptj": "GPT-J",
28
+ "bart": "BART",
29
+ }
30
+
31
+
32
def model_hyperlink(link, model_name):
    """Return an HTML ``<a>`` element that opens ``link`` in a new tab, labelled ``model_name``.

    Both values are interpolated as-is (no HTML escaping is applied).
    """
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
34
+
35
+
36
def process_architectures(model):
    """Return the display label for a model's architecture, or ``"Unknown"``.

    Loads the model's config with ``AutoConfig.from_pretrained`` and looks its
    ``model_type`` up in ``LLM_MODEL_ARCHS``. This is best-effort: any
    exception raised while loading or looking up yields ``"Unknown"``.
    """
    fallback = "Unknown"
    try:
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own code while loading -- confirm this is acceptable for every
        # model id passed in.
        loaded = AutoConfig.from_pretrained(model, trust_remote_code=True)
        return LLM_MODEL_ARCHS.get(loaded.model_type, fallback)
    except Exception:
        return fallback
43
+
44
+
45
def process_score(score, quantization):
    """Format ``score`` with two decimals plus a one-character marker.

    The marker is a space when ``quantization`` equals ``"Unquantized"`` and
    ``"*"`` for anything else.
    """
    marker = " " if quantization == "Unquantized" else "*"
    return f"{score:.2f}{marker}"
50
+
51
+
52
def process_quantizations(x):
    """Map a row's backend quantization settings to a short label.

    ``x`` is indexed by the ``config.backend.quantization_*`` keys. Returns
    ``"BnB.4bit"``, ``"BnB.8bit"``, ``"GPTQ.4bit"``, ``"AWQ.4bit"`` or, for
    every other combination, ``"Unquantized"``.
    """
    scheme = x["config.backend.quantization_scheme"]

    if scheme == "bnb":
        # 4-bit is checked first; the 8-bit flag is only read when 4-bit is not set.
        if x["config.backend.quantization_config.load_in_4bit"] is True:
            return "BnB.4bit"
        if x["config.backend.quantization_config.load_in_8bit"] is True:
            return "BnB.8bit"
    elif scheme == "gptq":
        if x["config.backend.quantization_config.bits"] == 4:
            return "GPTQ.4bit"
    elif scheme == "awq":
        if x["config.backend.quantization_config.bits"] == 4:
            return "AWQ.4bit"

    return "Unquantized"
69
+
70
+
71
def process_kernels(x):
    """Map a row's quantization scheme and config version to a kernel label.

    ``x`` is indexed by ``config.backend.quantization_scheme`` and
    ``config.backend.quantization_config.version``. Returns
    ``"GPTQ.ExllamaV1"`` / ``"GPTQ.ExllamaV2"`` for gptq versions 1 / 2,
    ``"AWQ.GEMM"`` / ``"AWQ.GEMV"`` for awq versions ``"gemm"`` / ``"gemv"``,
    and ``"No Kernel"`` otherwise.
    """
    scheme = x["config.backend.quantization_scheme"]

    if scheme == "gptq":
        version = x["config.backend.quantization_config.version"]
        if version == 1:
            return "GPTQ.ExllamaV1"
        if version == 2:
            return "GPTQ.ExllamaV2"
    elif scheme == "awq":
        version = x["config.backend.quantization_config.version"]
        if version == "gemm":
            return "AWQ.GEMM"
        if version == "gemv":
            return "AWQ.GEMV"

    return "No Kernel"
87
+
88
+
89
def test():
    """Manual check: load the config of one fixed model id and print it."""
    model = "Qwen/Qwen1.5-32B"
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)

    # Print instead of stopping at the leftover ``pdb.set_trace()`` breakpoint,
    # so the script shows the config and exits without a debugger session.
    print(config)


if __name__ == "__main__":
    test()