fix-memory-requirements-for-cpu #36
opened by baptistecolle
Files changed:
- app.py (+4 -7)
- hardware.yaml (+1 -1)
- src/llm_perf.py (+27 -7)
app.py

```diff
@@ -67,13 +67,10 @@ with demo:
         search_bar, columns_checkboxes, leaderboard_table = (
             create_leaderboard_table(open_llm_perf_df)
         )
-
-
-
-
-        lat_score_mem_plot = create_lat_score_mem_plot(
-            open_llm_perf_df
-        )
+        with gr.TabItem("Find Your Best Model 🧭", id=1):
+            lat_score_mem_plot = create_lat_score_mem_plot(
+                open_llm_perf_df
+            )
     ###################### ATTENTIONS SPEEDUP TAB #######################
     # with gr.TabItem("Attention 📈", id=2):
     #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
```
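The hunk re-nests the latency/score/memory plot under its own tab; the `with gr.TabItem(...)` wrapper appears to have been missing, so the plot leaked into the preceding tab. A minimal sketch of the restored structure (the placeholder components and the first tab's label are assumptions, not the app's real code):

```python
# Minimal Gradio sketch of the tab nesting restored above. The Markdown
# placeholders and the "Leaderboard" label are assumptions.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Leaderboard 🏅", id=0):  # assumed label
            gr.Markdown("leaderboard table goes here")
        with gr.TabItem("Find Your Best Model 🧭", id=1):
            gr.Markdown("latency/score/memory plot goes here")

if __name__ == "__main__":
    demo.launch()
```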
hardware.yaml

```diff
@@ -39,7 +39,7 @@
 - machine: 32vCPU-C7i
   description: Intel-Xeon-SPR-385W 🖥️
   detail: |
-    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
+    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark. The memory requirement is the max RAM consumption during the decode phase.
   hardware_provider: intel
   hardware_type: cpu
   subsets:
```
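For context, a hedged sketch of reading such an entry back, assuming hardware.yaml is a top-level list of machine entries as the hunk suggests:

```python
# Hedged sketch: load hardware.yaml and print the CPU machines' details.
# Assumes a top-level list of entries with the keys shown in the hunk above.
import yaml

with open("hardware.yaml") as f:
    hardware = yaml.safe_load(f)

for entry in hardware:
    if entry["hardware_type"] == "cpu":
        print(entry["machine"], "->", entry["detail"].strip())
```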
src/llm_perf.py

```diff
@@ -15,7 +15,6 @@ COLUMNS_MAPPING = {
     "report.per_token.latency.p50": "Per Token (s)",
     "report.decode.throughput.value": "Decode (tokens/s)",
     "report.decode.efficiency.value": "Energy (tokens/kWh)",
-    "report.decode.memory.max_allocated": "Memory (MB)",
     # deployment settings
     "config.backend.name": "Backend 🏭",
     "config.backend.torch_dtype": "Precision 🔥",
@@ -28,6 +27,15 @@ COLUMNS_MAPPING = {
     "Average ⬆️": "Open LLM Score (%)",
     "#Params (B)": "Params (B)",
 }
+
+CUDA_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_allocated": "Memory (MB)",
+}
+
+INTEL_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_ram": "Memory (MB)",
+}
+
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
 SORTING_ASCENDING = [False, True, False]
 
```
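The split relies on Python 3.9+ dict union (`|`), which builds a new mapping and leaves the shared base untouched; a toy illustration:

```python
# Toy illustration of the dict-union pattern above: each hardware-specific
# mapping extends the shared base without mutating it.
BASE = {"report.decode.throughput.value": "Decode (tokens/s)"}

CUDA = BASE | {"report.decode.memory.max_allocated": "Memory (MB)"}  # GPU VRAM
INTEL = BASE | {"report.decode.memory.max_ram": "Memory (MB)"}       # host RAM

assert "report.decode.memory.max_ram" not in BASE  # base is unchanged
assert set(CUDA) ^ set(INTEL) == {
    "report.decode.memory.max_allocated",
    "report.decode.memory.max_ram",
}  # the two mappings differ only in their memory source column
```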
```diff
@@ -39,9 +47,10 @@ def get_raw_llm_perf_df(
     for subset in subsets:
         for backend in backends:
             try:
+                url = f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                 dfs.append(
                     pd.read_csv(
-                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
+                        url
                     )
                 )
             except Exception:
@@ -70,7 +79,7 @@ def get_raw_llm_perf_df(
     return llm_perf_df
 
 
-def processed_llm_perf_df(llm_perf_df):
+def processed_llm_perf_df(llm_perf_df, hardware_type: str):
     # some assertions
     assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
     assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
```
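pandas resolves the `hf://` scheme through fsspec, which `huggingface_hub` registers, so `pd.read_csv` streams the file straight from the Hub dataset. A hedged sketch; the backend and subset values below are assumptions, only the machine name appears in this PR:

```python
# Hedged sketch: reading one leaderboard CSV over hf:// (requires pandas and
# huggingface_hub installed; the latter provides the "hf" fsspec filesystem).
# "pytorch" and "unquantized" are assumed placeholder values for the
# {backend} and {subset} slots in the URL pattern above.
import pandas as pd

url = (
    "hf://datasets/optimum-benchmark/llm-perf-leaderboard/"
    "perf-df-pytorch-cpu-unquantized-32vCPU-C7i.csv"
)
df = pd.read_csv(url)
print(df.shape)
```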
```diff
@@ -105,15 +114,23 @@ def processed_llm_perf_df(llm_perf_df):
             "report.decode.throughput.value": 3,
             "report.decode.efficiency.value": 3,
             "report.decode.memory.max_allocated": 3,
+            "report.decode.memory.max_ram": 3,
             "Average ⬆️": 3,
             "prefill+decode": 3,
             "#Params (B)": 3,
         }
     )
+
     # filter columns
-    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
-    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
-
+    if hardware_type == "cuda":
+        llm_perf_df = llm_perf_df[list(CUDA_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=CUDA_COLUMNS_MAPPING, inplace=True)
+    elif hardware_type == "cpu":
+        llm_perf_df = llm_perf_df[list(INTEL_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=INTEL_COLUMNS_MAPPING, inplace=True)
+    else:
+        raise ValueError(f"Hardware type {hardware_type} not supported")
+
     # sort by metric
     llm_perf_df.sort_values(
         by=SORTING_COLUMNS,
```
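A toy run of the dispatch above. One design note: `rename(..., inplace=True)` on a column-sliced frame can trip pandas' SettingWithCopyWarning; assigning the result of `rename`, as in this sketch, sidesteps that:

```python
# Toy run of the hardware_type dispatch: keep only the mapping's source
# columns, then rename them to their display names. CPU runs report max_ram;
# max_allocated stays NaN there.
import pandas as pd

INTEL_COLUMNS_MAPPING = {"report.decode.memory.max_ram": "Memory (MB)"}

df = pd.DataFrame({
    "report.decode.memory.max_ram": [512.0],
    "report.decode.memory.max_allocated": [float("nan")],
})

df = df[list(INTEL_COLUMNS_MAPPING.keys())]
df = df.rename(columns=INTEL_COLUMNS_MAPPING)
print(df.columns.tolist())  # ['Memory (MB)']
```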
```diff
@@ -121,6 +138,9 @@ def processed_llm_perf_df(llm_perf_df):
         inplace=True,
     )
 
+    assert llm_perf_df["Memory (MB)"].notna().any(), "The dataset should contain at least one memory value, otherwise this implies that all the benchmarks have failed (contains only a traceback)"
+    assert llm_perf_df.columns.is_unique, "All columns should be unique"
+
     return llm_perf_df
 
 
@@ -137,7 +157,7 @@ def get_llm_perf_df(
     else:
         print(f"Dataset machine {machine} not found, downloading...")
         llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
-        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df, hardware_type)
         llm_perf_df.to_csv(
             f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
         )
```
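The new assertions guard the cached CSV: `notna().any()` passes as soon as one benchmark produced a memory number, while an all-NaN column means every run failed and only tracebacks were collected; `columns.is_unique` catches two source columns renaming to the same display name. A minimal reproduction of both checks:

```python
# Minimal reproduction of the two sanity checks added above.
import pandas as pd

df = pd.DataFrame({"Memory (MB)": [float("nan"), 512.0]})

assert df["Memory (MB)"].notna().any(), "all benchmarks failed"
assert df.columns.is_unique, "duplicate display names would collide"
```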