fix-memory-requirements-for-cpu

#36
by baptistecolle - opened
Files changed (3)
  1. app.py +4 -7
  2. hardware.yaml +1 -1
  3. src/llm_perf.py +27 -7
app.py CHANGED
@@ -67,13 +67,10 @@ with demo:
             search_bar, columns_checkboxes, leaderboard_table = (
                 create_leaderboard_table(open_llm_perf_df)
             )
-        if (
-            config.hardware_provider != "intel"
-        ):  # TODO intel CPU does not measure the memory requirements correctly, so disable the graph feature until we fix the underlying issue
-            with gr.TabItem("Find Your Best Model 🧭", id=1):
-                lat_score_mem_plot = create_lat_score_mem_plot(
-                    open_llm_perf_df
-                )
+        with gr.TabItem("Find Your Best Model 🧭", id=1):
+            lat_score_mem_plot = create_lat_score_mem_plot(
+                open_llm_perf_df
+            )
         ###################### ATTENTIONS SPEEDUP TAB #######################
         # with gr.TabItem("Attention 📈", id=2):
         #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
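Note: with CPU memory now read from report.decode.memory.max_ram (see src/llm_perf.py below), the Intel-only guard can go away and the "Find Your Best Model" tab renders on every hardware target. A minimal sketch of the tab/plot pattern, with made-up data and a stand-in create_lat_score_mem_plot (the real helper is defined elsewhere in the repo):

```python
import gradio as gr
import pandas as pd
import plotly.express as px

# Hypothetical leaderboard rows; the real frame comes from get_llm_perf_df().
open_llm_perf_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Prefill (s)": [0.12, 0.45],
        "Open LLM Score (%)": [61.3, 70.2],
        "Memory (MB)": [5300.0, 14200.0],
    }
)


def create_lat_score_mem_plot(df):
    # Latency vs. score scatter, bubble size encoding the memory footprint.
    fig = px.scatter(
        df,
        x="Prefill (s)",
        y="Open LLM Score (%)",
        size="Memory (MB)",
        hover_name="Model",
    )
    return gr.Plot(fig)


with gr.Blocks() as demo:
    with gr.TabItem("Find Your Best Model 🧭", id=1):
        lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)

demo.launch()
```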
hardware.yaml CHANGED
@@ -39,7 +39,7 @@
 - machine: 32vCPU-C7i
   description: Intel-Xeon-SPR-385W 🖥️
   detail: |
-    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
+    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark. The memory requirement is the max RAM consumption during the decode phase.
   hardware_provider: intel
   hardware_type: cpu
   subsets:
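The added sentence pins down what the metric means on CPU: peak process RAM during decode, not allocator statistics. As a rough illustration of "max RAM consumption during the decode phase", a sketch that polls the process's resident set size with psutil while decode runs (optimum-benchmark ships its own memory-tracking utilities; this is not its actual implementation):

```python
import threading
import time

import psutil


def track_peak_rss(stop_event, result, interval=0.01):
    # Sample this process's resident set size until told to stop,
    # keeping the maximum observed value.
    process = psutil.Process()
    peak = 0
    while not stop_event.is_set():
        peak = max(peak, process.memory_info().rss)
        time.sleep(interval)
    result["max_ram_mb"] = peak / 1e6  # report in MB, like the leaderboard


stop = threading.Event()
result = {}
tracker = threading.Thread(target=track_peak_rss, args=(stop, result))
tracker.start()
# ... run the decode phase here, e.g. model.generate(...) ...
stop.set()
tracker.join()
print(result["max_ram_mb"])
```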
src/llm_perf.py CHANGED
@@ -15,7 +15,6 @@ COLUMNS_MAPPING = {
     "report.per_token.latency.p50": "Per Token (s)",
     "report.decode.throughput.value": "Decode (tokens/s)",
     "report.decode.efficiency.value": "Energy (tokens/kWh)",
-    "report.decode.memory.max_allocated": "Memory (MB)",
     # deployment settings
     "config.backend.name": "Backend 🏭",
     "config.backend.torch_dtype": "Precision 📥",
@@ -28,6 +27,15 @@ COLUMNS_MAPPING = {
     "Average ⬆️": "Open LLM Score (%)",
     "#Params (B)": "Params (B)",
 }
+
+CUDA_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_allocated": "Memory (MB)",
+}
+
+INTEL_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_ram": "Memory (MB)",
+}
+
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
 SORTING_ASCENDING = [False, True, False]

@@ -39,9 +47,10 @@ def get_raw_llm_perf_df(
     for subset in subsets:
         for backend in backends:
             try:
+                url = f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                 dfs.append(
                     pd.read_csv(
-                        f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
+                        url
                     )
                 )
             except Exception:
@@ -70,7 +79,7 @@ def get_raw_llm_perf_df(
     return llm_perf_df


-def processed_llm_perf_df(llm_perf_df):
+def processed_llm_perf_df(llm_perf_df, hardware_type: str):
     # some assertions
     assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
     assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
@@ -105,15 +114,23 @@ def processed_llm_perf_df(llm_perf_df):
             "report.decode.throughput.value": 3,
             "report.decode.efficiency.value": 3,
             "report.decode.memory.max_allocated": 3,
+            "report.decode.memory.max_ram": 3,
             "Average ⬆️": 3,
             "prefill+decode": 3,
             "#Params (B)": 3,
         }
     )
+
     # filter columns
-    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
-    # rename columns
-    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
+    if hardware_type == "cuda":
+        llm_perf_df = llm_perf_df[list(CUDA_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=CUDA_COLUMNS_MAPPING, inplace=True)
+    elif hardware_type == "cpu":
+        llm_perf_df = llm_perf_df[list(INTEL_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=INTEL_COLUMNS_MAPPING, inplace=True)
+    else:
+        raise ValueError(f"Hardware type {hardware_type} not supported")
+
     # sort by metric
     llm_perf_df.sort_values(
         by=SORTING_COLUMNS,
@@ -121,6 +138,9 @@ def processed_llm_perf_df(llm_perf_df):
         inplace=True,
     )

+    assert llm_perf_df["Memory (MB)"].notna().any(), "The dataset should contain at least one memory value, otherwise this implies that all the benchmarks have failed (contains only a traceback)"
+    assert llm_perf_df.columns.is_unique, "All columns should be unique"
+
     return llm_perf_df


@@ -137,7 +157,7 @@ def get_llm_perf_df(
     else:
         print(f"Dataset machine {machine} not found, downloading...")
         llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
-        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df, hardware_type)
         llm_perf_df.to_csv(
             f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
         )
 
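The two new mappings use PEP 584's dict union: COLUMNS_MAPPING | {...} (Python 3.9+) builds a new merged dict without mutating the base, so the CUDA and Intel variants share every column and differ only in which raw memory column becomes "Memory (MB)" (CUDA allocator peak vs. process peak RAM). A quick demonstration of the operator:

```python
# dict union (Python 3.9+): returns a new dict, merged left to right, with
# right-hand keys winning on conflicts; the base dict is left untouched.
base = {"a": 1, "b": 2}
merged = base | {"b": 20, "c": 3}
print(merged)  # {'a': 1, 'b': 20, 'c': 3}
print(base)    # {'a': 1, 'b': 2} -- unchanged
```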
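Hoisting the long f-string into url keeps the read_csv call readable. pd.read_csv understands hf:// paths through huggingface_hub's fsspec integration, so no manual download step is needed (requires huggingface_hub to be installed; the backend/subset values below are illustrative, the machine name matches hardware.yaml above):

```python
import pandas as pd

# hf:// resolves through huggingface_hub's fsspec filesystem, streaming the
# CSV straight from the dataset repo on the Hub.
url = "hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-pytorch-cpu-unquantized-32vCPU-C7i.csv"
df = pd.read_csv(url)
print(df.shape)
```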
 
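processed_llm_perf_df now needs the hardware type to decide which raw memory column survives the filter-and-rename step. A reduced, runnable sketch of that branching (pick_memory_column is a hypothetical stand-in; the real function applies the full column mappings):

```python
import pandas as pd

# Toy frame carrying both raw memory columns, as a mixed dataset might.
df = pd.DataFrame(
    {
        "report.decode.memory.max_allocated": [1024.0],
        "report.decode.memory.max_ram": [2048.0],
    }
)

CUDA_COLUMNS_MAPPING = {"report.decode.memory.max_allocated": "Memory (MB)"}
INTEL_COLUMNS_MAPPING = {"report.decode.memory.max_ram": "Memory (MB)"}


def pick_memory_column(df, hardware_type):
    # Same select-then-rename branching as processed_llm_perf_df above.
    if hardware_type == "cuda":
        mapping = CUDA_COLUMNS_MAPPING
    elif hardware_type == "cpu":
        mapping = INTEL_COLUMNS_MAPPING
    else:
        raise ValueError(f"Hardware type {hardware_type} not supported")
    return df[list(mapping.keys())].rename(columns=mapping)


print(pick_memory_column(df, "cpu"))   # Memory (MB) = 2048.0 (peak RAM)
print(pick_memory_column(df, "cuda"))  # Memory (MB) = 1024.0 (peak allocated)
```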
 
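The two new assertions are cheap sanity checks: an all-NaN "Memory (MB)" column means every benchmark run failed (the CSVs then contain only tracebacks), and duplicate column labels would mean both memory mappings leaked into one frame. What they catch, in miniature:

```python
import pandas as pd

ok = pd.DataFrame({"Memory (MB)": [512.0, None]})
bad = pd.DataFrame({"Memory (MB)": [None, None]})

assert ok["Memory (MB)"].notna().any()       # passes: at least one real value
assert not bad["Memory (MB)"].notna().any()  # an all-NaN column would fail the check
assert ok.columns.is_unique                  # no duplicate column labels
```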
 
 
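For context, the changed call site sits in get_llm_perf_df's cache-miss branch. A sketch of the surrounding flow, reconstructed from the visible else-branch and using the module's own get_raw_llm_perf_df and processed_llm_perf_df (the cache-hit branch and the DATASET_DIRECTORY value are assumptions):

```python
import os

import pandas as pd

DATASET_DIRECTORY = "dataset"  # assumed; the repo defines its own constant


def get_llm_perf_df(machine, subsets, backends, hardware_type):
    # Reuse the processed per-machine CSV when it exists; otherwise fetch the
    # raw benchmark results, process them for this hardware type, and cache.
    path = f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"
    if os.path.exists(path):
        llm_perf_df = pd.read_csv(path)
    else:
        print(f"Dataset machine {machine} not found, downloading...")
        llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
        llm_perf_df = processed_llm_perf_df(llm_perf_df, hardware_type)
        llm_perf_df.to_csv(path, index=False)
    return llm_perf_df
```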