Spaces:
Sleeping
Sleeping
Alan Liu
committed on
Commit
·
c93009d
1
Parent(s):
ed50ee5
add client throughput
Browse files- app.py +9 -6
- calc_util.py +13 -1
app.py
CHANGED
@@ -138,8 +138,9 @@ with col3: # Prefilling
|
|
138 |
prefilling_operation_count = prefilling_operation(model_config, inference_config)
|
139 |
prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
|
140 |
inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
|
141 |
-
inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
|
142 |
inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
|
|
|
|
|
143 |
cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))
|
144 |
|
145 |
operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
|
@@ -162,9 +163,9 @@ with col3: # Prefilling
|
|
162 |
|
163 |
header5("Summary: Prefilling")
|
164 |
st.markdown(create_table(df_subtotal_operation_count))
|
165 |
-
st.write(f"Prefillng throughput (tokens/s): {inference_info['inference_prefilling_throughput']:.2f}")
|
166 |
st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
|
167 |
st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
|
|
|
168 |
|
169 |
if inference_config['KV_cache']:
|
170 |
st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
|
@@ -175,9 +176,9 @@ with col4: # Generation
|
|
175 |
generation_operation_count = generation_operation(model_config, inference_config)
|
176 |
generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
|
177 |
inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
|
178 |
-
inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
|
179 |
-
inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
|
180 |
inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
|
|
|
|
|
181 |
cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))
|
182 |
|
183 |
operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
|
@@ -199,10 +200,12 @@ with col4: # Generation
|
|
199 |
|
200 |
header5("Summary: Generation")
|
201 |
st.markdown(create_table(df_subtotal_operation_count))
|
202 |
-
st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
|
203 |
-
st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
|
204 |
st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
|
205 |
st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
|
|
|
|
|
206 |
|
207 |
if inference_config['KV_cache']:
|
208 |
st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
|
|
|
138 |
prefilling_operation_count = prefilling_operation(model_config, inference_config)
|
139 |
prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
|
140 |
inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
|
|
|
141 |
inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
|
142 |
+
calc_prefilling_throughput(model_config, inference_config, inference_info)
|
143 |
+
|
144 |
cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))
|
145 |
|
146 |
operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
|
|
|
163 |
|
164 |
header5("Summary: Prefilling")
|
165 |
st.markdown(create_table(df_subtotal_operation_count))
|
|
|
166 |
st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
|
167 |
st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
|
168 |
+
st.write(f"Prefillng throughput (tokens/s): {inference_info['prefilling_throughput']:.2f} ({inference_info['prefilling_bound_type']}-bound)")
|
169 |
|
170 |
if inference_config['KV_cache']:
|
171 |
st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
|
|
|
176 |
generation_operation_count = generation_operation(model_config, inference_config)
|
177 |
generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
|
178 |
inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
|
|
|
|
|
179 |
inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
|
180 |
+
calc_generation_throughput(model_config, inference_config, inference_info)
|
181 |
+
|
182 |
cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))
|
183 |
|
184 |
operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
|
|
|
200 |
|
201 |
header5("Summary: Generation")
|
202 |
st.markdown(create_table(df_subtotal_operation_count))
|
203 |
+
#st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
|
204 |
+
#st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
|
205 |
st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
|
206 |
st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
|
207 |
+
st.write(f"Generation-only throughput (tokens/s): {inference_info['generation_throughput']:.2f} ({inference_info['generation_bound_type']}-bound)")
|
208 |
+
st.write(f"(Client) Generation throughput (tokens/s): {inference_info['client_generation_throughput']:.2f}")
|
209 |
|
210 |
if inference_config['KV_cache']:
|
211 |
st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
|
calc_util.py
CHANGED
@@ -296,4 +296,16 @@ def generation_activation_memory(model_config, inference_config):
|
|
296 |
activation_memory['mlp'] + activation_memory['layernorm']
|
297 |
)
|
298 |
|
299 |
-
return activation_memory
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
activation_memory['mlp'] + activation_memory['layernorm']
|
297 |
)
|
298 |
|
299 |
+
return activation_memory
|
300 |
+
|
301 |
+
|
302 |
+
def calc_prefilling_throughput(model_config, inference_config, inference_info):
    """Record the prefilling throughput and its bottleneck in ``inference_info``.

    The prefilling phase is limited by whichever of its two latencies is
    larger, so throughput is the number of prompt tokens in the batch
    divided by that larger latency.

    Args:
        model_config: Unused; kept so the signature matches the sibling
            throughput helper and existing call sites.
        inference_config: Mapping that provides ``'input_seq_length'`` and
            ``'batchsize'``.
        inference_info: Mapping that must already contain
            ``'inference_prefilling_time'`` and ``'prefilling_memory_latency'``
            (presumably in seconds, since the result is displayed as
            tokens/s -- confirm against the caller). It is updated in place
            with ``'prefilling_throughput'`` and ``'prefilling_bound_type'``.

    Returns:
        None. Results are written into ``inference_info``.
    """
    compute_latency = inference_info['inference_prefilling_time']
    memory_latency = inference_info['prefilling_memory_latency']
    # Read both latencies once so the throughput and the bound label are
    # always derived from the same pair of values.
    bottleneck_latency = max(compute_latency, memory_latency)

    prompt_tokens = inference_config['input_seq_length'] * inference_config['batchsize']
    inference_info['prefilling_throughput'] = prompt_tokens / bottleneck_latency
    # A tie is reported as "arithmetic", matching the strict comparison.
    inference_info['prefilling_bound_type'] = (
        "memory" if compute_latency < memory_latency else "arithmetic"
    )
|
305 |
+
|
306 |
+
def calc_generation_throughput(model_config, inference_config, inference_info):
    """Record generation and client-side throughput in ``inference_info``.

    Generation-only throughput is the number of generated tokens in the
    batch divided by the larger of the generation compute and memory
    latencies. Client throughput divides the same token count by the
    end-to-end time, i.e. the prefilling bottleneck latency plus the
    generation bottleneck latency.

    Args:
        model_config: Unused; kept so the signature matches the sibling
            throughput helper and existing call sites.
        inference_config: Mapping that provides ``'output_seq_length'`` and
            ``'batchsize'``.
        inference_info: Mapping that must already contain
            ``'inference_prefilling_time'``, ``'prefilling_memory_latency'``,
            ``'inference_generation_time'`` and ``'generation_memory_latency'``
            (presumably in seconds, since the results are displayed as
            tokens/s -- confirm against the caller). It is updated in place
            with ``'generation_throughput'``, ``'generation_bound_type'`` and
            ``'client_generation_throughput'``.

    Returns:
        None. Results are written into ``inference_info``.
    """
    compute_latency = inference_info['inference_generation_time']
    memory_latency = inference_info['generation_memory_latency']
    generation_latency = max(compute_latency, memory_latency)

    # Generation produces output tokens, so the count must come from
    # 'output_seq_length' (the prompt length only matters for prefilling).
    generated_tokens = inference_config['output_seq_length'] * inference_config['batchsize']

    inference_info['generation_throughput'] = generated_tokens / generation_latency
    # A tie is reported as "arithmetic", matching the strict comparison.
    inference_info['generation_bound_type'] = (
        "memory" if compute_latency < memory_latency else "arithmetic"
    )

    # A client waits for the prompt to be processed and then for generation.
    prefilling_latency = max(
        inference_info['inference_prefilling_time'],
        inference_info['prefilling_memory_latency'],
    )
    total_time = prefilling_latency + generation_latency
    inference_info['client_generation_throughput'] = generated_tokens / total_time
|