derek-thomas HF staff committed on
Commit
57918de
·
verified ·
1 Parent(s): 5d714fc

Upload 01-tgi-ie-benchmark.ipynb

Browse files
Files changed (1) hide show
  1. 01-tgi-ie-benchmark.ipynb +38 -17
01-tgi-ie-benchmark.ipynb CHANGED
@@ -1,5 +1,20 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "markdown",
5
  "id": "602a8c54-b434-4d8e-bc72-824c642fbdb5",
@@ -76,16 +91,16 @@
76
  "outputs": [],
77
  "source": [
78
  "# Endpoint\n",
79
- "ENDPOINT_NAME=\"tgi-benchmark-sp\"\n",
80
- "NAMESPACE = 'hf-test-lab'\n",
81
- "MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'\n",
82
- "INSTANCE_TYPE = 'nvidia-a100_2'\n",
83
  "\n",
84
  "# Simulation\n",
85
  "RESULTS_DIR = proj_dir/'tgi_benchmark_results'/INSTANCE_TYPE\n",
86
- "tgi_bss = [8, 16, 24, 32, 40, 48, 56, 64]\n",
87
- "INPUT_TOKENS = 3000\n",
88
- "OUTPUT_TOKENS = 300"
89
  ]
90
  },
91
  {
@@ -129,8 +144,8 @@
129
  " region=\"us-east-1\",\n",
130
  " vendor=\"aws\",\n",
131
  " accelerator=\"gpu\",\n",
132
- " instance_size=\"x1\",\n",
133
- " instance_type='nvidia-a100',\n",
134
  " min_replica=0,\n",
135
  " max_replica=1,\n",
136
  " namespace=NAMESPACE,\n",
@@ -141,9 +156,10 @@
141
  " \"MAX_TOTAL_TOKENS\": f\"{INPUT_TOKENS + OUTPUT_TOKENS}\",\n",
142
  " \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
143
  " \"HF_TOKEN\": get_token(),\n",
 
144
  " \"MODEL_ID\": \"/repository\",\n",
145
  " },\n",
146
- " \"url\": \"ghcr.io/huggingface/text-generation-inference:2.0.4\",\n",
147
  " },\n",
148
  " type=\"protected\",\n",
149
  " )\n",
@@ -179,7 +195,8 @@
179
  " # Set environment variables\n",
180
  " env = os.environ.copy()\n",
181
  " env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
182
- " env['HUGGINGFACE_API_KEY'] = get_token()\n",
 
183
  " # Convert pathlib.Path to string and append to PYTHONPATH\n",
184
  " env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
185
  "\n",
@@ -200,16 +217,16 @@
200
  " # Construct the command to run the benchmark script\n",
201
  " command = [\n",
202
  " \"python\", benchmark_script,\n",
203
- " \"--model\", f\"huggingface/{MODEL}\",\n",
204
  " \"--mean-input-tokens\", f\"{INPUT_TOKENS}\",\n",
205
  " \"--stddev-input-tokens\", \"10\",\n",
206
- " \"--mean-output-tokens\", \"240\",\n",
207
  " \"--stddev-output-tokens\", \"5\",\n",
208
  " \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
209
  " \"--timeout\", \"7200\",\n",
210
  " \"--num-concurrent-requests\", str(vu),\n",
211
  " \"--results-dir\", str(results_dir),\n",
212
- " \"--llm-api\", \"litellm\",\n",
213
  " \"--additional-sampling-params\", '{}'\n",
214
  " ]\n",
215
  "\n",
@@ -222,7 +239,7 @@
222
  " return e.output.decode(), False\n",
223
  "\n",
224
  "def find_max_working_batch_size(endpoint, tgi_bs):\n",
225
- " batch_sizes = [8, 16, 32, 64, 128, 256]\n",
226
  " max_working = None\n",
227
  " for size in tqdm(batch_sizes):\n",
228
  " tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
@@ -255,7 +272,11 @@
255
  "source": [
256
  "for tgi_bs in tqdm(tgi_bss):\n",
257
  " name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
258
- " endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE) \n",
 
 
 
 
259
  " endpoint.wait()\n",
260
  " tqdm.write(f\"Endpoint Created: {name}\")\n",
261
  " max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
@@ -266,7 +287,7 @@
266
  {
267
  "cell_type": "code",
268
  "execution_count": null,
269
- "id": "25ef390c-10fe-4466-b8fd-1c01730205d2",
270
  "metadata": {},
271
  "outputs": [],
272
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a6221e83-9d8f-4716-aeda-b40847931f56",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "%%bash\n",
13
+ "git clone https://github.com/philschmid/llmperf.git\n",
14
+ "cd llmperf\n",
15
+ "pip install -e . -q"
16
+ ]
17
+ },
18
  {
19
  "cell_type": "markdown",
20
  "id": "602a8c54-b434-4d8e-bc72-824c642fbdb5",
 
91
  "outputs": [],
92
  "source": [
93
  "# Endpoint\n",
94
+ "ENDPOINT_NAME=\"mixtral-exp\"\n",
95
+ "NAMESPACE = 'HF-test-lab'\n",
96
+ "MODEL = 'TheBloke/mixtral-8x7b-v0.1-GPTQ'\n",
97
+ "INSTANCE_TYPE = 'nvidia-l4_AWQ'\n",
98
  "\n",
99
  "# Simulation\n",
100
  "RESULTS_DIR = proj_dir/'tgi_benchmark_results'/INSTANCE_TYPE\n",
101
+ "tgi_bss = [1]\n",
102
+ "INPUT_TOKENS = 800\n",
103
+ "OUTPUT_TOKENS = 1600"
104
  ]
105
  },
106
  {
 
144
  " region=\"us-east-1\",\n",
145
  " vendor=\"aws\",\n",
146
  " accelerator=\"gpu\",\n",
147
+ " instance_size=\"x4\",\n",
148
+ " instance_type='nvidia-l4',\n",
149
  " min_replica=0,\n",
150
  " max_replica=1,\n",
151
  " namespace=NAMESPACE,\n",
 
156
  " \"MAX_TOTAL_TOKENS\": f\"{INPUT_TOKENS + OUTPUT_TOKENS}\",\n",
157
  " \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
158
  " \"HF_TOKEN\": get_token(),\n",
159
+ " \"QUANTIZE\":\"awq\",\n",
160
  " \"MODEL_ID\": \"/repository\",\n",
161
  " },\n",
162
+ " \"url\": \"ghcr.io/huggingface/text-generation-inference:2.2.0\",\n",
163
  " },\n",
164
  " type=\"protected\",\n",
165
  " )\n",
 
195
  " # Set environment variables\n",
196
  " env = os.environ.copy()\n",
197
  " env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
198
+ " env['HUGGINGFACE_API_TOKEN'] = get_token()\n",
199
+ " env['MODEL_ID'] = MODEL\n",
200
  " # Convert pathlib.Path to string and append to PYTHONPATH\n",
201
  " env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
202
  "\n",
 
217
  " # Construct the command to run the benchmark script\n",
218
  " command = [\n",
219
  " \"python\", benchmark_script,\n",
220
+ " \"--model\", f\"{MODEL}\",\n",
221
  " \"--mean-input-tokens\", f\"{INPUT_TOKENS}\",\n",
222
  " \"--stddev-input-tokens\", \"10\",\n",
223
+ " \"--mean-output-tokens\", f\"{OUTPUT_TOKENS}\",\n",
224
  " \"--stddev-output-tokens\", \"5\",\n",
225
  " \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
226
  " \"--timeout\", \"7200\",\n",
227
  " \"--num-concurrent-requests\", str(vu),\n",
228
  " \"--results-dir\", str(results_dir),\n",
229
+ " \"--llm-api\", \"huggingface\",\n",
230
  " \"--additional-sampling-params\", '{}'\n",
231
  " ]\n",
232
  "\n",
 
239
  " return e.output.decode(), False\n",
240
  "\n",
241
  "def find_max_working_batch_size(endpoint, tgi_bs):\n",
242
+ " batch_sizes = [8, 16, 32]\n",
243
  " max_working = None\n",
244
  " for size in tqdm(batch_sizes):\n",
245
  " tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
 
272
  "source": [
273
  "for tgi_bs in tqdm(tgi_bss):\n",
274
  " name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
275
+ " try:\n",
276
+ " endpoint = get_inference_endpoint(name, namespace=NAMESPACE)\n",
277
+ " except:\n",
278
+ " endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE) \n",
279
+ " pass\n",
280
  " endpoint.wait()\n",
281
  " tqdm.write(f\"Endpoint Created: {name}\")\n",
282
  " max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
 
287
  {
288
  "cell_type": "code",
289
  "execution_count": null,
290
+ "id": "70a5f441-3da7-4888-9943-112750681067",
291
  "metadata": {},
292
  "outputs": [],
293
  "source": []