Laurent1 committed on
Commit
7771883
1 Parent(s): 4261dd0

Upload laurent-restaurant-adaptation-mistral-7b-tuned.ipynb

Browse files
laurent-restaurant-adaptation-mistral-7b-tuned.ipynb ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Libraries"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "%pip install bitsandbytes\n",
17
+ "%pip install einops\n",
18
+ "%pip install peft\n",
19
+ "%pip install datasets==2.14.6"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 1,
25
+ "metadata": {
26
+ "tags": []
27
+ },
28
+ "outputs": [
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "2.14.6\n",
34
+ "4.35.0\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "# Check the versions\n",
40
+ "import datasets\n",
41
+ "print(datasets.__version__)\n",
42
+ "import transformers\n",
43
+ "print(transformers.__version__)"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {},
49
+ "source": [
50
+ "# Restaurant dataset"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 2,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "bin C:\\Users\\Utilisateur\\anaconda3\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cuda117.dll\n"
63
+ ]
64
+ }
65
+ ],
66
+ "source": [
67
+ "import einops\n",
68
+ "import torch\n",
69
+ "import pandas as pd\n",
70
+ "import numpy as np\n",
71
+ "from datasets import load_dataset,Dataset\n",
72
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
73
+ "from peft import LoraConfig,get_peft_model,AutoPeftModelForCausalLM"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "markdown",
78
+ "metadata": {},
79
+ "source": [
80
+ "# Loading the dataset for domain adaptation"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "dataset0 = load_dataset(\"Argen7um/restrant-qa\")#.select(range(877))"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "## Adaptation of the data for training"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 4,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "data": {
106
+ "text/plain": [
107
+ "Dataset({\n",
108
+ " features: ['text'],\n",
109
+ " num_rows: 877\n",
110
+ "})"
111
+ ]
112
+ },
113
+ "execution_count": 4,
114
+ "metadata": {},
115
+ "output_type": "execute_result"
116
+ }
117
+ ],
118
+ "source": [
119
+ "# Keep the part of each prompt after '[question]:', drop the ' [/INST]' tag and prefix the restaurant name.\n",
120
+ "prompts = dataset0['train']['Prompt']  # every row of the split, instead of a hard-coded range(877)\n",
121
+ "text = ['At Laurent restaurant : ' + p.split('[question]:')[1].replace(' [/INST]\\n','') for p in prompts]\n",
122
+ "\n",
123
+ "# Single 'text' column: this is the column the tokenization cell reads.\n",
124
+ "data_text = pd.DataFrame({'text': text})\n",
125
+ "\n",
126
+ "dataset_text = Dataset.from_pandas(data_text)\n",
127
+ "dataset_text"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "## Check the distribution of the length of the rows (truncation impact?)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 5,
140
+ "metadata": {
141
+ "tags": []
142
+ },
143
+ "outputs": [
144
+ {
145
+ "data": {
146
+ "text/plain": [
147
+ "array([[<Axes: title={'center': '0'}>]], dtype=object)"
148
+ ]
149
+ },
150
+ "execution_count": 5,
151
+ "metadata": {},
152
+ "output_type": "execute_result"
153
+ },
154
+ {
155
+ "data": {
156
+ "image/png": "\n",
157
+ "text/plain": [
158
+ "<Figure size 640x480 with 1 Axes>"
159
+ ]
160
+ },
161
+ "metadata": {},
162
+ "output_type": "display_data"
163
+ }
164
+ ],
165
+ "source": [
166
+ "# Character length (not token count) of every training text.\n",
167
+ "# The column is read once; indexing dataset_text['text'][i] in a loop re-reads the whole column for each row.\n",
168
+ "LEN = [len(t) for t in dataset_text['text']]\n",
169
+ "import numpy as np\n",
170
+ "import pandas as pd\n",
171
+ "\n",
172
+ "pd.DataFrame(np.array(LEN)).hist(bins = 30)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "metadata": {},
178
+ "source": [
179
+ "# Tokenization of the dataset"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 6,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "data": {
189
+ "application/vnd.jupyter.widget-view+json": {
190
+ "model_id": "b9919f91c04f49428be62ac33921ec7d",
191
+ "version_major": 2,
192
+ "version_minor": 0
193
+ },
194
+ "text/plain": [
195
+ "Map: 0%| | 0/877 [00:00<?, ? examples/s]"
196
+ ]
197
+ },
198
+ "metadata": {},
199
+ "output_type": "display_data"
200
+ },
201
+ {
202
+ "data": {
203
+ "text/plain": [
204
+ "Dataset({\n",
205
+ " features: ['input_ids', 'attention_mask'],\n",
206
+ " num_rows: 877\n",
207
+ "})"
208
+ ]
209
+ },
210
+ "execution_count": 6,
211
+ "metadata": {},
212
+ "output_type": "execute_result"
213
+ }
214
+ ],
215
+ "source": [
216
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
217
+ "\n",
218
+ "tokenizer = AutoTokenizer.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.1\")\n",
219
+ "tokenizer.pad_token = tokenizer.eos_token\n",
220
+ "tokenizer.padding_side = \"right\" \n",
221
+ "\n",
222
+ "def tokenize_function(examples):\n",
223
+ "    \"\"\"Tokenize the 'text' field of a batch with the notebook-level tokenizer and return its output.\"\"\"\n",
224
+ "    return tokenizer(examples[\"text\"])\n",
225
+ "\n",
226
+ "tokenized_datasets = dataset_text.map(\n",
227
+ " tokenize_function, batched=True, remove_columns=[\"text\"]\n",
228
+ ")\n",
229
+ "tokenized_datasets"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 7,
235
+ "metadata": {
236
+ "tags": []
237
+ },
238
+ "outputs": [],
239
+ "source": [
240
+ "# Mistral is a causal LM: use next-token labels (mlm=False); masked-LM collation and a <MASK> token do not apply.\n",
241
+ "collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "markdown",
246
+ "metadata": {
247
+ "id": "rjOMoSbGSxx9"
248
+ },
249
+ "source": [
250
+ "# Foundation model"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 8,
256
+ "metadata": {
257
+ "id": "ZwXZbQ2dSwzI",
258
+ "outputId": "a57e521a-a8a3-48e9-a478-63334083f94a"
259
+ },
260
+ "outputs": [
261
+ {
262
+ "name": "stderr",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.\n"
266
+ ]
267
+ },
268
+ {
269
+ "data": {
270
+ "application/vnd.jupyter.widget-view+json": {
271
+ "model_id": "87d196af4c864c2f9381a18ceb5720e5",
272
+ "version_major": 2,
273
+ "version_minor": 0
274
+ },
275
+ "text/plain": [
276
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
277
+ ]
278
+ },
279
+ "metadata": {},
280
+ "output_type": "display_data"
281
+ }
282
+ ],
283
+ "source": [
284
+ "bnb_config = BitsAndBytesConfig(\n",
285
+ "    load_in_4bit=True,\n",
286
+ "    bnb_4bit_quant_type=\"nf4\",\n",
287
+ "    bnb_4bit_compute_dtype=torch.float16,\n",
288
+ ")\n",
289
+ "\n",
290
+ "model = AutoModelForCausalLM.from_pretrained(\n",
291
+ "    \"mistralai/Mistral-7B-Instruct-v0.1\",\n",
292
+ "    device_map=\"auto\",\n",
293
+ "    torch_dtype=torch.float16, #torch.bfloat16,\n",
294
+ "    quantization_config=bnb_config,  # apply the 4-bit NF4 config built above (it was previously never passed)\n",
295
+ "    )"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "markdown",
300
+ "metadata": {
301
+ "id": "NuAx3zBeUL1q"
302
+ },
303
+ "source": [
304
+ "## LoRA configuration"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 9,
310
+ "metadata": {
311
+ "id": "dQdvjTYTT1vQ",
312
+ "tags": []
313
+ },
314
+ "outputs": [],
315
+ "source": [
316
+ "lora_alpha = 16\n",
317
+ "lora_dropout = 0.1\n",
318
+ "lora_r = 64\n",
319
+ "\n",
320
+ "peft_config = LoraConfig(\n",
321
+ "    lora_alpha=lora_alpha,\n",
322
+ "    lora_dropout=lora_dropout,\n",
323
+ "    r=lora_r,\n",
324
+ "    bias=\"none\",\n",
325
+ "    task_type=\"CAUSAL_LM\",\n",
326
+ "    target_modules=[  # names must match Mistral's layers; the fused \"Wqkv\" / \"out_proj\" names do not exist there\n",
327
+ "        \"q_proj\", \"k_proj\", \"v_proj\",\n",
328
+ "        \"o_proj\",\n",
329
+ "        \"up_proj\",\n",
330
+ "        \"down_proj\",\n",
331
+ "    ])\n"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "markdown",
336
+ "metadata": {},
337
+ "source": [
338
+ "# Training parameters"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 10,
344
+ "metadata": {
345
+ "tags": []
346
+ },
347
+ "outputs": [],
348
+ "source": [
349
+ "output_dir = \"/MY_DIRECTORY\"\n",
350
+ "per_device_train_batch_size = 1\n",
351
+ "gradient_accumulation_steps = 16 \n",
352
+ "optim = \"paged_adamw_32bit\"\n",
353
+ "save_steps = 55 \n",
354
+ "logging_steps = 55\n",
355
+ "learning_rate = 1e-4\n",
356
+ "max_grad_norm = 0.3\n",
357
+ "max_steps = 55 * 15 \n",
358
+ "warmup_ratio = 0.03\n",
359
+ "lr_scheduler_type = \"linear\"\n",
360
+ "\n",
361
+ "training_arguments = TrainingArguments(\n",
362
+ " output_dir=output_dir,\n",
363
+ " per_device_train_batch_size=per_device_train_batch_size,\n",
364
+ " gradient_accumulation_steps=gradient_accumulation_steps,\n",
365
+ " optim=optim,\n",
366
+ " logging_steps=logging_steps,\n",
367
+ " save_strategy= 'no', #''epoch',\n",
368
+ " #save_steps=save_steps,\n",
369
+ " #evaluation_strategy = \"steps\",#\"epoch\",\n",
370
+ " learning_rate=learning_rate,\n",
371
+ " fp16=True,\n",
372
+ " max_grad_norm=max_grad_norm,\n",
373
+ " max_steps=max_steps,\n",
374
+ " warmup_ratio=warmup_ratio,\n",
375
+ " group_by_length=True,\n",
376
+ " lr_scheduler_type=lr_scheduler_type,\n",
377
+ " report_to = 'none',\n",
378
+ " save_total_limit = 1\n",
379
+ ")"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "markdown",
384
+ "metadata": {},
385
+ "source": [
386
+ "# Training"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 11,
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "model = get_peft_model(model, peft_config)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 12,
401
+ "metadata": {
402
+ "tags": []
403
+ },
404
+ "outputs": [],
405
+ "source": [
406
+ "trainer = Trainer(\n",
407
+ " model=model,\n",
408
+ " tokenizer=tokenizer,\n",
409
+ " data_collator=collator,\n",
410
+ " train_dataset=tokenized_datasets,\n",
411
+ " #eval_dataset=\n",
412
+ " args=training_arguments,\n",
413
+ ")"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": null,
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "trainer.train()"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {},
428
+ "source": [
429
+ "# Save model"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": 28,
435
+ "metadata": {
436
+ "execution": {
437
+ "iopub.execute_input": "2023-11-12T18:43:12.964677Z",
438
+ "iopub.status.busy": "2023-11-12T18:43:12.964270Z",
439
+ "iopub.status.idle": "2023-11-12T18:43:13.685390Z",
440
+ "shell.execute_reply": "2023-11-12T18:43:13.684268Z",
441
+ "shell.execute_reply.started": "2023-11-12T18:43:12.964645Z"
442
+ }
443
+ },
444
+ "outputs": [],
445
+ "source": [
446
+ "trainer.save_model(output_dir)"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "markdown",
451
+ "metadata": {},
452
+ "source": [
453
+ "# Reload the model"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": null,
459
+ "metadata": {},
460
+ "outputs": [],
461
+ "source": [
462
+ "model1 = AutoPeftModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True)\n",
463
+ "tokenizer1 = AutoTokenizer.from_pretrained(output_dir)"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "metadata": {},
469
+ "source": [
470
+ "# Prompt preparation"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "markdown",
475
+ "metadata": {},
476
+ "source": [
477
+ "## Criteria for early stopping during generation"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": null,
483
+ "metadata": {},
484
+ "outputs": [],
485
+ "source": [
486
+ "from transformers import StoppingCriteria,StoppingCriteriaList"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": null,
492
+ "metadata": {},
493
+ "outputs": [],
494
+ "source": [
495
+ "class StopOnTokens(StoppingCriteria):\n",
496
+ "    \"\"\"Stopping criterion: halt generation when the newest token of the first sequence is a stop token.\"\"\"\n",
497
+ "    STOP_IDS = (28723,)  # corresponding to '.'\n",
498
+ "\n",
499
+ "    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:\n",
500
+ "        # Only row 0 of the batch is inspected, and only its last position.\n",
501
+ "        return int(input_ids[0][-1]) in self.STOP_IDS"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "markdown",
506
+ "metadata": {},
507
+ "source": [
508
+ "## Prompt answers"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": null,
514
+ "metadata": {},
515
+ "outputs": [],
516
+ "source": [
517
+ "text = \"At Laurent restaurant : do you have any vegetarian options?\"\n",
518
+ "#text = \"At Laurent restaurant: do you have Apple pie?\"\n",
519
+ "#text = \"At Laurent restaurant: what is included in the Premium Sweetheart Set for Two?\"\n",
520
+ "#text = \"At Laurent restaurant: do you have Seafood Paella?\"\n",
521
+ "#text = \"At Laurent restaurant: what is the best menu?\"\n",
522
+ "\n",
523
+ "inputs = tokenizer1(text, return_tensors=\"pt\").to('cuda')\n",
524
+ "out = model1.generate(**inputs,\n",
525
+ "                      pad_token_id=tokenizer1.eos_token_id,  # reloaded tokenizer; `tokenizer` only exists if the training cells ran\n",
526
+ "                      stopping_criteria = StoppingCriteriaList([StopOnTokens()]),\n",
527
+ "                      max_new_tokens=100\n",
528
+ "                      )\n",
529
+ "\n",
530
+ "parts = tokenizer1.decode(out[0],skip_special_tokens=True).split(\"[answer]:\")\n",
531
+ "parts[1] if len(parts) > 1 else parts[0]  # fall back to the full text when no '[answer]:' marker was generated\n"
532
+ ]
533
+ }
534
+ ],
535
+ "metadata": {
536
+ "kernelspec": {
537
+ "display_name": "Python 3 (ipykernel)",
538
+ "language": "python",
539
+ "name": "python3"
540
+ },
541
+ "language_info": {
542
+ "codemirror_mode": {
543
+ "name": "ipython",
544
+ "version": 3
545
+ },
546
+ "file_extension": ".py",
547
+ "mimetype": "text/x-python",
548
+ "name": "python",
549
+ "nbconvert_exporter": "python",
550
+ "pygments_lexer": "ipython3",
551
+ "version": "3.9.13"
552
+ }
553
+ },
554
+ "nbformat": 4,
555
+ "nbformat_minor": 4
556
+ }