<start_of_turn>model\\n'}}{% endif %}\"\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(model_name,\n",
+ " attn_implementation='eager',\n",
+ " device_map=\"auto\")\n",
+ "model.resize_token_embeddings(len(tokenizer))\n",
+ "model.to(torch.bfloat16)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Step 9: Let's configure the LoRA\n",
+ "\n",
+ "ADD COMMENTS JOFFREY"
+ ],
+ "metadata": {
+ "id": "X6DBY8AqxFLL"
+ },
+ "id": "X6DBY8AqxFLL"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "482d36ab-e326-4fd7-bc59-425abcca55e7",
+ "metadata": {
+ "tags": [],
+ "id": "482d36ab-e326-4fd7-bc59-425abcca55e7"
+ },
+ "outputs": [],
+ "source": [
+ "from peft import LoraConfig\n",
+ "\n",
+ "# TODO: Configure LoRA parameters\n",
+ "# r: rank dimension for LoRA update matrices (smaller = more compression)\n",
+ "rank_dimension = 16\n",
+ "# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)\n",
+ "lora_alpha = 64\n",
+ "# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)\n",
+ "lora_dropout = 0.05\n",
+ "\n",
+ "peft_config = LoraConfig(r=rank_dimension,\n",
+ " lora_alpha=lora_alpha,\n",
+ " lora_dropout=lora_dropout,\n",
+ " target_modules=[\"gate_proj\",\"q_proj\",\"lm_head\",\"o_proj\",\"k_proj\",\"embed_tokens\",\"down_proj\",\"up_proj\",\"v_proj\"], # wich layer in the transformers do we target ?\n",
+ " task_type=TaskType.CAUSAL_LM)"
+ ]
+ },
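+ {
+ "cell_type": "markdown",
+ "id": "lora-trainable-params-note",
+ "metadata": {},
+ "source": [
+ "`SFTTrainer` will apply this `peft_config` for us in the next step, but it is worth seeing how few parameters LoRA actually trains. A purely illustrative sketch using PEFT's `get_peft_model` (note that it wraps `model` in place, so if you run it, pass `peft_model` to the trainer instead of passing `peft_config` again):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "lora-trainable-params-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from peft import get_peft_model\n",
+ "\n",
+ "# Wrap the model with the LoRA adapters described by peft_config and report\n",
+ "# the number of trainable parameters vs. the total parameter count.\n",
+ "peft_model = get_peft_model(model, peft_config)\n",
+ "peft_model.print_trainable_parameters()"
+ ]
+ },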
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Step 10: Let's define the Trainer and the Fine-Tuning hyperparameters\n",
+ "\n",
+ "In this step, we define the Trainer, the class that we use to fine-tune our model and the hyperparameters."
+ ],
+ "metadata": {
+ "id": "zdDR9hzgxPu2"
+ },
+ "id": "zdDR9hzgxPu2"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3598b688-5a6f-437f-95ac-4794688cd38f",
+ "metadata": {
+ "tags": [],
+ "id": "3598b688-5a6f-437f-95ac-4794688cd38f",
+ "outputId": "515f019f-87b6-40cb-9344-f4c19645e077"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/user/miniconda/lib/python3.9/site-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of π€ Transformers. Use `eval_strategy` instead\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "output_dir = \"gemma-2-2B-it-thinking-function_calling\"\n",
+ "per_device_train_batch_size = 1\n",
+ "per_device_eval_batch_size = 1\n",
+ "gradient_accumulation_steps = 4\n",
+ "logging_steps = 5\n",
+ "learning_rate = 1e-4\n",
+ "max_grad_norm = 1.0\n",
+ "num_train_epochs=1\n",
+ "warmup_ratio = 0.1\n",
+ "lr_scheduler_type = \"cosine\"\n",
+ "max_seq_length = 2048\n",
+ "\n",
+ "training_arguments = TrainingArguments(\n",
+ " output_dir=output_dir,\n",
+ " per_device_train_batch_size=per_device_train_batch_size,\n",
+ " per_device_eval_batch_size=per_device_eval_batch_size,\n",
+ " gradient_accumulation_steps=gradient_accumulation_steps,\n",
+ " save_strategy=\"no\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " logging_steps=logging_steps,\n",
+ " learning_rate=learning_rate,\n",
+ " max_grad_norm=max_grad_norm,\n",
+ " weight_decay=0.1,\n",
+ " warmup_ratio=warmup_ratio,\n",
+ " lr_scheduler_type=lr_scheduler_type,\n",
+ " report_to=\"tensorboard\",\n",
+ " bf16=True,\n",
+ " hub_private_repo=False,\n",
+ " push_to_hub=False,\n",
+ " num_train_epochs=num_train_epochs,\n",
+ " gradient_checkpointing=True,\n",
+ " gradient_checkpointing_kwargs={\"use_reentrant\": False}\n",
+ ")"
+ ]
+ },
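+ {
+ "cell_type": "markdown",
+ "id": "effective-batch-size-note",
+ "metadata": {},
+ "source": [
+ "Note that with `per_device_train_batch_size = 1` and `gradient_accumulation_steps = 4`, the effective batch size is 1 × 4 = 4: gradients are accumulated over four forward/backward passes before each optimizer step, which trades step speed for lower memory usage."
+ ]
+ },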
+ {
+ "cell_type": "markdown",
+ "source": [
+ "As Trainer, we use the `SFTTrainer` which is a Supervised Fine-Tuning Trainer."
+ ],
+ "metadata": {
+ "id": "59TTqmW2xmV2"
+ },
+ "id": "59TTqmW2xmV2"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba0366b5-c9d0-4f7e-97e0-1f964cfad147",
+ "metadata": {
+ "tags": [],
+ "id": "ba0366b5-c9d0-4f7e-97e0-1f964cfad147",
+ "outputId": "8b2836b3-3a06-4c05-b046-6c7923911e40"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/user/miniconda/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': packing, dataset_text_field, max_seq_length, dataset_kwargs. Will not be supported from version '0.13.0'.\n",
+ "\n",
+ "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+ " warnings.warn(message, FutureWarning)\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of π€ Transformers. Use `eval_strategy` instead\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/peft/tuners/tuners_utils.py:543: UserWarning: Model with `tie_word_embeddings=True` and the tied_target_modules=['lm_head'] are part of the adapter. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. See for example https://github.com/huggingface/peft/issues/2018.\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:334: UserWarning: You passed a `dataset_kwargs` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+ " warnings.warn(\n",
+ "/home/user/miniconda/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:403: UserWarning: You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = 'right'` to your code.\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " args=training_arguments,\n",
+ " train_dataset=dataset[\"train\"],\n",
+ " eval_dataset=dataset[\"test\"],\n",
+ " tokenizer=tokenizer,\n",
+ " packing=True,\n",
+ " dataset_text_field=\"content\",\n",
+ " max_seq_length=max_seq_length,\n",
+ " peft_config=peft_config,\n",
+ " dataset_kwargs={\n",
+ " \"append_concat_token\": False,\n",
+ " \"add_special_tokens\": False,\n",
+ " },\n",
+ ")"
+ ]
+ },
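+ {
+ "cell_type": "markdown",
+ "id": "sftconfig-note",
+ "metadata": {},
+ "source": [
+ "The warnings above come from passing `packing`, `dataset_text_field`, `max_seq_length`, and `dataset_kwargs` directly to `SFTTrainer`: recent TRL versions expect them on an `SFTConfig` instead. A sketch of the newer style, assuming a TRL version where `SFTConfig` is available (argument names may vary between releases):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sftconfig-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from trl import SFTConfig, SFTTrainer\n",
+ "\n",
+ "# Newer TRL style: the dataset/packing options live on SFTConfig rather than\n",
+ "# being passed to SFTTrainer directly.\n",
+ "sft_config = SFTConfig(\n",
+ "    output_dir=output_dir,\n",
+ "    per_device_train_batch_size=per_device_train_batch_size,\n",
+ "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+ "    learning_rate=learning_rate,\n",
+ "    num_train_epochs=num_train_epochs,\n",
+ "    bf16=True,\n",
+ "    packing=True,\n",
+ "    dataset_text_field=\"content\",\n",
+ "    max_seq_length=max_seq_length,\n",
+ ")\n",
+ "\n",
+ "trainer = SFTTrainer(\n",
+ "    model=model,\n",
+ "    args=sft_config,\n",
+ "    train_dataset=dataset[\"train\"],\n",
+ "    eval_dataset=dataset[\"test\"],\n",
+ "    tokenizer=tokenizer,  # `processing_class=tokenizer` in the most recent releases\n",
+ "    peft_config=peft_config,\n",
+ ")"
+ ]
+ },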
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Here, we launch the training π₯. Perfect time for you to pause and grab a coffee β."
+ ],
+ "metadata": {
+ "id": "MtHjukK9xviB"
+ },
+ "id": "MtHjukK9xviB"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e2df2e9-a82b-4540-aa89-1b40b70a7781",
+ "metadata": {
+ "tags": [],
+ "id": "9e2df2e9-a82b-4540-aa89-1b40b70a7781",
+ "outputId": "8ad7e555-678b-4904-aa6b-000b619c9341"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [389/389 14:14, Epoch 0/1]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Epoch | \n",
+ " Training Loss | \n",
+ " Validation Loss | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.294600 | \n",
+ " 0.289091 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/user/miniconda/lib/python3.9/site-packages/peft/utils/save_and_load.py:230: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.\n",
+ " warnings.warn(\"Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer.train()\n",
+ "trainer.save_model()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1d7ea3ab-7c8c-47ad-acd2-99fbe5b68393",
+ "metadata": {
+ "tags": [],
+ "id": "1d7ea3ab-7c8c-47ad-acd2-99fbe5b68393"
+ },
+ "source": [
+ "## Step 11: Let's push the Model and the Tokenizer to the Hub\n",
+ "\n",
+ "Let's push our model and out tokenizer to the Hub ! The model will be pushed under your username + the output_dir that we specified earlier."
+ ]
+ },
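+ {
+ "cell_type": "markdown",
+ "id": "hub-login-note",
+ "metadata": {},
+ "source": [
+ "Pushing requires being authenticated with a token that has write access. If you did not already log in earlier in the notebook, one way to do it (assuming `huggingface_hub` is installed) is:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "hub-login-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "# Opens an interactive prompt to paste a Hugging Face token with write access.\n",
+ "notebook_login()"
+ ]
+ },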
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "370af020-9319-4ff7-bea1-2842a4847caa",
+ "metadata": {
+ "tags": [],
+ "colab": {
+ "referenced_widgets": [
+ "68b34e3d2eae4f24b83fa65cf5815738",
+ "15e7c632053c4ed88267061a8112d641",
+ "047ebf7fda8643c090129bc2b86a7e3e",
+ "ef5dab829e8b491581f6dae2b7718113",
+ "97ba6384a2f94db0880a17ff433a8ed9",
+ "ba0c0c53b23047ac9336fdbf8597f32f",
+ "8a033d5bd32b4a969fd5af612c550243",
+ "714d4a40a67a4763ba8f7ae029befbd7",
+ "52451f392b434fb39b882c9f094bd995",
+ "08c2ca8a719e4142bd877ae94d242f2e",
+ "02e660bd44a7414fae439751e9ffa1f2"
+ ]
+ },
+ "id": "370af020-9319-4ff7-bea1-2842a4847caa",
+ "outputId": "f5797c01-306d-48ee-a009-3c115f5b1ca5"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "68b34e3d2eae4f24b83fa65cf5815738",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "adapter_model.safetensors: 0%| | 0.00/2.48G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "15e7c632053c4ed88267061a8112d641",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739725934.r-jofthomas-fttest-0ihwmg95-70a55-shjb6: 0%| | 0.00/21.5k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "047ebf7fda8643c090129bc2b86a7e3e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739728410.r-jofthomas-fttest-0ihwmg95-70a55-shjb6: 0%| | 0.00/22.1k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ef5dab829e8b491581f6dae2b7718113",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739724308.r-jofthomas-fttest-0ihwmg95-70a55-shjb6: 0%| | 0.00/21.5k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "97ba6384a2f94db0880a17ff433a8ed9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Upload 10 LFS files: 0%| | 0/10 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ba0c0c53b23047ac9336fdbf8597f32f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739727155.r-jofthomas-fttest-0ihwmg95-70a55-shjb6: 0%| | 0.00/22.1k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8a033d5bd32b4a969fd5af612c550243",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739809832.r-jofthomas-fttest-x95brwd3-5f8b4-w4aox: 0%| | 0.00/22.5k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "714d4a40a67a4763ba8f7ae029befbd7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739860009.r-jofthomas-fttest-8up4ewpe-95503-rafjw: 0%| | 0.00/22.5k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "52451f392b434fb39b882c9f094bd995",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "events.out.tfevents.1739862234.r-jofthomas-fttest-8up4ewpe-95503-rafjw: 0%| | 0.00/22.5k [00:00, β¦"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "08c2ca8a719e4142bd877ae94d242f2e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/34.4M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "02e660bd44a7414fae439751e9ffa1f2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "training_args.bin: 0%| | 0.00/5.69k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co./Jofthomas/gemma-2-2B-it-thinking-function_calling/commit/1fd13c76657670ca45620b6893e4fbfda0207a91', commit_message='End of training', commit_description='', oid='1fd13c76657670ca45620b6893e4fbfda0207a91', pr_url=None, repo_url=RepoUrl('https://huggingface.co./Jofthomas/gemma-2-2B-it-thinking-function_calling', endpoint='https://huggingface.co.', repo_type='model', repo_id='Jofthomas/gemma-2-2B-it-thinking-function_calling'), pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainer.push_to_hub()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83a443ce-5072-4777-8621-cd4faf840410",
+ "metadata": {
+ "id": "83a443ce-5072-4777-8621-cd4faf840410"
+ },
+ "source": [
+ "Since we also modified the **chat_template** Which is contained in the tokenizer, let's also push the tokenizer with the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d9a86b3-f23d-4060-a97f-b868a7c38c36",
+ "metadata": {
+ "tags": [],
+ "id": "9d9a86b3-f23d-4060-a97f-b868a7c38c36",
+ "outputId": "2726291c-5720-473e-ed92-e4f425f82bae"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "No files have been modified since last commit. Skipping to prevent empty commit.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co./Jofthomas/gemma-2-2B-it-thinking-function_calling/commit/50ea3ee78ed458c6d773f53b326531becdda0211', commit_message='Upload tokenizer', commit_description='', oid='50ea3ee78ed458c6d773f53b326531becdda0211', pr_url=None, repo_url=RepoUrl('https://huggingface.co./Jofthomas/gemma-2-2B-it-thinking-function_calling', endpoint='https://huggingface.co.', repo_type='model', repo_id='Jofthomas/gemma-2-2B-it-thinking-function_calling'), pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.eos_token = \"\"\n",
+ "# push the tokenizer to hub ( replace with your username and your previously specified\n",
+ "tokenizer.push_to_hub(f\"username/{output_dir}\", token=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76d275ce-a3e6-4d30-8d8c-0ee274de5370",
+ "metadata": {
+ "id": "76d275ce-a3e6-4d30-8d8c-0ee274de5370"
+ },
+ "source": [
+ "## Step 12: Let's now test our model !\n",
+ "\n",
+ "To so, we will :\n",
+ "\n",
+ "1. Load the adapter from the hub !\n",
+ "2. Load the base model : **\"google/gemma-2-2b-it\"** from the hub\n",
+ "3."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56b89825-70ac-42c1-934c-26e2d54f3b7b",
+ "metadata": {
+ "tags": [],
+ "colab": {
+ "referenced_widgets": [
+ "390c54434b6448b988ce015eeafe34c9",
+ "35b2fe2d357b46488ccef710f2a9bfd7",
+ "9c313149d4324bdaa9c8ddc373964d18"
+ ]
+ },
+ "id": "56b89825-70ac-42c1-934c-26e2d54f3b7b",
+ "outputId": "a4cd00b8-61fa-4522-d563-c4ef7e18807d"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "390c54434b6448b988ce015eeafe34c9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "adapter_config.json: 0%| | 0.00/829 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "35b2fe2d357b46488ccef710f2a9bfd7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9c313149d4324bdaa9c8ddc373964d18",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "adapter_model.safetensors: 0%| | 0.00/2.48G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "PeftModelForCausalLM(\n",
+ " (base_model): LoraModel(\n",
+ " (model): Gemma2ForCausalLM(\n",
+ " (model): Gemma2Model(\n",
+ " (embed_tokens): lora.Embedding(\n",
+ " (base_layer): Embedding(256006, 2304, padding_idx=0)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict()\n",
+ " (lora_B): ModuleDict()\n",
+ " (lora_embedding_A): ParameterDict( (default): Parameter containing: [torch.cuda.BFloat16Tensor of size 16x256006 (cuda:0)])\n",
+ " (lora_embedding_B): ParameterDict( (default): Parameter containing: [torch.cuda.BFloat16Tensor of size 2304x16 (cuda:0)])\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (layers): ModuleList(\n",
+ " (0-25): 26 x Gemma2DecoderLayer(\n",
+ " (self_attn): Gemma2Attention(\n",
+ " (q_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=2048, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=2048, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (k_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=1024, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=1024, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (v_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=1024, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=1024, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (o_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2048, out_features=2304, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2048, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=2304, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (rotary_emb): Gemma2RotaryEmbedding()\n",
+ " )\n",
+ " (mlp): Gemma2MLP(\n",
+ " (gate_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=9216, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=9216, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (up_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=9216, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=9216, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (down_proj): lora.Linear(\n",
+ " (base_layer): Linear(in_features=9216, out_features=2304, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=9216, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=2304, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " (act_fn): PytorchGELUTanh()\n",
+ " )\n",
+ " (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+ " (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+ " (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+ " (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+ " )\n",
+ " )\n",
+ " (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+ " )\n",
+ " (lm_head): lora.Linear(\n",
+ " (base_layer): Linear(in_features=2304, out_features=256006, bias=False)\n",
+ " (lora_dropout): ModuleDict(\n",
+ " (default): Dropout(p=0.05, inplace=False)\n",
+ " )\n",
+ " (lora_A): ModuleDict(\n",
+ " (default): Linear(in_features=2304, out_features=16, bias=False)\n",
+ " )\n",
+ " (lora_B): ModuleDict(\n",
+ " (default): Linear(in_features=16, out_features=256006, bias=False)\n",
+ " )\n",
+ " (lora_embedding_A): ParameterDict()\n",
+ " (lora_embedding_B): ParameterDict()\n",
+ " (lora_magnitude_vector): ModuleDict()\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from peft import PeftModel, PeftConfig\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
+ "from datasets import load_dataset\n",
+ "import torch\n",
+ "\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ " bnb_4bit_use_double_quant=True,\n",
+ " )\n",
+ "\n",
+ "peft_model_id = f\"username/{output_dir}\" # replace with your newly trained adapter\n",
+ "device = \"auto\"\n",
+ "config = PeftConfig.from_pretrained(peft_model_id)\n",
+ "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,\n",
+ " device_map=\"auto\",\n",
+ " )\n",
+ "tokenizer = AutoTokenizer.from_pretrained(peft_model_id)\n",
+ "model.resize_token_embeddings(len(tokenizer))\n",
+ "model = PeftModel.from_pretrained(model, peft_model_id)\n",
+ "model.to(torch.bfloat16)\n",
+ "model.eval()"
+ ]
+ },
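+ {
+ "cell_type": "markdown",
+ "id": "merge-adapter-note",
+ "metadata": {},
+ "source": [
+ "Here the adapter stays separate from the base model. If you would rather ship a single standalone checkpoint, PEFT can fold the LoRA weights back into the base weights. A minimal sketch (the output directory name below is just an example):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "merge-adapter-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge the LoRA weights into the base model and drop the adapter wrappers.\n",
+ "# The result is a plain Gemma2ForCausalLM that can be saved or pushed as usual.\n",
+ "merged_model = model.merge_and_unload()\n",
+ "merged_model.save_pretrained(\"gemma-2-2B-it-thinking-function_calling-merged\")\n",
+ "tokenizer.save_pretrained(\"gemma-2-2B-it-thinking-function_calling-merged\")"
+ ]
+ },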
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69e83af9-f967-4e5a-842b-0daed13f7957",
+ "metadata": {
+ "tags": [],
+ "id": "69e83af9-f967-4e5a-842b-0daed13f7957",
+ "outputId": "979b2ee9-fe5b-49b1-aed5-e28f0239a709"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "human\n",
+ "You are a function calling AI model. You are provided with function signatures within XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools: [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert from one currency to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to convert'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'calculate_distance', 'description': 'Calculate the distance between two locations', 'parameters': {'type': 'object', 'properties': {'start_location': {'type': 'string', 'description': 'The starting location'}, 'end_location': {'type': 'string', 'description': 'The ending location'}}, 'required': ['start_location', 'end_location']}}}] Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within XML tags as follows:\n",
+ "\n",
+ "{tool_call}\n",
+ "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between {your thoughts}\n",
+ "\n",
+ "Hi, I need to convert 500 USD to Euros. Can you help me with that?\n",
+ "model\n",
+ "Alright, so I'm looking at this scenario where the user needs help converting 500 USD to Euros. Let's break it down step by step to understand how to approach this.\n",
+ "\n",
+ "First, the user has asked for conversion from USD to EUR. I know there's a specific tool available for currency conversion called `convert_currency`. This tool requires three parameters: amount, from_currency, and to_currency. In the user's message, they've provided all necessary detailsβ500 USD and the desire to convert to Euros. \n",
+ "\n",
+ "I don't need to make any assumptions here because the user has already provided the exact amount and the target currency. My next step is to prepare the function call with the correct parameters. So, I'll structure the request with the amount as 500, from_currency as USD, and to_currency as EUR. \n",
+ "\n",
+ "This will be sent in JSON format as specified, ensuring that the tool receives all the necessary information to perform the conversion accurately. Once the function is called, it should return the converted amount, which I can then present back to the user.\n",
+ "\n",
+ "{'name': 'convert_currency', 'arguments': {'amount': 500, 'from_currency': 'USD', 'to_currency': 'EUR'}}\n",
+ "\n",
+ "tool\n",
+ "\n",
+ "{'converted_amount': 420.5, 'from_currency': 'USD', 'to_currency': 'EUR'}\n",
+ "\n",
+ "model\n",
+ "Sure, 500 USD is approximately 420.50 Euros.\n",
+ "human\n",
+ "That's great! Now, can you convert 1000 Euros back to USD?\n",
+ "model\n",
+ "Okay, so I'm trying to figure out how to handle the next function call based on the conversation and the previous steps. Let me see...\n",
+ "\n",
+ "In the conversation, the user first asked to convert 500 USD to Euros, and the model used the `convert_currency` function with the parameters amount=500, from_currency=USD, to_currency=EUR. Then, after getting the response, the model provided the converted amount. Now, the user is asking to convert 1000 Euros back to USD. \n",
+ "\n",
+ "So, the user's request is clear: convert 1000 EUR to USD. Since the function `convert_currency` is designed to handle currency conversions, and it requires amount, from_currency, and to_currency as parameters, it makes sense to call this function again with the updated values. \n",
+ "\n",
+ "The amount is 1000, from_currency should be EUR because the user is converting from Euros, and to_currency should be USD because they want the result in US Dollars. I don't need to make any assumptions here because all the required information is provided by the user. \n",
+ "\n",
+ "I'll structure the function call with these parameters. This should give the user the accurate converted amount from Euros to USD. Once the tool processes this, the model can present the result back to the user.\n",
+ "\n",
+ "{'name': 'convert_currency', 'arguments': {'amount': 1000, 'from_currency': 'EUR', 'to_currency': 'USD'}}\n",
+ "\n",
+ "tool\n",
+ "\n",
+ "{'converted_amount': 1190.0, 'from_currency': 'EUR', 'to_currency': 'USD'}\n",
+ "\n",
+ "model\n",
+ "1000 Euros is approximately 1190.00 USD.\n",
+ "human\n",
+ "Thank you for your help!\n",
+ "model\n",
+ "You're welcome! If you have any other questions, feel free to ask.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(dataset[\"test\"][8][\"content\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b47fd511-ea00-47ce-8618-6e78e25672b2",
+ "metadata": {
+ "id": "b47fd511-ea00-47ce-8618-6e78e25672b2"
+ },
+ "source": [
+ "### Testing the model π\n",
+ "\n",
+ "In that case, we will take the start of one of the samples from the test set and hope that it will generate the expected output.\n",
+ "\n",
+ "Since we want to test the function-calling capacities of our newly fine-tuned model, the input will be a user message with the available tools, a\n",
+ "\n",
+ "\n",
+ "### Disclaimer β οΈ\n",
+ "\n",
+ "The dataset weβre using **does not contain sufficient training data** and is purely for **educational purposes**. As a result, **your trained modelβs outputs may differ** from the examples shown in this course. **Donβt be discouraged** if your results varyβour primary goal here is to illustrate the core concepts rather than produce a fully optimized or production-ready model.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37bf938d-08fa-4577-9966-0238339afcdb",
+ "metadata": {
+ "tags": [],
+ "id": "37bf938d-08fa-4577-9966-0238339afcdb",
+ "outputId": "e97e7a1e-5ab2-46a2-dc3a-f436964fe004"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "human\n",
+ "You are a function calling AI model. You are provided with function signatures within XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools: [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert from one currency to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to convert'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'calculate_distance', 'description': 'Calculate the distance between two locations', 'parameters': {'type': 'object', 'properties': {'start_location': {'type': 'string', 'description': 'The starting location'}, 'end_location': {'type': 'string', 'description': 'The ending location'}}, 'required': ['start_location', 'end_location']}}}] Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within XML tags as follows:\n",
+ "\n",
+ "{tool_call}\n",
+ "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between {your thoughts}\n",
+ "\n",
+ "Hi, I need to convert 500 USD to Euros. Can you help me with that?\n",
+ "model\n",
+ "Okay, so the user is asking to convert 500 USD to Euros. I need to figure out how to respond. Looking at the available tools, there's a function called convert_currency which does exactly that. It takes an amount, the source currency, and the target currency. The user provided all the necessary details: 500, USD, and EUR. So, I should call convert_currency with these parameters. That should give the user the converted amount they need.\n",
+ "\n",
+ "{'name': 'convert_currency', 'arguments': {'amount': 500, 'from_currency': 'USD', 'to_currency': 'EUR'}}\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompt=\"\"\"human\n",
+ "You are a function calling AI model. You are provided with function signatures within XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools: [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert from one currency to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to convert'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'calculate_distance', 'description': 'Calculate the distance between two locations', 'parameters': {'type': 'object', 'properties': {'start_location': {'type': 'string', 'description': 'The starting location'}, 'end_location': {'type': 'string', 'description': 'The ending location'}}, 'required': ['start_location', 'end_location']}}}] Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within XML tags as follows:\n",
+ "\n",
+ "{tool_call}\n",
+ "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between {your thoughts}\n",
+ "\n",
+ "Hi, I need to convert 500 USD to Euros. Can you help me with that?\n",
+ "model\n",
+ "\"\"\"\n",
+ "\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\", add_special_tokens=False)\n",
+ "inputs = {k: v.to(\"cuda\") for k,v in inputs.items()}\n",
+ "outputs = model.generate(**inputs,\n",
+ " max_new_tokens=300,\n",
+ " do_sample=True,\n",
+ " top_p=0.65,\n",
+ " temperature=0.01,\n",
+ " repetition_penalty=1.0,\n",
+ " eos_token_id=tokenizer.eos_token_id)\n",
+ "print(tokenizer.decode(outputs[0]))"
+ ]
+ },
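+ {
+ "cell_type": "markdown",
+ "id": "parse-tool-call-note",
+ "metadata": {},
+ "source": [
+ "The part of the generation we actually care about is the tool call. Below is a small, optional sketch for extracting it programmatically. It assumes the model wraps its call in `<tool_call>...</tool_call>` tags as in the training data, and since the payload is printed as a Python-style dict with single quotes, we parse it with `ast.literal_eval` rather than `json.loads`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "parse-tool-call-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import re\n",
+ "\n",
+ "# Decode the generation and pull out the first <tool_call>...</tool_call> block.\n",
+ "text = tokenizer.decode(outputs[0])\n",
+ "match = re.search(r\"<tool_call>\\s*(\\{.*?\\})\\s*</tool_call>\", text, re.DOTALL)\n",
+ "if match:\n",
+ "    call = ast.literal_eval(match.group(1))\n",
+ "    print('function :', call['name'])\n",
+ "    print('arguments:', call['arguments'])\n",
+ "else:\n",
+ "    print('No tool call found in the generation.')"
+ ]
+ },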
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Congratulations on finishing this first Bonus Unit π₯³\n",
+ "\n",
+ "You've just **mastered what Function-Calling is and how to fine-tune your model to do Function-Calling**!\n",
+ "\n",
+ "If it's the first time you do this, it's normal that you're feeling puzzled. Take time to check the documentation and understand each part of the code and why we did it this way.\n",
+ "\n",
+ "Also, don't hesitate to try to **fine-tune different models**. The **best way to learn is by trying.**\n",
+ "\n",
+ "### Keep Learning, Stay Awesome π€"
+ ],
+ "metadata": {
+ "id": "xWewPCZOyfJQ"
+ },
+ "id": "xWewPCZOyfJQ"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file