{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "8fe0634c-c88a-4c89-b956-1f4247c2d503", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "e2fc659d-8a38-4a94-bf3c-81b2778c780a", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /home/inflaton/code/projects/courses/novel-translation\n" ] } ], "source": [ "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "6601e3ff-e856-4353-98d8-42fcb158f230", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading env vars from: /home/inflaton/code/projects/courses/novel-translation/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", " found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "f30d283f-4759-403b-8cc4-94e360a76c04", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ 
"('unsloth/Qwen2-0.5B-Instruct',\n", " True,\n", " 'models/Qwen2-0.5B-Instruct-MAC-',\n", " 'Qwen2-0.5B-Instruct-MAC-',\n", " 2048,\n", " 10,\n", " None,\n", " 'datasets/mac/mac.tsv')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "model_name = os.getenv(\"MODEL_NAME\")\n", "token = os.getenv(\"HF_TOKEN\") or None\n", "load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n", "local_model = os.getenv(\"LOCAL_MODEL\")\n", "hub_model = os.getenv(\"HUB_MODEL\")\n", "num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n", "data_path = os.getenv(\"DATA_PATH\")\n", "\n", "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = (\n", " None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", ")\n", "\n", "model_name, load_in_4bit, local_model, hub_model, max_seq_length, num_train_epochs, dtype, data_path" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "6f80b3a1-a6e6-43c7-b54f-da10ef37df32", "showTitle": false, "title": "" }, "id": "r2v_X2fA0Df5" }, "source": [ "* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc\n", "* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.\n", "* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.\n", "* With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co./unsloth) has Llama, Mistral 4bit models.\n", "* [**NEW**] We make Phi-3 Medium / Mini **2x faster**! 
See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "9df0e65d-07a4-4d5e-8848-c41872280e6f", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 353, "referenced_widgets": [ "98c58f23f4d549518832cb2d18f796e8", "09b76013aa9e45efb6deb23a7a0d0925", "39b29a75374b45c0a22506010be2b84e", "78e5400bff924a92a4cc61c4ff18b182", "2a58d04b428c46f4b3dbadd3bc6cd529", "dea41c5260884aa6879b5e1d1697b14f", "89965917796a4f81b899fdc7685f33df", "30cdc32298134cb0be4d41615b9e5774", "47928317548c454bba6358ab132e8dee", "b9b313fd861948f5aba25b24b1518d30", "4c666f4ace3943f8b80ecd20e7503236", "c22f71b1f85843209d7e5321506b9cb9", "1f44c9ce1adf470cbb19784493ed209f", "f1addc4479d849879e743cf9089e6540", "8b3505352a5a42bf910428c40ce40465", "4c4c88d4c701450692fa0f6b0c5764b0", "0c34be936c8145d3ab41282f30a70713", "0a92c56bfa134ef583220d7ef0b13e17", "43dec2ede91341f5af60eb522e18e984", "d8e5318cead340c4adbeaccc05d39225", "49277aeeac16434a865a4d12308b1abc", "2157f01726d748f8a9ae4a00664430da", "fce7a61c25ec4390af43d92b7c473a45", "30307300bc4e4baf96560e30969a82b6", "8fc142b628fb40568730234de1cafde2", "a8464a4c711e4e00aafdfc919b60d07e", "5f40db8173dd4d76b6ef5ed6d9ec8b6e", "e36a3f9eff0e4cf68834d66b0213ae96", "a0037bdccf254159becde630bee3d1db", "4ae7e449e4ea4c729b5f34607c18ebae", "3572201bd4d74a58b7a665f9bdfdcdba", "fb995c740590427b882572c81d4e848c", "201b59ccd9f845e197029b57e424aefc", "cf245afeb1c04f29a24d291608c3d157", "b518dcee69074b87be73957cd810e7ed", "e29104486d594b2992d7285e0ef77371", "6578fd7acdb54c4c93528ea431fd0144", "d35db8148a354c56aaac56dbae22536f", "d891f8d0b1fc462f8008d02bb2a15692", "cced8fd7e998472794f3f3e3018956a5", "a9f0cc51fc3d4d7b874c32dcf1c5bdf2", "2f6c70dd266c4816bfad3fd3d192929a", "370692d819df41828b48c4ad446f977b", 
"a0bf9160eb2647409b3200270914b90f", "2d18ddf6482c4d97829ac0e5a7b9868f", "9f679ad3ec7f4fe8ad0510ffb57bc2ab", "f2df530d22c74977b249dd9fb5f4829b", "89b2ef0dbfea47ab8e6f8d659e3351d1", "3056b148aa9f4e6e8aa3b61d26886255", "4ea63adfce694725bdba878aef709dd3", "74501720ac7e4dbb911a4a99b3633bc6", "21db8a77b00d4a4e82fdfa608657531f", "6dbbedeca9314e66ae50e44ffa31a414", "b8908fa0df3743ecb9d12983a739104f", "177c78fce95d4b4ab33057c5a048d693", "27155728b6b84cb199c91c940095d0a8", "6b91feeed5464877991ac2c207aebe7c", "cca8113c54c0495daedce1327bf9c68b", "2e63a29e2f7247bba5beede9a568c99f", "5c9d781c28944f3eb86e2a6d44efdf18", "4b2061b8a73c43ffb0c2f83daf0d0183", "69ac12aec0714318bf2c83d4f4e745f5", "e02f9b7849c64531835eb77b860d1c93", "56aee4853b7740e6a977254f5d1fa66d", "b993eaec6b224440bf80c0958c6fb536", "de868e26e7154f62aa86223a539ad421" ] }, "id": "QmUBVEnvCDJv", "outputId": "a0e2d781-4934-415a-90b4-35165b9e44c5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n", "==((====))== Unsloth: Fast Qwen2 patching release 2024.5\n", " \\\\ /| GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.994 GB. Platform = Linux.\n", "O^O/ \\_/ \\ Pytorch: 2.2.2+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.\n", "\\ / Bfloat16 = TRUE. Xformers = 0.0.25.post1. 
FA = False.\n", " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 10.6 s, sys: 2.07 s, total: 12.6 s\n", "Wall time: 51.9 s\n" ] } ], "source": [ "%%time\n", "\n", "from llm_toolkit.translation_engine import *\n", "\n", "model, tokenizer = load_model(model_name, load_in_4bit)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "adfadd0f-8c01-4f67-b643-cff930c1ce00", "showTitle": false, "title": "" }, "id": "SXd9bTZd1aaL" }, "source": [ "We now add LoRA adapters so we only need to update 1 to 10% of all parameters!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "2cd85242-237f-4cca-a706-b7664ec9d3e5", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "6bZsfBuZDeCL", "outputId": "bc6d9ce7-f82a-4191-d0c5-ec8247d9b9eb" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Unsloth 2024.5 patched 24 layers with 0 QKV layers, 24 O layers and 24 MLP layers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9.31 s, sys: 0 ns, total: 9.31 s\n", "Wall time: 2.12 s\n" ] } ], "source": [ "%%time\n", "\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " r=16, # Choose any number > 0 ! 
Suggested 8, 16, 32, 64, 128\n", " target_modules=[\n", " \"q_proj\",\n", " \"k_proj\",\n", " \"v_proj\",\n", " \"o_proj\",\n", " \"gate_proj\",\n", " \"up_proj\",\n", " \"down_proj\",\n", " ],\n", " lora_alpha=16,\n", " lora_dropout=0, # Supports any, but = 0 is optimized\n", " bias=\"none\", # Supports any, but = \"none\" is optimized\n", " # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n", " use_gradient_checkpointing=\"unsloth\", # True or \"unsloth\" for very long context\n", " random_state=3407,\n", " use_rslora=False, # We support rank stabilized LoRA\n", " loftq_config=None, # And LoftQ\n", ")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "2c3fdf26-130f-4ce7-9c51-d62e1ce17629", "showTitle": false, "title": "" }, "id": "vITh0KVJ10qX" }, "source": [ "\n", "### Data Prep\n", "We now use the Alpaca dataset from [yahma](https://huggingface.co./datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.\n", "\n", "**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co./docs/trl/sft_trainer#train-on-completions-only).\n", "\n", "**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!\n", "\n", "If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).\n", "\n", "For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "a378fd31-1620-42bc-b97a-82f4ffbdcb11", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading train/test data files\n", "DatasetDict({\n", " train: Dataset({\n", " features: ['chinese', 'english', 'text', 'prompt'],\n", " num_rows: 4528\n", " })\n", " test: Dataset({\n", " features: ['chinese', 'english', 'text', 'prompt'],\n", " num_rows: 1133\n", " })\n", "})\n" ] } ], "source": [ "import os\n", "from llm_toolkit.translation_engine import *\n", "\n", "datasets = load_translation_dataset(data_path, tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "6ac9ac82-aaf3-482b-8cdb-8eab19fde5ae", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "({'chinese': '全仗着狐仙搭救。',\n", " 'english': 'Because I was protected by a fox fairy.',\n", " 'text': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n全仗着狐仙搭救。<|im_end|>\\n<|im_start|>assistant\\nBecause I was protected by a fox fairy.<|im_end|>',\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n全仗着狐仙搭救。<|im_end|>\\n<|im_start|>assistant\\n'},\n", " {'chinese': '老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。',\n", " 'english': 'Old Geng picked up his shotgun, squinted, and pulled the trigger. 
Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.',\n", " 'text': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。<|im_end|>\\n<|im_start|>assistant\\nOld Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.<|im_end|>',\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。<|im_end|>\\n<|im_start|>assistant\\n'})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datasets[\"train\"][0], datasets[\"test\"][0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "87f73aab-12df-4e4e-b758-ee055e17ed58", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "({'chinese': '周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。',\n", " 'english': \"'She said they don't really belong to the family but were adopted into the clan years ago when your grandfather and theirs were working in the same office.\",\n", " 'text': \"<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。<|im_end|>\\n<|im_start|>assistant\\n'She said they don't really belong to the family but were adopted into the clan years ago when your grandfather and theirs were working in the same office.<|im_end|>\",\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from 
Chinese to English.\\n周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。<|im_end|>\\n<|im_start|>assistant\\n'},\n", " {'chinese': '“听到了吗?',\n", " 'english': \"'Did you hear that?'\",\n", " 'text': \"<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n“听到了吗?<|im_end|>\\n<|im_start|>assistant\\n'Did you hear that?'<|im_end|>\",\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n“听到了吗?<|im_end|>\\n<|im_start|>assistant\\n'})" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datasets[\"train\"][1000], datasets[\"test\"][1000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "dd425707-88cc-43db-8bc0-e858c8084e16", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "26e4202cca81496a90d15a0dd4ca9cf1", "ba90fdb8822d47dab7ba203bee297f37", "61560ff6a36b44f4a9dfdae5c52791d4", "95fbe66647904c06a20f640630d6dc0e", "57182a263d324a3dbf1471c74290a0d5", "0f8b6bfe16894500838793f2491d403f", "bb19f6c747754682a514373a3a0535ba", "db19fc8d37db4e45a5790a876836d8c4", "36166c7bcb854b34aca1f41a5d6ea50b", "b0a370dc20654b279b9680692e34418e", "cfeb365ddf7548d58b2557f22737fcf5", "73e352a3404f4c7dad0737f57d29e92f", "988a0e8c1f89446086858da0a891a79c", "4ccedf0d93094e63b57a0f8a434fba06", "6b2012c3f88547af8884a9ea90e3164b", "7e29cb8dd4df4d5b94407cd8fd3f2011", "ad2be500fc164c0f86f33e914ef8e6a0", "5234566b1bfc4655b8d582ea5b46ed9f", "4463edd481c1467f914c7dcd6c6e6ffc", "6d3b9a05db0b4dadb638c686faa0c40a", "938f45f1b3e24118b815d96ae34ba86a", "9367047a800747f79c6b225d92397846", "d1b47d39450d4019ae85c9b2f943eeaf", "4dcf6ff672d24983a1877a8431709aa9", "7975adbc2ec5489ea7fa0167e620d85c", 
"71ce208e20d6483abb9ed923510c86d7", "cfe8cae0e22b495bafa221a63d13b283", "5807d5fb827d490fb3bc698f801ffff5", "c4f2b06a82fd4987b8b659524a7b503b", "6e34619b45934040b6092e6fb01ea7fe", "271ddaa553a042d09b6db7b450643d8f", "d69dc491b3ab44d7852b21873ed7bb7f", "f401d53bf28e44eb906bce6c05412662", "daf4cd890b35422683d22fd30bc71e83", "b0240cd9a4554b29ae11f8051984a1c6", "bc883d4cf13e4f8b8a4fe5f410cb6efd", "99fdbb0300c14c139d1937c646f0cfe7", "c161d94df0f04feba9542237e0856c22", "edaf890370314a218f138015faa0b05d", "697f027529b54ee9956bae78a11e0611", "e9159e03e61f4f56978ece9c3bca49b2", "810ff6c0e17d4fa09a30fef27eacff90", "7358cdad832342c983e31efb8754ab78", "e9adf418296e436fb48bb9f78885598b" ] }, "id": "LjY75GoYUCB8", "outputId": "7e2045fb-9ce9-49b1-b6e7-d5c9bc92455c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<|im_start|>system\n", "You are an expert in translating Chinese into English.<|im_end|>\n", "<|im_start|>user\n", "Translate from Chinese to English.\n", "“听到了吗?<|im_end|>\n", "<|im_start|>assistant\n", "\n", "----------------------------------------\n", "<|im_start|>system\n", "You are an expert in translating Chinese into English.<|im_end|>\n", "<|im_start|>user\n", "Translate from Chinese to English.\n", "“听到了吗?<|im_end|>\n", "<|im_start|>assistant\n", "Did you hear it?<|im_end|>\n", "CPU times: user 1.62 s, sys: 160 ms, total: 1.78 s\n", "Wall time: 1.8 s\n" ] } ], "source": [ "%%time\n", "\n", "prompt1 = datasets[\"test\"][\"prompt\"][1000]\n", "print(prompt1)\n", "print(\"--\" * 20)\n", "test_model(model, tokenizer, prompt1)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "53cc91a1-1623-4197-bf82-78cdadad933e", "showTitle": false, "title": "" }, "id": "idAEIeSQ3xdS" }, "source": [ "\n", "### Train the model\n", "Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co./docs/trl/sft_trainer). 
We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "f1053974-9253-4d4d-a172-1f5fea046745", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 122, "referenced_widgets": [ "3cf2dd993b5e4d3daecf61e4bab5a404", "087b76a8b7514269b1f0ab29b062e444", "35b0e8c26d6640e9bd0ed7b242a423d8", "54ad89e05fd74576b9b8b5b5a10eaf8d", "a41dc44766444a998bec2d777f249d23", "a069d2ab23824f29aa320ac256e2cfe9", "06e806c82c7b4cbea31c5358dd9c3434", "2e5087c76f98437cb5dc729230358cba", "036fc5746f43416db18c19ad8fd36677", "fdb1941405ed4e4aa06019933892deb3", "668d5377ca56426a99753867e6e24862" ] }, "id": "95_Nn-89DhsL", "outputId": "bce9db22-b022-4e43-de3f-c7ea4c9c3c4e" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c9df9c466cc24f5e8715c02eb6764c3c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map (num_proc=2): 0%| | 0/4528 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "from unsloth import is_bfloat16_supported\n", "\n", "trainer = SFTTrainer(\n", " model=model,\n", " tokenizer=tokenizer,\n", " train_dataset=datasets[\"train\"],\n", " dataset_text_field=\"text\",\n", " max_seq_length=max_seq_length,\n", " dataset_num_proc=2,\n", " packing=False, # Can make training 5x faster for short sequences.\n", " args=TrainingArguments(\n", " per_device_train_batch_size=2,\n", " gradient_accumulation_steps=4,\n", " warmup_steps=5,\n", " num_train_epochs=num_train_epochs,\n", " learning_rate=2e-4,\n", " fp16=not is_bfloat16_supported(),\n", " bf16=is_bfloat16_supported(),\n", " logging_steps=100,\n", " optim=\"adamw_8bit\",\n", " weight_decay=0.01,\n", " lr_scheduler_type=\"linear\",\n", " seed=3407,\n", " output_dir=\"outputs\",\n", " ),\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "db089cc7-e333-42ae-a468-bed571bb4214", "showTitle": false, "title": "" }, "cellView": "form", "colab": { "base_uri": "https://localhost:8080/" }, "id": "2ejIt2xSNKKp", "outputId": "c73d8dfa-f4a1-4a01-a6dc-018bf82516a2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GPU = NVIDIA GeForce RTX 4080 Laptop GPU. Max memory = 11.994 GB.\n", "1.199 GB of memory reserved.\n" ] } ], "source": [ "# @title Show current memory stats\n", "import torch\n", "\n", "gpu_stats = torch.cuda.get_device_properties(0)\n", "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n", "print(f\"GPU = {gpu_stats.name}. 
Max memory = {max_memory} GB.\")\n", "print(f\"{start_gpu_memory} GB of memory reserved.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "412a7db0-6980-4834-bd87-fb15b794eb75", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "yqxqAZ7KJ4oL", "outputId": "69117b9b-b6f8-4d0e-c262-6998ba2c46bd" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n", " \\\\ /| Num examples = 4,528 | Num Epochs = 10\n", "O^O/ \\_/ \\ Batch size per device = 2 | Gradient Accumulation steps = 4\n", "\\ / Total batch size = 8 | Total steps = 5,660\n", " \"-____-\" Number of trainable parameters = 8,798,208\n" ] }, { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "
---|---|
100 | \n", "2.137700 | \n", "
200 | \n", "2.001500 | \n", "
300 | \n", "1.938200 | \n", "
400 | \n", "1.935400 | \n", "
500 | \n", "1.899800 | \n", "
600 | \n", "1.819500 | \n", "
700 | \n", "1.689600 | \n", "
800 | \n", "1.737300 | \n", "
900 | \n", "1.665900 | \n", "
1000 | \n", "1.664600 | \n", "
1100 | \n", "1.723000 | \n", "
1200 | \n", "1.520200 | \n", "
1300 | \n", "1.381000 | \n", "
1400 | \n", "1.423000 | \n", "
1500 | \n", "1.419400 | \n", "
1600 | \n", "1.436500 | \n", "
1700 | \n", "1.401500 | \n", "
1800 | \n", "1.119500 | \n", "
1900 | \n", "1.130700 | \n", "
2000 | \n", "1.139100 | \n", "
2100 | \n", "1.120000 | \n", "
2200 | \n", "1.166200 | \n", "
2300 | \n", "1.062000 | \n", "
2400 | \n", "0.858400 | \n", "
2500 | \n", "0.846800 | \n", "
2600 | \n", "0.892000 | \n", "
2700 | \n", "0.887700 | \n", "
2800 | \n", "0.907300 | \n", "
2900 | \n", "0.728300 | \n", "
3000 | \n", "0.644400 | \n", "
3100 | \n", "0.652400 | \n", "
3200 | \n", "0.683000 | \n", "
3300 | \n", "0.673200 | \n", "
3400 | \n", "0.670000 | \n", "
3500 | \n", "0.460900 | \n", "
3600 | \n", "0.487600 | \n", "
3700 | \n", "0.501000 | \n", "
3800 | \n", "0.491400 | \n", "
3900 | \n", "0.501700 | \n", "
4000 | \n", "0.452200 | \n", "
4100 | \n", "0.352000 | \n", "
4200 | \n", "0.368500 | \n", "
4300 | \n", "0.368500 | \n", "
4400 | \n", "0.360600 | \n", "
4500 | \n", "0.374900 | \n", "
4600 | \n", "0.294500 | \n", "
4700 | \n", "0.270000 | \n", "
4800 | \n", "0.270800 | \n", "
4900 | \n", "0.285300 | \n", "
5000 | \n", "0.282200 | \n", "
5100 | \n", "0.285100 | \n", "
5200 | \n", "0.216900 | \n", "
5300 | \n", "0.228700 | \n", "
5400 | \n", "0.223900 | \n", "
5500 | \n", "0.226100 | \n", "
5600 | \n", "0.229100 | \n", "
"
],
"text/plain": [
"