{ "cells": [ { "cell_type": "markdown", "id": "b814f2a1-95be-4ed5-9b06-780c653311db", "metadata": {}, "source": [ "# ANALYZING THE PERPLEXITY OF MERAK-7B-V1 USING MBZUAI/BACTRIAN-X DATASET" ] }, { "cell_type": "markdown", "id": "a3df9bb7-1a91-4ac9-bc2b-f9695026b52c", "metadata": {}, "source": [ "### WARNING: This notebook needs a lot of VRAM to run successfully. It needs at least 48 GB of VRAM." ] }, { "cell_type": "markdown", "id": "22c91003-4f21-4999-8e27-bc950d9ed6cf", "metadata": {}, "source": [ "Let's install the required packages." ] }, { "cell_type": "code", "execution_count": 1, "id": "ac2d3acf-c918-4d04-a7e4-ec64d48f594e", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting bitsandbytes==0.39.1\n", " Downloading bitsandbytes-0.39.1-py3-none-any.whl (97.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.1/97.1 MB\u001b[0m \u001b[31m41.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hInstalling collected packages: bitsandbytes\n", "Successfully installed bitsandbytes-0.39.1\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", "Collecting transformers==4.31.0\n", " Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m90.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.31.0) (2.28.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1\n", " Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m51.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m95.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages 
(from transformers==4.31.0) (3.10.7)\n", "Collecting tqdm>=4.27\n", " Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.1/77.1 kB\u001b[0m \u001b[31m42.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31.0) (23.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31.0) (1.24.2)\n", "Collecting regex!=2019.12.17\n", " Downloading regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.4/770.4 kB\u001b[0m \u001b[31m104.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31.0) (6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31.0) (4.5.0)\n", "Collecting fsspec\n", " Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.8/163.8 kB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests->transformers==4.31.0) (2019.11.28)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3/dist-packages (from requests->transformers==4.31.0) (1.25.8)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31.0) (3.1.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->transformers==4.31.0) (2.8)\n", "Installing 
collected packages: tokenizers, safetensors, tqdm, regex, fsspec, huggingface-hub, transformers\n", "Successfully installed fsspec-2023.6.0 huggingface-hub-0.16.4 regex-2023.6.3 safetensors-0.3.1 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.31.0\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", "Collecting peft==0.4.0\n", " Downloading peft-0.4.0-py3-none-any.whl (72 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.9/72.9 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (23.0)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (6.0)\n", "Collecting accelerate\n", " Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.2/244.2 kB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (2.0.0)\n", "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (0.3.1)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages 
(from peft==0.4.0) (4.31.0)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (5.9.4)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.4.0) (1.24.2)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (2.0.0)\n", "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (10.2.10.91)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (3.10.7)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (4.5.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.7.99)\n", "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (8.5.0.96)\n", "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.7.4.91)\n", "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.7.91)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.7.101)\n", "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.10.3.66)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.7.99)\n", "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (10.9.0.58)\n", "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (11.4.0.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (3.1.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (3.0)\n", "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (2.14.3)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.4.0) (1.11.1)\n", "Requirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.4.0) (0.40.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.13.0->peft==0.4.0) (67.6.1)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.4.0) (16.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.13.0->peft==0.4.0) (3.26.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.4.0) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.4.0) (2.28.2)\n", "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.4.0) (0.13.3)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.4.0) (0.16.4)\n", "Requirement already satisfied: 
tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.4.0) (4.65.0)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers->peft==0.4.0) (2023.6.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.4.0) (2.1.2)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3/dist-packages (from requests->transformers->peft==0.4.0) (1.25.8)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests->transformers->peft==0.4.0) (2019.11.28)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->peft==0.4.0) (3.1.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->transformers->peft==0.4.0) (2.8)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.13.0->peft==0.4.0) (1.3.0)\n", "Installing collected packages: accelerate, peft\n", "Successfully installed accelerate-0.21.0 peft-0.4.0\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", "Collecting accelerate==0.20.3\n", " Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.6/227.6 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.20.3) (2.0.0)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate==0.20.3) (6.0)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==0.20.3) (5.9.4)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.20.3) (1.24.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.20.3) (23.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.7.99)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (3.1.2)\n", "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (8.5.0.96)\n", "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in /usr/local/lib/python3.10/dist-packages (from 
torch>=1.6.0->accelerate==0.20.3) (11.7.4.91)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (3.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (3.10.7)\n", "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (10.9.0.58)\n", "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.4.0.1)\n", "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (2.14.3)\n", "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.7.91)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (2.0.0)\n", "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.10.3.66)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.7.99)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (4.5.0)\n", "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (10.2.10.91)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate==0.20.3) (11.7.101)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from 
torch>=1.6.0->accelerate==0.20.3) (1.11.1)\n", "Requirement already satisfied: wheel in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.6.0->accelerate==0.20.3) (0.40.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch>=1.6.0->accelerate==0.20.3) (67.6.1)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate==0.20.3) (16.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate==0.20.3) (3.26.1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->accelerate==0.20.3) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->accelerate==0.20.3) (1.3.0)\n", "Installing collected packages: accelerate\n", " Attempting uninstall: accelerate\n", " Found existing installation: accelerate 0.21.0\n", " Uninstalling accelerate-0.21.0:\n", " Successfully uninstalled accelerate-0.21.0\n", "Successfully installed accelerate-0.20.3\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", "Collecting einops==0.6.1\n", " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting scipy\n", " Downloading scipy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.3/36.3 MB\u001b[0m \u001b[31m46.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting sentencepiece\n", " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m96.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.14.3-py3-none-any.whl (519 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.1/519.1 kB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy<1.28.0,>=1.21.6 in /usr/local/lib/python3.10/dist-packages (from scipy) (1.24.2)\n", "Collecting pyarrow>=8.0.0\n", " Downloading pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.9 MB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.9/38.9 MB\u001b[0m \u001b[31m66.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.65.0)\n", "Collecting aiohttp\n", " Downloading aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess\n", " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m62.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: huggingface-hub<1.0.0,>=0.14.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.16.4)\n", "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Collecting dill<0.3.8,>=0.3.0\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m70.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0)\n", "Collecting pandas\n", " Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m89.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.0)\n", "Requirement already 
satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.28.2)\n", "Collecting xxhash\n", " Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m69.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting frozenlist>=1.1.1\n", " Downloading frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.7/225.7 kB\u001b[0m \u001b[31m76.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting aiosignal>=1.1.2\n", " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (3.1.0)\n", "Collecting yarl<2.0,>=1.0\n", " Downloading yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m100.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (22.2.0)\n", "Collecting multidict<7.0,>=4.5\n", " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m54.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) 
(4.5.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (3.10.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.19.0->datasets) (2019.11.28)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3/dist-packages (from requests>=2.19.0->datasets) (1.25.8)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.19.0->datasets) (2.8)\n", "Collecting pytz>=2020.1\n", " Downloading pytz-2023.3-py2.py3-none-any.whl (502 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m502.3/502.3 kB\u001b[0m \u001b[31m103.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Collecting tzdata>=2022.1\n", " Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m71.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.14.0)\n", "Installing collected packages: sentencepiece, pytz, xxhash, tzdata, scipy, pyarrow, multidict, frozenlist, einops, dill, async-timeout, yarl, pandas, multiprocess, aiosignal, aiohttp, datasets\n", "Successfully installed aiohttp-3.8.5 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.14.3 dill-0.3.7 einops-0.6.1 frozenlist-1.4.0 multidict-6.0.4 multiprocess-0.70.15 pandas-2.0.3 pyarrow-12.0.1 pytz-2023.3 scipy-1.11.1 sentencepiece-0.1.99 tzdata-2023.3 xxhash-3.3.0 yarl-1.9.2\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "!pip install bitsandbytes==0.39.1\n", "!pip install transformers==4.31.0\n", "!pip install peft==0.4.0\n", "!pip install accelerate==0.20.3\n", "!pip install einops==0.6.1 scipy sentencepiece datasets" ] }, { "cell_type": "code", "execution_count": 2, "id": "c2bc520e-fb55-4527-9c35-948e003a778e", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "id": "da08ee68-7746-439b-bbe3-bb4ebd96a778", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e629e51a5e7b431ead7acc1c94e96be6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/635 [00:00)\n" ] } ], "source": [ "inputs = tokenizer(\"Saya menyukai tari zapin\", return_tensors = \"pt\")\n", "loss = model(input_ids = inputs[\"input_ids\"], labels = inputs[\"input_ids\"]).loss\n", "ppl = torch.exp(loss)\n", "print(ppl)" ] }, { "cell_type": "markdown", "id": "56581598-51f6-4dcd-b89a-8579257c7446", "metadata": {}, "source": [ "### Now, let's calculate the perplexity of text stored in a pandas DataFrame" ] }, { "cell_type": "code", "execution_count": 5, "id": "ea1cb6c0-89d8-4153-a700-ee3e240a3eb6", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QuestionAnswer
0<|prompt|>Siapa penulis naskah proklamasi keme...
\n", "
" ], "text/plain": [ " QuestionAnswer\n", "0 <|prompt|>Siapa penulis naskah proklamasi keme..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"eval_single.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 6, "id": "cb405b92-0028-4eaa-a1b9-234fdbd83d8d", "metadata": { "tags": [] }, "outputs": [], "source": [ "def Perplexity(TEXT):\n", "    \"\"\"Return the perplexity of TEXT as a Python float.\n", "\n", "    TEXT is tokenized with the notebook-level `tokenizer`; the token ids are passed to the\n", "    notebook-level `model` as both `input_ids` and `labels`, and the result is exp(loss).\n", "    \"\"\"\n", "    inputs = tokenizer(TEXT, return_tensors = \"pt\")\n", "    with torch.inference_mode():\n", "        loss = model(input_ids = inputs[\"input_ids\"], labels = inputs[\"input_ids\"]).loss\n", "    # Exponentiate in float32 and return a plain float: np.float16 keeps only ~3 significant\n", "    # digits and turns any perplexity above 65504 into inf.\n", "    return torch.exp(loss.float()).item()" ] }, { "cell_type": "code", "execution_count": 7, "id": "35185dc3-2235-41fb-9f36-bb46878a8c2b", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
QuestionAnswerPPL
0<|prompt|>Siapa penulis naskah proklamasi keme...7.3125
\n", "
" ], "text/plain": [ " QuestionAnswer PPL\n", "0 <|prompt|>Siapa penulis naskah proklamasi keme... 7.3125" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['PPL'] = df['QuestionAnswer'].apply(Perplexity)\n", "df" ] }, { "cell_type": "markdown", "id": "9cc9160e-d190-4759-8730-45660b2144e8", "metadata": { "tags": [] }, "source": [ "It has PPL of 7.3125. It much lower than first example. So, it was better." ] }, { "cell_type": "markdown", "id": "43ca3e71-6ff5-4125-bec7-cb397a2a385e", "metadata": {}, "source": [ "### LETS CALCULATE PERPLEXITY OF MERAK-7B-V1" ] }, { "cell_type": "code", "execution_count": 8, "id": "47d235ed-b637-4b6e-8934-01e5237c1a65", "metadata": { "tags": [] }, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "markdown", "id": "1d0fef05-8e89-4b8a-8116-1a997916c8b2", "metadata": {}, "source": [ "#### We never trained Merak-7B-v1 with MBZUAI/Bactrian-X. So, it was good dataset for evaluation how good the model is.." ] }, { "cell_type": "code", "execution_count": 9, "id": "8c5a2697-e248-4bfd-aefc-aeb39e81ce23", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d893944ef53948e6a690637b8a5b8e67", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading builder script: 0%| | 0.00/2.26k [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutput
0Laporkan suhu rata-rata di Scottsdale, AZ pada...Tidak ditemukan data suhu rata-rata untuk Scot...
1Bagaimana cara kerja turbin angin?Turbin angin bekerja dengan mengubah energi ki...
2Hasilkan daftar ras anjing paling populer di AS.1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ...
3Apa itu jaringan pribadi virtual (VPN)?Jaringan pribadi virtual (VPN) adalah sebuah t...
4Identifikasi tema puisi Pablo Neruda \"If You F...Tema puisi \"If You Forget Me\" karya Pablo Neru...
............
67012Sarankan langkah kebijakan baru untuk mengatas...Ada tren kejahatan dunia maya yang berkembang ...1. Peningkatan Kesadaran Masyarakat. Melalui k...
67013Kembangkan pernyataan tesis yang menangkap ese...Artikel ini akan mengeksplorasi bagaimana berb...Pernyataan tesis dari artikel ini adalah bahwa...
67014Sarankan beberapa tindakan yang dapat diambil ...Sebagai AI, saya akan memberikan beberapa sara...
67015Cocokkan masukan berikut dengan jenis tugas pe...Gambar kucingPembelajaran mesin dalam klasifikasi gambar.
67016Tulis postingan mikroblog tentang pengalaman t...Baru saja merasakan pengalaman yang sangat men...
\n", "

67017 rows × 3 columns

\n", "" ], "text/plain": [ " instruction \\\n", "0 Laporkan suhu rata-rata di Scottsdale, AZ pada... \n", "1 Bagaimana cara kerja turbin angin? \n", "2 Hasilkan daftar ras anjing paling populer di AS. \n", "3 Apa itu jaringan pribadi virtual (VPN)? \n", "4 Identifikasi tema puisi Pablo Neruda \"If You F... \n", "... ... \n", "67012 Sarankan langkah kebijakan baru untuk mengatas... \n", "67013 Kembangkan pernyataan tesis yang menangkap ese... \n", "67014 Sarankan beberapa tindakan yang dapat diambil ... \n", "67015 Cocokkan masukan berikut dengan jenis tugas pe... \n", "67016 Tulis postingan mikroblog tentang pengalaman t... \n", "\n", " input \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "... ... \n", "67012 Ada tren kejahatan dunia maya yang berkembang ... \n", "67013 Artikel ini akan mengeksplorasi bagaimana berb... \n", "67014 \n", "67015 Gambar kucing \n", "67016 \n", "\n", " output \n", "0 Tidak ditemukan data suhu rata-rata untuk Scot... \n", "1 Turbin angin bekerja dengan mengubah energi ki... \n", "2 1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ... \n", "3 Jaringan pribadi virtual (VPN) adalah sebuah t... \n", "4 Tema puisi \"If You Forget Me\" karya Pablo Neru... \n", "... ... \n", "67012 1. Peningkatan Kesadaran Masyarakat. Melalui k... \n", "67013 Pernyataan tesis dari artikel ini adalah bahwa... \n", "67014 Sebagai AI, saya akan memberikan beberapa sara... \n", "67015 Pembelajaran mesin dalam klasifikasi gambar. \n", "67016 Baru saja merasakan pengalaman yang sangat men... 
\n", "\n", "[67017 rows x 3 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI = pd.DataFrame( {'instruction':dataset['instruction'],\n", " 'input':dataset['input'],\n", " 'output':dataset['output']} )\n", "df_MBZUAI" ] }, { "cell_type": "markdown", "id": "868e539e-73be-497a-91d8-e93d10dc2ef0", "metadata": {}, "source": [ "### Since some rows in the dataset have a non-empty 'input' column, we use a different format for those rows when creating the QnA text" ] }, { "cell_type": "markdown", "id": "b4e669ae-106e-4c09-af5e-4105e7517223", "metadata": {}, "source": [ "When a row has no 'input' value, the format is:  \n", "`<|prompt|>{instruction}<|answer|>{output}`" ] }, { "cell_type": "code", "execution_count": 11, "id": "a47ca5e3-7cb0-44e6-81f1-4b8a04d0d432", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutput
0Laporkan suhu rata-rata di Scottsdale, AZ pada...Tidak ditemukan data suhu rata-rata untuk Scot...
1Bagaimana cara kerja turbin angin?Turbin angin bekerja dengan mengubah energi ki...
2Hasilkan daftar ras anjing paling populer di AS.1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ...
3Apa itu jaringan pribadi virtual (VPN)?Jaringan pribadi virtual (VPN) adalah sebuah t...
4Identifikasi tema puisi Pablo Neruda \"If You F...Tema puisi \"If You Forget Me\" karya Pablo Neru...
............
67007Identifikasi 3 faktor yang dapat berdampak neg...1. Ketidakstabilan Ekonomi: Fluktuasi dalam ek...
67008Ciptakan olahraga baruSebagai AI, saya tidak memiliki ide atau kreat...
67009Kembangkan strategi untuk mengoptimalkan konte...1. Melakukan penelitian kata kunci: Lakukan pe...
67014Sarankan beberapa tindakan yang dapat diambil ...Sebagai AI, saya akan memberikan beberapa sara...
67016Tulis postingan mikroblog tentang pengalaman t...Baru saja merasakan pengalaman yang sangat men...
\n", "

41764 rows × 3 columns

\n", "
" ], "text/plain": [ " instruction input \\\n", "0 Laporkan suhu rata-rata di Scottsdale, AZ pada... \n", "1 Bagaimana cara kerja turbin angin? \n", "2 Hasilkan daftar ras anjing paling populer di AS. \n", "3 Apa itu jaringan pribadi virtual (VPN)? \n", "4 Identifikasi tema puisi Pablo Neruda \"If You F... \n", "... ... ... \n", "67007 Identifikasi 3 faktor yang dapat berdampak neg... \n", "67008 Ciptakan olahraga baru \n", "67009 Kembangkan strategi untuk mengoptimalkan konte... \n", "67014 Sarankan beberapa tindakan yang dapat diambil ... \n", "67016 Tulis postingan mikroblog tentang pengalaman t... \n", "\n", " output \n", "0 Tidak ditemukan data suhu rata-rata untuk Scot... \n", "1 Turbin angin bekerja dengan mengubah energi ki... \n", "2 1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ... \n", "3 Jaringan pribadi virtual (VPN) adalah sebuah t... \n", "4 Tema puisi \"If You Forget Me\" karya Pablo Neru... \n", "... ... \n", "67007 1. Ketidakstabilan Ekonomi: Fluktuasi dalam ek... \n", "67008 Sebagai AI, saya tidak memiliki ide atau kreat... \n", "67009 1. Melakukan penelitian kata kunci: Lakukan pe... \n", "67014 Sebagai AI, saya akan memberikan beberapa sara... \n", "67016 Baru saja merasakan pengalaman yang sangat men... 
\n", "\n", "[41764 rows x 3 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI_NoInput = df_MBZUAI[df_MBZUAI['input']=='']\n", "df_MBZUAI_NoInput" ] }, { "cell_type": "code", "execution_count": 12, "id": "c7964620-f059-47de-8a2e-15af43a51466", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_486/1761940573.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_MBZUAI_NoInput['QnA'] = \"<|prompt|>\" + df_MBZUAI_NoInput[\"instruction\"].astype(str) + \"<|answer|>\" + df_MBZUAI_NoInput[\"output\"].astype(str)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutputQnA
0Laporkan suhu rata-rata di Scottsdale, AZ pada...Tidak ditemukan data suhu rata-rata untuk Scot...<|prompt|>Laporkan suhu rata-rata di Scottsdal...
1Bagaimana cara kerja turbin angin?Turbin angin bekerja dengan mengubah energi ki...<|prompt|>Bagaimana cara kerja turbin angin?<|...
2Hasilkan daftar ras anjing paling populer di AS.1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ...<|prompt|>Hasilkan daftar ras anjing paling po...
3Apa itu jaringan pribadi virtual (VPN)?Jaringan pribadi virtual (VPN) adalah sebuah t...<|prompt|>Apa itu jaringan pribadi virtual (VP...
4Identifikasi tema puisi Pablo Neruda \"If You F...Tema puisi \"If You Forget Me\" karya Pablo Neru...<|prompt|>Identifikasi tema puisi Pablo Neruda...
...............
67007Identifikasi 3 faktor yang dapat berdampak neg...1. Ketidakstabilan Ekonomi: Fluktuasi dalam ek...<|prompt|>Identifikasi 3 faktor yang dapat ber...
67008Ciptakan olahraga baruSebagai AI, saya tidak memiliki ide atau kreat...<|prompt|>Ciptakan olahraga baru<|answer|>Seba...
67009Kembangkan strategi untuk mengoptimalkan konte...1. Melakukan penelitian kata kunci: Lakukan pe...<|prompt|>Kembangkan strategi untuk mengoptima...
67014Sarankan beberapa tindakan yang dapat diambil ...Sebagai AI, saya akan memberikan beberapa sara...<|prompt|>Sarankan beberapa tindakan yang dapa...
67016Tulis postingan mikroblog tentang pengalaman t...Baru saja merasakan pengalaman yang sangat men...<|prompt|>Tulis postingan mikroblog tentang pe...
\n", "

41764 rows × 4 columns

\n", "
" ], "text/plain": [ " instruction input \\\n", "0 Laporkan suhu rata-rata di Scottsdale, AZ pada... \n", "1 Bagaimana cara kerja turbin angin? \n", "2 Hasilkan daftar ras anjing paling populer di AS. \n", "3 Apa itu jaringan pribadi virtual (VPN)? \n", "4 Identifikasi tema puisi Pablo Neruda \"If You F... \n", "... ... ... \n", "67007 Identifikasi 3 faktor yang dapat berdampak neg... \n", "67008 Ciptakan olahraga baru \n", "67009 Kembangkan strategi untuk mengoptimalkan konte... \n", "67014 Sarankan beberapa tindakan yang dapat diambil ... \n", "67016 Tulis postingan mikroblog tentang pengalaman t... \n", "\n", " output \\\n", "0 Tidak ditemukan data suhu rata-rata untuk Scot... \n", "1 Turbin angin bekerja dengan mengubah energi ki... \n", "2 1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ... \n", "3 Jaringan pribadi virtual (VPN) adalah sebuah t... \n", "4 Tema puisi \"If You Forget Me\" karya Pablo Neru... \n", "... ... \n", "67007 1. Ketidakstabilan Ekonomi: Fluktuasi dalam ek... \n", "67008 Sebagai AI, saya tidak memiliki ide atau kreat... \n", "67009 1. Melakukan penelitian kata kunci: Lakukan pe... \n", "67014 Sebagai AI, saya akan memberikan beberapa sara... \n", "67016 Baru saja merasakan pengalaman yang sangat men... \n", "\n", " QnA \n", "0 <|prompt|>Laporkan suhu rata-rata di Scottsdal... \n", "1 <|prompt|>Bagaimana cara kerja turbin angin?<|... \n", "2 <|prompt|>Hasilkan daftar ras anjing paling po... \n", "3 <|prompt|>Apa itu jaringan pribadi virtual (VP... \n", "4 <|prompt|>Identifikasi tema puisi Pablo Neruda... \n", "... ... \n", "67007 <|prompt|>Identifikasi 3 faktor yang dapat ber... \n", "67008 <|prompt|>Ciptakan olahraga baru<|answer|>Seba... \n", "67009 <|prompt|>Kembangkan strategi untuk mengoptima... \n", "67014 <|prompt|>Sarankan beberapa tindakan yang dapa... \n", "67016 <|prompt|>Tulis postingan mikroblog tentang pe... 
\n", "\n", "[41764 rows x 4 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI_NoInput['QnA'] = \"<|prompt|>\" + df_MBZUAI_NoInput[\"instruction\"].astype(str) + \"<|answer|>\" + df_MBZUAI_NoInput[\"output\"].astype(str)\n", "df_MBZUAI_NoInput" ] }, { "cell_type": "markdown", "id": "9db2a8db-0cf4-42d4-849c-508656454550", "metadata": {}, "source": [ "When a row has an 'input' value, the format is:  \n", "`<|prompt|>{instruction}: {input}<|answer|>{output}`" ] }, { "cell_type": "code", "execution_count": 13, "id": "7829bd2b-c3a3-4a85-b285-53a7c73a54fe", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutput
7Kategorikan film ini ke dalam salah satu genre...RevenantDrama.
9Ubah string berikut menjadi huruf kecil penuh.Halohalo
19Diberi string berikut 'a1b2c3', ubah menjadi '...a1b2c3dapat diubah menjadi 'abc123' dengan menggunak...
20Berikan ulasan untuk restoran dalam 200 kata a...Gajah LiarRestoran Gajah Liar adalah tempat yang menyaji...
22Buat hidangan baru menggunakan bahan-bahan ber...Wortel, bawang putih, bawang merah, brokolidapat dipadukan menjadi hidangan sehat yang le...
............
67010Beri judul artikel yang diberikan.Artikel ini akan membahas masalah mendesak def...\"Urgensi Menangani Deforestasi: Dampaknya terh...
67011Tulis ulang cerita sehingga memiliki akhir yan...Alkisah, hiduplah seekor unicorn ajaib di huta...Tapi, kekagumannya berubah menjadi kesedihan k...
67012Sarankan langkah kebijakan baru untuk mengatas...Ada tren kejahatan dunia maya yang berkembang ...1. Peningkatan Kesadaran Masyarakat. Melalui k...
67013Kembangkan pernyataan tesis yang menangkap ese...Artikel ini akan mengeksplorasi bagaimana berb...Pernyataan tesis dari artikel ini adalah bahwa...
67015Cocokkan masukan berikut dengan jenis tugas pe...Gambar kucingPembelajaran mesin dalam klasifikasi gambar.
\n", "

25253 rows × 3 columns

\n", "
" ], "text/plain": [ " instruction \\\n", "7 Kategorikan film ini ke dalam salah satu genre... \n", "9 Ubah string berikut menjadi huruf kecil penuh. \n", "19 Diberi string berikut 'a1b2c3', ubah menjadi '... \n", "20 Berikan ulasan untuk restoran dalam 200 kata a... \n", "22 Buat hidangan baru menggunakan bahan-bahan ber... \n", "... ... \n", "67010 Beri judul artikel yang diberikan. \n", "67011 Tulis ulang cerita sehingga memiliki akhir yan... \n", "67012 Sarankan langkah kebijakan baru untuk mengatas... \n", "67013 Kembangkan pernyataan tesis yang menangkap ese... \n", "67015 Cocokkan masukan berikut dengan jenis tugas pe... \n", "\n", " input \\\n", "7 Revenant \n", "9 Halo \n", "19 a1b2c3 \n", "20 Gajah Liar \n", "22 Wortel, bawang putih, bawang merah, brokoli \n", "... ... \n", "67010 Artikel ini akan membahas masalah mendesak def... \n", "67011 Alkisah, hiduplah seekor unicorn ajaib di huta... \n", "67012 Ada tren kejahatan dunia maya yang berkembang ... \n", "67013 Artikel ini akan mengeksplorasi bagaimana berb... \n", "67015 Gambar kucing \n", "\n", " output \n", "7 Drama. \n", "9 halo \n", "19 dapat diubah menjadi 'abc123' dengan menggunak... \n", "20 Restoran Gajah Liar adalah tempat yang menyaji... \n", "22 dapat dipadukan menjadi hidangan sehat yang le... \n", "... ... \n", "67010 \"Urgensi Menangani Deforestasi: Dampaknya terh... \n", "67011 Tapi, kekagumannya berubah menjadi kesedihan k... \n", "67012 1. Peningkatan Kesadaran Masyarakat. Melalui k... \n", "67013 Pernyataan tesis dari artikel ini adalah bahwa... \n", "67015 Pembelajaran mesin dalam klasifikasi gambar. 
\n", "\n", "[25253 rows x 3 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI_WithInput = df_MBZUAI[df_MBZUAI['input']!='']\n", "df_MBZUAI_WithInput" ] }, { "cell_type": "code", "execution_count": 14, "id": "3e4353a9-dfc2-4958-aac5-c48268c974aa", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_486/3992297062.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_MBZUAI_WithInput['QnA'] = \"<|prompt|>\" + df_MBZUAI_WithInput[\"instruction\"].astype(str) + \": \" + df_MBZUAI_WithInput[\"input\"].astype(str) + \"<|answer|>\" + df_MBZUAI_WithInput[\"output\"].astype(str)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutputQnA
7Kategorikan film ini ke dalam salah satu genre...RevenantDrama.<|prompt|>Kategorikan film ini ke dalam salah ...
9Ubah string berikut menjadi huruf kecil penuh.Halohalo<|prompt|>Ubah string berikut menjadi huruf ke...
19Diberi string berikut 'a1b2c3', ubah menjadi '...a1b2c3dapat diubah menjadi 'abc123' dengan menggunak...<|prompt|>Diberi string berikut 'a1b2c3', ubah...
20Berikan ulasan untuk restoran dalam 200 kata a...Gajah LiarRestoran Gajah Liar adalah tempat yang menyaji...<|prompt|>Berikan ulasan untuk restoran dalam ...
22Buat hidangan baru menggunakan bahan-bahan ber...Wortel, bawang putih, bawang merah, brokolidapat dipadukan menjadi hidangan sehat yang le...<|prompt|>Buat hidangan baru menggunakan bahan...
...............
67010Beri judul artikel yang diberikan.Artikel ini akan membahas masalah mendesak def...\"Urgensi Menangani Deforestasi: Dampaknya terh...<|prompt|>Beri judul artikel yang diberikan.: ...
67011Tulis ulang cerita sehingga memiliki akhir yan...Alkisah, hiduplah seekor unicorn ajaib di huta...Tapi, kekagumannya berubah menjadi kesedihan k...<|prompt|>Tulis ulang cerita sehingga memiliki...
67012Sarankan langkah kebijakan baru untuk mengatas...Ada tren kejahatan dunia maya yang berkembang ...1. Peningkatan Kesadaran Masyarakat. Melalui k...<|prompt|>Sarankan langkah kebijakan baru untu...
67013Kembangkan pernyataan tesis yang menangkap ese...Artikel ini akan mengeksplorasi bagaimana berb...Pernyataan tesis dari artikel ini adalah bahwa...<|prompt|>Kembangkan pernyataan tesis yang men...
67015Cocokkan masukan berikut dengan jenis tugas pe...Gambar kucingPembelajaran mesin dalam klasifikasi gambar.<|prompt|>Cocokkan masukan berikut dengan jeni...
\n", "

25253 rows × 4 columns

\n", "
" ], "text/plain": [ " instruction \\\n", "7 Kategorikan film ini ke dalam salah satu genre... \n", "9 Ubah string berikut menjadi huruf kecil penuh. \n", "19 Diberi string berikut 'a1b2c3', ubah menjadi '... \n", "20 Berikan ulasan untuk restoran dalam 200 kata a... \n", "22 Buat hidangan baru menggunakan bahan-bahan ber... \n", "... ... \n", "67010 Beri judul artikel yang diberikan. \n", "67011 Tulis ulang cerita sehingga memiliki akhir yan... \n", "67012 Sarankan langkah kebijakan baru untuk mengatas... \n", "67013 Kembangkan pernyataan tesis yang menangkap ese... \n", "67015 Cocokkan masukan berikut dengan jenis tugas pe... \n", "\n", " input \\\n", "7 Revenant \n", "9 Halo \n", "19 a1b2c3 \n", "20 Gajah Liar \n", "22 Wortel, bawang putih, bawang merah, brokoli \n", "... ... \n", "67010 Artikel ini akan membahas masalah mendesak def... \n", "67011 Alkisah, hiduplah seekor unicorn ajaib di huta... \n", "67012 Ada tren kejahatan dunia maya yang berkembang ... \n", "67013 Artikel ini akan mengeksplorasi bagaimana berb... \n", "67015 Gambar kucing \n", "\n", " output \\\n", "7 Drama. \n", "9 halo \n", "19 dapat diubah menjadi 'abc123' dengan menggunak... \n", "20 Restoran Gajah Liar adalah tempat yang menyaji... \n", "22 dapat dipadukan menjadi hidangan sehat yang le... \n", "... ... \n", "67010 \"Urgensi Menangani Deforestasi: Dampaknya terh... \n", "67011 Tapi, kekagumannya berubah menjadi kesedihan k... \n", "67012 1. Peningkatan Kesadaran Masyarakat. Melalui k... \n", "67013 Pernyataan tesis dari artikel ini adalah bahwa... \n", "67015 Pembelajaran mesin dalam klasifikasi gambar. \n", "\n", " QnA \n", "7 <|prompt|>Kategorikan film ini ke dalam salah ... \n", "9 <|prompt|>Ubah string berikut menjadi huruf ke... \n", "19 <|prompt|>Diberi string berikut 'a1b2c3', ubah... \n", "20 <|prompt|>Berikan ulasan untuk restoran dalam ... \n", "22 <|prompt|>Buat hidangan baru menggunakan bahan... \n", "... ... \n", "67010 <|prompt|>Beri judul artikel yang diberikan.: ... 
\n", "67011 <|prompt|>Tulis ulang cerita sehingga memiliki... \n", "67012 <|prompt|>Sarankan langkah kebijakan baru untu... \n", "67013 <|prompt|>Kembangkan pernyataan tesis yang men... \n", "67015 <|prompt|>Cocokkan masukan berikut dengan jeni... \n", "\n", "[25253 rows x 4 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI_WithInput['QnA'] = \"<|prompt|>\" + df_MBZUAI_WithInput[\"instruction\"].astype(str) + \": \" + df_MBZUAI_WithInput[\"input\"].astype(str) + \"<|answer|>\" + df_MBZUAI_WithInput[\"output\"].astype(str)\n", "df_MBZUAI_WithInput" ] }, { "cell_type": "markdown", "id": "5f66058e-7a72-4807-a2dd-6183f5e2746c", "metadata": {}, "source": [ "Now combine the two DataFrames into one using pandas `concat`." ] }, { "cell_type": "code", "execution_count": 15, "id": "d1a07185-9e53-4c06-a2c7-e537fb024971", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instructioninputoutputQnA
0Laporkan suhu rata-rata di Scottsdale, AZ pada...Tidak ditemukan data suhu rata-rata untuk Scot...<|prompt|>Laporkan suhu rata-rata di Scottsdal...
1Bagaimana cara kerja turbin angin?Turbin angin bekerja dengan mengubah energi ki...<|prompt|>Bagaimana cara kerja turbin angin?<|...
2Hasilkan daftar ras anjing paling populer di AS.1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ...<|prompt|>Hasilkan daftar ras anjing paling po...
3Apa itu jaringan pribadi virtual (VPN)?Jaringan pribadi virtual (VPN) adalah sebuah t...<|prompt|>Apa itu jaringan pribadi virtual (VP...
4Identifikasi tema puisi Pablo Neruda \"If You F...Tema puisi \"If You Forget Me\" karya Pablo Neru...<|prompt|>Identifikasi tema puisi Pablo Neruda...
...............
67010Beri judul artikel yang diberikan.Artikel ini akan membahas masalah mendesak def...\"Urgensi Menangani Deforestasi: Dampaknya terh...<|prompt|>Beri judul artikel yang diberikan.: ...
67011Tulis ulang cerita sehingga memiliki akhir yan...Alkisah, hiduplah seekor unicorn ajaib di huta...Tapi, kekagumannya berubah menjadi kesedihan k...<|prompt|>Tulis ulang cerita sehingga memiliki...
67012Sarankan langkah kebijakan baru untuk mengatas...Ada tren kejahatan dunia maya yang berkembang ...1. Peningkatan Kesadaran Masyarakat. Melalui k...<|prompt|>Sarankan langkah kebijakan baru untu...
67013Kembangkan pernyataan tesis yang menangkap ese...Artikel ini akan mengeksplorasi bagaimana berb...Pernyataan tesis dari artikel ini adalah bahwa...<|prompt|>Kembangkan pernyataan tesis yang men...
67015Cocokkan masukan berikut dengan jenis tugas pe...Gambar kucingPembelajaran mesin dalam klasifikasi gambar.<|prompt|>Cocokkan masukan berikut dengan jeni...
\n", "

67017 rows × 4 columns

\n", "
" ], "text/plain": [ " instruction \\\n", "0 Laporkan suhu rata-rata di Scottsdale, AZ pada... \n", "1 Bagaimana cara kerja turbin angin? \n", "2 Hasilkan daftar ras anjing paling populer di AS. \n", "3 Apa itu jaringan pribadi virtual (VPN)? \n", "4 Identifikasi tema puisi Pablo Neruda \"If You F... \n", "... ... \n", "67010 Beri judul artikel yang diberikan. \n", "67011 Tulis ulang cerita sehingga memiliki akhir yan... \n", "67012 Sarankan langkah kebijakan baru untuk mengatas... \n", "67013 Kembangkan pernyataan tesis yang menangkap ese... \n", "67015 Cocokkan masukan berikut dengan jenis tugas pe... \n", "\n", " input \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "... ... \n", "67010 Artikel ini akan membahas masalah mendesak def... \n", "67011 Alkisah, hiduplah seekor unicorn ajaib di huta... \n", "67012 Ada tren kejahatan dunia maya yang berkembang ... \n", "67013 Artikel ini akan mengeksplorasi bagaimana berb... \n", "67015 Gambar kucing \n", "\n", " output \\\n", "0 Tidak ditemukan data suhu rata-rata untuk Scot... \n", "1 Turbin angin bekerja dengan mengubah energi ki... \n", "2 1. Labradoodle\\n2. Bulldog Inggris\\n3. Golden ... \n", "3 Jaringan pribadi virtual (VPN) adalah sebuah t... \n", "4 Tema puisi \"If You Forget Me\" karya Pablo Neru... \n", "... ... \n", "67010 \"Urgensi Menangani Deforestasi: Dampaknya terh... \n", "67011 Tapi, kekagumannya berubah menjadi kesedihan k... \n", "67012 1. Peningkatan Kesadaran Masyarakat. Melalui k... \n", "67013 Pernyataan tesis dari artikel ini adalah bahwa... \n", "67015 Pembelajaran mesin dalam klasifikasi gambar. \n", "\n", " QnA \n", "0 <|prompt|>Laporkan suhu rata-rata di Scottsdal... \n", "1 <|prompt|>Bagaimana cara kerja turbin angin?<|... \n", "2 <|prompt|>Hasilkan daftar ras anjing paling po... \n", "3 <|prompt|>Apa itu jaringan pribadi virtual (VP... \n", "4 <|prompt|>Identifikasi tema puisi Pablo Neruda... \n", "... ... \n", "67010 <|prompt|>Beri judul artikel yang diberikan.: ... 
\n", "67011 <|prompt|>Tulis ulang cerita sehingga memiliki... \n", "67012 <|prompt|>Sarankan langkah kebijakan baru untu... \n", "67013 <|prompt|>Kembangkan pernyataan tesis yang men... \n", "67015 <|prompt|>Cocokkan masukan berikut dengan jeni... \n", "\n", "[67017 rows x 4 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_MBZUAI = pd.concat([df_MBZUAI_NoInput,df_MBZUAI_WithInput])\n", "df_MBZUAI" ] }, { "cell_type": "markdown", "id": "44ab38dc-1a62-4ff8-a618-e262f0657f49", "metadata": {}, "source": [ "Here we go. This is the step that uses a huge amount of VRAM: we calculate the perplexity over the whole dataset.  \n", "It creates a column called \"PPL\" that holds the perplexity of the text in the \"QnA\" column of each row." ] }, { "cell_type": "code", "execution_count": null, "id": "732140c1-8ea0-40fa-bc24-48d829b06216", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%time\n", "df_MBZUAI['PPL'] = df_MBZUAI['QnA'].apply(Perplexity)\n", "df_MBZUAI" ] }, { "cell_type": "code", "execution_count": null, "id": "8364aaa6-d5b6-4335-94a0-fff672ceab97", "metadata": {}, "outputs": [], "source": [ "df_MBZUAI.to_csv(\"Eval-MBZUAI_Merak-7B-v1.csv\")" ] }, { "cell_type": "markdown", "id": "e9587afc-48cc-47a3-bb5e-9e3f65adf6e7", "metadata": {}, "source": [ "## CITATION\n", "\n", "Priyanka, M. 2022. \"Perplexity of Language Models\", (Online), (https://medium.com/@priyankads/perplexity-of-language-models-41160427ed72, accessed 8 August 2023)."
] }, { "cell_type": "code", "execution_count": null, "id": "b4009246-5178-4f53-b54c-24d4b977ac89", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }