diff --git "a/language_modeling_ipynb.ipynb" "b/language_modeling_ipynb.ipynb"
new file mode 100644
--- /dev/null
+++ "b/language_modeling_ipynb.ipynb"
@@ -0,0 +1,2532 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "NinXqXib_ST4",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "cdf8180e-242a-49b0-b551-8fab2a072566"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
+ "Collecting datasets\n",
+ " Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.24.7)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n",
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
+ "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+ "Collecting xxhash (from datasets)\n",
+ " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+ "Collecting multiprocess<0.70.17 (from datasets)\n",
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
+ "Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
+ " Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
+ "Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: xxhash, fsspec, dill, multiprocess, datasets\n",
+ " Attempting uninstall: fsspec\n",
+ " Found existing installation: fsspec 2024.10.0\n",
+ " Uninstalling fsspec-2024.10.0:\n",
+ " Successfully uninstalled fsspec-2024.10.0\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 xxhash-3.5.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Transformers installation\n",
+ "! pip install transformers datasets\n",
+ "# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
+ "# ! pip install git+https://github.com/huggingface/transformers.git"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8GIHY6dx_ST5"
+ },
+ "source": [
+ "# Causal language modeling"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Kkn9GFBO_ST6"
+ },
+ "source": [
+ "There are two types of language modeling, causal and masked. This guide illustrates causal language modeling.\n",
+ "Causal language models are frequently used for text generation. You can use these models for creative applications like\n",
+ "choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "r6Nr-TH3_ST7"
+ },
+ "source": [
+ "Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on\n",
+ "the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model.\n",
+ "\n",
+ "This guide will show you how to:\n",
+ "\n",
+ "1. Finetune [DistilGPT2](https://huggingface.co./distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co./datasets/eli5) dataset.\n",
+ "2. Use your finetuned model for inference.\n",
+ "\n",
+ "\n",
+ "You can finetune other architectures for causal language modeling following the same steps in this guide.\n",
+ "Choose one of the following architectures:\n",
+ "\n",
+ "\n",
+ "[BART](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/bart), [BERT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/bert), [Bert Generation](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/bert-generation), [BigBird](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/big_bird), [BigBird-Pegasus](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/bigbird_pegasus), [BioGpt](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/biogpt), [Blenderbot](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/blenderbot), [BlenderbotSmall](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/blenderbot-small), [BLOOM](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/bloom), [CamemBERT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/camembert), [CodeGen](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/codegen), [CPM-Ant](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/cpmant), [CTRL](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/ctrl), [Data2VecText](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/data2vec-text), [ELECTRA](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/electra), [ERNIE](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/ernie), [GIT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/git), [GPT-Sw3](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt-sw3), [OpenAI GPT-2](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt2), [GPTBigCode](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt_bigcode), [GPT Neo](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt_neo), [GPT NeoX](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt_neox), [GPT NeoX Japanese](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gpt_neox_japanese), [GPT-J](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/gptj), [LLaMA](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/llama), [Marian](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/marian), [mBART](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/mbart), [MEGA](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/mega), [Megatron-BERT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/megatron-bert), [MVP](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/mvp), [OpenLlama](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/open-llama), [OpenAI GPT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/openai-gpt), [OPT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/opt), [Pegasus](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/pegasus), [PLBart](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/plbart), [ProphetNet](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/prophetnet), [QDQBert](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/qdqbert), [Reformer](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/reformer), [RemBERT](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/rembert), 
[RoBERTa](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/roberta), [RoBERTa-PreLayerNorm](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/roberta-prelayernorm), [RoCBert](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/roc_bert), [RoFormer](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/roformer), [RWKV](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/rwkv), [Speech2Text2](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/speech_to_text_2), [Transformer-XL](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/transfo-xl), [TrOCR](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/trocr), [XGLM](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xglm), [XLM](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xlm), [XLM-ProphetNet](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xlm-prophetnet), [XLM-RoBERTa](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xlm-roberta), [XLM-RoBERTa-XL](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xlm-roberta-xl), [XLNet](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xlnet), [X-MOD](https://huggingface.co./docs/transformers/main/en/tasks/../model_doc/xmod)\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "Before you begin, make sure you have all the necessary libraries installed:\n",
+ "\n",
+ "```bash\n",
+ "pip install transformers datasets evaluate\n",
+ "```\n",
+ "\n",
+ "We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "HNEFNepD_ST7",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 145,
+ "referenced_widgets": [
+ "c7cbac92b50c4b25b08ed53fb9d1e7ea",
+ "27462e9f8f584bd78302afbb558e7ce1",
+ "c4ddf3e11f494ce192e43ab0af76c1fb",
+ "dfc862db896046059198b5361979b4d0",
+ "897f46281fc04d059196c2a3f74e2602",
+ "8d0af4b805e443be87b93c65c4054481",
+ "3ef31c1b6fd149ada314af97d178fda7",
+ "685da064d0da4425bfe1caa922a73255",
+ "f069dd7f87f648bead5ca105108692a7",
+ "734ec20652934fb7973ea279e1cdcbd1",
+ "00e7f46767104689856ab69ee26e93ea",
+ "a6c08d8b8f464351a9015d1c92b2fe49",
+ "9c447393363e4f489c6b164853d52dba",
+ "219d008e9c184d37b5d44b4534cb740f",
+ "62a3cfa603bf4409958fd33a67421eea",
+ "20abd2c02bd34e2f81618c677bc69d7e",
+ "c64794312be24a3bb508fc6a012872e7",
+ "8708b3d8a0c44e65ba7150e56ba3f7b0",
+ "d1b201fad44a4df4af2578cb3c7d2963",
+ "99904d572ffe47e6a71560fc56fc331c",
+ "a6d2343e8b7145e48530efe26b4d467e",
+ "033330b6ae814b5ea1d816c778f7f72e",
+ "94e78d176d82449aac3722ae909b2fd6",
+ "1db9f91e5ce14354baf2fa76681daa87",
+ "59fe98107d04476bbd98c5e1c528f101",
+ "4291244fdfde4c518cef77c6ed17edfb",
+ "2a33f4d4a0cc407dbdc3781ffacbae3c",
+ "2538e209aa9749ea9d3aeb250e05aebf",
+ "419dcbefbd914099943cca505a3e1a07",
+ "75a7ccf7b458432aaff71a0a0ab28ef3",
+ "5d4c80291ded4492b87701256a245af7",
+ "fcf964c0a87641cfbca1fde6847beb9e"
+ ]
+ },
+ "outputId": "869a3c5e-b980-4c0c-851a-0b4e2ee8a10d"
+ },
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "VBox(children=(HTML(value='
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#@title\n",
+ "from IPython.display import HTML\n",
+ "\n",
+ "HTML('')"
+ ]
+ },
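+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load ELI5 dataset\n",
+    "\n",
+    "Start by loading a smaller subset of the r/askscience examples from the [ELI5](https://huggingface.co./datasets/eli5) dataset with the 🤗 Datasets library. This gives you a chance to experiment and make sure everything works before spending more time training on the full dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "eli5 = load_dataset(\"eli5\", split=\"train_asks[:5000]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Split the dataset's `train_asks` split into a train and test set with the [train_test_split](https://huggingface.co./docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eli5 = eli5.train_test_split(test_size=0.2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then take a look at an example:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eli5[\"train\"][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is that you don't need labels (also known as an unsupervised task) because the next word *is* the label.\n",
+    "\n",
+    "## Preprocess"
+   ]
+  },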
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UN9pd11c_ST-"
+ },
+ "source": [
+ "The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "O0BDNuZq_ST-"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tQNaKpNt_ST-"
+ },
+ "source": [
+ "You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to\n",
+ "extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co./docs/datasets/process.html#flatten) method:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KdRXyn6i_ST_"
+ },
+ "outputs": [],
+ "source": [
+ "eli5 = eli5.flatten()\n",
+ "eli5[\"train\"][0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9aBjaqlM_ST_"
+ },
+ "source": [
+ "Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead\n",
+ "of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them.\n",
+ "\n",
+ "Here is a first preprocessing function to join the list of strings for each example and tokenize the result:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "e37Cp9Lq_ST_"
+ },
+ "outputs": [],
+ "source": [
+ "def preprocess_function(examples):\n",
+ " return tokenizer([\" \".join(x) for x in examples[\"answers.text\"]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gLic8Ek9_ST_"
+ },
+ "source": [
+ "To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [map](https://huggingface.co./docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tK0s7hcf_ST_"
+ },
+ "outputs": [],
+ "source": [
+ "tokenized_eli5 = eli5.map(\n",
+ " preprocess_function,\n",
+ " batched=True,\n",
+ " num_proc=4,\n",
+ " remove_columns=eli5[\"train\"].column_names,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7RzsXBTC_ST_"
+ },
+ "source": [
+ "This dataset contains the token sequences, but some of these are longer than the maximum input length for the model.\n",
+ "\n",
+ "You can now use a second preprocessing function to\n",
+ "- concatenate all the sequences\n",
+ "- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7AzXGvbw_SUA"
+ },
+ "outputs": [],
+ "source": [
+ "block_size = 128\n",
+ "\n",
+ "\n",
+ "def group_texts(examples):\n",
+ " # Concatenate all texts.\n",
+ " concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
+ " total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
+ " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
+ " # customize this part to your needs.\n",
+ " if total_length >= block_size:\n",
+ " total_length = (total_length // block_size) * block_size\n",
+ " # Split by chunks of block_size.\n",
+ " result = {\n",
+ " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
+ " for k, t in concatenated_examples.items()\n",
+ " }\n",
+ " result[\"labels\"] = result[\"input_ids\"].copy()\n",
+ " return result"
+ ]
+ },
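+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To see the chunking behavior concretely, here's a small illustrative check on dummy data (a sketch, not required for training):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 300 dummy tokens with block_size=128 yield two full 128-token blocks;\n",
+    "# the 44-token remainder is dropped, and labels are a copy of input_ids.\n",
+    "dummy = {\"input_ids\": [list(range(300))], \"attention_mask\": [[1] * 300]}\n",
+    "chunks = group_texts(dummy)\n",
+    "print([len(block) for block in chunks[\"input_ids\"]])  # [128, 128]\n",
+    "print(chunks[\"labels\"] == chunks[\"input_ids\"])  # True"
+   ]
+  },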
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sOyQRzsv_SUA"
+ },
+ "source": [
+ "Apply the `group_texts` function over the entire dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Z7t6EE7P_SUA"
+ },
+ "outputs": [],
+ "source": [
+ "lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ugfVrCmC_SUA"
+ },
+ "source": [
+ "Now create a batch of examples using [DataCollatorForLanguageModeling](https://huggingface.co./docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling). It's more efficient to *dynamically pad* the\n",
+ "sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.\n",
+ "\n",
+ "Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rmHliRoe_SUA"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import DataCollatorForLanguageModeling\n",
+ "\n",
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
+ ]
+ },
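+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an optional sanity check (an illustrative sketch using the `lm_dataset` and `data_collator` defined above), collate two processed examples and confirm that `labels` mirror `input_ids`; the one-position shift happens inside the model when it computes the loss:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# With mlm=False the collator copies input_ids into labels; positions equal\n",
+    "# to the padding (EOS) token are set to -100 so the loss ignores them.\n",
+    "batch = data_collator([lm_dataset[\"train\"][i] for i in range(2)])\n",
+    "print(batch[\"input_ids\"].shape, batch[\"labels\"].shape)"
+   ]
+  },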
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iIWXgoTI_SUA"
+ },
+ "source": [
+ "## Train"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "P-Xnbyai_SUA"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the [basic tutorial](https://huggingface.co./docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!\n",
+ "\n",
+ "\n",
+ "\n",
+ "You're ready to start training your model now! Load DistilGPT2 with [AutoModelForCausalLM](https://huggingface.co./docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "RZzJ4kdU_SUB"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\"distilgpt2\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QEG5vHkA_SUB"
+ },
+ "source": [
+ "At this point, only three steps remain:\n",
+ "\n",
+ "1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model).\n",
+ "2. Pass the training arguments to [Trainer](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, datasets, and data collator.\n",
+ "3. Call [train()](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QFxucb5l_SUB"
+ },
+ "outputs": [],
+ "source": [
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"my_awesome_eli5_clm-model\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " weight_decay=0.01,\n",
+ " push_to_hub=True,\n",
+ ")\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " train_dataset=lm_dataset[\"train\"],\n",
+ " eval_dataset=lm_dataset[\"test\"],\n",
+ " data_collator=data_collator,\n",
+ ")\n",
+ "\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QPwqvjJt_SUB"
+ },
+ "source": [
+ "Once training is completed, use the [evaluate()](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.Trainer.evaluate) method to evaluate your model and get its perplexity:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "v4l5uJvZ_SUB"
+ },
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "\n",
+ "eval_results = trainer.evaluate()\n",
+ "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4YPTUv2X_SUC"
+ },
+ "source": [
+ "Then share your model to the Hub with the [push_to_hub()](https://huggingface.co./docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "M9gA4ae0_SUC"
+ },
+ "outputs": [],
+ "source": [
+ "trainer.push_to_hub()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wko6NzBL_SUG"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding\n",
+ "[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)\n",
+ "or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lW95xHiy_SUG"
+ },
+ "source": [
+ "## Inference"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MZiF18Tp_SUG"
+ },
+ "source": [
+ "Great, now that you've finetuned a model, you can use it for inference!\n",
+ "\n",
+ "Come up with a prompt you'd like to generate text from:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BMIbrIW__SUG"
+ },
+ "outputs": [],
+ "source": [
+ "prompt = \"Somatic hypermutation allows the immune system to\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SaHfWSK8_SUG"
+ },
+ "source": [
+ "The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co./docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for text generation with your model, and pass your text to it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vh3c1LZE_SUG"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "generator = pipeline(\"text-generation\", model=\"my_awesome_eli5_clm-model\")\n",
+ "generator(prompt)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9cTZrrHU_SUH"
+ },
+ "source": [
+ "Tokenize the text and return the `input_ids` as PyTorch tensors:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xKjYKsVE_SUH"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"my_awesome_eli5_clm-model\")\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sjz8NpIv_SUH"
+ },
+ "source": [
+ "Use the [generate()](https://huggingface.co./docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate) method to generate text.\n",
+ "For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](https://huggingface.co./docs/transformers/main/en/tasks/../generation_strategies) page."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "G-KrHWZ1_SUH"
+ },
+ "outputs": [],
+ "source": [
+ "from transformers import AutoModelForCausalLM\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\"my_awesome_eli5_clm-model\")\n",
+ "outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_ubarw6t_SUH"
+ },
+ "source": [
+ "Decode the generated token ids back into text:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6IKhmEr7_SUH"
+ },
+ "outputs": [],
+ "source": [
+ "tokenizer.batch_decode(outputs, skip_special_tokens=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "toc_visible": true
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "c7cbac92b50c4b25b08ed53fb9d1e7ea": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "VBoxModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "VBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "VBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_a6d2343e8b7145e48530efe26b4d467e",
+ "IPY_MODEL_033330b6ae814b5ea1d816c778f7f72e",
+ "IPY_MODEL_94e78d176d82449aac3722ae909b2fd6",
+ "IPY_MODEL_1db9f91e5ce14354baf2fa76681daa87"
+ ],
+ "layout": "IPY_MODEL_3ef31c1b6fd149ada314af97d178fda7"
+ }
+ },
+ "27462e9f8f584bd78302afbb558e7ce1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_685da064d0da4425bfe1caa922a73255",
+ "placeholder": "",
+ "style": "IPY_MODEL_f069dd7f87f648bead5ca105108692a7",
+ "value": "
Copy a token from your Hugging Face\ntokens page and paste it below. Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.