{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TRANSFORMER MODELS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transformers, what can they do?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sentiment Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co./distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\ACER\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'label': 'POSITIVE', 'score': 0.9598049521446228}]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "classifier = pipeline(\"sentiment-analysis\")\n",
    "classifier(\"I've been waiting for a HuggingFace course my whole life.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'label': 'POSITIVE', 'score': 0.9598049521446228},\n",
       " {'label': 'NEGATIVE', 'score': 0.9994558691978455}]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can pass several sentences\n",
    "classifier(\n",
    "    [\"I've been waiting for a HuggingFace course my whole life.\", \"I hate this so much!\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Zero-shot classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co./facebook/bart-large-mnli).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13af57499d894e8aa77c7ed39138d3dd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:  98%|#########8| 1.60G/1.63G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\ACER\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\huggingface_hub\\file_download.py:147: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\ACER\\.cache\\huggingface\\hub\\models--facebook--bart-large-mnli. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co./docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5184b998013d4eacac2a0e943ebcbfdf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "af001870e23b4808862f0f4e160327ef",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "743eb773e873441c813a1d13925215cf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f29eb797c99242558fe742a00411262c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'sequence': 'This is a course about the Transformers library.',\n",
       " 'labels': ['education', 'business', 'politics'],\n",
       " 'scores': [0.8719874024391174, 0.09406554698944092, 0.033947039395570755]}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "classifier = pipeline(\"zero-shot-classification\")\n",
    "\n",
    "classifier(\n",
    "    \"This is a course about the Transformers library.\",\n",
    "    candidate_labels = [\"education\", \"politics\", \"business\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Text generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co./openai-community/gpt2).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'generated_text': 'In this course, we will teach you how to build a custom script and a WebScript web server that uses the JQuery 4.3 framework.\\n\\nYou will run up to 60 minutes with a single setup, in our example JQuery J'}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "generator = pipeline(\"text-generation\")\n",
    "generator(\"In this course, we will teach you how to\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using any model from the Hub in a pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'generated_text': 'In this course, we will teach you how to implement an API that can only be used by a single user.\\n\\n\\nHere are the slides'},\n",
       " {'generated_text': 'In this course, we will teach you how to put food in order to reduce the risk of heart disease and even kill yourself as part of a program'}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "generator = pipeline(\"text-generation\", model=\"distilgpt2\")\n",
    "\n",
    "generator(\n",
    "    \"In this course, we will teach you how to\",\n",
    "    max_length=30,\n",
    "    num_return_sequences=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mask filling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to distilbert/distilroberta-base and revision fb53ab8 (https://huggingface.co./distilbert/distilroberta-base).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
      "Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
      "- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'score': 0.19198469817638397,\n",
       "  'token': 30412,\n",
       "  'token_str': ' mathematical',\n",
       "  'sequence': 'This course will teach you all about mathematical models.'},\n",
       " {'score': 0.04209211468696594,\n",
       "  'token': 38163,\n",
       "  'token_str': ' computational',\n",
       "  'sequence': 'This course will teach you all about computational models.'}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "unmasker = pipeline(\"fill-mask\")\n",
    "unmasker(\"This course will teach you all about <mask> models.\", top_k=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Named Entity Recognition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co./dbmdz/bert-large-cased-finetuned-conll03-english).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
      "Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
      "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "c:\\Users\\ACER\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\pipelines\\token_classification.py:170: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy=\"AggregationStrategy.SIMPLE\"` instead.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'entity_group': 'PER',\n",
       "  'score': 0.99884915,\n",
       "  'word': 'Ahmad',\n",
       "  'start': 11,\n",
       "  'end': 16},\n",
       " {'entity_group': 'ORG',\n",
       "  'score': 0.9950792,\n",
       "  'word': 'University of Engineering and Technology',\n",
       "  'start': 31,\n",
       "  'end': 71},\n",
       " {'entity_group': 'LOC',\n",
       "  'score': 0.97850055,\n",
       "  'word': 'Lahore',\n",
       "  'start': 73,\n",
       "  'end': 79},\n",
       " {'entity_group': 'ORG',\n",
       "  'score': 0.78072757,\n",
       "  'word': \"Bechelor ' s\",\n",
       "  'start': 95,\n",
       "  'end': 105},\n",
       " {'entity_group': 'ORG',\n",
       "  'score': 0.92247367,\n",
       "  'word': 'Computer Science',\n",
       "  'start': 109,\n",
       "  'end': 125}]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "ner = pipeline(\"ner\", grouped_entities=True)\n",
    "ner(\"My name is Ahmad and I work at University of Engineering and Technology, Lahore. I was prsuing Bechelor's of Computer Science.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Question answering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co./distilbert/distilbert-base-cased-distilled-squad).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "question_answerer = pipeline(\"question-answering\")\n",
    "\n",
    "ans = question_answerer(\n",
    "        question=\"where do I work?\",\n",
    "        context = \"My name is Ahmad and I work at University of Engineering and Technology, Lahore\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'University of Engineering and Technology, Lahore'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ans['answer']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Summarization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co./sshleifer/distilbart-cnn-12-6).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "summarizer = pipeline(\"summarization\")\n",
    "summary = summarizer(\n",
    "    \"\"\"\n",
    "    America has changed dramatically during recent years. Not only has the number of \n",
    "    graduates in traditional engineering disciplines such as mechanical, civil, \n",
    "    electrical, chemical, and aeronautical engineering declined, but in most of \n",
    "    the premier American universities engineering curricula now concentrate on \n",
    "    and encourage largely the study of engineering science. As a result, there \n",
    "    are declining offerings in engineering subjects dealing with infrastructure, \n",
    "    the environment, and related issues, and greater concentration on high \n",
    "    technology subjects, largely supporting increasingly complex scientific \n",
    "    developments. While the latter is important, it should not be at the expense \n",
    "    of more traditional engineering.\n",
    "\n",
    "    Rapidly developing economies such as China and India, as well as other \n",
    "    industrial countries in Europe and Asia, continue to encourage and advance \n",
    "    the teaching of engineering. Both China and India, respectively, graduate \n",
    "    six and eight times as many traditional engineers as does the United States. \n",
    "    Other industrial countries at minimum maintain their output, while America \n",
    "    suffers an increasingly serious decline in the number of engineering graduates \n",
    "    and a lack of well-educated engineers.\n",
    "\"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil,    electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .\n"
     ]
    }
   ],
   "source": [
    "print(summary[0]['summary_text'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Translation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sentencepiece"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e7521143fb794a39b66b0f5d00f9fac8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\ACER\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\huggingface_hub\\file_download.py:147: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\ACER\\.cache\\huggingface\\hub\\models--Helsinki-NLP--opus-mt-fr-en. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co./docs/huggingface_hub/how-to-cache#limitations.\n",
      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
      "  warnings.warn(message)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d658b08296d64e4081ac272272b520d7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "92ea52e7b8d446e7a21d844815c4045b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\ACER\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\transformers\\models\\marian\\tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
      "  warnings.warn(\"Recommended: pip install sacremoses.\")\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'translation_text': 'This course is produced by Hugging Face.'}]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sentencepiece\n",
    "from transformers import pipeline\n",
    "\n",
    "translator = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-fr-en\")\n",
    "translator(\"Ce cours est produit par Hugging Face.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bias and limitations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n",
      "  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co./docs/transformers/en/model_doc/auto#auto-classes\n",
      "  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n",
      "  - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n",
      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
      "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['carpenter', 'lawyer', 'farmer', 'businessman', 'doctor']\n",
      "['nurse', 'maid', 'teacher', 'waitress', 'prostitute']\n"
     ]
    }
   ],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "unmasker = pipeline(\"fill-mask\", model=\"bert-base-uncased\")\n",
    "result = unmasker(\"This man works as a [MASK].\")\n",
    "print([r[\"token_str\"] for r in result])\n",
    "\n",
    "result = unmasker(\"This woman works as a [MASK].\")\n",
    "print([r[\"token_str\"] for r in result])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huggingface-nlp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}