{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNHN2Xow++PNnfnmkcw4iqx",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/RAG_101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"## RAG 101"
],
"metadata": {
"id": "_PtvhdlwUmmb"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_L_iOECPT-Ki",
"outputId": "d432f4d1-49b1-4500-e686-cab5d82a1e15"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0.0/225.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0.0/76.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!pip install -q openai==1.6.0"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"from google.colab import userdata\n",
"\n",
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
"os.environ[\"OPENAI_API_KEY\"] = \"[OPENAI_API_KEY]\""
],
"metadata": {
"id": "nP6_Z0DJUerK"
},
"execution_count": null,
"outputs": []
},
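{
"cell_type": "markdown",
"source": [
"As an alternative, the key can be read from Colab's Secrets panel instead of being pasted into the code. The cell below is only a sketch; it assumes a secret named `OPENAI_API_KEY` has already been created in Colab and granted access to this notebook."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional alternative: read the key from Colab's Secrets panel instead of\n",
"# hard-coding it. This assumes a secret named \"OPENAI_API_KEY\" exists and has\n",
"# been shared with this notebook.\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = userdata.get(\"OPENAI_API_KEY\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},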
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"\n",
"# Defining the \"client\" object that enables\n",
"# us to connect to OpenAI API endpoints.\n",
"client = OpenAI()"
],
"metadata": {
"id": "IPmDcnKlUiY5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create a user prompt with the user's question\n",
"prompt = f\"How many parameters LLaMA 3 Models have?\"\n",
"\n",
"#Call the OpenAI API\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0.0,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
")\n",
"print(response.choices[0].message.content)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yplCUS2pUiU1",
"outputId": "657e2a95-a380-4255-c33e-3964554a115a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"As of my last update in October 2023, the LLaMA 3 models have not been officially released, and specific details about their architecture, including the number of parameters, have not been disclosed. The LLaMA (Large Language Model Meta AI) series from Meta (formerly Facebook) includes LLaMA 1 and LLaMA 2, which have various model sizes ranging from 7 billion to 70 billion parameters.\n",
"\n",
"For the most accurate and up-to-date information regarding LLaMA 3 or any other models, I recommend checking the official announcements from Meta or relevant research publications.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"response.choices[0].message.content.strip()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"id": "sFM5-GEgUiRR",
"outputId": "0f594cd2-720e-431b-a640-eed28eec0017"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'As of my last update in October 2023, the LLaMA 3 models have not been officially released, and specific details about their architecture, including the number of parameters, have not been disclosed. The LLaMA (Large Language Model Meta AI) series from Meta (formerly Facebook) includes LLaMA 1 and LLaMA 2, which have various model sizes ranging from 7 billion to 70 billion parameters.\\n\\nFor the most accurate and up-to-date information regarding LLaMA 3 or any other models, I recommend checking the official announcements from Meta or relevant research publications.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"source": [
"# An article about the release of LLaMA 3 model.\n",
"# https://towardsai.net/p/artificial-intelligence/some-technical-notes-about-llama-3#.\n",
"ARTICLE = \"\"\"\n",
"Since the debut of the original version, Llama has become one of the foundational blocks of the open source generative AI space. I prefer to use the term βopen models,β given that these releases are not completely open source, but thatβs just my preference. Last week, the trend in open models became even hotter with the release Llama 3. The release of Llama 3 builds on incredible momentum within the open model ecosystem and brings its own innovations. The 8B and 70B versions of Llama 3 are available, with a 400B version currently being trained. The Llama 3 architecture is based on a decoder-only model and includes a new, highly optimized 128k tokenizer. This is quite notable, given that, with few exceptions, most large language models simply reuse the same tokenizers. The new tokenizer leads to major performance gains. Another area of improvement in the architecture is the grouped query attention, which was already used in Llama 2 but has been enhanced for the larger models. Grouped query attention helps improve inference performance by caching key parameters. Additionally, the context window has also increased.\n",
"Training is one area in which Llama 3 drastically improves over its predecessors. The model was trained on 15 trillion tokens, making the corpus quite large for an 8B parameter model, which speaks to the level of optimization Meta achieved in this release. Itβs interesting to note that only 5% of the training corpus consisted of non-English tokens. The training infrastructure utilized 16,000 GPUs, achieving a throughput of 400 TFLOPs, which is nothing short of monumental.\n",
"Architecture Meta AIβs Llama 3 features a standard, decoder-only transformer structure. Llama 3 introduces a tokenizer equipped with a 128K token vocabulary, which enhances language encoding efficiency, significantly boosting model performance. To enhance the inference capabilities, Llama 3 integrates grouped query attention (GQA) across models sized at 8B and 70B. These models are trained with sequences up to 8,192 tokens long, using a masking technique to prevent self-attention across document boundaries.\n",
"1)Tokenizer The latest iteration of Llama 3 showcases an innovative tokenizer. This tokenizer operates with a vocabulary comprising 128K tokens, optimized beyond its predecessors to yield superior inference performance. Notably, the Llama 3β8B model was trained using an impressive 15 trillion tokens, a feat made possible through effective parameter utilization.\n",
"2) GQA Grouped-query attention (GQA) ingeniously combines aspects of multi-head attention (MHA) and multi-query attention (MQA) to form an efficient attention mechanism. By caching keys and values from prior tokens, GQA lessens memory demands as batch sizes or context windows expand, thereby streamlining the decoding process in Transformer models.\n",
"3) RoPE Llama 3 employs Rotary Positional Encoding (RoPE), a sophisticated encoding mechanism that strikes a balance between absolute positional encodings and relative positional encodings. This method not only retains a fixed embedding for each token but also applies a rotational computation to the vectors, enhancing the modelβs attention calculations.\n",
"4) KV Cache Key-Value (KV) caching is a technique deployed to speed up the inference in autoregressive models like GPT and Llama. By storing previously computed keys and values, the model reduces repetitive calculations, thus expediting matrix multiplications and enhancing overall efficiency.\n",
"Training Meta AI has pre-trained Llama 3 on over 15 trillion tokens gathered from public sources. The training set is seven times larger than that used for Llama 2 and includes a significantly higher volume of code. With more than 5% of the training data consisting of high-quality, non-English content covering over 30 languages, Llama 3 prepares for multilingual applications, although performance in these languages may not equal that in English.\n",
"In pursuit of the highest data quality, Meta AI developed sophisticated filtering systems, including heuristic and NSFW filters, semantic deduplication, and text classifiers. These systems were refined using insights from previous model generations, particularly Llama 2, which was instrumental in generating training data for Llama 3βs quality-assurance classifiers. For its largest models, Llama 3 utilizes a trio of parallelization strategies: data, model, and pipeline parallelization. Its most effective setup reaches over 400 TFLOPS per GPU, facilitated by training on 16,000 GPUs simultaneously within two custom-built 24,000 GPU clusters. Meta AI has also innovated a new training stack that automates error detection, handling, and maintenance to optimize GPU utilization.\n",
"Llama 3 Instruct In refining its pretrained models for chat applications, Meta AI has employed a hybrid of supervised fine-tuning (SFT), rejection sampling, proximal policy optimization (PPO), and direct preference optimization (DPO). The selection and quality assurance of prompts and preference rankings significantly influence model performance. Moreover, to ensure model safety, these instruction-fine-tuned models undergo rigorous testing, including red-teaming by experts using adversarial prompts to identify and mitigate potential misuse risks.\n",
"The Results Llama 3 achieves top-tier performance across leading industry benchmarks like MMLU and CommonSense QA.\n",
"Additionally, Meta AI has curated a new, high-quality human evaluation set comprising 1,800 prompts spanning 12 critical use cases. Access to this set is restricted even within Meta AI to prevent potential overfitting by the modeling teams.\n",
"An Impressive Model Llama 3 is a very welcome addition to the open model generative AI stack. The initial benchmark results are quite impressive, and the 400B version could rival GPT-4. Distribution is one area where Meta excelled in this release, making Llama 3 available on all major machine learning platforms. Itβs been just a few hours, and we are already seeing open source innovations using Llama 3.\"\"\""
],
"metadata": {
"id": "a1XgODXdUiN3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create a user prompt with the user's question\n",
"prompt = f\"Use the following article as the source and answer the question:\\n\\n{ARTICLE}\\n\\nHow many parameters LLaMA 3 Models have?\"\n",
"\n",
"# Call the OpenAI API\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0.0,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
" )"
],
"metadata": {
"id": "KnF52b8IUiEv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print( response.choices[0].message.content.strip() )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7gMvLiitVEdq",
"outputId": "0245078b-fb3b-498a-ac9a-81f7af396d81"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Llama 3 has models with 8 billion (8B) and 70 billion (70B) parameters, with a 400 billion (400B) version currently being trained.\n"
]
}
]
},
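{
"cell_type": "markdown",
"source": [
"With the article pasted into the prompt, the model now answers from the provided context instead of its training data. The cell below is a minimal sketch of the \"retrieval\" step that full RAG pipelines add on top of this: the article is split into paragraph-sized chunks, each chunk is scored against the question with plain keyword overlap, and only the best-matching chunk is sent to the model. The chunking and scoring here are illustrative assumptions; a production system would typically use an embedding model and a vector store instead."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# A minimal, illustrative retrieval step.\n",
"# Assumption: plain keyword overlap is used as the relevance score instead of embeddings.\n",
"question = \"How many parameters do the LLaMA 3 models have?\"\n",
"\n",
"# 1) \"Index\": split the article into paragraph-sized chunks.\n",
"chunks = [chunk.strip() for chunk in ARTICLE.split(\"\\n\") if chunk.strip()]\n",
"\n",
"# 2) \"Retrieve\": score each chunk by how many of the question's words it contains.\n",
"def keyword_overlap(question, chunk):\n",
"    question_words = set(question.lower().replace(\"?\", \"\").split())\n",
"    chunk_words = set(chunk.lower().split())\n",
"    return len(question_words & chunk_words)\n",
"\n",
"best_chunk = max(chunks, key=lambda chunk: keyword_overlap(question, chunk))\n",
"\n",
"# 3) \"Generate\": answer the question using only the retrieved chunk as context.\n",
"prompt = f\"Use the following context to answer the question:\\n\\n{best_chunk}\\n\\nQuestion: {question}\"\n",
"\n",
"response = client.chat.completions.create(\n",
"    model=\"gpt-4o-mini\",\n",
"    temperature=0.0,\n",
"    messages=[\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
"        {\"role\": \"user\", \"content\": prompt}\n",
"    ]\n",
")\n",
"print(response.choices[0].message.content)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},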
{
"cell_type": "code",
"source": [
"len( ARTICLE.split(\" \") )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vyUkCv0yVEZq",
"outputId": "996108d0-253a-4024-e3e2-e45ad160a3f4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"881"
]
},
"metadata": {},
"execution_count": 29
}
]
},
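{
"cell_type": "markdown",
"source": [
"The word count above is only a rough proxy for how much of the model's context window the article consumes. The cell below installs `tiktoken` and counts actual tokens; using the `o200k_base` encoding is an assumption made here to approximate the tokenizer behind `gpt-4o-mini`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"!pip install -q tiktoken\n",
"\n",
"import tiktoken\n",
"\n",
"# \"o200k_base\" is assumed here as the encoding used by the gpt-4o model family.\n",
"encoding = tiktoken.get_encoding(\"o200k_base\")\n",
"\n",
"article_tokens = encoding.encode(ARTICLE)\n",
"print(f\"Words : {len(ARTICLE.split(' '))}\")\n",
"print(f\"Tokens: {len(article_tokens)}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},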
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "nqSwasoUVEXC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "vRm-3T5NVEUX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "q1uzBZbAVERn"
},
"execution_count": null,
"outputs": []
}
]
} |