{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNHN2Xow++PNnfnmkcw4iqx",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/RAG_101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"## RAG 101"
],
"metadata": {
"id": "_PtvhdlwUmmb"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_L_iOECPT-Ki",
"outputId": "d432f4d1-49b1-4500-e686-cab5d82a1e15"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0.0/225.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[?25l \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0.0/76.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!pip install -q openai==1.6.0"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"from google.colab import userdata\n",
"\n",
"# Set the \"OPENAI_API_KEY\" in the Python environment. Will be used by OpenAI client later.\n",
"os.environ[\"OPENAI_API_KEY\"] = \"[OPENAI_API_KEY]\""
],
"metadata": {
"id": "nP6_Z0DJUerK"
},
"execution_count": null,
"outputs": []
},
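{
"cell_type": "markdown",
"source": [
"As an alternative, the key can be read from Colab's Secrets panel instead of being pasted into the code. The cell below is only a sketch; it assumes a secret named `OPENAI_API_KEY` has already been created in Colab and granted access to this notebook."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Optional alternative: read the key from Colab's Secrets panel instead of\n",
"# hard-coding it. This assumes a secret named \"OPENAI_API_KEY\" exists and has\n",
"# been shared with this notebook.\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = userdata.get(\"OPENAI_API_KEY\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},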
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"\n",
"# Defining the \"client\" object that enables\n",
"# us to connect to OpenAI API endpoints.\n",
"client = OpenAI()"
],
"metadata": {
"id": "IPmDcnKlUiY5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create a user prompt with the user's question\n",
"prompt = f\"How many parameters LLaMA 3 Models have?\"\n",
"\n",
"#Call the OpenAI API\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0.0,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
")\n",
"print(response.choices[0].message.content)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yplCUS2pUiU1",
"outputId": "657e2a95-a380-4255-c33e-3964554a115a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"As of my last update in October 2023, the LLaMA 3 models have not been officially released, and specific details about their architecture, including the number of parameters, have not been disclosed. The LLaMA (Large Language Model Meta AI) series from Meta (formerly Facebook) includes LLaMA 1 and LLaMA 2, which have various model sizes ranging from 7 billion to 70 billion parameters.\n",
"\n",
"For the most accurate and up-to-date information regarding LLaMA 3 or any other models, I recommend checking the official announcements from Meta or relevant research publications.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"response.choices[0].message.content.strip()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"id": "sFM5-GEgUiRR",
"outputId": "0f594cd2-720e-431b-a640-eed28eec0017"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'As of my last update in October 2023, the LLaMA 3 models have not been officially released, and specific details about their architecture, including the number of parameters, have not been disclosed. The LLaMA (Large Language Model Meta AI) series from Meta (formerly Facebook) includes LLaMA 1 and LLaMA 2, which have various model sizes ranging from 7 billion to 70 billion parameters.\\n\\nFor the most accurate and up-to-date information regarding LLaMA 3 or any other models, I recommend checking the official announcements from Meta or relevant research publications.'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"source": [
"# An article about the release of LLaMA 3 model.\n",
"# https://towardsai.net/p/artificial-intelligence/some-technical-notes-about-llama-3#.\n",
"ARTICLE = \"\"\"\n",
"Since the debut of the original version, Llama has become one of the foundational blocks of the open source generative AI space. I prefer to use the term βopen models,β given that these releases are not completely open source, but thatβs just my preference. Last week, the trend in open models became even hotter with the release Llama 3. The release of Llama 3 builds on incredible momentum within the open model ecosystem and brings its own innovations. The 8B and 70B versions of Llama 3 are available, with a 400B version currently being trained. The Llama 3 architecture is based on a decoder-only model and includes a new, highly optimized 128k tokenizer. This is quite notable, given that, with few exceptions, most large language models simply reuse the same tokenizers. The new tokenizer leads to major performance gains. Another area of improvement in the architecture is the grouped query attention, which was already used in Llama 2 but has been enhanced for the larger models. Grouped query attention helps improve inference performance by caching key parameters. Additionally, the context window has also increased.\n",
"Training is one area in which Llama 3 drastically improves over its predecessors. The model was trained on 15 trillion tokens, making the corpus quite large for an 8B parameter model, which speaks to the level of optimization Meta achieved in this release. Itβs interesting to note that only 5% of the training corpus consisted of non-English tokens. The training infrastructure utilized 16,000 GPUs, achieving a throughput of 400 TFLOPs, which is nothing short of monumental.\n",
"Architecture Meta AIβs Llama 3 features a standard, decoder-only transformer structure. Llama 3 introduces a tokenizer equipped with a 128K token vocabulary, which enhances language encoding efficiency, significantly boosting model performance. To enhance the inference capabilities, Llama 3 integrates grouped query attention (GQA) across models sized at 8B and 70B. These models are trained with sequences up to 8,192 tokens long, using a masking technique to prevent self-attention across document boundaries.\n",
"1)Tokenizer The latest iteration of Llama 3 showcases an innovative tokenizer. This tokenizer operates with a vocabulary comprising 128K tokens, optimized beyond its predecessors to yield superior inference performance. Notably, the Llama 3β8B model was trained using an impressive 15 trillion tokens, a feat made possible through effective parameter utilization.\n",
"2) GQA Grouped-query attention (GQA) ingeniously combines aspects of multi-head attention (MHA) and multi-query attention (MQA) to form an efficient attention mechanism. By caching keys and values from prior tokens, GQA lessens memory demands as batch sizes or context windows expand, thereby streamlining the decoding process in Transformer models.\n",
"3) RoPE Llama 3 employs Rotary Positional Encoding (RoPE), a sophisticated encoding mechanism that strikes a balance between absolute positional encodings and relative positional encodings. This method not only retains a fixed embedding for each token but also applies a rotational computation to the vectors, enhancing the modelβs attention calculations.\n",
"4) KV Cache Key-Value (KV) caching is a technique deployed to speed up the inference in autoregressive models like GPT and Llama. By storing previously computed keys and values, the model reduces repetitive calculations, thus expediting matrix multiplications and enhancing overall efficiency.\n",
"Training Meta AI has pre-trained Llama 3 on over 15 trillion tokens gathered from public sources. The training set is seven times larger than that used for Llama 2 and includes a significantly higher volume of code. With more than 5% of the training data consisting of high-quality, non-English content covering over 30 languages, Llama 3 prepares for multilingual applications, although performance in these languages may not equal that in English.\n",
"In pursuit of the highest data quality, Meta AI developed sophisticated filtering systems, including heuristic and NSFW filters, semantic deduplication, and text classifiers. These systems were refined using insights from previous model generations, particularly Llama 2, which was instrumental in generating training data for Llama 3βs quality-assurance classifiers. For its largest models, Llama 3 utilizes a trio of parallelization strategies: data, model, and pipeline parallelization. Its most effective setup reaches over 400 TFLOPS per GPU, facilitated by training on 16,000 GPUs simultaneously within two custom-built 24,000 GPU clusters. Meta AI has also innovated a new training stack that automates error detection, handling, and maintenance to optimize GPU utilization.\n",
"Llama 3 Instruct In refining its pretrained models for chat applications, Meta AI has employed a hybrid of supervised fine-tuning (SFT), rejection sampling, proximal policy optimization (PPO), and direct preference optimization (DPO). The selection and quality assurance of prompts and preference rankings significantly influence model performance. Moreover, to ensure model safety, these instruction-fine-tuned models undergo rigorous testing, including red-teaming by experts using adversarial prompts to identify and mitigate potential misuse risks.\n",
"The Results Llama 3 achieves top-tier performance across leading industry benchmarks like MMLU and CommonSense QA.\n",
"Additionally, Meta AI has curated a new, high-quality human evaluation set comprising 1,800 prompts spanning 12 critical use cases. Access to this set is restricted even within Meta AI to prevent potential overfitting by the modeling teams.\n",
"An Impressive Model Llama 3 is a very welcome addition to the open model generative AI stack. The initial benchmark results are quite impressive, and the 400B version could rival GPT-4. Distribution is one area where Meta excelled in this release, making Llama 3 available on all major machine learning platforms. Itβs been just a few hours, and we are already seeing open source innovations using Llama 3.\"\"\""
],
"metadata": {
"id": "a1XgODXdUiN3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create a user prompt with the user's question\n",
"prompt = f\"Use the following article as the source and answer the question:\\n\\n{ARTICLE}\\n\\nHow many parameters LLaMA 3 Models have?\"\n",
"\n",
"# Call the OpenAI API\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0.0,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
" )"
],
"metadata": {
"id": "KnF52b8IUiEv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print( response.choices[0].message.content.strip() )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7gMvLiitVEdq",
"outputId": "0245078b-fb3b-498a-ac9a-81f7af396d81"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Llama 3 has models with 8 billion (8B) and 70 billion (70B) parameters, with a 400 billion (400B) version currently being trained.\n"
]
}
]
},
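{
"cell_type": "markdown",
"source": [
"With the article pasted into the prompt, the model now answers from the provided context instead of its training data. The cell below is a minimal sketch of the \"retrieval\" step that full RAG pipelines add on top of this: the article is split into paragraph-sized chunks, each chunk is scored against the question with plain keyword overlap, and only the best-matching chunk is sent to the model. The chunking and scoring here are illustrative assumptions; a production system would typically use an embedding model and a vector store instead."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# A minimal, illustrative retrieval step.\n",
"# Assumption: plain keyword overlap is used as the relevance score instead of embeddings.\n",
"question = \"How many parameters do the LLaMA 3 models have?\"\n",
"\n",
"# 1) \"Index\": split the article into paragraph-sized chunks.\n",
"chunks = [chunk.strip() for chunk in ARTICLE.split(\"\\n\") if chunk.strip()]\n",
"\n",
"# 2) \"Retrieve\": score each chunk by how many of the question's words it contains.\n",
"def keyword_overlap(question, chunk):\n",
"    question_words = set(question.lower().replace(\"?\", \"\").split())\n",
"    chunk_words = set(chunk.lower().split())\n",
"    return len(question_words & chunk_words)\n",
"\n",
"best_chunk = max(chunks, key=lambda chunk: keyword_overlap(question, chunk))\n",
"\n",
"# 3) \"Generate\": answer the question using only the retrieved chunk as context.\n",
"prompt = f\"Use the following context to answer the question:\\n\\n{best_chunk}\\n\\nQuestion: {question}\"\n",
"\n",
"response = client.chat.completions.create(\n",
"    model=\"gpt-4o-mini\",\n",
"    temperature=0.0,\n",
"    messages=[\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful assistant\"},\n",
"        {\"role\": \"user\", \"content\": prompt}\n",
"    ]\n",
")\n",
"print(response.choices[0].message.content)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},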
{
"cell_type": "code",
"source": [
"len( ARTICLE.split(\" \") )"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vyUkCv0yVEZq",
"outputId": "996108d0-253a-4024-e3e2-e45ad160a3f4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"881"
]
},
"metadata": {},
"execution_count": 29
}
]
},
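{
"cell_type": "markdown",
"source": [
"The word count above is only a rough proxy for how much of the model's context window the article consumes. The cell below installs `tiktoken` and counts actual tokens; using the `o200k_base` encoding is an assumption made here to approximate the tokenizer behind `gpt-4o-mini`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"!pip install -q tiktoken\n",
"\n",
"import tiktoken\n",
"\n",
"# \"o200k_base\" is assumed here as the encoding used by the gpt-4o model family.\n",
"encoding = tiktoken.get_encoding(\"o200k_base\")\n",
"\n",
"article_tokens = encoding.encode(ARTICLE)\n",
"print(f\"Words : {len(ARTICLE.split(' '))}\")\n",
"print(f\"Tokens: {len(article_tokens)}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},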
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "nqSwasoUVEXC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "vRm-3T5NVEUX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "q1uzBZbAVERn"
},
"execution_count": null,
"outputs": []
}
]
} |