Andrew DalPino committed
Commit 0cc4ecd · 1 Parent(s): 19b8dfb

Add MFU estimation for Ampere GPUs

Files changed (3):
  1. instruction-tune.py +1 -1
  2. model.py +2 -2
  3. model_sizing.ipynb +60 -25
instruction-tune.py CHANGED
@@ -26,7 +26,7 @@ def main():
     parser.add_argument("--base_model_path", default="./out/checkpoint.pt", type=str)
     parser.add_argument("--batch_size", default=1, type=int)
     parser.add_argument("--gradient_accumulation_steps", default=128, type=int)
-    parser.add_argument("--learning_rate", default=1e-2, type=float)
+    parser.add_argument("--learning_rate", default=5e-4, type=float)
     parser.add_argument("--mask_input", default=True, type=bool)
     parser.add_argument("--rank", default=8, type=int)
     parser.add_argument("--alpha", default=1.0, type=float)
model.py CHANGED
@@ -1,5 +1,5 @@
 from math import sqrt, exp
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from functools import partial
 from typing import Iterator, Self
 
@@ -210,7 +210,7 @@ class GPT(Module):
         if beam_width <= 0:
             raise ValueError(f"Beam width must be greater than 0, {beam_width} given.")
 
-        @dataclass(order=True)
+        @dataclass
         class Candidate:
             log_probability: float
             tokens: Tensor
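Aside: dropping order=True here is more than cosmetic. A dataclass ordering compares fields in declaration order, so two Candidates tied on log_probability would fall through to comparing their tokens Tensors, and coercing a multi-element tensor comparison to a single bool raises a RuntimeError. If the beam search needs ranked candidates, sorting on the scalar field alone is enough; a minimal sketch with hypothetical values, not code from this repo:

from dataclasses import dataclass

import torch
from torch import Tensor

@dataclass
class Candidate:
    log_probability: float
    tokens: Tensor

# Hypothetical beam contents, for illustration only.
candidates = [
    Candidate(-2.3, torch.tensor([1, 7, 4])),
    Candidate(-0.9, torch.tensor([1, 2, 8])),
]

# Rank on the scalar field alone; with order=True, a tie on
# log_probability would compare the tokens tensors and raise.
best_first = sorted(candidates, key=lambda c: c.log_probability, reverse=True)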
model_sizing.ipynb CHANGED
@@ -1,5 +1,12 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Welcome! In this notebook we aim to estimate the compute and memory requirements needed to train a theoretical model architecture using LightGPT. We'll start by defining the parameters of the architecture."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 35,
@@ -17,7 +24,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "First, we'll estimate the total number of parameters in the network."
+    "Next, we'll estimate the total number of trainable parameters in the network."
    ]
   },
   {
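The notebook derives the parameter count analytically from the architecture's hyperparameters. For comparison, once a model is instantiated, PyTorch can produce the exact figure; a minimal sketch, where count_trainable_parameters and the model argument are illustrative names rather than anything from the notebook:

from torch import nn

def count_trainable_parameters(model: nn.Module) -> int:
    # Tally the elements of every tensor the optimizer will update.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)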
  "cell_type": "markdown",
268
  "metadata": {},
269
  "source": [
270
+ "Now, let's estimate the number of FLOPs using the method in the PaLM paper by Chowdhery, et al. Then, we'll compare the PaLM estimation with our own as a sanity check."
271
  ]
272
  },
273
  {
 
295
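For reference, the estimator from the PaLM paper (Chowdhery et al., Appendix B) counts roughly 6 matmul FLOPs per parameter per token for the combined forward and backward passes, plus a parameter-free term for the attention matmuls. A sketch of that formula; the function and variable names are illustrative, not the notebook's identifiers:

def palm_train_flops_per_token(n_params, n_layers, n_heads, head_dim, seq_len):
    # 6 * N: forward (2N) plus backward (4N) matmul FLOPs over the weights.
    # 12 * L * H * Q * T: attention score and attention-weighted-value
    # matmuls across all layers and heads at context length T.
    return 6 * n_params + 12 * n_layers * n_heads * head_dim * seq_len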
  "cell_type": "markdown",
296
  "metadata": {},
297
  "source": [
298
+ "The two estimates seem pretty similar so let's move on to estimating the model FLOPs utilization (MFU) by comparing some observed throughput data for various GPUs to their advertised theoretical maximum throughput."
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 96,
304
+ "metadata": {},
305
+ "outputs": [
306
+ {
307
+ "name": "stdout",
308
+ "output_type": "stream",
309
+ "text": [
310
+ "RTX A2000 MFU: 17.29%\n",
311
+ "RTX 3090 MFU: 22.99%\n",
312
+ "A100 SXM MFU: 37.16%\n"
313
+ ]
314
+ }
315
+ ],
316
+ "source": [
317
+ "from dataclasses import dataclass\n",
318
+ "\n",
319
+ "@dataclass\n",
320
+ "class Device:\n",
321
+ " name: str\n",
322
+ " advertised_flops: float\n",
323
+ " actual_flops: float\n",
324
+ "\n",
325
+ " @property\n",
326
+ " def mfu(self) -> float:\n",
327
+ " return self.actual_flops / self.advertised_flops\n",
328
+ "\n",
329
+ " @property\n",
330
+ " def percentage_utilization(self) -> float:\n",
331
+ " return self.mfu * 100\n",
332
+ "\n",
333
+ "devices = [\n",
334
+ " Device(\"RTX A2000\", 63.9e12, 3.45 * total_roundtrip_flops),\n",
335
+ " Device(\"RTX 3090\", 285.5e12, 20.5 * total_roundtrip_flops),\n",
336
+ " Device(\"A100 SXM\", 624.0e12, 72.4 * total_roundtrip_flops),\n",
337
+ "]\n",
338
+ "\n",
339
+ "for device in devices:\n",
340
+ " print(f\"{device.name} MFU: {device.percentage_utilization:.2f}%\")\n"
341
  ]
342
  },
343
  {
344
  "cell_type": "markdown",
345
  "metadata": {},
346
  "source": [
347
+ "Now, let's estimate how long it would take to train over every sample in the Openwebtext training set at least once in expectation. Note that these results shown here are a best-case scenario and neglect to factor in other overhead."
348
  ]
349
  },
350
  {
351
  "cell_type": "code",
352
+ "execution_count": 97,
353
  "metadata": {},
354
  "outputs": [
355
  {
 
359
  "Total tokens: 8,994,885,755\n",
360
  "Epochs required: 2,145\n",
361
  "\n",
362
+ "RTX A2000: 1187.25 seconds/epoch, 29.48 days required\n",
363
+ "RTX 3090: 199.80 seconds/epoch, 4.96 days required\n",
364
+ "A100 SXM: 56.57 seconds/epoch, 1.40 days required\n"
365
  ]
366
  }
367
  ],
368
  "source": [
 
 
 
 
 
 
369
  "num_training_tokens = 8994885755\n",
370
  "samples_per_epoch = 4096\n",
371
  "\n",
 
374
  "print(f\"Total tokens: {num_training_tokens:,}\")\n",
375
  "print(f\"Epochs required: {num_epochs_required:,}\", end=\"\\n\\n\")\n",
376
  "\n",
377
+ "for device in devices:\n",
378
+ " seconds_per_epoch = samples_per_epoch * total_roundtrip_flops / device.actual_flops\n",
 
 
 
 
 
 
 
 
379
  "\n",
380
  " days_required = num_epochs_required * seconds_per_epoch / 60 / 60 / 24\n",
381
  "\n",
382
+ " print(f\"{device.name}: {seconds_per_epoch:.2f} seconds/epoch, {days_required:,.2f} days required\")"
383
  ]
384
  }
385
  ],
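A quick sanity check on the printed figures: each actual_flops entry is a samples-per-second throughput multiplied by total_roundtrip_flops, so the per-sample FLOPs cancel and seconds/epoch reduces to samples_per_epoch divided by throughput. (The MFU output also pins down total_roundtrip_flops: 0.1729 × 63.9e12 / 3.45 ≈ 3.2e12 FLOPs per sample.) A sketch that reproduces the timings from the throughputs alone, under that reading of the diff:

# Samples/second throughputs taken from the Device entries above.
throughputs = {"RTX A2000": 3.45, "RTX 3090": 20.5, "A100 SXM": 72.4}

samples_per_epoch = 4096
num_epochs_required = 2145

for name, samples_per_second in throughputs.items():
    # The FLOPs terms cancel:
    # (samples/epoch * FLOPs/sample) / (samples/s * FLOPs/sample)
    seconds_per_epoch = samples_per_epoch / samples_per_second
    days_required = num_epochs_required * seconds_per_epoch / 60 / 60 / 24
    print(f"{name}: {seconds_per_epoch:.2f} seconds/epoch, {days_required:,.2f} days required")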