Andrew DalPino committed
Commit fc4824e · 1 parent: 624da87

A little nicer

Browse files:
- README.md +2 -2
- beam_search.py +1 -1
- data.py +1 -1
- export_model.ipynb +25 -15
- instruction-tune.py +4 -1
- model.py +24 -21
- pre-train.py +4 -1
README.md CHANGED

@@ -17,7 +17,7 @@ LightGPT is a lightweight generative pre-trained Transformer (GPT) model for the

 ## Features

-- **Parameter-efficiency**: LightGPT aims to be a more parsimonious model by only training parameters that are absolutely necessary. As such, biases and positional embeddings have been completely removed from the architecture. Despite having no positional embeddings (NoPE), LightGPT performs better
+- **Parameter-efficiency**: LightGPT aims to be a more parsimonious model by only training parameters that are absolutely necessary. As such, biases and positional embeddings have been completely removed from the architecture. Despite having no positional embeddings (NoPE), LightGPT performs better at context-length generalization than relative embeddings, offering good performance even at 2X the trained context window.

 - **Low Memory Utilization**: LightGPT lets you progressively employ training-time memory optimizations such as fully-sharded data-parallel (FSDP), activation checkpointing, mixed precision, and low-memory optimizer updates that allow you to train larger models on smaller hardware.

@@ -83,7 +83,7 @@ torchrun --standalone --nnodes=1 --nproc-per-node=8 pre-train.py --batch_size=16

 | Argument | Default | Type | Description |
 |---|---|---|---|
 | --dataset_subset | "sample-10BT" | str | The subset of the Fineweb dataset to train on. Options are `sample-10BT`, `sample-100BT`, and `sample-350BT`. Set to `None` to train on the full 15T token dataset. |
-| --token_encoding | "r50k_base" | str | The encoding scheme to use when tokenizing the dataset. Options include `r50k_base`, `cl100k_base`, and `o200k_base`. |
+| --token_encoding | "r50k_base" | str | The Tiktoken encoding scheme to use when tokenizing the dataset. Options include `r50k_base`, `p50k_base`, `cl100k_base`, and `o200k_base`. |
 | --dataset_path | "./datasets" | str | The path to the preprocessed dataset files on disk. |
 | --num_dataset_processes | 8 | int | The number of processes (CPUs) to use to process the dataset. |
 | --batch_size | 1 | int | The number of samples to pass through the network at a time. |
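As a quick illustration of what `--token_encoding` selects, here is a minimal Tiktoken round trip. The sample text and the choice of `r50k_base` are placeholders, not from the repo; only `tokenizer.eot_token` being used as the model's `eos_index` comes from the diff (see pre-train.py below).

```python
import tiktoken

# Any of the supported encodings: r50k_base, p50k_base, cl100k_base, o200k_base.
tokenizer = tiktoken.get_encoding("r50k_base")

text = "LightGPT is a lightweight GPT model."  # hypothetical sample text

# encode_ordinary ignores special tokens, which suits raw corpus text.
tokens = tokenizer.encode_ordinary(text)

# The end-of-text id is what pre-train.py passes to the model as eos_index.
tokens.append(tokenizer.eot_token)

print(len(tokens), tokens)
```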
beam_search.py CHANGED

@@ -15,7 +15,7 @@ import tiktoken

 def main():
     parser = ArgumentParser(
-        description="
+        description="Use a greedy search strategy to generate candidate sequences.",
     )

     parser.add_argument(
data.py CHANGED

@@ -92,7 +92,7 @@ class Fineweb(IterableDataset):

         index = 0

-        for i in tqdm(range(self.NUM_SHARDS), desc="
+        for i in tqdm(range(self.NUM_SHARDS), desc="Saving"):
             batch = dataset.shard(
                 num_shards=self.NUM_SHARDS, index=i, contiguous=True
             ).with_format("numpy")
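The loop above walks contiguous shards of a Hugging Face dataset. A minimal runnable sketch of the same pattern on a toy dataset (the `Dataset.from_dict` input and shard count are invented for illustration):

```python
from datasets import Dataset
from tqdm import tqdm

NUM_SHARDS = 4

# Toy stand-in for the tokenized Fineweb dataset.
dataset = Dataset.from_dict({"tokens": [[1, 2, 3]] * 100})

for i in tqdm(range(NUM_SHARDS), desc="Saving"):
    # contiguous=True slices the dataset into consecutive chunks instead of
    # striding, so each shard can be written out as one sequential block.
    batch = dataset.shard(
        num_shards=NUM_SHARDS, index=i, contiguous=True
    ).with_format("numpy")

    print(i, len(batch))
```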
export_model.ipynb CHANGED

@@ -9,11 +9,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
-   "model_name = \"lightgpt-small
+   "model_name = \"lightgpt-small\"\n",
    "checkpoint_path = \"./checkpoints/checkpoint.pt\"\n",
    "lora_path = None # \"./checkpoints/lora_instruction.pt\"\n",
    "exports_path = \"./exports\""

@@ -28,14 +28,18 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
-    "
-    "
-    "
+    "ename": "TypeError",
+    "evalue": "GPT.__init__() missing 1 required positional argument: 'feed_forward_ratio'",
+    "output_type": "error",
+    "traceback": [
+     "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+     "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+     "Cell \u001b[0;32mIn[3], line 7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmodel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GPT, GPTWithLoRA\n\u001b[1;32m 5\u001b[0m checkpoint \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(checkpoint_path, map_location\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m, weights_only\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m----> 7\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mGPT\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheckpoint\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_args\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m model \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcompile(model)\n\u001b[1;32m 11\u001b[0m model\u001b[38;5;241m.\u001b[39mload_state_dict(checkpoint[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n",
+     "\u001b[0;31mTypeError\u001b[0m: GPT.__init__() missing 1 required positional argument: 'feed_forward_ratio'"
    ]
   }
  ],

@@ -179,8 +183,8 @@
    "    model,\n",
    "    example_input,\n",
    "    onnx_path,\n",
-   "    input_names=[\"input_tokens\"],\n",
-   "    output_names=[\"
+   "    input_names=[\"input_tokens\", \"labels\"],\n",
+   "    output_names=[\"logits\"],\n",
    "    dynamo=True,\n",
    ")\n",
    "\n",

@@ -226,14 +230,18 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": null,
   "metadata": {},
   "outputs": [
    {
-    "
-    "
-    "
+    "ename": "NameError",
+    "evalue": "name 'onnx_path' is not defined",
+    "output_type": "error",
+    "traceback": [
+     "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+     "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+     "Cell \u001b[0;32mIn[1], line 7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m assert_allclose\n\u001b[0;32m----> 7\u001b[0m session \u001b[38;5;241m=\u001b[39m onnxruntime\u001b[38;5;241m.\u001b[39mInferenceSession(\u001b[43monnx_path\u001b[49m, providers\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCPUExecutionProvider\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9\u001b[0m onnx_input \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_tokens\u001b[39m\u001b[38;5;124m\"\u001b[39m: example_input\u001b[38;5;241m.\u001b[39mnumpy()}\n\u001b[1;32m 11\u001b[0m output \u001b[38;5;241m=\u001b[39m session\u001b[38;5;241m.\u001b[39mrun(\u001b[38;5;28;01mNone\u001b[39;00m, onnx_input)\n",
+     "\u001b[0;31mNameError\u001b[0m: name 'onnx_path' is not defined"
    ]
   }
  ],

@@ -242,6 +250,8 @@
   "\n",
   "import numpy as np\n",
   "\n",
+  "from numpy.testing import assert_allclose\n",
+  "\n",
   "session = onnxruntime.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n",
   "\n",
   "onnx_input = {\"input_tokens\": example_input.numpy()}\n",

@@ -251,7 +261,7 @@
   "onnx_output = output[0]\n",
   "pytorch_output = np.array(example_output.detach())\n",
   "\n",
-  "
+  "assert_allclose(pytorch_output, onnx_output, rtol=1e-2, atol=1e-03)\n",
   "\n",
   "print(\"Looking good\")"
  ]
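Taken together, the notebook changes export the model through the Dynamo-based ONNX exporter and then verify the ONNX Runtime output against PyTorch. A condensed standalone sketch of that round trip follows; the tiny `Linear` stand-in model, shapes, and file path are assumptions for illustration (LightGPT exports its GPT module with token inputs), and `dynamo=True` assumes a recent PyTorch with the torch.export-based exporter:

```python
import numpy as np
import onnxruntime
import torch

from numpy.testing import assert_allclose

# Hypothetical stand-in for the GPT checkpoint being exported.
model = torch.nn.Linear(16, 4).eval()

example_input = torch.randn(1, 16)
onnx_path = "model.onnx"

with torch.no_grad():
    example_output = model(example_input)

# Export via the torch.export-based (Dynamo) ONNX exporter.
torch.onnx.export(
    model,
    (example_input,),
    onnx_path,
    input_names=["input_tokens"],
    output_names=["logits"],
    dynamo=True,
)

# Run the exported graph on CPU and compare against the PyTorch output.
session = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

output = session.run(None, {"input_tokens": example_input.numpy()})

# Loose tolerances absorb small numeric differences between runtimes.
assert_allclose(np.array(example_output), output[0], rtol=1e-2, atol=1e-3)

print("Looking good")
```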
instruction-tune.py CHANGED

@@ -96,7 +96,10 @@ def main():
         shuffle=False,
     )

-    model = GPT(**model_args
+    model = GPT(**model_args)
+
+    if args.activation_checkpointing:
+        model.enable_activation_checkpointing()

     model = torch.compile(model)
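Both training scripts now gate checkpointing behind `args.activation_checkpointing` rather than a constructor argument. A plausible argparse sketch of that flag (the wiring is an assumption; only the attribute name comes from the diff):

```python
from argparse import ArgumentParser

parser = ArgumentParser(description="Instruction-tune the model.")

# Hypothetical flag definition; the diff only shows args.activation_checkpointing
# being read after parsing.
parser.add_argument(
    "--activation_checkpointing",
    action="store_true",
    help="Recompute activations during the backward pass to save memory.",
)

args = parser.parse_args()

if args.activation_checkpointing:
    print("Activation checkpointing will be enabled after the model is constructed.")
```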
model.py CHANGED

@@ -23,7 +23,7 @@ from torch.nn import (

 from torch.nn.functional import softmax, log_softmax
 from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations
-from torch.utils.checkpoint import checkpoint
+from torch.utils.checkpoint import checkpoint as torch_checkpoint


 class GPT(Module):

@@ -31,27 +31,32 @@ class GPT(Module):

     def __init__(
         self,
-        block_size: int
-        embedding_dimensions: int
-        num_heads: int
-        num_layers: int
-        feed_forward_ratio: int
-        dropout: float
-        eos_index: int = 50256,
+        block_size: int,
+        embedding_dimensions: int,
+        num_heads: int,
+        num_layers: int,
+        feed_forward_ratio: int,
+        dropout: float,
+        vocabulary_size: int,
+        padding_index: int,
+        eos_index: int,
     ):
         super().__init__()

+        if block_size < 1:
+            raise ValueError(f"Block size must be greater than 0, {block_size} given.")
+
+        if num_layers <= 0:
+            raise ValueError(f"Num layers must be greater than 0, {num_layers} given.")
+
+        if feed_forward_ratio not in (1, 2, 4):
+            raise ValueError("Feed-forward ratio must be either 1, 2, or 4.")
+
         if vocabulary_size <= 0:
             raise ValueError(
                 f"Vocabulary size must be greater than 0, {vocabulary_size} given."
             )

-        if num_layers <= 0:
-            raise ValueError(f"Num layers must be greater than 0, {num_layers} given.")
-
         token_embeddings = Embedding(
             vocabulary_size, embedding_dimensions, padding_idx=padding_index
         )

@@ -80,10 +85,7 @@ class GPT(Module):
             ]
         )

-        if activation_checkpointing:
-            self.checkpoint = partial(checkpoint, use_reentrant=False)
-        else:
-            self.checkpoint = lambda layer, x, attention_mask: layer(x, attention_mask)
+        self.checkpoint = lambda layer, x, attention_mask: layer(x, attention_mask)

         self.output_norm = RMSNorm(embedding_dimensions)
         self.output_layer = output_layer

@@ -98,6 +100,9 @@ class GPT(Module):
     def num_trainable_params(self) -> int:
         return sum(param.numel() for param in self.parameters() if param.requires_grad)

+    def enable_activation_checkpointing(self) -> None:
+        self.checkpoint = partial(torch_checkpoint, use_reentrant=False)
+
     def forward(
         self, x: Tensor, y: Tensor | None = None
     ) -> tuple[Tensor, Tensor | None]:

@@ -292,9 +297,7 @@ class GPTWithLoRA(Module):
     to the intermediate layers of the network.
     """

-    def __init__(
-        self, model: GPT, rank: int = 8, alpha: float = 1.0, dropout: float = 0.05
-    ):
+    def __init__(self, model: GPT, rank: int, alpha: float, dropout: float):
         super().__init__()

         if rank <= 0:
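The pattern introduced here defaults `self.checkpoint` to a plain pass-through and lets `enable_activation_checkpointing()` swap in `torch.utils.checkpoint` after construction. A minimal runnable sketch of the same idea on a toy stack of layers (layer shapes are invented; LightGPT's layers also take an attention mask):

```python
from functools import partial

import torch

from torch.nn import Linear, Module, ModuleList
from torch.utils.checkpoint import checkpoint as torch_checkpoint


class TinyStack(Module):
    def __init__(self, num_layers: int):
        super().__init__()

        if num_layers <= 0:
            raise ValueError(f"Num layers must be greater than 0, {num_layers} given.")

        self.layers = ModuleList([Linear(8, 8) for _ in range(num_layers)])

        # Default: call each layer directly, no recomputation.
        self.checkpoint = lambda layer, x: layer(x)

    def enable_activation_checkpointing(self) -> None:
        # Swap in torch.utils.checkpoint so activations are recomputed during
        # the backward pass, trading compute for memory.
        self.checkpoint = partial(torch_checkpoint, use_reentrant=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            x = self.checkpoint(layer, x)

        return x


model = TinyStack(num_layers=2)
model.enable_activation_checkpointing()

out = model(torch.randn(4, 8, requires_grad=True))
out.sum().backward()
```

Deferring the swap to a method keeps checkpointing out of the constructor signature, which is why `model_args` in the training scripts no longer carries an `activation_checkpointing` entry.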
pre-train.py CHANGED

@@ -196,7 +196,10 @@ def main():
         "eos_index": tokenizer.eot_token,
     }

-    model = GPT(**model_args
+    model = GPT(**model_args)
+
+    if args.activation_checkpointing:
+        model.enable_activation_checkpointing()

     print("Compiling model")
     model = torch.compile(model)