Andrew DalPino committed
Commit a1fecbb · 1 Parent(s): c7028d4

Fix unsqueeze missing dimension argument

Files changed (3)
  1. README.md +1 -1
  2. instruction-tune.py +3 -3
  3. model.py +12 -17
README.md CHANGED
@@ -127,7 +127,7 @@ Then navigate to the dashboard using your favorite web browser.
 | Argument | Default | Type | Description |
 |---|---|---|---|
 | --base_model_path | "./checkpoints/checkpoint.pt" | string | The path to the base checkpoint on disk. |
-| --max_tokens_per_sample | 4096 | int | The maximum number of tokens to pack into a single training sequence. |
+| --max_tokens_per_sample | 2048 | int | The maximum number of tokens to pack into a single training sequence. |
 | --mask_input | False | bool | Should we mask the input part of the training sequences i.e. only train on the supervised output? |
 | --batch_size | 1 | int | The number of samples to pass through the network at a time. |
 | --gradient_accumulation_steps | 64 | int | The number of batches to pass through the network before updating the weights. |
instruction-tune.py CHANGED
@@ -27,7 +27,7 @@ def main():
     parser.add_argument(
         "--base_model_path", default="./checkpoints/checkpoint.pt", type=str
     )
-    parser.add_argument("--max_tokens_per_sample", default=4096, type=int)
+    parser.add_argument("--max_tokens_per_sample", default=2048, type=int)
     parser.add_argument("--mask_input", action="store_true")
     parser.add_argument("--batch_size", default=1, type=int)
     parser.add_argument("--gradient_accumulation_steps", default=64, type=int)
@@ -62,7 +62,7 @@ def main():
         else torch.float32
     )
 
-    forward_context = autocast(device_type=args.device, dtype=dtype)
+    amp_context = autocast(device_type=args.device, dtype=dtype)
 
     if args.seed:
         torch.manual_seed(args.seed)
@@ -160,7 +160,7 @@ def main():
        x = x.to(args.device, non_blocking=True)
        y = y.to(args.device, non_blocking=True)
 
-       with forward_context:
+       with amp_context:
            y_pred, loss = model(x, y)
 
        scaled_loss = loss / args.gradient_accumulation_steps
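
Note on the renamed autocast context above: torch.autocast is a reusable context manager, so it can be built once and entered on every training step. A minimal sketch of that pattern, with the same variable name as the diff (the model, optimizer, and batch below are placeholders for illustration, not the repo's actual objects):

import torch
from torch import autocast

# Placeholder model, optimizer, and batch for illustration only -- not the repo's objects.
model = torch.nn.Linear(16, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
x = torch.randn(4, 16)
y = torch.randint(0, 8, (4,))

# Built once and reused on every step, mirroring amp_context in the script.
amp_context = autocast(device_type="cpu", dtype=torch.bfloat16)

with amp_context:
    # Eligible ops (e.g. the linear layer) run in the reduced dtype inside the context.
    logits = model(x)
    loss = torch.nn.functional.cross_entropy(logits, y)

# The backward pass and optimizer step stay outside the autocast region.
loss.backward()
optimizer.step()
optimizer.zero_grad()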
model.py CHANGED
@@ -92,9 +92,7 @@ class GPT(Module):
         """Instead of memorizing the activations of the forward pass, recompute them at various checkpoints."""
         self.checkpoint = partial(torch_checkpoint, use_reentrant=False)
 
-    def forward(
-        self, x: Tensor, y: Tensor | None = None
-    ) -> tuple[Tensor, Tensor | None]:
+    def forward(self, x: Tensor, y: Tensor) -> tuple[Tensor, Tensor]:
         """A forward pass optimized for batch training."""
 
         z = self.token_embeddings(x)
@@ -110,17 +108,15 @@ class GPT(Module):
         z = self.output_norm(z)
         z = self.output_layer(z)
 
-        if y is not None:
-            # Flatten the batch dimension before calculating loss.
-            y_pred = z.view(-1, z.size(-1))
-            labels = y.view(-1)
+        # Concatenate the batches along the time dimension.
+        y_pred = z.view(-1, z.size(-1))
+        labels = y.view(-1)
 
-            loss = self.loss_function(y_pred, labels)
-        else:
-            loss = None
+        loss = self.loss_function(y_pred, labels)
 
         return z, loss
 
+    @torch.no_grad()
     def predict(self, x: Tensor) -> Tensor:
         """A forward pass optimized for batch next-token prediction."""
 
@@ -136,7 +132,7 @@ class GPT(Module):
 
         z = self.output_norm(z)
 
-        # Pluck only the last token embedding in the time dimension.
+        # Pluck only the last token embedding from each batch.
         z = z[:, -1, :]
 
         z = self.output_layer(z)
@@ -200,7 +196,7 @@ class GPT(Module):
 
         probabilities = softmax(logits, dim=0)
 
-        offset = torch.multinomial(probabilities, num_samples=1).squeeze(0)
+        offset = torch.multinomial(probabilities, num_samples=1).squeeze()
 
         next_token = indices[offset]
 
@@ -251,7 +247,8 @@ class GPT(Module):
            reverse=True,
        )
 
-       candidates, completed = [], []
+       candidates: list[Candidate] = []
+       completed: list[Candidate] = []
 
        tokens = torch.tensor([], dtype=prompt.dtype).to(prompt.device)
 
@@ -372,9 +369,7 @@ class GPTWithLoRA(Module):
        for name in lora_params:
            remove_parametrizations(module, name, leave_parametrized=True)
 
-    def forward(
-        self, x: Tensor, y: Tensor | None = None
-    ) -> tuple[Tensor, Tensor | None]:
+    def forward(self, x: Tensor, y: Tensor) -> tuple[Tensor, Tensor]:
         return self.model.forward(x, y)
 
     def predict(self, x: Tensor) -> Tensor:
@@ -407,7 +402,7 @@
 
 
 class ONNXModel(Module):
-    """This wrapper provides a cleaner inferencing API for production models."""
+    """This wrapper provides a clean inferencing API for production models."""
 
     def __init__(self, model: GPT | GPTWithLoRA):
         super().__init__()
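
For reference, a small standalone sketch of what the sampling change in model.py does (the probability values and vocabulary indices below are made up for illustration):

import torch

# A toy distribution over three candidate tokens and their vocabulary indices.
probabilities = torch.tensor([0.7, 0.2, 0.1])
indices = torch.tensor([42, 7, 99])

# For a 1-D input, torch.multinomial returns a tensor of shape (num_samples,),
# so the sampled offset still carries a singleton dimension.
offset = torch.multinomial(probabilities, num_samples=1)
print(offset.shape)  # torch.Size([1])

# Squeezing drops that dimension, leaving a 0-d index tensor, so indexing the
# candidate list yields a scalar token id rather than a 1-element tensor.
offset = offset.squeeze()
next_token = indices[offset]
print(next_token.shape)  # torch.Size([])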