ahmedalhammadi
/

gpt2-cpt-dutch

@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ahmed-alhammadi-technology-innovation-institute/huggingface/runs/ql9uo9vq)
 This model was trained with SFT.

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ahmed-alhammadi-technology-innovation-institute/huggingface/runs/8yuu3q5i)
 This model was trained with SFT.

all_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
-    "total_flos": 7920590782464.0,
-    "train_loss": 4.042498117469879,
-    "train_runtime": 34.3031,
     "train_samples": 12593,
-    "train_samples_per_second": 309.71,
-    "train_steps_per_second": 2.42
 }

 {
     "epoch": 1.0,
+    "total_flos": 2783887138750464.0,
+    "train_loss": 1.8076151853584381,
+    "train_runtime": 66.5306,
     "train_samples": 12593,
+    "train_samples_per_second": 159.686,
+    "train_steps_per_second": 2.495
 }

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
-    "total_flos": 7920590782464.0,
-    "train_loss": 4.042498117469879,
-    "train_runtime": 34.3031,
     "train_samples": 12593,
-    "train_samples_per_second": 309.71,
-    "train_steps_per_second": 2.42
 }

 {
     "epoch": 1.0,
+    "total_flos": 2783887138750464.0,
+    "train_loss": 1.8076151853584381,
+    "train_runtime": 66.5306,
     "train_samples": 12593,
+    "train_samples_per_second": 159.686,
+    "train_steps_per_second": 2.495
 }

trainer_state.json CHANGED Viewed

@@ -3,7 +3,7 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 83,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -127,18 +127,137 @@
       "loss": 3.6707,
       "step": 80
     },
     {
       "epoch": 1.0,
-      "step": 83,
-      "total_flos": 7920590782464.0,
-      "train_loss": 4.042498117469879,
-      "train_runtime": 34.3031,
-      "train_samples_per_second": 309.71,
-      "train_steps_per_second": 2.42
     }
   ],
   "logging_steps": 5,
-  "max_steps": 83,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -154,7 +273,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7920590782464.0,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 166,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "loss": 3.6707,
       "step": 80
     },
+    {
+      "epoch": 0.5120481927710844,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.3529411764705884e-05,
+      "loss": 3.6468,
+      "step": 85
+    },
+    {
+      "epoch": 0.5421686746987951,
+      "grad_norm": 0.89453125,
+      "learning_rate": 8.23529411764706e-05,
+      "loss": 3.6491,
+      "step": 90
+    },
+    {
+      "epoch": 0.572289156626506,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0001411764705882353,
+      "loss": 3.6893,
+      "step": 95
+    },
+    {
+      "epoch": 0.6024096385542169,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0002,
+      "loss": 3.6404,
+      "step": 100
+    },
+    {
+      "epoch": 0.6325301204819277,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00019944481853548335,
+      "loss": 3.6761,
+      "step": 105
+    },
+    {
+      "epoch": 0.6626506024096386,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.00019778543867110426,
+      "loss": 3.6328,
+      "step": 110
+    },
+    {
+      "epoch": 0.6927710843373494,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00019504028554572864,
+      "loss": 3.6478,
+      "step": 115
+    },
+    {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.00019123984032200586,
+      "loss": 3.6035,
+      "step": 120
+    },
+    {
+      "epoch": 0.7530120481927711,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00018642630173483832,
+      "loss": 3.6189,
+      "step": 125
+    },
+    {
+      "epoch": 0.7831325301204819,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00018065311753227273,
+      "loss": 3.6364,
+      "step": 130
+    },
+    {
+      "epoch": 0.8132530120481928,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00017398439101151905,
+      "loss": 3.6213,
+      "step": 135
+    },
+    {
+      "epoch": 0.8433734939759037,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.0001664941692397025,
+      "loss": 3.558,
+      "step": 140
+    },
+    {
+      "epoch": 0.8734939759036144,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.00015826562086267956,
+      "loss": 3.5992,
+      "step": 145
+    },
+    {
+      "epoch": 0.9036144578313253,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00014939011263122634,
+      "loss": 3.5675,
+      "step": 150
+    },
+    {
+      "epoch": 0.9337349397590361,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.00013996619489850822,
+      "loss": 3.5625,
+      "step": 155
+    },
+    {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001300985073534919,
+      "loss": 3.5525,
+      "step": 160
+    },
+    {
+      "epoch": 0.9939759036144579,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00011989661714062999,
+      "loss": 3.5954,
+      "step": 165
+    },
     {
       "epoch": 1.0,
+      "step": 166,
+      "total_flos": 2783887138750464.0,
+      "train_loss": 1.8076151853584381,
+      "train_runtime": 66.5306,
+      "train_samples_per_second": 159.686,
+      "train_steps_per_second": 2.495
     }
   ],
   "logging_steps": 5,
+  "max_steps": 166,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 2783887138750464.0,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null