Model save

Browse files

Files changed (11) hide show

README.md +2 -2
all_results.json +3 -8
config.json +1 -1
model-00001-of-00004.safetensors +1 -1
model-00002-of-00004.safetensors +1 -1
model-00003-of-00004.safetensors +1 -1
model-00004-of-00004.safetensors +1 -1
runs/Nov21_18-12-56_main-fft-gemma7b-closedqa-0-0/events.out.tfevents.1732231455.main-fft-gemma7b-closedqa-0-0.544.0 +3 -0
train_results.json +3 -3
trainer_state.json +60 -60
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.3268
 ## Model description
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.7852        | 1.0   | 130  | 2.3268          |
 ### Framework versions

 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 2.2840
 ## Model description
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 0.7805        | 1.0   | 130  | 2.2840          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
     "epoch": 1.0,
-    "eval_loss": 2.326753616333008,
-    "eval_runtime": 0.4763,
-    "eval_samples": 15,
-    "eval_samples_per_second": 25.195,
-    "eval_steps_per_second": 2.1,
     "total_flos": 17867533713408.0,
-    "train_loss": 2.6317037490698008,
-    "train_runtime": 910.2032,
     "train_samples": 111440,
-    "train_samples_per_second": 36.528,
     "train_steps_per_second": 0.143
 }

 {
     "epoch": 1.0,
     "total_flos": 17867533713408.0,
+    "train_loss": 2.5464949974646935,
+    "train_runtime": 907.3672,
     "train_samples": 111440,
+    "train_samples_per_second": 36.642,
     "train_steps_per_second": 0.143
 }

config.json CHANGED Viewed

@@ -24,6 +24,6 @@
   "rope_theta": 10000.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.46.3",
-  "use_cache": true,
   "vocab_size": 256000
 }

   "rope_theta": 10000.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.46.3",
+  "use_cache": false,
   "vocab_size": 256000
 }

model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e268a4e2fd577919456f59597df331446976ae2d378e1c9ef926ffb6d84644ee
 size 4995496656

 version https://git-lfs.github.com/spec/v1
+oid sha256:2c9b31611844f916240641ce8e3df4f3c49bb2413f2e0e589feb58beb1dbf548
 size 4995496656

model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:23e6cebf4a4318b02f17add4fbd59261a5282794878e3a07737cd2a4ad5c352d
 size 4982953168

 version https://git-lfs.github.com/spec/v1
+oid sha256:2e4c92be6f7e62fab237512ae1baddbeb4f7a70fa29d41949d7a9754a7dbe334
 size 4982953168

model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:246a8a6292d87d9f4915fb08ab18eb9b0d6a706a2f61985c4fa70dd49ed20735
 size 4982953200

 version https://git-lfs.github.com/spec/v1
+oid sha256:0d9593409430e230f71effe8a50b391b38d156e3d3bf618d5c04176f93af458f
 size 4982953200

model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4dbe3e6ca3dff5ac890e8608614f331cb13b9df79aef9cbb9fd8c0d8c63a2726
 size 2113988336

 version https://git-lfs.github.com/spec/v1
+oid sha256:4d2c2f429d8534eb8ba655e2d9a0056f72a8545d46b09cf7ceef58ebb19255c2
 size 2113988336

runs/Nov21_18-12-56_main-fft-gemma7b-closedqa-0-0/events.out.tfevents.1732231455.main-fft-gemma7b-closedqa-0-0.544.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8a4ac83aec09bea9663cb07e472f0c7dc63906f3087493402289c39c0805fe8
+size 12066

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
     "total_flos": 17867533713408.0,
-    "train_loss": 2.6317037490698008,
-    "train_runtime": 910.2032,
     "train_samples": 111440,
-    "train_samples_per_second": 36.528,
     "train_steps_per_second": 0.143
 }

 {
     "epoch": 1.0,
     "total_flos": 17867533713408.0,
+    "train_loss": 2.5464949974646935,
+    "train_runtime": 907.3672,
     "train_samples": 111440,
+    "train_samples_per_second": 36.642,
     "train_steps_per_second": 0.143
 }

trainer_state.json CHANGED Viewed

@@ -10,208 +10,208 @@
   "log_history": [
     {
       "epoch": 0.007692307692307693,
-      "grad_norm": 1824.052603668407,
       "learning_rate": 1.5384615384615387e-06,
       "loss": 28.7448,
       "step": 1
     },
     {
       "epoch": 0.038461538461538464,
-      "grad_norm": 316.6103639423174,
       "learning_rate": 7.692307692307694e-06,
-      "loss": 21.1101,
       "step": 5
     },
     {
       "epoch": 0.07692307692307693,
-      "grad_norm": 65.24932753249017,
       "learning_rate": 1.5384615384615387e-05,
-      "loss": 13.727,
       "step": 10
     },
     {
       "epoch": 0.11538461538461539,
-      "grad_norm": 42.92103955322172,
       "learning_rate": 1.9985583705641418e-05,
-      "loss": 6.179,
       "step": 15
     },
     {
       "epoch": 0.15384615384615385,
-      "grad_norm": 440.32026332346305,
       "learning_rate": 1.9823877374156647e-05,
-      "loss": 3.1748,
       "step": 20
     },
     {
       "epoch": 0.19230769230769232,
-      "grad_norm": 18.09915025408189,
       "learning_rate": 1.9485364419471454e-05,
-      "loss": 2.2618,
       "step": 25
     },
     {
       "epoch": 0.23076923076923078,
-      "grad_norm": 16.637938118964563,
       "learning_rate": 1.8976137276390145e-05,
-      "loss": 1.6812,
       "step": 30
     },
     {
       "epoch": 0.2692307692307692,
-      "grad_norm": 10.853465738373286,
       "learning_rate": 1.8305360832480118e-05,
-      "loss": 1.507,
       "step": 35
     },
     {
       "epoch": 0.3076923076923077,
-      "grad_norm": 14.37878731086315,
       "learning_rate": 1.7485107481711014e-05,
-      "loss": 1.3185,
       "step": 40
     },
     {
       "epoch": 0.34615384615384615,
-      "grad_norm": 9.489828253754519,
       "learning_rate": 1.653013984983585e-05,
-      "loss": 1.1709,
       "step": 45
     },
     {
       "epoch": 0.38461538461538464,
-      "grad_norm": 9.57640776000177,
       "learning_rate": 1.5457645101945046e-05,
-      "loss": 1.0826,
       "step": 50
     },
     {
       "epoch": 0.4230769230769231,
-      "grad_norm": 8.920710354614021,
       "learning_rate": 1.4286925614030542e-05,
-      "loss": 1.022,
       "step": 55
     },
     {
       "epoch": 0.46153846153846156,
-      "grad_norm": 9.46700699458278,
       "learning_rate": 1.303905157574247e-05,
-      "loss": 0.9738,
       "step": 60
     },
     {
       "epoch": 0.5,
-      "grad_norm": 6.401169561778988,
       "learning_rate": 1.1736481776669307e-05,
-      "loss": 0.9395,
       "step": 65
     },
     {
       "epoch": 0.5384615384615384,
-      "grad_norm": 3.70149569378144,
       "learning_rate": 1.0402659401094154e-05,
-      "loss": 0.8867,
       "step": 70
     },
     {
       "epoch": 0.5769230769230769,
-      "grad_norm": 3.481283936858623,
       "learning_rate": 9.061590105968208e-06,
-      "loss": 0.8829,
       "step": 75
     },
     {
       "epoch": 0.6153846153846154,
-      "grad_norm": 7.314694136144165,
       "learning_rate": 7.73740997570278e-06,
-      "loss": 0.8661,
       "step": 80
     },
     {
       "epoch": 0.6538461538461539,
-      "grad_norm": 8.561065827412762,
       "learning_rate": 6.453951129574644e-06,
-      "loss": 0.8615,
       "step": 85
     },
     {
       "epoch": 0.6923076923076923,
-      "grad_norm": 1.800187961424463,
       "learning_rate": 5.234312799786921e-06,
-      "loss": 0.8485,
       "step": 90
     },
     {
       "epoch": 0.7307692307692307,
-      "grad_norm": 3.8636605355173326,
       "learning_rate": 4.100445599768774e-06,
-      "loss": 0.8287,
       "step": 95
     },
     {
       "epoch": 0.7692307692307693,
-      "grad_norm": 1.6545101747217463,
       "learning_rate": 3.0727564649040066e-06,
-      "loss": 0.8179,
       "step": 100
     },
     {
       "epoch": 0.8076923076923077,
-      "grad_norm": 1.3925968714516748,
       "learning_rate": 2.1697413758237785e-06,
-      "loss": 0.8102,
       "step": 105
     },
     {
       "epoch": 0.8461538461538461,
-      "grad_norm": 1.6352729776359016,
       "learning_rate": 1.407652474377832e-06,
-      "loss": 0.798,
       "step": 110
     },
     {
       "epoch": 0.8846153846153846,
-      "grad_norm": 0.7372236841264296,
       "learning_rate": 8.002055634117578e-07,
-      "loss": 0.798,
       "step": 115
     },
     {
       "epoch": 0.9230769230769231,
-      "grad_norm": 0.9284794942335707,
       "learning_rate": 3.5833325466437697e-07,
-      "loss": 0.7863,
       "step": 120
     },
     {
       "epoch": 0.9615384615384616,
-      "grad_norm": 0.7745401578262785,
       "learning_rate": 8.99882075409153e-08,
-      "loss": 0.7789,
       "step": 125
     },
     {
       "epoch": 1.0,
-      "grad_norm": 0.8356209966296326,
       "learning_rate": 0.0,
-      "loss": 0.7852,
       "step": 130
     },
     {
       "epoch": 1.0,
-      "eval_loss": 2.326753616333008,
-      "eval_runtime": 0.4596,
-      "eval_samples_per_second": 26.108,
-      "eval_steps_per_second": 2.176,
       "step": 130
     },
     {
       "epoch": 1.0,
       "step": 130,
       "total_flos": 17867533713408.0,
-      "train_loss": 2.6317037490698008,
-      "train_runtime": 910.2032,
-      "train_samples_per_second": 36.528,
       "train_steps_per_second": 0.143
     }
   ],

   "log_history": [
     {
       "epoch": 0.007692307692307693,
+      "grad_norm": 1824.0493249890733,
       "learning_rate": 1.5384615384615387e-06,
       "loss": 28.7448,
       "step": 1
     },
     {
       "epoch": 0.038461538461538464,
+      "grad_norm": 338.1051064004804,
       "learning_rate": 7.692307692307694e-06,
+      "loss": 21.1097,
       "step": 5
     },
     {
       "epoch": 0.07692307692307693,
+      "grad_norm": 60.27893910460336,
       "learning_rate": 1.5384615384615387e-05,
+      "loss": 13.8645,
       "step": 10
     },
     {
       "epoch": 0.11538461538461539,
+      "grad_norm": 19.31150319920863,
       "learning_rate": 1.9985583705641418e-05,
+      "loss": 5.697,
       "step": 15
     },
     {
       "epoch": 0.15384615384615385,
+      "grad_norm": 69.03270795451746,
       "learning_rate": 1.9823877374156647e-05,
+      "loss": 2.3869,
       "step": 20
     },
     {
       "epoch": 0.19230769230769232,
+      "grad_norm": 76.45052582249555,
       "learning_rate": 1.9485364419471454e-05,
+      "loss": 1.7302,
       "step": 25
     },
     {
       "epoch": 0.23076923076923078,
+      "grad_norm": 18.876039351066055,
       "learning_rate": 1.8976137276390145e-05,
+      "loss": 1.5732,
       "step": 30
     },
     {
       "epoch": 0.2692307692307692,
+      "grad_norm": 23.947629924642698,
       "learning_rate": 1.8305360832480118e-05,
+      "loss": 1.3462,
       "step": 35
     },
     {
       "epoch": 0.3076923076923077,
+      "grad_norm": 24.85999234906976,
       "learning_rate": 1.7485107481711014e-05,
+      "loss": 1.2076,
       "step": 40
     },
     {
       "epoch": 0.34615384615384615,
+      "grad_norm": 16.833283516923675,
       "learning_rate": 1.653013984983585e-05,
+      "loss": 1.1067,
       "step": 45
     },
     {
       "epoch": 0.38461538461538464,
+      "grad_norm": 5.260723361500702,
       "learning_rate": 1.5457645101945046e-05,
+      "loss": 1.0689,
       "step": 50
     },
     {
       "epoch": 0.4230769230769231,
+      "grad_norm": 8.067805674932265,
       "learning_rate": 1.4286925614030542e-05,
+      "loss": 1.0097,
       "step": 55
     },
     {
       "epoch": 0.46153846153846156,
+      "grad_norm": 14.86263828160759,
       "learning_rate": 1.303905157574247e-05,
+      "loss": 0.978,
       "step": 60
     },
     {
       "epoch": 0.5,
+      "grad_norm": 6.459278631918247,
       "learning_rate": 1.1736481776669307e-05,
+      "loss": 0.9093,
       "step": 65
     },
     {
       "epoch": 0.5384615384615384,
+      "grad_norm": 2.9078399705453846,
       "learning_rate": 1.0402659401094154e-05,
+      "loss": 0.8944,
       "step": 70
     },
     {
       "epoch": 0.5769230769230769,
+      "grad_norm": 7.660903004613179,
       "learning_rate": 9.061590105968208e-06,
+      "loss": 0.8973,
       "step": 75
     },
     {
       "epoch": 0.6153846153846154,
+      "grad_norm": 5.302988730231839,
       "learning_rate": 7.73740997570278e-06,
+      "loss": 0.8763,
       "step": 80
     },
     {
       "epoch": 0.6538461538461539,
+      "grad_norm": 4.5745469835172825,
       "learning_rate": 6.453951129574644e-06,
+      "loss": 0.85,
       "step": 85
     },
     {
       "epoch": 0.6923076923076923,
+      "grad_norm": 2.9833461950622047,
       "learning_rate": 5.234312799786921e-06,
+      "loss": 0.8283,
       "step": 90
     },
     {
       "epoch": 0.7307692307692307,
+      "grad_norm": 1.3718638590118337,
       "learning_rate": 4.100445599768774e-06,
+      "loss": 0.8142,
       "step": 95
     },
     {
       "epoch": 0.7692307692307693,
+      "grad_norm": 1.0682148259587978,
       "learning_rate": 3.0727564649040066e-06,
+      "loss": 0.8049,
       "step": 100
     },
     {
       "epoch": 0.8076923076923077,
+      "grad_norm": 0.9650280690913742,
       "learning_rate": 2.1697413758237785e-06,
+      "loss": 0.8029,
       "step": 105
     },
     {
       "epoch": 0.8461538461538461,
+      "grad_norm": 1.0735752809829815,
       "learning_rate": 1.407652474377832e-06,
+      "loss": 0.793,
       "step": 110
     },
     {
       "epoch": 0.8846153846153846,
+      "grad_norm": 0.7668233578378257,
       "learning_rate": 8.002055634117578e-07,
+      "loss": 0.7941,
       "step": 115
     },
     {
       "epoch": 0.9230769230769231,
+      "grad_norm": 0.9841012840352416,
       "learning_rate": 3.5833325466437697e-07,
+      "loss": 0.7823,
       "step": 120
     },
     {
       "epoch": 0.9615384615384616,
+      "grad_norm": 0.7158242209277041,
       "learning_rate": 8.99882075409153e-08,
+      "loss": 0.776,
       "step": 125
     },
     {
       "epoch": 1.0,
+      "grad_norm": 0.8276248743240251,
       "learning_rate": 0.0,
+      "loss": 0.7805,
       "step": 130
     },
     {
       "epoch": 1.0,
+      "eval_loss": 2.284010648727417,
+      "eval_runtime": 0.4599,
+      "eval_samples_per_second": 26.092,
+      "eval_steps_per_second": 2.174,
       "step": 130
     },
     {
       "epoch": 1.0,
       "step": 130,
       "total_flos": 17867533713408.0,
+      "train_loss": 2.5464949974646935,
+      "train_runtime": 907.3672,
+      "train_samples_per_second": 36.642,
       "train_steps_per_second": 0.143
     }
   ],

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6422201a5ac12b4bbe915d5718a159f31164a382494e678d2426b5c5035ef49e
 size 7288

 version https://git-lfs.github.com/spec/v1
+oid sha256:e174f24b0a38ca479c097f126c671739f81f1ab012cec2208f1f9e98f03a8aec
 size 7288