sedrickkeh committed on
Commit
558bca0
1 Parent(s): 19610fd

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_camel_ai_math
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_camel_ai_math
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6600
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_camel_ai_math
 
16
 
17
  # OH_DCFT_V3_wo_camel_ai_math
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_camel_ai_math dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6600
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.9980781550288276,
3
- "eval_loss": 0.6631707549095154,
4
- "eval_runtime": 210.5047,
5
- "eval_samples_per_second": 49.951,
6
  "eval_steps_per_second": 0.394,
7
  "total_flos": 1959374817853440.0,
8
- "train_loss": 0.6376129688360752,
9
- "train_runtime": 34970.0124,
10
- "train_samples_per_second": 17.138,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.9980781550288276,
3
+ "eval_loss": 0.6599797010421753,
4
+ "eval_runtime": 210.4663,
5
+ "eval_samples_per_second": 49.96,
6
  "eval_steps_per_second": 0.394,
7
  "total_flos": 1959374817853440.0,
8
+ "train_loss": 0.6400190381922274,
9
+ "train_runtime": 35097.8295,
10
+ "train_samples_per_second": 17.076,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.9980781550288276,
3
- "eval_loss": 0.6631707549095154,
4
- "eval_runtime": 210.5047,
5
- "eval_samples_per_second": 49.951,
6
  "eval_steps_per_second": 0.394
7
  }
 
1
  {
2
  "epoch": 2.9980781550288276,
3
+ "eval_loss": 0.6599797010421753,
4
+ "eval_runtime": 210.4663,
5
+ "eval_samples_per_second": 49.96,
6
  "eval_steps_per_second": 0.394
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9980781550288276,
3
  "total_flos": 1959374817853440.0,
4
- "train_loss": 0.6376129688360752,
5
- "train_runtime": 34970.0124,
6
- "train_samples_per_second": 17.138,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.9980781550288276,
3
  "total_flos": 1959374817853440.0,
4
+ "train_loss": 0.6400190381922274,
5
+ "train_runtime": 35097.8295,
6
+ "train_samples_per_second": 17.076,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,854 +10,854 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.025624599615631006,
13
- "grad_norm": 3.2885472038218984,
14
  "learning_rate": 5e-06,
15
- "loss": 0.9386,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05124919923126201,
20
- "grad_norm": 4.456300061852855,
21
  "learning_rate": 5e-06,
22
- "loss": 0.8437,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07687379884689302,
27
- "grad_norm": 0.995400664964247,
28
  "learning_rate": 5e-06,
29
- "loss": 0.803,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10249839846252402,
34
- "grad_norm": 0.9865743145936546,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7755,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12812299807815503,
41
- "grad_norm": 1.1445876439209104,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7491,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15374759769378604,
48
- "grad_norm": 0.7021402356716058,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7363,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17937219730941703,
55
- "grad_norm": 0.6748305295476841,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7274,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.20499679692504805,
62
- "grad_norm": 0.8880154082614105,
63
  "learning_rate": 5e-06,
64
- "loss": 0.7169,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.23062139654067906,
69
- "grad_norm": 0.9734749908597081,
70
  "learning_rate": 5e-06,
71
- "loss": 0.7133,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.25624599615631005,
76
- "grad_norm": 0.586318081087361,
77
  "learning_rate": 5e-06,
78
- "loss": 0.709,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.28187059577194107,
83
- "grad_norm": 0.49195087780563773,
84
  "learning_rate": 5e-06,
85
- "loss": 0.7021,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.3074951953875721,
90
- "grad_norm": 0.5411392841704284,
91
  "learning_rate": 5e-06,
92
- "loss": 0.708,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3331197950032031,
97
- "grad_norm": 0.6121381534822533,
98
  "learning_rate": 5e-06,
99
- "loss": 0.702,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.35874439461883406,
104
- "grad_norm": 0.5779295696071732,
105
  "learning_rate": 5e-06,
106
- "loss": 0.708,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3843689942344651,
111
- "grad_norm": 0.5431053824058845,
112
  "learning_rate": 5e-06,
113
- "loss": 0.7024,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4099935938500961,
118
- "grad_norm": 0.6102291649380537,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6998,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4356181934657271,
125
- "grad_norm": 0.562363970013211,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6984,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.4612427930813581,
132
- "grad_norm": 0.5583907505215044,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6945,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4868673926969891,
139
- "grad_norm": 0.6011083145558899,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6894,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5124919923126201,
146
- "grad_norm": 0.7575418941889398,
147
  "learning_rate": 5e-06,
148
- "loss": 0.7,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5381165919282511,
153
- "grad_norm": 0.5081456852667208,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6948,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5637411915438821,
160
- "grad_norm": 0.5741517105893602,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6897,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5893657911595132,
167
- "grad_norm": 0.6030704306992282,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6813,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6149903907751442,
174
- "grad_norm": 0.6775227505967703,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6852,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6406149903907752,
181
- "grad_norm": 0.6338125454546812,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6804,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6662395900064062,
188
- "grad_norm": 0.5669879790281377,
189
  "learning_rate": 5e-06,
190
- "loss": 0.6855,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6918641896220371,
195
- "grad_norm": 0.586776540269306,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6782,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.7174887892376681,
202
- "grad_norm": 0.6203573082289723,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6796,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.7431133888532991,
209
- "grad_norm": 0.5394149301983193,
210
  "learning_rate": 5e-06,
211
- "loss": 0.6878,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7687379884689302,
216
- "grad_norm": 0.5647794597410413,
217
  "learning_rate": 5e-06,
218
- "loss": 0.6774,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7943625880845612,
223
- "grad_norm": 0.5886707944345854,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6784,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.8199871877001922,
230
- "grad_norm": 0.7892741242506496,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6802,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.8456117873158232,
237
- "grad_norm": 0.6133781414876699,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6721,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8712363869314542,
244
- "grad_norm": 0.6377632743431111,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6711,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8968609865470852,
251
- "grad_norm": 0.530582926146814,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6752,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.9224855861627163,
258
- "grad_norm": 0.520444529526131,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6685,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.9481101857783472,
265
- "grad_norm": 0.575279318340331,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6824,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9737347853939782,
272
- "grad_norm": 0.8704274805859955,
273
  "learning_rate": 5e-06,
274
- "loss": 0.6763,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9993593850096092,
279
- "grad_norm": 0.5818270555924433,
280
  "learning_rate": 5e-06,
281
- "loss": 0.6796,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9993593850096092,
286
- "eval_loss": 0.6705073118209839,
287
- "eval_runtime": 210.2321,
288
- "eval_samples_per_second": 50.016,
289
  "eval_steps_per_second": 0.395,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.0249839846252402,
294
- "grad_norm": 0.7758678598730366,
295
  "learning_rate": 5e-06,
296
- "loss": 0.6304,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.0506085842408712,
301
- "grad_norm": 0.6249211500646218,
302
  "learning_rate": 5e-06,
303
- "loss": 0.6246,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.0762331838565022,
308
- "grad_norm": 0.6047928898745287,
309
  "learning_rate": 5e-06,
310
- "loss": 0.6304,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.1018577834721333,
315
- "grad_norm": 0.5812394406224998,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6259,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.1274823830877643,
322
- "grad_norm": 0.522119175741154,
323
  "learning_rate": 5e-06,
324
- "loss": 0.6269,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.1531069827033953,
329
- "grad_norm": 0.5901119686651206,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6245,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.1787315823190263,
336
- "grad_norm": 0.5228301365724772,
337
  "learning_rate": 5e-06,
338
- "loss": 0.6258,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.2043561819346573,
343
- "grad_norm": 0.5223355205495337,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6308,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.2299807815502883,
350
- "grad_norm": 0.5682544518265831,
351
  "learning_rate": 5e-06,
352
- "loss": 0.6277,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.2556053811659194,
357
- "grad_norm": 0.5767233826653153,
358
  "learning_rate": 5e-06,
359
- "loss": 0.624,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.2812299807815504,
364
- "grad_norm": 0.5410081056861039,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6099,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.3068545803971814,
371
- "grad_norm": 0.5399164809119454,
372
  "learning_rate": 5e-06,
373
- "loss": 0.6259,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.3324791800128124,
378
- "grad_norm": 0.6426051464181596,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6302,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.3581037796284434,
385
- "grad_norm": 0.5537518586318295,
386
  "learning_rate": 5e-06,
387
- "loss": 0.6241,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.3837283792440744,
392
- "grad_norm": 0.5136102716991056,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6279,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.4093529788597055,
399
- "grad_norm": 0.5691815192061714,
400
  "learning_rate": 5e-06,
401
- "loss": 0.6257,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.4349775784753362,
406
- "grad_norm": 0.5956632063571426,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6265,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.4606021780909673,
413
- "grad_norm": 0.581023423899474,
414
  "learning_rate": 5e-06,
415
- "loss": 0.627,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.4862267777065983,
420
- "grad_norm": 0.5178152762952138,
421
  "learning_rate": 5e-06,
422
- "loss": 0.6268,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.5118513773222293,
427
- "grad_norm": 0.624224902789754,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6329,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.5374759769378603,
434
- "grad_norm": 0.6214150588168393,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6244,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.5631005765534913,
441
- "grad_norm": 0.510471747049578,
442
  "learning_rate": 5e-06,
443
- "loss": 0.6304,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.5887251761691223,
448
- "grad_norm": 0.511918307352263,
449
  "learning_rate": 5e-06,
450
- "loss": 0.6273,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.6143497757847534,
455
- "grad_norm": 0.7352297634775503,
456
  "learning_rate": 5e-06,
457
- "loss": 0.626,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.6399743754003844,
462
- "grad_norm": 0.6567654313011416,
463
  "learning_rate": 5e-06,
464
- "loss": 0.6159,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.6655989750160154,
469
- "grad_norm": 0.549652992706915,
470
  "learning_rate": 5e-06,
471
- "loss": 0.6305,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.6912235746316464,
476
- "grad_norm": 0.5634051713923773,
477
  "learning_rate": 5e-06,
478
- "loss": 0.6269,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.7168481742472774,
483
- "grad_norm": 0.529903884524912,
484
  "learning_rate": 5e-06,
485
- "loss": 0.6259,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.7424727738629084,
490
- "grad_norm": 0.506816776536443,
491
  "learning_rate": 5e-06,
492
- "loss": 0.6273,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.7680973734785392,
497
- "grad_norm": 0.6450672440371147,
498
  "learning_rate": 5e-06,
499
- "loss": 0.617,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.7937219730941703,
504
- "grad_norm": 0.6192873375131733,
505
  "learning_rate": 5e-06,
506
- "loss": 0.6196,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.8193465727098013,
511
- "grad_norm": 0.5352789663117377,
512
  "learning_rate": 5e-06,
513
- "loss": 0.6262,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.8449711723254323,
518
- "grad_norm": 0.4668080446143915,
519
  "learning_rate": 5e-06,
520
- "loss": 0.6294,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.8705957719410633,
525
- "grad_norm": 0.6648243829055217,
526
  "learning_rate": 5e-06,
527
- "loss": 0.6231,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.8962203715566943,
532
- "grad_norm": 0.5272415641474717,
533
  "learning_rate": 5e-06,
534
- "loss": 0.6224,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.9218449711723253,
539
- "grad_norm": 0.5810543386827824,
540
  "learning_rate": 5e-06,
541
- "loss": 0.6225,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.9474695707879563,
546
- "grad_norm": 0.6085709674004274,
547
  "learning_rate": 5e-06,
548
- "loss": 0.6217,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.9730941704035874,
553
- "grad_norm": 0.5375279101850413,
554
  "learning_rate": 5e-06,
555
- "loss": 0.6272,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.9987187700192184,
560
- "grad_norm": 0.6762226804445376,
561
  "learning_rate": 5e-06,
562
- "loss": 0.6256,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.9987187700192184,
567
- "eval_loss": 0.6599456071853638,
568
- "eval_runtime": 209.9848,
569
- "eval_samples_per_second": 50.075,
570
- "eval_steps_per_second": 0.395,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.0243433696348494,
575
- "grad_norm": 0.6490424719585334,
576
  "learning_rate": 5e-06,
577
- "loss": 0.574,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.0499679692504804,
582
- "grad_norm": 0.5740794707367172,
583
  "learning_rate": 5e-06,
584
- "loss": 0.5772,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.0755925688661114,
589
- "grad_norm": 0.6139160456025744,
590
  "learning_rate": 5e-06,
591
- "loss": 0.5716,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.1012171684817424,
596
- "grad_norm": 0.5042331647002983,
597
  "learning_rate": 5e-06,
598
- "loss": 0.5658,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.1268417680973735,
603
- "grad_norm": 0.525240715956572,
604
  "learning_rate": 5e-06,
605
- "loss": 0.5766,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.1524663677130045,
610
- "grad_norm": 0.6301792211373436,
611
  "learning_rate": 5e-06,
612
- "loss": 0.5727,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.1780909673286355,
617
- "grad_norm": 0.5695524017278328,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5653,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.2037155669442665,
624
- "grad_norm": 0.5036859440783635,
625
  "learning_rate": 5e-06,
626
- "loss": 0.582,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.2293401665598975,
631
- "grad_norm": 0.6707419650010094,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5738,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.2549647661755285,
638
- "grad_norm": 0.643330757681528,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5757,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.2805893657911596,
645
- "grad_norm": 0.5671913495072154,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5833,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.3062139654067906,
652
- "grad_norm": 0.7335441489473075,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5779,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.3318385650224216,
659
- "grad_norm": 0.5959572278401346,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5752,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.3574631646380526,
666
- "grad_norm": 0.5395801639714329,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5806,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.3830877642536836,
673
- "grad_norm": 0.7361927812478924,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5779,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.4087123638693146,
680
- "grad_norm": 0.6995594552919696,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5815,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.4343369634849457,
687
- "grad_norm": 0.5261563010125221,
688
  "learning_rate": 5e-06,
689
- "loss": 0.5807,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.4599615631005767,
694
- "grad_norm": 0.5407517395465042,
695
  "learning_rate": 5e-06,
696
- "loss": 0.581,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.4855861627162077,
701
- "grad_norm": 0.5289063213570133,
702
  "learning_rate": 5e-06,
703
- "loss": 0.583,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.5112107623318387,
708
- "grad_norm": 0.5796407083708335,
709
  "learning_rate": 5e-06,
710
- "loss": 0.5839,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.5368353619474697,
715
- "grad_norm": 0.598887219069754,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5783,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.5624599615631007,
722
- "grad_norm": 0.5930341940197382,
723
  "learning_rate": 5e-06,
724
- "loss": 0.5825,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.5880845611787313,
729
- "grad_norm": 0.5331038782110779,
730
  "learning_rate": 5e-06,
731
- "loss": 0.5749,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.6137091607943628,
736
- "grad_norm": 0.543876112697049,
737
  "learning_rate": 5e-06,
738
- "loss": 0.58,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.6393337604099933,
743
- "grad_norm": 0.6281847381865296,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5749,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.664958360025625,
750
- "grad_norm": 0.5842056088552238,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5776,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.6905829596412554,
757
- "grad_norm": 0.5715121557218589,
758
  "learning_rate": 5e-06,
759
- "loss": 0.5769,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.716207559256887,
764
- "grad_norm": 0.4892217105691407,
765
  "learning_rate": 5e-06,
766
- "loss": 0.5779,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.7418321588725174,
771
- "grad_norm": 0.6043402014925059,
772
  "learning_rate": 5e-06,
773
- "loss": 0.5794,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.767456758488149,
778
- "grad_norm": 0.6212507294354088,
779
  "learning_rate": 5e-06,
780
- "loss": 0.5831,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.7930813581037794,
785
- "grad_norm": 0.5086022566714822,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5852,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.818705957719411,
792
- "grad_norm": 0.6072722346149179,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5851,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.8443305573350415,
799
- "grad_norm": 0.5800686633562157,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5784,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.8699551569506725,
806
- "grad_norm": 0.6470106482983882,
807
  "learning_rate": 5e-06,
808
- "loss": 0.5775,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.8955797565663035,
813
- "grad_norm": 0.5990632250208906,
814
  "learning_rate": 5e-06,
815
- "loss": 0.5779,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.9212043561819345,
820
- "grad_norm": 0.5694303405669764,
821
  "learning_rate": 5e-06,
822
- "loss": 0.5803,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.9468289557975655,
827
- "grad_norm": 0.5184100107485876,
828
  "learning_rate": 5e-06,
829
- "loss": 0.5831,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.9724535554131966,
834
- "grad_norm": 0.6155448734761894,
835
  "learning_rate": 5e-06,
836
- "loss": 0.5848,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.9980781550288276,
841
- "grad_norm": 0.6461874509945199,
842
  "learning_rate": 5e-06,
843
- "loss": 0.5851,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.9980781550288276,
848
- "eval_loss": 0.6631707549095154,
849
- "eval_runtime": 211.1021,
850
- "eval_samples_per_second": 49.81,
851
- "eval_steps_per_second": 0.393,
852
  "step": 1170
853
  },
854
  {
855
  "epoch": 2.9980781550288276,
856
  "step": 1170,
857
  "total_flos": 1959374817853440.0,
858
- "train_loss": 0.6376129688360752,
859
- "train_runtime": 34970.0124,
860
- "train_samples_per_second": 17.138,
861
  "train_steps_per_second": 0.033
862
  }
863
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.025624599615631006,
13
+ "grad_norm": 23.87626578586006,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.9021,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05124919923126201,
20
+ "grad_norm": 12.894025041742566,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.8278,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07687379884689302,
27
+ "grad_norm": 1.9039308459782107,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7942,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10249839846252402,
34
+ "grad_norm": 0.8246425522140604,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7682,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.12812299807815503,
41
+ "grad_norm": 0.7903898281300727,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7443,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15374759769378604,
48
+ "grad_norm": 0.7618274610091537,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.733,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17937219730941703,
55
+ "grad_norm": 0.6631690211716542,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7251,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.20499679692504805,
62
+ "grad_norm": 0.7752376118116371,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.7144,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.23062139654067906,
69
+ "grad_norm": 0.751359072795012,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.7106,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.25624599615631005,
76
+ "grad_norm": 0.5794322814005122,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.7063,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.28187059577194107,
83
+ "grad_norm": 0.8734788690919498,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6994,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.3074951953875721,
90
+ "grad_norm": 0.5062452804420097,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.7057,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3331197950032031,
97
+ "grad_norm": 0.6542917427814133,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6997,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.35874439461883406,
104
+ "grad_norm": 0.592400656203172,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.7055,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3843689942344651,
111
+ "grad_norm": 0.5861404729061817,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.7002,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4099935938500961,
118
+ "grad_norm": 0.5129403767949405,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6977,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4356181934657271,
125
+ "grad_norm": 0.5869968660626531,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6961,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.4612427930813581,
132
+ "grad_norm": 0.6257829680932973,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6928,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.4868673926969891,
139
+ "grad_norm": 0.5886221575021535,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.6876,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5124919923126201,
146
+ "grad_norm": 0.6006186676094072,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6983,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5381165919282511,
153
+ "grad_norm": 0.5015888297145041,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6931,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.5637411915438821,
160
+ "grad_norm": 0.5307259707540836,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6881,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.5893657911595132,
167
+ "grad_norm": 0.6093913911692712,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6797,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6149903907751442,
174
+ "grad_norm": 0.6093826734084604,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6838,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6406149903907752,
181
+ "grad_norm": 0.607941001127991,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6789,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6662395900064062,
188
+ "grad_norm": 0.4897426114233287,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.684,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.6918641896220371,
195
+ "grad_norm": 0.4483386511091874,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6769,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.7174887892376681,
202
+ "grad_norm": 0.5533227188891904,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6785,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.7431133888532991,
209
+ "grad_norm": 0.5333807928895044,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6866,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7687379884689302,
216
+ "grad_norm": 0.5209254483197653,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6762,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.7943625880845612,
223
+ "grad_norm": 0.45530525150524676,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6773,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.8199871877001922,
230
+ "grad_norm": 0.6616235203126,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.679,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.8456117873158232,
237
+ "grad_norm": 0.5707652588164589,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.671,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.8712363869314542,
244
+ "grad_norm": 0.6197974812556873,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6701,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.8968609865470852,
251
+ "grad_norm": 0.5705968336277791,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.674,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.9224855861627163,
258
+ "grad_norm": 0.46125889577293033,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6675,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.9481101857783472,
265
+ "grad_norm": 0.5263500841671853,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6816,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.9737347853939782,
272
+ "grad_norm": 0.6701180427430501,
273
  "learning_rate": 5e-06,
274
+ "loss": 0.6753,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.9993593850096092,
279
+ "grad_norm": 0.6408384613157992,
280
  "learning_rate": 5e-06,
281
+ "loss": 0.6786,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.9993593850096092,
286
+ "eval_loss": 0.669628381729126,
287
+ "eval_runtime": 210.2529,
288
+ "eval_samples_per_second": 50.011,
289
  "eval_steps_per_second": 0.395,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.0249839846252402,
294
+ "grad_norm": 0.6401466301705097,
295
  "learning_rate": 5e-06,
296
+ "loss": 0.6342,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.0506085842408712,
301
+ "grad_norm": 0.561321939868088,
302
  "learning_rate": 5e-06,
303
+ "loss": 0.6286,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.0762331838565022,
308
+ "grad_norm": 0.5873132600959697,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.634,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.1018577834721333,
315
+ "grad_norm": 0.4982492625003527,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6297,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.1274823830877643,
322
+ "grad_norm": 0.5451894371727711,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.6307,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.1531069827033953,
329
+ "grad_norm": 0.505854015646168,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.6283,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.1787315823190263,
336
+ "grad_norm": 0.47382835639154014,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6296,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.2043561819346573,
343
+ "grad_norm": 0.49475730559885767,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6347,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.2299807815502883,
350
+ "grad_norm": 0.4985567194890561,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6315,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.2556053811659194,
357
+ "grad_norm": 0.5560313955644473,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.6275,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.2812299807815504,
364
+ "grad_norm": 0.49255815849421947,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6134,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.3068545803971814,
371
+ "grad_norm": 0.5136446413395007,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6294,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.3324791800128124,
378
+ "grad_norm": 0.715579880532086,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.634,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.3581037796284434,
385
+ "grad_norm": 0.539683933602384,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.6273,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.3837283792440744,
392
+ "grad_norm": 0.47743309373776915,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.6312,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.4093529788597055,
399
+ "grad_norm": 0.5510090178389563,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.6291,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.4349775784753362,
406
+ "grad_norm": 0.4885531066853449,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.63,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.4606021780909673,
413
+ "grad_norm": 0.4550390667985221,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.63,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.4862267777065983,
420
+ "grad_norm": 0.5094454871437174,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6301,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.5118513773222293,
427
+ "grad_norm": 0.5378797481591068,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6362,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.5374759769378603,
434
+ "grad_norm": 0.4964570729916681,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6275,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.5631005765534913,
441
+ "grad_norm": 0.5150915164078523,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.6336,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.5887251761691223,
448
+ "grad_norm": 0.45827307808132584,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6305,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.6143497757847534,
455
+ "grad_norm": 0.670050138542801,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.629,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.6399743754003844,
462
+ "grad_norm": 0.5390177749556742,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.6189,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.6655989750160154,
469
+ "grad_norm": 0.47514950562866276,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.6335,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.6912235746316464,
476
+ "grad_norm": 0.45626516699726205,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.63,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.7168481742472774,
483
+ "grad_norm": 0.5259666631069414,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.629,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.7424727738629084,
490
+ "grad_norm": 0.45958090467852913,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.6304,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.7680973734785392,
497
+ "grad_norm": 0.537603671552474,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.6201,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.7937219730941703,
504
+ "grad_norm": 0.4944510355408107,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.6227,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.8193465727098013,
511
+ "grad_norm": 0.4459500383872668,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.6293,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.8449711723254323,
518
+ "grad_norm": 0.41107024989867014,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.6325,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.8705957719410633,
525
+ "grad_norm": 0.5205263432281845,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.626,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.8962203715566943,
532
+ "grad_norm": 0.5178259439056772,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.6253,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.9218449711723253,
539
+ "grad_norm": 0.5186310082942625,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.6254,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 1.9474695707879563,
546
+ "grad_norm": 0.48818671836894234,
547
  "learning_rate": 5e-06,
548
+ "loss": 0.6244,
549
  "step": 760
550
  },
551
  {
552
  "epoch": 1.9730941704035874,
553
+ "grad_norm": 0.4816352801983277,
554
  "learning_rate": 5e-06,
555
+ "loss": 0.6301,
556
  "step": 770
557
  },
558
  {
559
  "epoch": 1.9987187700192184,
560
+ "grad_norm": 0.5573576817650256,
561
  "learning_rate": 5e-06,
562
+ "loss": 0.6284,
563
  "step": 780
564
  },
565
  {
566
  "epoch": 1.9987187700192184,
567
+ "eval_loss": 0.6584250926971436,
568
+ "eval_runtime": 210.7445,
569
+ "eval_samples_per_second": 49.895,
570
+ "eval_steps_per_second": 0.394,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.0243433696348494,
575
+ "grad_norm": 0.6596303880050641,
576
  "learning_rate": 5e-06,
577
+ "loss": 0.582,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.0499679692504804,
582
+ "grad_norm": 0.5374575961558432,
583
  "learning_rate": 5e-06,
584
+ "loss": 0.5856,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.0755925688661114,
589
+ "grad_norm": 0.565805764328584,
590
  "learning_rate": 5e-06,
591
+ "loss": 0.5799,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.1012171684817424,
596
+ "grad_norm": 0.45938077865068583,
597
  "learning_rate": 5e-06,
598
+ "loss": 0.574,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.1268417680973735,
603
+ "grad_norm": 0.5086155425168429,
604
  "learning_rate": 5e-06,
605
+ "loss": 0.5846,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.1524663677130045,
610
+ "grad_norm": 0.5951424838463266,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.5804,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.1780909673286355,
617
+ "grad_norm": 0.5389501938241722,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5733,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.2037155669442665,
624
+ "grad_norm": 0.49715965924754885,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5901,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.2293401665598975,
631
+ "grad_norm": 0.6416154288767167,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5814,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.2549647661755285,
638
+ "grad_norm": 0.5840426113039779,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5833,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.2805893657911596,
645
+ "grad_norm": 0.5363577230603178,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5913,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.3062139654067906,
652
+ "grad_norm": 0.4999642649696085,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5853,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.3318385650224216,
659
+ "grad_norm": 0.6712704299303816,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5824,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.3574631646380526,
666
+ "grad_norm": 0.4993129343338316,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5882,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.3830877642536836,
673
+ "grad_norm": 0.6482429574217665,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5852,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.4087123638693146,
680
+ "grad_norm": 0.5858234169687565,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5891,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.4343369634849457,
687
+ "grad_norm": 0.4606498702171825,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5882,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.4599615631005767,
694
+ "grad_norm": 0.4665245302124569,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.5882,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.4855861627162077,
701
+ "grad_norm": 0.4787609348846855,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.5904,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.5112107623318387,
708
+ "grad_norm": 0.5112448288708001,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.5915,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.5368353619474697,
715
+ "grad_norm": 0.5584692413737392,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5855,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.5624599615631007,
722
+ "grad_norm": 0.5151021464825077,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5895,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.5880845611787313,
729
+ "grad_norm": 0.47435391792664705,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.5818,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.6137091607943628,
736
+ "grad_norm": 0.47062007111208676,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5869,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.6393337604099933,
743
+ "grad_norm": 0.5030724465847995,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5817,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.664958360025625,
750
+ "grad_norm": 0.5696236917490695,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5846,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.6905829596412554,
757
+ "grad_norm": 0.5097571703440519,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5838,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.716207559256887,
764
+ "grad_norm": 0.46906709023993814,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5849,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.7418321588725174,
771
+ "grad_norm": 0.6048108768327223,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.5861,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.767456758488149,
778
+ "grad_norm": 0.5763148469790699,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.59,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.7930813581037794,
785
+ "grad_norm": 0.5325120981655176,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5922,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.818705957719411,
792
+ "grad_norm": 0.47425803790021404,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.5919,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.8443305573350415,
799
+ "grad_norm": 0.5025999987290302,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5851,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.8699551569506725,
806
+ "grad_norm": 0.5898623437912127,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.584,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.8955797565663035,
813
+ "grad_norm": 0.49890831764955235,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5847,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 2.9212043561819345,
820
+ "grad_norm": 0.4911458303208662,
821
  "learning_rate": 5e-06,
822
+ "loss": 0.5868,
823
  "step": 1140
824
  },
825
  {
826
  "epoch": 2.9468289557975655,
827
+ "grad_norm": 0.4862045432861365,
828
  "learning_rate": 5e-06,
829
+ "loss": 0.5896,
830
  "step": 1150
831
  },
832
  {
833
  "epoch": 2.9724535554131966,
834
+ "grad_norm": 0.574968732277202,
835
  "learning_rate": 5e-06,
836
+ "loss": 0.5915,
837
  "step": 1160
838
  },
839
  {
840
  "epoch": 2.9980781550288276,
841
+ "grad_norm": 0.536851160723009,
842
  "learning_rate": 5e-06,
843
+ "loss": 0.5918,
844
  "step": 1170
845
  },
846
  {
847
  "epoch": 2.9980781550288276,
848
+ "eval_loss": 0.6599797010421753,
849
+ "eval_runtime": 210.167,
850
+ "eval_samples_per_second": 50.032,
851
+ "eval_steps_per_second": 0.395,
852
  "step": 1170
853
  },
854
  {
855
  "epoch": 2.9980781550288276,
856
  "step": 1170,
857
  "total_flos": 1959374817853440.0,
858
+ "train_loss": 0.6400190381922274,
859
+ "train_runtime": 35097.8295,
860
+ "train_samples_per_second": 17.076,
861
  "train_steps_per_second": 0.033
862
  }
863
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED