Reynold97 commited on
Commit
b90f3f4
·
verified ·
1 Parent(s): 9713801

Training in progress, step 100, checkpoint

Browse files
checkpoint-100/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d140c685bbbdf73eff30c42b3eaef6949a1a4e79c0051674896322d5a2f35f65
3
  size 27313024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b3724d9cb5ad7f3ad6b3145ea19ec715b612d3bf48ac25eb25f71ad3350332
3
  size 27313024
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d45c2bef931e188dd0e466d4eb9509f5f61baf9e9ada63713c9fdff0fa4aeb9c
3
  size 54668218
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d74c2a34d8bd1e3bb822829ae2c223dfde93fc183c47197ac577c0d0e98ad96b
3
  size 54668218
checkpoint-100/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:666592a5b57156ed650e68139efccc3f2979768f94ac7ee04c6f1378fbe157f7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a3c9b0005e2657e4343d430644e98c0322af865aa1a9053960adee68c999d5
3
  size 14244
checkpoint-100/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea217589f6a52e6e5bf252b883fdc2c5bb872bd2fee80104e01128c8070232c3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8bf8a092e609ea1421206d102dbe42a4b8e939f00a4089ac1280c8ce0f99ed4
3
  size 1064
checkpoint-100/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 0.6870357394218445,
3
- "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-80",
4
  "epoch": 0.9433962264150944,
5
- "eval_steps": 20,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,750 +10,790 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.009433962264150943,
13
- "grad_norm": 20.337095260620117,
14
- "learning_rate": 2e-05,
15
- "loss": 0.5913,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.018867924528301886,
20
- "grad_norm": 11.506393432617188,
21
- "learning_rate": 2e-05,
22
- "loss": 0.6775,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.02830188679245283,
27
- "grad_norm": 9.703904151916504,
28
- "learning_rate": 2e-05,
29
- "loss": 0.576,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.03773584905660377,
34
- "grad_norm": 11.118324279785156,
35
- "learning_rate": 2e-05,
36
- "loss": 0.5084,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.04716981132075472,
41
- "grad_norm": 13.329315185546875,
42
- "learning_rate": 2e-05,
43
- "loss": 0.5712,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.05660377358490566,
48
- "grad_norm": 29.63173484802246,
49
- "learning_rate": 2e-05,
50
- "loss": 0.7234,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0660377358490566,
55
- "grad_norm": 17.787134170532227,
56
- "learning_rate": 2e-05,
57
- "loss": 0.6053,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.07547169811320754,
62
- "grad_norm": 30.639591217041016,
63
- "learning_rate": 2e-05,
64
- "loss": 0.6873,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.08490566037735849,
69
- "grad_norm": 9.612072944641113,
70
- "learning_rate": 2e-05,
71
- "loss": 0.5541,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.09433962264150944,
76
- "grad_norm": 8.989519119262695,
77
- "learning_rate": 2e-05,
78
- "loss": 0.7858,
 
 
 
 
 
 
 
 
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.10377358490566038,
83
- "grad_norm": 17.486469268798828,
84
- "learning_rate": 2e-05,
85
- "loss": 0.9176,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.11320754716981132,
90
- "grad_norm": 35.29791259765625,
91
- "learning_rate": 2e-05,
92
- "loss": 0.6558,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.12264150943396226,
97
- "grad_norm": 19.468692779541016,
98
- "learning_rate": 2e-05,
99
- "loss": 0.6085,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.1320754716981132,
104
- "grad_norm": 9.410886764526367,
105
- "learning_rate": 2e-05,
106
- "loss": 0.6229,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.14150943396226415,
111
- "grad_norm": 11.87700080871582,
112
- "learning_rate": 2e-05,
113
- "loss": 0.5764,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.1509433962264151,
118
- "grad_norm": 20.188251495361328,
119
- "learning_rate": 2e-05,
120
- "loss": 0.8275,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.16037735849056603,
125
- "grad_norm": 28.298933029174805,
126
- "learning_rate": 2e-05,
127
- "loss": 0.5896,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.16981132075471697,
132
- "grad_norm": 47.8366813659668,
133
- "learning_rate": 2e-05,
134
- "loss": 0.8496,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.1792452830188679,
139
- "grad_norm": 36.19501495361328,
140
- "learning_rate": 2e-05,
141
- "loss": 0.6756,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.18867924528301888,
146
- "grad_norm": 22.574682235717773,
147
- "learning_rate": 2e-05,
148
- "loss": 0.6366,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.18867924528301888,
153
- "eval_loss": 0.7738199234008789,
154
- "eval_runtime": 18.5622,
155
- "eval_samples_per_second": 15.893,
156
- "eval_steps_per_second": 3.179,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.19811320754716982,
161
- "grad_norm": 47.26109313964844,
162
- "learning_rate": 2e-05,
163
- "loss": 0.8049,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.20754716981132076,
168
- "grad_norm": 14.115569114685059,
169
- "learning_rate": 2e-05,
170
- "loss": 0.6604,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.2169811320754717,
175
- "grad_norm": 25.182506561279297,
176
- "learning_rate": 2e-05,
177
- "loss": 0.7591,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.22641509433962265,
182
- "grad_norm": 11.066629409790039,
183
- "learning_rate": 2e-05,
184
- "loss": 0.6497,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.2358490566037736,
189
- "grad_norm": 8.666443824768066,
190
- "learning_rate": 2e-05,
191
- "loss": 0.5788,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.24528301886792453,
196
- "grad_norm": 7.663419723510742,
197
- "learning_rate": 2e-05,
198
- "loss": 0.7128,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.25471698113207547,
203
- "grad_norm": 30.738019943237305,
204
- "learning_rate": 2e-05,
205
- "loss": 0.7349,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.2641509433962264,
210
- "grad_norm": 29.7031307220459,
211
- "learning_rate": 2e-05,
212
- "loss": 0.7618,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.27358490566037735,
217
- "grad_norm": 36.29247283935547,
218
- "learning_rate": 2e-05,
219
- "loss": 0.6923,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.2830188679245283,
224
- "grad_norm": 16.721107482910156,
225
- "learning_rate": 2e-05,
226
- "loss": 0.5942,
 
 
 
 
 
 
 
 
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.29245283018867924,
231
- "grad_norm": 36.51066970825195,
232
- "learning_rate": 2e-05,
233
- "loss": 0.7745,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.3018867924528302,
238
- "grad_norm": 13.144597053527832,
239
- "learning_rate": 2e-05,
240
- "loss": 0.6199,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.3113207547169811,
245
- "grad_norm": 24.113306045532227,
246
- "learning_rate": 2e-05,
247
- "loss": 0.6653,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.32075471698113206,
252
- "grad_norm": 34.57608413696289,
253
- "learning_rate": 2e-05,
254
- "loss": 0.5586,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.330188679245283,
259
- "grad_norm": 15.308676719665527,
260
- "learning_rate": 2e-05,
261
- "loss": 0.7438,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.33962264150943394,
266
- "grad_norm": 34.94574737548828,
267
- "learning_rate": 2e-05,
268
- "loss": 0.7437,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.3490566037735849,
273
- "grad_norm": 53.19334030151367,
274
- "learning_rate": 2e-05,
275
- "loss": 0.8349,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.3584905660377358,
280
- "grad_norm": 38.979618072509766,
281
- "learning_rate": 2e-05,
282
- "loss": 0.6599,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.36792452830188677,
287
- "grad_norm": 30.653545379638672,
288
- "learning_rate": 2e-05,
289
- "loss": 0.6782,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.37735849056603776,
294
- "grad_norm": 28.044891357421875,
295
- "learning_rate": 2e-05,
296
- "loss": 0.7945,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.37735849056603776,
301
- "eval_loss": 0.7332659959793091,
302
- "eval_runtime": 19.1011,
303
- "eval_samples_per_second": 15.444,
304
- "eval_steps_per_second": 3.089,
305
  "step": 40
306
  },
307
  {
308
  "epoch": 0.3867924528301887,
309
- "grad_norm": 7.029095649719238,
310
- "learning_rate": 2e-05,
311
- "loss": 0.6402,
312
  "step": 41
313
  },
314
  {
315
  "epoch": 0.39622641509433965,
316
- "grad_norm": 31.614521026611328,
317
- "learning_rate": 2e-05,
318
- "loss": 0.6392,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.4056603773584906,
323
- "grad_norm": 8.320149421691895,
324
- "learning_rate": 2e-05,
325
- "loss": 0.5229,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.41509433962264153,
330
- "grad_norm": 18.34058380126953,
331
- "learning_rate": 2e-05,
332
- "loss": 0.6935,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.42452830188679247,
337
- "grad_norm": 36.57161331176758,
338
- "learning_rate": 2e-05,
339
- "loss": 0.6399,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.4339622641509434,
344
- "grad_norm": 7.638645172119141,
345
- "learning_rate": 2e-05,
346
- "loss": 0.7015,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.44339622641509435,
351
- "grad_norm": 18.424884796142578,
352
- "learning_rate": 2e-05,
353
- "loss": 0.7157,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.4528301886792453,
358
- "grad_norm": 51.02284240722656,
359
- "learning_rate": 2e-05,
360
- "loss": 0.8159,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.46226415094339623,
365
- "grad_norm": 29.55755615234375,
366
- "learning_rate": 2e-05,
367
- "loss": 0.6529,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.4716981132075472,
372
- "grad_norm": 12.764640808105469,
373
- "learning_rate": 2e-05,
374
- "loss": 0.6704,
 
 
 
 
 
 
 
 
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.4811320754716981,
379
- "grad_norm": 17.65540313720703,
380
- "learning_rate": 2e-05,
381
- "loss": 0.6762,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 0.49056603773584906,
386
- "grad_norm": 10.487552642822266,
387
- "learning_rate": 2e-05,
388
- "loss": 0.6094,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 0.5,
393
- "grad_norm": 10.158540725708008,
394
- "learning_rate": 2e-05,
395
- "loss": 0.6539,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 0.5094339622641509,
400
- "grad_norm": 27.807415008544922,
401
- "learning_rate": 2e-05,
402
- "loss": 0.7554,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 0.5188679245283019,
407
- "grad_norm": 39.26100540161133,
408
- "learning_rate": 2e-05,
409
- "loss": 0.8584,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 0.5283018867924528,
414
- "grad_norm": 8.890057563781738,
415
- "learning_rate": 2e-05,
416
- "loss": 0.7872,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 0.5377358490566038,
421
- "grad_norm": 11.212479591369629,
422
- "learning_rate": 2e-05,
423
- "loss": 0.8501,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 0.5471698113207547,
428
- "grad_norm": 8.871652603149414,
429
- "learning_rate": 2e-05,
430
- "loss": 0.6034,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 0.5566037735849056,
435
- "grad_norm": 13.393775939941406,
436
- "learning_rate": 2e-05,
437
- "loss": 0.5953,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 0.5660377358490566,
442
- "grad_norm": 16.56597328186035,
443
- "learning_rate": 2e-05,
444
- "loss": 0.5595,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 0.5660377358490566,
449
- "eval_loss": 0.7103222012519836,
450
- "eval_runtime": 19.0536,
451
- "eval_samples_per_second": 15.483,
452
- "eval_steps_per_second": 3.097,
453
  "step": 60
454
  },
455
  {
456
  "epoch": 0.5754716981132075,
457
- "grad_norm": 66.63365936279297,
458
- "learning_rate": 2e-05,
459
- "loss": 0.8609,
460
  "step": 61
461
  },
462
  {
463
  "epoch": 0.5849056603773585,
464
- "grad_norm": 43.89859390258789,
465
- "learning_rate": 2e-05,
466
- "loss": 0.7555,
467
  "step": 62
468
  },
469
  {
470
  "epoch": 0.5943396226415094,
471
- "grad_norm": 54.232025146484375,
472
- "learning_rate": 2e-05,
473
- "loss": 0.7666,
474
  "step": 63
475
  },
476
  {
477
  "epoch": 0.6037735849056604,
478
- "grad_norm": 10.439966201782227,
479
- "learning_rate": 2e-05,
480
- "loss": 0.6019,
481
  "step": 64
482
  },
483
  {
484
  "epoch": 0.6132075471698113,
485
- "grad_norm": 15.057198524475098,
486
- "learning_rate": 2e-05,
487
- "loss": 0.6797,
488
  "step": 65
489
  },
490
  {
491
  "epoch": 0.6226415094339622,
492
- "grad_norm": 8.816701889038086,
493
- "learning_rate": 2e-05,
494
- "loss": 0.8066,
495
  "step": 66
496
  },
497
  {
498
  "epoch": 0.6320754716981132,
499
- "grad_norm": 16.436609268188477,
500
- "learning_rate": 2e-05,
501
- "loss": 0.5891,
502
  "step": 67
503
  },
504
  {
505
  "epoch": 0.6415094339622641,
506
- "grad_norm": 27.5755672454834,
507
- "learning_rate": 2e-05,
508
- "loss": 0.6204,
509
  "step": 68
510
  },
511
  {
512
  "epoch": 0.6509433962264151,
513
- "grad_norm": 26.33946990966797,
514
- "learning_rate": 2e-05,
515
- "loss": 0.671,
516
  "step": 69
517
  },
518
  {
519
  "epoch": 0.660377358490566,
520
- "grad_norm": 64.1870346069336,
521
- "learning_rate": 2e-05,
522
- "loss": 0.7638,
 
 
 
 
 
 
 
 
523
  "step": 70
524
  },
525
  {
526
  "epoch": 0.6698113207547169,
527
- "grad_norm": 21.89188003540039,
528
- "learning_rate": 2e-05,
529
- "loss": 0.6771,
530
  "step": 71
531
  },
532
  {
533
  "epoch": 0.6792452830188679,
534
- "grad_norm": 8.088455200195312,
535
- "learning_rate": 2e-05,
536
- "loss": 0.6761,
537
  "step": 72
538
  },
539
  {
540
  "epoch": 0.6886792452830188,
541
- "grad_norm": 11.988521575927734,
542
- "learning_rate": 2e-05,
543
- "loss": 0.6315,
544
  "step": 73
545
  },
546
  {
547
  "epoch": 0.6981132075471698,
548
- "grad_norm": 8.751002311706543,
549
- "learning_rate": 2e-05,
550
- "loss": 0.5967,
551
  "step": 74
552
  },
553
  {
554
  "epoch": 0.7075471698113207,
555
- "grad_norm": 22.44446563720703,
556
- "learning_rate": 2e-05,
557
- "loss": 0.5986,
558
  "step": 75
559
  },
560
  {
561
  "epoch": 0.7169811320754716,
562
- "grad_norm": 6.895334243774414,
563
- "learning_rate": 2e-05,
564
- "loss": 0.6324,
565
  "step": 76
566
  },
567
  {
568
  "epoch": 0.7264150943396226,
569
- "grad_norm": 8.335739135742188,
570
- "learning_rate": 2e-05,
571
- "loss": 0.6581,
572
  "step": 77
573
  },
574
  {
575
  "epoch": 0.7358490566037735,
576
- "grad_norm": 6.27984619140625,
577
- "learning_rate": 2e-05,
578
- "loss": 0.6899,
579
  "step": 78
580
  },
581
  {
582
  "epoch": 0.7452830188679245,
583
- "grad_norm": 13.635252952575684,
584
- "learning_rate": 2e-05,
585
- "loss": 0.7032,
586
  "step": 79
587
  },
588
  {
589
  "epoch": 0.7547169811320755,
590
- "grad_norm": 5.515637397766113,
591
- "learning_rate": 2e-05,
592
- "loss": 0.5121,
593
  "step": 80
594
  },
595
  {
596
  "epoch": 0.7547169811320755,
597
- "eval_loss": 0.6870357394218445,
598
- "eval_runtime": 19.1439,
599
- "eval_samples_per_second": 15.41,
600
- "eval_steps_per_second": 3.082,
601
  "step": 80
602
  },
603
  {
604
  "epoch": 0.7641509433962265,
605
- "grad_norm": 14.854217529296875,
606
- "learning_rate": 2e-05,
607
- "loss": 0.7582,
608
  "step": 81
609
  },
610
  {
611
  "epoch": 0.7735849056603774,
612
- "grad_norm": 19.503761291503906,
613
- "learning_rate": 2e-05,
614
- "loss": 0.7394,
615
  "step": 82
616
  },
617
  {
618
  "epoch": 0.7830188679245284,
619
- "grad_norm": 5.10677433013916,
620
- "learning_rate": 2e-05,
621
- "loss": 0.5536,
622
  "step": 83
623
  },
624
  {
625
  "epoch": 0.7924528301886793,
626
- "grad_norm": 48.037845611572266,
627
- "learning_rate": 2e-05,
628
- "loss": 0.7501,
629
  "step": 84
630
  },
631
  {
632
  "epoch": 0.8018867924528302,
633
- "grad_norm": 28.357952117919922,
634
- "learning_rate": 2e-05,
635
- "loss": 0.7174,
636
  "step": 85
637
  },
638
  {
639
  "epoch": 0.8113207547169812,
640
- "grad_norm": 18.693449020385742,
641
- "learning_rate": 2e-05,
642
- "loss": 0.8174,
643
  "step": 86
644
  },
645
  {
646
  "epoch": 0.8207547169811321,
647
- "grad_norm": 36.01970672607422,
648
- "learning_rate": 2e-05,
649
- "loss": 0.7863,
650
  "step": 87
651
  },
652
  {
653
  "epoch": 0.8301886792452831,
654
- "grad_norm": 63.98431396484375,
655
- "learning_rate": 2e-05,
656
- "loss": 0.7538,
657
  "step": 88
658
  },
659
  {
660
  "epoch": 0.839622641509434,
661
- "grad_norm": 7.736374855041504,
662
- "learning_rate": 2e-05,
663
- "loss": 0.6478,
664
  "step": 89
665
  },
666
  {
667
  "epoch": 0.8490566037735849,
668
- "grad_norm": 9.201268196105957,
669
- "learning_rate": 2e-05,
670
- "loss": 0.7841,
 
 
 
 
 
 
 
 
671
  "step": 90
672
  },
673
  {
674
  "epoch": 0.8584905660377359,
675
- "grad_norm": 26.842529296875,
676
- "learning_rate": 2e-05,
677
- "loss": 0.7152,
678
  "step": 91
679
  },
680
  {
681
  "epoch": 0.8679245283018868,
682
- "grad_norm": 21.91474723815918,
683
- "learning_rate": 2e-05,
684
- "loss": 0.5827,
685
  "step": 92
686
  },
687
  {
688
  "epoch": 0.8773584905660378,
689
- "grad_norm": 9.022438049316406,
690
- "learning_rate": 2e-05,
691
- "loss": 0.6294,
692
  "step": 93
693
  },
694
  {
695
  "epoch": 0.8867924528301887,
696
- "grad_norm": 9.270819664001465,
697
- "learning_rate": 2e-05,
698
- "loss": 0.6174,
699
  "step": 94
700
  },
701
  {
702
  "epoch": 0.8962264150943396,
703
- "grad_norm": 11.497746467590332,
704
- "learning_rate": 2e-05,
705
- "loss": 0.7267,
706
  "step": 95
707
  },
708
  {
709
  "epoch": 0.9056603773584906,
710
- "grad_norm": 19.90700912475586,
711
- "learning_rate": 2e-05,
712
- "loss": 0.665,
713
  "step": 96
714
  },
715
  {
716
  "epoch": 0.9150943396226415,
717
- "grad_norm": 26.896240234375,
718
- "learning_rate": 2e-05,
719
- "loss": 0.7505,
720
  "step": 97
721
  },
722
  {
723
  "epoch": 0.9245283018867925,
724
- "grad_norm": 12.731915473937988,
725
- "learning_rate": 2e-05,
726
- "loss": 0.6568,
727
  "step": 98
728
  },
729
  {
730
  "epoch": 0.9339622641509434,
731
- "grad_norm": 29.186397552490234,
732
- "learning_rate": 2e-05,
733
- "loss": 0.701,
734
  "step": 99
735
  },
736
  {
737
  "epoch": 0.9433962264150944,
738
- "grad_norm": 5.476395130157471,
739
- "learning_rate": 2e-05,
740
- "loss": 0.5688,
741
  "step": 100
742
  },
743
  {
744
  "epoch": 0.9433962264150944,
745
- "eval_loss": 0.689711332321167,
746
- "eval_runtime": 19.1115,
747
- "eval_samples_per_second": 15.436,
748
  "eval_steps_per_second": 3.087,
749
  "step": 100
750
  }
751
  ],
752
  "logging_steps": 1,
753
- "max_steps": 106,
754
  "num_input_tokens_seen": 0,
755
- "num_train_epochs": 1,
756
- "save_steps": 20,
757
  "stateful_callbacks": {
758
  "TrainerControl": {
759
  "args": {
 
1
  {
2
+ "best_metric": 0.6825625896453857,
3
+ "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-100",
4
  "epoch": 0.9433962264150944,
5
+ "eval_steps": 10,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.009433962264150943,
13
+ "grad_norm": 12.244806289672852,
14
+ "learning_rate": 2e-08,
15
+ "loss": 0.4715,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.018867924528301886,
20
+ "grad_norm": 11.454357147216797,
21
+ "learning_rate": 4e-08,
22
+ "loss": 0.5527,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.02830188679245283,
27
+ "grad_norm": 7.327939510345459,
28
+ "learning_rate": 6e-08,
29
+ "loss": 0.5359,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.03773584905660377,
34
+ "grad_norm": 8.935256958007812,
35
+ "learning_rate": 8e-08,
36
+ "loss": 0.4292,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.04716981132075472,
41
+ "grad_norm": 17.576908111572266,
42
+ "learning_rate": 1e-07,
43
+ "loss": 0.5657,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.05660377358490566,
48
+ "grad_norm": 22.42218780517578,
49
+ "learning_rate": 1.2e-07,
50
+ "loss": 0.7024,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0660377358490566,
55
+ "grad_norm": 7.509771347045898,
56
+ "learning_rate": 1.4e-07,
57
+ "loss": 0.5426,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.07547169811320754,
62
+ "grad_norm": 24.912858963012695,
63
+ "learning_rate": 1.6e-07,
64
+ "loss": 0.6312,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.08490566037735849,
69
+ "grad_norm": 10.798696517944336,
70
+ "learning_rate": 1.8e-07,
71
+ "loss": 0.4632,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.09433962264150944,
76
+ "grad_norm": 9.916950225830078,
77
+ "learning_rate": 2e-07,
78
+ "loss": 0.6934,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.09433962264150944,
83
+ "eval_loss": 0.6845090985298157,
84
+ "eval_runtime": 18.8138,
85
+ "eval_samples_per_second": 15.68,
86
+ "eval_steps_per_second": 3.136,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.10377358490566038,
91
+ "grad_norm": 8.111969947814941,
92
+ "learning_rate": 2.1999999999999998e-07,
93
+ "loss": 0.7586,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.11320754716981132,
98
+ "grad_norm": 25.175071716308594,
99
+ "learning_rate": 2.4e-07,
100
+ "loss": 0.6298,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.12264150943396226,
105
+ "grad_norm": 5.813445568084717,
106
+ "learning_rate": 2.6e-07,
107
+ "loss": 0.5559,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.1320754716981132,
112
+ "grad_norm": 7.799736022949219,
113
+ "learning_rate": 2.8e-07,
114
+ "loss": 0.5321,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.14150943396226415,
119
+ "grad_norm": 10.612166404724121,
120
+ "learning_rate": 3e-07,
121
+ "loss": 0.5567,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.1509433962264151,
126
+ "grad_norm": 25.862613677978516,
127
+ "learning_rate": 3.2e-07,
128
+ "loss": 0.7949,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.16037735849056603,
133
+ "grad_norm": 5.672112941741943,
134
+ "learning_rate": 3.4000000000000003e-07,
135
+ "loss": 0.5568,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.16981132075471697,
140
+ "grad_norm": 22.59090805053711,
141
+ "learning_rate": 3.6e-07,
142
+ "loss": 0.651,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.1792452830188679,
147
+ "grad_norm": 6.6907548904418945,
148
+ "learning_rate": 3.7999999999999996e-07,
149
+ "loss": 0.5429,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.18867924528301888,
154
+ "grad_norm": 7.563165187835693,
155
+ "learning_rate": 4e-07,
156
+ "loss": 0.5338,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.18867924528301888,
161
+ "eval_loss": 0.6880346536636353,
162
+ "eval_runtime": 19.0282,
163
+ "eval_samples_per_second": 15.503,
164
+ "eval_steps_per_second": 3.101,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 0.19811320754716982,
169
+ "grad_norm": 22.867984771728516,
170
+ "learning_rate": 4.1999999999999995e-07,
171
+ "loss": 0.6839,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 0.20754716981132076,
176
+ "grad_norm": 12.407017707824707,
177
+ "learning_rate": 4.3999999999999997e-07,
178
+ "loss": 0.6577,
179
  "step": 22
180
  },
181
  {
182
  "epoch": 0.2169811320754717,
183
+ "grad_norm": 12.605359077453613,
184
+ "learning_rate": 4.6e-07,
185
+ "loss": 0.7215,
186
  "step": 23
187
  },
188
  {
189
  "epoch": 0.22641509433962265,
190
+ "grad_norm": 8.375327110290527,
191
+ "learning_rate": 4.8e-07,
192
+ "loss": 0.5053,
193
  "step": 24
194
  },
195
  {
196
  "epoch": 0.2358490566037736,
197
+ "grad_norm": 16.666528701782227,
198
+ "learning_rate": 5e-07,
199
+ "loss": 0.5431,
200
  "step": 25
201
  },
202
  {
203
  "epoch": 0.24528301886792453,
204
+ "grad_norm": 27.57564353942871,
205
+ "learning_rate": 5.2e-07,
206
+ "loss": 0.6242,
207
  "step": 26
208
  },
209
  {
210
  "epoch": 0.25471698113207547,
211
+ "grad_norm": 14.450230598449707,
212
+ "learning_rate": 5.4e-07,
213
+ "loss": 0.6718,
214
  "step": 27
215
  },
216
  {
217
  "epoch": 0.2641509433962264,
218
+ "grad_norm": 16.55278968811035,
219
+ "learning_rate": 5.6e-07,
220
+ "loss": 0.6649,
221
  "step": 28
222
  },
223
  {
224
  "epoch": 0.27358490566037735,
225
+ "grad_norm": 17.196575164794922,
226
+ "learning_rate": 5.8e-07,
227
+ "loss": 0.6084,
228
  "step": 29
229
  },
230
  {
231
  "epoch": 0.2830188679245283,
232
+ "grad_norm": 38.10641860961914,
233
+ "learning_rate": 6e-07,
234
+ "loss": 0.6538,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 0.2830188679245283,
239
+ "eval_loss": 0.6865962743759155,
240
+ "eval_runtime": 19.122,
241
+ "eval_samples_per_second": 15.427,
242
+ "eval_steps_per_second": 3.085,
243
  "step": 30
244
  },
245
  {
246
  "epoch": 0.29245283018867924,
247
+ "grad_norm": 9.382880210876465,
248
+ "learning_rate": 6.2e-07,
249
+ "loss": 0.6686,
250
  "step": 31
251
  },
252
  {
253
  "epoch": 0.3018867924528302,
254
+ "grad_norm": 25.904178619384766,
255
+ "learning_rate": 6.4e-07,
256
+ "loss": 0.5842,
257
  "step": 32
258
  },
259
  {
260
  "epoch": 0.3113207547169811,
261
+ "grad_norm": 10.835689544677734,
262
+ "learning_rate": 6.6e-07,
263
+ "loss": 0.6862,
264
  "step": 33
265
  },
266
  {
267
  "epoch": 0.32075471698113206,
268
+ "grad_norm": 16.35777473449707,
269
+ "learning_rate": 6.800000000000001e-07,
270
+ "loss": 0.4901,
271
  "step": 34
272
  },
273
  {
274
  "epoch": 0.330188679245283,
275
+ "grad_norm": 11.801332473754883,
276
+ "learning_rate": 7e-07,
277
+ "loss": 0.7005,
278
  "step": 35
279
  },
280
  {
281
  "epoch": 0.33962264150943394,
282
+ "grad_norm": 28.929777145385742,
283
+ "learning_rate": 7.2e-07,
284
+ "loss": 0.7141,
285
  "step": 36
286
  },
287
  {
288
  "epoch": 0.3490566037735849,
289
+ "grad_norm": 33.3692512512207,
290
+ "learning_rate": 7.4e-07,
291
+ "loss": 0.7235,
292
  "step": 37
293
  },
294
  {
295
  "epoch": 0.3584905660377358,
296
+ "grad_norm": 14.086514472961426,
297
+ "learning_rate": 7.599999999999999e-07,
298
+ "loss": 0.5546,
299
  "step": 38
300
  },
301
  {
302
  "epoch": 0.36792452830188677,
303
+ "grad_norm": 8.276351928710938,
304
+ "learning_rate": 7.799999999999999e-07,
305
+ "loss": 0.5855,
306
  "step": 39
307
  },
308
  {
309
  "epoch": 0.37735849056603776,
310
+ "grad_norm": 8.203176498413086,
311
+ "learning_rate": 8e-07,
312
+ "loss": 0.6988,
313
  "step": 40
314
  },
315
  {
316
  "epoch": 0.37735849056603776,
317
+ "eval_loss": 0.6843137741088867,
318
+ "eval_runtime": 19.1524,
319
+ "eval_samples_per_second": 15.403,
320
+ "eval_steps_per_second": 3.081,
321
  "step": 40
322
  },
323
  {
324
  "epoch": 0.3867924528301887,
325
+ "grad_norm": 15.79111099243164,
326
+ "learning_rate": 8.199999999999999e-07,
327
+ "loss": 0.5881,
328
  "step": 41
329
  },
330
  {
331
  "epoch": 0.39622641509433965,
332
+ "grad_norm": 16.36391258239746,
333
+ "learning_rate": 8.399999999999999e-07,
334
+ "loss": 0.6394,
335
  "step": 42
336
  },
337
  {
338
  "epoch": 0.4056603773584906,
339
+ "grad_norm": 14.09928035736084,
340
+ "learning_rate": 8.599999999999999e-07,
341
+ "loss": 0.5188,
342
  "step": 43
343
  },
344
  {
345
  "epoch": 0.41509433962264153,
346
+ "grad_norm": 13.666457176208496,
347
+ "learning_rate": 8.799999999999999e-07,
348
+ "loss": 0.6493,
349
  "step": 44
350
  },
351
  {
352
  "epoch": 0.42452830188679247,
353
+ "grad_norm": 26.71883773803711,
354
+ "learning_rate": 9e-07,
355
+ "loss": 0.5879,
356
  "step": 45
357
  },
358
  {
359
  "epoch": 0.4339622641509434,
360
+ "grad_norm": 7.5422844886779785,
361
+ "learning_rate": 9.2e-07,
362
+ "loss": 0.5821,
363
  "step": 46
364
  },
365
  {
366
  "epoch": 0.44339622641509435,
367
+ "grad_norm": 23.531204223632812,
368
+ "learning_rate": 9.399999999999999e-07,
369
+ "loss": 0.6332,
370
  "step": 47
371
  },
372
  {
373
  "epoch": 0.4528301886792453,
374
+ "grad_norm": 30.758493423461914,
375
+ "learning_rate": 9.6e-07,
376
+ "loss": 0.7319,
377
  "step": 48
378
  },
379
  {
380
  "epoch": 0.46226415094339623,
381
+ "grad_norm": 12.101729393005371,
382
+ "learning_rate": 9.8e-07,
383
+ "loss": 0.5698,
384
  "step": 49
385
  },
386
  {
387
  "epoch": 0.4716981132075472,
388
+ "grad_norm": 8.760655403137207,
389
+ "learning_rate": 1e-06,
390
+ "loss": 0.5976,
391
+ "step": 50
392
+ },
393
+ {
394
+ "epoch": 0.4716981132075472,
395
+ "eval_loss": 0.6827160120010376,
396
+ "eval_runtime": 19.1865,
397
+ "eval_samples_per_second": 15.375,
398
+ "eval_steps_per_second": 3.075,
399
  "step": 50
400
  },
401
  {
402
  "epoch": 0.4811320754716981,
403
+ "grad_norm": 9.829323768615723,
404
+ "learning_rate": 1.02e-06,
405
+ "loss": 0.6239,
406
  "step": 51
407
  },
408
  {
409
  "epoch": 0.49056603773584906,
410
+ "grad_norm": 5.044214248657227,
411
+ "learning_rate": 1.04e-06,
412
+ "loss": 0.5077,
413
  "step": 52
414
  },
415
  {
416
  "epoch": 0.5,
417
+ "grad_norm": 12.68688678741455,
418
+ "learning_rate": 1.06e-06,
419
+ "loss": 0.5502,
420
  "step": 53
421
  },
422
  {
423
  "epoch": 0.5094339622641509,
424
+ "grad_norm": 12.936463356018066,
425
+ "learning_rate": 1.08e-06,
426
+ "loss": 0.7308,
427
  "step": 54
428
  },
429
  {
430
  "epoch": 0.5188679245283019,
431
+ "grad_norm": 21.73121452331543,
432
+ "learning_rate": 1.1e-06,
433
+ "loss": 0.7065,
434
  "step": 55
435
  },
436
  {
437
  "epoch": 0.5283018867924528,
438
+ "grad_norm": 16.07210922241211,
439
+ "learning_rate": 1.12e-06,
440
+ "loss": 0.6706,
441
  "step": 56
442
  },
443
  {
444
  "epoch": 0.5377358490566038,
445
+ "grad_norm": 20.025632858276367,
446
+ "learning_rate": 1.1399999999999999e-06,
447
+ "loss": 0.772,
448
  "step": 57
449
  },
450
  {
451
  "epoch": 0.5471698113207547,
452
+ "grad_norm": 19.071701049804688,
453
+ "learning_rate": 1.16e-06,
454
+ "loss": 0.5186,
455
  "step": 58
456
  },
457
  {
458
  "epoch": 0.5566037735849056,
459
+ "grad_norm": 32.642337799072266,
460
+ "learning_rate": 1.18e-06,
461
+ "loss": 0.5967,
462
  "step": 59
463
  },
464
  {
465
  "epoch": 0.5660377358490566,
466
+ "grad_norm": 8.402029037475586,
467
+ "learning_rate": 1.2e-06,
468
+ "loss": 0.5027,
469
  "step": 60
470
  },
471
  {
472
  "epoch": 0.5660377358490566,
473
+ "eval_loss": 0.6824557781219482,
474
+ "eval_runtime": 19.0886,
475
+ "eval_samples_per_second": 15.454,
476
+ "eval_steps_per_second": 3.091,
477
  "step": 60
478
  },
479
  {
480
  "epoch": 0.5754716981132075,
481
+ "grad_norm": 45.89920425415039,
482
+ "learning_rate": 1.22e-06,
483
+ "loss": 0.7172,
484
  "step": 61
485
  },
486
  {
487
  "epoch": 0.5849056603773585,
488
+ "grad_norm": 31.668310165405273,
489
+ "learning_rate": 1.24e-06,
490
+ "loss": 0.659,
491
  "step": 62
492
  },
493
  {
494
  "epoch": 0.5943396226415094,
495
+ "grad_norm": 49.8336181640625,
496
+ "learning_rate": 1.26e-06,
497
+ "loss": 0.7355,
498
  "step": 63
499
  },
500
  {
501
  "epoch": 0.6037735849056604,
502
+ "grad_norm": 6.202754497528076,
503
+ "learning_rate": 1.28e-06,
504
+ "loss": 0.5217,
505
  "step": 64
506
  },
507
  {
508
  "epoch": 0.6132075471698113,
509
+ "grad_norm": 27.1490535736084,
510
+ "learning_rate": 1.3e-06,
511
+ "loss": 0.6641,
512
  "step": 65
513
  },
514
  {
515
  "epoch": 0.6226415094339622,
516
+ "grad_norm": 21.903213500976562,
517
+ "learning_rate": 1.32e-06,
518
+ "loss": 0.7061,
519
  "step": 66
520
  },
521
  {
522
  "epoch": 0.6320754716981132,
523
+ "grad_norm": 14.298906326293945,
524
+ "learning_rate": 1.34e-06,
525
+ "loss": 0.5415,
526
  "step": 67
527
  },
528
  {
529
  "epoch": 0.6415094339622641,
530
+ "grad_norm": 8.386432647705078,
531
+ "learning_rate": 1.3600000000000001e-06,
532
+ "loss": 0.5722,
533
  "step": 68
534
  },
535
  {
536
  "epoch": 0.6509433962264151,
537
+ "grad_norm": 6.960066318511963,
538
+ "learning_rate": 1.38e-06,
539
+ "loss": 0.616,
540
  "step": 69
541
  },
542
  {
543
  "epoch": 0.660377358490566,
544
+ "grad_norm": 34.883724212646484,
545
+ "learning_rate": 1.4e-06,
546
+ "loss": 0.6072,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.660377358490566,
551
+ "eval_loss": 0.6845781803131104,
552
+ "eval_runtime": 19.1342,
553
+ "eval_samples_per_second": 15.417,
554
+ "eval_steps_per_second": 3.083,
555
  "step": 70
556
  },
557
  {
558
  "epoch": 0.6698113207547169,
559
+ "grad_norm": 6.061687469482422,
560
+ "learning_rate": 1.42e-06,
561
+ "loss": 0.6442,
562
  "step": 71
563
  },
564
  {
565
  "epoch": 0.6792452830188679,
566
+ "grad_norm": 12.237196922302246,
567
+ "learning_rate": 1.44e-06,
568
+ "loss": 0.6129,
569
  "step": 72
570
  },
571
  {
572
  "epoch": 0.6886792452830188,
573
+ "grad_norm": 9.720622062683105,
574
+ "learning_rate": 1.46e-06,
575
+ "loss": 0.5943,
576
  "step": 73
577
  },
578
  {
579
  "epoch": 0.6981132075471698,
580
+ "grad_norm": 6.667535305023193,
581
+ "learning_rate": 1.48e-06,
582
+ "loss": 0.5455,
583
  "step": 74
584
  },
585
  {
586
  "epoch": 0.7075471698113207,
587
+ "grad_norm": 9.55260181427002,
588
+ "learning_rate": 1.5e-06,
589
+ "loss": 0.5702,
590
  "step": 75
591
  },
592
  {
593
  "epoch": 0.7169811320754716,
594
+ "grad_norm": 16.85431671142578,
595
+ "learning_rate": 1.5199999999999998e-06,
596
+ "loss": 0.5913,
597
  "step": 76
598
  },
599
  {
600
  "epoch": 0.7264150943396226,
601
+ "grad_norm": 14.90990924835205,
602
+ "learning_rate": 1.5399999999999999e-06,
603
+ "loss": 0.6405,
604
  "step": 77
605
  },
606
  {
607
  "epoch": 0.7358490566037735,
608
+ "grad_norm": 21.149423599243164,
609
+ "learning_rate": 1.5599999999999999e-06,
610
+ "loss": 0.6653,
611
  "step": 78
612
  },
613
  {
614
  "epoch": 0.7452830188679245,
615
+ "grad_norm": 11.917136192321777,
616
+ "learning_rate": 1.58e-06,
617
+ "loss": 0.6892,
618
  "step": 79
619
  },
620
  {
621
  "epoch": 0.7547169811320755,
622
+ "grad_norm": 18.385757446289062,
623
+ "learning_rate": 1.6e-06,
624
+ "loss": 0.5136,
625
  "step": 80
626
  },
627
  {
628
  "epoch": 0.7547169811320755,
629
+ "eval_loss": 0.6837261915206909,
630
+ "eval_runtime": 19.1072,
631
+ "eval_samples_per_second": 15.439,
632
+ "eval_steps_per_second": 3.088,
633
  "step": 80
634
  },
635
  {
636
  "epoch": 0.7641509433962265,
637
+ "grad_norm": 8.149250984191895,
638
+ "learning_rate": 1.62e-06,
639
+ "loss": 0.7424,
640
  "step": 81
641
  },
642
  {
643
  "epoch": 0.7735849056603774,
644
+ "grad_norm": 26.717226028442383,
645
+ "learning_rate": 1.6399999999999998e-06,
646
+ "loss": 0.768,
647
  "step": 82
648
  },
649
  {
650
  "epoch": 0.7830188679245284,
651
+ "grad_norm": 9.56679630279541,
652
+ "learning_rate": 1.6599999999999998e-06,
653
+ "loss": 0.5652,
654
  "step": 83
655
  },
656
  {
657
  "epoch": 0.7924528301886793,
658
+ "grad_norm": 40.63279342651367,
659
+ "learning_rate": 1.6799999999999998e-06,
660
+ "loss": 0.7145,
661
  "step": 84
662
  },
663
  {
664
  "epoch": 0.8018867924528302,
665
+ "grad_norm": 26.526386260986328,
666
+ "learning_rate": 1.6999999999999998e-06,
667
+ "loss": 0.7287,
668
  "step": 85
669
  },
670
  {
671
  "epoch": 0.8113207547169812,
672
+ "grad_norm": 20.656476974487305,
673
+ "learning_rate": 1.7199999999999998e-06,
674
+ "loss": 0.8114,
675
  "step": 86
676
  },
677
  {
678
  "epoch": 0.8207547169811321,
679
+ "grad_norm": 21.340261459350586,
680
+ "learning_rate": 1.7399999999999999e-06,
681
+ "loss": 0.7421,
682
  "step": 87
683
  },
684
  {
685
  "epoch": 0.8301886792452831,
686
+ "grad_norm": 43.33297348022461,
687
+ "learning_rate": 1.7599999999999999e-06,
688
+ "loss": 0.6437,
689
  "step": 88
690
  },
691
  {
692
  "epoch": 0.839622641509434,
693
+ "grad_norm": 28.463003158569336,
694
+ "learning_rate": 1.78e-06,
695
+ "loss": 0.6925,
696
  "step": 89
697
  },
698
  {
699
  "epoch": 0.8490566037735849,
700
+ "grad_norm": 23.972209930419922,
701
+ "learning_rate": 1.8e-06,
702
+ "loss": 0.798,
703
+ "step": 90
704
+ },
705
+ {
706
+ "epoch": 0.8490566037735849,
707
+ "eval_loss": 0.6815512180328369,
708
+ "eval_runtime": 19.1285,
709
+ "eval_samples_per_second": 15.422,
710
+ "eval_steps_per_second": 3.084,
711
  "step": 90
712
  },
713
  {
714
  "epoch": 0.8584905660377359,
715
+ "grad_norm": 7.108420372009277,
716
+ "learning_rate": 1.82e-06,
717
+ "loss": 0.6795,
718
  "step": 91
719
  },
720
  {
721
  "epoch": 0.8679245283018868,
722
+ "grad_norm": 7.378752708435059,
723
+ "learning_rate": 1.84e-06,
724
+ "loss": 0.5856,
725
  "step": 92
726
  },
727
  {
728
  "epoch": 0.8773584905660378,
729
+ "grad_norm": 9.755034446716309,
730
+ "learning_rate": 1.86e-06,
731
+ "loss": 0.6236,
732
  "step": 93
733
  },
734
  {
735
  "epoch": 0.8867924528301887,
736
+ "grad_norm": 14.284013748168945,
737
+ "learning_rate": 1.8799999999999998e-06,
738
+ "loss": 0.6273,
739
  "step": 94
740
  },
741
  {
742
  "epoch": 0.8962264150943396,
743
+ "grad_norm": 11.498614311218262,
744
+ "learning_rate": 1.8999999999999998e-06,
745
+ "loss": 0.7204,
746
  "step": 95
747
  },
748
  {
749
  "epoch": 0.9056603773584906,
750
+ "grad_norm": 21.997724533081055,
751
+ "learning_rate": 1.92e-06,
752
+ "loss": 0.6552,
753
  "step": 96
754
  },
755
  {
756
  "epoch": 0.9150943396226415,
757
+ "grad_norm": 15.58300495147705,
758
+ "learning_rate": 1.94e-06,
759
+ "loss": 0.751,
760
  "step": 97
761
  },
762
  {
763
  "epoch": 0.9245283018867925,
764
+ "grad_norm": 10.124507904052734,
765
+ "learning_rate": 1.96e-06,
766
+ "loss": 0.664,
767
  "step": 98
768
  },
769
  {
770
  "epoch": 0.9339622641509434,
771
+ "grad_norm": 14.453956604003906,
772
+ "learning_rate": 1.98e-06,
773
+ "loss": 0.6503,
774
  "step": 99
775
  },
776
  {
777
  "epoch": 0.9433962264150944,
778
+ "grad_norm": 16.880348205566406,
779
+ "learning_rate": 2e-06,
780
+ "loss": 0.5736,
781
  "step": 100
782
  },
783
  {
784
  "epoch": 0.9433962264150944,
785
+ "eval_loss": 0.6825625896453857,
786
+ "eval_runtime": 19.1145,
787
+ "eval_samples_per_second": 15.433,
788
  "eval_steps_per_second": 3.087,
789
  "step": 100
790
  }
791
  ],
792
  "logging_steps": 1,
793
+ "max_steps": 318,
794
  "num_input_tokens_seen": 0,
795
+ "num_train_epochs": 3,
796
+ "save_steps": 50,
797
  "stateful_callbacks": {
798
  "TrainerControl": {
799
  "args": {
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26f455500c6d21c30e744017cea9fadfdc34176d20cac0307417afab0d9542d6
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e439f378b390e89bbeefa59eafafdb1ecc84a940e029c0e74ae9a73bbc405b3
3
  size 5112