pygemma-2b-it / checkpoint-3189 /trainer_state.json
Menouar's picture
Upload folder using huggingface_hub
d0821a7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998432356168678,
"eval_steps": 500,
"global_step": 3189,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.1328125,
"learning_rate": 6.269592476489028e-07,
"loss": 2.1857,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.15625,
"learning_rate": 1.2539184952978056e-06,
"loss": 2.2401,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 1.171875,
"learning_rate": 1.8808777429467086e-06,
"loss": 2.2405,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 1.1796875,
"learning_rate": 2.507836990595611e-06,
"loss": 2.2319,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 1.1328125,
"learning_rate": 3.1347962382445144e-06,
"loss": 2.1898,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 1.1171875,
"learning_rate": 3.7617554858934172e-06,
"loss": 2.2211,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 1.0390625,
"learning_rate": 4.3887147335423205e-06,
"loss": 2.1384,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 1.1640625,
"learning_rate": 5.015673981191222e-06,
"loss": 2.1226,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 1.1171875,
"learning_rate": 5.642633228840125e-06,
"loss": 2.0606,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 1.140625,
"learning_rate": 6.269592476489029e-06,
"loss": 2.0945,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 1.1171875,
"learning_rate": 6.896551724137932e-06,
"loss": 2.0137,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 1.21875,
"learning_rate": 7.5235109717868345e-06,
"loss": 1.9084,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 1.1640625,
"learning_rate": 8.150470219435737e-06,
"loss": 1.8718,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 1.09375,
"learning_rate": 8.777429467084641e-06,
"loss": 1.7715,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 1.09375,
"learning_rate": 9.404388714733543e-06,
"loss": 1.6878,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 0.8828125,
"learning_rate": 1.0031347962382445e-05,
"loss": 1.5875,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 0.76953125,
"learning_rate": 1.0658307210031348e-05,
"loss": 1.5414,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": 0.64453125,
"learning_rate": 1.128526645768025e-05,
"loss": 1.4793,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 0.74609375,
"learning_rate": 1.1912225705329154e-05,
"loss": 1.4169,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 0.7578125,
"learning_rate": 1.2539184952978058e-05,
"loss": 1.3904,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": 0.7734375,
"learning_rate": 1.316614420062696e-05,
"loss": 1.3295,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 0.70703125,
"learning_rate": 1.3793103448275863e-05,
"loss": 1.3289,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 0.97265625,
"learning_rate": 1.4420062695924765e-05,
"loss": 1.257,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": 0.9296875,
"learning_rate": 1.5047021943573669e-05,
"loss": 1.2668,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 1.03125,
"learning_rate": 1.567398119122257e-05,
"loss": 1.2188,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 0.375,
"learning_rate": 1.6300940438871475e-05,
"loss": 1.2408,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 0.3515625,
"learning_rate": 1.6927899686520378e-05,
"loss": 1.1614,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 0.828125,
"learning_rate": 1.7554858934169282e-05,
"loss": 1.1643,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 0.35546875,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.1071,
"step": 290
},
{
"epoch": 0.09,
"grad_norm": 0.4375,
"learning_rate": 1.8808777429467086e-05,
"loss": 1.1014,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 0.42578125,
"learning_rate": 1.943573667711599e-05,
"loss": 1.1347,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 0.337890625,
"learning_rate": 1.999999400890905e-05,
"loss": 1.0571,
"step": 320
},
{
"epoch": 0.1,
"grad_norm": 0.443359375,
"learning_rate": 1.9999275086680688e-05,
"loss": 1.0499,
"step": 330
},
{
"epoch": 0.11,
"grad_norm": 0.326171875,
"learning_rate": 1.9997358044965833e-05,
"loss": 1.0195,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 0.291015625,
"learning_rate": 1.9994243113465627e-05,
"loss": 0.9811,
"step": 350
},
{
"epoch": 0.11,
"grad_norm": 0.328125,
"learning_rate": 1.9989930665413148e-05,
"loss": 0.9659,
"step": 360
},
{
"epoch": 0.12,
"grad_norm": 0.3046875,
"learning_rate": 1.9984421217528654e-05,
"loss": 1.0054,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 0.3359375,
"learning_rate": 1.997771542995769e-05,
"loss": 1.0214,
"step": 380
},
{
"epoch": 0.12,
"grad_norm": 0.38671875,
"learning_rate": 1.9969814106191973e-05,
"loss": 0.9312,
"step": 390
},
{
"epoch": 0.13,
"grad_norm": 0.369140625,
"learning_rate": 1.996071819297314e-05,
"loss": 0.9643,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 0.37109375,
"learning_rate": 1.9950428780179274e-05,
"loss": 0.9579,
"step": 410
},
{
"epoch": 0.13,
"grad_norm": 0.353515625,
"learning_rate": 1.9938947100694354e-05,
"loss": 0.9325,
"step": 420
},
{
"epoch": 0.13,
"grad_norm": 0.330078125,
"learning_rate": 1.99262745302605e-05,
"loss": 0.983,
"step": 430
},
{
"epoch": 0.14,
"grad_norm": 0.337890625,
"learning_rate": 1.991241258731314e-05,
"loss": 0.9483,
"step": 440
},
{
"epoch": 0.14,
"grad_norm": 0.341796875,
"learning_rate": 1.9897362932799078e-05,
"loss": 0.9132,
"step": 450
},
{
"epoch": 0.14,
"grad_norm": 0.376953125,
"learning_rate": 1.988112736997747e-05,
"loss": 0.9624,
"step": 460
},
{
"epoch": 0.15,
"grad_norm": 0.33984375,
"learning_rate": 1.9863707844203756e-05,
"loss": 0.9126,
"step": 470
},
{
"epoch": 0.15,
"grad_norm": 0.408203125,
"learning_rate": 1.9845106442696563e-05,
"loss": 0.905,
"step": 480
},
{
"epoch": 0.15,
"grad_norm": 0.345703125,
"learning_rate": 1.982532539428763e-05,
"loss": 0.9313,
"step": 490
},
{
"epoch": 0.16,
"grad_norm": 0.388671875,
"learning_rate": 1.980436706915473e-05,
"loss": 0.9669,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 0.3828125,
"learning_rate": 1.978223397853768e-05,
"loss": 0.9132,
"step": 510
},
{
"epoch": 0.16,
"grad_norm": 0.302734375,
"learning_rate": 1.9758928774437444e-05,
"loss": 0.9013,
"step": 520
},
{
"epoch": 0.17,
"grad_norm": 0.357421875,
"learning_rate": 1.9734454249298367e-05,
"loss": 0.9473,
"step": 530
},
{
"epoch": 0.17,
"grad_norm": 0.392578125,
"learning_rate": 1.9708813335673582e-05,
"loss": 0.9341,
"step": 540
},
{
"epoch": 0.17,
"grad_norm": 0.39453125,
"learning_rate": 1.9682009105873633e-05,
"loss": 0.9394,
"step": 550
},
{
"epoch": 0.18,
"grad_norm": 0.357421875,
"learning_rate": 1.9654044771598343e-05,
"loss": 0.9168,
"step": 560
},
{
"epoch": 0.18,
"grad_norm": 0.3359375,
"learning_rate": 1.9624923683551992e-05,
"loss": 0.894,
"step": 570
},
{
"epoch": 0.18,
"grad_norm": 0.5078125,
"learning_rate": 1.9594649331041826e-05,
"loss": 0.9145,
"step": 580
},
{
"epoch": 0.18,
"grad_norm": 0.384765625,
"learning_rate": 1.9563225341559982e-05,
"loss": 0.9234,
"step": 590
},
{
"epoch": 0.19,
"grad_norm": 0.34375,
"learning_rate": 1.9530655480348823e-05,
"loss": 0.9097,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 0.421875,
"learning_rate": 1.9496943649949777e-05,
"loss": 0.8847,
"step": 610
},
{
"epoch": 0.19,
"grad_norm": 0.3359375,
"learning_rate": 1.9462093889735766e-05,
"loss": 0.8769,
"step": 620
},
{
"epoch": 0.2,
"grad_norm": 0.34765625,
"learning_rate": 1.9426110375427175e-05,
"loss": 0.9179,
"step": 630
},
{
"epoch": 0.2,
"grad_norm": 0.412109375,
"learning_rate": 1.9388997418591518e-05,
"loss": 0.9184,
"step": 640
},
{
"epoch": 0.2,
"grad_norm": 0.35546875,
"learning_rate": 1.9350759466126838e-05,
"loss": 0.8625,
"step": 650
},
{
"epoch": 0.21,
"grad_norm": 0.369140625,
"learning_rate": 1.9311401099728865e-05,
"loss": 0.9069,
"step": 660
},
{
"epoch": 0.21,
"grad_norm": 0.39453125,
"learning_rate": 1.927092703534204e-05,
"loss": 0.9142,
"step": 670
},
{
"epoch": 0.21,
"grad_norm": 0.37109375,
"learning_rate": 1.922934212259444e-05,
"loss": 0.8774,
"step": 680
},
{
"epoch": 0.22,
"grad_norm": 0.38671875,
"learning_rate": 1.9186651344216703e-05,
"loss": 0.8555,
"step": 690
},
{
"epoch": 0.22,
"grad_norm": 0.373046875,
"learning_rate": 1.9142859815444982e-05,
"loss": 0.9122,
"step": 700
},
{
"epoch": 0.22,
"grad_norm": 0.34375,
"learning_rate": 1.909797278340805e-05,
"loss": 0.9002,
"step": 710
},
{
"epoch": 0.23,
"grad_norm": 0.56640625,
"learning_rate": 1.905199562649857e-05,
"loss": 0.913,
"step": 720
},
{
"epoch": 0.23,
"grad_norm": 0.37890625,
"learning_rate": 1.900493385372866e-05,
"loss": 0.8859,
"step": 730
},
{
"epoch": 0.23,
"grad_norm": 0.41015625,
"learning_rate": 1.8956793104069797e-05,
"loss": 0.8955,
"step": 740
},
{
"epoch": 0.24,
"grad_norm": 0.447265625,
"learning_rate": 1.8907579145777156e-05,
"loss": 0.8676,
"step": 750
},
{
"epoch": 0.24,
"grad_norm": 0.4140625,
"learning_rate": 1.8857297875698455e-05,
"loss": 0.8559,
"step": 760
},
{
"epoch": 0.24,
"grad_norm": 0.4609375,
"learning_rate": 1.880595531856738e-05,
"loss": 0.8908,
"step": 770
},
{
"epoch": 0.24,
"grad_norm": 0.39453125,
"learning_rate": 1.875355762628171e-05,
"loss": 0.871,
"step": 780
},
{
"epoch": 0.25,
"grad_norm": 0.49609375,
"learning_rate": 1.8700111077166186e-05,
"loss": 0.8959,
"step": 790
},
{
"epoch": 0.25,
"grad_norm": 0.498046875,
"learning_rate": 1.8645622075220246e-05,
"loss": 0.8651,
"step": 800
},
{
"epoch": 0.25,
"grad_norm": 0.369140625,
"learning_rate": 1.859009714935067e-05,
"loss": 0.8543,
"step": 810
},
{
"epoch": 0.26,
"grad_norm": 0.390625,
"learning_rate": 1.8533542952589322e-05,
"loss": 0.8772,
"step": 820
},
{
"epoch": 0.26,
"grad_norm": 0.380859375,
"learning_rate": 1.8475966261295947e-05,
"loss": 0.8792,
"step": 830
},
{
"epoch": 0.26,
"grad_norm": 0.40625,
"learning_rate": 1.841737397434623e-05,
"loss": 0.8888,
"step": 840
},
{
"epoch": 0.27,
"grad_norm": 0.365234375,
"learning_rate": 1.8357773112305183e-05,
"loss": 0.8525,
"step": 850
},
{
"epoch": 0.27,
"grad_norm": 0.56640625,
"learning_rate": 1.829717081658591e-05,
"loss": 0.8562,
"step": 860
},
{
"epoch": 0.27,
"grad_norm": 0.4140625,
"learning_rate": 1.823557434859395e-05,
"loss": 0.8433,
"step": 870
},
{
"epoch": 0.28,
"grad_norm": 0.439453125,
"learning_rate": 1.8172991088857187e-05,
"loss": 0.873,
"step": 880
},
{
"epoch": 0.28,
"grad_norm": 0.396484375,
"learning_rate": 1.8109428536141515e-05,
"loss": 0.8523,
"step": 890
},
{
"epoch": 0.28,
"grad_norm": 0.416015625,
"learning_rate": 1.8044894306552338e-05,
"loss": 0.8178,
"step": 900
},
{
"epoch": 0.29,
"grad_norm": 0.408203125,
"learning_rate": 1.7979396132621997e-05,
"loss": 0.8542,
"step": 910
},
{
"epoch": 0.29,
"grad_norm": 0.427734375,
"learning_rate": 1.791294186238327e-05,
"loss": 0.8755,
"step": 920
},
{
"epoch": 0.29,
"grad_norm": 0.4609375,
"learning_rate": 1.7845539458428973e-05,
"loss": 0.8354,
"step": 930
},
{
"epoch": 0.29,
"grad_norm": 0.416015625,
"learning_rate": 1.7777196996957934e-05,
"loss": 0.9218,
"step": 940
},
{
"epoch": 0.3,
"grad_norm": 0.388671875,
"learning_rate": 1.770792266680725e-05,
"loss": 0.8447,
"step": 950
},
{
"epoch": 0.3,
"grad_norm": 0.45703125,
"learning_rate": 1.7637724768471127e-05,
"loss": 0.8294,
"step": 960
},
{
"epoch": 0.3,
"grad_norm": 0.388671875,
"learning_rate": 1.7566611713106287e-05,
"loss": 0.86,
"step": 970
},
{
"epoch": 0.31,
"grad_norm": 0.5078125,
"learning_rate": 1.7494592021524156e-05,
"loss": 0.878,
"step": 980
},
{
"epoch": 0.31,
"grad_norm": 0.435546875,
"learning_rate": 1.7421674323169885e-05,
"loss": 0.8435,
"step": 990
},
{
"epoch": 0.31,
"grad_norm": 0.404296875,
"learning_rate": 1.7347867355088358e-05,
"loss": 0.8335,
"step": 1000
},
{
"epoch": 0.32,
"grad_norm": 0.423828125,
"learning_rate": 1.7273179960877335e-05,
"loss": 0.8757,
"step": 1010
},
{
"epoch": 0.32,
"grad_norm": 0.419921875,
"learning_rate": 1.7197621089627785e-05,
"loss": 0.818,
"step": 1020
},
{
"epoch": 0.32,
"grad_norm": 0.44921875,
"learning_rate": 1.712119979485161e-05,
"loss": 0.883,
"step": 1030
},
{
"epoch": 0.33,
"grad_norm": 0.396484375,
"learning_rate": 1.7043925233396855e-05,
"loss": 0.8833,
"step": 1040
},
{
"epoch": 0.33,
"grad_norm": 0.392578125,
"learning_rate": 1.6965806664350505e-05,
"loss": 0.8822,
"step": 1050
},
{
"epoch": 0.33,
"grad_norm": 0.390625,
"learning_rate": 1.6886853447929082e-05,
"loss": 0.8766,
"step": 1060
},
{
"epoch": 0.34,
"grad_norm": 0.478515625,
"learning_rate": 1.6807075044357074e-05,
"loss": 0.8363,
"step": 1070
},
{
"epoch": 0.34,
"grad_norm": 0.4296875,
"learning_rate": 1.6726481012733437e-05,
"loss": 0.8681,
"step": 1080
},
{
"epoch": 0.34,
"grad_norm": 0.453125,
"learning_rate": 1.6645081009886178e-05,
"loss": 0.8692,
"step": 1090
},
{
"epoch": 0.34,
"grad_norm": 0.384765625,
"learning_rate": 1.6562884789215298e-05,
"loss": 0.8569,
"step": 1100
},
{
"epoch": 0.35,
"grad_norm": 0.431640625,
"learning_rate": 1.6479902199524116e-05,
"loss": 0.885,
"step": 1110
},
{
"epoch": 0.35,
"grad_norm": 0.4453125,
"learning_rate": 1.6396143183839192e-05,
"loss": 0.8938,
"step": 1120
},
{
"epoch": 0.35,
"grad_norm": 0.435546875,
"learning_rate": 1.6311617778218945e-05,
"loss": 0.8443,
"step": 1130
},
{
"epoch": 0.36,
"grad_norm": 0.4765625,
"learning_rate": 1.622633611055111e-05,
"loss": 0.8257,
"step": 1140
},
{
"epoch": 0.36,
"grad_norm": 0.453125,
"learning_rate": 1.614030839933923e-05,
"loss": 0.874,
"step": 1150
},
{
"epoch": 0.36,
"grad_norm": 0.439453125,
"learning_rate": 1.6053544952478258e-05,
"loss": 0.87,
"step": 1160
},
{
"epoch": 0.37,
"grad_norm": 0.40625,
"learning_rate": 1.5966056166019453e-05,
"loss": 0.8557,
"step": 1170
},
{
"epoch": 0.37,
"grad_norm": 0.439453125,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.8833,
"step": 1180
},
{
"epoch": 0.37,
"grad_norm": 0.435546875,
"learning_rate": 1.5788944591810588e-05,
"loss": 0.8315,
"step": 1190
},
{
"epoch": 0.38,
"grad_norm": 0.46484375,
"learning_rate": 1.5699343025681746e-05,
"loss": 0.8635,
"step": 1200
},
{
"epoch": 0.38,
"grad_norm": 0.443359375,
"learning_rate": 1.560905856065472e-05,
"loss": 0.8579,
"step": 1210
},
{
"epoch": 0.38,
"grad_norm": 0.45703125,
"learning_rate": 1.5518102014671405e-05,
"loss": 0.8502,
"step": 1220
},
{
"epoch": 0.39,
"grad_norm": 0.435546875,
"learning_rate": 1.5426484286202863e-05,
"loss": 0.8804,
"step": 1230
},
{
"epoch": 0.39,
"grad_norm": 0.390625,
"learning_rate": 1.5334216352943464e-05,
"loss": 0.8496,
"step": 1240
},
{
"epoch": 0.39,
"grad_norm": 0.4296875,
"learning_rate": 1.5241309270495524e-05,
"loss": 0.8425,
"step": 1250
},
{
"epoch": 0.4,
"grad_norm": 0.421875,
"learning_rate": 1.5147774171044619e-05,
"loss": 0.8578,
"step": 1260
},
{
"epoch": 0.4,
"grad_norm": 0.447265625,
"learning_rate": 1.5053622262025718e-05,
"loss": 0.8598,
"step": 1270
},
{
"epoch": 0.4,
"grad_norm": 0.419921875,
"learning_rate": 1.495886482478032e-05,
"loss": 0.8864,
"step": 1280
},
{
"epoch": 0.4,
"grad_norm": 0.412109375,
"learning_rate": 1.4863513213204681e-05,
"loss": 0.8334,
"step": 1290
},
{
"epoch": 0.41,
"grad_norm": 0.40625,
"learning_rate": 1.476757885238942e-05,
"loss": 0.8612,
"step": 1300
},
{
"epoch": 0.41,
"grad_norm": 0.4296875,
"learning_rate": 1.4671073237250519e-05,
"loss": 0.8449,
"step": 1310
},
{
"epoch": 0.41,
"grad_norm": 0.515625,
"learning_rate": 1.4574007931152037e-05,
"loss": 0.8501,
"step": 1320
},
{
"epoch": 0.42,
"grad_norm": 0.412109375,
"learning_rate": 1.4476394564520542e-05,
"loss": 0.8367,
"step": 1330
},
{
"epoch": 0.42,
"grad_norm": 0.44921875,
"learning_rate": 1.4378244833451576e-05,
"loss": 0.8656,
"step": 1340
},
{
"epoch": 0.42,
"grad_norm": 0.3984375,
"learning_rate": 1.4279570498308198e-05,
"loss": 0.8767,
"step": 1350
},
{
"epoch": 0.43,
"grad_norm": 0.46875,
"learning_rate": 1.4180383382311867e-05,
"loss": 0.8521,
"step": 1360
},
{
"epoch": 0.43,
"grad_norm": 0.470703125,
"learning_rate": 1.4080695370125761e-05,
"loss": 0.8959,
"step": 1370
},
{
"epoch": 0.43,
"grad_norm": 0.48828125,
"learning_rate": 1.3980518406430767e-05,
"loss": 0.849,
"step": 1380
},
{
"epoch": 0.44,
"grad_norm": 0.453125,
"learning_rate": 1.3879864494494252e-05,
"loss": 0.8495,
"step": 1390
},
{
"epoch": 0.44,
"grad_norm": 0.5234375,
"learning_rate": 1.3778745694731816e-05,
"loss": 0.826,
"step": 1400
},
{
"epoch": 0.44,
"grad_norm": 0.4453125,
"learning_rate": 1.3677174123262216e-05,
"loss": 0.8623,
"step": 1410
},
{
"epoch": 0.45,
"grad_norm": 0.466796875,
"learning_rate": 1.3575161950455604e-05,
"loss": 0.8474,
"step": 1420
},
{
"epoch": 0.45,
"grad_norm": 0.4609375,
"learning_rate": 1.3472721399475266e-05,
"loss": 0.8403,
"step": 1430
},
{
"epoch": 0.45,
"grad_norm": 0.4375,
"learning_rate": 1.3369864744813025e-05,
"loss": 0.8379,
"step": 1440
},
{
"epoch": 0.45,
"grad_norm": 0.466796875,
"learning_rate": 1.3266604310818525e-05,
"loss": 0.8801,
"step": 1450
},
{
"epoch": 0.46,
"grad_norm": 0.470703125,
"learning_rate": 1.3162952470222488e-05,
"loss": 0.8501,
"step": 1460
},
{
"epoch": 0.46,
"grad_norm": 0.48046875,
"learning_rate": 1.3058921642654235e-05,
"loss": 0.8755,
"step": 1470
},
{
"epoch": 0.46,
"grad_norm": 0.4296875,
"learning_rate": 1.2954524293153546e-05,
"loss": 0.8114,
"step": 1480
},
{
"epoch": 0.47,
"grad_norm": 0.419921875,
"learning_rate": 1.2849772930677087e-05,
"loss": 0.8221,
"step": 1490
},
{
"epoch": 0.47,
"grad_norm": 0.4609375,
"learning_rate": 1.274468010659959e-05,
"loss": 0.8379,
"step": 1500
},
{
"epoch": 0.47,
"grad_norm": 0.419921875,
"learning_rate": 1.2639258413209922e-05,
"loss": 0.8338,
"step": 1510
},
{
"epoch": 0.48,
"grad_norm": 0.416015625,
"learning_rate": 1.2533520482202293e-05,
"loss": 0.8463,
"step": 1520
},
{
"epoch": 0.48,
"grad_norm": 0.482421875,
"learning_rate": 1.2427478983162694e-05,
"loss": 0.8561,
"step": 1530
},
{
"epoch": 0.48,
"grad_norm": 0.43359375,
"learning_rate": 1.2321146622050838e-05,
"loss": 0.8377,
"step": 1540
},
{
"epoch": 0.49,
"grad_norm": 0.41015625,
"learning_rate": 1.2214536139677712e-05,
"loss": 0.8703,
"step": 1550
},
{
"epoch": 0.49,
"grad_norm": 0.44140625,
"learning_rate": 1.2107660310178966e-05,
"loss": 0.8378,
"step": 1560
},
{
"epoch": 0.49,
"grad_norm": 0.47265625,
"learning_rate": 1.2000531939484321e-05,
"loss": 0.8509,
"step": 1570
},
{
"epoch": 0.5,
"grad_norm": 0.4375,
"learning_rate": 1.1893163863783131e-05,
"loss": 0.8289,
"step": 1580
},
{
"epoch": 0.5,
"grad_norm": 0.4140625,
"learning_rate": 1.1785568947986368e-05,
"loss": 0.8373,
"step": 1590
},
{
"epoch": 0.5,
"grad_norm": 0.44140625,
"learning_rate": 1.1677760084185123e-05,
"loss": 0.8202,
"step": 1600
},
{
"epoch": 0.5,
"grad_norm": 0.4375,
"learning_rate": 1.1569750190105871e-05,
"loss": 0.8372,
"step": 1610
},
{
"epoch": 0.51,
"grad_norm": 0.5234375,
"learning_rate": 1.1461552207562665e-05,
"loss": 0.8803,
"step": 1620
},
{
"epoch": 0.51,
"grad_norm": 0.466796875,
"learning_rate": 1.1353179100906438e-05,
"loss": 0.8625,
"step": 1630
},
{
"epoch": 0.51,
"grad_norm": 0.423828125,
"learning_rate": 1.1244643855471603e-05,
"loss": 0.8754,
"step": 1640
},
{
"epoch": 0.52,
"grad_norm": 0.43359375,
"learning_rate": 1.1135959476020144e-05,
"loss": 0.8621,
"step": 1650
},
{
"epoch": 0.52,
"grad_norm": 0.416015625,
"learning_rate": 1.1027138985183381e-05,
"loss": 0.8583,
"step": 1660
},
{
"epoch": 0.52,
"grad_norm": 0.486328125,
"learning_rate": 1.0918195421901583e-05,
"loss": 0.8384,
"step": 1670
},
{
"epoch": 0.53,
"grad_norm": 0.427734375,
"learning_rate": 1.080914183986164e-05,
"loss": 0.8043,
"step": 1680
},
{
"epoch": 0.53,
"grad_norm": 0.435546875,
"learning_rate": 1.0699991305932955e-05,
"loss": 0.8519,
"step": 1690
},
{
"epoch": 0.53,
"grad_norm": 0.43359375,
"learning_rate": 1.0590756898601775e-05,
"loss": 0.8593,
"step": 1700
},
{
"epoch": 0.54,
"grad_norm": 0.43359375,
"learning_rate": 1.0481451706404104e-05,
"loss": 0.8366,
"step": 1710
},
{
"epoch": 0.54,
"grad_norm": 0.4296875,
"learning_rate": 1.0372088826357443e-05,
"loss": 0.8655,
"step": 1720
},
{
"epoch": 0.54,
"grad_norm": 0.5703125,
"learning_rate": 1.0262681362391473e-05,
"loss": 0.8525,
"step": 1730
},
{
"epoch": 0.55,
"grad_norm": 0.40625,
"learning_rate": 1.0153242423777964e-05,
"loss": 0.8553,
"step": 1740
},
{
"epoch": 0.55,
"grad_norm": 0.5234375,
"learning_rate": 1.004378512355999e-05,
"loss": 0.8411,
"step": 1750
},
{
"epoch": 0.55,
"grad_norm": 0.447265625,
"learning_rate": 9.934322576980721e-06,
"loss": 0.8177,
"step": 1760
},
{
"epoch": 0.55,
"grad_norm": 0.5078125,
"learning_rate": 9.824867899911962e-06,
"loss": 0.8661,
"step": 1770
},
{
"epoch": 0.56,
"grad_norm": 0.47265625,
"learning_rate": 9.715434207282574e-06,
"loss": 0.835,
"step": 1780
},
{
"epoch": 0.56,
"grad_norm": 0.408203125,
"learning_rate": 9.606034611507058e-06,
"loss": 0.8702,
"step": 1790
},
{
"epoch": 0.56,
"grad_norm": 0.4453125,
"learning_rate": 9.496682220914403e-06,
"loss": 0.8272,
"step": 1800
},
{
"epoch": 0.57,
"grad_norm": 0.416015625,
"learning_rate": 9.387390138177447e-06,
"loss": 0.8434,
"step": 1810
},
{
"epoch": 0.57,
"grad_norm": 0.478515625,
"learning_rate": 9.278171458742903e-06,
"loss": 0.8639,
"step": 1820
},
{
"epoch": 0.57,
"grad_norm": 0.419921875,
"learning_rate": 9.16903926926225e-06,
"loss": 0.8371,
"step": 1830
},
{
"epoch": 0.58,
"grad_norm": 0.3984375,
"learning_rate": 9.060006646023683e-06,
"loss": 0.8489,
"step": 1840
},
{
"epoch": 0.58,
"grad_norm": 0.412109375,
"learning_rate": 8.951086653385323e-06,
"loss": 0.848,
"step": 1850
},
{
"epoch": 0.58,
"grad_norm": 0.4375,
"learning_rate": 8.842292342209801e-06,
"loss": 0.8177,
"step": 1860
},
{
"epoch": 0.59,
"grad_norm": 0.458984375,
"learning_rate": 8.733636748300524e-06,
"loss": 0.837,
"step": 1870
},
{
"epoch": 0.59,
"grad_norm": 0.474609375,
"learning_rate": 8.625132890839706e-06,
"loss": 0.8526,
"step": 1880
},
{
"epoch": 0.59,
"grad_norm": 0.453125,
"learning_rate": 8.516793770828412e-06,
"loss": 0.871,
"step": 1890
},
{
"epoch": 0.6,
"grad_norm": 0.48046875,
"learning_rate": 8.40863236952875e-06,
"loss": 0.8217,
"step": 1900
},
{
"epoch": 0.6,
"grad_norm": 0.427734375,
"learning_rate": 8.30066164690847e-06,
"loss": 0.8344,
"step": 1910
},
{
"epoch": 0.6,
"grad_norm": 0.46484375,
"learning_rate": 8.192894540088061e-06,
"loss": 0.8646,
"step": 1920
},
{
"epoch": 0.61,
"grad_norm": 0.439453125,
"learning_rate": 8.085343961790666e-06,
"loss": 0.8456,
"step": 1930
},
{
"epoch": 0.61,
"grad_norm": 0.447265625,
"learning_rate": 7.978022798794825e-06,
"loss": 0.8373,
"step": 1940
},
{
"epoch": 0.61,
"grad_norm": 0.435546875,
"learning_rate": 7.870943910390392e-06,
"loss": 0.8718,
"step": 1950
},
{
"epoch": 0.61,
"grad_norm": 0.447265625,
"learning_rate": 7.764120126837731e-06,
"loss": 0.8596,
"step": 1960
},
{
"epoch": 0.62,
"grad_norm": 0.466796875,
"learning_rate": 7.657564247830381e-06,
"loss": 0.8568,
"step": 1970
},
{
"epoch": 0.62,
"grad_norm": 0.439453125,
"learning_rate": 7.551289040961381e-06,
"loss": 0.832,
"step": 1980
},
{
"epoch": 0.62,
"grad_norm": 0.4375,
"learning_rate": 7.445307240193462e-06,
"loss": 0.8485,
"step": 1990
},
{
"epoch": 0.63,
"grad_norm": 0.43359375,
"learning_rate": 7.33963154433325e-06,
"loss": 0.8321,
"step": 2000
},
{
"epoch": 0.63,
"grad_norm": 0.462890625,
"learning_rate": 7.234274615509686e-06,
"loss": 0.8492,
"step": 2010
},
{
"epoch": 0.63,
"grad_norm": 0.4609375,
"learning_rate": 7.129249077656844e-06,
"loss": 0.8391,
"step": 2020
},
{
"epoch": 0.64,
"grad_norm": 0.455078125,
"learning_rate": 7.02456751500131e-06,
"loss": 0.8256,
"step": 2030
},
{
"epoch": 0.64,
"grad_norm": 0.455078125,
"learning_rate": 6.920242470554366e-06,
"loss": 0.8354,
"step": 2040
},
{
"epoch": 0.64,
"grad_norm": 0.41015625,
"learning_rate": 6.816286444609037e-06,
"loss": 0.8848,
"step": 2050
},
{
"epoch": 0.65,
"grad_norm": 0.412109375,
"learning_rate": 6.712711893242325e-06,
"loss": 0.8308,
"step": 2060
},
{
"epoch": 0.65,
"grad_norm": 0.421875,
"learning_rate": 6.6095312268226955e-06,
"loss": 0.8383,
"step": 2070
},
{
"epoch": 0.65,
"grad_norm": 0.44921875,
"learning_rate": 6.5067568085230896e-06,
"loss": 0.8878,
"step": 2080
},
{
"epoch": 0.66,
"grad_norm": 0.4609375,
"learning_rate": 6.404400952839522e-06,
"loss": 0.8623,
"step": 2090
},
{
"epoch": 0.66,
"grad_norm": 0.45703125,
"learning_rate": 6.302475924115581e-06,
"loss": 0.872,
"step": 2100
},
{
"epoch": 0.66,
"grad_norm": 0.44921875,
"learning_rate": 6.2009939350728865e-06,
"loss": 0.8555,
"step": 2110
},
{
"epoch": 0.66,
"grad_norm": 0.458984375,
"learning_rate": 6.09996714534777e-06,
"loss": 0.8631,
"step": 2120
},
{
"epoch": 0.67,
"grad_norm": 0.478515625,
"learning_rate": 5.999407660034289e-06,
"loss": 0.8645,
"step": 2130
},
{
"epoch": 0.67,
"grad_norm": 0.4375,
"learning_rate": 5.899327528233787e-06,
"loss": 0.8597,
"step": 2140
},
{
"epoch": 0.67,
"grad_norm": 0.478515625,
"learning_rate": 5.7997387416111685e-06,
"loss": 0.8945,
"step": 2150
},
{
"epoch": 0.68,
"grad_norm": 0.412109375,
"learning_rate": 5.700653232958047e-06,
"loss": 0.8599,
"step": 2160
},
{
"epoch": 0.68,
"grad_norm": 0.4375,
"learning_rate": 5.602082874762952e-06,
"loss": 0.8354,
"step": 2170
},
{
"epoch": 0.68,
"grad_norm": 0.48828125,
"learning_rate": 5.50403947778875e-06,
"loss": 0.8688,
"step": 2180
},
{
"epoch": 0.69,
"grad_norm": 0.470703125,
"learning_rate": 5.40653478965749e-06,
"loss": 0.8757,
"step": 2190
},
{
"epoch": 0.69,
"grad_norm": 0.5,
"learning_rate": 5.309580493442784e-06,
"loss": 0.8201,
"step": 2200
},
{
"epoch": 0.69,
"grad_norm": 0.453125,
"learning_rate": 5.213188206269926e-06,
"loss": 0.8627,
"step": 2210
},
{
"epoch": 0.7,
"grad_norm": 0.427734375,
"learning_rate": 5.1173694779239415e-06,
"loss": 0.8582,
"step": 2220
},
{
"epoch": 0.7,
"grad_norm": 0.408203125,
"learning_rate": 5.0221357894656605e-06,
"loss": 0.8264,
"step": 2230
},
{
"epoch": 0.7,
"grad_norm": 0.4765625,
"learning_rate": 4.927498551856077e-06,
"loss": 0.8992,
"step": 2240
},
{
"epoch": 0.71,
"grad_norm": 0.470703125,
"learning_rate": 4.83346910458906e-06,
"loss": 0.8369,
"step": 2250
},
{
"epoch": 0.71,
"grad_norm": 0.41015625,
"learning_rate": 4.740058714332647e-06,
"loss": 0.8433,
"step": 2260
},
{
"epoch": 0.71,
"grad_norm": 0.423828125,
"learning_rate": 4.64727857357908e-06,
"loss": 0.8378,
"step": 2270
},
{
"epoch": 0.71,
"grad_norm": 0.458984375,
"learning_rate": 4.555139799303706e-06,
"loss": 0.8623,
"step": 2280
},
{
"epoch": 0.72,
"grad_norm": 0.4375,
"learning_rate": 4.463653431632926e-06,
"loss": 0.8542,
"step": 2290
},
{
"epoch": 0.72,
"grad_norm": 0.447265625,
"learning_rate": 4.372830432521377e-06,
"loss": 0.8069,
"step": 2300
},
{
"epoch": 0.72,
"grad_norm": 0.44140625,
"learning_rate": 4.282681684438439e-06,
"loss": 0.841,
"step": 2310
},
{
"epoch": 0.73,
"grad_norm": 0.439453125,
"learning_rate": 4.193217989064332e-06,
"loss": 0.862,
"step": 2320
},
{
"epoch": 0.73,
"grad_norm": 0.4296875,
"learning_rate": 4.104450065995799e-06,
"loss": 0.8575,
"step": 2330
},
{
"epoch": 0.73,
"grad_norm": 0.478515625,
"learning_rate": 4.0163885514617175e-06,
"loss": 0.8578,
"step": 2340
},
{
"epoch": 0.74,
"grad_norm": 0.435546875,
"learning_rate": 3.929043997048647e-06,
"loss": 0.8467,
"step": 2350
},
{
"epoch": 0.74,
"grad_norm": 0.44140625,
"learning_rate": 3.8424268684365204e-06,
"loss": 0.8407,
"step": 2360
},
{
"epoch": 0.74,
"grad_norm": 0.466796875,
"learning_rate": 3.756547544144664e-06,
"loss": 0.851,
"step": 2370
},
{
"epoch": 0.75,
"grad_norm": 0.4140625,
"learning_rate": 3.671416314288204e-06,
"loss": 0.8157,
"step": 2380
},
{
"epoch": 0.75,
"grad_norm": 0.453125,
"learning_rate": 3.587043379345134e-06,
"loss": 0.8471,
"step": 2390
},
{
"epoch": 0.75,
"grad_norm": 0.455078125,
"learning_rate": 3.503438848934063e-06,
"loss": 0.8575,
"step": 2400
},
{
"epoch": 0.76,
"grad_norm": 0.46875,
"learning_rate": 3.4206127406028744e-06,
"loss": 0.8357,
"step": 2410
},
{
"epoch": 0.76,
"grad_norm": 0.447265625,
"learning_rate": 3.338574978628436e-06,
"loss": 0.8485,
"step": 2420
},
{
"epoch": 0.76,
"grad_norm": 0.478515625,
"learning_rate": 3.257335392827451e-06,
"loss": 0.8503,
"step": 2430
},
{
"epoch": 0.77,
"grad_norm": 0.4453125,
"learning_rate": 3.1769037173786376e-06,
"loss": 0.848,
"step": 2440
},
{
"epoch": 0.77,
"grad_norm": 0.5390625,
"learning_rate": 3.0972895896564004e-06,
"loss": 0.847,
"step": 2450
},
{
"epoch": 0.77,
"grad_norm": 0.4140625,
"learning_rate": 3.0185025490760346e-06,
"loss": 0.8655,
"step": 2460
},
{
"epoch": 0.77,
"grad_norm": 0.44140625,
"learning_rate": 2.9405520359507543e-06,
"loss": 0.8491,
"step": 2470
},
{
"epoch": 0.78,
"grad_norm": 0.408203125,
"learning_rate": 2.8634473903605008e-06,
"loss": 0.8385,
"step": 2480
},
{
"epoch": 0.78,
"grad_norm": 0.453125,
"learning_rate": 2.787197851032848e-06,
"loss": 0.8366,
"step": 2490
},
{
"epoch": 0.78,
"grad_norm": 0.41796875,
"learning_rate": 2.7118125542359775e-06,
"loss": 0.8262,
"step": 2500
},
{
"epoch": 0.79,
"grad_norm": 0.43359375,
"learning_rate": 2.6373005326839973e-06,
"loss": 0.855,
"step": 2510
},
{
"epoch": 0.79,
"grad_norm": 0.44921875,
"learning_rate": 2.563670714454617e-06,
"loss": 0.808,
"step": 2520
},
{
"epoch": 0.79,
"grad_norm": 0.474609375,
"learning_rate": 2.4909319219193774e-06,
"loss": 0.8427,
"step": 2530
},
{
"epoch": 0.8,
"grad_norm": 0.455078125,
"learning_rate": 2.4190928706865634e-06,
"loss": 0.8568,
"step": 2540
},
{
"epoch": 0.8,
"grad_norm": 0.419921875,
"learning_rate": 2.3481621685568867e-06,
"loss": 0.8825,
"step": 2550
},
{
"epoch": 0.8,
"grad_norm": 0.494140625,
"learning_rate": 2.2781483144920833e-06,
"loss": 0.8631,
"step": 2560
},
{
"epoch": 0.81,
"grad_norm": 0.453125,
"learning_rate": 2.209059697596585e-06,
"loss": 0.8518,
"step": 2570
},
{
"epoch": 0.81,
"grad_norm": 0.43359375,
"learning_rate": 2.1409045961123067e-06,
"loss": 0.8221,
"step": 2580
},
{
"epoch": 0.81,
"grad_norm": 0.435546875,
"learning_rate": 2.073691176426761e-06,
"loss": 0.8804,
"step": 2590
},
{
"epoch": 0.82,
"grad_norm": 0.4140625,
"learning_rate": 2.0074274920945537e-06,
"loss": 0.8412,
"step": 2600
},
{
"epoch": 0.82,
"grad_norm": 0.462890625,
"learning_rate": 1.9421214828723857e-06,
"loss": 0.848,
"step": 2610
},
{
"epoch": 0.82,
"grad_norm": 0.39453125,
"learning_rate": 1.8777809737677299e-06,
"loss": 0.8372,
"step": 2620
},
{
"epoch": 0.82,
"grad_norm": 0.453125,
"learning_rate": 1.8144136741012209e-06,
"loss": 0.8216,
"step": 2630
},
{
"epoch": 0.83,
"grad_norm": 0.462890625,
"learning_rate": 1.7520271765829112e-06,
"loss": 0.8504,
"step": 2640
},
{
"epoch": 0.83,
"grad_norm": 0.4453125,
"learning_rate": 1.690628956402528e-06,
"loss": 0.8346,
"step": 2650
},
{
"epoch": 0.83,
"grad_norm": 0.408203125,
"learning_rate": 1.6302263703337774e-06,
"loss": 0.8484,
"step": 2660
},
{
"epoch": 0.84,
"grad_norm": 0.416015625,
"learning_rate": 1.5708266558528562e-06,
"loss": 0.8687,
"step": 2670
},
{
"epoch": 0.84,
"grad_norm": 0.44921875,
"learning_rate": 1.512436930271244e-06,
"loss": 0.8391,
"step": 2680
},
{
"epoch": 0.84,
"grad_norm": 0.451171875,
"learning_rate": 1.4550641898829165e-06,
"loss": 0.8207,
"step": 2690
},
{
"epoch": 0.85,
"grad_norm": 0.421875,
"learning_rate": 1.3987153091260398e-06,
"loss": 0.8343,
"step": 2700
},
{
"epoch": 0.85,
"grad_norm": 0.44921875,
"learning_rate": 1.3433970397592599e-06,
"loss": 0.8498,
"step": 2710
},
{
"epoch": 0.85,
"grad_norm": 0.458984375,
"learning_rate": 1.2891160100527222e-06,
"loss": 0.8372,
"step": 2720
},
{
"epoch": 0.86,
"grad_norm": 0.451171875,
"learning_rate": 1.2358787239938497e-06,
"loss": 0.8286,
"step": 2730
},
{
"epoch": 0.86,
"grad_norm": 0.455078125,
"learning_rate": 1.1836915605080445e-06,
"loss": 0.8899,
"step": 2740
},
{
"epoch": 0.86,
"grad_norm": 0.50390625,
"learning_rate": 1.1325607726943567e-06,
"loss": 0.8343,
"step": 2750
},
{
"epoch": 0.87,
"grad_norm": 0.41796875,
"learning_rate": 1.0824924870762243e-06,
"loss": 0.8134,
"step": 2760
},
{
"epoch": 0.87,
"grad_norm": 0.4453125,
"learning_rate": 1.033492702867407e-06,
"loss": 0.8391,
"step": 2770
},
{
"epoch": 0.87,
"grad_norm": 0.45703125,
"learning_rate": 9.855672912531455e-07,
"loss": 0.8719,
"step": 2780
},
{
"epoch": 0.87,
"grad_norm": 0.439453125,
"learning_rate": 9.387219946866699e-07,
"loss": 0.8157,
"step": 2790
},
{
"epoch": 0.88,
"grad_norm": 0.42578125,
"learning_rate": 8.929624262011472e-07,
"loss": 0.8486,
"step": 2800
},
{
"epoch": 0.88,
"grad_norm": 0.46484375,
"learning_rate": 8.482940687371067e-07,
"loss": 0.8514,
"step": 2810
},
{
"epoch": 0.88,
"grad_norm": 0.43359375,
"learning_rate": 8.047222744854943e-07,
"loss": 0.8384,
"step": 2820
},
{
"epoch": 0.89,
"grad_norm": 0.50390625,
"learning_rate": 7.622522642463425e-07,
"loss": 0.848,
"step": 2830
},
{
"epoch": 0.89,
"grad_norm": 0.462890625,
"learning_rate": 7.208891268032336e-07,
"loss": 0.8472,
"step": 2840
},
{
"epoch": 0.89,
"grad_norm": 0.4296875,
"learning_rate": 6.80637818313541e-07,
"loss": 0.8309,
"step": 2850
},
{
"epoch": 0.9,
"grad_norm": 0.46484375,
"learning_rate": 6.415031617145951e-07,
"loss": 0.8309,
"step": 2860
},
{
"epoch": 0.9,
"grad_norm": 0.453125,
"learning_rate": 6.034898461457861e-07,
"loss": 0.8478,
"step": 2870
},
{
"epoch": 0.9,
"grad_norm": 0.470703125,
"learning_rate": 5.666024263867042e-07,
"loss": 0.8276,
"step": 2880
},
{
"epoch": 0.91,
"grad_norm": 0.4453125,
"learning_rate": 5.308453223113962e-07,
"loss": 0.8676,
"step": 2890
},
{
"epoch": 0.91,
"grad_norm": 0.44140625,
"learning_rate": 4.962228183587669e-07,
"loss": 0.8098,
"step": 2900
},
{
"epoch": 0.91,
"grad_norm": 0.416015625,
"learning_rate": 4.6273906301920744e-07,
"loss": 0.8382,
"step": 2910
},
{
"epoch": 0.92,
"grad_norm": 0.4765625,
"learning_rate": 4.303980683375353e-07,
"loss": 0.8405,
"step": 2920
},
{
"epoch": 0.92,
"grad_norm": 0.431640625,
"learning_rate": 3.992037094322532e-07,
"loss": 0.8273,
"step": 2930
},
{
"epoch": 0.92,
"grad_norm": 0.984375,
"learning_rate": 3.691597240312439e-07,
"loss": 0.8326,
"step": 2940
},
{
"epoch": 0.92,
"grad_norm": 0.439453125,
"learning_rate": 3.4026971202390404e-07,
"loss": 0.8732,
"step": 2950
},
{
"epoch": 0.93,
"grad_norm": 0.470703125,
"learning_rate": 3.1253713502980566e-07,
"loss": 0.8436,
"step": 2960
},
{
"epoch": 0.93,
"grad_norm": 0.404296875,
"learning_rate": 2.8596531598392264e-07,
"loss": 0.8522,
"step": 2970
},
{
"epoch": 0.93,
"grad_norm": 0.44140625,
"learning_rate": 2.605574387384779e-07,
"loss": 0.8684,
"step": 2980
},
{
"epoch": 0.94,
"grad_norm": 0.42578125,
"learning_rate": 2.363165476814455e-07,
"loss": 0.8315,
"step": 2990
},
{
"epoch": 0.94,
"grad_norm": 0.416015625,
"learning_rate": 2.132455473717765e-07,
"loss": 0.8442,
"step": 3000
},
{
"epoch": 0.94,
"grad_norm": 0.451171875,
"learning_rate": 1.913472021913665e-07,
"loss": 0.8329,
"step": 3010
},
{
"epoch": 0.95,
"grad_norm": 0.416015625,
"learning_rate": 1.7062413601383498e-07,
"loss": 0.8163,
"step": 3020
},
{
"epoch": 0.95,
"grad_norm": 0.419921875,
"learning_rate": 1.5107883189012018e-07,
"loss": 0.8183,
"step": 3030
},
{
"epoch": 0.95,
"grad_norm": 0.416015625,
"learning_rate": 1.3271363175096696e-07,
"loss": 0.8535,
"step": 3040
},
{
"epoch": 0.96,
"grad_norm": 0.482421875,
"learning_rate": 1.1553073612631138e-07,
"loss": 0.8443,
"step": 3050
},
{
"epoch": 0.96,
"grad_norm": 0.443359375,
"learning_rate": 9.953220388160934e-08,
"loss": 0.8592,
"step": 3060
},
{
"epoch": 0.96,
"grad_norm": 0.451171875,
"learning_rate": 8.471995197114836e-08,
"loss": 0.866,
"step": 3070
},
{
"epoch": 0.97,
"grad_norm": 0.41796875,
"learning_rate": 7.109575520835244e-08,
"loss": 0.8496,
"step": 3080
},
{
"epoch": 0.97,
"grad_norm": 0.43359375,
"learning_rate": 5.866124605312329e-08,
"loss": 0.8474,
"step": 3090
},
{
"epoch": 0.97,
"grad_norm": 0.466796875,
"learning_rate": 4.7417914416239e-08,
"loss": 0.8686,
"step": 3100
},
{
"epoch": 0.98,
"grad_norm": 0.46875,
"learning_rate": 3.7367107480832385e-08,
"loss": 0.8247,
"step": 3110
},
{
"epoch": 0.98,
"grad_norm": 0.427734375,
"learning_rate": 2.8510029540967933e-08,
"loss": 0.8272,
"step": 3120
},
{
"epoch": 0.98,
"grad_norm": 0.45703125,
"learning_rate": 2.084774185734495e-08,
"loss": 0.854,
"step": 3130
},
{
"epoch": 0.98,
"grad_norm": 0.423828125,
"learning_rate": 1.4381162530135995e-08,
"loss": 0.8252,
"step": 3140
},
{
"epoch": 0.99,
"grad_norm": 0.470703125,
"learning_rate": 9.111066388981515e-09,
"loss": 0.8625,
"step": 3150
},
{
"epoch": 0.99,
"grad_norm": 0.45703125,
"learning_rate": 5.0380849001430145e-09,
"loss": 0.867,
"step": 3160
},
{
"epoch": 0.99,
"grad_norm": 0.443359375,
"learning_rate": 2.1627060908491204e-09,
"loss": 0.8625,
"step": 3170
},
{
"epoch": 1.0,
"grad_norm": 0.419921875,
"learning_rate": 4.85274490813481e-10,
"loss": 0.8536,
"step": 3180
}
],
"logging_steps": 10,
"max_steps": 3189,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 9.354140440917443e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}