{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.601358890533447, "learning_rate": 0.001, "loss": 10.5249, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.328374981880188, "learning_rate": 0.001, "loss": 8.6782, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.0325572490692139, "learning_rate": 0.001, "loss": 7.3989, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.6852174997329712, "learning_rate": 0.001, "loss": 7.3653, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.656384289264679, "learning_rate": 0.001, "loss": 7.3625, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.7547395825386047, "learning_rate": 0.001, "loss": 7.3089, "step": 25 }, { "epoch": 0.0, "grad_norm": 0.5359553694725037, "learning_rate": 0.001, "loss": 7.3326, "step": 30 }, { "epoch": 0.0, "grad_norm": 0.654456615447998, "learning_rate": 0.001, "loss": 7.279, "step": 35 }, { "epoch": 0.0, "grad_norm": 8.3489408493042, "learning_rate": 0.001, "loss": 7.1401, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.7769318222999573, "learning_rate": 0.001, "loss": 7.0276, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.961713969707489, "learning_rate": 0.001, "loss": 6.8993, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.5359512567520142, "learning_rate": 0.001, "loss": 6.7672, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.7930881381034851, "learning_rate": 0.001, "loss": 6.6585, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.6703987717628479, "learning_rate": 0.001, "loss": 6.6558, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.6621621251106262, "learning_rate": 0.001, "loss": 6.3967, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.8039225935935974, "learning_rate": 0.001, "loss": 6.3318, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.5961666703224182, "learning_rate": 0.001, "loss": 6.3381, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.5145270228385925, "learning_rate": 0.001, "loss": 6.1274, "step": 85 }, { "epoch": 0.01, "grad_norm": 0.7134149074554443, "learning_rate": 0.001, "loss": 6.1067, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.5910749435424805, "learning_rate": 0.001, "loss": 6.0012, "step": 95 }, { "epoch": 0.01, "grad_norm": 0.6277782917022705, "learning_rate": 0.001, "loss": 5.9658, "step": 100 }, { "epoch": 0.01, "grad_norm": 0.44600120186805725, "learning_rate": 0.001, "loss": 5.9172, "step": 105 }, { "epoch": 0.01, "grad_norm": 0.4796374440193176, "learning_rate": 0.001, "loss": 5.7681, "step": 110 }, { "epoch": 0.01, "grad_norm": 0.5527931451797485, "learning_rate": 0.001, "loss": 5.7746, "step": 115 }, { "epoch": 0.01, "grad_norm": 0.7416689991950989, "learning_rate": 0.001, "loss": 5.6928, "step": 120 }, { "epoch": 0.01, "grad_norm": 0.5323288440704346, "learning_rate": 0.001, "loss": 5.6177, "step": 125 }, { "epoch": 0.01, "grad_norm": 0.5754017233848572, "learning_rate": 0.001, "loss": 5.6344, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.690061092376709, "learning_rate": 0.001, "loss": 5.568, "step": 135 }, { "epoch": 0.02, "grad_norm": 0.6237902045249939, "learning_rate": 0.001, "loss": 5.4123, "step": 140 }, { "epoch": 0.02, "grad_norm": 0.702131450176239, "learning_rate": 0.001, "loss": 5.4907, "step": 145 }, { "epoch": 0.02, "grad_norm": 0.6115543842315674, "learning_rate": 0.001, "loss": 5.4457, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.5676571726799011, "learning_rate": 0.001, "loss": 5.3547, "step": 155 }, { "epoch": 0.02, "grad_norm": 0.6142025589942932, "learning_rate": 0.001, "loss": 5.3421, "step": 160 }, { "epoch": 0.02, "grad_norm": 0.5987421274185181, "learning_rate": 0.001, "loss": 5.3941, "step": 165 }, { "epoch": 0.02, "grad_norm": 0.6259231567382812, "learning_rate": 0.001, "loss": 5.231, "step": 170 }, { "epoch": 0.02, "grad_norm": 0.6100727319717407, "learning_rate": 0.001, "loss": 5.1879, "step": 175 }, { "epoch": 0.02, "grad_norm": 1.064252495765686, "learning_rate": 0.001, "loss": 5.1959, "step": 180 }, { "epoch": 0.02, "grad_norm": 0.7423781156539917, "learning_rate": 0.001, "loss": 5.1457, "step": 185 }, { "epoch": 0.02, "grad_norm": 0.788051426410675, "learning_rate": 0.001, "loss": 5.2344, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.6291590332984924, "learning_rate": 0.001, "loss": 5.1488, "step": 195 }, { "epoch": 0.02, "grad_norm": 0.6599926948547363, "learning_rate": 0.001, "loss": 5.0759, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.5811977386474609, "learning_rate": 0.001, "loss": 5.0278, "step": 205 }, { "epoch": 0.02, "grad_norm": 0.5062569379806519, "learning_rate": 0.001, "loss": 5.0043, "step": 210 }, { "epoch": 0.02, "grad_norm": 0.47193270921707153, "learning_rate": 0.001, "loss": 4.8949, "step": 215 }, { "epoch": 0.02, "grad_norm": 0.7771720290184021, "learning_rate": 0.001, "loss": 4.951, "step": 220 }, { "epoch": 0.03, "grad_norm": 0.5288494229316711, "learning_rate": 0.001, "loss": 4.9402, "step": 225 }, { "epoch": 0.03, "grad_norm": 0.5217804312705994, "learning_rate": 0.001, "loss": 4.8737, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.594836413860321, "learning_rate": 0.001, "loss": 4.8091, "step": 235 }, { "epoch": 0.03, "grad_norm": 0.6213844418525696, "learning_rate": 0.001, "loss": 4.83, "step": 240 }, { "epoch": 0.03, "grad_norm": 0.595021665096283, "learning_rate": 0.001, "loss": 4.8749, "step": 245 }, { "epoch": 0.03, "grad_norm": 0.5776046514511108, "learning_rate": 0.001, "loss": 4.7623, "step": 250 }, { "epoch": 0.03, "grad_norm": 0.5899957418441772, "learning_rate": 0.001, "loss": 4.7488, "step": 255 }, { "epoch": 0.03, "grad_norm": 0.598554253578186, "learning_rate": 0.001, "loss": 4.7859, "step": 260 }, { "epoch": 0.03, "grad_norm": 0.6169301271438599, "learning_rate": 0.001, "loss": 4.7067, "step": 265 }, { "epoch": 0.03, "grad_norm": 0.7676901817321777, "learning_rate": 0.001, "loss": 4.7561, "step": 270 }, { "epoch": 0.03, "grad_norm": 0.5868487358093262, "learning_rate": 0.001, "loss": 4.708, "step": 275 }, { "epoch": 0.03, "grad_norm": 0.4976160526275635, "learning_rate": 0.001, "loss": 4.6998, "step": 280 }, { "epoch": 0.03, "grad_norm": 0.5182490348815918, "learning_rate": 0.001, "loss": 4.5554, "step": 285 }, { "epoch": 0.03, "grad_norm": 0.6468132138252258, "learning_rate": 0.001, "loss": 4.6289, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.7456973195075989, "learning_rate": 0.001, "loss": 4.5579, "step": 295 }, { "epoch": 0.03, "grad_norm": 0.6524637937545776, "learning_rate": 0.001, "loss": 4.5488, "step": 300 }, { "epoch": 0.03, "grad_norm": 0.6689502596855164, "learning_rate": 0.001, "loss": 4.5551, "step": 305 }, { "epoch": 0.03, "grad_norm": 0.7009299397468567, "learning_rate": 0.001, "loss": 4.5232, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.5543584227561951, "learning_rate": 0.001, "loss": 4.5442, "step": 315 }, { "epoch": 0.04, "grad_norm": 0.5917452573776245, "learning_rate": 0.001, "loss": 4.5228, "step": 320 }, { "epoch": 0.04, "grad_norm": 0.5315392017364502, "learning_rate": 0.001, "loss": 4.5275, "step": 325 }, { "epoch": 0.04, "grad_norm": 0.603313684463501, "learning_rate": 0.001, "loss": 4.404, "step": 330 }, { "epoch": 0.04, "grad_norm": 0.6234457492828369, "learning_rate": 0.001, "loss": 4.4923, "step": 335 }, { "epoch": 0.04, "grad_norm": 0.5459450483322144, "learning_rate": 0.001, "loss": 4.4246, "step": 340 }, { "epoch": 0.04, "grad_norm": 0.5857668519020081, "learning_rate": 0.001, "loss": 4.4323, "step": 345 }, { "epoch": 0.04, "grad_norm": 0.723681628704071, "learning_rate": 0.001, "loss": 4.4539, "step": 350 }, { "epoch": 0.04, "grad_norm": 0.5971282124519348, "learning_rate": 0.001, "loss": 4.4693, "step": 355 }, { "epoch": 0.04, "grad_norm": 0.6778796315193176, "learning_rate": 0.001, "loss": 4.3177, "step": 360 }, { "epoch": 0.04, "grad_norm": 0.7114105820655823, "learning_rate": 0.001, "loss": 4.3722, "step": 365 }, { "epoch": 0.04, "grad_norm": 0.6328292489051819, "learning_rate": 0.001, "loss": 4.4167, "step": 370 }, { "epoch": 0.04, "grad_norm": 0.5529919266700745, "learning_rate": 0.001, "loss": 4.2594, "step": 375 }, { "epoch": 0.04, "grad_norm": 0.5717706680297852, "learning_rate": 0.001, "loss": 4.2347, "step": 380 }, { "epoch": 0.04, "grad_norm": 0.650095522403717, "learning_rate": 0.001, "loss": 4.401, "step": 385 }, { "epoch": 0.04, "grad_norm": 0.5702147483825684, "learning_rate": 0.001, "loss": 4.3214, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.5305518507957458, "learning_rate": 0.001, "loss": 4.3342, "step": 395 }, { "epoch": 0.04, "grad_norm": 0.7027683258056641, "learning_rate": 0.001, "loss": 4.2095, "step": 400 }, { "epoch": 0.05, "grad_norm": 0.6951459050178528, "learning_rate": 0.001, "loss": 4.2882, "step": 405 }, { "epoch": 0.05, "grad_norm": 0.7413341403007507, "learning_rate": 0.001, "loss": 4.2696, "step": 410 }, { "epoch": 0.05, "grad_norm": 0.5163285136222839, "learning_rate": 0.001, "loss": 4.1847, "step": 415 }, { "epoch": 0.05, "grad_norm": 0.6258194446563721, "learning_rate": 0.001, "loss": 4.1892, "step": 420 }, { "epoch": 0.05, "grad_norm": 0.50654536485672, "learning_rate": 0.001, "loss": 4.1689, "step": 425 }, { "epoch": 0.05, "grad_norm": 0.5324122905731201, "learning_rate": 0.001, "loss": 4.2116, "step": 430 }, { "epoch": 0.05, "grad_norm": 0.5191783905029297, "learning_rate": 0.001, "loss": 4.2668, "step": 435 }, { "epoch": 0.05, "grad_norm": 0.5304970145225525, "learning_rate": 0.001, "loss": 4.1726, "step": 440 }, { "epoch": 0.05, "grad_norm": 0.7103253602981567, "learning_rate": 0.001, "loss": 4.1596, "step": 445 }, { "epoch": 0.05, "grad_norm": 0.8040492534637451, "learning_rate": 0.001, "loss": 4.1499, "step": 450 }, { "epoch": 0.05, "grad_norm": 0.6443156599998474, "learning_rate": 0.001, "loss": 4.203, "step": 455 }, { "epoch": 0.05, "grad_norm": 0.5980875492095947, "learning_rate": 0.001, "loss": 4.0761, "step": 460 }, { "epoch": 0.05, "grad_norm": 0.5467867255210876, "learning_rate": 0.001, "loss": 4.0663, "step": 465 }, { "epoch": 0.05, "grad_norm": 0.5479826331138611, "learning_rate": 0.001, "loss": 4.0833, "step": 470 }, { "epoch": 0.05, "grad_norm": 0.5548881888389587, "learning_rate": 0.001, "loss": 4.0685, "step": 475 }, { "epoch": 0.05, "grad_norm": 0.5504749417304993, "learning_rate": 0.001, "loss": 4.0026, "step": 480 }, { "epoch": 0.05, "grad_norm": 0.586014449596405, "learning_rate": 0.001, "loss": 4.0989, "step": 485 }, { "epoch": 0.05, "grad_norm": 0.6106052398681641, "learning_rate": 0.001, "loss": 4.0259, "step": 490 }, { "epoch": 0.06, "grad_norm": 0.5603764653205872, "learning_rate": 0.001, "loss": 4.043, "step": 495 }, { "epoch": 0.06, "grad_norm": 0.5788707733154297, "learning_rate": 0.001, "loss": 3.9545, "step": 500 }, { "epoch": 0.06, "grad_norm": 0.6436803340911865, "learning_rate": 0.001, "loss": 3.9257, "step": 505 }, { "epoch": 0.06, "grad_norm": 0.5277396440505981, "learning_rate": 0.001, "loss": 4.0583, "step": 510 }, { "epoch": 0.06, "grad_norm": 0.6784444451332092, "learning_rate": 0.001, "loss": 4.0143, "step": 515 }, { "epoch": 0.06, "grad_norm": 0.49947619438171387, "learning_rate": 0.001, "loss": 4.044, "step": 520 }, { "epoch": 0.06, "grad_norm": 0.6773266196250916, "learning_rate": 0.001, "loss": 3.9882, "step": 525 }, { "epoch": 0.06, "grad_norm": 0.6483107209205627, "learning_rate": 0.001, "loss": 4.0046, "step": 530 }, { "epoch": 0.06, "grad_norm": 0.5643370747566223, "learning_rate": 0.001, "loss": 4.017, "step": 535 }, { "epoch": 0.06, "grad_norm": 0.5986424088478088, "learning_rate": 0.001, "loss": 4.0109, "step": 540 }, { "epoch": 0.06, "grad_norm": 0.622429609298706, "learning_rate": 0.001, "loss": 3.9896, "step": 545 }, { "epoch": 0.06, "grad_norm": 0.5016334652900696, "learning_rate": 0.001, "loss": 3.9602, "step": 550 }, { "epoch": 0.06, "grad_norm": 0.5102951526641846, "learning_rate": 0.001, "loss": 4.0103, "step": 555 }, { "epoch": 0.06, "grad_norm": 0.627109169960022, "learning_rate": 0.001, "loss": 3.9348, "step": 560 }, { "epoch": 0.06, "grad_norm": 0.4738805592060089, "learning_rate": 0.001, "loss": 4.0031, "step": 565 }, { "epoch": 0.06, "grad_norm": 0.481922447681427, "learning_rate": 0.001, "loss": 3.7813, "step": 570 }, { "epoch": 0.06, "grad_norm": 0.5989673733711243, "learning_rate": 0.001, "loss": 3.8269, "step": 575 }, { "epoch": 0.06, "grad_norm": 0.5524454116821289, "learning_rate": 0.001, "loss": 3.8124, "step": 580 }, { "epoch": 0.07, "grad_norm": 0.5364890098571777, "learning_rate": 0.001, "loss": 3.964, "step": 585 }, { "epoch": 0.07, "grad_norm": 0.5401674509048462, "learning_rate": 0.001, "loss": 3.857, "step": 590 }, { "epoch": 0.07, "grad_norm": 0.5142294764518738, "learning_rate": 0.001, "loss": 3.8363, "step": 595 }, { "epoch": 0.07, "grad_norm": 0.5484086275100708, "learning_rate": 0.001, "loss": 3.8658, "step": 600 }, { "epoch": 0.07, "grad_norm": 0.5811066627502441, "learning_rate": 0.001, "loss": 3.8457, "step": 605 }, { "epoch": 0.07, "grad_norm": 0.5071974396705627, "learning_rate": 0.001, "loss": 3.8835, "step": 610 }, { "epoch": 0.07, "grad_norm": 0.5145882964134216, "learning_rate": 0.001, "loss": 3.897, "step": 615 }, { "epoch": 0.07, "grad_norm": 0.6926450729370117, "learning_rate": 0.001, "loss": 3.8504, "step": 620 }, { "epoch": 0.07, "grad_norm": 0.5874271988868713, "learning_rate": 0.001, "loss": 3.8077, "step": 625 }, { "epoch": 0.07, "grad_norm": 0.5802385210990906, "learning_rate": 0.001, "loss": 3.7109, "step": 630 }, { "epoch": 0.07, "grad_norm": 0.6140554547309875, "learning_rate": 0.001, "loss": 3.7658, "step": 635 }, { "epoch": 0.07, "grad_norm": 0.5724272727966309, "learning_rate": 0.001, "loss": 3.8777, "step": 640 }, { "epoch": 0.07, "grad_norm": 0.5731592178344727, "learning_rate": 0.001, "loss": 3.7584, "step": 645 }, { "epoch": 0.07, "grad_norm": 0.5535034537315369, "learning_rate": 0.001, "loss": 3.724, "step": 650 }, { "epoch": 0.07, "grad_norm": 0.5416511297225952, "learning_rate": 0.001, "loss": 3.77, "step": 655 }, { "epoch": 0.07, "grad_norm": 0.5728431344032288, "learning_rate": 0.001, "loss": 3.7719, "step": 660 }, { "epoch": 0.07, "grad_norm": 0.5236804485321045, "learning_rate": 0.001, "loss": 3.6926, "step": 665 }, { "epoch": 0.07, "grad_norm": 0.45392197370529175, "learning_rate": 0.001, "loss": 3.7736, "step": 670 }, { "epoch": 0.08, "grad_norm": 0.7863075733184814, "learning_rate": 0.001, "loss": 3.7701, "step": 675 }, { "epoch": 0.08, "grad_norm": 0.5810198187828064, "learning_rate": 0.001, "loss": 3.8168, "step": 680 }, { "epoch": 0.08, "grad_norm": 0.5719917416572571, "learning_rate": 0.001, "loss": 3.752, "step": 685 }, { "epoch": 0.08, "grad_norm": 0.5634996294975281, "learning_rate": 0.001, "loss": 3.7123, "step": 690 }, { "epoch": 0.08, "grad_norm": 0.49231231212615967, "learning_rate": 0.001, "loss": 3.8302, "step": 695 }, { "epoch": 0.08, "grad_norm": 0.47931423783302307, "learning_rate": 0.001, "loss": 3.6192, "step": 700 }, { "epoch": 0.08, "grad_norm": 0.6042882204055786, "learning_rate": 0.001, "loss": 3.8259, "step": 705 }, { "epoch": 0.08, "grad_norm": 0.5771067142486572, "learning_rate": 0.001, "loss": 3.7338, "step": 710 }, { "epoch": 0.08, "grad_norm": 0.5514395236968994, "learning_rate": 0.001, "loss": 3.6775, "step": 715 }, { "epoch": 0.08, "grad_norm": 0.5697659850120544, "learning_rate": 0.001, "loss": 3.7105, "step": 720 }, { "epoch": 0.08, "grad_norm": 0.5804579257965088, "learning_rate": 0.001, "loss": 3.6896, "step": 725 }, { "epoch": 0.08, "grad_norm": 0.4920499622821808, "learning_rate": 0.001, "loss": 3.6511, "step": 730 }, { "epoch": 0.08, "grad_norm": 0.523162841796875, "learning_rate": 0.001, "loss": 3.6019, "step": 735 }, { "epoch": 0.08, "grad_norm": 0.6031882166862488, "learning_rate": 0.001, "loss": 3.655, "step": 740 }, { "epoch": 0.08, "grad_norm": 0.6534960269927979, "learning_rate": 0.001, "loss": 3.7626, "step": 745 }, { "epoch": 0.08, "grad_norm": 0.49020254611968994, "learning_rate": 0.001, "loss": 3.6494, "step": 750 }, { "epoch": 0.08, "grad_norm": 0.4870024621486664, "learning_rate": 0.001, "loss": 3.5926, "step": 755 }, { "epoch": 0.08, "grad_norm": 0.48725512623786926, "learning_rate": 0.001, "loss": 3.714, "step": 760 }, { "epoch": 0.09, "grad_norm": 0.5335116982460022, "learning_rate": 0.001, "loss": 3.6698, "step": 765 }, { "epoch": 0.09, "grad_norm": 0.5708632469177246, "learning_rate": 0.001, "loss": 3.5812, "step": 770 }, { "epoch": 0.09, "grad_norm": 0.5631666779518127, "learning_rate": 0.001, "loss": 3.677, "step": 775 }, { "epoch": 0.09, "grad_norm": 0.5864231586456299, "learning_rate": 0.001, "loss": 3.5446, "step": 780 }, { "epoch": 0.09, "grad_norm": 0.5556133985519409, "learning_rate": 0.001, "loss": 3.529, "step": 785 }, { "epoch": 0.09, "grad_norm": 0.5345197319984436, "learning_rate": 0.001, "loss": 3.4959, "step": 790 }, { "epoch": 0.09, "grad_norm": 0.5649410486221313, "learning_rate": 0.001, "loss": 3.5892, "step": 795 }, { "epoch": 0.09, "grad_norm": 0.5137830376625061, "learning_rate": 0.001, "loss": 3.664, "step": 800 }, { "epoch": 0.09, "grad_norm": 0.520827054977417, "learning_rate": 0.001, "loss": 3.6159, "step": 805 }, { "epoch": 0.09, "grad_norm": 0.5893942713737488, "learning_rate": 0.001, "loss": 3.5971, "step": 810 }, { "epoch": 0.09, "grad_norm": 0.43705499172210693, "learning_rate": 0.001, "loss": 3.5704, "step": 815 }, { "epoch": 0.09, "grad_norm": 0.5601233243942261, "learning_rate": 0.001, "loss": 3.5379, "step": 820 }, { "epoch": 0.09, "grad_norm": 0.5474940538406372, "learning_rate": 0.001, "loss": 3.5245, "step": 825 }, { "epoch": 0.09, "grad_norm": 0.5808173418045044, "learning_rate": 0.001, "loss": 3.545, "step": 830 }, { "epoch": 0.09, "grad_norm": 0.5676250457763672, "learning_rate": 0.001, "loss": 3.5477, "step": 835 }, { "epoch": 0.09, "grad_norm": 0.46730735898017883, "learning_rate": 0.001, "loss": 3.4416, "step": 840 }, { "epoch": 0.09, "grad_norm": 0.4779958128929138, "learning_rate": 0.001, "loss": 3.5729, "step": 845 }, { "epoch": 0.09, "grad_norm": 0.5286216735839844, "learning_rate": 0.001, "loss": 3.5727, "step": 850 }, { "epoch": 0.1, "grad_norm": 0.5003157258033752, "learning_rate": 0.001, "loss": 3.4412, "step": 855 }, { "epoch": 0.1, "grad_norm": 0.5391347408294678, "learning_rate": 0.001, "loss": 3.6227, "step": 860 }, { "epoch": 0.1, "grad_norm": 0.6406623721122742, "learning_rate": 0.001, "loss": 3.5755, "step": 865 }, { "epoch": 0.1, "grad_norm": 0.5844242572784424, "learning_rate": 0.001, "loss": 3.6629, "step": 870 }, { "epoch": 0.1, "grad_norm": 0.5803925395011902, "learning_rate": 0.001, "loss": 3.5022, "step": 875 }, { "epoch": 0.1, "grad_norm": 0.5397413372993469, "learning_rate": 0.001, "loss": 3.4458, "step": 880 }, { "epoch": 0.1, "grad_norm": 0.5229134559631348, "learning_rate": 0.001, "loss": 3.4075, "step": 885 }, { "epoch": 0.1, "grad_norm": 0.5586813688278198, "learning_rate": 0.001, "loss": 3.4069, "step": 890 }, { "epoch": 0.1, "grad_norm": 0.457573801279068, "learning_rate": 0.001, "loss": 3.4071, "step": 895 }, { "epoch": 0.1, "grad_norm": 0.511417031288147, "learning_rate": 0.001, "loss": 3.5552, "step": 900 }, { "epoch": 0.1, "grad_norm": 0.5040103197097778, "learning_rate": 0.001, "loss": 3.3518, "step": 905 }, { "epoch": 0.1, "grad_norm": 0.5554264187812805, "learning_rate": 0.001, "loss": 3.4197, "step": 910 }, { "epoch": 0.1, "grad_norm": 0.5565446019172668, "learning_rate": 0.001, "loss": 3.4614, "step": 915 }, { "epoch": 0.1, "grad_norm": 0.518672525882721, "learning_rate": 0.001, "loss": 3.4518, "step": 920 }, { "epoch": 0.1, "grad_norm": 0.5770706534385681, "learning_rate": 0.001, "loss": 3.3687, "step": 925 }, { "epoch": 0.1, "grad_norm": 0.4561229646205902, "learning_rate": 0.001, "loss": 3.4112, "step": 930 }, { "epoch": 0.1, "grad_norm": 0.49880415201187134, "learning_rate": 0.001, "loss": 3.4854, "step": 935 }, { "epoch": 0.1, "grad_norm": 0.49336493015289307, "learning_rate": 0.001, "loss": 3.5201, "step": 940 }, { "epoch": 0.11, "grad_norm": 0.6361943483352661, "learning_rate": 0.001, "loss": 3.4105, "step": 945 }, { "epoch": 0.11, "grad_norm": 0.49244019389152527, "learning_rate": 0.001, "loss": 3.4827, "step": 950 }, { "epoch": 0.11, "grad_norm": 0.5152875781059265, "learning_rate": 0.001, "loss": 3.4756, "step": 955 }, { "epoch": 0.11, "grad_norm": 0.5768316984176636, "learning_rate": 0.001, "loss": 3.3638, "step": 960 }, { "epoch": 0.11, "grad_norm": 0.5528039336204529, "learning_rate": 0.001, "loss": 3.3977, "step": 965 }, { "epoch": 0.11, "grad_norm": 0.5571544170379639, "learning_rate": 0.001, "loss": 3.4294, "step": 970 }, { "epoch": 0.11, "grad_norm": 0.7128731608390808, "learning_rate": 0.001, "loss": 3.4117, "step": 975 }, { "epoch": 0.11, "grad_norm": 0.5800164341926575, "learning_rate": 0.001, "loss": 3.4575, "step": 980 }, { "epoch": 0.11, "grad_norm": 0.5743172764778137, "learning_rate": 0.001, "loss": 3.4217, "step": 985 }, { "epoch": 0.11, "grad_norm": 0.526035726070404, "learning_rate": 0.001, "loss": 3.3472, "step": 990 }, { "epoch": 0.11, "grad_norm": 0.47266894578933716, "learning_rate": 0.001, "loss": 3.3167, "step": 995 }, { "epoch": 0.11, "grad_norm": 0.5033674240112305, "learning_rate": 0.001, "loss": 3.2838, "step": 1000 }, { "epoch": 0.11, "grad_norm": 0.46661603450775146, "learning_rate": 0.001, "loss": 3.2839, "step": 1005 }, { "epoch": 0.11, "grad_norm": 0.5624775886535645, "learning_rate": 0.001, "loss": 3.4624, "step": 1010 }, { "epoch": 0.11, "grad_norm": 0.5073076486587524, "learning_rate": 0.001, "loss": 3.3753, "step": 1015 }, { "epoch": 0.11, "grad_norm": 0.5086501240730286, "learning_rate": 0.001, "loss": 3.3817, "step": 1020 }, { "epoch": 0.11, "grad_norm": 0.5310328602790833, "learning_rate": 0.001, "loss": 3.4449, "step": 1025 }, { "epoch": 0.11, "grad_norm": 0.49915993213653564, "learning_rate": 0.001, "loss": 3.363, "step": 1030 }, { "epoch": 0.12, "grad_norm": 0.628730058670044, "learning_rate": 0.001, "loss": 3.2733, "step": 1035 }, { "epoch": 0.12, "grad_norm": 0.5633178949356079, "learning_rate": 0.001, "loss": 3.3406, "step": 1040 }, { "epoch": 0.12, "grad_norm": 0.5721198320388794, "learning_rate": 0.001, "loss": 3.3102, "step": 1045 }, { "epoch": 0.12, "grad_norm": 0.4597437381744385, "learning_rate": 0.001, "loss": 3.4369, "step": 1050 }, { "epoch": 0.12, "grad_norm": 0.5563680529594421, "learning_rate": 0.001, "loss": 3.3399, "step": 1055 }, { "epoch": 0.12, "grad_norm": 0.5712825059890747, "learning_rate": 0.001, "loss": 3.3234, "step": 1060 }, { "epoch": 0.12, "grad_norm": 0.5371583104133606, "learning_rate": 0.001, "loss": 3.3818, "step": 1065 }, { "epoch": 0.12, "grad_norm": 0.5913522243499756, "learning_rate": 0.001, "loss": 3.4158, "step": 1070 }, { "epoch": 0.12, "grad_norm": 0.552200436592102, "learning_rate": 0.001, "loss": 3.3218, "step": 1075 }, { "epoch": 0.12, "grad_norm": 0.5645663142204285, "learning_rate": 0.001, "loss": 3.2884, "step": 1080 }, { "epoch": 0.12, "grad_norm": 0.44569239020347595, "learning_rate": 0.001, "loss": 3.3263, "step": 1085 }, { "epoch": 0.12, "grad_norm": 0.8671941161155701, "learning_rate": 0.001, "loss": 3.3146, "step": 1090 }, { "epoch": 0.12, "grad_norm": 0.5645720362663269, "learning_rate": 0.001, "loss": 3.2641, "step": 1095 }, { "epoch": 0.12, "grad_norm": 0.8040971755981445, "learning_rate": 0.001, "loss": 3.3728, "step": 1100 }, { "epoch": 0.12, "grad_norm": 0.5215751528739929, "learning_rate": 0.001, "loss": 3.2835, "step": 1105 }, { "epoch": 0.12, "grad_norm": 0.5733566284179688, "learning_rate": 0.001, "loss": 3.346, "step": 1110 }, { "epoch": 0.12, "grad_norm": 0.5822365880012512, "learning_rate": 0.001, "loss": 3.315, "step": 1115 }, { "epoch": 0.12, "grad_norm": 0.5686581134796143, "learning_rate": 0.001, "loss": 3.3394, "step": 1120 }, { "epoch": 0.13, "grad_norm": 0.4820461571216583, "learning_rate": 0.001, "loss": 3.2288, "step": 1125 }, { "epoch": 0.13, "grad_norm": 0.5278226733207703, "learning_rate": 0.001, "loss": 3.2776, "step": 1130 }, { "epoch": 0.13, "grad_norm": 0.45684191584587097, "learning_rate": 0.001, "loss": 3.2376, "step": 1135 }, { "epoch": 0.13, "grad_norm": 0.5594207644462585, "learning_rate": 0.001, "loss": 3.2892, "step": 1140 }, { "epoch": 0.13, "grad_norm": 0.5143517255783081, "learning_rate": 0.001, "loss": 3.2246, "step": 1145 }, { "epoch": 0.13, "grad_norm": 0.5187541842460632, "learning_rate": 0.001, "loss": 3.2442, "step": 1150 }, { "epoch": 0.13, "grad_norm": 0.5015732049942017, "learning_rate": 0.001, "loss": 3.1744, "step": 1155 }, { "epoch": 0.13, "grad_norm": 0.4809734523296356, "learning_rate": 0.001, "loss": 3.1277, "step": 1160 }, { "epoch": 0.13, "grad_norm": 0.43246883153915405, "learning_rate": 0.001, "loss": 3.1667, "step": 1165 }, { "epoch": 0.13, "grad_norm": 0.465455561876297, "learning_rate": 0.001, "loss": 3.1439, "step": 1170 }, { "epoch": 0.13, "grad_norm": 0.5171617865562439, "learning_rate": 0.001, "loss": 3.2541, "step": 1175 }, { "epoch": 0.13, "grad_norm": 0.49306464195251465, "learning_rate": 0.001, "loss": 3.1276, "step": 1180 }, { "epoch": 0.13, "grad_norm": 0.49295562505722046, "learning_rate": 0.001, "loss": 3.2369, "step": 1185 }, { "epoch": 0.13, "grad_norm": 0.5079097151756287, "learning_rate": 0.001, "loss": 3.2598, "step": 1190 }, { "epoch": 0.13, "grad_norm": 0.5303475260734558, "learning_rate": 0.001, "loss": 3.2031, "step": 1195 }, { "epoch": 0.13, "grad_norm": 0.5068909525871277, "learning_rate": 0.001, "loss": 3.2112, "step": 1200 }, { "epoch": 0.13, "grad_norm": 0.509250819683075, "learning_rate": 0.001, "loss": 3.1927, "step": 1205 }, { "epoch": 0.13, "grad_norm": 0.4846991002559662, "learning_rate": 0.001, "loss": 3.1834, "step": 1210 }, { "epoch": 0.14, "grad_norm": 0.5194084644317627, "learning_rate": 0.001, "loss": 3.2082, "step": 1215 }, { "epoch": 0.14, "grad_norm": 0.45398786664009094, "learning_rate": 0.001, "loss": 3.1926, "step": 1220 }, { "epoch": 0.14, "grad_norm": 0.4347745478153229, "learning_rate": 0.001, "loss": 3.1371, "step": 1225 }, { "epoch": 0.14, "grad_norm": 0.5177764892578125, "learning_rate": 0.001, "loss": 3.1192, "step": 1230 }, { "epoch": 0.14, "grad_norm": 0.41405558586120605, "learning_rate": 0.001, "loss": 3.1038, "step": 1235 }, { "epoch": 0.14, "grad_norm": 0.5231586694717407, "learning_rate": 0.001, "loss": 3.1101, "step": 1240 }, { "epoch": 0.14, "grad_norm": 0.45989879965782166, "learning_rate": 0.001, "loss": 3.1313, "step": 1245 }, { "epoch": 0.14, "grad_norm": 0.4475898742675781, "learning_rate": 0.001, "loss": 3.164, "step": 1250 }, { "epoch": 0.14, "grad_norm": 0.4773527681827545, "learning_rate": 0.001, "loss": 3.0951, "step": 1255 }, { "epoch": 0.14, "grad_norm": 0.4193638265132904, "learning_rate": 0.001, "loss": 3.1184, "step": 1260 }, { "epoch": 0.14, "grad_norm": 0.5214313864707947, "learning_rate": 0.001, "loss": 3.0871, "step": 1265 }, { "epoch": 0.14, "grad_norm": 0.4901462197303772, "learning_rate": 0.001, "loss": 3.1454, "step": 1270 }, { "epoch": 0.14, "grad_norm": 0.46214476227760315, "learning_rate": 0.001, "loss": 3.0632, "step": 1275 }, { "epoch": 0.14, "grad_norm": 0.4842498302459717, "learning_rate": 0.001, "loss": 3.0784, "step": 1280 }, { "epoch": 0.14, "grad_norm": 0.4946521520614624, "learning_rate": 0.001, "loss": 3.0605, "step": 1285 }, { "epoch": 0.14, "grad_norm": 0.4697374999523163, "learning_rate": 0.001, "loss": 3.0256, "step": 1290 }, { "epoch": 0.14, "grad_norm": 0.6137799024581909, "learning_rate": 0.001, "loss": 3.1273, "step": 1295 }, { "epoch": 0.14, "grad_norm": 0.45613402128219604, "learning_rate": 0.001, "loss": 3.0416, "step": 1300 }, { "epoch": 0.15, "grad_norm": 0.5034254789352417, "learning_rate": 0.001, "loss": 3.0875, "step": 1305 }, { "epoch": 0.15, "grad_norm": 0.5534226894378662, "learning_rate": 0.001, "loss": 3.1326, "step": 1310 }, { "epoch": 0.15, "grad_norm": 0.4972108006477356, "learning_rate": 0.001, "loss": 3.0417, "step": 1315 }, { "epoch": 0.15, "grad_norm": 0.49605464935302734, "learning_rate": 0.001, "loss": 3.0207, "step": 1320 }, { "epoch": 0.15, "grad_norm": 0.4623696804046631, "learning_rate": 0.001, "loss": 3.0571, "step": 1325 }, { "epoch": 0.15, "grad_norm": 0.5162575244903564, "learning_rate": 0.001, "loss": 3.0163, "step": 1330 }, { "epoch": 0.15, "grad_norm": 0.4853294789791107, "learning_rate": 0.001, "loss": 3.0887, "step": 1335 }, { "epoch": 0.15, "grad_norm": 0.49405863881111145, "learning_rate": 0.001, "loss": 3.0364, "step": 1340 }, { "epoch": 0.15, "grad_norm": 0.43189990520477295, "learning_rate": 0.001, "loss": 2.9356, "step": 1345 }, { "epoch": 0.15, "grad_norm": 0.4488712251186371, "learning_rate": 0.001, "loss": 3.0452, "step": 1350 }, { "epoch": 0.15, "grad_norm": 0.44629403948783875, "learning_rate": 0.001, "loss": 2.9449, "step": 1355 }, { "epoch": 0.15, "grad_norm": 0.4826550781726837, "learning_rate": 0.001, "loss": 3.0303, "step": 1360 }, { "epoch": 0.15, "grad_norm": 0.48896515369415283, "learning_rate": 0.001, "loss": 3.0951, "step": 1365 }, { "epoch": 0.15, "grad_norm": 0.4570922553539276, "learning_rate": 0.001, "loss": 3.0345, "step": 1370 }, { "epoch": 0.15, "grad_norm": 0.4907234013080597, "learning_rate": 0.001, "loss": 2.9406, "step": 1375 }, { "epoch": 0.15, "grad_norm": 0.48720821738243103, "learning_rate": 0.001, "loss": 3.0605, "step": 1380 }, { "epoch": 0.15, "grad_norm": 0.46225032210350037, "learning_rate": 0.001, "loss": 3.0262, "step": 1385 }, { "epoch": 0.15, "grad_norm": 0.45703181624412537, "learning_rate": 0.001, "loss": 3.0684, "step": 1390 }, { "epoch": 0.16, "grad_norm": 0.43968960642814636, "learning_rate": 0.001, "loss": 3.0707, "step": 1395 }, { "epoch": 0.16, "grad_norm": 0.5045078992843628, "learning_rate": 0.001, "loss": 3.0192, "step": 1400 }, { "epoch": 0.16, "grad_norm": 0.49028080701828003, "learning_rate": 0.001, "loss": 2.9843, "step": 1405 }, { "epoch": 0.16, "grad_norm": 0.5171729326248169, "learning_rate": 0.001, "loss": 2.9823, "step": 1410 }, { "epoch": 0.16, "grad_norm": 0.4980088770389557, "learning_rate": 0.001, "loss": 3.0282, "step": 1415 }, { "epoch": 0.16, "grad_norm": 0.47428199648857117, "learning_rate": 0.001, "loss": 3.0371, "step": 1420 }, { "epoch": 0.16, "grad_norm": 0.49677643179893494, "learning_rate": 0.001, "loss": 2.8951, "step": 1425 }, { "epoch": 0.16, "grad_norm": 0.4643387198448181, "learning_rate": 0.001, "loss": 2.9398, "step": 1430 }, { "epoch": 0.16, "grad_norm": 0.4817788004875183, "learning_rate": 0.001, "loss": 3.0048, "step": 1435 }, { "epoch": 0.16, "grad_norm": 0.46131885051727295, "learning_rate": 0.001, "loss": 3.0599, "step": 1440 }, { "epoch": 0.16, "grad_norm": 0.48160094022750854, "learning_rate": 0.001, "loss": 2.9546, "step": 1445 }, { "epoch": 0.16, "grad_norm": 0.4246445894241333, "learning_rate": 0.001, "loss": 2.9025, "step": 1450 }, { "epoch": 0.16, "grad_norm": 0.4259074032306671, "learning_rate": 0.001, "loss": 2.9804, "step": 1455 }, { "epoch": 0.16, "grad_norm": 0.4976622462272644, "learning_rate": 0.001, "loss": 2.9303, "step": 1460 }, { "epoch": 0.16, "grad_norm": 0.40564754605293274, "learning_rate": 0.001, "loss": 2.9505, "step": 1465 }, { "epoch": 0.16, "grad_norm": 0.5583494305610657, "learning_rate": 0.001, "loss": 2.9898, "step": 1470 }, { "epoch": 0.16, "grad_norm": 0.45296064019203186, "learning_rate": 0.001, "loss": 3.0225, "step": 1475 }, { "epoch": 0.17, "grad_norm": 0.480425089597702, "learning_rate": 0.001, "loss": 2.9436, "step": 1480 }, { "epoch": 0.17, "grad_norm": 0.47425466775894165, "learning_rate": 0.001, "loss": 3.0388, "step": 1485 }, { "epoch": 0.17, "grad_norm": 0.4673284888267517, "learning_rate": 0.001, "loss": 3.0704, "step": 1490 }, { "epoch": 0.17, "grad_norm": 0.43329888582229614, "learning_rate": 0.001, "loss": 2.9181, "step": 1495 }, { "epoch": 0.17, "grad_norm": 0.44485437870025635, "learning_rate": 0.001, "loss": 2.9171, "step": 1500 }, { "epoch": 0.17, "grad_norm": 0.47537484765052795, "learning_rate": 0.001, "loss": 2.9375, "step": 1505 }, { "epoch": 0.17, "grad_norm": 0.45429906249046326, "learning_rate": 0.001, "loss": 2.9171, "step": 1510 }, { "epoch": 0.17, "grad_norm": 0.4643488824367523, "learning_rate": 0.001, "loss": 2.9756, "step": 1515 }, { "epoch": 0.17, "grad_norm": 0.5488468408584595, "learning_rate": 0.001, "loss": 2.9335, "step": 1520 }, { "epoch": 0.17, "grad_norm": 0.5614462494850159, "learning_rate": 0.001, "loss": 2.9381, "step": 1525 }, { "epoch": 0.17, "grad_norm": 0.5060487985610962, "learning_rate": 0.001, "loss": 2.9668, "step": 1530 }, { "epoch": 0.17, "grad_norm": 0.46779105067253113, "learning_rate": 0.001, "loss": 2.9549, "step": 1535 }, { "epoch": 0.17, "grad_norm": 0.43047505617141724, "learning_rate": 0.001, "loss": 2.9327, "step": 1540 }, { "epoch": 0.17, "grad_norm": 0.42190369963645935, "learning_rate": 0.001, "loss": 2.9513, "step": 1545 }, { "epoch": 0.17, "grad_norm": 0.44506198167800903, "learning_rate": 0.001, "loss": 2.896, "step": 1550 }, { "epoch": 0.17, "grad_norm": 0.4417656362056732, "learning_rate": 0.001, "loss": 2.8853, "step": 1555 }, { "epoch": 0.17, "grad_norm": 0.4769698977470398, "learning_rate": 0.001, "loss": 2.8021, "step": 1560 }, { "epoch": 0.17, "grad_norm": 0.49687662720680237, "learning_rate": 0.001, "loss": 2.9001, "step": 1565 }, { "epoch": 0.18, "grad_norm": 0.4586490988731384, "learning_rate": 0.001, "loss": 2.9464, "step": 1570 }, { "epoch": 0.18, "grad_norm": 0.45702454447746277, "learning_rate": 0.001, "loss": 2.9214, "step": 1575 }, { "epoch": 0.18, "grad_norm": 0.44937050342559814, "learning_rate": 0.001, "loss": 2.96, "step": 1580 }, { "epoch": 0.18, "grad_norm": 0.4406932592391968, "learning_rate": 0.001, "loss": 2.8779, "step": 1585 }, { "epoch": 0.18, "grad_norm": 0.41652610898017883, "learning_rate": 0.001, "loss": 2.9108, "step": 1590 }, { "epoch": 0.18, "grad_norm": 0.47193869948387146, "learning_rate": 0.001, "loss": 2.9891, "step": 1595 }, { "epoch": 0.18, "grad_norm": 0.44580379128456116, "learning_rate": 0.001, "loss": 2.8617, "step": 1600 }, { "epoch": 0.18, "grad_norm": 0.4470444619655609, "learning_rate": 0.001, "loss": 2.9058, "step": 1605 }, { "epoch": 0.18, "grad_norm": 0.42569637298583984, "learning_rate": 0.001, "loss": 2.8462, "step": 1610 }, { "epoch": 0.18, "grad_norm": 0.42253032326698303, "learning_rate": 0.001, "loss": 2.8579, "step": 1615 }, { "epoch": 0.18, "grad_norm": 0.4351730942726135, "learning_rate": 0.001, "loss": 2.9778, "step": 1620 }, { "epoch": 0.18, "grad_norm": 0.4730110168457031, "learning_rate": 0.001, "loss": 2.9028, "step": 1625 }, { "epoch": 0.18, "grad_norm": 0.48326411843299866, "learning_rate": 0.001, "loss": 2.8306, "step": 1630 }, { "epoch": 0.18, "grad_norm": 0.4432814419269562, "learning_rate": 0.001, "loss": 2.8039, "step": 1635 }, { "epoch": 0.18, "grad_norm": 0.4231279194355011, "learning_rate": 0.001, "loss": 2.9011, "step": 1640 }, { "epoch": 0.18, "grad_norm": 0.4152611494064331, "learning_rate": 0.001, "loss": 2.9133, "step": 1645 }, { "epoch": 0.18, "grad_norm": 0.4555603861808777, "learning_rate": 0.001, "loss": 2.931, "step": 1650 }, { "epoch": 0.18, "grad_norm": 0.4598769545555115, "learning_rate": 0.001, "loss": 2.8745, "step": 1655 }, { "epoch": 0.19, "grad_norm": 0.4184294044971466, "learning_rate": 0.001, "loss": 2.861, "step": 1660 }, { "epoch": 0.19, "grad_norm": 0.4301551282405853, "learning_rate": 0.001, "loss": 2.8547, "step": 1665 }, { "epoch": 0.19, "grad_norm": 0.4487614929676056, "learning_rate": 0.001, "loss": 2.9478, "step": 1670 }, { "epoch": 0.19, "grad_norm": 0.47245121002197266, "learning_rate": 0.001, "loss": 2.8881, "step": 1675 }, { "epoch": 0.19, "grad_norm": 0.4665432572364807, "learning_rate": 0.001, "loss": 2.8708, "step": 1680 }, { "epoch": 0.19, "grad_norm": 0.4603164792060852, "learning_rate": 0.001, "loss": 2.7779, "step": 1685 }, { "epoch": 0.19, "grad_norm": 0.39915090799331665, "learning_rate": 0.001, "loss": 2.8522, "step": 1690 }, { "epoch": 0.19, "grad_norm": 0.5415810346603394, "learning_rate": 0.001, "loss": 2.7831, "step": 1695 }, { "epoch": 0.19, "grad_norm": 0.407660573720932, "learning_rate": 0.001, "loss": 2.7968, "step": 1700 }, { "epoch": 0.19, "grad_norm": 0.41527602076530457, "learning_rate": 0.001, "loss": 2.9232, "step": 1705 }, { "epoch": 0.19, "grad_norm": 0.5355004072189331, "learning_rate": 0.001, "loss": 2.8865, "step": 1710 }, { "epoch": 0.19, "grad_norm": 0.49530091881752014, "learning_rate": 0.001, "loss": 2.8188, "step": 1715 }, { "epoch": 0.19, "grad_norm": 0.595953643321991, "learning_rate": 0.001, "loss": 2.8213, "step": 1720 }, { "epoch": 0.19, "grad_norm": 0.44346776604652405, "learning_rate": 0.001, "loss": 2.9774, "step": 1725 }, { "epoch": 0.19, "grad_norm": 0.4346086084842682, "learning_rate": 0.001, "loss": 2.7909, "step": 1730 }, { "epoch": 0.19, "grad_norm": 0.5057465434074402, "learning_rate": 0.001, "loss": 2.8719, "step": 1735 }, { "epoch": 0.19, "grad_norm": 0.422921746969223, "learning_rate": 0.001, "loss": 2.8014, "step": 1740 }, { "epoch": 0.19, "grad_norm": 0.41217803955078125, "learning_rate": 0.001, "loss": 2.8014, "step": 1745 }, { "epoch": 0.2, "grad_norm": 0.4260920584201813, "learning_rate": 0.001, "loss": 2.8018, "step": 1750 }, { "epoch": 0.2, "grad_norm": 0.3754720985889435, "learning_rate": 0.001, "loss": 2.7298, "step": 1755 }, { "epoch": 0.2, "grad_norm": 0.45944565534591675, "learning_rate": 0.001, "loss": 2.825, "step": 1760 }, { "epoch": 0.2, "grad_norm": 0.42768698930740356, "learning_rate": 0.001, "loss": 2.7903, "step": 1765 }, { "epoch": 0.2, "grad_norm": 0.4293604791164398, "learning_rate": 0.001, "loss": 2.7113, "step": 1770 }, { "epoch": 0.2, "grad_norm": 0.43066537380218506, "learning_rate": 0.001, "loss": 2.8701, "step": 1775 }, { "epoch": 0.2, "grad_norm": 0.4278125762939453, "learning_rate": 0.001, "loss": 2.83, "step": 1780 }, { "epoch": 0.2, "grad_norm": 0.430522084236145, "learning_rate": 0.001, "loss": 2.7792, "step": 1785 }, { "epoch": 0.2, "grad_norm": 0.43876639008522034, "learning_rate": 0.001, "loss": 2.7494, "step": 1790 }, { "epoch": 0.2, "grad_norm": 0.44354987144470215, "learning_rate": 0.001, "loss": 2.9732, "step": 1795 }, { "epoch": 0.2, "grad_norm": 0.40760278701782227, "learning_rate": 0.001, "loss": 2.7543, "step": 1800 }, { "epoch": 0.2, "grad_norm": 0.4250537157058716, "learning_rate": 0.001, "loss": 2.8073, "step": 1805 }, { "epoch": 0.2, "grad_norm": 0.4468509256839752, "learning_rate": 0.001, "loss": 2.9124, "step": 1810 }, { "epoch": 0.2, "grad_norm": 0.4494674801826477, "learning_rate": 0.001, "loss": 2.7521, "step": 1815 }, { "epoch": 0.2, "grad_norm": 0.4005078077316284, "learning_rate": 0.001, "loss": 2.8788, "step": 1820 }, { "epoch": 0.2, "grad_norm": 0.4095415771007538, "learning_rate": 0.001, "loss": 2.8621, "step": 1825 }, { "epoch": 0.2, "grad_norm": 0.5409905910491943, "learning_rate": 0.001, "loss": 2.8566, "step": 1830 }, { "epoch": 0.2, "grad_norm": 0.49218934774398804, "learning_rate": 0.001, "loss": 2.8128, "step": 1835 }, { "epoch": 0.21, "grad_norm": 0.45364123582839966, "learning_rate": 0.001, "loss": 2.8731, "step": 1840 }, { "epoch": 0.21, "grad_norm": 0.43244045972824097, "learning_rate": 0.001, "loss": 2.7162, "step": 1845 }, { "epoch": 0.21, "grad_norm": 0.4195582866668701, "learning_rate": 0.001, "loss": 2.7579, "step": 1850 }, { "epoch": 0.21, "grad_norm": 0.4952572286128998, "learning_rate": 0.001, "loss": 2.7702, "step": 1855 }, { "epoch": 0.21, "grad_norm": 0.44089606404304504, "learning_rate": 0.001, "loss": 2.7582, "step": 1860 }, { "epoch": 0.21, "grad_norm": 0.39600706100463867, "learning_rate": 0.001, "loss": 2.7592, "step": 1865 }, { "epoch": 0.21, "grad_norm": 0.43046873807907104, "learning_rate": 0.001, "loss": 2.8407, "step": 1870 }, { "epoch": 0.21, "grad_norm": 0.4137684404850006, "learning_rate": 0.001, "loss": 2.7494, "step": 1875 }, { "epoch": 0.21, "grad_norm": 0.393900066614151, "learning_rate": 0.001, "loss": 2.7848, "step": 1880 }, { "epoch": 0.21, "grad_norm": 0.4853545129299164, "learning_rate": 0.001, "loss": 2.7849, "step": 1885 }, { "epoch": 0.21, "grad_norm": 0.4614884555339813, "learning_rate": 0.001, "loss": 2.8006, "step": 1890 }, { "epoch": 0.21, "grad_norm": 0.41348981857299805, "learning_rate": 0.001, "loss": 2.7833, "step": 1895 }, { "epoch": 0.21, "grad_norm": 0.46095848083496094, "learning_rate": 0.001, "loss": 2.7932, "step": 1900 }, { "epoch": 0.21, "grad_norm": 0.4020320773124695, "learning_rate": 0.001, "loss": 2.8505, "step": 1905 }, { "epoch": 0.21, "grad_norm": 0.36754411458969116, "learning_rate": 0.001, "loss": 2.7562, "step": 1910 }, { "epoch": 0.21, "grad_norm": 0.4239796996116638, "learning_rate": 0.001, "loss": 2.8662, "step": 1915 }, { "epoch": 0.21, "grad_norm": 0.40584829449653625, "learning_rate": 0.001, "loss": 2.8202, "step": 1920 }, { "epoch": 0.21, "grad_norm": 0.39396342635154724, "learning_rate": 0.001, "loss": 2.8148, "step": 1925 }, { "epoch": 0.22, "grad_norm": 0.4060746729373932, "learning_rate": 0.001, "loss": 2.7949, "step": 1930 }, { "epoch": 0.22, "grad_norm": 0.43469181656837463, "learning_rate": 0.001, "loss": 2.7653, "step": 1935 }, { "epoch": 0.22, "grad_norm": 0.43287193775177, "learning_rate": 0.001, "loss": 2.8815, "step": 1940 }, { "epoch": 0.22, "grad_norm": 0.4579710066318512, "learning_rate": 0.001, "loss": 2.8147, "step": 1945 }, { "epoch": 0.22, "grad_norm": 0.4058091938495636, "learning_rate": 0.001, "loss": 2.8827, "step": 1950 }, { "epoch": 0.22, "grad_norm": 0.4505409300327301, "learning_rate": 0.001, "loss": 2.7635, "step": 1955 }, { "epoch": 0.22, "grad_norm": 0.45017167925834656, "learning_rate": 0.001, "loss": 2.7809, "step": 1960 }, { "epoch": 0.22, "grad_norm": 0.394645094871521, "learning_rate": 0.001, "loss": 2.6977, "step": 1965 }, { "epoch": 0.22, "grad_norm": 0.3600626587867737, "learning_rate": 0.001, "loss": 2.8135, "step": 1970 }, { "epoch": 0.22, "grad_norm": 0.4112387001514435, "learning_rate": 0.001, "loss": 2.7718, "step": 1975 }, { "epoch": 0.22, "grad_norm": 0.4643021523952484, "learning_rate": 0.001, "loss": 2.7279, "step": 1980 }, { "epoch": 0.22, "grad_norm": 0.4055080711841583, "learning_rate": 0.001, "loss": 2.8101, "step": 1985 }, { "epoch": 0.22, "grad_norm": 0.4093475639820099, "learning_rate": 0.001, "loss": 2.7491, "step": 1990 }, { "epoch": 0.22, "grad_norm": 0.43606457114219666, "learning_rate": 0.001, "loss": 2.7979, "step": 1995 }, { "epoch": 0.22, "grad_norm": 0.38987255096435547, "learning_rate": 0.001, "loss": 2.7067, "step": 2000 }, { "epoch": 0.22, "grad_norm": 0.44878268241882324, "learning_rate": 0.001, "loss": 2.7457, "step": 2005 }, { "epoch": 0.22, "grad_norm": 0.3909992277622223, "learning_rate": 0.001, "loss": 2.6962, "step": 2010 }, { "epoch": 0.22, "grad_norm": 0.3962748050689697, "learning_rate": 0.001, "loss": 2.7063, "step": 2015 }, { "epoch": 0.23, "grad_norm": 0.3859250843524933, "learning_rate": 0.001, "loss": 2.7814, "step": 2020 }, { "epoch": 0.23, "grad_norm": 0.4393429756164551, "learning_rate": 0.001, "loss": 2.7013, "step": 2025 }, { "epoch": 0.23, "grad_norm": 0.3963346481323242, "learning_rate": 0.001, "loss": 2.7637, "step": 2030 }, { "epoch": 0.23, "grad_norm": 0.40981605648994446, "learning_rate": 0.001, "loss": 2.7556, "step": 2035 }, { "epoch": 0.23, "grad_norm": 0.435086727142334, "learning_rate": 0.001, "loss": 2.7513, "step": 2040 }, { "epoch": 0.23, "grad_norm": 0.4393368661403656, "learning_rate": 0.001, "loss": 2.7835, "step": 2045 }, { "epoch": 0.23, "grad_norm": 0.3997877240180969, "learning_rate": 0.001, "loss": 2.6522, "step": 2050 }, { "epoch": 0.23, "grad_norm": 0.3757670521736145, "learning_rate": 0.001, "loss": 2.8765, "step": 2055 }, { "epoch": 0.23, "grad_norm": 0.3874036967754364, "learning_rate": 0.001, "loss": 2.7991, "step": 2060 }, { "epoch": 0.23, "grad_norm": 0.3644055128097534, "learning_rate": 0.001, "loss": 2.7529, "step": 2065 }, { "epoch": 0.23, "grad_norm": 0.3817943036556244, "learning_rate": 0.001, "loss": 2.703, "step": 2070 }, { "epoch": 0.23, "grad_norm": 0.3962540030479431, "learning_rate": 0.001, "loss": 2.672, "step": 2075 }, { "epoch": 0.23, "grad_norm": 0.41838338971138, "learning_rate": 0.001, "loss": 2.7528, "step": 2080 }, { "epoch": 0.23, "grad_norm": 0.4394073784351349, "learning_rate": 0.001, "loss": 2.7653, "step": 2085 }, { "epoch": 0.23, "grad_norm": 0.4334862232208252, "learning_rate": 0.001, "loss": 2.7096, "step": 2090 }, { "epoch": 0.23, "grad_norm": 0.41093143820762634, "learning_rate": 0.001, "loss": 2.7438, "step": 2095 }, { "epoch": 0.23, "grad_norm": 0.35660019516944885, "learning_rate": 0.001, "loss": 2.8409, "step": 2100 }, { "epoch": 0.23, "grad_norm": 0.373526394367218, "learning_rate": 0.001, "loss": 2.7351, "step": 2105 }, { "epoch": 0.24, "grad_norm": 0.4302496612071991, "learning_rate": 0.001, "loss": 2.8068, "step": 2110 }, { "epoch": 0.24, "grad_norm": 0.38068363070487976, "learning_rate": 0.001, "loss": 2.6477, "step": 2115 }, { "epoch": 0.24, "grad_norm": 0.3733093738555908, "learning_rate": 0.001, "loss": 2.7554, "step": 2120 }, { "epoch": 0.24, "grad_norm": 0.4144055247306824, "learning_rate": 0.001, "loss": 2.6993, "step": 2125 }, { "epoch": 0.24, "grad_norm": 0.4264536499977112, "learning_rate": 0.001, "loss": 2.6572, "step": 2130 }, { "epoch": 0.24, "grad_norm": 0.4430059492588043, "learning_rate": 0.001, "loss": 2.6923, "step": 2135 }, { "epoch": 0.24, "grad_norm": 0.42207950353622437, "learning_rate": 0.001, "loss": 2.7822, "step": 2140 }, { "epoch": 0.24, "grad_norm": 0.35369545221328735, "learning_rate": 0.001, "loss": 2.7023, "step": 2145 }, { "epoch": 0.24, "grad_norm": 0.47198721766471863, "learning_rate": 0.001, "loss": 2.6973, "step": 2150 }, { "epoch": 0.24, "grad_norm": 0.3969089388847351, "learning_rate": 0.001, "loss": 2.6911, "step": 2155 }, { "epoch": 0.24, "grad_norm": 0.40460652112960815, "learning_rate": 0.001, "loss": 2.6883, "step": 2160 }, { "epoch": 0.24, "grad_norm": 0.41705337166786194, "learning_rate": 0.001, "loss": 2.6843, "step": 2165 }, { "epoch": 0.24, "grad_norm": 0.37905824184417725, "learning_rate": 0.001, "loss": 2.6169, "step": 2170 }, { "epoch": 0.24, "grad_norm": 0.3908678889274597, "learning_rate": 0.001, "loss": 2.6789, "step": 2175 }, { "epoch": 0.24, "grad_norm": 0.4016311466693878, "learning_rate": 0.001, "loss": 2.6935, "step": 2180 }, { "epoch": 0.24, "grad_norm": 0.39033347368240356, "learning_rate": 0.001, "loss": 2.7653, "step": 2185 }, { "epoch": 0.24, "grad_norm": 0.4098024368286133, "learning_rate": 0.001, "loss": 2.6963, "step": 2190 }, { "epoch": 0.24, "grad_norm": 0.4102167785167694, "learning_rate": 0.001, "loss": 2.7452, "step": 2195 }, { "epoch": 0.25, "grad_norm": 0.3752821981906891, "learning_rate": 0.001, "loss": 2.6807, "step": 2200 }, { "epoch": 0.25, "grad_norm": 0.412567675113678, "learning_rate": 0.001, "loss": 2.7763, "step": 2205 }, { "epoch": 0.25, "grad_norm": 0.4417850375175476, "learning_rate": 0.001, "loss": 2.6831, "step": 2210 }, { "epoch": 0.25, "grad_norm": 0.41310781240463257, "learning_rate": 0.001, "loss": 2.6983, "step": 2215 }, { "epoch": 0.25, "grad_norm": 0.4148240089416504, "learning_rate": 0.001, "loss": 2.6653, "step": 2220 }, { "epoch": 0.25, "grad_norm": 0.35075607895851135, "learning_rate": 0.001, "loss": 2.6965, "step": 2225 }, { "epoch": 0.25, "grad_norm": 0.3815755546092987, "learning_rate": 0.001, "loss": 2.6334, "step": 2230 }, { "epoch": 0.25, "grad_norm": 0.35728955268859863, "learning_rate": 0.001, "loss": 2.7119, "step": 2235 }, { "epoch": 0.25, "grad_norm": 0.4024193584918976, "learning_rate": 0.001, "loss": 2.7637, "step": 2240 }, { "epoch": 0.25, "grad_norm": 0.38046032190322876, "learning_rate": 0.001, "loss": 2.7379, "step": 2245 }, { "epoch": 0.25, "grad_norm": 0.39821740984916687, "learning_rate": 0.001, "loss": 2.6981, "step": 2250 }, { "epoch": 0.25, "grad_norm": 0.3438183665275574, "learning_rate": 0.001, "loss": 2.738, "step": 2255 }, { "epoch": 0.25, "grad_norm": 0.39580243825912476, "learning_rate": 0.001, "loss": 2.662, "step": 2260 }, { "epoch": 0.25, "grad_norm": 0.3691055178642273, "learning_rate": 0.001, "loss": 2.7056, "step": 2265 }, { "epoch": 0.25, "grad_norm": 0.36736342310905457, "learning_rate": 0.001, "loss": 2.6655, "step": 2270 }, { "epoch": 0.25, "grad_norm": 0.36667105555534363, "learning_rate": 0.001, "loss": 2.7, "step": 2275 }, { "epoch": 0.25, "grad_norm": 0.5083950757980347, "learning_rate": 0.001, "loss": 2.7651, "step": 2280 }, { "epoch": 0.25, "grad_norm": 0.40544840693473816, "learning_rate": 0.001, "loss": 2.6053, "step": 2285 }, { "epoch": 0.26, "grad_norm": 0.3980036675930023, "learning_rate": 0.001, "loss": 2.64, "step": 2290 }, { "epoch": 0.26, "grad_norm": 0.39068588614463806, "learning_rate": 0.001, "loss": 2.6498, "step": 2295 }, { "epoch": 0.26, "grad_norm": 0.39119189977645874, "learning_rate": 0.001, "loss": 2.7336, "step": 2300 }, { "epoch": 0.26, "grad_norm": 0.3884316086769104, "learning_rate": 0.001, "loss": 2.6745, "step": 2305 }, { "epoch": 0.26, "grad_norm": 0.4157238006591797, "learning_rate": 0.001, "loss": 2.7262, "step": 2310 }, { "epoch": 0.26, "grad_norm": 0.38538315892219543, "learning_rate": 0.001, "loss": 2.7375, "step": 2315 }, { "epoch": 0.26, "grad_norm": 0.37677937746047974, "learning_rate": 0.001, "loss": 2.7534, "step": 2320 }, { "epoch": 0.26, "grad_norm": 0.35606712102890015, "learning_rate": 0.001, "loss": 2.671, "step": 2325 }, { "epoch": 0.26, "grad_norm": 0.37418973445892334, "learning_rate": 0.001, "loss": 2.7513, "step": 2330 }, { "epoch": 0.26, "grad_norm": 0.36637210845947266, "learning_rate": 0.001, "loss": 2.6842, "step": 2335 }, { "epoch": 0.26, "grad_norm": 0.4047016501426697, "learning_rate": 0.001, "loss": 2.6538, "step": 2340 }, { "epoch": 0.26, "grad_norm": 0.42498499155044556, "learning_rate": 0.001, "loss": 2.5852, "step": 2345 }, { "epoch": 0.26, "grad_norm": 0.4067090153694153, "learning_rate": 0.001, "loss": 2.6661, "step": 2350 }, { "epoch": 0.26, "grad_norm": 0.4145916998386383, "learning_rate": 0.001, "loss": 2.713, "step": 2355 }, { "epoch": 0.26, "grad_norm": 0.3945256173610687, "learning_rate": 0.001, "loss": 2.6483, "step": 2360 }, { "epoch": 0.26, "grad_norm": 0.4013041853904724, "learning_rate": 0.001, "loss": 2.6534, "step": 2365 }, { "epoch": 0.26, "grad_norm": 0.4283403754234314, "learning_rate": 0.001, "loss": 2.7215, "step": 2370 }, { "epoch": 0.26, "grad_norm": 0.4242289066314697, "learning_rate": 0.001, "loss": 2.6233, "step": 2375 }, { "epoch": 0.27, "grad_norm": 0.3863242566585541, "learning_rate": 0.001, "loss": 2.674, "step": 2380 }, { "epoch": 0.27, "grad_norm": 0.37469300627708435, "learning_rate": 0.001, "loss": 2.6534, "step": 2385 }, { "epoch": 0.27, "grad_norm": 0.36515700817108154, "learning_rate": 0.001, "loss": 2.6909, "step": 2390 }, { "epoch": 0.27, "grad_norm": 0.3611971437931061, "learning_rate": 0.001, "loss": 2.6615, "step": 2395 }, { "epoch": 0.27, "grad_norm": 0.3939352035522461, "learning_rate": 0.001, "loss": 2.6824, "step": 2400 }, { "epoch": 0.27, "grad_norm": 0.3572635054588318, "learning_rate": 0.001, "loss": 2.6378, "step": 2405 }, { "epoch": 0.27, "grad_norm": 0.3500962257385254, "learning_rate": 0.001, "loss": 2.6074, "step": 2410 }, { "epoch": 0.27, "grad_norm": 0.3555634021759033, "learning_rate": 0.001, "loss": 2.588, "step": 2415 }, { "epoch": 0.27, "grad_norm": 0.3631855547428131, "learning_rate": 0.001, "loss": 2.7935, "step": 2420 }, { "epoch": 0.27, "grad_norm": 0.35220634937286377, "learning_rate": 0.001, "loss": 2.7258, "step": 2425 }, { "epoch": 0.27, "grad_norm": 0.3524405360221863, "learning_rate": 0.001, "loss": 2.6489, "step": 2430 }, { "epoch": 0.27, "grad_norm": 0.3426003158092499, "learning_rate": 0.001, "loss": 2.6484, "step": 2435 }, { "epoch": 0.27, "grad_norm": 0.36606019735336304, "learning_rate": 0.001, "loss": 2.6621, "step": 2440 }, { "epoch": 0.27, "grad_norm": 0.37562504410743713, "learning_rate": 0.001, "loss": 2.6919, "step": 2445 }, { "epoch": 0.27, "grad_norm": 0.35107216238975525, "learning_rate": 0.001, "loss": 2.6227, "step": 2450 }, { "epoch": 0.27, "grad_norm": 0.35221317410469055, "learning_rate": 0.001, "loss": 2.6016, "step": 2455 }, { "epoch": 0.27, "grad_norm": 0.35811102390289307, "learning_rate": 0.001, "loss": 2.6422, "step": 2460 }, { "epoch": 0.27, "grad_norm": 0.37652724981307983, "learning_rate": 0.001, "loss": 2.6598, "step": 2465 }, { "epoch": 0.28, "grad_norm": 0.3458632826805115, "learning_rate": 0.001, "loss": 2.7, "step": 2470 }, { "epoch": 0.28, "grad_norm": 0.373162180185318, "learning_rate": 0.001, "loss": 2.6561, "step": 2475 }, { "epoch": 0.28, "grad_norm": 0.3542190492153168, "learning_rate": 0.001, "loss": 2.644, "step": 2480 }, { "epoch": 0.28, "grad_norm": 0.34209930896759033, "learning_rate": 0.001, "loss": 2.6474, "step": 2485 }, { "epoch": 0.28, "grad_norm": 0.3443635106086731, "learning_rate": 0.001, "loss": 2.6696, "step": 2490 }, { "epoch": 0.28, "grad_norm": 0.35547831654548645, "learning_rate": 0.001, "loss": 2.6165, "step": 2495 }, { "epoch": 0.28, "grad_norm": 0.4041297435760498, "learning_rate": 0.001, "loss": 2.729, "step": 2500 }, { "epoch": 0.28, "grad_norm": 0.38527968525886536, "learning_rate": 0.001, "loss": 2.5177, "step": 2505 }, { "epoch": 0.28, "grad_norm": 0.3569463789463043, "learning_rate": 0.001, "loss": 2.5567, "step": 2510 }, { "epoch": 0.28, "grad_norm": 0.35062989592552185, "learning_rate": 0.001, "loss": 2.6211, "step": 2515 }, { "epoch": 0.28, "grad_norm": 0.375937819480896, "learning_rate": 0.001, "loss": 2.5725, "step": 2520 }, { "epoch": 0.28, "grad_norm": 0.36163225769996643, "learning_rate": 0.001, "loss": 2.5525, "step": 2525 }, { "epoch": 0.28, "grad_norm": 0.3515682518482208, "learning_rate": 0.001, "loss": 2.5238, "step": 2530 }, { "epoch": 0.28, "grad_norm": 0.36847323179244995, "learning_rate": 0.001, "loss": 2.6618, "step": 2535 }, { "epoch": 0.28, "grad_norm": 0.39339104294776917, "learning_rate": 0.001, "loss": 2.5975, "step": 2540 }, { "epoch": 0.28, "grad_norm": 0.37011218070983887, "learning_rate": 0.001, "loss": 2.6396, "step": 2545 }, { "epoch": 0.28, "grad_norm": 0.31246432662010193, "learning_rate": 0.001, "loss": 2.5913, "step": 2550 }, { "epoch": 0.28, "grad_norm": 0.4024028778076172, "learning_rate": 0.001, "loss": 2.6436, "step": 2555 }, { "epoch": 0.29, "grad_norm": 0.41265320777893066, "learning_rate": 0.001, "loss": 2.6862, "step": 2560 }, { "epoch": 0.29, "grad_norm": 0.35133570432662964, "learning_rate": 0.001, "loss": 2.6341, "step": 2565 }, { "epoch": 0.29, "grad_norm": 0.3641490340232849, "learning_rate": 0.001, "loss": 2.6138, "step": 2570 }, { "epoch": 0.29, "grad_norm": 0.37135565280914307, "learning_rate": 0.001, "loss": 2.55, "step": 2575 }, { "epoch": 0.29, "grad_norm": 0.8955615758895874, "learning_rate": 0.001, "loss": 2.5981, "step": 2580 }, { "epoch": 0.29, "grad_norm": 0.3701138198375702, "learning_rate": 0.001, "loss": 2.6368, "step": 2585 }, { "epoch": 0.29, "grad_norm": 0.35533228516578674, "learning_rate": 0.001, "loss": 2.6313, "step": 2590 }, { "epoch": 0.29, "grad_norm": 0.3595050871372223, "learning_rate": 0.001, "loss": 2.5497, "step": 2595 }, { "epoch": 0.29, "grad_norm": 0.3303188383579254, "learning_rate": 0.001, "loss": 2.6971, "step": 2600 }, { "epoch": 0.29, "grad_norm": 0.3368977904319763, "learning_rate": 0.001, "loss": 2.6039, "step": 2605 }, { "epoch": 0.29, "grad_norm": 0.35667169094085693, "learning_rate": 0.001, "loss": 2.6295, "step": 2610 }, { "epoch": 0.29, "grad_norm": 0.343133807182312, "learning_rate": 0.001, "loss": 2.6248, "step": 2615 }, { "epoch": 0.29, "grad_norm": 0.39267104864120483, "learning_rate": 0.001, "loss": 2.6834, "step": 2620 }, { "epoch": 0.29, "grad_norm": 0.37460845708847046, "learning_rate": 0.001, "loss": 2.6252, "step": 2625 }, { "epoch": 0.29, "grad_norm": 0.37292206287384033, "learning_rate": 0.001, "loss": 2.5632, "step": 2630 }, { "epoch": 0.29, "grad_norm": 0.3470982015132904, "learning_rate": 0.001, "loss": 2.5831, "step": 2635 }, { "epoch": 0.29, "grad_norm": 0.33724093437194824, "learning_rate": 0.001, "loss": 2.6858, "step": 2640 }, { "epoch": 0.29, "grad_norm": 0.3472803831100464, "learning_rate": 0.001, "loss": 2.6371, "step": 2645 }, { "epoch": 0.3, "grad_norm": 0.5333802103996277, "learning_rate": 0.001, "loss": 2.6897, "step": 2650 }, { "epoch": 0.3, "grad_norm": 0.325080007314682, "learning_rate": 0.001, "loss": 2.5439, "step": 2655 }, { "epoch": 0.3, "grad_norm": 0.3787167966365814, "learning_rate": 0.001, "loss": 2.6045, "step": 2660 }, { "epoch": 0.3, "grad_norm": 0.36667078733444214, "learning_rate": 0.001, "loss": 2.6575, "step": 2665 }, { "epoch": 0.3, "grad_norm": 0.36500290036201477, "learning_rate": 0.001, "loss": 2.5465, "step": 2670 }, { "epoch": 0.3, "grad_norm": 0.3717701733112335, "learning_rate": 0.001, "loss": 2.6718, "step": 2675 }, { "epoch": 0.3, "grad_norm": 0.369501531124115, "learning_rate": 0.001, "loss": 2.6297, "step": 2680 }, { "epoch": 0.3, "grad_norm": 0.33758407831192017, "learning_rate": 0.001, "loss": 2.6133, "step": 2685 }, { "epoch": 0.3, "grad_norm": 0.383821964263916, "learning_rate": 0.001, "loss": 2.6689, "step": 2690 }, { "epoch": 0.3, "grad_norm": 0.3367357552051544, "learning_rate": 0.001, "loss": 2.636, "step": 2695 }, { "epoch": 0.3, "grad_norm": 0.3320029079914093, "learning_rate": 0.001, "loss": 2.6745, "step": 2700 }, { "epoch": 0.3, "grad_norm": 0.33185863494873047, "learning_rate": 0.001, "loss": 2.6448, "step": 2705 }, { "epoch": 0.3, "grad_norm": 0.35053524374961853, "learning_rate": 0.001, "loss": 2.7042, "step": 2710 }, { "epoch": 0.3, "grad_norm": 0.3864387571811676, "learning_rate": 0.001, "loss": 2.5968, "step": 2715 }, { "epoch": 0.3, "grad_norm": 0.357790470123291, "learning_rate": 0.001, "loss": 2.6152, "step": 2720 }, { "epoch": 0.3, "grad_norm": 0.37656790018081665, "learning_rate": 0.001, "loss": 2.608, "step": 2725 }, { "epoch": 0.3, "grad_norm": 0.3897472620010376, "learning_rate": 0.001, "loss": 2.6525, "step": 2730 }, { "epoch": 0.3, "grad_norm": 0.3524196147918701, "learning_rate": 0.001, "loss": 2.5498, "step": 2735 }, { "epoch": 0.31, "grad_norm": 0.3699324429035187, "learning_rate": 0.001, "loss": 2.5373, "step": 2740 }, { "epoch": 0.31, "grad_norm": 0.3401677906513214, "learning_rate": 0.001, "loss": 2.5847, "step": 2745 }, { "epoch": 0.31, "grad_norm": 0.3693203032016754, "learning_rate": 0.001, "loss": 2.5425, "step": 2750 }, { "epoch": 0.31, "grad_norm": 0.36268529295921326, "learning_rate": 0.001, "loss": 2.5404, "step": 2755 }, { "epoch": 0.31, "grad_norm": 0.365995854139328, "learning_rate": 0.001, "loss": 2.6491, "step": 2760 }, { "epoch": 0.31, "grad_norm": 0.3524530827999115, "learning_rate": 0.001, "loss": 2.5213, "step": 2765 }, { "epoch": 0.31, "grad_norm": 0.35072949528694153, "learning_rate": 0.001, "loss": 2.6201, "step": 2770 }, { "epoch": 0.31, "grad_norm": 0.34381788969039917, "learning_rate": 0.001, "loss": 2.6026, "step": 2775 }, { "epoch": 0.31, "grad_norm": 0.3447662591934204, "learning_rate": 0.001, "loss": 2.506, "step": 2780 }, { "epoch": 0.31, "grad_norm": 0.3632507622241974, "learning_rate": 0.001, "loss": 2.6266, "step": 2785 }, { "epoch": 0.31, "grad_norm": 0.3556525707244873, "learning_rate": 0.001, "loss": 2.5758, "step": 2790 }, { "epoch": 0.31, "grad_norm": 0.3616575300693512, "learning_rate": 0.001, "loss": 2.5722, "step": 2795 }, { "epoch": 0.31, "grad_norm": 0.32523220777511597, "learning_rate": 0.001, "loss": 2.534, "step": 2800 }, { "epoch": 0.31, "grad_norm": 0.3450740873813629, "learning_rate": 0.001, "loss": 2.6071, "step": 2805 }, { "epoch": 0.31, "grad_norm": 0.3417651951313019, "learning_rate": 0.001, "loss": 2.5259, "step": 2810 }, { "epoch": 0.31, "grad_norm": 0.33791735768318176, "learning_rate": 0.001, "loss": 2.541, "step": 2815 }, { "epoch": 0.31, "grad_norm": 0.6335378885269165, "learning_rate": 0.001, "loss": 2.5412, "step": 2820 }, { "epoch": 0.31, "grad_norm": 0.37333744764328003, "learning_rate": 0.001, "loss": 2.5993, "step": 2825 }, { "epoch": 0.32, "grad_norm": 0.35744789242744446, "learning_rate": 0.001, "loss": 2.5405, "step": 2830 }, { "epoch": 0.32, "grad_norm": 0.3333902955055237, "learning_rate": 0.001, "loss": 2.6641, "step": 2835 }, { "epoch": 0.32, "grad_norm": 0.3603854775428772, "learning_rate": 0.001, "loss": 2.5494, "step": 2840 }, { "epoch": 0.32, "grad_norm": 0.45514631271362305, "learning_rate": 0.001, "loss": 2.5488, "step": 2845 }, { "epoch": 0.32, "grad_norm": 0.3743136525154114, "learning_rate": 0.001, "loss": 2.6219, "step": 2850 }, { "epoch": 0.32, "grad_norm": 0.34752535820007324, "learning_rate": 0.001, "loss": 2.6314, "step": 2855 }, { "epoch": 0.32, "grad_norm": 0.3930839002132416, "learning_rate": 0.001, "loss": 2.573, "step": 2860 }, { "epoch": 0.32, "grad_norm": 0.36045753955841064, "learning_rate": 0.001, "loss": 2.5771, "step": 2865 }, { "epoch": 0.32, "grad_norm": 0.32223668694496155, "learning_rate": 0.001, "loss": 2.6166, "step": 2870 }, { "epoch": 0.32, "grad_norm": 0.33084774017333984, "learning_rate": 0.001, "loss": 2.6366, "step": 2875 }, { "epoch": 0.32, "grad_norm": 0.3649795651435852, "learning_rate": 0.001, "loss": 2.6151, "step": 2880 }, { "epoch": 0.32, "grad_norm": 0.3912632465362549, "learning_rate": 0.001, "loss": 2.6128, "step": 2885 }, { "epoch": 0.32, "grad_norm": 0.40557408332824707, "learning_rate": 0.001, "loss": 2.5726, "step": 2890 }, { "epoch": 0.32, "grad_norm": 0.36210837960243225, "learning_rate": 0.001, "loss": 2.603, "step": 2895 }, { "epoch": 0.32, "grad_norm": 0.3513619899749756, "learning_rate": 0.001, "loss": 2.5473, "step": 2900 }, { "epoch": 0.32, "grad_norm": 0.3209548592567444, "learning_rate": 0.001, "loss": 2.5749, "step": 2905 }, { "epoch": 0.32, "grad_norm": 0.3491326570510864, "learning_rate": 0.001, "loss": 2.6812, "step": 2910 }, { "epoch": 0.33, "grad_norm": 0.35135090351104736, "learning_rate": 0.001, "loss": 2.5705, "step": 2915 }, { "epoch": 0.33, "grad_norm": 0.3628234267234802, "learning_rate": 0.001, "loss": 2.5638, "step": 2920 }, { "epoch": 0.33, "grad_norm": 0.3620128333568573, "learning_rate": 0.001, "loss": 2.567, "step": 2925 }, { "epoch": 0.33, "grad_norm": 0.35019025206565857, "learning_rate": 0.001, "loss": 2.5266, "step": 2930 }, { "epoch": 0.33, "grad_norm": 0.36038318276405334, "learning_rate": 0.001, "loss": 2.5184, "step": 2935 }, { "epoch": 0.33, "grad_norm": 0.3591684401035309, "learning_rate": 0.001, "loss": 2.5427, "step": 2940 }, { "epoch": 0.33, "grad_norm": 0.3429654538631439, "learning_rate": 0.001, "loss": 2.5432, "step": 2945 }, { "epoch": 0.33, "grad_norm": 0.35099634528160095, "learning_rate": 0.001, "loss": 2.5396, "step": 2950 }, { "epoch": 0.33, "grad_norm": 0.3457493484020233, "learning_rate": 0.001, "loss": 2.5672, "step": 2955 }, { "epoch": 0.33, "grad_norm": 0.36206451058387756, "learning_rate": 0.001, "loss": 2.5766, "step": 2960 }, { "epoch": 0.33, "grad_norm": 0.3406093716621399, "learning_rate": 0.001, "loss": 2.6046, "step": 2965 }, { "epoch": 0.33, "grad_norm": 0.35393887758255005, "learning_rate": 0.001, "loss": 2.5463, "step": 2970 }, { "epoch": 0.33, "grad_norm": 0.3434804081916809, "learning_rate": 0.001, "loss": 2.5328, "step": 2975 }, { "epoch": 0.33, "grad_norm": 0.32725098729133606, "learning_rate": 0.001, "loss": 2.6078, "step": 2980 }, { "epoch": 0.33, "grad_norm": 0.3323499858379364, "learning_rate": 0.001, "loss": 2.5363, "step": 2985 }, { "epoch": 0.33, "grad_norm": 0.3031046390533447, "learning_rate": 0.001, "loss": 2.5489, "step": 2990 }, { "epoch": 0.33, "grad_norm": 0.3338392674922943, "learning_rate": 0.001, "loss": 2.5397, "step": 2995 }, { "epoch": 0.33, "grad_norm": 0.3185463547706604, "learning_rate": 0.001, "loss": 2.5963, "step": 3000 }, { "epoch": 0.34, "grad_norm": 0.34104982018470764, "learning_rate": 0.001, "loss": 2.5777, "step": 3005 }, { "epoch": 0.34, "grad_norm": 0.35556039214134216, "learning_rate": 0.001, "loss": 2.5151, "step": 3010 }, { "epoch": 0.34, "grad_norm": 0.3338676989078522, "learning_rate": 0.001, "loss": 2.501, "step": 3015 }, { "epoch": 0.34, "grad_norm": 0.35118651390075684, "learning_rate": 0.001, "loss": 2.586, "step": 3020 }, { "epoch": 0.34, "grad_norm": 0.32692933082580566, "learning_rate": 0.001, "loss": 2.6081, "step": 3025 }, { "epoch": 0.34, "grad_norm": 0.34374868869781494, "learning_rate": 0.001, "loss": 2.6544, "step": 3030 }, { "epoch": 0.34, "grad_norm": 0.3365413546562195, "learning_rate": 0.001, "loss": 2.5178, "step": 3035 }, { "epoch": 0.34, "grad_norm": 0.33963915705680847, "learning_rate": 0.001, "loss": 2.5041, "step": 3040 }, { "epoch": 0.34, "grad_norm": 0.3642427623271942, "learning_rate": 0.001, "loss": 2.5709, "step": 3045 }, { "epoch": 0.34, "grad_norm": 0.33113953471183777, "learning_rate": 0.001, "loss": 2.62, "step": 3050 }, { "epoch": 0.34, "grad_norm": 0.36132606863975525, "learning_rate": 0.001, "loss": 2.4981, "step": 3055 }, { "epoch": 0.34, "grad_norm": 0.34095078706741333, "learning_rate": 0.001, "loss": 2.5128, "step": 3060 }, { "epoch": 0.34, "grad_norm": 0.327793151140213, "learning_rate": 0.001, "loss": 2.4821, "step": 3065 }, { "epoch": 0.34, "grad_norm": 0.33965983986854553, "learning_rate": 0.001, "loss": 2.6404, "step": 3070 }, { "epoch": 0.34, "grad_norm": 0.3375244438648224, "learning_rate": 0.001, "loss": 2.5655, "step": 3075 }, { "epoch": 0.34, "grad_norm": 0.34356772899627686, "learning_rate": 0.001, "loss": 2.5837, "step": 3080 }, { "epoch": 0.34, "grad_norm": 0.34157508611679077, "learning_rate": 0.001, "loss": 2.6517, "step": 3085 }, { "epoch": 0.34, "grad_norm": 0.3461490571498871, "learning_rate": 0.001, "loss": 2.598, "step": 3090 }, { "epoch": 0.35, "grad_norm": 0.3149019777774811, "learning_rate": 0.001, "loss": 2.5534, "step": 3095 }, { "epoch": 0.35, "grad_norm": 0.3143615126609802, "learning_rate": 0.001, "loss": 2.5102, "step": 3100 }, { "epoch": 0.35, "grad_norm": 0.3524455726146698, "learning_rate": 0.001, "loss": 2.5703, "step": 3105 }, { "epoch": 0.35, "grad_norm": 0.3223250210285187, "learning_rate": 0.001, "loss": 2.4897, "step": 3110 }, { "epoch": 0.35, "grad_norm": 0.3235696852207184, "learning_rate": 0.001, "loss": 2.5577, "step": 3115 }, { "epoch": 0.35, "grad_norm": 0.3154476284980774, "learning_rate": 0.001, "loss": 2.551, "step": 3120 }, { "epoch": 0.35, "grad_norm": 0.3305743634700775, "learning_rate": 0.001, "loss": 2.5497, "step": 3125 }, { "epoch": 0.35, "grad_norm": 0.3358781337738037, "learning_rate": 0.001, "loss": 2.4787, "step": 3130 }, { "epoch": 0.35, "grad_norm": 0.33123084902763367, "learning_rate": 0.001, "loss": 2.5887, "step": 3135 }, { "epoch": 0.35, "grad_norm": 0.3147204518318176, "learning_rate": 0.001, "loss": 2.5253, "step": 3140 }, { "epoch": 0.35, "grad_norm": 0.31788182258605957, "learning_rate": 0.001, "loss": 2.5734, "step": 3145 }, { "epoch": 0.35, "grad_norm": 0.3127879202365875, "learning_rate": 0.001, "loss": 2.4203, "step": 3150 }, { "epoch": 0.35, "grad_norm": 0.3299925625324249, "learning_rate": 0.001, "loss": 2.5275, "step": 3155 }, { "epoch": 0.35, "grad_norm": 0.3190239667892456, "learning_rate": 0.001, "loss": 2.4905, "step": 3160 }, { "epoch": 0.35, "grad_norm": 0.34184321761131287, "learning_rate": 0.001, "loss": 2.4114, "step": 3165 }, { "epoch": 0.35, "grad_norm": 0.31968846917152405, "learning_rate": 0.001, "loss": 2.5814, "step": 3170 }, { "epoch": 0.35, "grad_norm": 0.3431030213832855, "learning_rate": 0.001, "loss": 2.6748, "step": 3175 }, { "epoch": 0.35, "grad_norm": 0.3149561882019043, "learning_rate": 0.001, "loss": 2.5587, "step": 3180 }, { "epoch": 0.36, "grad_norm": 0.3136409521102905, "learning_rate": 0.001, "loss": 2.587, "step": 3185 }, { "epoch": 0.36, "grad_norm": 0.31620776653289795, "learning_rate": 0.001, "loss": 2.5091, "step": 3190 }, { "epoch": 0.36, "grad_norm": 0.3681703805923462, "learning_rate": 0.001, "loss": 2.5609, "step": 3195 }, { "epoch": 0.36, "grad_norm": 0.30699193477630615, "learning_rate": 0.001, "loss": 2.6231, "step": 3200 }, { "epoch": 0.36, "grad_norm": 0.319044291973114, "learning_rate": 0.001, "loss": 2.652, "step": 3205 }, { "epoch": 0.36, "grad_norm": 0.3128690719604492, "learning_rate": 0.001, "loss": 2.4954, "step": 3210 }, { "epoch": 0.36, "grad_norm": 0.328692764043808, "learning_rate": 0.001, "loss": 2.4826, "step": 3215 }, { "epoch": 0.36, "grad_norm": 0.3168734014034271, "learning_rate": 0.001, "loss": 2.4623, "step": 3220 }, { "epoch": 0.36, "grad_norm": 0.31120923161506653, "learning_rate": 0.001, "loss": 2.5033, "step": 3225 }, { "epoch": 0.36, "grad_norm": 0.3216933608055115, "learning_rate": 0.001, "loss": 2.5691, "step": 3230 }, { "epoch": 0.36, "grad_norm": 0.33055299520492554, "learning_rate": 0.001, "loss": 2.5753, "step": 3235 }, { "epoch": 0.36, "grad_norm": 0.33167943358421326, "learning_rate": 0.001, "loss": 2.4651, "step": 3240 }, { "epoch": 0.36, "grad_norm": 0.3290786147117615, "learning_rate": 0.001, "loss": 2.557, "step": 3245 }, { "epoch": 0.36, "grad_norm": 0.3318544030189514, "learning_rate": 0.001, "loss": 2.5146, "step": 3250 }, { "epoch": 0.36, "grad_norm": 0.31007036566734314, "learning_rate": 0.001, "loss": 2.463, "step": 3255 }, { "epoch": 0.36, "grad_norm": 0.31148260831832886, "learning_rate": 0.001, "loss": 2.5997, "step": 3260 }, { "epoch": 0.36, "grad_norm": 0.3110112249851227, "learning_rate": 0.001, "loss": 2.5517, "step": 3265 }, { "epoch": 0.36, "grad_norm": 0.3323996663093567, "learning_rate": 0.001, "loss": 2.5036, "step": 3270 }, { "epoch": 0.37, "grad_norm": 0.3133048713207245, "learning_rate": 0.001, "loss": 2.4954, "step": 3275 }, { "epoch": 0.37, "grad_norm": 0.3396298289299011, "learning_rate": 0.001, "loss": 2.5113, "step": 3280 }, { "epoch": 0.37, "grad_norm": 0.3418448567390442, "learning_rate": 0.001, "loss": 2.5753, "step": 3285 }, { "epoch": 0.37, "grad_norm": 0.34545567631721497, "learning_rate": 0.001, "loss": 2.5325, "step": 3290 }, { "epoch": 0.37, "grad_norm": 0.3261316418647766, "learning_rate": 0.001, "loss": 2.5489, "step": 3295 }, { "epoch": 0.37, "grad_norm": 0.3248863220214844, "learning_rate": 0.001, "loss": 2.4554, "step": 3300 }, { "epoch": 0.37, "grad_norm": 0.30948251485824585, "learning_rate": 0.001, "loss": 2.497, "step": 3305 }, { "epoch": 0.37, "grad_norm": 0.32646802067756653, "learning_rate": 0.001, "loss": 2.5322, "step": 3310 }, { "epoch": 0.37, "grad_norm": 0.3012072741985321, "learning_rate": 0.001, "loss": 2.4644, "step": 3315 }, { "epoch": 0.37, "grad_norm": 0.3290029764175415, "learning_rate": 0.001, "loss": 2.5015, "step": 3320 }, { "epoch": 0.37, "grad_norm": 0.3250294625759125, "learning_rate": 0.001, "loss": 2.4714, "step": 3325 }, { "epoch": 0.37, "grad_norm": 0.2954888939857483, "learning_rate": 0.001, "loss": 2.5344, "step": 3330 }, { "epoch": 0.37, "grad_norm": 0.31430017948150635, "learning_rate": 0.001, "loss": 2.518, "step": 3335 }, { "epoch": 0.37, "grad_norm": 0.33378204703330994, "learning_rate": 0.001, "loss": 2.5019, "step": 3340 }, { "epoch": 0.37, "grad_norm": 0.33612725138664246, "learning_rate": 0.001, "loss": 2.5037, "step": 3345 }, { "epoch": 0.37, "grad_norm": 0.30687910318374634, "learning_rate": 0.001, "loss": 2.4912, "step": 3350 }, { "epoch": 0.37, "grad_norm": 0.30845963954925537, "learning_rate": 0.001, "loss": 2.4837, "step": 3355 }, { "epoch": 0.37, "grad_norm": 0.29248157143592834, "learning_rate": 0.001, "loss": 2.4905, "step": 3360 }, { "epoch": 0.38, "grad_norm": 0.2903001606464386, "learning_rate": 0.001, "loss": 2.4785, "step": 3365 }, { "epoch": 0.38, "grad_norm": 0.31211957335472107, "learning_rate": 0.001, "loss": 2.484, "step": 3370 }, { "epoch": 0.38, "grad_norm": 0.3141609728336334, "learning_rate": 0.001, "loss": 2.6068, "step": 3375 }, { "epoch": 0.38, "grad_norm": 0.30534839630126953, "learning_rate": 0.001, "loss": 2.4743, "step": 3380 }, { "epoch": 0.38, "grad_norm": 0.31994548439979553, "learning_rate": 0.001, "loss": 2.521, "step": 3385 }, { "epoch": 0.38, "grad_norm": 0.31002023816108704, "learning_rate": 0.001, "loss": 2.6565, "step": 3390 }, { "epoch": 0.38, "grad_norm": 0.3236083984375, "learning_rate": 0.001, "loss": 2.5602, "step": 3395 }, { "epoch": 0.38, "grad_norm": 0.3060869872570038, "learning_rate": 0.001, "loss": 2.4858, "step": 3400 }, { "epoch": 0.38, "grad_norm": 0.28208038210868835, "learning_rate": 0.001, "loss": 2.4991, "step": 3405 }, { "epoch": 0.38, "grad_norm": 0.33403393626213074, "learning_rate": 0.001, "loss": 2.4914, "step": 3410 }, { "epoch": 0.38, "grad_norm": 0.382112592458725, "learning_rate": 0.001, "loss": 2.5804, "step": 3415 }, { "epoch": 0.38, "grad_norm": 0.2976379692554474, "learning_rate": 0.001, "loss": 2.4831, "step": 3420 }, { "epoch": 0.38, "grad_norm": 0.29742851853370667, "learning_rate": 0.001, "loss": 2.5167, "step": 3425 }, { "epoch": 0.38, "grad_norm": 0.27965688705444336, "learning_rate": 0.001, "loss": 2.4606, "step": 3430 }, { "epoch": 0.38, "grad_norm": 0.3177298307418823, "learning_rate": 0.001, "loss": 2.4297, "step": 3435 }, { "epoch": 0.38, "grad_norm": 0.3082432150840759, "learning_rate": 0.001, "loss": 2.4864, "step": 3440 }, { "epoch": 0.38, "grad_norm": 0.30660197138786316, "learning_rate": 0.001, "loss": 2.4922, "step": 3445 }, { "epoch": 0.38, "grad_norm": 0.3045155704021454, "learning_rate": 0.001, "loss": 2.5424, "step": 3450 }, { "epoch": 0.39, "grad_norm": 0.3095363974571228, "learning_rate": 0.001, "loss": 2.5205, "step": 3455 }, { "epoch": 0.39, "grad_norm": 0.30330196022987366, "learning_rate": 0.001, "loss": 2.4601, "step": 3460 }, { "epoch": 0.39, "grad_norm": 0.31325945258140564, "learning_rate": 0.001, "loss": 2.4919, "step": 3465 }, { "epoch": 0.39, "grad_norm": 0.2912018597126007, "learning_rate": 0.001, "loss": 2.4719, "step": 3470 }, { "epoch": 0.39, "grad_norm": 0.3126067519187927, "learning_rate": 0.001, "loss": 2.5373, "step": 3475 }, { "epoch": 0.39, "grad_norm": 0.29371434450149536, "learning_rate": 0.001, "loss": 2.5213, "step": 3480 }, { "epoch": 0.39, "grad_norm": 0.30860206484794617, "learning_rate": 0.001, "loss": 2.4439, "step": 3485 }, { "epoch": 0.39, "grad_norm": 0.30670803785324097, "learning_rate": 0.001, "loss": 2.551, "step": 3490 }, { "epoch": 0.39, "grad_norm": 0.3091505765914917, "learning_rate": 0.001, "loss": 2.5826, "step": 3495 }, { "epoch": 0.39, "grad_norm": 0.3141428232192993, "learning_rate": 0.001, "loss": 2.4817, "step": 3500 }, { "epoch": 0.39, "grad_norm": 0.2985532283782959, "learning_rate": 0.001, "loss": 2.5519, "step": 3505 }, { "epoch": 0.39, "grad_norm": 0.3592018187046051, "learning_rate": 0.001, "loss": 2.4774, "step": 3510 }, { "epoch": 0.39, "grad_norm": 0.2912810444831848, "learning_rate": 0.001, "loss": 2.5316, "step": 3515 }, { "epoch": 0.39, "grad_norm": 0.3026273548603058, "learning_rate": 0.001, "loss": 2.451, "step": 3520 }, { "epoch": 0.39, "grad_norm": 0.3669629693031311, "learning_rate": 0.001, "loss": 2.4407, "step": 3525 }, { "epoch": 0.39, "grad_norm": 0.334534227848053, "learning_rate": 0.001, "loss": 2.5109, "step": 3530 }, { "epoch": 0.39, "grad_norm": 0.33897146582603455, "learning_rate": 0.001, "loss": 2.4755, "step": 3535 }, { "epoch": 0.39, "grad_norm": 0.30617454648017883, "learning_rate": 0.001, "loss": 2.5165, "step": 3540 }, { "epoch": 0.4, "grad_norm": 0.30545178055763245, "learning_rate": 0.001, "loss": 2.5267, "step": 3545 }, { "epoch": 0.4, "grad_norm": 0.3039405345916748, "learning_rate": 0.001, "loss": 2.508, "step": 3550 }, { "epoch": 0.4, "grad_norm": 0.31085509061813354, "learning_rate": 0.001, "loss": 2.5329, "step": 3555 }, { "epoch": 0.4, "grad_norm": 0.3017004132270813, "learning_rate": 0.001, "loss": 2.4686, "step": 3560 }, { "epoch": 0.4, "grad_norm": 0.2899893522262573, "learning_rate": 0.001, "loss": 2.5395, "step": 3565 }, { "epoch": 0.4, "grad_norm": 0.3174129128456116, "learning_rate": 0.001, "loss": 2.4909, "step": 3570 }, { "epoch": 0.4, "grad_norm": 0.32396823167800903, "learning_rate": 0.001, "loss": 2.4498, "step": 3575 }, { "epoch": 0.4, "grad_norm": 0.31135293841362, "learning_rate": 0.001, "loss": 2.4446, "step": 3580 }, { "epoch": 0.4, "grad_norm": 0.32165661454200745, "learning_rate": 0.001, "loss": 2.4139, "step": 3585 }, { "epoch": 0.4, "grad_norm": 0.3169751465320587, "learning_rate": 0.001, "loss": 2.4799, "step": 3590 }, { "epoch": 0.4, "grad_norm": 0.31132322549819946, "learning_rate": 0.001, "loss": 2.5442, "step": 3595 }, { "epoch": 0.4, "grad_norm": 0.3260294497013092, "learning_rate": 0.001, "loss": 2.4054, "step": 3600 }, { "epoch": 0.4, "grad_norm": 0.29337745904922485, "learning_rate": 0.001, "loss": 2.4915, "step": 3605 }, { "epoch": 0.4, "grad_norm": 0.31145504117012024, "learning_rate": 0.001, "loss": 2.4534, "step": 3610 }, { "epoch": 0.4, "grad_norm": 0.299142450094223, "learning_rate": 0.001, "loss": 2.4935, "step": 3615 }, { "epoch": 0.4, "grad_norm": 0.3256239593029022, "learning_rate": 0.001, "loss": 2.4649, "step": 3620 }, { "epoch": 0.4, "grad_norm": 0.30987122654914856, "learning_rate": 0.001, "loss": 2.4172, "step": 3625 }, { "epoch": 0.4, "grad_norm": 0.4387304484844208, "learning_rate": 0.001, "loss": 2.442, "step": 3630 }, { "epoch": 0.41, "grad_norm": 0.28370875120162964, "learning_rate": 0.001, "loss": 2.4926, "step": 3635 }, { "epoch": 0.41, "grad_norm": 0.27789434790611267, "learning_rate": 0.001, "loss": 2.4874, "step": 3640 }, { "epoch": 0.41, "grad_norm": 0.298215389251709, "learning_rate": 0.001, "loss": 2.5084, "step": 3645 }, { "epoch": 0.41, "grad_norm": 0.28470468521118164, "learning_rate": 0.001, "loss": 2.4685, "step": 3650 }, { "epoch": 0.41, "grad_norm": 0.31583115458488464, "learning_rate": 0.001, "loss": 2.4241, "step": 3655 }, { "epoch": 0.41, "grad_norm": 0.3249650001525879, "learning_rate": 0.001, "loss": 2.415, "step": 3660 }, { "epoch": 0.41, "grad_norm": 0.33161094784736633, "learning_rate": 0.001, "loss": 2.4805, "step": 3665 }, { "epoch": 0.41, "grad_norm": 0.29928529262542725, "learning_rate": 0.001, "loss": 2.4966, "step": 3670 }, { "epoch": 0.41, "grad_norm": 0.31636932492256165, "learning_rate": 0.001, "loss": 2.5237, "step": 3675 }, { "epoch": 0.41, "grad_norm": 0.3295460641384125, "learning_rate": 0.001, "loss": 2.4059, "step": 3680 }, { "epoch": 0.41, "grad_norm": 0.3235255777835846, "learning_rate": 0.001, "loss": 2.4589, "step": 3685 }, { "epoch": 0.41, "grad_norm": 0.32697299122810364, "learning_rate": 0.001, "loss": 2.3683, "step": 3690 }, { "epoch": 0.41, "grad_norm": 0.3994431793689728, "learning_rate": 0.001, "loss": 2.4794, "step": 3695 }, { "epoch": 0.41, "grad_norm": 0.3653365671634674, "learning_rate": 0.001, "loss": 2.5027, "step": 3700 }, { "epoch": 0.41, "grad_norm": 0.3047979772090912, "learning_rate": 0.001, "loss": 2.4745, "step": 3705 }, { "epoch": 0.41, "grad_norm": 0.4140577018260956, "learning_rate": 0.001, "loss": 2.5028, "step": 3710 }, { "epoch": 0.41, "grad_norm": 0.3812958300113678, "learning_rate": 0.001, "loss": 2.4999, "step": 3715 }, { "epoch": 0.41, "grad_norm": 0.32625457644462585, "learning_rate": 0.001, "loss": 2.4991, "step": 3720 }, { "epoch": 0.42, "grad_norm": 0.30464789271354675, "learning_rate": 0.001, "loss": 2.4738, "step": 3725 }, { "epoch": 0.42, "grad_norm": 0.3046870827674866, "learning_rate": 0.001, "loss": 2.5732, "step": 3730 }, { "epoch": 0.42, "grad_norm": 0.33171603083610535, "learning_rate": 0.001, "loss": 2.4796, "step": 3735 }, { "epoch": 0.42, "grad_norm": 0.2825039029121399, "learning_rate": 0.001, "loss": 2.4877, "step": 3740 }, { "epoch": 0.42, "grad_norm": 0.3741876184940338, "learning_rate": 0.001, "loss": 2.5031, "step": 3745 }, { "epoch": 0.42, "grad_norm": 0.30283525586128235, "learning_rate": 0.001, "loss": 2.4659, "step": 3750 }, { "epoch": 0.42, "grad_norm": 0.28069236874580383, "learning_rate": 0.001, "loss": 2.402, "step": 3755 }, { "epoch": 0.42, "grad_norm": 0.273360937833786, "learning_rate": 0.001, "loss": 2.5441, "step": 3760 }, { "epoch": 0.42, "grad_norm": 0.3147111237049103, "learning_rate": 0.001, "loss": 2.5057, "step": 3765 }, { "epoch": 0.42, "grad_norm": 0.3091120719909668, "learning_rate": 0.001, "loss": 2.4444, "step": 3770 }, { "epoch": 0.42, "grad_norm": 0.3067731559276581, "learning_rate": 0.001, "loss": 2.5245, "step": 3775 }, { "epoch": 0.42, "grad_norm": 0.2935093939304352, "learning_rate": 0.001, "loss": 2.5006, "step": 3780 }, { "epoch": 0.42, "grad_norm": 0.3160226345062256, "learning_rate": 0.001, "loss": 2.4026, "step": 3785 }, { "epoch": 0.42, "grad_norm": 0.306943416595459, "learning_rate": 0.001, "loss": 2.5193, "step": 3790 }, { "epoch": 0.42, "grad_norm": 0.28538256883621216, "learning_rate": 0.001, "loss": 2.4627, "step": 3795 }, { "epoch": 0.42, "grad_norm": 0.2843259871006012, "learning_rate": 0.001, "loss": 2.3829, "step": 3800 }, { "epoch": 0.42, "grad_norm": 0.2874767482280731, "learning_rate": 0.001, "loss": 2.511, "step": 3805 }, { "epoch": 0.42, "grad_norm": 0.26737627387046814, "learning_rate": 0.001, "loss": 2.439, "step": 3810 }, { "epoch": 0.43, "grad_norm": 0.30951711535453796, "learning_rate": 0.001, "loss": 2.4584, "step": 3815 }, { "epoch": 0.43, "grad_norm": 0.2935316264629364, "learning_rate": 0.001, "loss": 2.4268, "step": 3820 }, { "epoch": 0.43, "grad_norm": 0.2819925546646118, "learning_rate": 0.001, "loss": 2.4609, "step": 3825 }, { "epoch": 0.43, "grad_norm": 0.278676837682724, "learning_rate": 0.001, "loss": 2.3241, "step": 3830 }, { "epoch": 0.43, "grad_norm": 0.2986203134059906, "learning_rate": 0.001, "loss": 2.5406, "step": 3835 }, { "epoch": 0.43, "grad_norm": 0.2985837161540985, "learning_rate": 0.001, "loss": 2.441, "step": 3840 }, { "epoch": 0.43, "grad_norm": 0.2917584180831909, "learning_rate": 0.001, "loss": 2.4911, "step": 3845 }, { "epoch": 0.43, "grad_norm": 0.29989174008369446, "learning_rate": 0.001, "loss": 2.5409, "step": 3850 }, { "epoch": 0.43, "grad_norm": 0.30956900119781494, "learning_rate": 0.001, "loss": 2.55, "step": 3855 }, { "epoch": 0.43, "grad_norm": 0.3005373477935791, "learning_rate": 0.001, "loss": 2.5079, "step": 3860 }, { "epoch": 0.43, "grad_norm": 0.2777903079986572, "learning_rate": 0.001, "loss": 2.4218, "step": 3865 }, { "epoch": 0.43, "grad_norm": 0.3362637162208557, "learning_rate": 0.001, "loss": 2.5395, "step": 3870 }, { "epoch": 0.43, "grad_norm": 0.299837201833725, "learning_rate": 0.001, "loss": 2.5473, "step": 3875 }, { "epoch": 0.43, "grad_norm": 0.28119298815727234, "learning_rate": 0.001, "loss": 2.4785, "step": 3880 }, { "epoch": 0.43, "grad_norm": 0.29247012734413147, "learning_rate": 0.001, "loss": 2.5776, "step": 3885 }, { "epoch": 0.43, "grad_norm": 0.27871406078338623, "learning_rate": 0.001, "loss": 2.4137, "step": 3890 }, { "epoch": 0.43, "grad_norm": 0.2752448320388794, "learning_rate": 0.001, "loss": 2.463, "step": 3895 }, { "epoch": 0.43, "grad_norm": 0.291769802570343, "learning_rate": 0.001, "loss": 2.4766, "step": 3900 }, { "epoch": 0.44, "grad_norm": 0.31150388717651367, "learning_rate": 0.001, "loss": 2.4992, "step": 3905 }, { "epoch": 0.44, "grad_norm": 0.2958334684371948, "learning_rate": 0.001, "loss": 2.4278, "step": 3910 }, { "epoch": 0.44, "grad_norm": 0.30901768803596497, "learning_rate": 0.001, "loss": 2.4791, "step": 3915 }, { "epoch": 0.44, "grad_norm": 0.3076885938644409, "learning_rate": 0.001, "loss": 2.4737, "step": 3920 }, { "epoch": 0.44, "grad_norm": 0.29541951417922974, "learning_rate": 0.001, "loss": 2.5097, "step": 3925 }, { "epoch": 0.44, "grad_norm": 0.30541756749153137, "learning_rate": 0.001, "loss": 2.495, "step": 3930 }, { "epoch": 0.44, "grad_norm": 0.29810360074043274, "learning_rate": 0.001, "loss": 2.3795, "step": 3935 }, { "epoch": 0.44, "grad_norm": 0.30572912096977234, "learning_rate": 0.001, "loss": 2.4483, "step": 3940 }, { "epoch": 0.44, "grad_norm": 0.2885902523994446, "learning_rate": 0.001, "loss": 2.426, "step": 3945 }, { "epoch": 0.44, "grad_norm": 0.2982771396636963, "learning_rate": 0.001, "loss": 2.3061, "step": 3950 }, { "epoch": 0.44, "grad_norm": 0.3130563497543335, "learning_rate": 0.001, "loss": 2.4545, "step": 3955 }, { "epoch": 0.44, "grad_norm": 0.2932124137878418, "learning_rate": 0.001, "loss": 2.4065, "step": 3960 }, { "epoch": 0.44, "grad_norm": 0.28204530477523804, "learning_rate": 0.001, "loss": 2.4234, "step": 3965 }, { "epoch": 0.44, "grad_norm": 0.29289600253105164, "learning_rate": 0.001, "loss": 2.3772, "step": 3970 }, { "epoch": 0.44, "grad_norm": 0.2805161774158478, "learning_rate": 0.001, "loss": 2.5557, "step": 3975 }, { "epoch": 0.44, "grad_norm": 0.2907586097717285, "learning_rate": 0.001, "loss": 2.5438, "step": 3980 }, { "epoch": 0.44, "grad_norm": 0.2835649251937866, "learning_rate": 0.001, "loss": 2.4499, "step": 3985 }, { "epoch": 0.44, "grad_norm": 0.2827214300632477, "learning_rate": 0.001, "loss": 2.4512, "step": 3990 }, { "epoch": 0.45, "grad_norm": 0.34279364347457886, "learning_rate": 0.001, "loss": 2.4925, "step": 3995 }, { "epoch": 0.45, "grad_norm": 0.2893143594264984, "learning_rate": 0.001, "loss": 2.4309, "step": 4000 }, { "epoch": 0.45, "grad_norm": 0.27287694811820984, "learning_rate": 0.001, "loss": 2.417, "step": 4005 }, { "epoch": 0.45, "grad_norm": 0.29163962602615356, "learning_rate": 0.001, "loss": 2.3912, "step": 4010 }, { "epoch": 0.45, "grad_norm": 0.3035675287246704, "learning_rate": 0.001, "loss": 2.4203, "step": 4015 }, { "epoch": 0.45, "grad_norm": 0.28289881348609924, "learning_rate": 0.001, "loss": 2.3951, "step": 4020 }, { "epoch": 0.45, "grad_norm": 0.3087094724178314, "learning_rate": 0.001, "loss": 2.4091, "step": 4025 }, { "epoch": 0.45, "grad_norm": 0.29908227920532227, "learning_rate": 0.001, "loss": 2.4428, "step": 4030 }, { "epoch": 0.45, "grad_norm": 0.28541573882102966, "learning_rate": 0.001, "loss": 2.3673, "step": 4035 }, { "epoch": 0.45, "grad_norm": 0.28469517827033997, "learning_rate": 0.001, "loss": 2.4308, "step": 4040 }, { "epoch": 0.45, "grad_norm": 0.3017624020576477, "learning_rate": 0.001, "loss": 2.417, "step": 4045 }, { "epoch": 0.45, "grad_norm": 0.3090055286884308, "learning_rate": 0.001, "loss": 2.3774, "step": 4050 }, { "epoch": 0.45, "grad_norm": 0.30866459012031555, "learning_rate": 0.001, "loss": 2.4988, "step": 4055 }, { "epoch": 0.45, "grad_norm": 0.3137664794921875, "learning_rate": 0.001, "loss": 2.5087, "step": 4060 }, { "epoch": 0.45, "grad_norm": 0.2756161689758301, "learning_rate": 0.001, "loss": 2.4424, "step": 4065 }, { "epoch": 0.45, "grad_norm": 0.28431424498558044, "learning_rate": 0.001, "loss": 2.4091, "step": 4070 }, { "epoch": 0.45, "grad_norm": 0.2876141667366028, "learning_rate": 0.001, "loss": 2.4231, "step": 4075 }, { "epoch": 0.45, "grad_norm": 0.27080801129341125, "learning_rate": 0.001, "loss": 2.4929, "step": 4080 }, { "epoch": 0.46, "grad_norm": 0.2989480197429657, "learning_rate": 0.001, "loss": 2.4463, "step": 4085 }, { "epoch": 0.46, "grad_norm": 0.3767198622226715, "learning_rate": 0.001, "loss": 2.5178, "step": 4090 }, { "epoch": 0.46, "grad_norm": 0.27639487385749817, "learning_rate": 0.001, "loss": 2.4472, "step": 4095 }, { "epoch": 0.46, "grad_norm": 0.30378058552742004, "learning_rate": 0.001, "loss": 2.4917, "step": 4100 }, { "epoch": 0.46, "grad_norm": 0.2825583219528198, "learning_rate": 0.001, "loss": 2.4288, "step": 4105 }, { "epoch": 0.46, "grad_norm": 0.26925694942474365, "learning_rate": 0.001, "loss": 2.4241, "step": 4110 }, { "epoch": 0.46, "grad_norm": 0.287741482257843, "learning_rate": 0.001, "loss": 2.4622, "step": 4115 }, { "epoch": 0.46, "grad_norm": 0.312418133020401, "learning_rate": 0.001, "loss": 2.4169, "step": 4120 }, { "epoch": 0.46, "grad_norm": 0.2728055417537689, "learning_rate": 0.001, "loss": 2.464, "step": 4125 }, { "epoch": 0.46, "grad_norm": 0.2967456877231598, "learning_rate": 0.001, "loss": 2.5317, "step": 4130 }, { "epoch": 0.46, "grad_norm": 0.2888936698436737, "learning_rate": 0.001, "loss": 2.4206, "step": 4135 }, { "epoch": 0.46, "grad_norm": 0.2940239906311035, "learning_rate": 0.001, "loss": 2.5697, "step": 4140 }, { "epoch": 0.46, "grad_norm": 0.33775338530540466, "learning_rate": 0.001, "loss": 2.3773, "step": 4145 }, { "epoch": 0.46, "grad_norm": 0.2790275812149048, "learning_rate": 0.001, "loss": 2.3066, "step": 4150 }, { "epoch": 0.46, "grad_norm": 0.26847320795059204, "learning_rate": 0.001, "loss": 2.4747, "step": 4155 }, { "epoch": 0.46, "grad_norm": 0.3185865879058838, "learning_rate": 0.001, "loss": 2.4139, "step": 4160 }, { "epoch": 0.46, "grad_norm": 0.278110533952713, "learning_rate": 0.001, "loss": 2.5095, "step": 4165 }, { "epoch": 0.46, "grad_norm": 0.3306000828742981, "learning_rate": 0.001, "loss": 2.4582, "step": 4170 }, { "epoch": 0.47, "grad_norm": 0.2653733491897583, "learning_rate": 0.001, "loss": 2.4082, "step": 4175 }, { "epoch": 0.47, "grad_norm": 0.27316001057624817, "learning_rate": 0.001, "loss": 2.4328, "step": 4180 }, { "epoch": 0.47, "grad_norm": 0.2686551809310913, "learning_rate": 0.001, "loss": 2.3893, "step": 4185 }, { "epoch": 0.47, "grad_norm": 0.2632417380809784, "learning_rate": 0.001, "loss": 2.4319, "step": 4190 }, { "epoch": 0.47, "grad_norm": 0.3223254084587097, "learning_rate": 0.001, "loss": 2.459, "step": 4195 }, { "epoch": 0.47, "grad_norm": 0.31923434138298035, "learning_rate": 0.001, "loss": 2.4144, "step": 4200 }, { "epoch": 0.47, "grad_norm": 0.28261134028434753, "learning_rate": 0.001, "loss": 2.4485, "step": 4205 }, { "epoch": 0.47, "grad_norm": 0.2726362943649292, "learning_rate": 0.001, "loss": 2.5626, "step": 4210 }, { "epoch": 0.47, "grad_norm": 0.2785876393318176, "learning_rate": 0.001, "loss": 2.4329, "step": 4215 }, { "epoch": 0.47, "grad_norm": 0.28101497888565063, "learning_rate": 0.001, "loss": 2.4212, "step": 4220 }, { "epoch": 0.47, "grad_norm": 0.3232978284358978, "learning_rate": 0.001, "loss": 2.4316, "step": 4225 }, { "epoch": 0.47, "grad_norm": 0.26223674416542053, "learning_rate": 0.001, "loss": 2.3672, "step": 4230 }, { "epoch": 0.47, "grad_norm": 0.28561317920684814, "learning_rate": 0.001, "loss": 2.3806, "step": 4235 }, { "epoch": 0.47, "grad_norm": 0.2884681820869446, "learning_rate": 0.001, "loss": 2.4717, "step": 4240 }, { "epoch": 0.47, "grad_norm": 0.27227768301963806, "learning_rate": 0.001, "loss": 2.4416, "step": 4245 }, { "epoch": 0.47, "grad_norm": 0.272927850484848, "learning_rate": 0.001, "loss": 2.4401, "step": 4250 }, { "epoch": 0.47, "grad_norm": 0.2963358461856842, "learning_rate": 0.001, "loss": 2.3382, "step": 4255 }, { "epoch": 0.47, "grad_norm": 0.28761300444602966, "learning_rate": 0.001, "loss": 2.4503, "step": 4260 }, { "epoch": 0.48, "grad_norm": 0.278731107711792, "learning_rate": 0.001, "loss": 2.4327, "step": 4265 }, { "epoch": 0.48, "grad_norm": 0.25920960307121277, "learning_rate": 0.001, "loss": 2.3216, "step": 4270 }, { "epoch": 0.48, "grad_norm": 0.31134849786758423, "learning_rate": 0.001, "loss": 2.3888, "step": 4275 }, { "epoch": 0.48, "grad_norm": 0.30488523840904236, "learning_rate": 0.001, "loss": 2.4174, "step": 4280 }, { "epoch": 0.48, "grad_norm": 0.2729250192642212, "learning_rate": 0.001, "loss": 2.4526, "step": 4285 }, { "epoch": 0.48, "grad_norm": 0.26827767491340637, "learning_rate": 0.001, "loss": 2.4263, "step": 4290 }, { "epoch": 0.48, "grad_norm": 0.31564125418663025, "learning_rate": 0.001, "loss": 2.405, "step": 4295 }, { "epoch": 0.48, "grad_norm": 0.3122270405292511, "learning_rate": 0.001, "loss": 2.377, "step": 4300 }, { "epoch": 0.48, "grad_norm": 0.2852209508419037, "learning_rate": 0.001, "loss": 2.4195, "step": 4305 }, { "epoch": 0.48, "grad_norm": 0.2710750699043274, "learning_rate": 0.001, "loss": 2.4325, "step": 4310 }, { "epoch": 0.48, "grad_norm": 0.26912280917167664, "learning_rate": 0.001, "loss": 2.3645, "step": 4315 }, { "epoch": 0.48, "grad_norm": 0.32176169753074646, "learning_rate": 0.001, "loss": 2.4785, "step": 4320 }, { "epoch": 0.48, "grad_norm": 0.3187229633331299, "learning_rate": 0.001, "loss": 2.5005, "step": 4325 }, { "epoch": 0.48, "grad_norm": 0.3275264799594879, "learning_rate": 0.001, "loss": 2.4553, "step": 4330 }, { "epoch": 0.48, "grad_norm": 0.2660835087299347, "learning_rate": 0.001, "loss": 2.4242, "step": 4335 }, { "epoch": 0.48, "grad_norm": 0.257010817527771, "learning_rate": 0.001, "loss": 2.5045, "step": 4340 }, { "epoch": 0.48, "grad_norm": 0.274821013212204, "learning_rate": 0.001, "loss": 2.4028, "step": 4345 }, { "epoch": 0.49, "grad_norm": 0.2845490276813507, "learning_rate": 0.001, "loss": 2.4315, "step": 4350 }, { "epoch": 0.49, "grad_norm": 0.2754290997982025, "learning_rate": 0.001, "loss": 2.3878, "step": 4355 }, { "epoch": 0.49, "grad_norm": 0.2966209053993225, "learning_rate": 0.001, "loss": 2.4095, "step": 4360 }, { "epoch": 0.49, "grad_norm": 0.29566988348960876, "learning_rate": 0.001, "loss": 2.4789, "step": 4365 }, { "epoch": 0.49, "grad_norm": 0.27423152327537537, "learning_rate": 0.001, "loss": 2.3935, "step": 4370 }, { "epoch": 0.49, "grad_norm": 0.2520494759082794, "learning_rate": 0.001, "loss": 2.4523, "step": 4375 }, { "epoch": 0.49, "grad_norm": 0.2906912565231323, "learning_rate": 0.001, "loss": 2.4836, "step": 4380 }, { "epoch": 0.49, "grad_norm": 0.2802337408065796, "learning_rate": 0.001, "loss": 2.4883, "step": 4385 }, { "epoch": 0.49, "grad_norm": 0.2806122303009033, "learning_rate": 0.001, "loss": 2.4134, "step": 4390 }, { "epoch": 0.49, "grad_norm": 0.27291759848594666, "learning_rate": 0.001, "loss": 2.4205, "step": 4395 }, { "epoch": 0.49, "grad_norm": 0.3132713735103607, "learning_rate": 0.001, "loss": 2.4312, "step": 4400 }, { "epoch": 0.49, "grad_norm": 0.3540079891681671, "learning_rate": 0.001, "loss": 2.42, "step": 4405 }, { "epoch": 0.49, "grad_norm": 0.2768736183643341, "learning_rate": 0.001, "loss": 2.4446, "step": 4410 }, { "epoch": 0.49, "grad_norm": 0.28580641746520996, "learning_rate": 0.001, "loss": 2.4914, "step": 4415 }, { "epoch": 0.49, "grad_norm": 0.33273836970329285, "learning_rate": 0.001, "loss": 2.4596, "step": 4420 }, { "epoch": 0.49, "grad_norm": 0.27512285113334656, "learning_rate": 0.001, "loss": 2.4612, "step": 4425 }, { "epoch": 0.49, "grad_norm": 0.28600168228149414, "learning_rate": 0.001, "loss": 2.3727, "step": 4430 }, { "epoch": 0.49, "grad_norm": 0.28686290979385376, "learning_rate": 0.001, "loss": 2.4356, "step": 4435 }, { "epoch": 0.5, "grad_norm": 0.30344080924987793, "learning_rate": 0.001, "loss": 2.5105, "step": 4440 }, { "epoch": 0.5, "grad_norm": 0.34767141938209534, "learning_rate": 0.001, "loss": 2.4046, "step": 4445 }, { "epoch": 0.5, "grad_norm": 0.28735655546188354, "learning_rate": 0.001, "loss": 2.3914, "step": 4450 }, { "epoch": 0.5, "grad_norm": 0.2845683693885803, "learning_rate": 0.001, "loss": 2.3604, "step": 4455 }, { "epoch": 0.5, "grad_norm": 0.3194609582424164, "learning_rate": 0.001, "loss": 2.4498, "step": 4460 }, { "epoch": 0.5, "grad_norm": 0.2727400064468384, "learning_rate": 0.001, "loss": 2.4558, "step": 4465 }, { "epoch": 0.5, "grad_norm": 0.25680381059646606, "learning_rate": 0.001, "loss": 2.4769, "step": 4470 }, { "epoch": 0.5, "grad_norm": 0.29016733169555664, "learning_rate": 0.001, "loss": 2.3797, "step": 4475 }, { "epoch": 0.5, "grad_norm": 0.27482253313064575, "learning_rate": 0.001, "loss": 2.384, "step": 4480 }, { "epoch": 0.5, "grad_norm": 0.3203965723514557, "learning_rate": 0.001, "loss": 2.5246, "step": 4485 }, { "epoch": 0.5, "grad_norm": 0.27123692631721497, "learning_rate": 0.001, "loss": 2.408, "step": 4490 }, { "epoch": 0.5, "grad_norm": 0.2830926477909088, "learning_rate": 0.001, "loss": 2.4685, "step": 4495 }, { "epoch": 0.5, "grad_norm": 0.2557945251464844, "learning_rate": 0.001, "loss": 2.4439, "step": 4500 }, { "epoch": 0.5, "grad_norm": 0.30656105279922485, "learning_rate": 0.001, "loss": 2.3327, "step": 4505 }, { "epoch": 0.5, "grad_norm": 0.2598239481449127, "learning_rate": 0.001, "loss": 2.4493, "step": 4510 }, { "epoch": 0.5, "grad_norm": 0.31256821751594543, "learning_rate": 0.001, "loss": 2.3842, "step": 4515 }, { "epoch": 0.5, "grad_norm": 0.3096126317977905, "learning_rate": 0.001, "loss": 2.4043, "step": 4520 }, { "epoch": 0.5, "grad_norm": 0.29334813356399536, "learning_rate": 0.001, "loss": 2.4784, "step": 4525 }, { "epoch": 0.51, "grad_norm": 0.270281046628952, "learning_rate": 0.001, "loss": 2.3645, "step": 4530 }, { "epoch": 0.51, "grad_norm": 0.2868260443210602, "learning_rate": 0.001, "loss": 2.4196, "step": 4535 }, { "epoch": 0.51, "grad_norm": 0.26428335905075073, "learning_rate": 0.001, "loss": 2.3695, "step": 4540 }, { "epoch": 0.51, "grad_norm": 0.28040027618408203, "learning_rate": 0.001, "loss": 2.4872, "step": 4545 }, { "epoch": 0.51, "grad_norm": 0.26213884353637695, "learning_rate": 0.001, "loss": 2.3695, "step": 4550 }, { "epoch": 0.51, "grad_norm": 0.27567023038864136, "learning_rate": 0.001, "loss": 2.3853, "step": 4555 }, { "epoch": 0.51, "grad_norm": 0.29068320989608765, "learning_rate": 0.001, "loss": 2.3579, "step": 4560 }, { "epoch": 0.51, "grad_norm": 0.2767714262008667, "learning_rate": 0.001, "loss": 2.4266, "step": 4565 }, { "epoch": 0.51, "grad_norm": 0.27447810769081116, "learning_rate": 0.001, "loss": 2.4005, "step": 4570 }, { "epoch": 0.51, "grad_norm": 0.26019272208213806, "learning_rate": 0.001, "loss": 2.3552, "step": 4575 }, { "epoch": 0.51, "grad_norm": 0.273270845413208, "learning_rate": 0.001, "loss": 2.3849, "step": 4580 }, { "epoch": 0.51, "grad_norm": 0.2733108699321747, "learning_rate": 0.001, "loss": 2.3321, "step": 4585 }, { "epoch": 0.51, "grad_norm": 0.26178601384162903, "learning_rate": 0.001, "loss": 2.4505, "step": 4590 }, { "epoch": 0.51, "grad_norm": 0.25937095284461975, "learning_rate": 0.001, "loss": 2.3453, "step": 4595 }, { "epoch": 0.51, "grad_norm": 0.27965015172958374, "learning_rate": 0.001, "loss": 2.4615, "step": 4600 }, { "epoch": 0.51, "grad_norm": 0.2662375867366791, "learning_rate": 0.001, "loss": 2.3325, "step": 4605 }, { "epoch": 0.51, "grad_norm": 0.2801031470298767, "learning_rate": 0.001, "loss": 2.4223, "step": 4610 }, { "epoch": 0.51, "grad_norm": 0.2957525849342346, "learning_rate": 0.001, "loss": 2.4036, "step": 4615 }, { "epoch": 0.52, "grad_norm": 0.28883954882621765, "learning_rate": 0.001, "loss": 2.5068, "step": 4620 }, { "epoch": 0.52, "grad_norm": 0.2551986575126648, "learning_rate": 0.001, "loss": 2.405, "step": 4625 }, { "epoch": 0.52, "grad_norm": 0.2544376850128174, "learning_rate": 0.001, "loss": 2.4211, "step": 4630 }, { "epoch": 0.52, "grad_norm": 0.2596908211708069, "learning_rate": 0.001, "loss": 2.3166, "step": 4635 }, { "epoch": 0.52, "grad_norm": 0.285861074924469, "learning_rate": 0.001, "loss": 2.3782, "step": 4640 }, { "epoch": 0.52, "grad_norm": 0.2784794270992279, "learning_rate": 0.001, "loss": 2.4322, "step": 4645 }, { "epoch": 0.52, "grad_norm": 0.2579992413520813, "learning_rate": 0.001, "loss": 2.4158, "step": 4650 }, { "epoch": 0.52, "grad_norm": 0.27722522616386414, "learning_rate": 0.001, "loss": 2.4386, "step": 4655 }, { "epoch": 0.52, "grad_norm": 0.2655174732208252, "learning_rate": 0.001, "loss": 2.4496, "step": 4660 }, { "epoch": 0.52, "grad_norm": 0.27302199602127075, "learning_rate": 0.001, "loss": 2.5056, "step": 4665 }, { "epoch": 0.52, "grad_norm": 0.26454782485961914, "learning_rate": 0.001, "loss": 2.3612, "step": 4670 }, { "epoch": 0.52, "grad_norm": 0.267187237739563, "learning_rate": 0.001, "loss": 2.4129, "step": 4675 }, { "epoch": 0.52, "grad_norm": 0.2863868772983551, "learning_rate": 0.001, "loss": 2.4469, "step": 4680 }, { "epoch": 0.52, "grad_norm": 0.2754268944263458, "learning_rate": 0.001, "loss": 2.3578, "step": 4685 }, { "epoch": 0.52, "grad_norm": 0.25754669308662415, "learning_rate": 0.001, "loss": 2.4504, "step": 4690 }, { "epoch": 0.52, "grad_norm": 0.26824602484703064, "learning_rate": 0.001, "loss": 2.4302, "step": 4695 }, { "epoch": 0.52, "grad_norm": 0.27403566241264343, "learning_rate": 0.001, "loss": 2.391, "step": 4700 }, { "epoch": 0.52, "grad_norm": 0.248744934797287, "learning_rate": 0.001, "loss": 2.425, "step": 4705 }, { "epoch": 0.53, "grad_norm": 0.2514548599720001, "learning_rate": 0.001, "loss": 2.4186, "step": 4710 }, { "epoch": 0.53, "grad_norm": 0.2512078881263733, "learning_rate": 0.001, "loss": 2.3782, "step": 4715 }, { "epoch": 0.53, "grad_norm": 0.3069394528865814, "learning_rate": 0.001, "loss": 2.4268, "step": 4720 }, { "epoch": 0.53, "grad_norm": 0.3118564784526825, "learning_rate": 0.001, "loss": 2.4843, "step": 4725 }, { "epoch": 0.53, "grad_norm": 0.33608558773994446, "learning_rate": 0.001, "loss": 2.3966, "step": 4730 }, { "epoch": 0.53, "grad_norm": 0.2619701027870178, "learning_rate": 0.001, "loss": 2.3888, "step": 4735 }, { "epoch": 0.53, "grad_norm": 0.25927653908729553, "learning_rate": 0.001, "loss": 2.4065, "step": 4740 }, { "epoch": 0.53, "grad_norm": 0.28429317474365234, "learning_rate": 0.001, "loss": 2.3549, "step": 4745 }, { "epoch": 0.53, "grad_norm": 0.274682879447937, "learning_rate": 0.001, "loss": 2.2985, "step": 4750 }, { "epoch": 0.53, "grad_norm": 0.2774803638458252, "learning_rate": 0.001, "loss": 2.4104, "step": 4755 }, { "epoch": 0.53, "grad_norm": 0.27073538303375244, "learning_rate": 0.001, "loss": 2.3434, "step": 4760 }, { "epoch": 0.53, "grad_norm": 0.27918335795402527, "learning_rate": 0.001, "loss": 2.3986, "step": 4765 }, { "epoch": 0.53, "grad_norm": 0.2999061942100525, "learning_rate": 0.001, "loss": 2.3535, "step": 4770 }, { "epoch": 0.53, "grad_norm": 0.26616016030311584, "learning_rate": 0.001, "loss": 2.3756, "step": 4775 }, { "epoch": 0.53, "grad_norm": 0.2599700093269348, "learning_rate": 0.001, "loss": 2.3859, "step": 4780 }, { "epoch": 0.53, "grad_norm": 0.2553490102291107, "learning_rate": 0.001, "loss": 2.3737, "step": 4785 }, { "epoch": 0.53, "grad_norm": 0.329328328371048, "learning_rate": 0.001, "loss": 2.4573, "step": 4790 }, { "epoch": 0.53, "grad_norm": 0.3233380913734436, "learning_rate": 0.001, "loss": 2.4289, "step": 4795 }, { "epoch": 0.54, "grad_norm": 0.3129531145095825, "learning_rate": 0.001, "loss": 2.4391, "step": 4800 }, { "epoch": 0.54, "grad_norm": 0.2748904228210449, "learning_rate": 0.001, "loss": 2.3729, "step": 4805 }, { "epoch": 0.54, "grad_norm": 0.2609279453754425, "learning_rate": 0.001, "loss": 2.461, "step": 4810 }, { "epoch": 0.54, "grad_norm": 0.27427300810813904, "learning_rate": 0.001, "loss": 2.4313, "step": 4815 }, { "epoch": 0.54, "grad_norm": 0.2718832790851593, "learning_rate": 0.001, "loss": 2.3691, "step": 4820 }, { "epoch": 0.54, "grad_norm": 0.2804459035396576, "learning_rate": 0.001, "loss": 2.3728, "step": 4825 }, { "epoch": 0.54, "grad_norm": 0.26752111315727234, "learning_rate": 0.001, "loss": 2.37, "step": 4830 }, { "epoch": 0.54, "grad_norm": 0.27284350991249084, "learning_rate": 0.001, "loss": 2.4466, "step": 4835 }, { "epoch": 0.54, "grad_norm": 0.24688100814819336, "learning_rate": 0.001, "loss": 2.4122, "step": 4840 }, { "epoch": 0.54, "grad_norm": 0.27109193801879883, "learning_rate": 0.001, "loss": 2.3855, "step": 4845 }, { "epoch": 0.54, "grad_norm": 0.2605935335159302, "learning_rate": 0.001, "loss": 2.3628, "step": 4850 }, { "epoch": 0.54, "grad_norm": 0.2562985122203827, "learning_rate": 0.001, "loss": 2.4307, "step": 4855 }, { "epoch": 0.54, "grad_norm": 0.2749897837638855, "learning_rate": 0.001, "loss": 2.437, "step": 4860 }, { "epoch": 0.54, "grad_norm": 0.23961135745048523, "learning_rate": 0.001, "loss": 2.2798, "step": 4865 }, { "epoch": 0.54, "grad_norm": 0.25328007340431213, "learning_rate": 0.001, "loss": 2.3712, "step": 4870 }, { "epoch": 0.54, "grad_norm": 0.23832069337368011, "learning_rate": 0.001, "loss": 2.3946, "step": 4875 }, { "epoch": 0.54, "grad_norm": 0.2675718069076538, "learning_rate": 0.001, "loss": 2.304, "step": 4880 }, { "epoch": 0.54, "grad_norm": 0.24774715304374695, "learning_rate": 0.001, "loss": 2.4343, "step": 4885 }, { "epoch": 0.55, "grad_norm": 0.24921192228794098, "learning_rate": 0.001, "loss": 2.3984, "step": 4890 }, { "epoch": 0.55, "grad_norm": 0.3039065897464752, "learning_rate": 0.001, "loss": 2.3614, "step": 4895 }, { "epoch": 0.55, "grad_norm": 0.27676519751548767, "learning_rate": 0.001, "loss": 2.3913, "step": 4900 }, { "epoch": 0.55, "grad_norm": 0.26279640197753906, "learning_rate": 0.001, "loss": 2.3754, "step": 4905 }, { "epoch": 0.55, "grad_norm": 0.27109283208847046, "learning_rate": 0.001, "loss": 2.3579, "step": 4910 }, { "epoch": 0.55, "grad_norm": 0.3158806562423706, "learning_rate": 0.001, "loss": 2.3984, "step": 4915 }, { "epoch": 0.55, "grad_norm": 0.26809123158454895, "learning_rate": 0.001, "loss": 2.3454, "step": 4920 }, { "epoch": 0.55, "grad_norm": 0.24843958020210266, "learning_rate": 0.001, "loss": 2.3414, "step": 4925 }, { "epoch": 0.55, "grad_norm": 0.23875649273395538, "learning_rate": 0.001, "loss": 2.2866, "step": 4930 }, { "epoch": 0.55, "grad_norm": 0.25938940048217773, "learning_rate": 0.001, "loss": 2.2937, "step": 4935 }, { "epoch": 0.55, "grad_norm": 0.27505698800086975, "learning_rate": 0.001, "loss": 2.3627, "step": 4940 }, { "epoch": 0.55, "grad_norm": 0.29795336723327637, "learning_rate": 0.001, "loss": 2.3351, "step": 4945 }, { "epoch": 0.55, "grad_norm": 0.2589178681373596, "learning_rate": 0.001, "loss": 2.3451, "step": 4950 }, { "epoch": 0.55, "grad_norm": 0.2763518691062927, "learning_rate": 0.001, "loss": 2.5155, "step": 4955 }, { "epoch": 0.55, "grad_norm": 0.2927325367927551, "learning_rate": 0.001, "loss": 2.356, "step": 4960 }, { "epoch": 0.55, "grad_norm": 0.26324424147605896, "learning_rate": 0.001, "loss": 2.3444, "step": 4965 }, { "epoch": 0.55, "grad_norm": 0.2642908990383148, "learning_rate": 0.001, "loss": 2.3295, "step": 4970 }, { "epoch": 0.55, "grad_norm": 0.2945278286933899, "learning_rate": 0.001, "loss": 2.3377, "step": 4975 }, { "epoch": 0.56, "grad_norm": 0.2505294382572174, "learning_rate": 0.001, "loss": 2.3119, "step": 4980 }, { "epoch": 0.56, "grad_norm": 0.2456444352865219, "learning_rate": 0.001, "loss": 2.3599, "step": 4985 }, { "epoch": 0.56, "grad_norm": 0.27345308661460876, "learning_rate": 0.001, "loss": 2.3649, "step": 4990 }, { "epoch": 0.56, "grad_norm": 0.26777923107147217, "learning_rate": 0.001, "loss": 2.3936, "step": 4995 }, { "epoch": 0.56, "grad_norm": 0.2308703511953354, "learning_rate": 0.001, "loss": 2.3303, "step": 5000 }, { "epoch": 0.56, "grad_norm": 0.2660985589027405, "learning_rate": 0.001, "loss": 2.36, "step": 5005 }, { "epoch": 0.56, "grad_norm": 0.2525184750556946, "learning_rate": 0.001, "loss": 2.3628, "step": 5010 }, { "epoch": 0.56, "grad_norm": 0.2550134062767029, "learning_rate": 0.001, "loss": 2.4206, "step": 5015 }, { "epoch": 0.56, "grad_norm": 0.24980038404464722, "learning_rate": 0.001, "loss": 2.3157, "step": 5020 }, { "epoch": 0.56, "grad_norm": 0.27052590250968933, "learning_rate": 0.001, "loss": 2.3774, "step": 5025 }, { "epoch": 0.56, "grad_norm": 0.26681166887283325, "learning_rate": 0.001, "loss": 2.4122, "step": 5030 }, { "epoch": 0.56, "grad_norm": 0.2528935670852661, "learning_rate": 0.001, "loss": 2.318, "step": 5035 }, { "epoch": 0.56, "grad_norm": 0.2519877552986145, "learning_rate": 0.001, "loss": 2.2664, "step": 5040 }, { "epoch": 0.56, "grad_norm": 0.2654362916946411, "learning_rate": 0.001, "loss": 2.2941, "step": 5045 }, { "epoch": 0.56, "grad_norm": 0.2554594576358795, "learning_rate": 0.001, "loss": 2.4286, "step": 5050 }, { "epoch": 0.56, "grad_norm": 0.25834372639656067, "learning_rate": 0.001, "loss": 2.3451, "step": 5055 }, { "epoch": 0.56, "grad_norm": 0.26128920912742615, "learning_rate": 0.001, "loss": 2.3545, "step": 5060 }, { "epoch": 0.56, "grad_norm": 0.2541617751121521, "learning_rate": 0.001, "loss": 2.3366, "step": 5065 }, { "epoch": 0.57, "grad_norm": 0.2685922384262085, "learning_rate": 0.001, "loss": 2.3235, "step": 5070 }, { "epoch": 0.57, "grad_norm": 0.2549839913845062, "learning_rate": 0.001, "loss": 2.4049, "step": 5075 }, { "epoch": 0.57, "grad_norm": 0.2501363158226013, "learning_rate": 0.001, "loss": 2.3329, "step": 5080 }, { "epoch": 0.57, "grad_norm": 0.2490965723991394, "learning_rate": 0.001, "loss": 2.4172, "step": 5085 }, { "epoch": 0.57, "grad_norm": 0.23749250173568726, "learning_rate": 0.001, "loss": 2.2282, "step": 5090 }, { "epoch": 0.57, "grad_norm": 0.2824033498764038, "learning_rate": 0.001, "loss": 2.4628, "step": 5095 }, { "epoch": 0.57, "grad_norm": 0.2921418249607086, "learning_rate": 0.001, "loss": 2.341, "step": 5100 }, { "epoch": 0.57, "grad_norm": 0.3341614305973053, "learning_rate": 0.001, "loss": 2.3233, "step": 5105 }, { "epoch": 0.57, "grad_norm": 0.2599157392978668, "learning_rate": 0.001, "loss": 2.3244, "step": 5110 }, { "epoch": 0.57, "grad_norm": 0.2559368312358856, "learning_rate": 0.001, "loss": 2.4011, "step": 5115 }, { "epoch": 0.57, "grad_norm": 0.2784283757209778, "learning_rate": 0.001, "loss": 2.353, "step": 5120 }, { "epoch": 0.57, "grad_norm": 0.25590980052948, "learning_rate": 0.001, "loss": 2.374, "step": 5125 }, { "epoch": 0.57, "grad_norm": 0.29736170172691345, "learning_rate": 0.001, "loss": 2.3436, "step": 5130 }, { "epoch": 0.57, "grad_norm": 0.2628958225250244, "learning_rate": 0.001, "loss": 2.3212, "step": 5135 }, { "epoch": 0.57, "grad_norm": 0.24243566393852234, "learning_rate": 0.001, "loss": 2.3772, "step": 5140 }, { "epoch": 0.57, "grad_norm": 0.25928130745887756, "learning_rate": 0.001, "loss": 2.4252, "step": 5145 }, { "epoch": 0.57, "grad_norm": 0.24278123676776886, "learning_rate": 0.001, "loss": 2.3692, "step": 5150 }, { "epoch": 0.57, "grad_norm": 0.2492816299200058, "learning_rate": 0.001, "loss": 2.3512, "step": 5155 }, { "epoch": 0.58, "grad_norm": 0.26422119140625, "learning_rate": 0.001, "loss": 2.3008, "step": 5160 }, { "epoch": 0.58, "grad_norm": 0.2409730702638626, "learning_rate": 0.001, "loss": 2.3404, "step": 5165 }, { "epoch": 0.58, "grad_norm": 0.26327621936798096, "learning_rate": 0.001, "loss": 2.433, "step": 5170 }, { "epoch": 0.58, "grad_norm": 0.3097192943096161, "learning_rate": 0.001, "loss": 2.3845, "step": 5175 }, { "epoch": 0.58, "grad_norm": 0.2360316962003708, "learning_rate": 0.001, "loss": 2.3833, "step": 5180 }, { "epoch": 0.58, "grad_norm": 0.24150846898555756, "learning_rate": 0.001, "loss": 2.3128, "step": 5185 }, { "epoch": 0.58, "grad_norm": 0.24739116430282593, "learning_rate": 0.001, "loss": 2.3132, "step": 5190 }, { "epoch": 0.58, "grad_norm": 0.2537684440612793, "learning_rate": 0.001, "loss": 2.4189, "step": 5195 }, { "epoch": 0.58, "grad_norm": 0.2589970529079437, "learning_rate": 0.001, "loss": 2.3695, "step": 5200 }, { "epoch": 0.58, "grad_norm": 0.23109261691570282, "learning_rate": 0.001, "loss": 2.3342, "step": 5205 }, { "epoch": 0.58, "grad_norm": 0.24093829095363617, "learning_rate": 0.001, "loss": 2.3929, "step": 5210 }, { "epoch": 0.58, "grad_norm": 0.25576117634773254, "learning_rate": 0.001, "loss": 2.3288, "step": 5215 }, { "epoch": 0.58, "grad_norm": 0.24346935749053955, "learning_rate": 0.001, "loss": 2.3516, "step": 5220 }, { "epoch": 0.58, "grad_norm": 0.259819895029068, "learning_rate": 0.001, "loss": 2.3438, "step": 5225 }, { "epoch": 0.58, "grad_norm": 0.2514759302139282, "learning_rate": 0.001, "loss": 2.3943, "step": 5230 }, { "epoch": 0.58, "grad_norm": 0.23569801449775696, "learning_rate": 0.001, "loss": 2.4217, "step": 5235 }, { "epoch": 0.58, "grad_norm": 0.2914365828037262, "learning_rate": 0.001, "loss": 2.4228, "step": 5240 }, { "epoch": 0.58, "grad_norm": 0.2780640721321106, "learning_rate": 0.001, "loss": 2.4379, "step": 5245 }, { "epoch": 0.59, "grad_norm": 0.2788977026939392, "learning_rate": 0.001, "loss": 2.3052, "step": 5250 }, { "epoch": 0.59, "grad_norm": 0.26812538504600525, "learning_rate": 0.001, "loss": 2.2674, "step": 5255 }, { "epoch": 0.59, "grad_norm": 0.25950074195861816, "learning_rate": 0.001, "loss": 2.4089, "step": 5260 }, { "epoch": 0.59, "grad_norm": 0.25828009843826294, "learning_rate": 0.001, "loss": 2.3674, "step": 5265 }, { "epoch": 0.59, "grad_norm": 0.24739393591880798, "learning_rate": 0.001, "loss": 2.3111, "step": 5270 }, { "epoch": 0.59, "grad_norm": 0.2411363422870636, "learning_rate": 0.001, "loss": 2.4458, "step": 5275 }, { "epoch": 0.59, "grad_norm": 0.23981954157352448, "learning_rate": 0.001, "loss": 2.3483, "step": 5280 }, { "epoch": 0.59, "grad_norm": 0.24768058955669403, "learning_rate": 0.001, "loss": 2.3429, "step": 5285 }, { "epoch": 0.59, "grad_norm": 0.2422722429037094, "learning_rate": 0.001, "loss": 2.3193, "step": 5290 }, { "epoch": 0.59, "grad_norm": 0.23856380581855774, "learning_rate": 0.001, "loss": 2.3493, "step": 5295 }, { "epoch": 0.59, "grad_norm": 0.2557425796985626, "learning_rate": 0.001, "loss": 2.2181, "step": 5300 }, { "epoch": 0.59, "grad_norm": 0.2540566623210907, "learning_rate": 0.001, "loss": 2.2677, "step": 5305 }, { "epoch": 0.59, "grad_norm": 0.26815852522850037, "learning_rate": 0.001, "loss": 2.2979, "step": 5310 }, { "epoch": 0.59, "grad_norm": 0.25368693470954895, "learning_rate": 0.001, "loss": 2.3508, "step": 5315 }, { "epoch": 0.59, "grad_norm": 0.2505514919757843, "learning_rate": 0.001, "loss": 2.3665, "step": 5320 }, { "epoch": 0.59, "grad_norm": 0.27467647194862366, "learning_rate": 0.001, "loss": 2.2853, "step": 5325 }, { "epoch": 0.59, "grad_norm": 0.25152960419654846, "learning_rate": 0.001, "loss": 2.3663, "step": 5330 }, { "epoch": 0.59, "grad_norm": 0.24481120705604553, "learning_rate": 0.001, "loss": 2.4325, "step": 5335 }, { "epoch": 0.6, "grad_norm": 0.2353387176990509, "learning_rate": 0.001, "loss": 2.4131, "step": 5340 }, { "epoch": 0.6, "grad_norm": 0.26953062415122986, "learning_rate": 0.001, "loss": 2.3549, "step": 5345 }, { "epoch": 0.6, "grad_norm": 0.22940675914287567, "learning_rate": 0.001, "loss": 2.3911, "step": 5350 }, { "epoch": 0.6, "grad_norm": 0.3119536340236664, "learning_rate": 0.001, "loss": 2.3921, "step": 5355 }, { "epoch": 0.6, "grad_norm": 0.24042974412441254, "learning_rate": 0.001, "loss": 2.327, "step": 5360 }, { "epoch": 0.6, "grad_norm": 0.2505492866039276, "learning_rate": 0.001, "loss": 2.4174, "step": 5365 }, { "epoch": 0.6, "grad_norm": 0.2914923131465912, "learning_rate": 0.001, "loss": 2.2534, "step": 5370 }, { "epoch": 0.6, "grad_norm": 0.2598934769630432, "learning_rate": 0.001, "loss": 2.4125, "step": 5375 }, { "epoch": 0.6, "grad_norm": 0.24274258315563202, "learning_rate": 0.001, "loss": 2.3404, "step": 5380 }, { "epoch": 0.6, "grad_norm": 0.24509523808956146, "learning_rate": 0.001, "loss": 2.4063, "step": 5385 }, { "epoch": 0.6, "grad_norm": 0.25125619769096375, "learning_rate": 0.001, "loss": 2.3473, "step": 5390 }, { "epoch": 0.6, "grad_norm": 0.2509467601776123, "learning_rate": 0.001, "loss": 2.3411, "step": 5395 }, { "epoch": 0.6, "grad_norm": 0.2541724145412445, "learning_rate": 0.001, "loss": 2.4116, "step": 5400 }, { "epoch": 0.6, "grad_norm": 0.23472855985164642, "learning_rate": 0.001, "loss": 2.3242, "step": 5405 }, { "epoch": 0.6, "grad_norm": 0.2244972288608551, "learning_rate": 0.001, "loss": 2.3185, "step": 5410 }, { "epoch": 0.6, "grad_norm": 0.26157915592193604, "learning_rate": 0.001, "loss": 2.3682, "step": 5415 }, { "epoch": 0.6, "grad_norm": 0.23372791707515717, "learning_rate": 0.001, "loss": 2.3788, "step": 5420 }, { "epoch": 0.6, "grad_norm": 0.25492754578590393, "learning_rate": 0.001, "loss": 2.3125, "step": 5425 }, { "epoch": 0.61, "grad_norm": 0.2350878119468689, "learning_rate": 0.001, "loss": 2.3559, "step": 5430 }, { "epoch": 0.61, "grad_norm": 0.2786577343940735, "learning_rate": 0.001, "loss": 2.3551, "step": 5435 }, { "epoch": 0.61, "grad_norm": 0.24445191025733948, "learning_rate": 0.001, "loss": 2.3195, "step": 5440 }, { "epoch": 0.61, "grad_norm": 0.2601463794708252, "learning_rate": 0.001, "loss": 2.3156, "step": 5445 }, { "epoch": 0.61, "grad_norm": 0.2302105575799942, "learning_rate": 0.001, "loss": 2.3936, "step": 5450 }, { "epoch": 0.61, "grad_norm": 0.22858335077762604, "learning_rate": 0.001, "loss": 2.3565, "step": 5455 }, { "epoch": 0.61, "grad_norm": 0.24476759135723114, "learning_rate": 0.001, "loss": 2.36, "step": 5460 }, { "epoch": 0.61, "grad_norm": 0.2723000943660736, "learning_rate": 0.001, "loss": 2.2823, "step": 5465 }, { "epoch": 0.61, "grad_norm": 0.24212589859962463, "learning_rate": 0.001, "loss": 2.3353, "step": 5470 }, { "epoch": 0.61, "grad_norm": 0.249808669090271, "learning_rate": 0.001, "loss": 2.361, "step": 5475 }, { "epoch": 0.61, "grad_norm": 0.31722500920295715, "learning_rate": 0.001, "loss": 2.4057, "step": 5480 }, { "epoch": 0.61, "grad_norm": 0.2519189417362213, "learning_rate": 0.001, "loss": 2.3239, "step": 5485 }, { "epoch": 0.61, "grad_norm": 0.24214482307434082, "learning_rate": 0.001, "loss": 2.3009, "step": 5490 }, { "epoch": 0.61, "grad_norm": 0.2553914785385132, "learning_rate": 0.001, "loss": 2.3704, "step": 5495 }, { "epoch": 0.61, "grad_norm": 0.2316461056470871, "learning_rate": 0.001, "loss": 2.3725, "step": 5500 }, { "epoch": 0.61, "grad_norm": 0.2363290637731552, "learning_rate": 0.001, "loss": 2.3258, "step": 5505 }, { "epoch": 0.61, "grad_norm": 0.2391221523284912, "learning_rate": 0.001, "loss": 2.3662, "step": 5510 }, { "epoch": 0.61, "grad_norm": 0.25969377160072327, "learning_rate": 0.001, "loss": 2.344, "step": 5515 }, { "epoch": 0.62, "grad_norm": 0.23705187439918518, "learning_rate": 0.001, "loss": 2.2742, "step": 5520 }, { "epoch": 0.62, "grad_norm": 0.23285822570323944, "learning_rate": 0.001, "loss": 2.2341, "step": 5525 }, { "epoch": 0.62, "grad_norm": 0.22560162842273712, "learning_rate": 0.001, "loss": 2.4168, "step": 5530 }, { "epoch": 0.62, "grad_norm": 0.24206668138504028, "learning_rate": 0.001, "loss": 2.2389, "step": 5535 }, { "epoch": 0.62, "grad_norm": 0.22780267894268036, "learning_rate": 0.001, "loss": 2.2827, "step": 5540 }, { "epoch": 0.62, "grad_norm": 0.26254940032958984, "learning_rate": 0.001, "loss": 2.2634, "step": 5545 }, { "epoch": 0.62, "grad_norm": 0.2319927215576172, "learning_rate": 0.001, "loss": 2.3293, "step": 5550 }, { "epoch": 0.62, "grad_norm": 0.2692784070968628, "learning_rate": 0.001, "loss": 2.3751, "step": 5555 }, { "epoch": 0.62, "grad_norm": 0.23394356667995453, "learning_rate": 0.001, "loss": 2.3735, "step": 5560 }, { "epoch": 0.62, "grad_norm": 0.23533542454242706, "learning_rate": 0.001, "loss": 2.2469, "step": 5565 }, { "epoch": 0.62, "grad_norm": 0.24933494627475739, "learning_rate": 0.001, "loss": 2.3927, "step": 5570 }, { "epoch": 0.62, "grad_norm": 0.24868957698345184, "learning_rate": 0.001, "loss": 2.3305, "step": 5575 }, { "epoch": 0.62, "grad_norm": 0.23358029127120972, "learning_rate": 0.001, "loss": 2.3149, "step": 5580 }, { "epoch": 0.62, "grad_norm": 0.2536250352859497, "learning_rate": 0.001, "loss": 2.26, "step": 5585 }, { "epoch": 0.62, "grad_norm": 0.2484007626771927, "learning_rate": 0.001, "loss": 2.4374, "step": 5590 }, { "epoch": 0.62, "grad_norm": 0.2672274708747864, "learning_rate": 0.001, "loss": 2.2977, "step": 5595 }, { "epoch": 0.62, "grad_norm": 0.24046289920806885, "learning_rate": 0.001, "loss": 2.3444, "step": 5600 }, { "epoch": 0.62, "grad_norm": 0.24538683891296387, "learning_rate": 0.001, "loss": 2.3603, "step": 5605 }, { "epoch": 0.63, "grad_norm": 0.237552210688591, "learning_rate": 0.001, "loss": 2.3722, "step": 5610 }, { "epoch": 0.63, "grad_norm": 0.22240518033504486, "learning_rate": 0.001, "loss": 2.2714, "step": 5615 }, { "epoch": 0.63, "grad_norm": 0.24909433722496033, "learning_rate": 0.001, "loss": 2.3317, "step": 5620 }, { "epoch": 0.63, "grad_norm": 0.23186059296131134, "learning_rate": 0.001, "loss": 2.3821, "step": 5625 }, { "epoch": 0.63, "grad_norm": 0.23126690089702606, "learning_rate": 0.001, "loss": 2.3512, "step": 5630 }, { "epoch": 0.63, "grad_norm": 0.2562063932418823, "learning_rate": 0.001, "loss": 2.3948, "step": 5635 }, { "epoch": 0.63, "grad_norm": 0.2485012412071228, "learning_rate": 0.001, "loss": 2.3371, "step": 5640 }, { "epoch": 0.63, "grad_norm": 0.24076102674007416, "learning_rate": 0.001, "loss": 2.395, "step": 5645 }, { "epoch": 0.63, "grad_norm": 0.2525179088115692, "learning_rate": 0.001, "loss": 2.4241, "step": 5650 }, { "epoch": 0.63, "grad_norm": 0.23360827565193176, "learning_rate": 0.001, "loss": 2.3358, "step": 5655 }, { "epoch": 0.63, "grad_norm": 0.24136130511760712, "learning_rate": 0.001, "loss": 2.3938, "step": 5660 }, { "epoch": 0.63, "grad_norm": 0.2559976577758789, "learning_rate": 0.001, "loss": 2.3931, "step": 5665 }, { "epoch": 0.63, "grad_norm": 0.24539463222026825, "learning_rate": 0.001, "loss": 2.2946, "step": 5670 }, { "epoch": 0.63, "grad_norm": 0.24134691059589386, "learning_rate": 0.001, "loss": 2.3512, "step": 5675 }, { "epoch": 0.63, "grad_norm": 0.22826063632965088, "learning_rate": 0.001, "loss": 2.3152, "step": 5680 }, { "epoch": 0.63, "grad_norm": 0.24264606833457947, "learning_rate": 0.001, "loss": 2.3635, "step": 5685 }, { "epoch": 0.63, "grad_norm": 0.25732120871543884, "learning_rate": 0.001, "loss": 2.3409, "step": 5690 }, { "epoch": 0.63, "grad_norm": 0.2741478681564331, "learning_rate": 0.001, "loss": 2.3171, "step": 5695 }, { "epoch": 0.64, "grad_norm": 0.22876569628715515, "learning_rate": 0.001, "loss": 2.2749, "step": 5700 }, { "epoch": 0.64, "grad_norm": 0.2547457218170166, "learning_rate": 0.001, "loss": 2.3821, "step": 5705 }, { "epoch": 0.64, "grad_norm": 0.2326708734035492, "learning_rate": 0.001, "loss": 2.3686, "step": 5710 }, { "epoch": 0.64, "grad_norm": 0.2405540645122528, "learning_rate": 0.001, "loss": 2.3483, "step": 5715 }, { "epoch": 0.64, "grad_norm": 0.2408895492553711, "learning_rate": 0.001, "loss": 2.2978, "step": 5720 }, { "epoch": 0.64, "grad_norm": 0.23036791384220123, "learning_rate": 0.001, "loss": 2.3084, "step": 5725 }, { "epoch": 0.64, "grad_norm": 0.22999659180641174, "learning_rate": 0.001, "loss": 2.3285, "step": 5730 }, { "epoch": 0.64, "grad_norm": 0.23441249132156372, "learning_rate": 0.001, "loss": 2.3568, "step": 5735 }, { "epoch": 0.64, "grad_norm": 0.2443058043718338, "learning_rate": 0.001, "loss": 2.3631, "step": 5740 }, { "epoch": 0.64, "grad_norm": 0.2505609393119812, "learning_rate": 0.001, "loss": 2.4046, "step": 5745 }, { "epoch": 0.64, "grad_norm": 0.27520403265953064, "learning_rate": 0.001, "loss": 2.3677, "step": 5750 }, { "epoch": 0.64, "grad_norm": 0.23651844263076782, "learning_rate": 0.001, "loss": 2.404, "step": 5755 }, { "epoch": 0.64, "grad_norm": 0.2234681099653244, "learning_rate": 0.001, "loss": 2.2937, "step": 5760 }, { "epoch": 0.64, "grad_norm": 0.2529265880584717, "learning_rate": 0.001, "loss": 2.2717, "step": 5765 }, { "epoch": 0.64, "grad_norm": 0.22382284700870514, "learning_rate": 0.001, "loss": 2.3432, "step": 5770 }, { "epoch": 0.64, "grad_norm": 0.23001866042613983, "learning_rate": 0.001, "loss": 2.3937, "step": 5775 }, { "epoch": 0.64, "grad_norm": 0.22797128558158875, "learning_rate": 0.001, "loss": 2.3402, "step": 5780 }, { "epoch": 0.64, "grad_norm": 0.23282378911972046, "learning_rate": 0.001, "loss": 2.3246, "step": 5785 }, { "epoch": 0.65, "grad_norm": 0.233421191573143, "learning_rate": 0.001, "loss": 2.3537, "step": 5790 }, { "epoch": 0.65, "grad_norm": 0.22940601408481598, "learning_rate": 0.001, "loss": 2.3019, "step": 5795 }, { "epoch": 0.65, "grad_norm": 0.2530461251735687, "learning_rate": 0.001, "loss": 2.3856, "step": 5800 }, { "epoch": 0.65, "grad_norm": 0.22957460582256317, "learning_rate": 0.001, "loss": 2.3155, "step": 5805 }, { "epoch": 0.65, "grad_norm": 0.23455286026000977, "learning_rate": 0.001, "loss": 2.3144, "step": 5810 }, { "epoch": 0.65, "grad_norm": 0.23876363039016724, "learning_rate": 0.001, "loss": 2.324, "step": 5815 }, { "epoch": 0.65, "grad_norm": 0.22128742933273315, "learning_rate": 0.001, "loss": 2.2817, "step": 5820 }, { "epoch": 0.65, "grad_norm": 0.24940499663352966, "learning_rate": 0.001, "loss": 2.3068, "step": 5825 }, { "epoch": 0.65, "grad_norm": 0.2494128942489624, "learning_rate": 0.001, "loss": 2.2576, "step": 5830 }, { "epoch": 0.65, "grad_norm": 0.22584503889083862, "learning_rate": 0.001, "loss": 2.3599, "step": 5835 }, { "epoch": 0.65, "grad_norm": 0.2631079852581024, "learning_rate": 0.001, "loss": 2.272, "step": 5840 }, { "epoch": 0.65, "grad_norm": 0.2861349284648895, "learning_rate": 0.001, "loss": 2.4105, "step": 5845 }, { "epoch": 0.65, "grad_norm": 0.25299036502838135, "learning_rate": 0.001, "loss": 2.4458, "step": 5850 }, { "epoch": 0.65, "grad_norm": 0.2369646430015564, "learning_rate": 0.001, "loss": 2.3539, "step": 5855 }, { "epoch": 0.65, "grad_norm": 0.2523862421512604, "learning_rate": 0.001, "loss": 2.3134, "step": 5860 }, { "epoch": 0.65, "grad_norm": 0.21714548766613007, "learning_rate": 0.001, "loss": 2.3339, "step": 5865 }, { "epoch": 0.65, "grad_norm": 0.26811763644218445, "learning_rate": 0.001, "loss": 2.3447, "step": 5870 }, { "epoch": 0.66, "grad_norm": 0.23169931769371033, "learning_rate": 0.001, "loss": 2.3459, "step": 5875 }, { "epoch": 0.66, "grad_norm": 0.25803160667419434, "learning_rate": 0.001, "loss": 2.2959, "step": 5880 }, { "epoch": 0.66, "grad_norm": 0.23524072766304016, "learning_rate": 0.001, "loss": 2.3536, "step": 5885 }, { "epoch": 0.66, "grad_norm": 0.23739273846149445, "learning_rate": 0.001, "loss": 2.2686, "step": 5890 }, { "epoch": 0.66, "grad_norm": 0.22635112702846527, "learning_rate": 0.001, "loss": 2.3628, "step": 5895 }, { "epoch": 0.66, "grad_norm": 0.2273004949092865, "learning_rate": 0.001, "loss": 2.2628, "step": 5900 }, { "epoch": 0.66, "grad_norm": 0.2222699224948883, "learning_rate": 0.001, "loss": 2.3277, "step": 5905 }, { "epoch": 0.66, "grad_norm": 0.2190581113100052, "learning_rate": 0.001, "loss": 2.3848, "step": 5910 }, { "epoch": 0.66, "grad_norm": 0.2414412945508957, "learning_rate": 0.001, "loss": 2.3162, "step": 5915 }, { "epoch": 0.66, "grad_norm": 0.23341503739356995, "learning_rate": 0.001, "loss": 2.4144, "step": 5920 }, { "epoch": 0.66, "grad_norm": 0.2254437506198883, "learning_rate": 0.001, "loss": 2.4501, "step": 5925 }, { "epoch": 0.66, "grad_norm": 0.24044445157051086, "learning_rate": 0.001, "loss": 2.3355, "step": 5930 }, { "epoch": 0.66, "grad_norm": 0.225622296333313, "learning_rate": 0.001, "loss": 2.4403, "step": 5935 }, { "epoch": 0.66, "grad_norm": 0.22940073907375336, "learning_rate": 0.001, "loss": 2.3084, "step": 5940 }, { "epoch": 0.66, "grad_norm": 0.23389093577861786, "learning_rate": 0.001, "loss": 2.3437, "step": 5945 }, { "epoch": 0.66, "grad_norm": 0.2822270393371582, "learning_rate": 0.001, "loss": 2.3956, "step": 5950 }, { "epoch": 0.66, "grad_norm": 0.22801753878593445, "learning_rate": 0.001, "loss": 2.344, "step": 5955 }, { "epoch": 0.66, "grad_norm": 0.2555668354034424, "learning_rate": 0.001, "loss": 2.2703, "step": 5960 }, { "epoch": 0.67, "grad_norm": 0.24699978530406952, "learning_rate": 0.001, "loss": 2.3371, "step": 5965 }, { "epoch": 0.67, "grad_norm": 0.23384138941764832, "learning_rate": 0.001, "loss": 2.3097, "step": 5970 }, { "epoch": 0.67, "grad_norm": 0.255615234375, "learning_rate": 0.001, "loss": 2.3041, "step": 5975 }, { "epoch": 0.67, "grad_norm": 0.22265081107616425, "learning_rate": 0.001, "loss": 2.3446, "step": 5980 }, { "epoch": 0.67, "grad_norm": 0.238100066781044, "learning_rate": 0.001, "loss": 2.3431, "step": 5985 }, { "epoch": 0.67, "grad_norm": 0.22238829731941223, "learning_rate": 0.001, "loss": 2.4082, "step": 5990 }, { "epoch": 0.67, "grad_norm": 0.216730996966362, "learning_rate": 0.001, "loss": 2.3037, "step": 5995 }, { "epoch": 0.67, "grad_norm": 0.22031597793102264, "learning_rate": 0.001, "loss": 2.3185, "step": 6000 }, { "epoch": 0.67, "grad_norm": 0.21698999404907227, "learning_rate": 0.001, "loss": 2.3008, "step": 6005 }, { "epoch": 0.67, "grad_norm": 0.2661702632904053, "learning_rate": 0.001, "loss": 2.2595, "step": 6010 }, { "epoch": 0.67, "grad_norm": 0.2344740331172943, "learning_rate": 0.001, "loss": 2.3146, "step": 6015 }, { "epoch": 0.67, "grad_norm": 0.2401505559682846, "learning_rate": 0.001, "loss": 2.3003, "step": 6020 }, { "epoch": 0.67, "grad_norm": 0.2311105728149414, "learning_rate": 0.001, "loss": 2.3577, "step": 6025 }, { "epoch": 0.67, "grad_norm": 0.2426912486553192, "learning_rate": 0.001, "loss": 2.2576, "step": 6030 }, { "epoch": 0.67, "grad_norm": 0.23958609998226166, "learning_rate": 0.001, "loss": 2.2899, "step": 6035 }, { "epoch": 0.67, "grad_norm": 0.23275497555732727, "learning_rate": 0.001, "loss": 2.3389, "step": 6040 }, { "epoch": 0.67, "grad_norm": 0.2336193025112152, "learning_rate": 0.001, "loss": 2.2513, "step": 6045 }, { "epoch": 0.67, "grad_norm": 0.23353838920593262, "learning_rate": 0.001, "loss": 2.3194, "step": 6050 }, { "epoch": 0.68, "grad_norm": 0.2304724007844925, "learning_rate": 0.001, "loss": 2.3508, "step": 6055 }, { "epoch": 0.68, "grad_norm": 0.2317655384540558, "learning_rate": 0.001, "loss": 2.2612, "step": 6060 }, { "epoch": 0.68, "grad_norm": 0.25610777735710144, "learning_rate": 0.001, "loss": 2.4149, "step": 6065 }, { "epoch": 0.68, "grad_norm": 0.2321697473526001, "learning_rate": 0.001, "loss": 2.332, "step": 6070 }, { "epoch": 0.68, "grad_norm": 0.23013491928577423, "learning_rate": 0.001, "loss": 2.2204, "step": 6075 }, { "epoch": 0.68, "grad_norm": 0.2328411042690277, "learning_rate": 0.001, "loss": 2.3794, "step": 6080 }, { "epoch": 0.68, "grad_norm": 0.2270471155643463, "learning_rate": 0.001, "loss": 2.2641, "step": 6085 }, { "epoch": 0.68, "grad_norm": 0.2320142686367035, "learning_rate": 0.001, "loss": 2.3091, "step": 6090 }, { "epoch": 0.68, "grad_norm": 0.24872182309627533, "learning_rate": 0.001, "loss": 2.2783, "step": 6095 }, { "epoch": 0.68, "grad_norm": 0.249810591340065, "learning_rate": 0.001, "loss": 2.3178, "step": 6100 }, { "epoch": 0.68, "grad_norm": 0.2650211453437805, "learning_rate": 0.001, "loss": 2.2551, "step": 6105 }, { "epoch": 0.68, "grad_norm": 0.23337675631046295, "learning_rate": 0.001, "loss": 2.381, "step": 6110 }, { "epoch": 0.68, "grad_norm": 0.23248009383678436, "learning_rate": 0.001, "loss": 2.3057, "step": 6115 }, { "epoch": 0.68, "grad_norm": 0.2519432604312897, "learning_rate": 0.001, "loss": 2.2995, "step": 6120 }, { "epoch": 0.68, "grad_norm": 0.23362381756305695, "learning_rate": 0.001, "loss": 2.3268, "step": 6125 }, { "epoch": 0.68, "grad_norm": 0.23035821318626404, "learning_rate": 0.001, "loss": 2.2877, "step": 6130 }, { "epoch": 0.68, "grad_norm": 0.2515948712825775, "learning_rate": 0.001, "loss": 2.2686, "step": 6135 }, { "epoch": 0.68, "grad_norm": 0.22109408676624298, "learning_rate": 0.001, "loss": 2.3087, "step": 6140 }, { "epoch": 0.69, "grad_norm": 0.2815423905849457, "learning_rate": 0.001, "loss": 2.2718, "step": 6145 }, { "epoch": 0.69, "grad_norm": 0.27658113837242126, "learning_rate": 0.001, "loss": 2.2606, "step": 6150 }, { "epoch": 0.69, "grad_norm": 0.2339669018983841, "learning_rate": 0.001, "loss": 2.2081, "step": 6155 }, { "epoch": 0.69, "grad_norm": 0.22394266724586487, "learning_rate": 0.001, "loss": 2.2434, "step": 6160 }, { "epoch": 0.69, "grad_norm": 0.274379700422287, "learning_rate": 0.001, "loss": 2.3479, "step": 6165 }, { "epoch": 0.69, "grad_norm": 0.23967401683330536, "learning_rate": 0.001, "loss": 2.269, "step": 6170 }, { "epoch": 0.69, "grad_norm": 0.22441859543323517, "learning_rate": 0.001, "loss": 2.4106, "step": 6175 }, { "epoch": 0.69, "grad_norm": 0.22732356190681458, "learning_rate": 0.001, "loss": 2.2489, "step": 6180 }, { "epoch": 0.69, "grad_norm": 0.2360657900571823, "learning_rate": 0.001, "loss": 2.3203, "step": 6185 }, { "epoch": 0.69, "grad_norm": 0.23805424571037292, "learning_rate": 0.001, "loss": 2.365, "step": 6190 }, { "epoch": 0.69, "grad_norm": 0.23591791093349457, "learning_rate": 0.001, "loss": 2.3065, "step": 6195 }, { "epoch": 0.69, "grad_norm": 0.2685248553752899, "learning_rate": 0.001, "loss": 2.2762, "step": 6200 }, { "epoch": 0.69, "grad_norm": 0.21762576699256897, "learning_rate": 0.001, "loss": 2.2622, "step": 6205 }, { "epoch": 0.69, "grad_norm": 0.22365742921829224, "learning_rate": 0.001, "loss": 2.3262, "step": 6210 }, { "epoch": 0.69, "grad_norm": 0.2266640067100525, "learning_rate": 0.001, "loss": 2.1798, "step": 6215 }, { "epoch": 0.69, "grad_norm": 0.2408314198255539, "learning_rate": 0.001, "loss": 2.4467, "step": 6220 }, { "epoch": 0.69, "grad_norm": 0.2335503101348877, "learning_rate": 0.001, "loss": 2.2623, "step": 6225 }, { "epoch": 0.69, "grad_norm": 0.23915047943592072, "learning_rate": 0.001, "loss": 2.3316, "step": 6230 }, { "epoch": 0.7, "grad_norm": 0.23060446977615356, "learning_rate": 0.001, "loss": 2.2405, "step": 6235 }, { "epoch": 0.7, "grad_norm": 0.23039406538009644, "learning_rate": 0.001, "loss": 2.3572, "step": 6240 }, { "epoch": 0.7, "grad_norm": 0.2337615042924881, "learning_rate": 0.001, "loss": 2.2782, "step": 6245 }, { "epoch": 0.7, "grad_norm": 0.22629192471504211, "learning_rate": 0.001, "loss": 2.344, "step": 6250 }, { "epoch": 0.7, "grad_norm": 0.24617376923561096, "learning_rate": 0.001, "loss": 2.3693, "step": 6255 }, { "epoch": 0.7, "grad_norm": 0.22347915172576904, "learning_rate": 0.001, "loss": 2.2883, "step": 6260 }, { "epoch": 0.7, "grad_norm": 0.22970449924468994, "learning_rate": 0.001, "loss": 2.3835, "step": 6265 }, { "epoch": 0.7, "grad_norm": 0.23593968152999878, "learning_rate": 0.001, "loss": 2.3482, "step": 6270 }, { "epoch": 0.7, "grad_norm": 0.22513240575790405, "learning_rate": 0.001, "loss": 2.2519, "step": 6275 }, { "epoch": 0.7, "grad_norm": 0.21264764666557312, "learning_rate": 0.001, "loss": 2.3257, "step": 6280 }, { "epoch": 0.7, "grad_norm": 0.2323000133037567, "learning_rate": 0.001, "loss": 2.2972, "step": 6285 }, { "epoch": 0.7, "grad_norm": 0.22790469229221344, "learning_rate": 0.001, "loss": 2.2954, "step": 6290 }, { "epoch": 0.7, "grad_norm": 0.21939660608768463, "learning_rate": 0.001, "loss": 2.2985, "step": 6295 }, { "epoch": 0.7, "grad_norm": 0.2272881269454956, "learning_rate": 0.001, "loss": 2.2307, "step": 6300 }, { "epoch": 0.7, "grad_norm": 0.2288762778043747, "learning_rate": 0.001, "loss": 2.3405, "step": 6305 }, { "epoch": 0.7, "grad_norm": 0.233880415558815, "learning_rate": 0.001, "loss": 2.2826, "step": 6310 }, { "epoch": 0.7, "grad_norm": 0.22657474875450134, "learning_rate": 0.001, "loss": 2.3034, "step": 6315 }, { "epoch": 0.7, "grad_norm": 0.2148173451423645, "learning_rate": 0.001, "loss": 2.307, "step": 6320 }, { "epoch": 0.71, "grad_norm": 0.2334657907485962, "learning_rate": 0.001, "loss": 2.3141, "step": 6325 }, { "epoch": 0.71, "grad_norm": 0.2156243771314621, "learning_rate": 0.001, "loss": 2.2665, "step": 6330 }, { "epoch": 0.71, "grad_norm": 0.2339843213558197, "learning_rate": 0.001, "loss": 2.2798, "step": 6335 }, { "epoch": 0.71, "grad_norm": 0.22493353486061096, "learning_rate": 0.001, "loss": 2.372, "step": 6340 }, { "epoch": 0.71, "grad_norm": 0.23672647774219513, "learning_rate": 0.001, "loss": 2.2306, "step": 6345 }, { "epoch": 0.71, "grad_norm": 0.22123226523399353, "learning_rate": 0.001, "loss": 2.2672, "step": 6350 }, { "epoch": 0.71, "grad_norm": 0.22304557263851166, "learning_rate": 0.001, "loss": 2.2178, "step": 6355 }, { "epoch": 0.71, "grad_norm": 0.2457527369260788, "learning_rate": 0.001, "loss": 2.2743, "step": 6360 }, { "epoch": 0.71, "grad_norm": 0.21300479769706726, "learning_rate": 0.001, "loss": 2.2388, "step": 6365 }, { "epoch": 0.71, "grad_norm": 0.2059631049633026, "learning_rate": 0.001, "loss": 2.2377, "step": 6370 }, { "epoch": 0.71, "grad_norm": 0.24631153047084808, "learning_rate": 0.001, "loss": 2.2768, "step": 6375 }, { "epoch": 0.71, "grad_norm": 0.21497204899787903, "learning_rate": 0.001, "loss": 2.2844, "step": 6380 }, { "epoch": 0.71, "grad_norm": 0.22930744290351868, "learning_rate": 0.001, "loss": 2.312, "step": 6385 }, { "epoch": 0.71, "grad_norm": 0.23210054636001587, "learning_rate": 0.001, "loss": 2.2118, "step": 6390 }, { "epoch": 0.71, "grad_norm": 0.21625350415706635, "learning_rate": 0.001, "loss": 2.3245, "step": 6395 }, { "epoch": 0.71, "grad_norm": 0.23386377096176147, "learning_rate": 0.001, "loss": 2.3134, "step": 6400 }, { "epoch": 0.71, "grad_norm": 0.21839016675949097, "learning_rate": 0.001, "loss": 2.3539, "step": 6405 }, { "epoch": 0.71, "grad_norm": 0.2133498340845108, "learning_rate": 0.001, "loss": 2.2994, "step": 6410 }, { "epoch": 0.72, "grad_norm": 0.21773594617843628, "learning_rate": 0.001, "loss": 2.3248, "step": 6415 }, { "epoch": 0.72, "grad_norm": 0.23620709776878357, "learning_rate": 0.001, "loss": 2.2653, "step": 6420 }, { "epoch": 0.72, "grad_norm": 0.22743947803974152, "learning_rate": 0.001, "loss": 2.2736, "step": 6425 }, { "epoch": 0.72, "grad_norm": 0.2404935657978058, "learning_rate": 0.001, "loss": 2.3361, "step": 6430 }, { "epoch": 0.72, "grad_norm": 0.24400784075260162, "learning_rate": 0.001, "loss": 2.3089, "step": 6435 }, { "epoch": 0.72, "grad_norm": 0.23440074920654297, "learning_rate": 0.001, "loss": 2.2961, "step": 6440 }, { "epoch": 0.72, "grad_norm": 0.22357019782066345, "learning_rate": 0.001, "loss": 2.3432, "step": 6445 }, { "epoch": 0.72, "grad_norm": 0.2347543090581894, "learning_rate": 0.001, "loss": 2.304, "step": 6450 }, { "epoch": 0.72, "grad_norm": 0.22234396636486053, "learning_rate": 0.001, "loss": 2.2524, "step": 6455 }, { "epoch": 0.72, "grad_norm": 0.25612977147102356, "learning_rate": 0.001, "loss": 2.304, "step": 6460 }, { "epoch": 0.72, "grad_norm": 0.23432089388370514, "learning_rate": 0.001, "loss": 2.3512, "step": 6465 }, { "epoch": 0.72, "grad_norm": 0.23479154706001282, "learning_rate": 0.001, "loss": 2.243, "step": 6470 }, { "epoch": 0.72, "grad_norm": 0.2278670221567154, "learning_rate": 0.001, "loss": 2.3402, "step": 6475 }, { "epoch": 0.72, "grad_norm": 0.2140948623418808, "learning_rate": 0.001, "loss": 2.2892, "step": 6480 }, { "epoch": 0.72, "grad_norm": 0.23153026401996613, "learning_rate": 0.001, "loss": 2.2849, "step": 6485 }, { "epoch": 0.72, "grad_norm": 0.22485770285129547, "learning_rate": 0.001, "loss": 2.313, "step": 6490 }, { "epoch": 0.72, "grad_norm": 0.21722319722175598, "learning_rate": 0.001, "loss": 2.3453, "step": 6495 }, { "epoch": 0.72, "grad_norm": 0.21437281370162964, "learning_rate": 0.001, "loss": 2.2887, "step": 6500 }, { "epoch": 0.73, "grad_norm": 0.214768648147583, "learning_rate": 0.001, "loss": 2.2759, "step": 6505 }, { "epoch": 0.73, "grad_norm": 0.31841498613357544, "learning_rate": 0.001, "loss": 2.2481, "step": 6510 }, { "epoch": 0.73, "grad_norm": 0.22883611917495728, "learning_rate": 0.001, "loss": 2.2809, "step": 6515 }, { "epoch": 0.73, "grad_norm": 0.23380115628242493, "learning_rate": 0.001, "loss": 2.3128, "step": 6520 }, { "epoch": 0.73, "grad_norm": 0.21790826320648193, "learning_rate": 0.001, "loss": 2.3074, "step": 6525 }, { "epoch": 0.73, "grad_norm": 0.22439299523830414, "learning_rate": 0.001, "loss": 2.3287, "step": 6530 }, { "epoch": 0.73, "grad_norm": 0.21973837912082672, "learning_rate": 0.001, "loss": 2.3249, "step": 6535 }, { "epoch": 0.73, "grad_norm": 0.25569167733192444, "learning_rate": 0.001, "loss": 2.2878, "step": 6540 }, { "epoch": 0.73, "grad_norm": 0.21167606115341187, "learning_rate": 0.001, "loss": 2.2877, "step": 6545 }, { "epoch": 0.73, "grad_norm": 0.2405991554260254, "learning_rate": 0.001, "loss": 2.3446, "step": 6550 }, { "epoch": 0.73, "grad_norm": 0.2218019813299179, "learning_rate": 0.001, "loss": 2.315, "step": 6555 }, { "epoch": 0.73, "grad_norm": 0.2425190806388855, "learning_rate": 0.001, "loss": 2.2657, "step": 6560 }, { "epoch": 0.73, "grad_norm": 0.22708062827587128, "learning_rate": 0.001, "loss": 2.1857, "step": 6565 }, { "epoch": 0.73, "grad_norm": 0.2227758914232254, "learning_rate": 0.001, "loss": 2.2147, "step": 6570 }, { "epoch": 0.73, "grad_norm": 0.22275353968143463, "learning_rate": 0.001, "loss": 2.2327, "step": 6575 }, { "epoch": 0.73, "grad_norm": 0.2267688810825348, "learning_rate": 0.001, "loss": 2.3549, "step": 6580 }, { "epoch": 0.73, "grad_norm": 0.2187042087316513, "learning_rate": 0.001, "loss": 2.302, "step": 6585 }, { "epoch": 0.73, "grad_norm": 0.23258638381958008, "learning_rate": 0.001, "loss": 2.2425, "step": 6590 }, { "epoch": 0.74, "grad_norm": 0.25112131237983704, "learning_rate": 0.001, "loss": 2.3606, "step": 6595 }, { "epoch": 0.74, "grad_norm": 0.2182377427816391, "learning_rate": 0.001, "loss": 2.2458, "step": 6600 }, { "epoch": 0.74, "grad_norm": 0.24733804166316986, "learning_rate": 0.001, "loss": 2.3269, "step": 6605 }, { "epoch": 0.74, "grad_norm": 0.20609302818775177, "learning_rate": 0.001, "loss": 2.2328, "step": 6610 }, { "epoch": 0.74, "grad_norm": 0.20930394530296326, "learning_rate": 0.001, "loss": 2.3236, "step": 6615 }, { "epoch": 0.74, "grad_norm": 0.22153881192207336, "learning_rate": 0.001, "loss": 2.2531, "step": 6620 }, { "epoch": 0.74, "grad_norm": 0.2152048796415329, "learning_rate": 0.001, "loss": 2.407, "step": 6625 }, { "epoch": 0.74, "grad_norm": 0.255520224571228, "learning_rate": 0.001, "loss": 2.2682, "step": 6630 }, { "epoch": 0.74, "grad_norm": 0.23848092555999756, "learning_rate": 0.001, "loss": 2.3644, "step": 6635 }, { "epoch": 0.74, "grad_norm": 0.20903976261615753, "learning_rate": 0.001, "loss": 2.2142, "step": 6640 }, { "epoch": 0.74, "grad_norm": 0.24808953702449799, "learning_rate": 0.001, "loss": 2.4207, "step": 6645 }, { "epoch": 0.74, "grad_norm": 0.2541098892688751, "learning_rate": 0.001, "loss": 2.3043, "step": 6650 }, { "epoch": 0.74, "grad_norm": 0.2177581936120987, "learning_rate": 0.001, "loss": 2.3392, "step": 6655 }, { "epoch": 0.74, "grad_norm": 0.21430838108062744, "learning_rate": 0.001, "loss": 2.2786, "step": 6660 }, { "epoch": 0.74, "grad_norm": 0.22286362946033478, "learning_rate": 0.001, "loss": 2.4054, "step": 6665 }, { "epoch": 0.74, "grad_norm": 0.2035175859928131, "learning_rate": 0.001, "loss": 2.3528, "step": 6670 }, { "epoch": 0.74, "grad_norm": 0.24235355854034424, "learning_rate": 0.001, "loss": 2.2776, "step": 6675 }, { "epoch": 0.74, "grad_norm": 0.21710264682769775, "learning_rate": 0.001, "loss": 2.1995, "step": 6680 }, { "epoch": 0.75, "grad_norm": 0.20540495216846466, "learning_rate": 0.001, "loss": 2.262, "step": 6685 }, { "epoch": 0.75, "grad_norm": 0.22396127879619598, "learning_rate": 0.001, "loss": 2.2918, "step": 6690 }, { "epoch": 0.75, "grad_norm": 0.2071615308523178, "learning_rate": 0.001, "loss": 2.2222, "step": 6695 }, { "epoch": 0.75, "grad_norm": 0.22722716629505157, "learning_rate": 0.001, "loss": 2.268, "step": 6700 }, { "epoch": 0.75, "grad_norm": 0.22743773460388184, "learning_rate": 0.001, "loss": 2.2728, "step": 6705 }, { "epoch": 0.75, "grad_norm": 0.23747438192367554, "learning_rate": 0.001, "loss": 2.2671, "step": 6710 }, { "epoch": 0.75, "grad_norm": 0.21889004111289978, "learning_rate": 0.001, "loss": 2.2856, "step": 6715 }, { "epoch": 0.75, "grad_norm": 0.19960670173168182, "learning_rate": 0.001, "loss": 2.308, "step": 6720 }, { "epoch": 0.75, "grad_norm": 0.2208574414253235, "learning_rate": 0.001, "loss": 2.2424, "step": 6725 }, { "epoch": 0.75, "grad_norm": 0.2305884212255478, "learning_rate": 0.001, "loss": 2.2329, "step": 6730 }, { "epoch": 0.75, "grad_norm": 0.2227226197719574, "learning_rate": 0.001, "loss": 2.3162, "step": 6735 }, { "epoch": 0.75, "grad_norm": 0.21388469636440277, "learning_rate": 0.001, "loss": 2.2611, "step": 6740 }, { "epoch": 0.75, "grad_norm": 0.21388593316078186, "learning_rate": 0.001, "loss": 2.2672, "step": 6745 }, { "epoch": 0.75, "grad_norm": 0.22708441317081451, "learning_rate": 0.001, "loss": 2.2176, "step": 6750 }, { "epoch": 0.75, "grad_norm": 0.21498042345046997, "learning_rate": 0.001, "loss": 2.2735, "step": 6755 }, { "epoch": 0.75, "grad_norm": 0.26390013098716736, "learning_rate": 0.001, "loss": 2.3951, "step": 6760 }, { "epoch": 0.75, "grad_norm": 0.22930343449115753, "learning_rate": 0.001, "loss": 2.2718, "step": 6765 }, { "epoch": 0.75, "grad_norm": 0.22452136874198914, "learning_rate": 0.001, "loss": 2.2614, "step": 6770 }, { "epoch": 0.76, "grad_norm": 0.2295997589826584, "learning_rate": 0.001, "loss": 2.2575, "step": 6775 }, { "epoch": 0.76, "grad_norm": 0.2150869071483612, "learning_rate": 0.001, "loss": 2.2893, "step": 6780 }, { "epoch": 0.76, "grad_norm": 0.22733640670776367, "learning_rate": 0.001, "loss": 2.2378, "step": 6785 }, { "epoch": 0.76, "grad_norm": 0.2284800410270691, "learning_rate": 0.001, "loss": 2.2126, "step": 6790 }, { "epoch": 0.76, "grad_norm": 0.22430431842803955, "learning_rate": 0.001, "loss": 2.2428, "step": 6795 }, { "epoch": 0.76, "grad_norm": 0.2072402387857437, "learning_rate": 0.001, "loss": 2.278, "step": 6800 }, { "epoch": 0.76, "grad_norm": 0.23210042715072632, "learning_rate": 0.001, "loss": 2.2302, "step": 6805 }, { "epoch": 0.76, "grad_norm": 0.21655841171741486, "learning_rate": 0.001, "loss": 2.2909, "step": 6810 }, { "epoch": 0.76, "grad_norm": 0.21356946229934692, "learning_rate": 0.001, "loss": 2.2277, "step": 6815 }, { "epoch": 0.76, "grad_norm": 0.21072253584861755, "learning_rate": 0.001, "loss": 2.2721, "step": 6820 }, { "epoch": 0.76, "grad_norm": 0.21372920274734497, "learning_rate": 0.001, "loss": 2.3177, "step": 6825 }, { "epoch": 0.76, "grad_norm": 0.23465748131275177, "learning_rate": 0.001, "loss": 2.205, "step": 6830 }, { "epoch": 0.76, "grad_norm": 0.20318353176116943, "learning_rate": 0.001, "loss": 2.2573, "step": 6835 }, { "epoch": 0.76, "grad_norm": 0.22107534110546112, "learning_rate": 0.001, "loss": 2.2434, "step": 6840 }, { "epoch": 0.76, "grad_norm": 0.2000688910484314, "learning_rate": 0.001, "loss": 2.3421, "step": 6845 }, { "epoch": 0.76, "grad_norm": 0.24129514396190643, "learning_rate": 0.001, "loss": 2.2897, "step": 6850 }, { "epoch": 0.76, "grad_norm": 0.21887139976024628, "learning_rate": 0.001, "loss": 2.3154, "step": 6855 }, { "epoch": 0.76, "grad_norm": 0.2198597937822342, "learning_rate": 0.001, "loss": 2.3218, "step": 6860 }, { "epoch": 0.77, "grad_norm": 0.219230517745018, "learning_rate": 0.001, "loss": 2.3299, "step": 6865 }, { "epoch": 0.77, "grad_norm": 0.2002941071987152, "learning_rate": 0.001, "loss": 2.2028, "step": 6870 }, { "epoch": 0.77, "grad_norm": 0.22916488349437714, "learning_rate": 0.001, "loss": 2.2602, "step": 6875 }, { "epoch": 0.77, "grad_norm": 0.20966394245624542, "learning_rate": 0.001, "loss": 2.3592, "step": 6880 }, { "epoch": 0.77, "grad_norm": 0.22105638682842255, "learning_rate": 0.001, "loss": 2.1895, "step": 6885 }, { "epoch": 0.77, "grad_norm": 0.23469582200050354, "learning_rate": 0.001, "loss": 2.2608, "step": 6890 }, { "epoch": 0.77, "grad_norm": 0.21237854659557343, "learning_rate": 0.001, "loss": 2.3013, "step": 6895 }, { "epoch": 0.77, "grad_norm": 0.22927138209342957, "learning_rate": 0.001, "loss": 2.3076, "step": 6900 }, { "epoch": 0.77, "grad_norm": 0.21441708505153656, "learning_rate": 0.001, "loss": 2.2768, "step": 6905 }, { "epoch": 0.77, "grad_norm": 0.23027388751506805, "learning_rate": 0.001, "loss": 2.3414, "step": 6910 }, { "epoch": 0.77, "grad_norm": 0.20745432376861572, "learning_rate": 0.001, "loss": 2.3512, "step": 6915 }, { "epoch": 0.77, "grad_norm": 0.21277524530887604, "learning_rate": 0.001, "loss": 2.2667, "step": 6920 }, { "epoch": 0.77, "grad_norm": 0.20173713564872742, "learning_rate": 0.001, "loss": 2.2563, "step": 6925 }, { "epoch": 0.77, "grad_norm": 0.20181357860565186, "learning_rate": 0.001, "loss": 2.3219, "step": 6930 }, { "epoch": 0.77, "grad_norm": 0.22798535227775574, "learning_rate": 0.001, "loss": 2.2603, "step": 6935 }, { "epoch": 0.77, "grad_norm": 0.2111693024635315, "learning_rate": 0.001, "loss": 2.2972, "step": 6940 }, { "epoch": 0.77, "grad_norm": 0.225101500749588, "learning_rate": 0.001, "loss": 2.2476, "step": 6945 }, { "epoch": 0.77, "grad_norm": 0.2258484959602356, "learning_rate": 0.001, "loss": 2.3323, "step": 6950 }, { "epoch": 0.78, "grad_norm": 0.2221709042787552, "learning_rate": 0.001, "loss": 2.3256, "step": 6955 }, { "epoch": 0.78, "grad_norm": 0.21151743829250336, "learning_rate": 0.001, "loss": 2.2582, "step": 6960 }, { "epoch": 0.78, "grad_norm": 0.22967129945755005, "learning_rate": 0.001, "loss": 2.2557, "step": 6965 }, { "epoch": 0.78, "grad_norm": 0.2297360748052597, "learning_rate": 0.001, "loss": 2.3392, "step": 6970 }, { "epoch": 0.78, "grad_norm": 0.23820990324020386, "learning_rate": 0.001, "loss": 2.2955, "step": 6975 }, { "epoch": 0.78, "grad_norm": 0.2055855095386505, "learning_rate": 0.001, "loss": 2.2823, "step": 6980 }, { "epoch": 0.78, "grad_norm": 0.2106105536222458, "learning_rate": 0.001, "loss": 2.3535, "step": 6985 }, { "epoch": 0.78, "grad_norm": 0.2504548132419586, "learning_rate": 0.001, "loss": 2.3398, "step": 6990 }, { "epoch": 0.78, "grad_norm": 0.20728346705436707, "learning_rate": 0.001, "loss": 2.2791, "step": 6995 }, { "epoch": 0.78, "grad_norm": 0.2041543871164322, "learning_rate": 0.001, "loss": 2.2221, "step": 7000 }, { "epoch": 0.78, "grad_norm": 0.23212693631649017, "learning_rate": 0.001, "loss": 2.2897, "step": 7005 }, { "epoch": 0.78, "grad_norm": 0.20440858602523804, "learning_rate": 0.001, "loss": 2.2188, "step": 7010 }, { "epoch": 0.78, "grad_norm": 0.21270564198493958, "learning_rate": 0.001, "loss": 2.2856, "step": 7015 }, { "epoch": 0.78, "grad_norm": 0.23255744576454163, "learning_rate": 0.001, "loss": 2.2796, "step": 7020 }, { "epoch": 0.78, "grad_norm": 0.22522412240505219, "learning_rate": 0.001, "loss": 2.2653, "step": 7025 }, { "epoch": 0.78, "grad_norm": 0.21376082301139832, "learning_rate": 0.001, "loss": 2.2828, "step": 7030 }, { "epoch": 0.78, "grad_norm": 0.2234581708908081, "learning_rate": 0.001, "loss": 2.2241, "step": 7035 }, { "epoch": 0.78, "grad_norm": 0.22096699476242065, "learning_rate": 0.001, "loss": 2.2399, "step": 7040 }, { "epoch": 0.79, "grad_norm": 0.23003298044204712, "learning_rate": 0.001, "loss": 2.3065, "step": 7045 }, { "epoch": 0.79, "grad_norm": 0.22232045233249664, "learning_rate": 0.001, "loss": 2.1878, "step": 7050 }, { "epoch": 0.79, "grad_norm": 0.21559427678585052, "learning_rate": 0.001, "loss": 2.1721, "step": 7055 }, { "epoch": 0.79, "grad_norm": 0.2325166016817093, "learning_rate": 0.001, "loss": 2.3431, "step": 7060 }, { "epoch": 0.79, "grad_norm": 0.2172939032316208, "learning_rate": 0.001, "loss": 2.2579, "step": 7065 }, { "epoch": 0.79, "grad_norm": 0.21909840404987335, "learning_rate": 0.001, "loss": 2.2747, "step": 7070 }, { "epoch": 0.79, "grad_norm": 0.21307264268398285, "learning_rate": 0.001, "loss": 2.2371, "step": 7075 }, { "epoch": 0.79, "grad_norm": 0.1988152116537094, "learning_rate": 0.001, "loss": 2.2615, "step": 7080 }, { "epoch": 0.79, "grad_norm": 0.20012176036834717, "learning_rate": 0.001, "loss": 2.2191, "step": 7085 }, { "epoch": 0.79, "grad_norm": 0.21884466707706451, "learning_rate": 0.001, "loss": 2.2541, "step": 7090 }, { "epoch": 0.79, "grad_norm": 0.2405450940132141, "learning_rate": 0.001, "loss": 2.1851, "step": 7095 }, { "epoch": 0.79, "grad_norm": 0.20957379043102264, "learning_rate": 0.001, "loss": 2.29, "step": 7100 }, { "epoch": 0.79, "grad_norm": 0.2217535525560379, "learning_rate": 0.001, "loss": 2.1965, "step": 7105 }, { "epoch": 0.79, "grad_norm": 0.19948221743106842, "learning_rate": 0.001, "loss": 2.1932, "step": 7110 }, { "epoch": 0.79, "grad_norm": 0.2090318202972412, "learning_rate": 0.001, "loss": 2.2536, "step": 7115 }, { "epoch": 0.79, "grad_norm": 0.22281096875667572, "learning_rate": 0.001, "loss": 2.2951, "step": 7120 }, { "epoch": 0.79, "grad_norm": 0.20826506614685059, "learning_rate": 0.001, "loss": 2.3136, "step": 7125 }, { "epoch": 0.79, "grad_norm": 0.2151436060667038, "learning_rate": 0.001, "loss": 2.4252, "step": 7130 }, { "epoch": 0.8, "grad_norm": 0.213445782661438, "learning_rate": 0.001, "loss": 2.2703, "step": 7135 }, { "epoch": 0.8, "grad_norm": 0.20548775792121887, "learning_rate": 0.001, "loss": 2.2282, "step": 7140 }, { "epoch": 0.8, "grad_norm": 0.23199157416820526, "learning_rate": 0.001, "loss": 2.3681, "step": 7145 }, { "epoch": 0.8, "grad_norm": 0.21607224643230438, "learning_rate": 0.001, "loss": 2.3197, "step": 7150 }, { "epoch": 0.8, "grad_norm": 0.20880939066410065, "learning_rate": 0.001, "loss": 2.2826, "step": 7155 }, { "epoch": 0.8, "grad_norm": 0.21946680545806885, "learning_rate": 0.001, "loss": 2.2451, "step": 7160 }, { "epoch": 0.8, "grad_norm": 0.23108704388141632, "learning_rate": 0.001, "loss": 2.244, "step": 7165 }, { "epoch": 0.8, "grad_norm": 0.19656045734882355, "learning_rate": 0.001, "loss": 2.2871, "step": 7170 }, { "epoch": 0.8, "grad_norm": 0.22790350019931793, "learning_rate": 0.001, "loss": 2.3187, "step": 7175 }, { "epoch": 0.8, "grad_norm": 0.23438240587711334, "learning_rate": 0.001, "loss": 2.2798, "step": 7180 }, { "epoch": 0.8, "grad_norm": 0.2033395767211914, "learning_rate": 0.001, "loss": 2.1845, "step": 7185 }, { "epoch": 0.8, "grad_norm": 0.20368583500385284, "learning_rate": 0.001, "loss": 2.3395, "step": 7190 }, { "epoch": 0.8, "grad_norm": 0.20549830794334412, "learning_rate": 0.001, "loss": 2.3311, "step": 7195 }, { "epoch": 0.8, "grad_norm": 0.22930066287517548, "learning_rate": 0.001, "loss": 2.2961, "step": 7200 }, { "epoch": 0.8, "grad_norm": 0.2134595811367035, "learning_rate": 0.001, "loss": 2.3287, "step": 7205 }, { "epoch": 0.8, "grad_norm": 0.23215077817440033, "learning_rate": 0.001, "loss": 2.297, "step": 7210 }, { "epoch": 0.8, "grad_norm": 0.19765566289424896, "learning_rate": 0.001, "loss": 2.2268, "step": 7215 }, { "epoch": 0.8, "grad_norm": 0.24692144989967346, "learning_rate": 0.001, "loss": 2.3147, "step": 7220 }, { "epoch": 0.81, "grad_norm": 0.30222055315971375, "learning_rate": 0.001, "loss": 2.2585, "step": 7225 }, { "epoch": 0.81, "grad_norm": 0.19766630232334137, "learning_rate": 0.001, "loss": 2.327, "step": 7230 }, { "epoch": 0.81, "grad_norm": 0.20692983269691467, "learning_rate": 0.001, "loss": 2.3028, "step": 7235 }, { "epoch": 0.81, "grad_norm": 0.21192064881324768, "learning_rate": 0.001, "loss": 2.3742, "step": 7240 }, { "epoch": 0.81, "grad_norm": 0.21428699791431427, "learning_rate": 0.001, "loss": 2.1737, "step": 7245 }, { "epoch": 0.81, "grad_norm": 0.20385991036891937, "learning_rate": 0.001, "loss": 2.2721, "step": 7250 }, { "epoch": 0.81, "grad_norm": 0.20821918547153473, "learning_rate": 0.001, "loss": 2.2602, "step": 7255 }, { "epoch": 0.81, "grad_norm": 0.20553617179393768, "learning_rate": 0.001, "loss": 2.2719, "step": 7260 }, { "epoch": 0.81, "grad_norm": 0.21833737194538116, "learning_rate": 0.001, "loss": 2.2495, "step": 7265 }, { "epoch": 0.81, "grad_norm": 0.2193860560655594, "learning_rate": 0.001, "loss": 2.2582, "step": 7270 }, { "epoch": 0.81, "grad_norm": 0.21469643712043762, "learning_rate": 0.001, "loss": 2.2987, "step": 7275 }, { "epoch": 0.81, "grad_norm": 0.2157880663871765, "learning_rate": 0.001, "loss": 2.245, "step": 7280 }, { "epoch": 0.81, "grad_norm": 0.20697729289531708, "learning_rate": 0.001, "loss": 2.3218, "step": 7285 }, { "epoch": 0.81, "grad_norm": 0.23085835576057434, "learning_rate": 0.001, "loss": 2.2267, "step": 7290 }, { "epoch": 0.81, "grad_norm": 0.21796423196792603, "learning_rate": 0.001, "loss": 2.2688, "step": 7295 }, { "epoch": 0.81, "grad_norm": 0.24522145092487335, "learning_rate": 0.001, "loss": 2.3475, "step": 7300 }, { "epoch": 0.81, "grad_norm": 0.203556627035141, "learning_rate": 0.001, "loss": 2.2861, "step": 7305 }, { "epoch": 0.82, "grad_norm": 0.2439495325088501, "learning_rate": 0.001, "loss": 2.2658, "step": 7310 }, { "epoch": 0.82, "grad_norm": 0.22975510358810425, "learning_rate": 0.001, "loss": 2.1613, "step": 7315 }, { "epoch": 0.82, "grad_norm": 0.19879737496376038, "learning_rate": 0.001, "loss": 2.2234, "step": 7320 }, { "epoch": 0.82, "grad_norm": 0.2011747658252716, "learning_rate": 0.001, "loss": 2.2828, "step": 7325 }, { "epoch": 0.82, "grad_norm": 0.19531621038913727, "learning_rate": 0.001, "loss": 2.1786, "step": 7330 }, { "epoch": 0.82, "grad_norm": 0.2119741439819336, "learning_rate": 0.001, "loss": 2.2916, "step": 7335 }, { "epoch": 0.82, "grad_norm": 0.2018207460641861, "learning_rate": 0.001, "loss": 2.219, "step": 7340 }, { "epoch": 0.82, "grad_norm": 0.19718153774738312, "learning_rate": 0.001, "loss": 2.1588, "step": 7345 }, { "epoch": 0.82, "grad_norm": 0.19742237031459808, "learning_rate": 0.001, "loss": 2.2564, "step": 7350 }, { "epoch": 0.82, "grad_norm": 0.20226089656352997, "learning_rate": 0.001, "loss": 2.289, "step": 7355 }, { "epoch": 0.82, "grad_norm": 0.19770477712154388, "learning_rate": 0.001, "loss": 2.2675, "step": 7360 }, { "epoch": 0.82, "grad_norm": 0.20721545815467834, "learning_rate": 0.001, "loss": 2.1463, "step": 7365 }, { "epoch": 0.82, "grad_norm": 0.18459871411323547, "learning_rate": 0.001, "loss": 2.1891, "step": 7370 }, { "epoch": 0.82, "grad_norm": 0.19960848987102509, "learning_rate": 0.001, "loss": 2.2649, "step": 7375 }, { "epoch": 0.82, "grad_norm": 0.2147081345319748, "learning_rate": 0.001, "loss": 2.2833, "step": 7380 }, { "epoch": 0.82, "grad_norm": 0.19740645587444305, "learning_rate": 0.001, "loss": 2.2549, "step": 7385 }, { "epoch": 0.82, "grad_norm": 0.19742794334888458, "learning_rate": 0.001, "loss": 2.2949, "step": 7390 }, { "epoch": 0.82, "grad_norm": 0.19098269939422607, "learning_rate": 0.001, "loss": 2.2872, "step": 7395 }, { "epoch": 0.83, "grad_norm": 0.20648017525672913, "learning_rate": 0.001, "loss": 2.2777, "step": 7400 }, { "epoch": 0.83, "grad_norm": 0.19683200120925903, "learning_rate": 0.001, "loss": 2.2212, "step": 7405 }, { "epoch": 0.83, "grad_norm": 0.19511087238788605, "learning_rate": 0.001, "loss": 2.2753, "step": 7410 }, { "epoch": 0.83, "grad_norm": 0.20487697422504425, "learning_rate": 0.001, "loss": 2.2839, "step": 7415 }, { "epoch": 0.83, "grad_norm": 0.20138192176818848, "learning_rate": 0.001, "loss": 2.2486, "step": 7420 }, { "epoch": 0.83, "grad_norm": 0.20356020331382751, "learning_rate": 0.001, "loss": 2.1948, "step": 7425 }, { "epoch": 0.83, "grad_norm": 0.21195770800113678, "learning_rate": 0.001, "loss": 2.2343, "step": 7430 }, { "epoch": 0.83, "grad_norm": 0.2272544503211975, "learning_rate": 0.001, "loss": 2.3143, "step": 7435 }, { "epoch": 0.83, "grad_norm": 0.20329448580741882, "learning_rate": 0.001, "loss": 2.2508, "step": 7440 }, { "epoch": 0.83, "grad_norm": 0.19755227863788605, "learning_rate": 0.001, "loss": 2.3016, "step": 7445 }, { "epoch": 0.83, "grad_norm": 0.20580564439296722, "learning_rate": 0.001, "loss": 2.2072, "step": 7450 }, { "epoch": 0.83, "grad_norm": 0.19712607562541962, "learning_rate": 0.001, "loss": 2.262, "step": 7455 }, { "epoch": 0.83, "grad_norm": 0.20621292293071747, "learning_rate": 0.001, "loss": 2.2982, "step": 7460 }, { "epoch": 0.83, "grad_norm": 0.2071629762649536, "learning_rate": 0.001, "loss": 2.2891, "step": 7465 }, { "epoch": 0.83, "grad_norm": 0.2150653451681137, "learning_rate": 0.001, "loss": 2.1865, "step": 7470 }, { "epoch": 0.83, "grad_norm": 0.23920634388923645, "learning_rate": 0.001, "loss": 2.308, "step": 7475 }, { "epoch": 0.83, "grad_norm": 0.21500001847743988, "learning_rate": 0.001, "loss": 2.2884, "step": 7480 }, { "epoch": 0.83, "grad_norm": 0.21805007755756378, "learning_rate": 0.001, "loss": 2.1848, "step": 7485 }, { "epoch": 0.84, "grad_norm": 0.1949496865272522, "learning_rate": 0.001, "loss": 2.1732, "step": 7490 }, { "epoch": 0.84, "grad_norm": 0.21110284328460693, "learning_rate": 0.001, "loss": 2.2932, "step": 7495 }, { "epoch": 0.84, "grad_norm": 0.2048315703868866, "learning_rate": 0.001, "loss": 2.216, "step": 7500 }, { "epoch": 0.84, "grad_norm": 0.2010776698589325, "learning_rate": 0.001, "loss": 2.2891, "step": 7505 }, { "epoch": 0.84, "grad_norm": 0.27239951491355896, "learning_rate": 0.001, "loss": 2.2777, "step": 7510 }, { "epoch": 0.84, "grad_norm": 0.2029271274805069, "learning_rate": 0.001, "loss": 2.2677, "step": 7515 }, { "epoch": 0.84, "grad_norm": 0.21085187792778015, "learning_rate": 0.001, "loss": 2.2963, "step": 7520 }, { "epoch": 0.84, "grad_norm": 0.2122115045785904, "learning_rate": 0.001, "loss": 2.2493, "step": 7525 }, { "epoch": 0.84, "grad_norm": 0.20089121162891388, "learning_rate": 0.001, "loss": 2.2474, "step": 7530 }, { "epoch": 0.84, "grad_norm": 0.20639105141162872, "learning_rate": 0.001, "loss": 2.1943, "step": 7535 }, { "epoch": 0.84, "grad_norm": 0.21445663273334503, "learning_rate": 0.001, "loss": 2.1288, "step": 7540 }, { "epoch": 0.84, "grad_norm": 0.20437091588974, "learning_rate": 0.001, "loss": 2.1688, "step": 7545 }, { "epoch": 0.84, "grad_norm": 0.23248904943466187, "learning_rate": 0.001, "loss": 2.2279, "step": 7550 }, { "epoch": 0.84, "grad_norm": 0.34397122263908386, "learning_rate": 0.001, "loss": 2.326, "step": 7555 }, { "epoch": 0.84, "grad_norm": 0.2095007747411728, "learning_rate": 0.001, "loss": 2.2435, "step": 7560 }, { "epoch": 0.84, "grad_norm": 0.2087799459695816, "learning_rate": 0.001, "loss": 2.245, "step": 7565 }, { "epoch": 0.84, "grad_norm": 0.18876142799854279, "learning_rate": 0.001, "loss": 2.1585, "step": 7570 }, { "epoch": 0.84, "grad_norm": 0.19884787499904633, "learning_rate": 0.001, "loss": 2.2337, "step": 7575 }, { "epoch": 0.85, "grad_norm": 0.19892942905426025, "learning_rate": 0.001, "loss": 2.269, "step": 7580 }, { "epoch": 0.85, "grad_norm": 0.2039511501789093, "learning_rate": 0.001, "loss": 2.1756, "step": 7585 }, { "epoch": 0.85, "grad_norm": 0.2111709862947464, "learning_rate": 0.001, "loss": 2.2736, "step": 7590 }, { "epoch": 0.85, "grad_norm": 0.19605736434459686, "learning_rate": 0.001, "loss": 2.2364, "step": 7595 }, { "epoch": 0.85, "grad_norm": 0.21033748984336853, "learning_rate": 0.001, "loss": 2.2014, "step": 7600 }, { "epoch": 0.85, "grad_norm": 0.2066594660282135, "learning_rate": 0.001, "loss": 2.1916, "step": 7605 }, { "epoch": 0.85, "grad_norm": 0.21947403252124786, "learning_rate": 0.001, "loss": 2.2487, "step": 7610 }, { "epoch": 0.85, "grad_norm": 0.20312318205833435, "learning_rate": 0.001, "loss": 2.2154, "step": 7615 }, { "epoch": 0.85, "grad_norm": 0.19939804077148438, "learning_rate": 0.001, "loss": 2.2305, "step": 7620 }, { "epoch": 0.85, "grad_norm": 0.19940166175365448, "learning_rate": 0.001, "loss": 2.2305, "step": 7625 }, { "epoch": 0.85, "grad_norm": 0.21953539550304413, "learning_rate": 0.001, "loss": 2.1782, "step": 7630 }, { "epoch": 0.85, "grad_norm": 0.2001238316297531, "learning_rate": 0.001, "loss": 2.286, "step": 7635 }, { "epoch": 0.85, "grad_norm": 0.20314432680606842, "learning_rate": 0.001, "loss": 2.2022, "step": 7640 }, { "epoch": 0.85, "grad_norm": 0.2054559886455536, "learning_rate": 0.001, "loss": 2.2359, "step": 7645 }, { "epoch": 0.85, "grad_norm": 0.21550264954566956, "learning_rate": 0.001, "loss": 2.2974, "step": 7650 }, { "epoch": 0.85, "grad_norm": 0.22751198709011078, "learning_rate": 0.001, "loss": 2.1827, "step": 7655 }, { "epoch": 0.85, "grad_norm": 0.20444227755069733, "learning_rate": 0.001, "loss": 2.2039, "step": 7660 }, { "epoch": 0.85, "grad_norm": 0.19839997589588165, "learning_rate": 0.001, "loss": 2.2031, "step": 7665 }, { "epoch": 0.86, "grad_norm": 0.2035912275314331, "learning_rate": 0.001, "loss": 2.2535, "step": 7670 }, { "epoch": 0.86, "grad_norm": 0.2072770595550537, "learning_rate": 0.001, "loss": 2.2198, "step": 7675 }, { "epoch": 0.86, "grad_norm": 0.1923142671585083, "learning_rate": 0.001, "loss": 2.247, "step": 7680 }, { "epoch": 0.86, "grad_norm": 0.2125130444765091, "learning_rate": 0.001, "loss": 2.3123, "step": 7685 }, { "epoch": 0.86, "grad_norm": 0.2017953097820282, "learning_rate": 0.001, "loss": 2.279, "step": 7690 }, { "epoch": 0.86, "grad_norm": 0.21377520263195038, "learning_rate": 0.001, "loss": 2.2798, "step": 7695 }, { "epoch": 0.86, "grad_norm": 0.23891973495483398, "learning_rate": 0.001, "loss": 2.2924, "step": 7700 }, { "epoch": 0.86, "grad_norm": 0.22691906988620758, "learning_rate": 0.001, "loss": 2.2234, "step": 7705 }, { "epoch": 0.86, "grad_norm": 0.2150959074497223, "learning_rate": 0.001, "loss": 2.2182, "step": 7710 }, { "epoch": 0.86, "grad_norm": 0.19418391585350037, "learning_rate": 0.001, "loss": 2.1508, "step": 7715 }, { "epoch": 0.86, "grad_norm": 0.21434800326824188, "learning_rate": 0.001, "loss": 2.2382, "step": 7720 }, { "epoch": 0.86, "grad_norm": 0.21070006489753723, "learning_rate": 0.001, "loss": 2.2943, "step": 7725 }, { "epoch": 0.86, "grad_norm": 0.194297656416893, "learning_rate": 0.001, "loss": 2.2794, "step": 7730 }, { "epoch": 0.86, "grad_norm": 0.220197856426239, "learning_rate": 0.001, "loss": 2.242, "step": 7735 }, { "epoch": 0.86, "grad_norm": 0.2022203654050827, "learning_rate": 0.001, "loss": 2.2955, "step": 7740 }, { "epoch": 0.86, "grad_norm": 0.2011207491159439, "learning_rate": 0.001, "loss": 2.1795, "step": 7745 }, { "epoch": 0.86, "grad_norm": 0.18811127543449402, "learning_rate": 0.001, "loss": 2.255, "step": 7750 }, { "epoch": 0.86, "grad_norm": 0.1934177577495575, "learning_rate": 0.001, "loss": 2.2592, "step": 7755 }, { "epoch": 0.87, "grad_norm": 0.2053891122341156, "learning_rate": 0.001, "loss": 2.1615, "step": 7760 }, { "epoch": 0.87, "grad_norm": 0.19190414249897003, "learning_rate": 0.001, "loss": 2.2911, "step": 7765 }, { "epoch": 0.87, "grad_norm": 0.19915464520454407, "learning_rate": 0.001, "loss": 2.2466, "step": 7770 }, { "epoch": 0.87, "grad_norm": 0.20265983045101166, "learning_rate": 0.001, "loss": 2.2542, "step": 7775 }, { "epoch": 0.87, "grad_norm": 0.184738427400589, "learning_rate": 0.001, "loss": 2.2625, "step": 7780 }, { "epoch": 0.87, "grad_norm": 0.2045259177684784, "learning_rate": 0.001, "loss": 2.2833, "step": 7785 }, { "epoch": 0.87, "grad_norm": 0.21759961545467377, "learning_rate": 0.001, "loss": 2.2725, "step": 7790 }, { "epoch": 0.87, "grad_norm": 0.1975535899400711, "learning_rate": 0.001, "loss": 2.1523, "step": 7795 }, { "epoch": 0.87, "grad_norm": 0.20412267744541168, "learning_rate": 0.001, "loss": 2.1857, "step": 7800 }, { "epoch": 0.87, "grad_norm": 0.19706964492797852, "learning_rate": 0.001, "loss": 2.2819, "step": 7805 }, { "epoch": 0.87, "grad_norm": 0.1994798183441162, "learning_rate": 0.001, "loss": 2.1996, "step": 7810 }, { "epoch": 0.87, "grad_norm": 0.19474805891513824, "learning_rate": 0.001, "loss": 2.2869, "step": 7815 }, { "epoch": 0.87, "grad_norm": 0.20528604090213776, "learning_rate": 0.001, "loss": 2.2877, "step": 7820 }, { "epoch": 0.87, "grad_norm": 0.23693878948688507, "learning_rate": 0.001, "loss": 2.3314, "step": 7825 }, { "epoch": 0.87, "grad_norm": 0.2081892490386963, "learning_rate": 0.001, "loss": 2.2119, "step": 7830 }, { "epoch": 0.87, "grad_norm": 0.19748859107494354, "learning_rate": 0.001, "loss": 2.2803, "step": 7835 }, { "epoch": 0.87, "grad_norm": 0.22490812838077545, "learning_rate": 0.001, "loss": 2.2673, "step": 7840 }, { "epoch": 0.87, "grad_norm": 0.1880314201116562, "learning_rate": 0.001, "loss": 2.2421, "step": 7845 }, { "epoch": 0.88, "grad_norm": 0.20123490691184998, "learning_rate": 0.001, "loss": 2.2754, "step": 7850 }, { "epoch": 0.88, "grad_norm": 0.20010483264923096, "learning_rate": 0.001, "loss": 2.2223, "step": 7855 }, { "epoch": 0.88, "grad_norm": 0.20877912640571594, "learning_rate": 0.001, "loss": 2.286, "step": 7860 }, { "epoch": 0.88, "grad_norm": 0.1998739242553711, "learning_rate": 0.001, "loss": 2.2096, "step": 7865 }, { "epoch": 0.88, "grad_norm": 0.20182031393051147, "learning_rate": 0.001, "loss": 2.2369, "step": 7870 }, { "epoch": 0.88, "grad_norm": 0.21211646497249603, "learning_rate": 0.001, "loss": 2.2279, "step": 7875 }, { "epoch": 0.88, "grad_norm": 0.19902239739894867, "learning_rate": 0.001, "loss": 2.1724, "step": 7880 }, { "epoch": 0.88, "grad_norm": 0.2212173193693161, "learning_rate": 0.001, "loss": 2.1901, "step": 7885 }, { "epoch": 0.88, "grad_norm": 0.18968746066093445, "learning_rate": 0.001, "loss": 2.2669, "step": 7890 }, { "epoch": 0.88, "grad_norm": 0.19653159379959106, "learning_rate": 0.001, "loss": 2.157, "step": 7895 }, { "epoch": 0.88, "grad_norm": 0.19996388256549835, "learning_rate": 0.001, "loss": 2.276, "step": 7900 }, { "epoch": 0.88, "grad_norm": 0.20356236398220062, "learning_rate": 0.001, "loss": 2.1891, "step": 7905 }, { "epoch": 0.88, "grad_norm": 0.19595099985599518, "learning_rate": 0.001, "loss": 2.2522, "step": 7910 }, { "epoch": 0.88, "grad_norm": 0.19440218806266785, "learning_rate": 0.001, "loss": 2.2183, "step": 7915 }, { "epoch": 0.88, "grad_norm": 0.19151397049427032, "learning_rate": 0.001, "loss": 2.2508, "step": 7920 }, { "epoch": 0.88, "grad_norm": 0.20077724754810333, "learning_rate": 0.001, "loss": 2.2832, "step": 7925 }, { "epoch": 0.88, "grad_norm": 0.23355898261070251, "learning_rate": 0.001, "loss": 2.2868, "step": 7930 }, { "epoch": 0.88, "grad_norm": 0.2240302562713623, "learning_rate": 0.001, "loss": 2.2996, "step": 7935 }, { "epoch": 0.89, "grad_norm": 0.20705515146255493, "learning_rate": 0.001, "loss": 2.2632, "step": 7940 }, { "epoch": 0.89, "grad_norm": 0.19102813303470612, "learning_rate": 0.001, "loss": 2.2122, "step": 7945 }, { "epoch": 0.89, "grad_norm": 0.2303714156150818, "learning_rate": 0.001, "loss": 2.2268, "step": 7950 }, { "epoch": 0.89, "grad_norm": 0.19775277376174927, "learning_rate": 0.001, "loss": 2.2802, "step": 7955 }, { "epoch": 0.89, "grad_norm": 0.21944767236709595, "learning_rate": 0.001, "loss": 2.2109, "step": 7960 }, { "epoch": 0.89, "grad_norm": 0.20680417120456696, "learning_rate": 0.001, "loss": 2.2182, "step": 7965 }, { "epoch": 0.89, "grad_norm": 0.1878603845834732, "learning_rate": 0.001, "loss": 2.2759, "step": 7970 }, { "epoch": 0.89, "grad_norm": 0.19500446319580078, "learning_rate": 0.001, "loss": 2.25, "step": 7975 }, { "epoch": 0.89, "grad_norm": 0.1924324333667755, "learning_rate": 0.001, "loss": 2.2426, "step": 7980 }, { "epoch": 0.89, "grad_norm": 0.21792522072792053, "learning_rate": 0.001, "loss": 2.283, "step": 7985 }, { "epoch": 0.89, "grad_norm": 0.1874474287033081, "learning_rate": 0.001, "loss": 2.2719, "step": 7990 }, { "epoch": 0.89, "grad_norm": 0.1898234784603119, "learning_rate": 0.001, "loss": 2.2499, "step": 7995 }, { "epoch": 0.89, "grad_norm": 0.20445217192173004, "learning_rate": 0.001, "loss": 2.3275, "step": 8000 }, { "epoch": 0.89, "grad_norm": 0.20067258179187775, "learning_rate": 0.001, "loss": 2.1847, "step": 8005 }, { "epoch": 0.89, "grad_norm": 0.19402869045734406, "learning_rate": 0.001, "loss": 2.2531, "step": 8010 }, { "epoch": 0.89, "grad_norm": 0.2005668431520462, "learning_rate": 0.001, "loss": 2.1642, "step": 8015 }, { "epoch": 0.89, "grad_norm": 0.20115861296653748, "learning_rate": 0.001, "loss": 2.3334, "step": 8020 }, { "epoch": 0.89, "grad_norm": 0.19401215016841888, "learning_rate": 0.001, "loss": 2.2377, "step": 8025 }, { "epoch": 0.9, "grad_norm": 0.19128204882144928, "learning_rate": 0.001, "loss": 2.319, "step": 8030 }, { "epoch": 0.9, "grad_norm": 0.20478878915309906, "learning_rate": 0.001, "loss": 2.2586, "step": 8035 }, { "epoch": 0.9, "grad_norm": 0.20956303179264069, "learning_rate": 0.001, "loss": 2.2173, "step": 8040 }, { "epoch": 0.9, "grad_norm": 0.19278179109096527, "learning_rate": 0.001, "loss": 2.2481, "step": 8045 }, { "epoch": 0.9, "grad_norm": 0.19514437019824982, "learning_rate": 0.001, "loss": 2.2832, "step": 8050 }, { "epoch": 0.9, "grad_norm": 0.2172226905822754, "learning_rate": 0.001, "loss": 2.2327, "step": 8055 }, { "epoch": 0.9, "grad_norm": 0.18109092116355896, "learning_rate": 0.001, "loss": 2.1731, "step": 8060 }, { "epoch": 0.9, "grad_norm": 0.1890215128660202, "learning_rate": 0.001, "loss": 2.3154, "step": 8065 }, { "epoch": 0.9, "grad_norm": 0.19754280149936676, "learning_rate": 0.001, "loss": 2.2203, "step": 8070 }, { "epoch": 0.9, "grad_norm": 0.19780655205249786, "learning_rate": 0.001, "loss": 2.2826, "step": 8075 }, { "epoch": 0.9, "grad_norm": 0.18817149102687836, "learning_rate": 0.001, "loss": 2.2465, "step": 8080 }, { "epoch": 0.9, "grad_norm": 0.19357270002365112, "learning_rate": 0.001, "loss": 2.2105, "step": 8085 }, { "epoch": 0.9, "grad_norm": 0.1954236626625061, "learning_rate": 0.001, "loss": 2.2486, "step": 8090 }, { "epoch": 0.9, "grad_norm": 0.18649014830589294, "learning_rate": 0.001, "loss": 2.2103, "step": 8095 }, { "epoch": 0.9, "grad_norm": 0.20539119839668274, "learning_rate": 0.001, "loss": 2.1869, "step": 8100 }, { "epoch": 0.9, "grad_norm": 0.18983592092990875, "learning_rate": 0.001, "loss": 2.1514, "step": 8105 }, { "epoch": 0.9, "grad_norm": 0.1923411339521408, "learning_rate": 0.001, "loss": 2.2228, "step": 8110 }, { "epoch": 0.9, "grad_norm": 0.18866506218910217, "learning_rate": 0.001, "loss": 2.1797, "step": 8115 }, { "epoch": 0.91, "grad_norm": 0.19905278086662292, "learning_rate": 0.001, "loss": 2.2419, "step": 8120 }, { "epoch": 0.91, "grad_norm": 0.19118526577949524, "learning_rate": 0.001, "loss": 2.2174, "step": 8125 }, { "epoch": 0.91, "grad_norm": 0.20813317596912384, "learning_rate": 0.001, "loss": 2.2707, "step": 8130 }, { "epoch": 0.91, "grad_norm": 0.20310407876968384, "learning_rate": 0.001, "loss": 2.2669, "step": 8135 }, { "epoch": 0.91, "grad_norm": 0.20061352849006653, "learning_rate": 0.001, "loss": 2.2334, "step": 8140 }, { "epoch": 0.91, "grad_norm": 0.19646792113780975, "learning_rate": 0.001, "loss": 2.2195, "step": 8145 }, { "epoch": 0.91, "grad_norm": 0.21986974775791168, "learning_rate": 0.001, "loss": 2.2153, "step": 8150 }, { "epoch": 0.91, "grad_norm": 0.17849063873291016, "learning_rate": 0.001, "loss": 2.2178, "step": 8155 }, { "epoch": 0.91, "grad_norm": 0.19407208263874054, "learning_rate": 0.001, "loss": 2.1844, "step": 8160 }, { "epoch": 0.91, "grad_norm": 0.1898379921913147, "learning_rate": 0.001, "loss": 2.2834, "step": 8165 }, { "epoch": 0.91, "grad_norm": 0.20284302532672882, "learning_rate": 0.001, "loss": 2.304, "step": 8170 }, { "epoch": 0.91, "grad_norm": 0.21038895845413208, "learning_rate": 0.001, "loss": 2.1693, "step": 8175 }, { "epoch": 0.91, "grad_norm": 0.2112080454826355, "learning_rate": 0.001, "loss": 2.1756, "step": 8180 }, { "epoch": 0.91, "grad_norm": 0.21226562559604645, "learning_rate": 0.001, "loss": 2.2484, "step": 8185 }, { "epoch": 0.91, "grad_norm": 0.18678170442581177, "learning_rate": 0.001, "loss": 2.2438, "step": 8190 }, { "epoch": 0.91, "grad_norm": 0.22687122225761414, "learning_rate": 0.001, "loss": 2.2828, "step": 8195 }, { "epoch": 0.91, "grad_norm": 0.2054503709077835, "learning_rate": 0.001, "loss": 2.2264, "step": 8200 }, { "epoch": 0.91, "grad_norm": 0.19257859885692596, "learning_rate": 0.001, "loss": 2.3202, "step": 8205 }, { "epoch": 0.92, "grad_norm": 0.19209985435009003, "learning_rate": 0.001, "loss": 2.1799, "step": 8210 }, { "epoch": 0.92, "grad_norm": 0.2024695873260498, "learning_rate": 0.001, "loss": 2.1492, "step": 8215 }, { "epoch": 0.92, "grad_norm": 0.1906440705060959, "learning_rate": 0.001, "loss": 2.2483, "step": 8220 }, { "epoch": 0.92, "grad_norm": 0.1945749670267105, "learning_rate": 0.001, "loss": 2.2186, "step": 8225 }, { "epoch": 0.92, "grad_norm": 0.21156622469425201, "learning_rate": 0.001, "loss": 2.2877, "step": 8230 }, { "epoch": 0.92, "grad_norm": 0.18485133349895477, "learning_rate": 0.001, "loss": 2.3249, "step": 8235 }, { "epoch": 0.92, "grad_norm": 0.18751317262649536, "learning_rate": 0.001, "loss": 2.1932, "step": 8240 }, { "epoch": 0.92, "grad_norm": 0.1993100941181183, "learning_rate": 0.001, "loss": 2.2184, "step": 8245 }, { "epoch": 0.92, "grad_norm": 0.2515455186367035, "learning_rate": 0.001, "loss": 2.2294, "step": 8250 }, { "epoch": 0.92, "grad_norm": 0.19279171526432037, "learning_rate": 0.001, "loss": 2.2683, "step": 8255 }, { "epoch": 0.92, "grad_norm": 0.27863675355911255, "learning_rate": 0.001, "loss": 2.2202, "step": 8260 }, { "epoch": 0.92, "grad_norm": 0.20159432291984558, "learning_rate": 0.001, "loss": 2.2538, "step": 8265 }, { "epoch": 0.92, "grad_norm": 0.2060721218585968, "learning_rate": 0.001, "loss": 2.2585, "step": 8270 }, { "epoch": 0.92, "grad_norm": 0.19700761139392853, "learning_rate": 0.001, "loss": 2.1878, "step": 8275 }, { "epoch": 0.92, "grad_norm": 0.19809959828853607, "learning_rate": 0.001, "loss": 2.2367, "step": 8280 }, { "epoch": 0.92, "grad_norm": 0.2082154005765915, "learning_rate": 0.001, "loss": 2.1445, "step": 8285 }, { "epoch": 0.92, "grad_norm": 0.1983455866575241, "learning_rate": 0.001, "loss": 2.2885, "step": 8290 }, { "epoch": 0.92, "grad_norm": 0.19476141035556793, "learning_rate": 0.001, "loss": 2.2266, "step": 8295 }, { "epoch": 0.93, "grad_norm": 0.20148876309394836, "learning_rate": 0.001, "loss": 2.1522, "step": 8300 }, { "epoch": 0.93, "grad_norm": 0.2016109824180603, "learning_rate": 0.001, "loss": 2.2349, "step": 8305 }, { "epoch": 0.93, "grad_norm": 0.23375721275806427, "learning_rate": 0.001, "loss": 2.1436, "step": 8310 }, { "epoch": 0.93, "grad_norm": 0.19160257279872894, "learning_rate": 0.001, "loss": 2.1778, "step": 8315 }, { "epoch": 0.93, "grad_norm": 0.18439500033855438, "learning_rate": 0.001, "loss": 2.1937, "step": 8320 }, { "epoch": 0.93, "grad_norm": 0.19977669417858124, "learning_rate": 0.001, "loss": 2.2322, "step": 8325 }, { "epoch": 0.93, "grad_norm": 0.20961971580982208, "learning_rate": 0.001, "loss": 2.2496, "step": 8330 }, { "epoch": 0.93, "grad_norm": 0.18950814008712769, "learning_rate": 0.001, "loss": 2.3144, "step": 8335 }, { "epoch": 0.93, "grad_norm": 0.18986515700817108, "learning_rate": 0.001, "loss": 2.2437, "step": 8340 }, { "epoch": 0.93, "grad_norm": 0.19119036197662354, "learning_rate": 0.001, "loss": 2.2634, "step": 8345 }, { "epoch": 0.93, "grad_norm": 0.18153807520866394, "learning_rate": 0.001, "loss": 2.1471, "step": 8350 }, { "epoch": 0.93, "grad_norm": 0.18010474741458893, "learning_rate": 0.001, "loss": 2.1389, "step": 8355 }, { "epoch": 0.93, "grad_norm": 0.19723130762577057, "learning_rate": 0.001, "loss": 2.2782, "step": 8360 }, { "epoch": 0.93, "grad_norm": 0.2162715643644333, "learning_rate": 0.001, "loss": 2.1351, "step": 8365 }, { "epoch": 0.93, "grad_norm": 0.1888781636953354, "learning_rate": 0.001, "loss": 2.2107, "step": 8370 }, { "epoch": 0.93, "grad_norm": 0.2023809850215912, "learning_rate": 0.001, "loss": 2.1802, "step": 8375 }, { "epoch": 0.93, "grad_norm": 0.18921934068202972, "learning_rate": 0.001, "loss": 2.1953, "step": 8380 }, { "epoch": 0.93, "grad_norm": 0.19186371564865112, "learning_rate": 0.001, "loss": 2.2515, "step": 8385 }, { "epoch": 0.94, "grad_norm": 0.2036081999540329, "learning_rate": 0.001, "loss": 2.2583, "step": 8390 }, { "epoch": 0.94, "grad_norm": 0.21693940460681915, "learning_rate": 0.001, "loss": 2.2567, "step": 8395 }, { "epoch": 0.94, "grad_norm": 0.20340251922607422, "learning_rate": 0.001, "loss": 2.1928, "step": 8400 }, { "epoch": 0.94, "grad_norm": 0.1897629350423813, "learning_rate": 0.001, "loss": 2.1738, "step": 8405 }, { "epoch": 0.94, "grad_norm": 0.19895780086517334, "learning_rate": 0.001, "loss": 2.1841, "step": 8410 }, { "epoch": 0.94, "grad_norm": 0.19295699894428253, "learning_rate": 0.001, "loss": 2.194, "step": 8415 }, { "epoch": 0.94, "grad_norm": 0.19201216101646423, "learning_rate": 0.001, "loss": 2.2543, "step": 8420 }, { "epoch": 0.94, "grad_norm": 0.18177099525928497, "learning_rate": 0.001, "loss": 2.1576, "step": 8425 }, { "epoch": 0.94, "grad_norm": 0.2475714534521103, "learning_rate": 0.001, "loss": 2.1712, "step": 8430 }, { "epoch": 0.94, "grad_norm": 0.18510274589061737, "learning_rate": 0.001, "loss": 2.2584, "step": 8435 }, { "epoch": 0.94, "grad_norm": 0.19641967117786407, "learning_rate": 0.001, "loss": 2.2638, "step": 8440 }, { "epoch": 0.94, "grad_norm": 0.18982478976249695, "learning_rate": 0.001, "loss": 2.1609, "step": 8445 }, { "epoch": 0.94, "grad_norm": 0.18552134931087494, "learning_rate": 0.001, "loss": 2.1553, "step": 8450 }, { "epoch": 0.94, "grad_norm": 0.198546901345253, "learning_rate": 0.001, "loss": 2.133, "step": 8455 }, { "epoch": 0.94, "grad_norm": 0.19203849136829376, "learning_rate": 0.001, "loss": 2.2076, "step": 8460 }, { "epoch": 0.94, "grad_norm": 0.22504118084907532, "learning_rate": 0.001, "loss": 2.2539, "step": 8465 }, { "epoch": 0.94, "grad_norm": 0.18785686790943146, "learning_rate": 0.001, "loss": 2.2552, "step": 8470 }, { "epoch": 0.94, "grad_norm": 0.1946154534816742, "learning_rate": 0.001, "loss": 2.1435, "step": 8475 }, { "epoch": 0.95, "grad_norm": 0.1913706511259079, "learning_rate": 0.001, "loss": 2.2436, "step": 8480 }, { "epoch": 0.95, "grad_norm": 0.17650993168354034, "learning_rate": 0.001, "loss": 2.2237, "step": 8485 }, { "epoch": 0.95, "grad_norm": 0.183892622590065, "learning_rate": 0.001, "loss": 2.2427, "step": 8490 }, { "epoch": 0.95, "grad_norm": 0.2386084794998169, "learning_rate": 0.001, "loss": 2.2556, "step": 8495 }, { "epoch": 0.95, "grad_norm": 0.21051189303398132, "learning_rate": 0.001, "loss": 2.1528, "step": 8500 }, { "epoch": 0.95, "grad_norm": 0.1852414757013321, "learning_rate": 0.001, "loss": 2.2319, "step": 8505 }, { "epoch": 0.95, "grad_norm": 0.19965070486068726, "learning_rate": 0.001, "loss": 2.1629, "step": 8510 }, { "epoch": 0.95, "grad_norm": 0.18793615698814392, "learning_rate": 0.001, "loss": 2.273, "step": 8515 }, { "epoch": 0.95, "grad_norm": 0.194820836186409, "learning_rate": 0.001, "loss": 2.2136, "step": 8520 }, { "epoch": 0.95, "grad_norm": 0.25766825675964355, "learning_rate": 0.001, "loss": 2.1721, "step": 8525 }, { "epoch": 0.95, "grad_norm": 0.22273294627666473, "learning_rate": 0.001, "loss": 2.1595, "step": 8530 }, { "epoch": 0.95, "grad_norm": 0.22989904880523682, "learning_rate": 0.001, "loss": 2.2719, "step": 8535 }, { "epoch": 0.95, "grad_norm": 0.21977895498275757, "learning_rate": 0.001, "loss": 2.2358, "step": 8540 }, { "epoch": 0.95, "grad_norm": 0.20909038186073303, "learning_rate": 0.001, "loss": 2.2797, "step": 8545 }, { "epoch": 0.95, "grad_norm": 0.17358806729316711, "learning_rate": 0.001, "loss": 2.2295, "step": 8550 }, { "epoch": 0.95, "grad_norm": 0.19235506653785706, "learning_rate": 0.001, "loss": 2.2181, "step": 8555 }, { "epoch": 0.95, "grad_norm": 0.20009635388851166, "learning_rate": 0.001, "loss": 2.2965, "step": 8560 }, { "epoch": 0.95, "grad_norm": 0.1902618110179901, "learning_rate": 0.001, "loss": 2.2593, "step": 8565 }, { "epoch": 0.96, "grad_norm": 0.20280583202838898, "learning_rate": 0.001, "loss": 2.1473, "step": 8570 }, { "epoch": 0.96, "grad_norm": 0.19154559075832367, "learning_rate": 0.001, "loss": 2.2924, "step": 8575 }, { "epoch": 0.96, "grad_norm": 0.20159044861793518, "learning_rate": 0.001, "loss": 2.214, "step": 8580 }, { "epoch": 0.96, "grad_norm": 0.23447231948375702, "learning_rate": 0.001, "loss": 2.2696, "step": 8585 }, { "epoch": 0.96, "grad_norm": 0.18314625322818756, "learning_rate": 0.001, "loss": 2.1882, "step": 8590 }, { "epoch": 0.96, "grad_norm": 0.1972244381904602, "learning_rate": 0.001, "loss": 2.2503, "step": 8595 }, { "epoch": 0.96, "grad_norm": 0.18517006933689117, "learning_rate": 0.001, "loss": 2.1281, "step": 8600 }, { "epoch": 0.96, "grad_norm": 0.18247254192829132, "learning_rate": 0.001, "loss": 2.2121, "step": 8605 }, { "epoch": 0.96, "grad_norm": 0.18291489779949188, "learning_rate": 0.001, "loss": 2.1932, "step": 8610 }, { "epoch": 0.96, "grad_norm": 0.20838534832000732, "learning_rate": 0.001, "loss": 2.1754, "step": 8615 }, { "epoch": 0.96, "grad_norm": 0.18746349215507507, "learning_rate": 0.001, "loss": 2.1692, "step": 8620 }, { "epoch": 0.96, "grad_norm": 0.20475895702838898, "learning_rate": 0.001, "loss": 2.158, "step": 8625 }, { "epoch": 0.96, "grad_norm": 0.19268833100795746, "learning_rate": 0.001, "loss": 2.1658, "step": 8630 }, { "epoch": 0.96, "grad_norm": 0.18458105623722076, "learning_rate": 0.001, "loss": 2.1426, "step": 8635 }, { "epoch": 0.96, "grad_norm": 0.18215860426425934, "learning_rate": 0.001, "loss": 2.2663, "step": 8640 }, { "epoch": 0.96, "grad_norm": 0.20346863567829132, "learning_rate": 0.001, "loss": 2.1614, "step": 8645 }, { "epoch": 0.96, "grad_norm": 0.20370927453041077, "learning_rate": 0.001, "loss": 2.1839, "step": 8650 }, { "epoch": 0.96, "grad_norm": 0.19011425971984863, "learning_rate": 0.001, "loss": 2.1797, "step": 8655 }, { "epoch": 0.97, "grad_norm": 0.19454734027385712, "learning_rate": 0.001, "loss": 2.2564, "step": 8660 }, { "epoch": 0.97, "grad_norm": 0.18686948716640472, "learning_rate": 0.001, "loss": 2.1875, "step": 8665 }, { "epoch": 0.97, "grad_norm": 0.19724272191524506, "learning_rate": 0.001, "loss": 2.2285, "step": 8670 }, { "epoch": 0.97, "grad_norm": 0.18537306785583496, "learning_rate": 0.001, "loss": 2.1898, "step": 8675 }, { "epoch": 0.97, "grad_norm": 0.18430756032466888, "learning_rate": 0.001, "loss": 2.1661, "step": 8680 }, { "epoch": 0.97, "grad_norm": 0.18186238408088684, "learning_rate": 0.001, "loss": 2.1505, "step": 8685 }, { "epoch": 0.97, "grad_norm": 0.2050350457429886, "learning_rate": 0.001, "loss": 2.19, "step": 8690 }, { "epoch": 0.97, "grad_norm": 0.22094227373600006, "learning_rate": 0.001, "loss": 2.231, "step": 8695 }, { "epoch": 0.97, "grad_norm": 0.17757384479045868, "learning_rate": 0.001, "loss": 2.201, "step": 8700 }, { "epoch": 0.97, "grad_norm": 0.1837390959262848, "learning_rate": 0.001, "loss": 2.1807, "step": 8705 }, { "epoch": 0.97, "grad_norm": 0.18438223004341125, "learning_rate": 0.001, "loss": 2.2083, "step": 8710 }, { "epoch": 0.97, "grad_norm": 0.20233049988746643, "learning_rate": 0.001, "loss": 2.2958, "step": 8715 }, { "epoch": 0.97, "grad_norm": 0.21688610315322876, "learning_rate": 0.001, "loss": 2.2159, "step": 8720 }, { "epoch": 0.97, "grad_norm": 0.1822272092103958, "learning_rate": 0.001, "loss": 2.2242, "step": 8725 }, { "epoch": 0.97, "grad_norm": 0.21040040254592896, "learning_rate": 0.001, "loss": 2.2087, "step": 8730 }, { "epoch": 0.97, "grad_norm": 0.17634455859661102, "learning_rate": 0.001, "loss": 2.223, "step": 8735 }, { "epoch": 0.97, "grad_norm": 0.17778807878494263, "learning_rate": 0.001, "loss": 2.317, "step": 8740 }, { "epoch": 0.98, "grad_norm": 0.27437514066696167, "learning_rate": 0.001, "loss": 2.2154, "step": 8745 }, { "epoch": 0.98, "grad_norm": 0.20957055687904358, "learning_rate": 0.001, "loss": 2.1963, "step": 8750 }, { "epoch": 0.98, "grad_norm": 0.19060905277729034, "learning_rate": 0.001, "loss": 2.1631, "step": 8755 }, { "epoch": 0.98, "grad_norm": 0.18315349519252777, "learning_rate": 0.001, "loss": 2.1854, "step": 8760 }, { "epoch": 0.98, "grad_norm": 0.20113958418369293, "learning_rate": 0.001, "loss": 2.1993, "step": 8765 }, { "epoch": 0.98, "grad_norm": 0.19529324769973755, "learning_rate": 0.001, "loss": 2.1558, "step": 8770 }, { "epoch": 0.98, "grad_norm": 0.19535817205905914, "learning_rate": 0.001, "loss": 2.2532, "step": 8775 }, { "epoch": 0.98, "grad_norm": 0.18067863583564758, "learning_rate": 0.001, "loss": 2.1366, "step": 8780 }, { "epoch": 0.98, "grad_norm": 0.18170098960399628, "learning_rate": 0.001, "loss": 2.2162, "step": 8785 }, { "epoch": 0.98, "grad_norm": 0.18915463984012604, "learning_rate": 0.001, "loss": 2.2737, "step": 8790 }, { "epoch": 0.98, "grad_norm": 0.20589640736579895, "learning_rate": 0.001, "loss": 2.2656, "step": 8795 }, { "epoch": 0.98, "grad_norm": 0.21199221909046173, "learning_rate": 0.001, "loss": 2.2281, "step": 8800 }, { "epoch": 0.98, "grad_norm": 0.21070407330989838, "learning_rate": 0.001, "loss": 2.2422, "step": 8805 }, { "epoch": 0.98, "grad_norm": 0.1806258261203766, "learning_rate": 0.001, "loss": 2.1982, "step": 8810 }, { "epoch": 0.98, "grad_norm": 0.1874959021806717, "learning_rate": 0.001, "loss": 2.2128, "step": 8815 }, { "epoch": 0.98, "grad_norm": 0.2044542133808136, "learning_rate": 0.001, "loss": 2.1611, "step": 8820 }, { "epoch": 0.98, "grad_norm": 0.18704530596733093, "learning_rate": 0.001, "loss": 2.0984, "step": 8825 }, { "epoch": 0.98, "grad_norm": 0.18768665194511414, "learning_rate": 0.001, "loss": 2.2514, "step": 8830 }, { "epoch": 0.99, "grad_norm": 0.20076428353786469, "learning_rate": 0.001, "loss": 2.212, "step": 8835 }, { "epoch": 0.99, "grad_norm": 0.19733686745166779, "learning_rate": 0.001, "loss": 2.21, "step": 8840 }, { "epoch": 0.99, "grad_norm": 0.17910441756248474, "learning_rate": 0.001, "loss": 2.237, "step": 8845 }, { "epoch": 0.99, "grad_norm": 0.18687348067760468, "learning_rate": 0.001, "loss": 2.2015, "step": 8850 }, { "epoch": 0.99, "grad_norm": 0.18390965461730957, "learning_rate": 0.001, "loss": 2.1904, "step": 8855 }, { "epoch": 0.99, "grad_norm": 0.18445414304733276, "learning_rate": 0.001, "loss": 2.2473, "step": 8860 }, { "epoch": 0.99, "grad_norm": 0.21044015884399414, "learning_rate": 0.001, "loss": 2.1581, "step": 8865 }, { "epoch": 0.99, "grad_norm": 0.20410551130771637, "learning_rate": 0.001, "loss": 2.2001, "step": 8870 }, { "epoch": 0.99, "grad_norm": 0.17719769477844238, "learning_rate": 0.001, "loss": 2.187, "step": 8875 }, { "epoch": 0.99, "grad_norm": 0.17933371663093567, "learning_rate": 0.001, "loss": 2.1969, "step": 8880 }, { "epoch": 0.99, "grad_norm": 0.1819513589143753, "learning_rate": 0.001, "loss": 2.2288, "step": 8885 }, { "epoch": 0.99, "grad_norm": 0.1850760281085968, "learning_rate": 0.001, "loss": 2.1351, "step": 8890 }, { "epoch": 0.99, "grad_norm": 0.1834917962551117, "learning_rate": 0.001, "loss": 2.2392, "step": 8895 }, { "epoch": 0.99, "grad_norm": 0.183976948261261, "learning_rate": 0.001, "loss": 2.2063, "step": 8900 }, { "epoch": 0.99, "grad_norm": 0.18732498586177826, "learning_rate": 0.001, "loss": 2.2379, "step": 8905 }, { "epoch": 0.99, "grad_norm": 0.18440058827400208, "learning_rate": 0.001, "loss": 2.1353, "step": 8910 }, { "epoch": 0.99, "grad_norm": 0.18112874031066895, "learning_rate": 0.001, "loss": 2.219, "step": 8915 }, { "epoch": 0.99, "grad_norm": 0.1814718097448349, "learning_rate": 0.001, "loss": 2.1945, "step": 8920 }, { "epoch": 1.0, "grad_norm": 0.17877763509750366, "learning_rate": 0.001, "loss": 2.2328, "step": 8925 }, { "epoch": 1.0, "grad_norm": 0.1817835569381714, "learning_rate": 0.001, "loss": 2.1293, "step": 8930 }, { "epoch": 1.0, "grad_norm": 0.19791525602340698, "learning_rate": 0.001, "loss": 2.2204, "step": 8935 }, { "epoch": 1.0, "grad_norm": 0.19058136641979218, "learning_rate": 0.001, "loss": 2.2824, "step": 8940 }, { "epoch": 1.0, "grad_norm": 0.18067437410354614, "learning_rate": 0.001, "loss": 2.1005, "step": 8945 }, { "epoch": 1.0, "grad_norm": 0.17680200934410095, "learning_rate": 0.001, "loss": 2.0916, "step": 8950 }, { "epoch": 1.0, "grad_norm": 0.17984385788440704, "learning_rate": 0.001, "loss": 2.1737, "step": 8955 }, { "epoch": 1.0, "grad_norm": 0.20398467779159546, "learning_rate": 0.001, "loss": 2.2369, "step": 8960 }, { "epoch": 1.0, "grad_norm": 0.186319962143898, "learning_rate": 0.001, "loss": 2.249, "step": 8965 }, { "epoch": 1.0, "eval_loss": 2.2163608074188232, "eval_runtime": 830.4777, "eval_samples_per_second": 19.124, "eval_steps_per_second": 2.391, "step": 8969 }, { "epoch": 1.0, "step": 8969, "total_flos": 1.7053077415447757e+17, "train_loss": 2.6744996346363004, "train_runtime": 27779.5691, "train_samples_per_second": 5.165, "train_steps_per_second": 0.323 } ], "logging_steps": 5, "max_steps": 8969, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 1.7053077415447757e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }