barc-llama3.1-8b-instruct-lora64-induction-gpt4mini20k-llama20k_lr2e-4_epoch3/checkpoint-266/trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.99812382739212,
  "eval_steps": 500,
  "global_step": 266,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00375234521575985,
      "grad_norm": 1.1422045632346107,
      "learning_rate": 2.5e-06,
      "loss": 1.4765,
      "step": 1
    },
    {
      "epoch": 0.0075046904315197,
      "grad_norm": 1.10606272348653,
      "learning_rate": 5e-06,
      "loss": 1.4679,
      "step": 2
    },
    {
      "epoch": 0.01125703564727955,
      "grad_norm": 1.124285036588602,
      "learning_rate": 7.5e-06,
      "loss": 1.4926,
      "step": 3
    },
    {
      "epoch": 0.0150093808630394,
      "grad_norm": 1.175650360755083,
      "learning_rate": 1e-05,
      "loss": 1.4946,
      "step": 4
    },
    {
      "epoch": 0.01876172607879925,
      "grad_norm": 1.1176055565878193,
      "learning_rate": 1.25e-05,
      "loss": 1.4803,
      "step": 5
    },
    {
      "epoch": 0.0225140712945591,
      "grad_norm": 1.080965163228283,
      "learning_rate": 1.5e-05,
      "loss": 1.4069,
      "step": 6
    },
    {
      "epoch": 0.02626641651031895,
      "grad_norm": 1.073199125902437,
      "learning_rate": 1.75e-05,
      "loss": 1.4175,
      "step": 7
    },
    {
      "epoch": 0.0300187617260788,
      "grad_norm": 0.9895651667655394,
      "learning_rate": 2e-05,
      "loss": 1.3952,
      "step": 8
    },
    {
      "epoch": 0.03377110694183865,
      "grad_norm": 0.8484409515335725,
      "learning_rate": 2.25e-05,
      "loss": 1.3084,
      "step": 9
    },
    {
      "epoch": 0.0375234521575985,
      "grad_norm": 0.656224580389129,
      "learning_rate": 2.5e-05,
      "loss": 1.2224,
      "step": 10
    },
    {
      "epoch": 0.04127579737335835,
      "grad_norm": 0.6681802871972625,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 1.2279,
      "step": 11
    },
    {
      "epoch": 0.0450281425891182,
      "grad_norm": 0.6445930931164492,
      "learning_rate": 3e-05,
      "loss": 1.1869,
      "step": 12
    },
    {
      "epoch": 0.04878048780487805,
      "grad_norm": 0.6774830464098534,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 1.1345,
      "step": 13
    },
    {
      "epoch": 0.0525328330206379,
      "grad_norm": 0.7129957171173121,
      "learning_rate": 3.5e-05,
      "loss": 1.0342,
      "step": 14
    },
    {
      "epoch": 0.05628517823639775,
      "grad_norm": 0.6988046692034513,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.9683,
      "step": 15
    },
    {
      "epoch": 0.0600375234521576,
      "grad_norm": 0.7305746200421179,
      "learning_rate": 4e-05,
      "loss": 0.8998,
      "step": 16
    },
    {
      "epoch": 0.06378986866791744,
      "grad_norm": 0.6303366234907746,
      "learning_rate": 4.25e-05,
      "loss": 0.8585,
      "step": 17
    },
    {
      "epoch": 0.0675422138836773,
      "grad_norm": 0.6262466336688131,
      "learning_rate": 4.5e-05,
      "loss": 0.7913,
      "step": 18
    },
    {
      "epoch": 0.07129455909943715,
      "grad_norm": 0.5934168135285741,
      "learning_rate": 4.75e-05,
      "loss": 0.7358,
      "step": 19
    },
    {
      "epoch": 0.075046904315197,
      "grad_norm": 0.5003901957180881,
      "learning_rate": 5e-05,
      "loss": 0.6762,
      "step": 20
    },
    {
      "epoch": 0.07879924953095685,
      "grad_norm": 0.44247641980699626,
      "learning_rate": 5.25e-05,
      "loss": 0.6412,
      "step": 21
    },
    {
      "epoch": 0.0825515947467167,
      "grad_norm": 0.33108999413889184,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.6021,
      "step": 22
    },
    {
      "epoch": 0.08630393996247655,
      "grad_norm": 0.30987297699741684,
      "learning_rate": 5.7499999999999995e-05,
      "loss": 0.5678,
      "step": 23
    },
    {
      "epoch": 0.0900562851782364,
      "grad_norm": 0.2879383883871797,
      "learning_rate": 6e-05,
      "loss": 0.5653,
      "step": 24
    },
    {
      "epoch": 0.09380863039399624,
      "grad_norm": 0.4921785771111378,
      "learning_rate": 6.25e-05,
      "loss": 0.5397,
      "step": 25
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 0.23455468567206647,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.5392,
      "step": 26
    },
    {
      "epoch": 0.10131332082551595,
      "grad_norm": 0.21551936996375468,
      "learning_rate": 6.750000000000001e-05,
      "loss": 0.5423,
      "step": 27
    },
    {
      "epoch": 0.1050656660412758,
      "grad_norm": 0.2138475404490417,
      "learning_rate": 7e-05,
      "loss": 0.5072,
      "step": 28
    },
    {
      "epoch": 0.10881801125703565,
      "grad_norm": 0.1981260579789532,
      "learning_rate": 7.25e-05,
      "loss": 0.4927,
      "step": 29
    },
    {
      "epoch": 0.1125703564727955,
      "grad_norm": 0.19766175304738637,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.4992,
      "step": 30
    },
    {
      "epoch": 0.11632270168855535,
      "grad_norm": 0.16180823154197033,
      "learning_rate": 7.75e-05,
      "loss": 0.5078,
      "step": 31
    },
    {
      "epoch": 0.1200750469043152,
      "grad_norm": 0.15792678361397225,
      "learning_rate": 8e-05,
      "loss": 0.4834,
      "step": 32
    },
    {
      "epoch": 0.12382739212007504,
      "grad_norm": 0.17901823211719936,
      "learning_rate": 8.25e-05,
      "loss": 0.5038,
      "step": 33
    },
    {
      "epoch": 0.1275797373358349,
      "grad_norm": 0.15291985686600748,
      "learning_rate": 8.5e-05,
      "loss": 0.463,
      "step": 34
    },
    {
      "epoch": 0.13133208255159476,
      "grad_norm": 0.1402347205035838,
      "learning_rate": 8.75e-05,
      "loss": 0.4678,
      "step": 35
    },
    {
      "epoch": 0.1350844277673546,
      "grad_norm": 0.1292157193781673,
      "learning_rate": 9e-05,
      "loss": 0.48,
      "step": 36
    },
    {
      "epoch": 0.13883677298311445,
      "grad_norm": 0.12200374600393228,
      "learning_rate": 9.250000000000001e-05,
      "loss": 0.4678,
      "step": 37
    },
    {
      "epoch": 0.1425891181988743,
      "grad_norm": 0.12645974836123272,
      "learning_rate": 9.5e-05,
      "loss": 0.4783,
      "step": 38
    },
    {
      "epoch": 0.14634146341463414,
      "grad_norm": 0.12515993032794973,
      "learning_rate": 9.75e-05,
      "loss": 0.4558,
      "step": 39
    },
    {
      "epoch": 0.150093808630394,
      "grad_norm": 0.1257915818218713,
      "learning_rate": 0.0001,
      "loss": 0.4582,
      "step": 40
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 0.11519718216680118,
      "learning_rate": 0.0001025,
      "loss": 0.4433,
      "step": 41
    },
    {
      "epoch": 0.1575984990619137,
      "grad_norm": 0.11408287464445384,
      "learning_rate": 0.000105,
      "loss": 0.4566,
      "step": 42
    },
    {
      "epoch": 0.16135084427767354,
      "grad_norm": 0.11355997287120467,
      "learning_rate": 0.0001075,
      "loss": 0.4601,
      "step": 43
    },
    {
      "epoch": 0.1651031894934334,
      "grad_norm": 0.1236061343834286,
      "learning_rate": 0.00011000000000000002,
      "loss": 0.4279,
      "step": 44
    },
    {
      "epoch": 0.16885553470919323,
      "grad_norm": 0.11292335583297317,
      "learning_rate": 0.00011250000000000001,
      "loss": 0.4242,
      "step": 45
    },
    {
      "epoch": 0.1726078799249531,
      "grad_norm": 0.10830414207227934,
      "learning_rate": 0.00011499999999999999,
      "loss": 0.4392,
      "step": 46
    },
    {
      "epoch": 0.17636022514071295,
      "grad_norm": 0.1130446420034329,
      "learning_rate": 0.00011750000000000001,
      "loss": 0.4175,
      "step": 47
    },
    {
      "epoch": 0.1801125703564728,
      "grad_norm": 0.10972733489410498,
      "learning_rate": 0.00012,
      "loss": 0.4064,
      "step": 48
    },
    {
      "epoch": 0.18386491557223264,
      "grad_norm": 0.11723858927517143,
      "learning_rate": 0.00012250000000000002,
      "loss": 0.4618,
      "step": 49
    },
    {
      "epoch": 0.18761726078799248,
      "grad_norm": 0.12979793592348535,
      "learning_rate": 0.000125,
      "loss": 0.4413,
      "step": 50
    },
    {
      "epoch": 0.19136960600375236,
      "grad_norm": 0.12190484063649769,
      "learning_rate": 0.0001275,
      "loss": 0.4212,
      "step": 51
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 0.1200977200253699,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.4236,
      "step": 52
    },
    {
      "epoch": 0.19887429643527205,
      "grad_norm": 0.11576799257930886,
      "learning_rate": 0.0001325,
      "loss": 0.4328,
      "step": 53
    },
    {
      "epoch": 0.2026266416510319,
      "grad_norm": 0.11804398873031127,
      "learning_rate": 0.00013500000000000003,
      "loss": 0.3906,
      "step": 54
    },
    {
      "epoch": 0.20637898686679174,
      "grad_norm": 0.11890529087801377,
      "learning_rate": 0.0001375,
      "loss": 0.4092,
      "step": 55
    },
    {
      "epoch": 0.2101313320825516,
      "grad_norm": 0.11537178670561035,
      "learning_rate": 0.00014,
      "loss": 0.4026,
      "step": 56
    },
    {
      "epoch": 0.21388367729831145,
      "grad_norm": 0.11591938376106178,
      "learning_rate": 0.00014250000000000002,
      "loss": 0.3678,
      "step": 57
    },
    {
      "epoch": 0.2176360225140713,
      "grad_norm": 0.12025566814049414,
      "learning_rate": 0.000145,
      "loss": 0.3791,
      "step": 58
    },
    {
      "epoch": 0.22138836772983114,
      "grad_norm": 0.13088656560108905,
      "learning_rate": 0.0001475,
      "loss": 0.3906,
      "step": 59
    },
    {
      "epoch": 0.225140712945591,
      "grad_norm": 0.12366551138693345,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.3769,
      "step": 60
    },
    {
      "epoch": 0.22889305816135083,
      "grad_norm": 0.12338960635908504,
      "learning_rate": 0.0001525,
      "loss": 0.3806,
      "step": 61
    },
    {
      "epoch": 0.2326454033771107,
      "grad_norm": 0.12957742480845902,
      "learning_rate": 0.000155,
      "loss": 0.365,
      "step": 62
    },
    {
      "epoch": 0.23639774859287055,
      "grad_norm": 0.1282017025800552,
      "learning_rate": 0.0001575,
      "loss": 0.3637,
      "step": 63
    },
    {
      "epoch": 0.2401500938086304,
      "grad_norm": 0.12685377163368308,
      "learning_rate": 0.00016,
      "loss": 0.3813,
      "step": 64
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 0.12515445789228635,
      "learning_rate": 0.00016250000000000002,
      "loss": 0.3665,
      "step": 65
    },
    {
      "epoch": 0.24765478424015008,
      "grad_norm": 0.12780302020094111,
      "learning_rate": 0.000165,
      "loss": 0.372,
      "step": 66
    },
    {
      "epoch": 0.25140712945590993,
      "grad_norm": 0.13509915457231544,
      "learning_rate": 0.0001675,
      "loss": 0.3964,
      "step": 67
    },
    {
      "epoch": 0.2551594746716698,
      "grad_norm": 0.1324450895974203,
      "learning_rate": 0.00017,
      "loss": 0.3809,
      "step": 68
    },
    {
      "epoch": 0.2589118198874296,
      "grad_norm": 0.14039471561000108,
      "learning_rate": 0.00017250000000000002,
      "loss": 0.3788,
      "step": 69
    },
    {
      "epoch": 0.2626641651031895,
      "grad_norm": 0.13748884493823293,
      "learning_rate": 0.000175,
      "loss": 0.3477,
      "step": 70
    },
    {
      "epoch": 0.26641651031894936,
      "grad_norm": 0.12981102084999996,
      "learning_rate": 0.0001775,
      "loss": 0.3803,
      "step": 71
    },
    {
      "epoch": 0.2701688555347092,
      "grad_norm": 0.12375391443012415,
      "learning_rate": 0.00018,
      "loss": 0.3557,
      "step": 72
    },
    {
      "epoch": 0.27392120075046905,
      "grad_norm": 0.12792478465226367,
      "learning_rate": 0.0001825,
      "loss": 0.351,
      "step": 73
    },
    {
      "epoch": 0.2776735459662289,
      "grad_norm": 0.1281934594676182,
      "learning_rate": 0.00018500000000000002,
      "loss": 0.3662,
      "step": 74
    },
    {
      "epoch": 0.28142589118198874,
      "grad_norm": 0.13402822886419208,
      "learning_rate": 0.0001875,
      "loss": 0.3491,
      "step": 75
    },
    {
      "epoch": 0.2851782363977486,
      "grad_norm": 0.1292536897601892,
      "learning_rate": 0.00019,
      "loss": 0.3415,
      "step": 76
    },
    {
      "epoch": 0.28893058161350843,
      "grad_norm": 0.16014546584829106,
      "learning_rate": 0.00019250000000000002,
      "loss": 0.3493,
      "step": 77
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 0.1393384528675237,
      "learning_rate": 0.000195,
      "loss": 0.3509,
      "step": 78
    },
    {
      "epoch": 0.2964352720450281,
      "grad_norm": 0.15248843449290234,
      "learning_rate": 0.00019750000000000003,
      "loss": 0.3328,
      "step": 79
    },
    {
      "epoch": 0.300187617260788,
      "grad_norm": 0.1478683373584156,
      "learning_rate": 0.0002,
      "loss": 0.339,
      "step": 80
    },
    {
      "epoch": 0.30393996247654786,
      "grad_norm": 0.1457678828189889,
      "learning_rate": 0.00019999904276147618,
      "loss": 0.3536,
      "step": 81
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.15185000879528737,
      "learning_rate": 0.00019999617106423082,
      "loss": 0.3529,
      "step": 82
    },
    {
      "epoch": 0.31144465290806755,
      "grad_norm": 0.15201189365883755,
      "learning_rate": 0.0001999913849632419,
      "loss": 0.3548,
      "step": 83
    },
    {
      "epoch": 0.3151969981238274,
      "grad_norm": 0.14879326753679958,
      "learning_rate": 0.00019998468455013823,
      "loss": 0.3264,
      "step": 84
    },
    {
      "epoch": 0.31894934333958724,
      "grad_norm": 0.14083771591440533,
      "learning_rate": 0.00019997606995319768,
      "loss": 0.3331,
      "step": 85
    },
    {
      "epoch": 0.3227016885553471,
      "grad_norm": 0.1503929432468549,
      "learning_rate": 0.00019996554133734474,
      "loss": 0.3282,
      "step": 86
    },
    {
      "epoch": 0.32645403377110693,
      "grad_norm": 0.14030605779758232,
      "learning_rate": 0.00019995309890414732,
      "loss": 0.3216,
      "step": 87
    },
    {
      "epoch": 0.3302063789868668,
      "grad_norm": 0.13891895714301467,
      "learning_rate": 0.000199938742891813,
      "loss": 0.3049,
      "step": 88
    },
    {
      "epoch": 0.3339587242026266,
      "grad_norm": 0.13742909920708118,
      "learning_rate": 0.00019992247357518428,
      "loss": 0.3252,
      "step": 89
    },
    {
      "epoch": 0.33771106941838647,
      "grad_norm": 0.14398237502236147,
      "learning_rate": 0.0001999042912657335,
      "loss": 0.3226,
      "step": 90
    },
    {
      "epoch": 0.34146341463414637,
      "grad_norm": 0.14292774523614082,
      "learning_rate": 0.00019988419631155683,
      "loss": 0.3323,
      "step": 91
    },
    {
      "epoch": 0.3452157598499062,
      "grad_norm": 0.14529808441186043,
      "learning_rate": 0.00019986218909736757,
      "loss": 0.3621,
      "step": 92
    },
    {
      "epoch": 0.34896810506566606,
      "grad_norm": 0.14363660567228131,
      "learning_rate": 0.00019983827004448873,
      "loss": 0.3325,
      "step": 93
    },
    {
      "epoch": 0.3527204502814259,
      "grad_norm": 0.14053215950288314,
      "learning_rate": 0.00019981243961084515,
      "loss": 0.3317,
      "step": 94
    },
    {
      "epoch": 0.35647279549718575,
      "grad_norm": 0.12839662363868307,
      "learning_rate": 0.0001997846982909545,
      "loss": 0.3017,
      "step": 95
    },
    {
      "epoch": 0.3602251407129456,
      "grad_norm": 0.1421301998134749,
      "learning_rate": 0.000199755046615918,
      "loss": 0.3236,
      "step": 96
    },
    {
      "epoch": 0.36397748592870544,
      "grad_norm": 0.1475029420066679,
      "learning_rate": 0.00019972348515341016,
      "loss": 0.3362,
      "step": 97
    },
    {
      "epoch": 0.3677298311444653,
      "grad_norm": 0.13378279730516257,
      "learning_rate": 0.00019969001450766794,
      "loss": 0.3254,
      "step": 98
    },
    {
      "epoch": 0.3714821763602251,
      "grad_norm": 0.1497261207938794,
      "learning_rate": 0.0001996546353194792,
      "loss": 0.3156,
      "step": 99
    },
    {
      "epoch": 0.37523452157598497,
      "grad_norm": 0.1356839966194173,
      "learning_rate": 0.00019961734826617035,
      "loss": 0.3282,
      "step": 100
    },
    {
      "epoch": 0.3789868667917448,
      "grad_norm": 0.12386942577985954,
      "learning_rate": 0.0001995781540615934,
      "loss": 0.3207,
      "step": 101
    },
    {
      "epoch": 0.3827392120075047,
      "grad_norm": 0.16584604505517364,
      "learning_rate": 0.0001995370534561125,
      "loss": 0.3026,
      "step": 102
    },
    {
      "epoch": 0.38649155722326456,
      "grad_norm": 0.1277560294599099,
      "learning_rate": 0.0001994940472365893,
      "loss": 0.322,
      "step": 103
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.13567813426924816,
      "learning_rate": 0.00019944913622636795,
      "loss": 0.3232,
      "step": 104
    },
    {
      "epoch": 0.39399624765478425,
      "grad_norm": 0.12123496832228846,
      "learning_rate": 0.0001994023212852595,
      "loss": 0.2972,
      "step": 105
    },
    {
      "epoch": 0.3977485928705441,
      "grad_norm": 0.13879373741004694,
      "learning_rate": 0.00019935360330952518,
      "loss": 0.3005,
      "step": 106
    },
    {
      "epoch": 0.40150093808630394,
      "grad_norm": 0.1274679949876301,
      "learning_rate": 0.00019930298323185945,
      "loss": 0.3119,
      "step": 107
    },
    {
      "epoch": 0.4052532833020638,
      "grad_norm": 0.13101222758435194,
      "learning_rate": 0.00019925046202137216,
      "loss": 0.2939,
      "step": 108
    },
    {
      "epoch": 0.4090056285178236,
      "grad_norm": 0.12738472548497895,
      "learning_rate": 0.00019919604068356978,
      "loss": 0.3093,
      "step": 109
    },
    {
      "epoch": 0.41275797373358347,
      "grad_norm": 0.1490015817444115,
      "learning_rate": 0.00019913972026033632,
      "loss": 0.2844,
      "step": 110
    },
    {
      "epoch": 0.4165103189493433,
      "grad_norm": 0.1470790264142207,
      "learning_rate": 0.00019908150182991339,
      "loss": 0.2872,
      "step": 111
    },
    {
      "epoch": 0.4202626641651032,
      "grad_norm": 0.12721396486874495,
      "learning_rate": 0.00019902138650687942,
      "loss": 0.3043,
      "step": 112
    },
    {
      "epoch": 0.42401500938086306,
      "grad_norm": 0.13891744298891914,
      "learning_rate": 0.00019895937544212858,
      "loss": 0.3009,
      "step": 113
    },
    {
      "epoch": 0.4277673545966229,
      "grad_norm": 0.134346074178801,
      "learning_rate": 0.00019889546982284834,
      "loss": 0.3013,
      "step": 114
    },
    {
      "epoch": 0.43151969981238275,
      "grad_norm": 0.1379066741076229,
      "learning_rate": 0.00019882967087249718,
      "loss": 0.3052,
      "step": 115
    },
    {
      "epoch": 0.4352720450281426,
      "grad_norm": 0.12972548899740632,
      "learning_rate": 0.0001987619798507809,
      "loss": 0.3124,
      "step": 116
    },
    {
      "epoch": 0.43902439024390244,
      "grad_norm": 0.12813310196115213,
      "learning_rate": 0.0001986923980536286,
      "loss": 0.2893,
      "step": 117
    },
    {
      "epoch": 0.4427767354596623,
      "grad_norm": 0.13797054317394944,
      "learning_rate": 0.00019862092681316776,
      "loss": 0.3016,
      "step": 118
    },
    {
      "epoch": 0.44652908067542213,
      "grad_norm": 0.13780600670778337,
      "learning_rate": 0.0001985475674976989,
      "loss": 0.3158,
      "step": 119
    },
    {
      "epoch": 0.450281425891182,
      "grad_norm": 0.13926178383999727,
      "learning_rate": 0.0001984723215116693,
      "loss": 0.2801,
      "step": 120
    },
    {
      "epoch": 0.4540337711069418,
      "grad_norm": 0.1369353496922525,
      "learning_rate": 0.00019839519029564605,
      "loss": 0.305,
      "step": 121
    },
    {
      "epoch": 0.45778611632270166,
      "grad_norm": 0.13937382639705567,
      "learning_rate": 0.00019831617532628862,
      "loss": 0.3176,
      "step": 122
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.14086276027188518,
      "learning_rate": 0.00019823527811632042,
      "loss": 0.2879,
      "step": 123
    },
    {
      "epoch": 0.4652908067542214,
      "grad_norm": 0.13282215800163436,
      "learning_rate": 0.00019815250021449997,
      "loss": 0.2996,
      "step": 124
    },
    {
      "epoch": 0.46904315196998125,
      "grad_norm": 0.12757163326850707,
      "learning_rate": 0.00019806784320559127,
      "loss": 0.3006,
      "step": 125
    },
    {
      "epoch": 0.4727954971857411,
      "grad_norm": 0.14854709123219104,
      "learning_rate": 0.00019798130871033322,
      "loss": 0.301,
      "step": 126
    },
    {
      "epoch": 0.47654784240150094,
      "grad_norm": 0.13087500973091548,
      "learning_rate": 0.00019789289838540897,
      "loss": 0.2902,
      "step": 127
    },
    {
      "epoch": 0.4803001876172608,
      "grad_norm": 0.1433475392806627,
      "learning_rate": 0.00019780261392341383,
      "loss": 0.2926,
      "step": 128
    },
    {
      "epoch": 0.48405253283302063,
      "grad_norm": 0.1341283559656879,
      "learning_rate": 0.0001977104570528231,
      "loss": 0.2602,
      "step": 129
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.1607197394251248,
      "learning_rate": 0.00019761642953795895,
      "loss": 0.2984,
      "step": 130
    },
    {
      "epoch": 0.4915572232645403,
      "grad_norm": 0.11856150621760517,
      "learning_rate": 0.0001975205331789566,
      "loss": 0.2988,
      "step": 131
    },
    {
      "epoch": 0.49530956848030017,
      "grad_norm": 0.14014139613661877,
      "learning_rate": 0.00019742276981172976,
      "loss": 0.291,
      "step": 132
    },
    {
      "epoch": 0.49906191369606,
      "grad_norm": 0.12881861735846314,
      "learning_rate": 0.00019732314130793568,
      "loss": 0.2971,
      "step": 133
    },
    {
      "epoch": 0.5028142589118199,
      "grad_norm": 0.11788683351931176,
      "learning_rate": 0.00019722164957493922,
      "loss": 0.2766,
      "step": 134
    },
    {
      "epoch": 0.5065666041275797,
      "grad_norm": 0.13746078706666037,
      "learning_rate": 0.0001971182965557763,
      "loss": 0.2886,
      "step": 135
    },
    {
      "epoch": 0.5103189493433395,
      "grad_norm": 0.12745519285890888,
      "learning_rate": 0.00019701308422911672,
      "loss": 0.2963,
      "step": 136
    },
    {
      "epoch": 0.5140712945590994,
      "grad_norm": 0.11835270726835292,
      "learning_rate": 0.0001969060146092264,
      "loss": 0.2995,
      "step": 137
    },
    {
      "epoch": 0.5178236397748592,
      "grad_norm": 0.14011034379489426,
      "learning_rate": 0.0001967970897459286,
      "loss": 0.2881,
      "step": 138
    },
    {
      "epoch": 0.5215759849906192,
      "grad_norm": 0.13060776440495228,
      "learning_rate": 0.0001966863117245648,
      "loss": 0.2765,
      "step": 139
    },
    {
      "epoch": 0.525328330206379,
      "grad_norm": 0.14161693580554588,
      "learning_rate": 0.00019657368266595476,
      "loss": 0.281,
      "step": 140
    },
    {
      "epoch": 0.5290806754221389,
      "grad_norm": 0.12125364150709082,
      "learning_rate": 0.00019645920472635608,
      "loss": 0.2732,
      "step": 141
    },
    {
      "epoch": 0.5328330206378987,
      "grad_norm": 0.1334127552945295,
      "learning_rate": 0.00019634288009742255,
      "loss": 0.2523,
      "step": 142
    },
    {
      "epoch": 0.5365853658536586,
      "grad_norm": 0.12113573146827264,
      "learning_rate": 0.0001962247110061625,
      "loss": 0.2775,
      "step": 143
    },
    {
      "epoch": 0.5403377110694184,
      "grad_norm": 0.12331032028922699,
      "learning_rate": 0.00019610469971489608,
      "loss": 0.2687,
      "step": 144
    },
    {
      "epoch": 0.5440900562851783,
      "grad_norm": 0.13237586077608754,
      "learning_rate": 0.00019598284852121188,
      "loss": 0.2774,
      "step": 145
    },
    {
      "epoch": 0.5478424015009381,
      "grad_norm": 0.12199880756983131,
      "learning_rate": 0.0001958591597579231,
      "loss": 0.2815,
      "step": 146
    },
    {
      "epoch": 0.551594746716698,
      "grad_norm": 0.11915746795874955,
      "learning_rate": 0.00019573363579302266,
      "loss": 0.2558,
      "step": 147
    },
    {
      "epoch": 0.5553470919324578,
      "grad_norm": 0.11644382804351376,
      "learning_rate": 0.00019560627902963807,
      "loss": 0.2951,
      "step": 148
    },
    {
      "epoch": 0.5590994371482176,
      "grad_norm": 0.1317161794959933,
      "learning_rate": 0.00019547709190598534,
      "loss": 0.2629,
      "step": 149
    },
    {
      "epoch": 0.5628517823639775,
      "grad_norm": 0.13859313218362884,
      "learning_rate": 0.00019534607689532233,
      "loss": 0.2884,
      "step": 150
    },
    {
      "epoch": 0.5666041275797373,
      "grad_norm": 0.1643061756146766,
      "learning_rate": 0.00019521323650590133,
      "loss": 0.2932,
      "step": 151
    },
    {
      "epoch": 0.5703564727954972,
      "grad_norm": 0.12366306539172685,
      "learning_rate": 0.00019507857328092108,
      "loss": 0.2861,
      "step": 152
    },
    {
      "epoch": 0.574108818011257,
      "grad_norm": 0.12624207186548378,
      "learning_rate": 0.00019494208979847812,
      "loss": 0.2796,
      "step": 153
    },
    {
      "epoch": 0.5778611632270169,
      "grad_norm": 0.12237336350000451,
      "learning_rate": 0.00019480378867151746,
      "loss": 0.273,
      "step": 154
    },
    {
      "epoch": 0.5816135084427767,
      "grad_norm": 0.12323433685041912,
      "learning_rate": 0.00019466367254778233,
      "loss": 0.2747,
      "step": 155
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 0.12577598956544817,
      "learning_rate": 0.0001945217441097638,
      "loss": 0.2634,
      "step": 156
    },
    {
      "epoch": 0.5891181988742964,
      "grad_norm": 0.12244570380339517,
      "learning_rate": 0.00019437800607464932,
      "loss": 0.2701,
      "step": 157
    },
    {
      "epoch": 0.5928705440900562,
      "grad_norm": 0.12004670825182381,
      "learning_rate": 0.00019423246119427043,
      "loss": 0.2781,
      "step": 158
    },
    {
      "epoch": 0.5966228893058161,
      "grad_norm": 0.13091796767694497,
      "learning_rate": 0.00019408511225505056,
      "loss": 0.2646,
      "step": 159
    },
    {
      "epoch": 0.600375234521576,
      "grad_norm": 0.11771920694416416,
      "learning_rate": 0.00019393596207795136,
      "loss": 0.2795,
      "step": 160
    },
    {
      "epoch": 0.6041275797373359,
      "grad_norm": 0.12447218651645564,
      "learning_rate": 0.00019378501351841865,
      "loss": 0.2767,
      "step": 161
    },
    {
      "epoch": 0.6078799249530957,
      "grad_norm": 0.11854916742534294,
      "learning_rate": 0.000193632269466328,
      "loss": 0.2595,
      "step": 162
    },
    {
      "epoch": 0.6116322701688556,
      "grad_norm": 0.11517649062994549,
      "learning_rate": 0.0001934777328459292,
      "loss": 0.2611,
      "step": 163
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.12291906434338017,
      "learning_rate": 0.00019332140661579042,
      "loss": 0.2569,
      "step": 164
    },
    {
      "epoch": 0.6191369606003753,
      "grad_norm": 0.12768661337225065,
      "learning_rate": 0.00019316329376874145,
      "loss": 0.2802,
      "step": 165
    },
    {
      "epoch": 0.6228893058161351,
      "grad_norm": 0.12224468589372722,
      "learning_rate": 0.00019300339733181642,
      "loss": 0.2742,
      "step": 166
    },
    {
      "epoch": 0.626641651031895,
      "grad_norm": 0.11873375913983374,
      "learning_rate": 0.00019284172036619594,
      "loss": 0.2496,
      "step": 167
    },
    {
      "epoch": 0.6303939962476548,
      "grad_norm": 0.1094029489278503,
      "learning_rate": 0.0001926782659671484,
      "loss": 0.2834,
      "step": 168
    },
    {
      "epoch": 0.6341463414634146,
      "grad_norm": 0.11667364916992014,
      "learning_rate": 0.00019251303726397078,
      "loss": 0.2749,
      "step": 169
    },
    {
      "epoch": 0.6378986866791745,
      "grad_norm": 0.10721206701910313,
      "learning_rate": 0.00019234603741992862,
      "loss": 0.2833,
      "step": 170
    },
    {
      "epoch": 0.6416510318949343,
      "grad_norm": 0.11114975628124507,
      "learning_rate": 0.00019217726963219567,
      "loss": 0.2412,
      "step": 171
    },
    {
      "epoch": 0.6454033771106942,
      "grad_norm": 0.11052789377191914,
      "learning_rate": 0.00019200673713179245,
      "loss": 0.2629,
      "step": 172
    },
    {
      "epoch": 0.649155722326454,
      "grad_norm": 0.1254877320751365,
      "learning_rate": 0.00019183444318352457,
      "loss": 0.2676,
      "step": 173
    },
    {
      "epoch": 0.6529080675422139,
      "grad_norm": 0.11436464042758997,
      "learning_rate": 0.0001916603910859201,
      "loss": 0.2786,
      "step": 174
    },
    {
      "epoch": 0.6566604127579737,
      "grad_norm": 0.12040982753537727,
      "learning_rate": 0.00019148458417116645,
      "loss": 0.255,
      "step": 175
    },
    {
      "epoch": 0.6604127579737336,
      "grad_norm": 0.1215472428194096,
      "learning_rate": 0.00019130702580504676,
      "loss": 0.2933,
      "step": 176
    },
    {
      "epoch": 0.6641651031894934,
      "grad_norm": 0.11127574852727158,
      "learning_rate": 0.0001911277193868751,
      "loss": 0.2638,
      "step": 177
    },
    {
      "epoch": 0.6679174484052532,
      "grad_norm": 0.11297276732299613,
      "learning_rate": 0.00019094666834943179,
      "loss": 0.2553,
      "step": 178
    },
    {
      "epoch": 0.6716697936210131,
      "grad_norm": 0.11230362581933455,
      "learning_rate": 0.00019076387615889727,
      "loss": 0.2656,
      "step": 179
    },
    {
      "epoch": 0.6754221388367729,
      "grad_norm": 0.11339982024848368,
      "learning_rate": 0.00019057934631478617,
      "loss": 0.2608,
      "step": 180
    },
    {
      "epoch": 0.6791744840525328,
      "grad_norm": 0.1157018708653507,
      "learning_rate": 0.00019039308234987992,
      "loss": 0.2661,
      "step": 181
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 0.12120354653706046,
      "learning_rate": 0.00019020508783015942,
      "loss": 0.2655,
      "step": 182
    },
    {
      "epoch": 0.6866791744840526,
      "grad_norm": 0.11650498536100079,
      "learning_rate": 0.00019001536635473664,
      "loss": 0.2617,
      "step": 183
    },
    {
      "epoch": 0.6904315196998124,
      "grad_norm": 0.11284326019455035,
      "learning_rate": 0.0001898239215557856,
      "loss": 0.2604,
      "step": 184
    },
    {
      "epoch": 0.6941838649155723,
      "grad_norm": 0.11137366023131207,
      "learning_rate": 0.0001896307570984731,
      "loss": 0.2695,
      "step": 185
    },
    {
      "epoch": 0.6979362101313321,
      "grad_norm": 0.10909150712308537,
      "learning_rate": 0.00018943587668088832,
      "loss": 0.261,
      "step": 186
    },
    {
      "epoch": 0.701688555347092,
      "grad_norm": 0.11533104627662898,
      "learning_rate": 0.00018923928403397208,
      "loss": 0.2662,
      "step": 187
    },
    {
      "epoch": 0.7054409005628518,
      "grad_norm": 0.11085301527387796,
      "learning_rate": 0.00018904098292144554,
      "loss": 0.26,
      "step": 188
    },
    {
      "epoch": 0.7091932457786116,
      "grad_norm": 0.1040125545017247,
      "learning_rate": 0.00018884097713973798,
      "loss": 0.2641,
      "step": 189
    },
    {
      "epoch": 0.7129455909943715,
      "grad_norm": 0.10775777270108124,
      "learning_rate": 0.00018863927051791416,
      "loss": 0.2553,
      "step": 190
    },
    {
      "epoch": 0.7166979362101313,
      "grad_norm": 0.11556746781951048,
      "learning_rate": 0.00018843586691760108,
      "loss": 0.2817,
      "step": 191
    },
    {
      "epoch": 0.7204502814258912,
      "grad_norm": 0.11370972134361729,
      "learning_rate": 0.00018823077023291397,
      "loss": 0.2715,
      "step": 192
    },
    {
      "epoch": 0.724202626641651,
      "grad_norm": 0.10785721109445355,
      "learning_rate": 0.00018802398439038176,
      "loss": 0.2604,
      "step": 193
    },
    {
      "epoch": 0.7279549718574109,
      "grad_norm": 0.10825278350141479,
      "learning_rate": 0.00018781551334887201,
      "loss": 0.2498,
      "step": 194
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 0.09965163182891702,
      "learning_rate": 0.0001876053610995149,
      "loss": 0.2504,
      "step": 195
    },
    {
      "epoch": 0.7354596622889306,
      "grad_norm": 0.1026489808604617,
      "learning_rate": 0.000187393531665627,
      "loss": 0.2587,
      "step": 196
    },
    {
      "epoch": 0.7392120075046904,
      "grad_norm": 0.10399821510438714,
      "learning_rate": 0.00018718002910263426,
      "loss": 0.273,
      "step": 197
    },
    {
      "epoch": 0.7429643527204502,
      "grad_norm": 0.10994775687961979,
      "learning_rate": 0.0001869648574979942,
      "loss": 0.2659,
      "step": 198
    },
    {
      "epoch": 0.7467166979362101,
      "grad_norm": 0.10593465784705908,
      "learning_rate": 0.00018674802097111784,
      "loss": 0.26,
      "step": 199
    },
    {
      "epoch": 0.7504690431519699,
      "grad_norm": 0.11280493763136354,
      "learning_rate": 0.0001865295236732907,
      "loss": 0.2677,
      "step": 200
    },
    {
      "epoch": 0.7542213883677298,
      "grad_norm": 0.10536591132251391,
      "learning_rate": 0.00018630936978759338,
      "loss": 0.2513,
      "step": 201
    },
    {
      "epoch": 0.7579737335834896,
      "grad_norm": 0.10796354732338231,
      "learning_rate": 0.00018608756352882152,
      "loss": 0.2757,
      "step": 202
    },
    {
      "epoch": 0.7617260787992496,
      "grad_norm": 0.10552783825603758,
      "learning_rate": 0.00018586410914340497,
      "loss": 0.2552,
      "step": 203
    },
    {
      "epoch": 0.7654784240150094,
      "grad_norm": 0.10937928150050989,
      "learning_rate": 0.00018563901090932672,
      "loss": 0.2675,
      "step": 204
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.11537632950908651,
      "learning_rate": 0.00018541227313604078,
      "loss": 0.2402,
      "step": 205
    },
    {
      "epoch": 0.7729831144465291,
      "grad_norm": 0.11524821367403956,
      "learning_rate": 0.0001851839001643898,
      "loss": 0.2628,
      "step": 206
    },
    {
      "epoch": 0.776735459662289,
      "grad_norm": 0.10266098148088061,
      "learning_rate": 0.00018495389636652185,
      "loss": 0.2484,
      "step": 207
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 0.10807777719284456,
      "learning_rate": 0.0001847222661458069,
      "loss": 0.2648,
      "step": 208
    },
    {
      "epoch": 0.7842401500938087,
      "grad_norm": 0.10744597380010515,
      "learning_rate": 0.00018448901393675233,
      "loss": 0.2575,
      "step": 209
    },
    {
      "epoch": 0.7879924953095685,
      "grad_norm": 0.10942201726245399,
      "learning_rate": 0.00018425414420491815,
      "loss": 0.266,
      "step": 210
    },
    {
      "epoch": 0.7917448405253283,
      "grad_norm": 0.10660876081865972,
      "learning_rate": 0.00018401766144683147,
      "loss": 0.2438,
      "step": 211
    },
    {
      "epoch": 0.7954971857410882,
      "grad_norm": 0.11694393967537217,
      "learning_rate": 0.0001837795701899004,
      "loss": 0.2787,
      "step": 212
    },
    {
      "epoch": 0.799249530956848,
      "grad_norm": 0.11981272200535166,
      "learning_rate": 0.00018353987499232746,
      "loss": 0.264,
      "step": 213
    },
    {
      "epoch": 0.8030018761726079,
      "grad_norm": 0.10661350248202765,
      "learning_rate": 0.00018329858044302213,
      "loss": 0.2467,
      "step": 214
    },
    {
      "epoch": 0.8067542213883677,
      "grad_norm": 0.10372037225439175,
      "learning_rate": 0.0001830556911615132,
      "loss": 0.2718,
      "step": 215
    },
    {
      "epoch": 0.8105065666041276,
      "grad_norm": 0.10573394846211595,
      "learning_rate": 0.00018281121179786024,
      "loss": 0.2414,
      "step": 216
    },
    {
      "epoch": 0.8142589118198874,
      "grad_norm": 0.10765219346551154,
      "learning_rate": 0.0001825651470325645,
      "loss": 0.2516,
      "step": 217
    },
    {
      "epoch": 0.8180112570356473,
      "grad_norm": 0.09961054466797757,
      "learning_rate": 0.0001823175015764795,
      "loss": 0.2337,
      "step": 218
    },
    {
      "epoch": 0.8217636022514071,
      "grad_norm": 0.10573680507484315,
      "learning_rate": 0.00018206828017072057,
      "loss": 0.2443,
      "step": 219
    },
    {
      "epoch": 0.8255159474671669,
      "grad_norm": 0.10617911818381037,
      "learning_rate": 0.00018181748758657438,
      "loss": 0.2409,
      "step": 220
    },
    {
      "epoch": 0.8292682926829268,
      "grad_norm": 0.10190011860666479,
      "learning_rate": 0.0001815651286254074,
      "loss": 0.2699,
      "step": 221
    },
    {
      "epoch": 0.8330206378986866,
      "grad_norm": 0.10217498312134918,
      "learning_rate": 0.000181311208118574,
      "loss": 0.261,
      "step": 222
    },
    {
      "epoch": 0.8367729831144465,
      "grad_norm": 0.10290805625127751,
      "learning_rate": 0.000181055730927324,
      "loss": 0.2544,
      "step": 223
    },
    {
      "epoch": 0.8405253283302064,
      "grad_norm": 0.10273441373621256,
      "learning_rate": 0.00018079870194270958,
      "loss": 0.2394,
      "step": 224
    },
    {
      "epoch": 0.8442776735459663,
      "grad_norm": 0.09880435844395785,
      "learning_rate": 0.00018054012608549166,
      "loss": 0.263,
      "step": 225
    },
    {
      "epoch": 0.8480300187617261,
      "grad_norm": 0.10357276059735837,
      "learning_rate": 0.0001802800083060457,
      "loss": 0.2853,
      "step": 226
    },
    {
      "epoch": 0.851782363977486,
      "grad_norm": 0.10804308023574893,
      "learning_rate": 0.00018001835358426687,
      "loss": 0.2595,
      "step": 227
    },
    {
      "epoch": 0.8555347091932458,
      "grad_norm": 0.09776326620940605,
      "learning_rate": 0.00017975516692947475,
      "loss": 0.253,
      "step": 228
    },
    {
      "epoch": 0.8592870544090057,
      "grad_norm": 0.0995125991589646,
      "learning_rate": 0.00017949045338031745,
      "loss": 0.2536,
      "step": 229
    },
    {
      "epoch": 0.8630393996247655,
      "grad_norm": 0.10281461790899643,
      "learning_rate": 0.00017922421800467512,
      "loss": 0.2592,
      "step": 230
    },
    {
      "epoch": 0.8667917448405253,
      "grad_norm": 0.11374858278223317,
      "learning_rate": 0.0001789564658995629,
      "loss": 0.2694,
      "step": 231
    },
    {
      "epoch": 0.8705440900562852,
      "grad_norm": 0.10048956101218906,
      "learning_rate": 0.00017868720219103344,
      "loss": 0.2563,
      "step": 232
    },
    {
      "epoch": 0.874296435272045,
      "grad_norm": 0.11978050473597157,
      "learning_rate": 0.00017841643203407852,
      "loss": 0.2671,
      "step": 233
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 0.1022948197426214,
      "learning_rate": 0.00017814416061253077,
      "loss": 0.2442,
      "step": 234
    },
    {
      "epoch": 0.8818011257035647,
      "grad_norm": 0.10648409702487768,
      "learning_rate": 0.000177870393138964,
      "loss": 0.2172,
      "step": 235
    },
    {
      "epoch": 0.8855534709193246,
      "grad_norm": 0.09682467776295996,
      "learning_rate": 0.00017759513485459367,
      "loss": 0.2503,
      "step": 236
    },
    {
      "epoch": 0.8893058161350844,
      "grad_norm": 0.10093582432576866,
      "learning_rate": 0.00017731839102917644,
      "loss": 0.2526,
      "step": 237
    },
    {
      "epoch": 0.8930581613508443,
      "grad_norm": 0.10283968277186326,
      "learning_rate": 0.00017704016696090937,
      "loss": 0.2467,
      "step": 238
    },
    {
      "epoch": 0.8968105065666041,
      "grad_norm": 0.1016691703162235,
      "learning_rate": 0.00017676046797632835,
      "loss": 0.2458,
      "step": 239
    },
    {
      "epoch": 0.900562851782364,
      "grad_norm": 0.09871178549145665,
      "learning_rate": 0.00017647929943020625,
      "loss": 0.2387,
      "step": 240
    },
    {
      "epoch": 0.9043151969981238,
      "grad_norm": 0.11005062968397657,
      "learning_rate": 0.00017619666670545033,
      "loss": 0.2485,
      "step": 241
    },
    {
      "epoch": 0.9080675422138836,
      "grad_norm": 0.10636010374538316,
      "learning_rate": 0.00017591257521299932,
      "loss": 0.2344,
      "step": 242
    },
    {
      "epoch": 0.9118198874296435,
      "grad_norm": 0.10269265934208162,
      "learning_rate": 0.00017562703039171955,
      "loss": 0.2449,
      "step": 243
    },
    {
      "epoch": 0.9155722326454033,
      "grad_norm": 0.1123496871025115,
      "learning_rate": 0.0001753400377083011,
      "loss": 0.2472,
      "step": 244
    },
    {
      "epoch": 0.9193245778611632,
      "grad_norm": 0.10731321325088286,
      "learning_rate": 0.00017505160265715304,
      "loss": 0.2257,
      "step": 245
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.10122280465712044,
      "learning_rate": 0.0001747617307602982,
      "loss": 0.2673,
      "step": 246
    },
    {
      "epoch": 0.926829268292683,
      "grad_norm": 0.10287633377626088,
      "learning_rate": 0.00017447042756726754,
      "loss": 0.2623,
      "step": 247
    },
    {
      "epoch": 0.9305816135084428,
      "grad_norm": 0.11180813962431274,
      "learning_rate": 0.0001741776986549938,
      "loss": 0.2588,
      "step": 248
    },
    {
      "epoch": 0.9343339587242027,
      "grad_norm": 0.10342918680770019,
      "learning_rate": 0.00017388354962770487,
      "loss": 0.2365,
      "step": 249
    },
    {
      "epoch": 0.9380863039399625,
      "grad_norm": 0.10248241650027715,
      "learning_rate": 0.0001735879861168163,
      "loss": 0.2453,
      "step": 250
    },
    {
      "epoch": 0.9418386491557224,
      "grad_norm": 0.11730400265701718,
      "learning_rate": 0.00017329101378082374,
      "loss": 0.2486,
      "step": 251
    },
    {
      "epoch": 0.9455909943714822,
      "grad_norm": 0.09685186553299667,
      "learning_rate": 0.0001729926383051943,
      "loss": 0.2572,
      "step": 252
    },
    {
      "epoch": 0.949343339587242,
      "grad_norm": 0.12090818479499119,
      "learning_rate": 0.00017269286540225805,
      "loss": 0.2248,
      "step": 253
    },
    {
      "epoch": 0.9530956848030019,
      "grad_norm": 0.10260399450357141,
      "learning_rate": 0.0001723917008110984,
      "loss": 0.2527,
      "step": 254
    },
    {
      "epoch": 0.9568480300187617,
      "grad_norm": 0.10114612523395812,
      "learning_rate": 0.0001720891502974423,
      "loss": 0.2602,
      "step": 255
    },
    {
      "epoch": 0.9606003752345216,
      "grad_norm": 0.11613810011247953,
      "learning_rate": 0.00017178521965354992,
      "loss": 0.2535,
      "step": 256
    },
    {
      "epoch": 0.9643527204502814,
      "grad_norm": 0.10548781228478918,
      "learning_rate": 0.00017147991469810368,
      "loss": 0.2616,
      "step": 257
    },
    {
      "epoch": 0.9681050656660413,
      "grad_norm": 0.10337010169414873,
      "learning_rate": 0.00017117324127609686,
      "loss": 0.2506,
      "step": 258
    },
    {
      "epoch": 0.9718574108818011,
      "grad_norm": 0.1022753450493229,
      "learning_rate": 0.00017086520525872172,
      "loss": 0.2536,
      "step": 259
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.10274802198295474,
      "learning_rate": 0.00017055581254325715,
      "loss": 0.2444,
      "step": 260
    },
    {
      "epoch": 0.9793621013133208,
      "grad_norm": 0.10073944882387982,
      "learning_rate": 0.00017024506905295565,
      "loss": 0.2583,
      "step": 261
    },
    {
      "epoch": 0.9831144465290806,
      "grad_norm": 0.10220040335882648,
      "learning_rate": 0.00016993298073693003,
      "loss": 0.2431,
      "step": 262
    },
    {
      "epoch": 0.9868667917448405,
      "grad_norm": 0.1060948209024435,
      "learning_rate": 0.00016961955357003947,
      "loss": 0.262,
      "step": 263
    },
    {
      "epoch": 0.9906191369606003,
      "grad_norm": 0.10004277645336798,
      "learning_rate": 0.0001693047935527751,
      "loss": 0.234,
      "step": 264
    },
    {
      "epoch": 0.9943714821763602,
      "grad_norm": 0.1000376814502259,
      "learning_rate": 0.00016898870671114527,
      "loss": 0.2566,
      "step": 265
    },
    {
      "epoch": 0.99812382739212,
      "grad_norm": 0.09911659249018077,
      "learning_rate": 0.00016867129909655998,
      "loss": 0.2657,
      "step": 266
    },
    {
      "epoch": 0.99812382739212,
      "eval_loss": 0.25076788663864136,
      "eval_runtime": 54.8199,
      "eval_samples_per_second": 32.725,
      "eval_steps_per_second": 1.04,
      "step": 266
    }
  ],
  "logging_steps": 1,
  "max_steps": 798,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.692263947344282e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}