{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4284490145672665,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000856898029134533,
      "grad_norm": 0.4453125,
      "learning_rate": 5e-05,
      "loss": 3.6595,
      "step": 1
    },
    {
      "epoch": 0.001713796058269066,
      "grad_norm": 0.4296875,
      "learning_rate": 0.0001,
      "loss": 3.6973,
      "step": 2
    },
    {
      "epoch": 0.002570694087403599,
      "grad_norm": 0.453125,
      "learning_rate": 0.00015,
      "loss": 3.6343,
      "step": 3
    },
    {
      "epoch": 0.003427592116538132,
      "grad_norm": 0.419921875,
      "learning_rate": 0.0002,
      "loss": 3.3538,
      "step": 4
    },
    {
      "epoch": 0.004284490145672665,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00025,
      "loss": 3.2142,
      "step": 5
    },
    {
      "epoch": 0.005141388174807198,
      "grad_norm": 0.384765625,
      "learning_rate": 0.0003,
      "loss": 2.9167,
      "step": 6
    },
    {
      "epoch": 0.005998286203941731,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00035,
      "loss": 2.7017,
      "step": 7
    },
    {
      "epoch": 0.006855184233076264,
      "grad_norm": 0.306640625,
      "learning_rate": 0.0004,
      "loss": 2.4232,
      "step": 8
    },
    {
      "epoch": 0.007712082262210797,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00045000000000000004,
      "loss": 2.1348,
      "step": 9
    },
    {
      "epoch": 0.00856898029134533,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0005,
      "loss": 2.0355,
      "step": 10
    },
    {
      "epoch": 0.009425878320479864,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00055,
      "loss": 2.0537,
      "step": 11
    },
    {
      "epoch": 0.010282776349614395,
      "grad_norm": 0.322265625,
      "learning_rate": 0.0006,
      "loss": 2.0642,
      "step": 12
    },
    {
      "epoch": 0.011139674378748929,
      "grad_norm": 0.3203125,
      "learning_rate": 0.0006500000000000001,
      "loss": 2.0116,
      "step": 13
    },
    {
      "epoch": 0.011996572407883462,
      "grad_norm": 0.3125,
      "learning_rate": 0.0007,
      "loss": 2.0132,
      "step": 14
    },
    {
      "epoch": 0.012853470437017995,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00075,
      "loss": 1.9433,
      "step": 15
    },
    {
      "epoch": 0.013710368466152529,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0008,
      "loss": 1.774,
      "step": 16
    },
    {
      "epoch": 0.01456726649528706,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00085,
      "loss": 1.7933,
      "step": 17
    },
    {
      "epoch": 0.015424164524421594,
      "grad_norm": 0.240234375,
      "learning_rate": 0.0009000000000000001,
      "loss": 1.6388,
      "step": 18
    },
    {
      "epoch": 0.016281062553556127,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00095,
      "loss": 1.6048,
      "step": 19
    },
    {
      "epoch": 0.01713796058269066,
      "grad_norm": 0.23046875,
      "learning_rate": 0.001,
      "loss": 1.5744,
      "step": 20
    },
    {
      "epoch": 0.017994858611825194,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0009999892908320648,
      "loss": 1.4603,
      "step": 21
    },
    {
      "epoch": 0.018851756640959727,
      "grad_norm": 0.201171875,
      "learning_rate": 0.0009999571637870036,
      "loss": 1.4607,
      "step": 22
    },
    {
      "epoch": 0.01970865467009426,
      "grad_norm": 0.2021484375,
      "learning_rate": 0.0009999036202410325,
      "loss": 1.3065,
      "step": 23
    },
    {
      "epoch": 0.02056555269922879,
      "grad_norm": 0.185546875,
      "learning_rate": 0.0009998286624877785,
      "loss": 1.2436,
      "step": 24
    },
    {
      "epoch": 0.021422450728363324,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.0009997322937381828,
      "loss": 1.1979,
      "step": 25
    },
    {
      "epoch": 0.022279348757497857,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.0009996145181203615,
      "loss": 1.1425,
      "step": 26
    },
    {
      "epoch": 0.02313624678663239,
      "grad_norm": 0.154296875,
      "learning_rate": 0.00099947534067943,
      "loss": 1.0843,
      "step": 27
    },
    {
      "epoch": 0.023993144815766924,
      "grad_norm": 0.1591796875,
      "learning_rate": 0.0009993147673772868,
      "loss": 1.12,
      "step": 28
    },
    {
      "epoch": 0.024850042844901457,
      "grad_norm": 0.154296875,
      "learning_rate": 0.000999132805092358,
      "loss": 1.0019,
      "step": 29
    },
    {
      "epoch": 0.02570694087403599,
      "grad_norm": 0.142578125,
      "learning_rate": 0.0009989294616193018,
      "loss": 1.0082,
      "step": 30
    },
    {
      "epoch": 0.026563838903170524,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.000998704745668676,
      "loss": 0.9653,
      "step": 31
    },
    {
      "epoch": 0.027420736932305057,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.000998458666866564,
      "loss": 0.9137,
      "step": 32
    },
    {
      "epoch": 0.028277634961439587,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0009981912357541628,
      "loss": 0.8958,
      "step": 33
    },
    {
      "epoch": 0.02913453299057412,
      "grad_norm": 0.125,
      "learning_rate": 0.0009979024637873308,
      "loss": 0.8628,
      "step": 34
    },
    {
      "epoch": 0.029991431019708654,
      "grad_norm": 0.12060546875,
      "learning_rate": 0.0009975923633360985,
      "loss": 0.8233,
      "step": 35
    },
    {
      "epoch": 0.030848329048843187,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.0009972609476841367,
      "loss": 0.8045,
      "step": 36
    },
    {
      "epoch": 0.031705227077977724,
      "grad_norm": 0.1171875,
      "learning_rate": 0.0009969082310281891,
      "loss": 0.7961,
      "step": 37
    },
    {
      "epoch": 0.032562125107112254,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.0009965342284774632,
      "loss": 0.7864,
      "step": 38
    },
    {
      "epoch": 0.033419023136246784,
      "grad_norm": 0.103515625,
      "learning_rate": 0.0009961389560529835,
      "loss": 0.7664,
      "step": 39
    },
    {
      "epoch": 0.03427592116538132,
      "grad_norm": 0.1015625,
      "learning_rate": 0.0009957224306869053,
      "loss": 0.7723,
      "step": 40
    },
    {
      "epoch": 0.03513281919451585,
      "grad_norm": 0.09765625,
      "learning_rate": 0.0009952846702217886,
      "loss": 0.7501,
      "step": 41
    },
    {
      "epoch": 0.03598971722365039,
      "grad_norm": 0.09228515625,
      "learning_rate": 0.0009948256934098352,
      "loss": 0.6932,
      "step": 42
    },
    {
      "epoch": 0.03684661525278492,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.0009943455199120836,
      "loss": 0.675,
      "step": 43
    },
    {
      "epoch": 0.037703513281919454,
      "grad_norm": 0.09033203125,
      "learning_rate": 0.0009938441702975688,
      "loss": 0.6838,
      "step": 44
    },
    {
      "epoch": 0.038560411311053984,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.0009933216660424397,
      "loss": 0.6546,
      "step": 45
    },
    {
      "epoch": 0.03941730934018852,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.0009927780295290389,
      "loss": 0.6443,
      "step": 46
    },
    {
      "epoch": 0.04027420736932305,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.0009922132840449458,
      "loss": 0.6705,
      "step": 47
    },
    {
      "epoch": 0.04113110539845758,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0009916274537819774,
      "loss": 0.6176,
      "step": 48
    },
    {
      "epoch": 0.04198800342759212,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.000991020563835152,
      "loss": 0.683,
      "step": 49
    },
    {
      "epoch": 0.04284490145672665,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0009903926402016153,
      "loss": 0.5799,
      "step": 50
    },
    {
      "epoch": 0.043701799485861184,
      "grad_norm": 0.07470703125,
      "learning_rate": 0.0009897437097795257,
      "loss": 0.6293,
      "step": 51
    },
    {
      "epoch": 0.044558697514995714,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.0009890738003669028,
      "loss": 0.5864,
      "step": 52
    },
    {
      "epoch": 0.04541559554413025,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0009883829406604362,
      "loss": 0.5672,
      "step": 53
    },
    {
      "epoch": 0.04627249357326478,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.0009876711602542563,
      "loss": 0.607,
      "step": 54
    },
    {
      "epoch": 0.04712939160239932,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0009869384896386668,
      "loss": 0.6006,
      "step": 55
    },
    {
      "epoch": 0.04798628963153385,
      "grad_norm": 0.0625,
      "learning_rate": 0.0009861849601988384,
      "loss": 0.536,
      "step": 56
    },
    {
      "epoch": 0.04884318766066838,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0009854106042134641,
      "loss": 0.5153,
      "step": 57
    },
    {
      "epoch": 0.049700085689802914,
      "grad_norm": 0.0615234375,
      "learning_rate": 0.0009846154548533773,
      "loss": 0.5317,
      "step": 58
    },
    {
      "epoch": 0.050556983718937444,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0009837995461801298,
      "loss": 0.5354,
      "step": 59
    },
    {
      "epoch": 0.05141388174807198,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.0009829629131445341,
      "loss": 0.5109,
      "step": 60
    },
    {
      "epoch": 0.05227077977720651,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0009821055915851646,
      "loss": 0.5122,
      "step": 61
    },
    {
      "epoch": 0.05312767780634105,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0009812276182268236,
      "loss": 0.5057,
      "step": 62
    },
    {
      "epoch": 0.05398457583547558,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.0009803290306789677,
      "loss": 0.4955,
      "step": 63
    },
    {
      "epoch": 0.054841473864610114,
      "grad_norm": 0.056396484375,
      "learning_rate": 0.0009794098674340967,
      "loss": 0.4997,
      "step": 64
    },
    {
      "epoch": 0.055698371893744644,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.0009784701678661044,
      "loss": 0.4673,
      "step": 65
    },
    {
      "epoch": 0.056555269922879174,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.0009775099722285933,
      "loss": 0.4822,
      "step": 66
    },
    {
      "epoch": 0.05741216795201371,
      "grad_norm": 0.060546875,
      "learning_rate": 0.0009765293216531485,
      "loss": 0.4716,
      "step": 67
    },
    {
      "epoch": 0.05826906598114824,
      "grad_norm": 0.05078125,
      "learning_rate": 0.0009755282581475768,
      "loss": 0.463,
      "step": 68
    },
    {
      "epoch": 0.05912596401028278,
      "grad_norm": 0.052734375,
      "learning_rate": 0.000974506824594107,
      "loss": 0.461,
      "step": 69
    },
    {
      "epoch": 0.05998286203941731,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0009734650647475529,
      "loss": 0.4503,
      "step": 70
    },
    {
      "epoch": 0.060839760068551844,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0009724030232334391,
      "loss": 0.4586,
      "step": 71
    },
    {
      "epoch": 0.061696658097686374,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.0009713207455460893,
      "loss": 0.4326,
      "step": 72
    },
    {
      "epoch": 0.06255355612682091,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0009702182780466775,
      "loss": 0.4312,
      "step": 73
    },
    {
      "epoch": 0.06341045415595545,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0009690956679612422,
      "loss": 0.4472,
      "step": 74
    },
    {
      "epoch": 0.06426735218508997,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.0009679529633786629,
      "loss": 0.4427,
      "step": 75
    },
    {
      "epoch": 0.06512425021422451,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.0009667902132486009,
      "loss": 0.4308,
      "step": 76
    },
    {
      "epoch": 0.06598114824335904,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.0009656074673794017,
      "loss": 0.431,
      "step": 77
    },
    {
      "epoch": 0.06683804627249357,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0009644047764359622,
      "loss": 0.4219,
      "step": 78
    },
    {
      "epoch": 0.0676949443016281,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0009631821919375591,
      "loss": 0.413,
      "step": 79
    },
    {
      "epoch": 0.06855184233076264,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.0009619397662556434,
      "loss": 0.4065,
      "step": 80
    },
    {
      "epoch": 0.06940874035989718,
      "grad_norm": 0.046875,
      "learning_rate": 0.0009606775526115963,
      "loss": 0.447,
      "step": 81
    },
    {
      "epoch": 0.0702656383890317,
      "grad_norm": 0.04638671875,
      "learning_rate": 0.0009593956050744492,
      "loss": 0.4243,
      "step": 82
    },
    {
      "epoch": 0.07112253641816624,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.0009580939785585681,
      "loss": 0.4003,
      "step": 83
    },
    {
      "epoch": 0.07197943444730077,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.0009567727288213005,
      "loss": 0.4098,
      "step": 84
    },
    {
      "epoch": 0.0728363324764353,
      "grad_norm": 0.041015625,
      "learning_rate": 0.000955431912460588,
      "loss": 0.415,
      "step": 85
    },
    {
      "epoch": 0.07369323050556983,
      "grad_norm": 0.040771484375,
      "learning_rate": 0.0009540715869125407,
      "loss": 0.4239,
      "step": 86
    },
    {
      "epoch": 0.07455012853470437,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0009526918104489777,
      "loss": 0.4058,
      "step": 87
    },
    {
      "epoch": 0.07540702656383891,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0009512926421749304,
      "loss": 0.3894,
      "step": 88
    },
    {
      "epoch": 0.07626392459297343,
      "grad_norm": 0.0390625,
      "learning_rate": 0.0009498741420261108,
      "loss": 0.389,
      "step": 89
    },
    {
      "epoch": 0.07712082262210797,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.0009484363707663442,
      "loss": 0.3865,
      "step": 90
    },
    {
      "epoch": 0.0779777206512425,
      "grad_norm": 0.048828125,
      "learning_rate": 0.0009469793899849661,
      "loss": 0.3823,
      "step": 91
    },
    {
      "epoch": 0.07883461868037704,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.0009455032620941839,
      "loss": 0.3963,
      "step": 92
    },
    {
      "epoch": 0.07969151670951156,
      "grad_norm": 0.052001953125,
      "learning_rate": 0.0009440080503264037,
      "loss": 0.382,
      "step": 93
    },
    {
      "epoch": 0.0805484147386461,
      "grad_norm": 0.04296875,
      "learning_rate": 0.0009424938187315209,
      "loss": 0.3723,
      "step": 94
    },
    {
      "epoch": 0.08140531276778064,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0009409606321741775,
      "loss": 0.3766,
      "step": 95
    },
    {
      "epoch": 0.08226221079691516,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.0009394085563309827,
      "loss": 0.3798,
      "step": 96
    },
    {
      "epoch": 0.0831191088260497,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.0009378376576876999,
      "loss": 0.386,
      "step": 97
    },
    {
      "epoch": 0.08397600685518423,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.0009362480035363986,
      "loss": 0.4009,
      "step": 98
    },
    {
      "epoch": 0.08483290488431877,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.0009346396619725719,
      "loss": 0.3651,
      "step": 99
    },
    {
      "epoch": 0.0856898029134533,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.0009330127018922195,
      "loss": 0.3922,
      "step": 100
    },
    {
      "epoch": 0.08654670094258783,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.0009313671929888959,
      "loss": 0.3604,
      "step": 101
    },
    {
      "epoch": 0.08740359897172237,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.0009297032057507264,
      "loss": 0.3547,
      "step": 102
    },
    {
      "epoch": 0.08826049700085689,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.0009280208114573858,
      "loss": 0.3611,
      "step": 103
    },
    {
      "epoch": 0.08911739502999143,
      "grad_norm": 0.036376953125,
      "learning_rate": 0.0009263200821770461,
      "loss": 0.3789,
      "step": 104
    },
    {
      "epoch": 0.08997429305912596,
      "grad_norm": 0.035400390625,
      "learning_rate": 0.0009246010907632895,
      "loss": 0.3512,
      "step": 105
    },
    {
      "epoch": 0.0908311910882605,
      "grad_norm": 0.035400390625,
      "learning_rate": 0.0009228639108519867,
      "loss": 0.3634,
      "step": 106
    },
    {
      "epoch": 0.09168808911739502,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.0009211086168581433,
      "loss": 0.3509,
      "step": 107
    },
    {
      "epoch": 0.09254498714652956,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.0009193352839727121,
      "loss": 0.3474,
      "step": 108
    },
    {
      "epoch": 0.0934018851756641,
      "grad_norm": 0.036376953125,
      "learning_rate": 0.0009175439881593715,
      "loss": 0.3742,
      "step": 109
    },
    {
      "epoch": 0.09425878320479864,
      "grad_norm": 0.033447265625,
      "learning_rate": 0.0009157348061512727,
      "loss": 0.3422,
      "step": 110
    },
    {
      "epoch": 0.09511568123393316,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.0009139078154477511,
      "loss": 0.3379,
      "step": 111
    },
    {
      "epoch": 0.0959725792630677,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0009120630943110077,
      "loss": 0.3374,
      "step": 112
    },
    {
      "epoch": 0.09682947729220223,
      "grad_norm": 0.03125,
      "learning_rate": 0.0009102007217627568,
      "loss": 0.3629,
      "step": 113
    },
    {
      "epoch": 0.09768637532133675,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0009083207775808396,
      "loss": 0.3537,
      "step": 114
    },
    {
      "epoch": 0.09854327335047129,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.0009064233422958076,
      "loss": 0.3473,
      "step": 115
    },
    {
      "epoch": 0.09940017137960583,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.0009045084971874737,
      "loss": 0.3549,
      "step": 116
    },
    {
      "epoch": 0.10025706940874037,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.0009025763242814291,
      "loss": 0.3407,
      "step": 117
    },
    {
      "epoch": 0.10111396743787489,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.0009006269063455304,
      "loss": 0.3304,
      "step": 118
    },
    {
      "epoch": 0.10197086546700942,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0008986603268863536,
      "loss": 0.3473,
      "step": 119
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.0008966766701456176,
      "loss": 0.3376,
      "step": 120
    },
    {
      "epoch": 0.1036846615252785,
      "grad_norm": 0.03466796875,
      "learning_rate": 0.000894676021096575,
      "loss": 0.3262,
      "step": 121
    },
    {
      "epoch": 0.10454155955441302,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.0008926584654403724,
      "loss": 0.3222,
      "step": 122
    },
    {
      "epoch": 0.10539845758354756,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0008906240896023794,
      "loss": 0.3278,
      "step": 123
    },
    {
      "epoch": 0.1062553556126821,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.0008885729807284854,
      "loss": 0.3251,
      "step": 124
    },
    {
      "epoch": 0.10711225364181662,
      "grad_norm": 0.033447265625,
      "learning_rate": 0.0008865052266813684,
      "loss": 0.3267,
      "step": 125
    },
    {
      "epoch": 0.10796915167095116,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.0008844209160367298,
      "loss": 0.3176,
      "step": 126
    },
    {
      "epoch": 0.10882604970008569,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0008823201380795002,
      "loss": 0.3374,
      "step": 127
    },
    {
      "epoch": 0.10968294772922023,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0008802029828000156,
      "loss": 0.314,
      "step": 128
    },
    {
      "epoch": 0.11053984575835475,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0008780695408901613,
      "loss": 0.324,
      "step": 129
    },
    {
      "epoch": 0.11139674378748929,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0008759199037394887,
      "loss": 0.3199,
      "step": 130
    },
    {
      "epoch": 0.11225364181662383,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0008737541634312985,
      "loss": 0.3034,
      "step": 131
    },
    {
      "epoch": 0.11311053984575835,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.0008715724127386971,
      "loss": 0.3153,
      "step": 132
    },
    {
      "epoch": 0.11396743787489289,
      "grad_norm": 0.034423828125,
      "learning_rate": 0.0008693747451206231,
      "loss": 0.3202,
      "step": 133
    },
    {
      "epoch": 0.11482433590402742,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.0008671612547178428,
      "loss": 0.3325,
      "step": 134
    },
    {
      "epoch": 0.11568123393316196,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0008649320363489178,
      "loss": 0.3207,
      "step": 135
    },
    {
      "epoch": 0.11653813196229648,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.0008626871855061438,
      "loss": 0.3279,
      "step": 136
    },
    {
      "epoch": 0.11739502999143102,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0008604267983514594,
      "loss": 0.3236,
      "step": 137
    },
    {
      "epoch": 0.11825192802056556,
      "grad_norm": 0.03076171875,
      "learning_rate": 0.0008581509717123273,
      "loss": 0.315,
      "step": 138
    },
    {
      "epoch": 0.11910882604970009,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0008558598030775857,
      "loss": 0.3124,
      "step": 139
    },
    {
      "epoch": 0.11996572407883462,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0008535533905932737,
      "loss": 0.3064,
      "step": 140
    },
    {
      "epoch": 0.12082262210796915,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0008512318330584259,
      "loss": 0.3055,
      "step": 141
    },
    {
      "epoch": 0.12167952013710369,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0008488952299208401,
      "loss": 0.2951,
      "step": 142
    },
    {
      "epoch": 0.12253641816623821,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.000846543681272818,
      "loss": 0.3288,
      "step": 143
    },
    {
      "epoch": 0.12339331619537275,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.000844177287846877,
      "loss": 0.3015,
      "step": 144
    },
    {
      "epoch": 0.12425021422450729,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0008417961510114356,
      "loss": 0.3013,
      "step": 145
    },
    {
      "epoch": 0.12510711225364182,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0008394003727664709,
      "loss": 0.2914,
      "step": 146
    },
    {
      "epoch": 0.12596401028277635,
      "grad_norm": 0.0306396484375,
      "learning_rate": 0.000836990055739149,
      "loss": 0.3018,
      "step": 147
    },
    {
      "epoch": 0.1268209083119109,
      "grad_norm": 0.039794921875,
      "learning_rate": 0.0008345653031794292,
      "loss": 0.3074,
      "step": 148
    },
    {
      "epoch": 0.12767780634104542,
      "grad_norm": 0.03955078125,
      "learning_rate": 0.0008321262189556409,
      "loss": 0.3094,
      "step": 149
    },
    {
      "epoch": 0.12853470437017994,
      "grad_norm": 0.0311279296875,
      "learning_rate": 0.0008296729075500344,
      "loss": 0.2971,
      "step": 150
    },
    {
      "epoch": 0.1293916023993145,
      "grad_norm": 0.03125,
      "learning_rate": 0.0008272054740543053,
      "loss": 0.307,
      "step": 151
    },
    {
      "epoch": 0.13024850042844902,
      "grad_norm": 0.0390625,
      "learning_rate": 0.0008247240241650918,
      "loss": 0.2955,
      "step": 152
    },
    {
      "epoch": 0.13110539845758354,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.0008222286641794488,
      "loss": 0.2955,
      "step": 153
    },
    {
      "epoch": 0.1319622964867181,
      "grad_norm": 0.0306396484375,
      "learning_rate": 0.0008197195009902923,
      "loss": 0.2904,
      "step": 154
    },
    {
      "epoch": 0.1328191945158526,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0008171966420818228,
      "loss": 0.3027,
      "step": 155
    },
    {
      "epoch": 0.13367609254498714,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0008146601955249188,
      "loss": 0.2864,
      "step": 156
    },
    {
      "epoch": 0.13453299057412169,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0008121102699725089,
      "loss": 0.2965,
      "step": 157
    },
    {
      "epoch": 0.1353898886032562,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.0008095469746549171,
      "loss": 0.3123,
      "step": 158
    },
    {
      "epoch": 0.13624678663239073,
      "grad_norm": 0.03125,
      "learning_rate": 0.0008069704193751832,
      "loss": 0.2912,
      "step": 159
    },
    {
      "epoch": 0.13710368466152528,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0008043807145043603,
      "loss": 0.309,
      "step": 160
    },
    {
      "epoch": 0.1379605826906598,
      "grad_norm": 0.0517578125,
      "learning_rate": 0.0008017779709767858,
      "loss": 0.2938,
      "step": 161
    },
    {
      "epoch": 0.13881748071979436,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.0007991623002853296,
      "loss": 0.2923,
      "step": 162
    },
    {
      "epoch": 0.13967437874892888,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.0007965338144766185,
      "loss": 0.3003,
      "step": 163
    },
    {
      "epoch": 0.1405312767780634,
      "grad_norm": 0.033447265625,
      "learning_rate": 0.0007938926261462366,
      "loss": 0.2923,
      "step": 164
    },
    {
      "epoch": 0.14138817480719795,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.0007912388484339011,
      "loss": 0.2892,
      "step": 165
    },
    {
      "epoch": 0.14224507283633248,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0007885725950186169,
      "loss": 0.3198,
      "step": 166
    },
    {
      "epoch": 0.143101970865467,
      "grad_norm": 0.037109375,
      "learning_rate": 0.000785893980113806,
      "loss": 0.2814,
      "step": 167
    },
    {
      "epoch": 0.14395886889460155,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.0007832031184624164,
      "loss": 0.2911,
      "step": 168
    },
    {
      "epoch": 0.14481576692373607,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.000780500125332005,
      "loss": 0.2893,
      "step": 169
    },
    {
      "epoch": 0.1456726649528706,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.0007777851165098011,
      "loss": 0.2884,
      "step": 170
    },
    {
      "epoch": 0.14652956298200515,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0007750582082977468,
      "loss": 0.3052,
      "step": 171
    },
    {
      "epoch": 0.14738646101113967,
      "grad_norm": 0.0419921875,
      "learning_rate": 0.0007723195175075137,
      "loss": 0.2833,
      "step": 172
    },
    {
      "epoch": 0.14824335904027422,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0007695691614555002,
      "loss": 0.2795,
      "step": 173
    },
    {
      "epoch": 0.14910025706940874,
      "grad_norm": 0.038330078125,
      "learning_rate": 0.0007668072579578058,
      "loss": 0.3104,
      "step": 174
    },
    {
      "epoch": 0.14995715509854327,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.000764033925325184,
      "loss": 0.2931,
      "step": 175
    },
    {
      "epoch": 0.15081405312767782,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0007612492823579744,
      "loss": 0.2867,
      "step": 176
    },
    {
      "epoch": 0.15167095115681234,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0007584534483410137,
      "loss": 0.3051,
      "step": 177
    },
    {
      "epoch": 0.15252784918594686,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0007556465430385259,
      "loss": 0.2852,
      "step": 178
    },
    {
      "epoch": 0.1533847472150814,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.0007528286866889924,
      "loss": 0.2919,
      "step": 179
    },
    {
      "epoch": 0.15424164524421594,
      "grad_norm": 0.0294189453125,
      "learning_rate": 0.00075,
      "loss": 0.2707,
      "step": 180
    },
    {
      "epoch": 0.15509854327335046,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.0007471606041430723,
      "loss": 0.275,
      "step": 181
    },
    {
      "epoch": 0.155955441302485,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0007443106207484776,
      "loss": 0.2718,
      "step": 182
    },
    {
      "epoch": 0.15681233933161953,
      "grad_norm": 0.031005859375,
      "learning_rate": 0.0007414501719000186,
      "loss": 0.2869,
      "step": 183
    },
    {
      "epoch": 0.15766923736075408,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0007385793801298042,
      "loss": 0.275,
      "step": 184
    },
    {
      "epoch": 0.1585261353898886,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.000735698368412999,
      "loss": 0.2852,
      "step": 185
    },
    {
      "epoch": 0.15938303341902313,
      "grad_norm": 0.031005859375,
      "learning_rate": 0.0007328072601625557,
      "loss": 0.2959,
      "step": 186
    },
    {
      "epoch": 0.16023993144815768,
      "grad_norm": 0.045654296875,
      "learning_rate": 0.00072990617922393,
      "loss": 0.2681,
      "step": 187
    },
    {
      "epoch": 0.1610968294772922,
      "grad_norm": 0.034423828125,
      "learning_rate": 0.0007269952498697733,
      "loss": 0.2897,
      "step": 188
    },
    {
      "epoch": 0.16195372750642673,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0007240745967946113,
      "loss": 0.2775,
      "step": 189
    },
    {
      "epoch": 0.16281062553556128,
      "grad_norm": 0.03564453125,
      "learning_rate": 0.0007211443451095007,
      "loss": 0.2692,
      "step": 190
    },
    {
      "epoch": 0.1636675235646958,
      "grad_norm": 0.03125,
      "learning_rate": 0.000718204620336671,
      "loss": 0.2847,
      "step": 191
    },
    {
      "epoch": 0.16452442159383032,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0007152555484041476,
      "loss": 0.2859,
      "step": 192
    },
    {
      "epoch": 0.16538131962296487,
      "grad_norm": 0.0299072265625,
      "learning_rate": 0.0007122972556403566,
      "loss": 0.2784,
      "step": 193
    },
    {
      "epoch": 0.1662382176520994,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0007093298687687141,
      "loss": 0.2801,
      "step": 194
    },
    {
      "epoch": 0.16709511568123395,
      "grad_norm": 0.027587890625,
      "learning_rate": 0.0007063535149021973,
      "loss": 0.2787,
      "step": 195
    },
    {
      "epoch": 0.16795201371036847,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0007033683215379002,
      "loss": 0.2796,
      "step": 196
    },
    {
      "epoch": 0.168808911739503,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0007003744165515704,
      "loss": 0.2817,
      "step": 197
    },
    {
      "epoch": 0.16966580976863754,
      "grad_norm": 0.037109375,
      "learning_rate": 0.0006973719281921336,
      "loss": 0.2648,
      "step": 198
    },
    {
      "epoch": 0.17052270779777207,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0006943609850761978,
      "loss": 0.2822,
      "step": 199
    },
    {
      "epoch": 0.1713796058269066,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.000691341716182545,
      "loss": 0.2867,
      "step": 200
    },
    {
      "epoch": 0.17223650385604114,
      "grad_norm": 0.03564453125,
      "learning_rate": 0.0006883142508466054,
      "loss": 0.2901,
      "step": 201
    },
    {
      "epoch": 0.17309340188517566,
      "grad_norm": 0.027099609375,
      "learning_rate": 0.0006852787187549182,
      "loss": 0.2644,
      "step": 202
    },
    {
      "epoch": 0.17395029991431019,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.000682235249939575,
      "loss": 0.277,
      "step": 203
    },
    {
      "epoch": 0.17480719794344474,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0006791839747726501,
      "loss": 0.2932,
      "step": 204
    },
    {
      "epoch": 0.17566409597257926,
      "grad_norm": 0.03466796875,
      "learning_rate": 0.0006761250239606168,
      "loss": 0.2822,
      "step": 205
    },
    {
      "epoch": 0.17652099400171378,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.0006730585285387465,
      "loss": 0.3618,
      "step": 206
    },
    {
      "epoch": 0.17737789203084833,
      "grad_norm": 0.0289306640625,
      "learning_rate": 0.000669984619865497,
      "loss": 0.2766,
      "step": 207
    },
    {
      "epoch": 0.17823479005998286,
      "grad_norm": 0.03759765625,
      "learning_rate": 0.0006669034296168854,
      "loss": 0.2795,
      "step": 208
    },
    {
      "epoch": 0.1790916880891174,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.0006638150897808468,
      "loss": 0.2788,
      "step": 209
    },
    {
      "epoch": 0.17994858611825193,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.0006607197326515808,
      "loss": 0.2795,
      "step": 210
    },
    {
      "epoch": 0.18080548414738645,
      "grad_norm": 0.027099609375,
      "learning_rate": 0.0006576174908238849,
      "loss": 0.2742,
      "step": 211
    },
    {
      "epoch": 0.181662382176521,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.0006545084971874737,
      "loss": 0.2704,
      "step": 212
    },
    {
      "epoch": 0.18251928020565553,
      "grad_norm": 0.02734375,
      "learning_rate": 0.0006513928849212874,
      "loss": 0.2725,
      "step": 213
    },
    {
      "epoch": 0.18337617823479005,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0006482707874877854,
      "loss": 0.2742,
      "step": 214
    },
    {
      "epoch": 0.1842330762639246,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0006451423386272311,
      "loss": 0.268,
      "step": 215
    },
    {
      "epoch": 0.18508997429305912,
      "grad_norm": 0.0308837890625,
      "learning_rate": 0.0006420076723519614,
      "loss": 0.2617,
      "step": 216
    },
    {
      "epoch": 0.18594687232219365,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0006388669229406462,
      "loss": 0.2629,
      "step": 217
    },
    {
      "epoch": 0.1868037703513282,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.0006357202249325371,
      "loss": 0.2812,
      "step": 218
    },
    {
      "epoch": 0.18766066838046272,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.000632567713121704,
      "loss": 0.2766,
      "step": 219
    },
    {
      "epoch": 0.18851756640959727,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.0006294095225512603,
      "loss": 0.2816,
      "step": 220
    },
    {
      "epoch": 0.1893744644387318,
      "grad_norm": 0.027099609375,
      "learning_rate": 0.000626245788507579,
      "loss": 0.2744,
      "step": 221
    },
    {
      "epoch": 0.19023136246786632,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0006230766465144965,
      "loss": 0.2777,
      "step": 222
    },
    {
      "epoch": 0.19108826049700087,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0006199022323275083,
      "loss": 0.2632,
      "step": 223
    },
    {
      "epoch": 0.1919451585261354,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.0006167226819279528,
      "loss": 0.2759,
      "step": 224
    },
    {
      "epoch": 0.1928020565552699,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0006135381315171866,
      "loss": 0.2926,
      "step": 225
    },
    {
      "epoch": 0.19365895458440446,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0006103487175107507,
      "loss": 0.2759,
      "step": 226
    },
    {
      "epoch": 0.194515852613539,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.0006071545765325253,
      "loss": 0.2706,
      "step": 227
    },
    {
      "epoch": 0.1953727506426735,
      "grad_norm": 0.0390625,
      "learning_rate": 0.0006039558454088796,
      "loss": 0.2816,
      "step": 228
    },
    {
      "epoch": 0.19622964867180806,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0006007526611628086,
      "loss": 0.2698,
      "step": 229
    },
    {
      "epoch": 0.19708654670094258,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0005975451610080642,
      "loss": 0.2719,
      "step": 230
    },
    {
      "epoch": 0.19794344473007713,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0005943334823432777,
      "loss": 0.2647,
      "step": 231
    },
    {
      "epoch": 0.19880034275921166,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0005911177627460738,
      "loss": 0.2688,
      "step": 232
    },
    {
      "epoch": 0.19965724078834618,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0005878981399671774,
      "loss": 0.2762,
      "step": 233
    },
    {
      "epoch": 0.20051413881748073,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.0005846747519245122,
      "loss": 0.2664,
      "step": 234
    },
    {
      "epoch": 0.20137103684661525,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0005814477366972944,
      "loss": 0.2715,
      "step": 235
    },
    {
      "epoch": 0.20222793487574978,
      "grad_norm": 0.03955078125,
      "learning_rate": 0.0005782172325201155,
      "loss": 0.2728,
      "step": 236
    },
    {
      "epoch": 0.20308483290488433,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0005749833777770225,
      "loss": 0.2638,
      "step": 237
    },
    {
      "epoch": 0.20394173093401885,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0005717463109955896,
      "loss": 0.271,
      "step": 238
    },
    {
      "epoch": 0.20479862896315337,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.0005685061708409841,
      "loss": 0.2682,
      "step": 239
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.000565263096110026,
      "loss": 0.2635,
      "step": 240
    },
    {
      "epoch": 0.20651242502142245,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.0005620172257252427,
      "loss": 0.2513,
      "step": 241
    },
    {
      "epoch": 0.207369323050557,
      "grad_norm": 0.02734375,
      "learning_rate": 0.0005587686987289189,
      "loss": 0.2672,
      "step": 242
    },
    {
      "epoch": 0.20822622107969152,
      "grad_norm": 0.0311279296875,
      "learning_rate": 0.0005555176542771388,
      "loss": 0.2777,
      "step": 243
    },
    {
      "epoch": 0.20908311910882604,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.0005522642316338268,
      "loss": 0.2669,
      "step": 244
    },
    {
      "epoch": 0.2099400171379606,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.0005490085701647804,
      "loss": 0.2708,
      "step": 245
    },
    {
      "epoch": 0.21079691516709512,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.0005457508093317013,
      "loss": 0.2727,
      "step": 246
    },
    {
      "epoch": 0.21165381319622964,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0005424910886862209,
      "loss": 0.2751,
      "step": 247
    },
    {
      "epoch": 0.2125107112253642,
      "grad_norm": 0.023193359375,
      "learning_rate": 0.0005392295478639225,
      "loss": 0.2649,
      "step": 248
    },
    {
      "epoch": 0.2133676092544987,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.0005359663265783598,
      "loss": 0.2647,
      "step": 249
    },
    {
      "epoch": 0.21422450728363324,
      "grad_norm": 0.0283203125,
      "learning_rate": 0.0005327015646150716,
      "loss": 0.2594,
      "step": 250
    },
    {
      "epoch": 0.2150814053127678,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.0005294354018255945,
      "loss": 0.2944,
      "step": 251
    },
    {
      "epoch": 0.2159383033419023,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.000526167978121472,
      "loss": 0.2886,
      "step": 252
    },
    {
      "epoch": 0.21679520137103683,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0005228994334682604,
      "loss": 0.2558,
      "step": 253
    },
    {
      "epoch": 0.21765209940017138,
      "grad_norm": 0.025634765625,
      "learning_rate": 0.0005196299078795343,
      "loss": 0.2571,
      "step": 254
    },
    {
      "epoch": 0.2185089974293059,
      "grad_norm": 0.02685546875,
      "learning_rate": 0.0005163595414108881,
      "loss": 0.2524,
      "step": 255
    },
    {
      "epoch": 0.21936589545844046,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.0005130884741539367,
      "loss": 0.2698,
      "step": 256
    },
    {
      "epoch": 0.22022279348757498,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0005098168462303141,
      "loss": 0.2716,
      "step": 257
    },
    {
      "epoch": 0.2210796915167095,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.0005065447977856722,
      "loss": 0.2605,
      "step": 258
    },
    {
      "epoch": 0.22193658954584405,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.0005032724689836759,
      "loss": 0.2584,
      "step": 259
    },
    {
      "epoch": 0.22279348757497858,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.0005,
      "loss": 0.2618,
      "step": 260
    },
    {
      "epoch": 0.2236503856041131,
      "grad_norm": 0.0289306640625,
      "learning_rate": 0.0004967275310163241,
      "loss": 0.2602,
      "step": 261
    },
    {
      "epoch": 0.22450728363324765,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0004934552022143279,
      "loss": 0.2744,
      "step": 262
    },
    {
      "epoch": 0.22536418166238217,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.0004901831537696859,
      "loss": 0.2598,
      "step": 263
    },
    {
      "epoch": 0.2262210796915167,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0004869115258460635,
      "loss": 0.2629,
      "step": 264
    },
    {
      "epoch": 0.22707797772065125,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.00048364045858911197,
      "loss": 0.2601,
      "step": 265
    },
    {
      "epoch": 0.22793487574978577,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.00048037009212046586,
      "loss": 0.261,
      "step": 266
    },
    {
      "epoch": 0.22879177377892032,
      "grad_norm": 0.03076171875,
      "learning_rate": 0.0004771005665317397,
      "loss": 0.2531,
      "step": 267
    },
    {
      "epoch": 0.22964867180805484,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.0004738320218785281,
      "loss": 0.2707,
      "step": 268
    },
    {
      "epoch": 0.23050556983718937,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.00047056459817440544,
      "loss": 0.2636,
      "step": 269
    },
    {
      "epoch": 0.23136246786632392,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.00046729843538492847,
      "loss": 0.2606,
      "step": 270
    },
    {
      "epoch": 0.23221936589545844,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.00046403367342164026,
      "loss": 0.257,
      "step": 271
    },
    {
      "epoch": 0.23307626392459296,
      "grad_norm": 0.028564453125,
      "learning_rate": 0.0004607704521360776,
      "loss": 0.2646,
      "step": 272
    },
    {
      "epoch": 0.23393316195372751,
      "grad_norm": 0.028076171875,
      "learning_rate": 0.0004575089113137792,
      "loss": 0.2735,
      "step": 273
    },
    {
      "epoch": 0.23479005998286204,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.00045424919066829885,
      "loss": 0.272,
      "step": 274
    },
    {
      "epoch": 0.23564695801199656,
      "grad_norm": 0.02587890625,
      "learning_rate": 0.0004509914298352197,
      "loss": 0.266,
      "step": 275
    },
    {
      "epoch": 0.2365038560411311,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.00044773576836617336,
      "loss": 0.2607,
      "step": 276
    },
    {
      "epoch": 0.23736075407026563,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.0004444823457228612,
      "loss": 0.2696,
      "step": 277
    },
    {
      "epoch": 0.23821765209940018,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.00044123130127108126,
      "loss": 0.2598,
      "step": 278
    },
    {
      "epoch": 0.2390745501285347,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.0004379827742747575,
      "loss": 0.2581,
      "step": 279
    },
    {
      "epoch": 0.23993144815766923,
      "grad_norm": 0.0252685546875,
      "learning_rate": 0.00043473690388997434,
      "loss": 0.2652,
      "step": 280
    },
    {
      "epoch": 0.24078834618680378,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0004314938291590161,
      "loss": 0.2635,
      "step": 281
    },
    {
      "epoch": 0.2416452442159383,
      "grad_norm": 0.0223388671875,
      "learning_rate": 0.0004282536890044104,
      "loss": 0.2546,
      "step": 282
    },
    {
      "epoch": 0.24250214224507283,
      "grad_norm": 0.0234375,
      "learning_rate": 0.0004250166222229774,
      "loss": 0.2533,
      "step": 283
    },
    {
      "epoch": 0.24335904027420738,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.0004217827674798845,
      "loss": 0.2712,
      "step": 284
    },
    {
      "epoch": 0.2442159383033419,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0004185522633027057,
      "loss": 0.2658,
      "step": 285
    },
    {
      "epoch": 0.24507283633247642,
      "grad_norm": 0.02587890625,
      "learning_rate": 0.0004153252480754877,
      "loss": 0.2588,
      "step": 286
    },
    {
      "epoch": 0.24592973436161097,
      "grad_norm": 0.029541015625,
      "learning_rate": 0.00041210186003282274,
      "loss": 0.2671,
      "step": 287
    },
    {
      "epoch": 0.2467866323907455,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.00040888223725392626,
      "loss": 0.2741,
      "step": 288
    },
    {
      "epoch": 0.24764353041988005,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.00040566651765672245,
      "loss": 0.27,
      "step": 289
    },
    {
      "epoch": 0.24850042844901457,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00040245483899193594,
      "loss": 0.2679,
      "step": 290
    },
    {
      "epoch": 0.2493573264781491,
      "grad_norm": 0.0224609375,
      "learning_rate": 0.00039924733883719147,
      "loss": 0.2685,
      "step": 291
    },
    {
      "epoch": 0.25021422450728364,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.0003960441545911204,
      "loss": 0.2687,
      "step": 292
    },
    {
      "epoch": 0.25107112253641817,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0003928454234674747,
      "loss": 0.2554,
      "step": 293
    },
    {
      "epoch": 0.2519280205655527,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0003896512824892495,
      "loss": 0.268,
      "step": 294
    },
    {
      "epoch": 0.2527849185946872,
      "grad_norm": 0.0296630859375,
      "learning_rate": 0.00038646186848281344,
      "loss": 0.2694,
      "step": 295
    },
    {
      "epoch": 0.2536418166238218,
      "grad_norm": 0.0283203125,
      "learning_rate": 0.00038327731807204744,
      "loss": 0.2585,
      "step": 296
    },
    {
      "epoch": 0.2544987146529563,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0003800977676724919,
      "loss": 0.2764,
      "step": 297
    },
    {
      "epoch": 0.25535561268209084,
      "grad_norm": 0.024658203125,
      "learning_rate": 0.0003769233534855035,
      "loss": 0.2688,
      "step": 298
    },
    {
      "epoch": 0.25621251071122536,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.00037375421149242103,
      "loss": 0.2561,
      "step": 299
    },
    {
      "epoch": 0.2570694087403599,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.0003705904774487396,
      "loss": 0.2564,
      "step": 300
    },
    {
      "epoch": 0.2579263067694944,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.0003674322868782959,
      "loss": 0.2543,
      "step": 301
    },
    {
      "epoch": 0.258783204798629,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.0003642797750674629,
      "loss": 0.2586,
      "step": 302
    },
    {
      "epoch": 0.2596401028277635,
      "grad_norm": 0.0228271484375,
      "learning_rate": 0.00036113307705935393,
      "loss": 0.2624,
      "step": 303
    },
    {
      "epoch": 0.26049700085689803,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.0003579923276480387,
      "loss": 0.2658,
      "step": 304
    },
    {
      "epoch": 0.26135389888603255,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.0003548576613727689,
      "loss": 0.2793,
      "step": 305
    },
    {
      "epoch": 0.2622107969151671,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0003517292125122146,
      "loss": 0.2605,
      "step": 306
    },
    {
      "epoch": 0.26306769494430166,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0003486071150787128,
      "loss": 0.2654,
      "step": 307
    },
    {
      "epoch": 0.2639245929734362,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.00034549150281252633,
      "loss": 0.2711,
      "step": 308
    },
    {
      "epoch": 0.2647814910025707,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.0003423825091761153,
      "loss": 0.2686,
      "step": 309
    },
    {
      "epoch": 0.2656383890317052,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.0003392802673484193,
      "loss": 0.2539,
      "step": 310
    },
    {
      "epoch": 0.26649528706083975,
      "grad_norm": 0.023193359375,
      "learning_rate": 0.0003361849102191533,
      "loss": 0.2706,
      "step": 311
    },
    {
      "epoch": 0.26735218508997427,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.00033309657038311456,
      "loss": 0.2854,
      "step": 312
    },
    {
      "epoch": 0.26820908311910885,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.00033001538013450283,
      "loss": 0.2714,
      "step": 313
    },
    {
      "epoch": 0.26906598114824337,
      "grad_norm": 0.0213623046875,
      "learning_rate": 0.0003269414714612534,
      "loss": 0.2624,
      "step": 314
    },
    {
      "epoch": 0.2699228791773779,
      "grad_norm": 0.0224609375,
      "learning_rate": 0.00032387497603938325,
      "loss": 0.264,
      "step": 315
    },
    {
      "epoch": 0.2707797772065124,
      "grad_norm": 0.022705078125,
      "learning_rate": 0.00032081602522734986,
      "loss": 0.2611,
      "step": 316
    },
    {
      "epoch": 0.27163667523564694,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.0003177647500604252,
      "loss": 0.2697,
      "step": 317
    },
    {
      "epoch": 0.27249357326478146,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.00031472128124508187,
      "loss": 0.2684,
      "step": 318
    },
    {
      "epoch": 0.27335047129391604,
      "grad_norm": 0.0289306640625,
      "learning_rate": 0.00031168574915339467,
      "loss": 0.2627,
      "step": 319
    },
    {
      "epoch": 0.27420736932305056,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.0003086582838174551,
      "loss": 0.2661,
      "step": 320
    },
    {
      "epoch": 0.2750642673521851,
      "grad_norm": 0.023193359375,
      "learning_rate": 0.0003056390149238022,
      "loss": 0.2733,
      "step": 321
    },
    {
      "epoch": 0.2759211653813196,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.00030262807180786645,
      "loss": 0.2619,
      "step": 322
    },
    {
      "epoch": 0.27677806341045413,
      "grad_norm": 0.0218505859375,
      "learning_rate": 0.00029962558344842963,
      "loss": 0.2607,
      "step": 323
    },
    {
      "epoch": 0.2776349614395887,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.0002966316784621,
      "loss": 0.2662,
      "step": 324
    },
    {
      "epoch": 0.27849185946872324,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0002936464850978027,
      "loss": 0.2581,
      "step": 325
    },
    {
      "epoch": 0.27934875749785776,
      "grad_norm": 0.0223388671875,
      "learning_rate": 0.0002906701312312861,
      "loss": 0.2662,
      "step": 326
    },
    {
      "epoch": 0.2802056555269923,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.00028770274435964356,
      "loss": 0.26,
      "step": 327
    },
    {
      "epoch": 0.2810625535561268,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0002847444515958523,
      "loss": 0.2645,
      "step": 328
    },
    {
      "epoch": 0.2819194515852613,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.0002817953796633289,
      "loss": 0.2635,
      "step": 329
    },
    {
      "epoch": 0.2827763496143959,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.00027885565489049947,
      "loss": 0.2619,
      "step": 330
    },
    {
      "epoch": 0.28363324764353043,
      "grad_norm": 0.0213623046875,
      "learning_rate": 0.0002759254032053888,
      "loss": 0.2668,
      "step": 331
    },
    {
      "epoch": 0.28449014567266495,
      "grad_norm": 0.0216064453125,
      "learning_rate": 0.00027300475013022663,
      "loss": 0.2553,
      "step": 332
    },
    {
      "epoch": 0.2853470437017995,
      "grad_norm": 0.0228271484375,
      "learning_rate": 0.0002700938207760701,
      "loss": 0.2614,
      "step": 333
    },
    {
      "epoch": 0.286203941730934,
      "grad_norm": 0.02587890625,
      "learning_rate": 0.0002671927398374443,
      "loss": 0.2541,
      "step": 334
    },
    {
      "epoch": 0.2870608397600686,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.00026430163158700117,
      "loss": 0.256,
      "step": 335
    },
    {
      "epoch": 0.2879177377892031,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.00026142061987019576,
      "loss": 0.2675,
      "step": 336
    },
    {
      "epoch": 0.2887746358183376,
      "grad_norm": 0.0284423828125,
      "learning_rate": 0.0002585498280999815,
      "loss": 0.2666,
      "step": 337
    },
    {
      "epoch": 0.28963153384747214,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.0002556893792515227,
      "loss": 0.2888,
      "step": 338
    },
    {
      "epoch": 0.29048843187660667,
      "grad_norm": 0.030029296875,
      "learning_rate": 0.00025283939585692784,
      "loss": 0.2674,
      "step": 339
    },
    {
      "epoch": 0.2913453299057412,
      "grad_norm": 0.0245361328125,
      "learning_rate": 0.0002500000000000001,
      "loss": 0.2624,
      "step": 340
    },
    {
      "epoch": 0.29220222793487577,
      "grad_norm": 0.0205078125,
      "learning_rate": 0.0002471713133110078,
      "loss": 0.2457,
      "step": 341
    },
    {
      "epoch": 0.2930591259640103,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.00024435345696147403,
      "loss": 0.2567,
      "step": 342
    },
    {
      "epoch": 0.2939160239931448,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.00024154655165898627,
      "loss": 0.2569,
      "step": 343
    },
    {
      "epoch": 0.29477292202227934,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.00023875071764202561,
      "loss": 0.2583,
      "step": 344
    },
    {
      "epoch": 0.29562982005141386,
      "grad_norm": 0.0230712890625,
      "learning_rate": 0.00023596607467481602,
      "loss": 0.2549,
      "step": 345
    },
    {
      "epoch": 0.29648671808054844,
      "grad_norm": 0.030517578125,
      "learning_rate": 0.00023319274204219425,
      "loss": 0.2647,
      "step": 346
    },
    {
      "epoch": 0.29734361610968296,
      "grad_norm": 0.0284423828125,
      "learning_rate": 0.00023043083854449987,
      "loss": 0.2848,
      "step": 347
    },
    {
      "epoch": 0.2982005141388175,
      "grad_norm": 0.026123046875,
      "learning_rate": 0.00022768048249248646,
      "loss": 0.2724,
      "step": 348
    },
    {
      "epoch": 0.299057412167952,
      "grad_norm": 0.027587890625,
      "learning_rate": 0.00022494179170225333,
      "loss": 0.2684,
      "step": 349
    },
    {
      "epoch": 0.29991431019708653,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.00022221488349019903,
      "loss": 0.2623,
      "step": 350
    },
    {
      "epoch": 0.30077120822622105,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.0002194998746679952,
      "loss": 0.2608,
      "step": 351
    },
    {
      "epoch": 0.30162810625535563,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0002167968815375837,
      "loss": 0.2671,
      "step": 352
    },
    {
      "epoch": 0.30248500428449016,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.00021410601988619394,
      "loss": 0.2583,
      "step": 353
    },
    {
      "epoch": 0.3033419023136247,
      "grad_norm": 0.02490234375,
      "learning_rate": 0.00021142740498138323,
      "loss": 0.2617,
      "step": 354
    },
    {
      "epoch": 0.3041988003427592,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.000208761151566099,
      "loss": 0.2569,
      "step": 355
    },
    {
      "epoch": 0.3050556983718937,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.00020610737385376348,
      "loss": 0.2612,
      "step": 356
    },
    {
      "epoch": 0.3059125964010283,
      "grad_norm": 0.02783203125,
      "learning_rate": 0.00020346618552338148,
      "loss": 0.2629,
      "step": 357
    },
    {
      "epoch": 0.3067694944301628,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00020083769971467047,
      "loss": 0.2629,
      "step": 358
    },
    {
      "epoch": 0.30762639245929735,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.0001982220290232143,
      "loss": 0.2847,
      "step": 359
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 0.026611328125,
      "learning_rate": 0.00019561928549563967,
      "loss": 0.266,
      "step": 360
    },
    {
      "epoch": 0.3093401885175664,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.00019302958062481672,
      "loss": 0.2563,
      "step": 361
    },
    {
      "epoch": 0.3101970865467009,
      "grad_norm": 0.031005859375,
      "learning_rate": 0.00019045302534508295,
      "loss": 0.2696,
      "step": 362
    },
    {
      "epoch": 0.3110539845758355,
      "grad_norm": 0.0234375,
      "learning_rate": 0.0001878897300274911,
      "loss": 0.2636,
      "step": 363
    },
    {
      "epoch": 0.31191088260497,
      "grad_norm": 0.0220947265625,
      "learning_rate": 0.00018533980447508135,
      "loss": 0.258,
      "step": 364
    },
    {
      "epoch": 0.31276778063410454,
      "grad_norm": 0.022705078125,
      "learning_rate": 0.00018280335791817732,
      "loss": 0.2534,
      "step": 365
    },
    {
      "epoch": 0.31362467866323906,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00018028049900970766,
      "loss": 0.2709,
      "step": 366
    },
    {
      "epoch": 0.3144815766923736,
      "grad_norm": 0.021728515625,
      "learning_rate": 0.0001777713358205514,
      "loss": 0.2708,
      "step": 367
    },
    {
      "epoch": 0.31533847472150817,
      "grad_norm": 0.0205078125,
      "learning_rate": 0.00017527597583490823,
      "loss": 0.2587,
      "step": 368
    },
    {
      "epoch": 0.3161953727506427,
      "grad_norm": 0.020263671875,
      "learning_rate": 0.00017279452594569483,
      "loss": 0.2597,
      "step": 369
    },
    {
      "epoch": 0.3170522707797772,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.00017032709244996558,
      "loss": 0.2611,
      "step": 370
    },
    {
      "epoch": 0.31790916880891174,
      "grad_norm": 0.021484375,
      "learning_rate": 0.00016787378104435928,
      "loss": 0.2697,
      "step": 371
    },
    {
      "epoch": 0.31876606683804626,
      "grad_norm": 0.020751953125,
      "learning_rate": 0.00016543469682057105,
      "loss": 0.2641,
      "step": 372
    },
    {
      "epoch": 0.3196229648671808,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.00016300994426085103,
      "loss": 0.2658,
      "step": 373
    },
    {
      "epoch": 0.32047986289631536,
      "grad_norm": 0.0213623046875,
      "learning_rate": 0.0001605996272335291,
      "loss": 0.2641,
      "step": 374
    },
    {
      "epoch": 0.3213367609254499,
      "grad_norm": 0.0184326171875,
      "learning_rate": 0.00015820384898856434,
      "loss": 0.2651,
      "step": 375
    },
    {
      "epoch": 0.3221936589545844,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.00015582271215312294,
      "loss": 0.2559,
      "step": 376
    },
    {
      "epoch": 0.32305055698371893,
      "grad_norm": 0.023681640625,
      "learning_rate": 0.00015345631872718213,
      "loss": 0.2558,
      "step": 377
    },
    {
      "epoch": 0.32390745501285345,
      "grad_norm": 0.0252685546875,
      "learning_rate": 0.00015110477007916002,
      "loss": 0.2537,
      "step": 378
    },
    {
      "epoch": 0.32476435304198803,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.0001487681669415742,
      "loss": 0.2565,
      "step": 379
    },
    {
      "epoch": 0.32562125107112255,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.00014644660940672628,
      "loss": 0.2562,
      "step": 380
    },
    {
      "epoch": 0.3264781491002571,
      "grad_norm": 0.0301513671875,
      "learning_rate": 0.00014414019692241437,
      "loss": 0.2644,
      "step": 381
    },
    {
      "epoch": 0.3273350471293916,
|
"grad_norm": 0.019287109375, |
|
"learning_rate": 0.00014184902828767287, |
|
"loss": 0.2671, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.3281919451585261, |
|
"grad_norm": 0.0262451171875, |
|
"learning_rate": 0.0001395732016485406, |
|
"loss": 0.249, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 0.0001373128144938563, |
|
"loss": 0.2558, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.3299057412167952, |
|
"grad_norm": 0.021728515625, |
|
"learning_rate": 0.00013506796365108232, |
|
"loss": 0.2693, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.33076263924592975, |
|
"grad_norm": 0.021484375, |
|
"learning_rate": 0.00013283874528215734, |
|
"loss": 0.2686, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.33161953727506427, |
|
"grad_norm": 0.02587890625, |
|
"learning_rate": 0.00013062525487937698, |
|
"loss": 0.2711, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.3324764353041988, |
|
"grad_norm": 0.018798828125, |
|
"learning_rate": 0.00012842758726130281, |
|
"loss": 0.2559, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.0189208984375, |
|
"learning_rate": 0.00012624583656870153, |
|
"loss": 0.2639, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.3341902313624679, |
|
"grad_norm": 0.0196533203125, |
|
"learning_rate": 0.00012408009626051135, |
|
"loss": 0.2681, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3350471293916024, |
|
"grad_norm": 0.02001953125, |
|
"learning_rate": 0.00012193045910983863, |
|
"loss": 0.2629, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.33590402742073694, |
|
"grad_norm": 0.019775390625, |
|
"learning_rate": 0.00011979701719998454, |
|
"loss": 0.2671, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.33676092544987146, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 0.00011767986192049984, |
|
"loss": 0.2651, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.337617823479006, |
|
"grad_norm": 0.0201416015625, |
|
"learning_rate": 0.00011557908396327027, |
|
"loss": 0.2646, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.3384747215081405, |
|
"grad_norm": 0.0257568359375, |
|
"learning_rate": 0.00011349477331863151, |
|
"loss": 0.2723, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3393316195372751, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 0.00011142701927151455, |
|
"loss": 0.2775, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.3401885175664096, |
|
"grad_norm": 0.032470703125, |
|
"learning_rate": 0.00010937591039762085, |
|
"loss": 0.265, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.34104541559554413, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 0.00010734153455962764, |
|
"loss": 0.2661, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.34190231362467866, |
|
"grad_norm": 0.0184326171875, |
|
"learning_rate": 0.00010532397890342504, |
|
"loss": 0.2526, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.3427592116538132, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 0.00010332332985438247, |
|
"loss": 0.2583, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.34361610968294776, |
|
"grad_norm": 0.0216064453125, |
|
"learning_rate": 0.0001013396731136465, |
|
"loss": 0.2544, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.3444730077120823, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 9.937309365446973e-05, |
|
"loss": 0.2796, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.3453299057412168, |
|
"grad_norm": 0.0189208984375, |
|
"learning_rate": 9.742367571857092e-05, |
|
"loss": 0.2611, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.3461868037703513, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 9.549150281252633e-05, |
|
"loss": 0.2568, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.34704370179948585, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 9.357665770419243e-05, |
|
"loss": 0.2661, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.34790059982862037, |
|
"grad_norm": 0.0245361328125, |
|
"learning_rate": 9.167922241916055e-05, |
|
"loss": 0.27, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.34875749785775495, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 8.979927823724321e-05, |
|
"loss": 0.2665, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.3496143958868895, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 8.793690568899215e-05, |
|
"loss": 0.26, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.350471293916024, |
|
"grad_norm": 0.029052734375, |
|
"learning_rate": 8.609218455224893e-05, |
|
"loss": 0.2625, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.3513281919451585, |
|
"grad_norm": 0.019287109375, |
|
"learning_rate": 8.426519384872733e-05, |
|
"loss": 0.2581, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.35218508997429304, |
|
"grad_norm": 0.02490234375, |
|
"learning_rate": 8.24560118406285e-05, |
|
"loss": 0.2629, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.35304198800342756, |
|
"grad_norm": 0.020263671875, |
|
"learning_rate": 8.066471602728804e-05, |
|
"loss": 0.2522, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.35389888603256214, |
|
"grad_norm": 0.0245361328125, |
|
"learning_rate": 7.889138314185678e-05, |
|
"loss": 0.2648, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.35475578406169667, |
|
"grad_norm": 0.0191650390625, |
|
"learning_rate": 7.71360891480134e-05, |
|
"loss": 0.2633, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.3556126820908312, |
|
"grad_norm": 0.02099609375, |
|
"learning_rate": 7.53989092367106e-05, |
|
"loss": 0.2681, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3564695801199657, |
|
"grad_norm": 0.0294189453125, |
|
"learning_rate": 7.367991782295391e-05, |
|
"loss": 0.2681, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.35732647814910024, |
|
"grad_norm": 0.020751953125, |
|
"learning_rate": 7.197918854261431e-05, |
|
"loss": 0.2531, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.3581833761782348, |
|
"grad_norm": 0.021728515625, |
|
"learning_rate": 7.029679424927366e-05, |
|
"loss": 0.2607, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.35904027420736934, |
|
"grad_norm": 0.029052734375, |
|
"learning_rate": 6.863280701110408e-05, |
|
"loss": 0.2623, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.35989717223650386, |
|
"grad_norm": 0.0211181640625, |
|
"learning_rate": 6.698729810778065e-05, |
|
"loss": 0.2641, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3607540702656384, |
|
"grad_norm": 0.019775390625, |
|
"learning_rate": 6.536033802742814e-05, |
|
"loss": 0.2809, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3616109682947729, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 6.375199646360142e-05, |
|
"loss": 0.2679, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.36246786632390743, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 6.21623423123001e-05, |
|
"loss": 0.3452, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.363324764353042, |
|
"grad_norm": 0.0211181640625, |
|
"learning_rate": 6.059144366901737e-05, |
|
"loss": 0.2508, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.36418166238217653, |
|
"grad_norm": 0.01806640625, |
|
"learning_rate": 5.903936782582253e-05, |
|
"loss": 0.2516, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.36503856041131105, |
|
"grad_norm": 0.0203857421875, |
|
"learning_rate": 5.750618126847912e-05, |
|
"loss": 0.2633, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.3658954584404456, |
|
"grad_norm": 0.0186767578125, |
|
"learning_rate": 5.599194967359639e-05, |
|
"loss": 0.263, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3667523564695801, |
|
"grad_norm": 0.0257568359375, |
|
"learning_rate": 5.449673790581611e-05, |
|
"loss": 0.2754, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3676092544987147, |
|
"grad_norm": 0.023193359375, |
|
"learning_rate": 5.3020610015033946e-05, |
|
"loss": 0.2628, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.3684661525278492, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 5.1563629233655876e-05, |
|
"loss": 0.2775, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3693230505569837, |
|
"grad_norm": 0.0213623046875, |
|
"learning_rate": 5.0125857973889355e-05, |
|
"loss": 0.2529, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.37017994858611825, |
|
"grad_norm": 0.0189208984375, |
|
"learning_rate": 4.87073578250698e-05, |
|
"loss": 0.2672, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.37103684661525277, |
|
"grad_norm": 0.023193359375, |
|
"learning_rate": 4.730818955102234e-05, |
|
"loss": 0.2576, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.3718937446443873, |
|
"grad_norm": 0.027587890625, |
|
"learning_rate": 4.592841308745932e-05, |
|
"loss": 0.2575, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.37275064267352187, |
|
"grad_norm": 0.025146484375, |
|
"learning_rate": 4.456808753941205e-05, |
|
"loss": 0.257, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3736075407026564, |
|
"grad_norm": 0.0201416015625, |
|
"learning_rate": 4.322727117869951e-05, |
|
"loss": 0.2661, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.3744644387317909, |
|
"grad_norm": 0.0302734375, |
|
"learning_rate": 4.190602144143207e-05, |
|
"loss": 0.278, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.37532133676092544, |
|
"grad_norm": 0.0250244140625, |
|
"learning_rate": 4.06043949255509e-05, |
|
"loss": 0.2695, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.37617823479005996, |
|
"grad_norm": 0.0216064453125, |
|
"learning_rate": 3.932244738840379e-05, |
|
"loss": 0.2559, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.37703513281919454, |
|
"grad_norm": 0.020751953125, |
|
"learning_rate": 3.806023374435663e-05, |
|
"loss": 0.2721, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.37789203084832906, |
|
"grad_norm": 0.025146484375, |
|
"learning_rate": 3.681780806244095e-05, |
|
"loss": 0.2479, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.3787489288774636, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 3.559522356403788e-05, |
|
"loss": 0.2686, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.3796058269065981, |
|
"grad_norm": 0.018798828125, |
|
"learning_rate": 3.439253262059822e-05, |
|
"loss": 0.2404, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.38046272493573263, |
|
"grad_norm": 0.021240234375, |
|
"learning_rate": 3.3209786751399184e-05, |
|
"loss": 0.2702, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.38131962296486716, |
|
"grad_norm": 0.0194091796875, |
|
"learning_rate": 3.2047036621337236e-05, |
|
"loss": 0.2568, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.38217652099400173, |
|
"grad_norm": 0.022216796875, |
|
"learning_rate": 3.0904332038757974e-05, |
|
"loss": 0.2586, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.38303341902313626, |
|
"grad_norm": 0.0205078125, |
|
"learning_rate": 2.9781721953322627e-05, |
|
"loss": 0.2557, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.3838903170522708, |
|
"grad_norm": 0.018798828125, |
|
"learning_rate": 2.8679254453910786e-05, |
|
"loss": 0.2515, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3847472150814053, |
|
"grad_norm": 0.0186767578125, |
|
"learning_rate": 2.7596976766560976e-05, |
|
"loss": 0.2532, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.3856041131105398, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 2.653493525244721e-05, |
|
"loss": 0.2555, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3864610111396744, |
|
"grad_norm": 0.0218505859375, |
|
"learning_rate": 2.5493175405893076e-05, |
|
"loss": 0.2469, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.3873179091688089, |
|
"grad_norm": 0.0242919921875, |
|
"learning_rate": 2.4471741852423235e-05, |
|
"loss": 0.2566, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.38817480719794345, |
|
"grad_norm": 0.020751953125, |
|
"learning_rate": 2.3470678346851513e-05, |
|
"loss": 0.273, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.389031705227078, |
|
"grad_norm": 0.01904296875, |
|
"learning_rate": 2.2490027771406685e-05, |
|
"loss": 0.2599, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.3898886032562125, |
|
"grad_norm": 0.021728515625, |
|
"learning_rate": 2.152983213389559e-05, |
|
"loss": 0.2591, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.390745501285347, |
|
"grad_norm": 0.01953125, |
|
"learning_rate": 2.0590132565903473e-05, |
|
"loss": 0.2733, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.3916023993144816, |
|
"grad_norm": 0.019287109375, |
|
"learning_rate": 1.9670969321032406e-05, |
|
"loss": 0.2603, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.3924592973436161, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 1.8772381773176416e-05, |
|
"loss": 0.2568, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.39331619537275064, |
|
"grad_norm": 0.022216796875, |
|
"learning_rate": 1.7894408414835363e-05, |
|
"loss": 0.2858, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.39417309340188517, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 1.70370868554659e-05, |
|
"loss": 0.2589, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3950299914310197, |
|
"grad_norm": 0.01904296875, |
|
"learning_rate": 1.620045381987012e-05, |
|
"loss": 0.2503, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.39588688946015427, |
|
"grad_norm": 0.0205078125, |
|
"learning_rate": 1.538454514662285e-05, |
|
"loss": 0.2695, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.3967437874892888, |
|
"grad_norm": 0.0211181640625, |
|
"learning_rate": 1.4589395786535953e-05, |
|
"loss": 0.2616, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.3976006855184233, |
|
"grad_norm": 0.019287109375, |
|
"learning_rate": 1.3815039801161721e-05, |
|
"loss": 0.2542, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.39845758354755784, |
|
"grad_norm": 0.0244140625, |
|
"learning_rate": 1.3061510361333184e-05, |
|
"loss": 0.254, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.39931448157669236, |
|
"grad_norm": 0.0216064453125, |
|
"learning_rate": 1.232883974574367e-05, |
|
"loss": 0.2671, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.4001713796058269, |
|
"grad_norm": 0.0184326171875, |
|
"learning_rate": 1.1617059339563806e-05, |
|
"loss": 0.2515, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.40102827763496146, |
|
"grad_norm": 0.0201416015625, |
|
"learning_rate": 1.0926199633097156e-05, |
|
"loss": 0.2528, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.401885175664096, |
|
"grad_norm": 0.02001953125, |
|
"learning_rate": 1.0256290220474307e-05, |
|
"loss": 0.2661, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.4027420736932305, |
|
"grad_norm": 0.01953125, |
|
"learning_rate": 9.607359798384786e-06, |
|
"loss": 0.2616, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.40359897172236503, |
|
"grad_norm": 0.0208740234375, |
|
"learning_rate": 8.979436164848088e-06, |
|
"loss": 0.2668, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.40445586975149955, |
|
"grad_norm": 0.0196533203125, |
|
"learning_rate": 8.372546218022748e-06, |
|
"loss": 0.2446, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.40531276778063413, |
|
"grad_norm": 0.0181884765625, |
|
"learning_rate": 7.786715955054202e-06, |
|
"loss": 0.2594, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.40616966580976865, |
|
"grad_norm": 0.019775390625, |
|
"learning_rate": 7.221970470961125e-06, |
|
"loss": 0.2543, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4070265638389032, |
|
"grad_norm": 0.01904296875, |
|
"learning_rate": 6.678333957560512e-06, |
|
"loss": 0.267, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4078834618680377, |
|
"grad_norm": 0.02099609375, |
|
"learning_rate": 6.15582970243117e-06, |
|
"loss": 0.2606, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.4087403598971722, |
|
"grad_norm": 0.024169921875, |
|
"learning_rate": 5.6544800879163026e-06, |
|
"loss": 0.2652, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.40959725792630675, |
|
"grad_norm": 0.0201416015625, |
|
"learning_rate": 5.174306590164879e-06, |
|
"loss": 0.2613, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4104541559554413, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 4.715329778211374e-06, |
|
"loss": 0.2791, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.41131105398457585, |
|
"grad_norm": 0.0194091796875, |
|
"learning_rate": 4.277569313094809e-06, |
|
"loss": 0.2666, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.41216795201371037, |
|
"grad_norm": 0.0213623046875, |
|
"learning_rate": 3.861043947016474e-06, |
|
"loss": 0.2592, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.4130248500428449, |
|
"grad_norm": 0.02294921875, |
|
"learning_rate": 3.4657715225368535e-06, |
|
"loss": 0.2629, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.4138817480719794, |
|
"grad_norm": 0.0211181640625, |
|
"learning_rate": 3.09176897181096e-06, |
|
"loss": 0.2624, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.414738646101114, |
|
"grad_norm": 0.017578125, |
|
"learning_rate": 2.739052315863355e-06, |
|
"loss": 0.2556, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.4155955441302485, |
|
"grad_norm": 0.02099609375, |
|
"learning_rate": 2.4076366639015913e-06, |
|
"loss": 0.2665, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.41645244215938304, |
|
"grad_norm": 0.01904296875, |
|
"learning_rate": 2.097536212669171e-06, |
|
"loss": 0.2584, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.41730934018851756, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.8087642458373132e-06, |
|
"loss": 0.263, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.4181662382176521, |
|
"grad_norm": 0.0218505859375, |
|
"learning_rate": 1.541333133436018e-06, |
|
"loss": 0.2611, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.4190231362467866, |
|
"grad_norm": 0.01806640625, |
|
"learning_rate": 1.2952543313240472e-06, |
|
"loss": 0.255, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.4198800342759212, |
|
"grad_norm": 0.0194091796875, |
|
"learning_rate": 1.0705383806982606e-06, |
|
"loss": 0.2719, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4207369323050557, |
|
"grad_norm": 0.0206298828125, |
|
"learning_rate": 8.671949076420882e-07, |
|
"loss": 0.2695, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.42159383033419023, |
|
"grad_norm": 0.0198974609375, |
|
"learning_rate": 6.852326227130834e-07, |
|
"loss": 0.2709, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.42245072836332476, |
|
"grad_norm": 0.0272216796875, |
|
"learning_rate": 5.246593205699424e-07, |
|
"loss": 0.2517, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.4233076263924593, |
|
"grad_norm": 0.0211181640625, |
|
"learning_rate": 3.854818796385495e-07, |
|
"loss": 0.2614, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.4241645244215938, |
|
"grad_norm": 0.0208740234375, |
|
"learning_rate": 2.677062618171577e-07, |
|
"loss": 0.2542, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.4250214224507284, |
|
"grad_norm": 0.0194091796875, |
|
"learning_rate": 1.7133751222137007e-07, |
|
"loss": 0.2673, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.4258783204798629, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 9.637975896759077e-08, |
|
"loss": 0.2686, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.4267352185089974, |
|
"grad_norm": 0.0184326171875, |
|
"learning_rate": 4.283621299649987e-08, |
|
"loss": 0.2779, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.42759211653813195, |
|
"grad_norm": 0.0191650390625, |
|
"learning_rate": 1.0709167935385456e-08, |
|
"loss": 0.2736, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.4284490145672665, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 0.0, |
|
"loss": 0.2556, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4284490145672665, |
|
"step": 500, |
|
"total_flos": 4.430379024908288e+19, |
|
"train_loss": 0.41707064187526705, |
|
"train_runtime": 21021.7192, |
|
"train_samples_per_second": 0.381, |
|
"train_steps_per_second": 0.024 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.430379024908288e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|