|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.2107792653621696, |
|
"eval_steps": 91, |
|
"global_step": 1183, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027463096464126332, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.6232, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0027463096464126332, |
|
"eval_loss": 0.6296440362930298, |
|
"eval_runtime": 599.6568, |
|
"eval_samples_per_second": 15.285, |
|
"eval_steps_per_second": 15.285, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0054926192928252664, |
|
"grad_norm": 0.060546875, |
|
"learning_rate": 6e-06, |
|
"loss": 0.5596, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008238928939237899, |
|
"grad_norm": 0.054443359375, |
|
"learning_rate": 9e-06, |
|
"loss": 0.7123, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010985238585650533, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.6337, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013731548232063165, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.5764, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016477857878475798, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.7453, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01922416752488843, |
|
"grad_norm": 0.057373046875, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.7076, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.021970477171301066, |
|
"grad_norm": 0.06298828125, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.5094, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.024716786817713696, |
|
"grad_norm": 0.05859375, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.6062, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02746309646412633, |
|
"grad_norm": 0.056884765625, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6501, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030209406110538965, |
|
"grad_norm": 0.060546875, |
|
"learning_rate": 2.9999964598289033e-05, |
|
"loss": 0.6403, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.032955715756951595, |
|
"grad_norm": 0.061279296875, |
|
"learning_rate": 2.999985839332323e-05, |
|
"loss": 0.6464, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03570202540336423, |
|
"grad_norm": 0.05859375, |
|
"learning_rate": 2.9999681385603907e-05, |
|
"loss": 0.6528, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03844833504977686, |
|
"grad_norm": 0.062255859375, |
|
"learning_rate": 2.9999433575966585e-05, |
|
"loss": 0.7109, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0411946446961895, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 2.999911496558097e-05, |
|
"loss": 0.4795, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04394095434260213, |
|
"grad_norm": 0.061767578125, |
|
"learning_rate": 2.9998725555950983e-05, |
|
"loss": 0.6743, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04668726398901476, |
|
"grad_norm": 0.0654296875, |
|
"learning_rate": 2.9998265348914726e-05, |
|
"loss": 0.6143, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04943357363542739, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 2.9997734346644482e-05, |
|
"loss": 0.5521, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05217988328184003, |
|
"grad_norm": 0.06396484375, |
|
"learning_rate": 2.99971325516467e-05, |
|
"loss": 0.6291, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05492619292825266, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 2.9996459966761994e-05, |
|
"loss": 0.5828, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.057672502574665295, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 2.9995716595165114e-05, |
|
"loss": 0.5432, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06041881222107793, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 2.9994902440364943e-05, |
|
"loss": 0.5769, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06316512186749056, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 2.999401750620448e-05, |
|
"loss": 0.469, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06591143151390319, |
|
"grad_norm": 0.06884765625, |
|
"learning_rate": 2.999306179686082e-05, |
|
"loss": 0.4414, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06865774116031582, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 2.9992035316845125e-05, |
|
"loss": 0.7219, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07140405080672846, |
|
"grad_norm": 0.0654296875, |
|
"learning_rate": 2.9990938071002606e-05, |
|
"loss": 0.6742, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07415036045314109, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.998977006451253e-05, |
|
"loss": 0.5633, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07689667009955373, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.998853130288814e-05, |
|
"loss": 0.5486, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07964297974596636, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.9987221791976687e-05, |
|
"loss": 0.4064, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.082389289392379, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.9985841537959345e-05, |
|
"loss": 0.5184, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08513559903879163, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.9984390547351244e-05, |
|
"loss": 0.5407, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08788190868520426, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.998286882700138e-05, |
|
"loss": 0.5532, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09062821833161688, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 2.9981276384092628e-05, |
|
"loss": 1.2318, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09337452797802952, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.9979613226141672e-05, |
|
"loss": 0.5457, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09612083762444215, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.9977879360999007e-05, |
|
"loss": 0.5391, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09886714727085479, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.9976074796848866e-05, |
|
"loss": 0.5919, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10161345691726742, |
|
"grad_norm": 0.037353515625, |
|
"learning_rate": 2.99741995422092e-05, |
|
"loss": 0.6211, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10435976656368005, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 2.997225360593165e-05, |
|
"loss": 0.5296, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10710607621009269, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.997023699720147e-05, |
|
"loss": 0.5469, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10985238585650532, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.9968149725537515e-05, |
|
"loss": 0.6447, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11259869550291796, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9965991800792185e-05, |
|
"loss": 0.5941, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11534500514933059, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.9963763233151377e-05, |
|
"loss": 0.5228, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11809131479574322, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9961464033134444e-05, |
|
"loss": 0.6166, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12083762444215586, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.9959094211594122e-05, |
|
"loss": 0.6156, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12358393408856849, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 2.9956653779716517e-05, |
|
"loss": 0.5492, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1263302437349811, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9954142749021023e-05, |
|
"loss": 0.5405, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12907655338139376, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.9951561131360278e-05, |
|
"loss": 0.584, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.13182286302780638, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.994890893892011e-05, |
|
"loss": 0.62, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13456917267421903, |
|
"grad_norm": 0.031982421875, |
|
"learning_rate": 2.994618618421946e-05, |
|
"loss": 0.4026, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13731548232063165, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.994339288011037e-05, |
|
"loss": 0.6126, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1400617919670443, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.9940529039777855e-05, |
|
"loss": 0.6316, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.14280810161345692, |
|
"grad_norm": 0.033935546875, |
|
"learning_rate": 2.9937594676739907e-05, |
|
"loss": 0.3887, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14555441125986954, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9934589804847382e-05, |
|
"loss": 0.5214, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14830072090628219, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 2.9931514438283966e-05, |
|
"loss": 0.5914, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1510470305526948, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9928368591566085e-05, |
|
"loss": 0.5443, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15379334019910745, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.9925152279542856e-05, |
|
"loss": 0.5522, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15653964984552007, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 2.9921865517396008e-05, |
|
"loss": 0.5146, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15928595949193272, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 2.9918508320639803e-05, |
|
"loss": 0.5396, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.16203226913834534, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9915080705120976e-05, |
|
"loss": 0.5118, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.164778578784758, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.991158268701866e-05, |
|
"loss": 0.6652, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1675248884311706, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.9908014282844295e-05, |
|
"loss": 0.4211, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.17027119807758326, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.9904375509441562e-05, |
|
"loss": 0.4445, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.17301750772399588, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.9900666383986303e-05, |
|
"loss": 0.588, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.17576381737040853, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 2.9896886923986433e-05, |
|
"loss": 0.5705, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17851012701682115, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.989303714728187e-05, |
|
"loss": 0.5068, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.18125643666323377, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 2.9889117072044436e-05, |
|
"loss": 0.5196, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18400274630964641, |
|
"grad_norm": 0.03515625, |
|
"learning_rate": 2.9885126716777776e-05, |
|
"loss": 0.5952, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18674905595605903, |
|
"grad_norm": 0.0322265625, |
|
"learning_rate": 2.9881066100317288e-05, |
|
"loss": 0.6194, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18949536560247168, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.987693524183e-05, |
|
"loss": 0.4453, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1922416752488843, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.987273416081451e-05, |
|
"loss": 0.524, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19498798489529695, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.9868462877100875e-05, |
|
"loss": 0.5899, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.19773429454170957, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9864121410850527e-05, |
|
"loss": 0.4603, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.20048060418812222, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.9859709782556185e-05, |
|
"loss": 0.4829, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.20322691383453484, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 2.9855228013041737e-05, |
|
"loss": 0.5735, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2059732234809475, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.9850676123462157e-05, |
|
"loss": 0.5104, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2087195331273601, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.98460541353034e-05, |
|
"loss": 0.5501, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.21146584277377276, |
|
"grad_norm": 0.032958984375, |
|
"learning_rate": 2.9841362070382307e-05, |
|
"loss": 0.5119, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.21421215242018538, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.9836599950846493e-05, |
|
"loss": 0.589, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.216958462066598, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 2.9831767799174255e-05, |
|
"loss": 0.4544, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21970477171301064, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9826865638174445e-05, |
|
"loss": 0.4294, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22245108135942326, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.9821893490986382e-05, |
|
"loss": 0.5649, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2251973910058359, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.981685138107974e-05, |
|
"loss": 0.532, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22794370065224853, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.9811739332254418e-05, |
|
"loss": 0.6026, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.23069001029866118, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.9806557368640457e-05, |
|
"loss": 0.5516, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2334363199450738, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 2.9801305514697913e-05, |
|
"loss": 0.4544, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.23618262959148645, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 2.9795983795216727e-05, |
|
"loss": 0.5327, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23892893923789907, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.979059223531664e-05, |
|
"loss": 0.5217, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.24167524888431172, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.978513086044703e-05, |
|
"loss": 0.4562, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.24442155853072434, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.9779599696386846e-05, |
|
"loss": 0.763, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.24716786817713698, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 2.9773998769244434e-05, |
|
"loss": 0.4698, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2499141778235496, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.976832810545745e-05, |
|
"loss": 0.5602, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2499141778235496, |
|
"eval_loss": 0.5245938897132874, |
|
"eval_runtime": 620.8292, |
|
"eval_samples_per_second": 14.764, |
|
"eval_steps_per_second": 14.764, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2526604874699622, |
|
"grad_norm": 0.030517578125, |
|
"learning_rate": 2.9762587731792725e-05, |
|
"loss": 0.477, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2554067971163749, |
|
"grad_norm": 0.03271484375, |
|
"learning_rate": 2.9756777675346128e-05, |
|
"loss": 0.5536, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2581531067627875, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9750897963542453e-05, |
|
"loss": 0.5581, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2608994164092001, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.974494862413528e-05, |
|
"loss": 0.5737, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.26364572605561276, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.973892968520685e-05, |
|
"loss": 0.5191, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2663920357020254, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9732841175167924e-05, |
|
"loss": 0.6794, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.26913834534843806, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9726683122757664e-05, |
|
"loss": 0.5615, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.27188465499485065, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.972045555704348e-05, |
|
"loss": 0.521, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2746309646412633, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.97141585074209e-05, |
|
"loss": 0.4473, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.27737727428767595, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 2.9707792003613434e-05, |
|
"loss": 0.6017, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2801235839340886, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.9701356075672442e-05, |
|
"loss": 0.5079, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2828698935805012, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.969485075397696e-05, |
|
"loss": 0.5738, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.28561620322691383, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 2.9688276069233596e-05, |
|
"loss": 0.4251, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2883625128733265, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.968163205247636e-05, |
|
"loss": 0.5902, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2911088225197391, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.9674918735066534e-05, |
|
"loss": 0.4307, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2938551321661517, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.9668136148692497e-05, |
|
"loss": 0.4018, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.29660144181256437, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.966128432536961e-05, |
|
"loss": 0.5109, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.299347751458977, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.9654363297440045e-05, |
|
"loss": 0.6136, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3020940611053896, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.964737309757262e-05, |
|
"loss": 0.4161, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.30484037075180226, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.9640313758762692e-05, |
|
"loss": 0.4268, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3075866803982149, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.9633185314331933e-05, |
|
"loss": 0.4809, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.31033299004462755, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.9625987797928237e-05, |
|
"loss": 0.4976, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.31307929969104015, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.9618721243525522e-05, |
|
"loss": 0.5508, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3158256093374528, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.9611385685423582e-05, |
|
"loss": 0.4852, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.31857191898386544, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.9603981158247918e-05, |
|
"loss": 0.4301, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3213182286302781, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.9596507696949598e-05, |
|
"loss": 0.4456, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3240645382766907, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 2.9588965336805065e-05, |
|
"loss": 0.6092, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.32681084792310333, |
|
"grad_norm": 0.033935546875, |
|
"learning_rate": 2.958135411341597e-05, |
|
"loss": 0.4823, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.329557157569516, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.9573674062709024e-05, |
|
"loss": 0.4666, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33230346721592857, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9565925220935828e-05, |
|
"loss": 0.4868, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3350497768623412, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 2.9558107624672673e-05, |
|
"loss": 0.529, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.33779608650875387, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.9550221310820405e-05, |
|
"loss": 0.4308, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3405423961551665, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.9542266316604213e-05, |
|
"loss": 0.4058, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3432887058015791, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.95342426795735e-05, |
|
"loss": 0.4788, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.34603501544799176, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.952615043760165e-05, |
|
"loss": 1.2963, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3487813250944044, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.95179896288859e-05, |
|
"loss": 0.5734, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.35152763474081705, |
|
"grad_norm": 0.03515625, |
|
"learning_rate": 2.9509760291947128e-05, |
|
"loss": 0.4352, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.35427394438722964, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9501462465629672e-05, |
|
"loss": 0.6082, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3570202540336423, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.949309618910118e-05, |
|
"loss": 0.4699, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35976656368005494, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.9484661501852373e-05, |
|
"loss": 0.5504, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.36251287332646753, |
|
"grad_norm": 0.037353515625, |
|
"learning_rate": 2.94761584436969e-05, |
|
"loss": 0.545, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3652591829728802, |
|
"grad_norm": 0.03515625, |
|
"learning_rate": 2.9467587054771146e-05, |
|
"loss": 0.445, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.36800549261929283, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.945894737553401e-05, |
|
"loss": 1.1891, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3707518022657055, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.945023944676676e-05, |
|
"loss": 0.565, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.37349811191211807, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.9441463309572797e-05, |
|
"loss": 0.6599, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3762444215585307, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.9432619005377496e-05, |
|
"loss": 0.4754, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.37899073120494337, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9423706575927985e-05, |
|
"loss": 0.4966, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.381737040851356, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.9414726063292974e-05, |
|
"loss": 0.4269, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3844833504977686, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.940567750986252e-05, |
|
"loss": 0.5516, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.38722966014418125, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.9396560958347865e-05, |
|
"loss": 0.486, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3899759697905939, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 2.9387376451781215e-05, |
|
"loss": 0.4506, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.39272227943700655, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.9378124033515533e-05, |
|
"loss": 0.6122, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.39546858908341914, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.936880374722434e-05, |
|
"loss": 0.4776, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3982148987298318, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.9359415636901522e-05, |
|
"loss": 0.5574, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.40096120837624444, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9349959746861093e-05, |
|
"loss": 0.5289, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.40370751802265703, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.9340436121737018e-05, |
|
"loss": 0.4664, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4064538276690697, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.9330844806482974e-05, |
|
"loss": 0.5322, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4092001373154823, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.9321185846372162e-05, |
|
"loss": 0.4143, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.411946446961895, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.9311459286997073e-05, |
|
"loss": 0.5038, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.41469275660830757, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.930166517426929e-05, |
|
"loss": 0.4905, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4174390662547202, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.929180355441926e-05, |
|
"loss": 0.5357, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.42018537590113286, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.9281874473996077e-05, |
|
"loss": 0.5449, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4229316855475455, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 2.9271877979867263e-05, |
|
"loss": 1.3347, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4256779951939581, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.926181411921855e-05, |
|
"loss": 0.4532, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.42842430484037075, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.9251682939553662e-05, |
|
"loss": 0.5425, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4311706144867834, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 2.9241484488694074e-05, |
|
"loss": 0.4875, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.433916924133196, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.92312188147788e-05, |
|
"loss": 0.4574, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.43666323377960864, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.9220885966264174e-05, |
|
"loss": 0.5003, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4394095434260213, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.9210485991923577e-05, |
|
"loss": 0.4766, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.44215585307243394, |
|
"grad_norm": 0.03515625, |
|
"learning_rate": 2.9200018940847278e-05, |
|
"loss": 0.3866, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.44490216271884653, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.918948486244214e-05, |
|
"loss": 0.4401, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4476484723652592, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.917888380643142e-05, |
|
"loss": 0.5193, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4503947820116718, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.916821582285451e-05, |
|
"loss": 0.4802, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.45314109165808447, |
|
"grad_norm": 0.037353515625, |
|
"learning_rate": 2.915748096206674e-05, |
|
"loss": 0.4693, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.45588740130449706, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.914667927473909e-05, |
|
"loss": 0.4949, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4586337109509097, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9135810811857994e-05, |
|
"loss": 0.5453, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.46138002059732236, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 2.912487562472508e-05, |
|
"loss": 0.4653, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.464126330243735, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.9113873764956917e-05, |
|
"loss": 0.5032, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4668726398901476, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.91028052844848e-05, |
|
"loss": 0.4736, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.46961894953656025, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.9091670235554478e-05, |
|
"loss": 0.4773, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4723652591829729, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.9080468670725922e-05, |
|
"loss": 0.5689, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4751115688293855, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.906920064287308e-05, |
|
"loss": 0.51, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.47785787847579814, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 2.9057866205183606e-05, |
|
"loss": 0.446, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4806041881222108, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 2.9046465411158634e-05, |
|
"loss": 0.4956, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.48335049776862343, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.9034998314612516e-05, |
|
"loss": 0.4963, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.486096807415036, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.902346496967256e-05, |
|
"loss": 0.4928, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.4888431170614487, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.9011865430778782e-05, |
|
"loss": 0.4731, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4915894267078613, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.9000199752683663e-05, |
|
"loss": 0.5374, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.49433573635427397, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.8988467990451853e-05, |
|
"loss": 0.6108, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.49708204600068656, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.8976670199459953e-05, |
|
"loss": 0.4189, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4998283556470992, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.8964806435396227e-05, |
|
"loss": 0.4773, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4998283556470992, |
|
"eval_loss": 0.5154861807823181, |
|
"eval_runtime": 620.9342, |
|
"eval_samples_per_second": 14.762, |
|
"eval_steps_per_second": 14.762, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.5025746652935118, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.8952876754260342e-05, |
|
"loss": 0.5624, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5053209749399244, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 2.8940881212363124e-05, |
|
"loss": 1.2595, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5080672845863371, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.8928819866326262e-05, |
|
"loss": 0.6287, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5108135942327497, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.891669277308206e-05, |
|
"loss": 0.4508, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5135599038791624, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.8904499989873166e-05, |
|
"loss": 0.5141, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.516306213525575, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.88922415742523e-05, |
|
"loss": 0.4496, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5190525231719877, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.8879917584081975e-05, |
|
"loss": 0.5467, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5217988328184002, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 2.886752807753424e-05, |
|
"loss": 0.4188, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5245451424648129, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.8855073113090395e-05, |
|
"loss": 0.5347, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5272914521112255, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.8842552749540708e-05, |
|
"loss": 0.4117, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5300377617576382, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.8829967045984155e-05, |
|
"loss": 0.5413, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5327840714040508, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 2.8817316061828126e-05, |
|
"loss": 0.5683, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5355303810504635, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.8804599856788154e-05, |
|
"loss": 0.3851, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5382766906968761, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.8791818490887628e-05, |
|
"loss": 0.42, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5410230003432887, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.8778972024457504e-05, |
|
"loss": 0.5491, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5437693099897013, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.876606051813604e-05, |
|
"loss": 0.5299, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.546515619636114, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.8753084032868494e-05, |
|
"loss": 0.4881, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5492619292825266, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8740042629906833e-05, |
|
"loss": 0.4698, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5520082389289392, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.8726936370809455e-05, |
|
"loss": 0.5685, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5547545485753519, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.8713765317440895e-05, |
|
"loss": 0.5536, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5575008582217645, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.870052953197152e-05, |
|
"loss": 0.4891, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5602471678681772, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.8687229076877274e-05, |
|
"loss": 0.4182, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5629934775145897, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.867386401493932e-05, |
|
"loss": 0.507, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5657397871610024, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 2.8660434409243817e-05, |
|
"loss": 0.4052, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.568486096807415, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.8646940323181553e-05, |
|
"loss": 0.4503, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5712324064538277, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.86333818204477e-05, |
|
"loss": 0.4234, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5739787161002403, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8619758965041488e-05, |
|
"loss": 0.5319, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.576725025746653, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.8606071821265888e-05, |
|
"loss": 0.5282, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5794713353930656, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.8592320453727356e-05, |
|
"loss": 0.4596, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5822176450394781, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.857850492733548e-05, |
|
"loss": 0.5258, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5849639546858908, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.856462530730269e-05, |
|
"loss": 0.4836, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5877102643323034, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.855068165914397e-05, |
|
"loss": 0.4973, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5904565739787161, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8536674048676506e-05, |
|
"loss": 0.5643, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5932028836251287, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.8522602542019425e-05, |
|
"loss": 0.476, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5959491932715414, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.850846720559345e-05, |
|
"loss": 0.4767, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.598695502917954, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.8494268106120586e-05, |
|
"loss": 0.5567, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.6014418125643667, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.8480005310623823e-05, |
|
"loss": 0.536, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6041881222107792, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.8465678886426814e-05, |
|
"loss": 0.4813, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6069344318571919, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.845128890115355e-05, |
|
"loss": 0.4215, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6096807415036045, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8436835422728036e-05, |
|
"loss": 0.547, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6124270511500172, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.8422318519373996e-05, |
|
"loss": 0.4629, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6151733607964298, |
|
"grad_norm": 0.052490234375, |
|
"learning_rate": 2.8407738259614524e-05, |
|
"loss": 0.4823, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6179196704428425, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.8393094712271772e-05, |
|
"loss": 0.5568, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6206659800892551, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8378387946466623e-05, |
|
"loss": 0.4709, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6234122897356676, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.8363618031618364e-05, |
|
"loss": 0.4205, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6261585993820803, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.8348785037444366e-05, |
|
"loss": 0.4985, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6289049090284929, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.8333889033959746e-05, |
|
"loss": 0.4527, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6316512186749056, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8318930091477037e-05, |
|
"loss": 0.582, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6343975283213182, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.8303908280605854e-05, |
|
"loss": 0.5028, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6371438379677309, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8288823672252586e-05, |
|
"loss": 0.5349, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6398901476141435, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.827367633762001e-05, |
|
"loss": 0.4251, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6426364572605562, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.825846634820701e-05, |
|
"loss": 0.5079, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6453827669069687, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.824319377580821e-05, |
|
"loss": 1.2174, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6481290765533814, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 2.8227858692513626e-05, |
|
"loss": 0.4188, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.650875386199794, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.821246117070835e-05, |
|
"loss": 0.4767, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6536216958462067, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.8197001283072205e-05, |
|
"loss": 0.4736, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6563680054926193, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 2.8181479102579383e-05, |
|
"loss": 0.388, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.659114315139032, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.8165894702498116e-05, |
|
"loss": 0.6023, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6618606247854446, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.8150248156390327e-05, |
|
"loss": 0.5319, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6646069344318571, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 2.8134539538111286e-05, |
|
"loss": 0.5133, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6673532440782698, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.8118768921809258e-05, |
|
"loss": 0.4813, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6700995537246824, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.8102936381925143e-05, |
|
"loss": 0.5085, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6728458633710951, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.8087041993192148e-05, |
|
"loss": 0.4245, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6755921730175077, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.8071085830635404e-05, |
|
"loss": 0.5026, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6783384826639204, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.8055067969571647e-05, |
|
"loss": 0.5615, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.681084792310333, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.803898848560883e-05, |
|
"loss": 0.4929, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6838311019567456, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.802284745464579e-05, |
|
"loss": 0.5747, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6865774116031582, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.800664495287187e-05, |
|
"loss": 0.4181, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6893237212495709, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.7990381056766583e-05, |
|
"loss": 0.548, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6920700308959835, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.797405584309922e-05, |
|
"loss": 0.5344, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6948163405423962, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.7957669388928517e-05, |
|
"loss": 0.4484, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6975626501888088, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 2.7941221771602278e-05, |
|
"loss": 0.5194, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.7003089598352215, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.7924713068757004e-05, |
|
"loss": 0.4297, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7030552694816341, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.7908143358317545e-05, |
|
"loss": 0.4723, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.7058015791280466, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.7891512718496712e-05, |
|
"loss": 0.4401, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7085478887744593, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.7874821227794915e-05, |
|
"loss": 0.5961, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7112941984208719, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.78580689649998e-05, |
|
"loss": 0.5483, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7140405080672846, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 2.7841256009185876e-05, |
|
"loss": 0.493, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7167868177136972, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.782438243971412e-05, |
|
"loss": 0.5366, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7195331273601099, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.7807448336231635e-05, |
|
"loss": 0.3991, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7222794370065225, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.7790453778671248e-05, |
|
"loss": 0.528, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7250257466529351, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 2.7773398847251152e-05, |
|
"loss": 0.4221, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7277720562993477, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.7756283622474515e-05, |
|
"loss": 0.4483, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7305183659457604, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.77391081851291e-05, |
|
"loss": 0.4633, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.733264675592173, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.7721872616286888e-05, |
|
"loss": 0.5595, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7360109852385857, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.7704576997303694e-05, |
|
"loss": 0.5091, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7387572948849983, |
|
"grad_norm": 0.0361328125, |
|
"learning_rate": 2.768722140981879e-05, |
|
"loss": 0.4357, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.741503604531411, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.766980593575451e-05, |
|
"loss": 0.4608, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7442499141778236, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 2.765233065731586e-05, |
|
"loss": 0.4593, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7469962238242361, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.7634795656990143e-05, |
|
"loss": 0.5097, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7497425334706488, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.761720101754656e-05, |
|
"loss": 0.4375, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7497425334706488, |
|
"eval_loss": 0.5116191506385803, |
|
"eval_runtime": 620.1922, |
|
"eval_samples_per_second": 14.779, |
|
"eval_steps_per_second": 14.779, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7524888431170614, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.7599546822035817e-05, |
|
"loss": 0.5089, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7552351527634741, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.758183315378976e-05, |
|
"loss": 0.5961, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7579814624098867, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.7564060096420925e-05, |
|
"loss": 0.4763, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7607277720562994, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.754622773382221e-05, |
|
"loss": 0.5076, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.763474081702712, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.7528336150166436e-05, |
|
"loss": 0.4411, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7662203913491246, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.751038542990595e-05, |
|
"loss": 0.5316, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7689667009955372, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.7492375657772254e-05, |
|
"loss": 0.4153, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7717130106419499, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.7474306918775576e-05, |
|
"loss": 0.5106, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7744593202883625, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 2.745617929820449e-05, |
|
"loss": 0.474, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7772056299347752, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 2.74379928816255e-05, |
|
"loss": 1.2147, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7799519395811878, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.7419747754882637e-05, |
|
"loss": 0.5727, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7826982492276005, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 2.740144400409707e-05, |
|
"loss": 0.5203, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7854445588740131, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.738308171566667e-05, |
|
"loss": 0.5998, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7881908685204256, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.7364660976265624e-05, |
|
"loss": 0.5133, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7909371781668383, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.7346181872844037e-05, |
|
"loss": 0.4711, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7936834878132509, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.7327644492627487e-05, |
|
"loss": 0.5563, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7964297974596636, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.7309048923116635e-05, |
|
"loss": 0.4684, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7991761071060762, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.729039525208682e-05, |
|
"loss": 0.4581, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.8019224167524889, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.7271683567587608e-05, |
|
"loss": 0.4502, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.8046687263989015, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.7252913957942435e-05, |
|
"loss": 0.564, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.8074150360453141, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.723408651174813e-05, |
|
"loss": 0.4386, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.8101613456917267, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.7215201317874537e-05, |
|
"loss": 0.5623, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8129076553381394, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.7196258465464087e-05, |
|
"loss": 0.5303, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.815653964984552, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.7177258043931354e-05, |
|
"loss": 0.5094, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.8184002746309647, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.7158200142962665e-05, |
|
"loss": 0.502, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8211465842773773, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.7139084852515665e-05, |
|
"loss": 0.4744, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.82389289392379, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.7119912262818878e-05, |
|
"loss": 0.5895, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8266392035702025, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.7100682464371306e-05, |
|
"loss": 0.3948, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8293855132166151, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 2.7081395547941986e-05, |
|
"loss": 0.4514, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8321318228630278, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 2.7062051604569562e-05, |
|
"loss": 0.4525, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8348781325094404, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.7042650725561854e-05, |
|
"loss": 0.4161, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8376244421558531, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.7023193002495447e-05, |
|
"loss": 0.5065, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8403707518022657, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 2.7003678527215224e-05, |
|
"loss": 1.3831, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8431170614486784, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.6984107391833972e-05, |
|
"loss": 0.5368, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.845863371095091, |
|
"grad_norm": 0.037353515625, |
|
"learning_rate": 2.6964479688731897e-05, |
|
"loss": 0.4434, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8486096807415036, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.694479551055625e-05, |
|
"loss": 0.5286, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8513559903879162, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.6925054950220834e-05, |
|
"loss": 0.4054, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8541023000343289, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 2.69052581009056e-05, |
|
"loss": 0.3735, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8568486096807415, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.68854050560562e-05, |
|
"loss": 0.5696, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8595949193271541, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.6865495909383525e-05, |
|
"loss": 0.4851, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8623412289735668, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.684553075486329e-05, |
|
"loss": 0.5755, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8650875386199794, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.682550968673558e-05, |
|
"loss": 0.5376, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.867833848266392, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.6805432799504407e-05, |
|
"loss": 0.5374, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8705801579128046, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.6785300187937264e-05, |
|
"loss": 0.421, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.8733264675592173, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 2.6765111947064654e-05, |
|
"loss": 0.4206, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8760727772056299, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 2.6744868172179692e-05, |
|
"loss": 0.5895, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8788190868520426, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.672456895883761e-05, |
|
"loss": 0.4784, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8815653964984552, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.670421440285533e-05, |
|
"loss": 0.4898, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8843117061448679, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.6683804600310997e-05, |
|
"loss": 0.6258, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8870580157912805, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.6663339647543528e-05, |
|
"loss": 0.5587, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8898043254376931, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 2.664281964115218e-05, |
|
"loss": 0.4539, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8925506350841057, |
|
"grad_norm": 0.037353515625, |
|
"learning_rate": 2.6622244677996058e-05, |
|
"loss": 0.4652, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8952969447305184, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 2.660161485519368e-05, |
|
"loss": 0.4624, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.898043254376931, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.6580930270122524e-05, |
|
"loss": 0.5089, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.9007895640233436, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.6560191020418545e-05, |
|
"loss": 0.4246, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.9035358736697563, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.6539397203975732e-05, |
|
"loss": 0.516, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.9062821833161689, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.6518548918945646e-05, |
|
"loss": 0.5008, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9090284929625815, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.6497646263736943e-05, |
|
"loss": 0.5195, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.9117748026089941, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.6476689337014925e-05, |
|
"loss": 0.5701, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.9145211122554068, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.6455678237701072e-05, |
|
"loss": 0.5766, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9172674219018194, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.643461306497256e-05, |
|
"loss": 0.4613, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9200137315482321, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.641349391826182e-05, |
|
"loss": 0.4347, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9227600411946447, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.6392320897256034e-05, |
|
"loss": 0.4371, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9255063508410574, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.637109410189669e-05, |
|
"loss": 0.5219, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.92825266048747, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.6349813632379103e-05, |
|
"loss": 0.5435, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.9309989701338826, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.6328479589151953e-05, |
|
"loss": 0.4764, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9337452797802952, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.6307092072916786e-05, |
|
"loss": 0.4664, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9364915894267078, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.628565118462756e-05, |
|
"loss": 0.4723, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9392378990731205, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.626415702549015e-05, |
|
"loss": 0.5179, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9419842087195331, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.62426096969619e-05, |
|
"loss": 0.5736, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9447305183659458, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.6221009300751113e-05, |
|
"loss": 0.5238, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9474768280123584, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.6199355938816586e-05, |
|
"loss": 0.4591, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.950223137658771, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.6177649713367136e-05, |
|
"loss": 0.5288, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9529694473051836, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.6155890726861084e-05, |
|
"loss": 0.5066, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9557157569515963, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 2.613407908200582e-05, |
|
"loss": 0.4485, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9584620665980089, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.6112214881757285e-05, |
|
"loss": 0.5076, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9612083762444216, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.6090298229319477e-05, |
|
"loss": 0.5024, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9639546858908342, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.6068329228144016e-05, |
|
"loss": 0.4839, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9667009955372469, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.604630798192959e-05, |
|
"loss": 0.5425, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9694473051836594, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.60242345946215e-05, |
|
"loss": 0.4468, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.972193614830072, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.6002109170411178e-05, |
|
"loss": 0.5624, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9749399244764847, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 2.597993181373567e-05, |
|
"loss": 0.3949, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9776862341228973, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.5957702629277154e-05, |
|
"loss": 0.5243, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.98043254376931, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.593542172196246e-05, |
|
"loss": 0.574, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9831788534157226, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 2.5913089196962547e-05, |
|
"loss": 0.4708, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9859251630621353, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.5890705159692036e-05, |
|
"loss": 0.4344, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9886714727085479, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.5868269715808685e-05, |
|
"loss": 0.4977, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9914177823549605, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.58457829712129e-05, |
|
"loss": 0.551, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9941640920013731, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.5823245032047255e-05, |
|
"loss": 0.5069, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9969104016477858, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.5800656004695962e-05, |
|
"loss": 0.5246, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9996567112941984, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 2.5778015995784385e-05, |
|
"loss": 0.6325, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9996567112941984, |
|
"eval_loss": 0.509181559085846, |
|
"eval_runtime": 618.8303, |
|
"eval_samples_per_second": 14.812, |
|
"eval_steps_per_second": 14.812, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.002403020940611, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.575532511217852e-05, |
|
"loss": 0.607, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.0051493305870236, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.5732583460984527e-05, |
|
"loss": 0.5572, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.0078956402334363, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 2.5709791149548184e-05, |
|
"loss": 1.256, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.010641949879849, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.56869482854544e-05, |
|
"loss": 0.4604, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.0020597322348095, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.5664054976526702e-05, |
|
"loss": 0.5396, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.0048060418812221, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.564111133082674e-05, |
|
"loss": 0.4803, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0075523515276348, |
|
"grad_norm": 0.05712890625, |
|
"learning_rate": 2.561811745665374e-05, |
|
"loss": 0.3781, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.0102986611740474, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.5595073462544046e-05, |
|
"loss": 0.4143, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.01304497082046, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.5571979457270565e-05, |
|
"loss": 0.4698, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.0157912804668727, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.5548835549842274e-05, |
|
"loss": 0.5101, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.0185375901132854, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 2.5525641849503685e-05, |
|
"loss": 0.4252, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.0212838997596978, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 2.5502398465734357e-05, |
|
"loss": 0.5116, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.0240302094061104, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.5479105508248373e-05, |
|
"loss": 0.4816, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.026776519052523, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.54557630869938e-05, |
|
"loss": 0.4521, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.0295228286989357, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.543237131215219e-05, |
|
"loss": 0.4769, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.0322691383453484, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.5408930294138065e-05, |
|
"loss": 0.5011, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.035015447991761, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.538544014359837e-05, |
|
"loss": 0.407, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.0377617576381737, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 2.536190097141197e-05, |
|
"loss": 0.4991, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.0405080672845863, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.5338312888689137e-05, |
|
"loss": 0.5129, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.043254376930999, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.5314676006771e-05, |
|
"loss": 0.4409, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.0460006865774116, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 2.529099043722903e-05, |
|
"loss": 0.542, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.0487469962238243, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.526725629186452e-05, |
|
"loss": 0.5767, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.051493305870237, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.5243473682708057e-05, |
|
"loss": 0.5457, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.0542396155166496, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 2.5219642722018975e-05, |
|
"loss": 0.4768, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.0569859251630622, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.5195763522284848e-05, |
|
"loss": 0.58, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.0597322348094749, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.5171836196220946e-05, |
|
"loss": 0.5176, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0624785444558873, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 2.51478608567697e-05, |
|
"loss": 0.4992, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.0652248541023, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 2.512383761710019e-05, |
|
"loss": 0.5167, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.0679711637487126, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 2.5099766590607587e-05, |
|
"loss": 1.119, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.0707174733951252, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.5075647890912628e-05, |
|
"loss": 0.4643, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.0734637830415379, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.505148163186107e-05, |
|
"loss": 0.5572, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.0762100926879505, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.5027267927523178e-05, |
|
"loss": 0.4685, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.0789564023343632, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.500300689219315e-05, |
|
"loss": 0.5597, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.0817027119807758, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.4978698640388617e-05, |
|
"loss": 0.47, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.0844490216271885, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.495434328685007e-05, |
|
"loss": 0.5364, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.0871953312736011, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.492994094654033e-05, |
|
"loss": 0.4303, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0899416409200138, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 2.490549173464402e-05, |
|
"loss": 1.1982, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.0926879505664264, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.4880995766566986e-05, |
|
"loss": 0.5137, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.095434260212839, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.4856453157935795e-05, |
|
"loss": 0.4997, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.0981805698592517, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 2.483186402459715e-05, |
|
"loss": 0.5209, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.1009268795056641, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.4807228482617376e-05, |
|
"loss": 0.483, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.1036731891520768, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.4782546648281848e-05, |
|
"loss": 0.5055, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.1064194987984894, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.4757818638094457e-05, |
|
"loss": 0.462, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.109165808444902, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.473304456877705e-05, |
|
"loss": 0.4663, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.1119121180913147, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.470822455726889e-05, |
|
"loss": 0.5343, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.1146584277377274, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.468335872072609e-05, |
|
"loss": 0.4854, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.11740473738414, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 2.4658447176521076e-05, |
|
"loss": 0.5206, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.1201510470305527, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.463349004224201e-05, |
|
"loss": 0.4738, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.1228973566769653, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.460848743569227e-05, |
|
"loss": 0.5632, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.125643666323378, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.458343947488985e-05, |
|
"loss": 0.6056, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.1283899759697906, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 2.4558346278066853e-05, |
|
"loss": 1.1007, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.1311362856162033, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.4533207963668883e-05, |
|
"loss": 0.4747, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.133882595262616, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.4508024650354525e-05, |
|
"loss": 0.439, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.1366289049090286, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.4482796456994757e-05, |
|
"loss": 0.4913, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.1393752145554412, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 2.4457523502672415e-05, |
|
"loss": 0.5722, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.1421215242018539, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.44322059066816e-05, |
|
"loss": 0.3971, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1448678338482665, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.440684378852714e-05, |
|
"loss": 0.4724, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.147614143494679, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 2.438143726792403e-05, |
|
"loss": 0.5305, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.1503604531410916, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 2.435598646479683e-05, |
|
"loss": 0.4924, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.1531067627875042, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.4330491499279148e-05, |
|
"loss": 0.4927, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.1558530724339169, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.4304952491713035e-05, |
|
"loss": 0.45, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.1585993820803295, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.4279369562648424e-05, |
|
"loss": 0.5892, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.1613456917267422, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.4253742832842583e-05, |
|
"loss": 0.4727, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.1640920013731548, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 2.4228072423259527e-05, |
|
"loss": 0.5063, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.1668383110195675, |
|
"grad_norm": 0.06201171875, |
|
"learning_rate": 2.420235845506944e-05, |
|
"loss": 0.4872, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.1695846206659801, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.4176601049648116e-05, |
|
"loss": 0.3843, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1723309303123928, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 2.415080032857639e-05, |
|
"loss": 0.4478, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.1750772399588054, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.4124956413639548e-05, |
|
"loss": 0.4964, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.177823549605218, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.4099069426826766e-05, |
|
"loss": 0.5176, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.1805698592516307, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.4073139490330526e-05, |
|
"loss": 0.5596, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.1833161688980431, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 2.4047166726546047e-05, |
|
"loss": 0.485, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.1860624785444558, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.4021151258070694e-05, |
|
"loss": 0.4768, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.1888087881908684, |
|
"grad_norm": 0.05810546875, |
|
"learning_rate": 2.3995093207703413e-05, |
|
"loss": 0.5097, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.191555097837281, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.3968992698444153e-05, |
|
"loss": 0.5401, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.1943014074836937, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.394284985349327e-05, |
|
"loss": 0.425, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.1970477171301064, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.3916664796250946e-05, |
|
"loss": 0.3752, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.199794026776519, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.389043765031664e-05, |
|
"loss": 0.4724, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.2025403364229317, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 2.386416853948845e-05, |
|
"loss": 0.5598, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.2052866460693443, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 2.3837857587762583e-05, |
|
"loss": 0.3885, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.208032955715757, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 2.3811504919332727e-05, |
|
"loss": 0.4608, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.2107792653621696, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 2.378511065858949e-05, |
|
"loss": 0.4457, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.2135255750085823, |
|
"grad_norm": 0.039306640625, |
|
"learning_rate": 2.3758674930119807e-05, |
|
"loss": 0.4162, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.216271884654995, |
|
"grad_norm": 0.054443359375, |
|
"learning_rate": 2.3732197858706343e-05, |
|
"loss": 0.4656, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.2190181943014076, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.370567956932692e-05, |
|
"loss": 0.4525, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.2217645039478202, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.367912018715391e-05, |
|
"loss": 0.498, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.2245108135942329, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.3652519837553655e-05, |
|
"loss": 0.3724, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2272571232406453, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 2.3625878646085873e-05, |
|
"loss": 0.3611, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.230003432887058, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.3599196738503068e-05, |
|
"loss": 0.4002, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.2327497425334706, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.3572474240749932e-05, |
|
"loss": 0.5691, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.2354960521798832, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.354571127896275e-05, |
|
"loss": 0.536, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.2382423618262959, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.3518907979468807e-05, |
|
"loss": 0.4385, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.2382423618262959, |
|
"eval_loss": 0.5073373913764954, |
|
"eval_runtime": 627.5271, |
|
"eval_samples_per_second": 14.607, |
|
"eval_steps_per_second": 14.607, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.2409886714727085, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.349206446878578e-05, |
|
"loss": 0.5131, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.2437349811191212, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.346518087362118e-05, |
|
"loss": 0.4821, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.2464812907655338, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.3438257320871704e-05, |
|
"loss": 0.5344, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.2492276004119465, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.3411293937622658e-05, |
|
"loss": 0.4752, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.2519739100583591, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.338429085114737e-05, |
|
"loss": 0.4887, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2547202197047718, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.335724818890656e-05, |
|
"loss": 0.4445, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.2574665293511844, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.3330166078547763e-05, |
|
"loss": 0.5841, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.2602128389975968, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.3303044647904725e-05, |
|
"loss": 0.519, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.2629591486440095, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.3275884024996784e-05, |
|
"loss": 0.5149, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.2657054582904221, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.324868433802827e-05, |
|
"loss": 0.4681, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.2684517679368348, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.3221445715387917e-05, |
|
"loss": 0.5058, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.2711980775832474, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 2.319416828564824e-05, |
|
"loss": 0.5142, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.27394438722966, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.3166852177564925e-05, |
|
"loss": 0.4682, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.2766906968760727, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.3139497520076233e-05, |
|
"loss": 0.4361, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.2794370065224854, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.3112104442302393e-05, |
|
"loss": 0.5738, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.282183316168898, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.3084673073544976e-05, |
|
"loss": 0.4828, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.2849296258153107, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.3057203543286297e-05, |
|
"loss": 0.503, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.2876759354617233, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.3029695981188818e-05, |
|
"loss": 0.5526, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.290422245108136, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.3002150517094496e-05, |
|
"loss": 0.4757, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.2931685547545486, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.297456728102421e-05, |
|
"loss": 0.5773, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.2959148644009613, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.294694640317713e-05, |
|
"loss": 0.5248, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.298661174047374, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.2919288013930094e-05, |
|
"loss": 0.4915, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.3014074836937866, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.2891592243837015e-05, |
|
"loss": 0.5389, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.3041537933401992, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.286385922362824e-05, |
|
"loss": 0.4232, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.3069001029866119, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.2836089084209955e-05, |
|
"loss": 0.5072, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3096464126330245, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.280828195666355e-05, |
|
"loss": 0.54, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.312392722279437, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.2780437972245014e-05, |
|
"loss": 0.5446, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.3151390319258496, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 2.2752557262384307e-05, |
|
"loss": 0.4725, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.3178853415722622, |
|
"grad_norm": 0.1650390625, |
|
"learning_rate": 2.2724639958684733e-05, |
|
"loss": 1.2587, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.3206316512186749, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.2696686192922342e-05, |
|
"loss": 0.4965, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.3233779608650875, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.2668696097045284e-05, |
|
"loss": 0.5382, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.3261242705115002, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 2.2640669803173195e-05, |
|
"loss": 0.4305, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.3288705801579128, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.2612607443596572e-05, |
|
"loss": 0.4622, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.3316168898043255, |
|
"grad_norm": 0.05908203125, |
|
"learning_rate": 2.258450915077616e-05, |
|
"loss": 0.4975, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.3343631994507381, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.2556375057342306e-05, |
|
"loss": 0.6356, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3371095090971508, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.2528205296094356e-05, |
|
"loss": 0.4422, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.3398558187435634, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.446, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.3426021283899758, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 2.247175930219468e-05, |
|
"loss": 0.5996, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.3453484380363885, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.2443483335980924e-05, |
|
"loss": 0.5905, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.3480947476828011, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 2.2415172234827754e-05, |
|
"loss": 0.5824, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.3508410573292138, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.238682613237001e-05, |
|
"loss": 0.4885, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.3535873669756264, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.2358445162407775e-05, |
|
"loss": 0.587, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.356333676622039, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.2330029458905697e-05, |
|
"loss": 0.5453, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.3590799862684517, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.230157915599238e-05, |
|
"loss": 0.4596, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.3618262959148644, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.2273094387959747e-05, |
|
"loss": 0.4349, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.364572605561277, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 2.2244575289262394e-05, |
|
"loss": 0.4613, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.3673189152076897, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.221602199451698e-05, |
|
"loss": 0.4176, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.3700652248541023, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 2.2187434638501564e-05, |
|
"loss": 0.4799, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.372811534500515, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 2.215881335615499e-05, |
|
"loss": 0.4335, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.3755578441469276, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.2130158282576245e-05, |
|
"loss": 0.5999, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.3783041537933403, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.2101469553023807e-05, |
|
"loss": 0.4654, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.381050463439753, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.2072747302915026e-05, |
|
"loss": 0.4423, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.3837967730861656, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.2043991667825478e-05, |
|
"loss": 0.5145, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.3865430827325782, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 2.2015202783488316e-05, |
|
"loss": 0.5894, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.3892893923789909, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.1986380785793646e-05, |
|
"loss": 0.5228, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3920357020254035, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.195752581078787e-05, |
|
"loss": 0.5529, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.394782011671816, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.1928637994673053e-05, |
|
"loss": 0.5783, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.3975283213182286, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 2.1899717473806273e-05, |
|
"loss": 0.418, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.4002746309646412, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.1870764384698992e-05, |
|
"loss": 0.4945, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.4030209406110539, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 2.1841778864016396e-05, |
|
"loss": 0.496, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.4057672502574665, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.1812761048576752e-05, |
|
"loss": 0.5087, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.4085135599038792, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.1783711075350766e-05, |
|
"loss": 0.4898, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.4112598695502918, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 2.1754629081460947e-05, |
|
"loss": 0.4379, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.4140061791967045, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.172551520418093e-05, |
|
"loss": 0.4827, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.416752488843117, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.169636958093487e-05, |
|
"loss": 0.5007, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4194987984895298, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.1667192349296746e-05, |
|
"loss": 0.4651, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.4222451081359424, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.1637983646989758e-05, |
|
"loss": 0.4674, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.4249914177823548, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.1608743611885633e-05, |
|
"loss": 0.4794, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.4277377274287675, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.1579472382004015e-05, |
|
"loss": 0.5292, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.4304840370751801, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 2.1550170095511784e-05, |
|
"loss": 0.4964, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.4332303467215928, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 2.1520836890722416e-05, |
|
"loss": 0.4236, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.4359766563680054, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.149147290609533e-05, |
|
"loss": 0.4859, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.438722966014418, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 2.146207828023524e-05, |
|
"loss": 0.4659, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.4414692756608307, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.1432653151891473e-05, |
|
"loss": 0.4424, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.4442155853072434, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 2.1403197659957356e-05, |
|
"loss": 0.4515, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.446961894953656, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.137371194346953e-05, |
|
"loss": 0.4618, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.4497082046000687, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.1344196141607297e-05, |
|
"loss": 0.3928, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.4524545142464813, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.1314650393691984e-05, |
|
"loss": 0.4598, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.455200823892894, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 2.1285074839186257e-05, |
|
"loss": 0.5646, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.4579471335393066, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 2.1255469617693476e-05, |
|
"loss": 0.5984, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.4606934431857193, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 2.122583486895705e-05, |
|
"loss": 0.5419, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.463439752832132, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.119617073285974e-05, |
|
"loss": 0.5481, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.4661860624785445, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.116647734942305e-05, |
|
"loss": 0.5588, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.4689323721249572, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.113675485880652e-05, |
|
"loss": 0.5621, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.4716786817713698, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.110700340130708e-05, |
|
"loss": 0.5056, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4744249914177825, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.1077223117358395e-05, |
|
"loss": 0.5526, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.477171301064195, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.104741414753021e-05, |
|
"loss": 0.5414, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.4799176107106076, |
|
"grad_norm": 0.056640625, |
|
"learning_rate": 2.1017576632527662e-05, |
|
"loss": 0.5472, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.4826639203570202, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 2.098771071319062e-05, |
|
"loss": 0.4568, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.4854102300034329, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 2.0957816530493037e-05, |
|
"loss": 0.4277, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.4881565396498455, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 2.0927894225542282e-05, |
|
"loss": 0.4949, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.4881565396498455, |
|
"eval_loss": 0.5060501098632812, |
|
"eval_runtime": 630.5882, |
|
"eval_samples_per_second": 14.536, |
|
"eval_steps_per_second": 14.536, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.4909028492962582, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.089794393957846e-05, |
|
"loss": 0.3558, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.4936491589426708, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 2.086796581397374e-05, |
|
"loss": 0.4622, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.4963954685890835, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.083795999023173e-05, |
|
"loss": 0.5402, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.499141778235496, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 2.080792660998676e-05, |
|
"loss": 0.5271, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5018880878819085, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.0777865815003234e-05, |
|
"loss": 0.5152, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.5046343975283212, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.074777774717496e-05, |
|
"loss": 0.5099, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.5073807071747338, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 2.0717662548524482e-05, |
|
"loss": 0.4075, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.5101270168211465, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.068752036120241e-05, |
|
"loss": 0.5205, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.5128733264675591, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.0657351327486745e-05, |
|
"loss": 0.5127, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.5156196361139718, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 2.0627155589782212e-05, |
|
"loss": 0.5399, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.5183659457603844, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.0596933290619572e-05, |
|
"loss": 0.4869, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.521112255406797, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 2.0566684572654978e-05, |
|
"loss": 0.6318, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.5238585650532097, |
|
"grad_norm": 0.0576171875, |
|
"learning_rate": 2.0536409578669277e-05, |
|
"loss": 0.4729, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.5266048746996224, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 2.0506108451567347e-05, |
|
"loss": 0.5059, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.529351184346035, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.0475781334377426e-05, |
|
"loss": 0.3829, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.5320974939924477, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.044542837025042e-05, |
|
"loss": 0.4582, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.5348438036388603, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 2.0415049702459244e-05, |
|
"loss": 0.5344, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.537590113285273, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.0384645474398137e-05, |
|
"loss": 0.4508, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.5403364229316856, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 2.0354215829582005e-05, |
|
"loss": 0.4973, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.5430827325780982, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 2.03237609116457e-05, |
|
"loss": 0.5406, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.545829042224511, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 2.029328086434339e-05, |
|
"loss": 0.4956, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.5485753518709235, |
|
"grad_norm": 0.06396484375, |
|
"learning_rate": 2.0262775831547847e-05, |
|
"loss": 0.5642, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.5513216615173362, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 2.0232245957249788e-05, |
|
"loss": 0.5424, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.5540679711637488, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.020169138555718e-05, |
|
"loss": 0.4972, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5568142808101615, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 2.0171112260694576e-05, |
|
"loss": 0.4511, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.5595605904565741, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.0140508727002422e-05, |
|
"loss": 0.4669, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.5623069001029866, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 2.0109880928936375e-05, |
|
"loss": 0.5472, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.5650532097493992, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 2.007922901106663e-05, |
|
"loss": 0.5493, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.5677995193958119, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 2.0048553118077238e-05, |
|
"loss": 0.46, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.5705458290422245, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 2.0017853394765402e-05, |
|
"loss": 0.6062, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.5732921386886372, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.9987129986040825e-05, |
|
"loss": 0.5053, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.5760384483350498, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.9956383036925006e-05, |
|
"loss": 0.5205, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.5787847579814624, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.9925612692550554e-05, |
|
"loss": 0.5296, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.581531067627875, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.989481909816052e-05, |
|
"loss": 0.577, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5842773772742875, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.986400239910769e-05, |
|
"loss": 0.5867, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.5870236869207002, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.9833162740853916e-05, |
|
"loss": 0.5371, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.5897699965671128, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.980230026896942e-05, |
|
"loss": 0.4848, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.5925163062135255, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 1.977141512913211e-05, |
|
"loss": 0.4747, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.5952626158599381, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.974050746712689e-05, |
|
"loss": 0.4296, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.5980089255063508, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.9709577428844984e-05, |
|
"loss": 0.4943, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.6007552351527634, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.967862516028321e-05, |
|
"loss": 0.487, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.603501544799176, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 1.9647650807543358e-05, |
|
"loss": 0.5275, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.6062478544455887, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.961665451683143e-05, |
|
"loss": 0.557, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.6089941640920014, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 1.9585636434456988e-05, |
|
"loss": 0.4689, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.611740473738414, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.9554596706832457e-05, |
|
"loss": 0.5351, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.6144867833848267, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 1.952353548047243e-05, |
|
"loss": 0.5714, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.6172330930312393, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.9492452901992987e-05, |
|
"loss": 0.5468, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.619979402677652, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 1.946134911811099e-05, |
|
"loss": 0.5812, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.6227257123240646, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.9430224275643388e-05, |
|
"loss": 0.5367, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.6254720219704772, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.9399078521506546e-05, |
|
"loss": 0.5746, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.62821833161689, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 1.9367912002715524e-05, |
|
"loss": 0.4458, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.6309646412633025, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 1.93367248663834e-05, |
|
"loss": 0.4413, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.6337109509097152, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 1.9305517259720573e-05, |
|
"loss": 0.5666, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.6364572605561278, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.9274289330034068e-05, |
|
"loss": 0.5282, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6392035702025405, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.924304122472683e-05, |
|
"loss": 0.5065, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.6419498798489531, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.9211773091297057e-05, |
|
"loss": 0.5519, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.6446961894953656, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.9180485077337462e-05, |
|
"loss": 0.5044, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.6474424991417782, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 1.9149177330534614e-05, |
|
"loss": 0.4895, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.6501888087881909, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.9117849998668212e-05, |
|
"loss": 0.4553, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.6529351184346035, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.9086503229610418e-05, |
|
"loss": 0.5583, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.6556814280810161, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 1.905513717132513e-05, |
|
"loss": 0.3757, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.6584277377274288, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.90237519718673e-05, |
|
"loss": 0.5956, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.6611740473738414, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 1.899234777938222e-05, |
|
"loss": 1.1236, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.6639203570202539, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.8960924742104856e-05, |
|
"loss": 0.5466, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.892948300835911e-05, |
|
"loss": 0.4874, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.6694129763130792, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.889802272655713e-05, |
|
"loss": 0.5116, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.6721592859594918, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.8866544045198634e-05, |
|
"loss": 0.587, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.6749055956059045, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.8835047112870163e-05, |
|
"loss": 0.4174, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.677651905252317, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 1.880353207824444e-05, |
|
"loss": 0.4023, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.6803982148987298, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.8771999090079613e-05, |
|
"loss": 0.5134, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.6831445245451424, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.8740448297218575e-05, |
|
"loss": 0.4694, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.685890834191555, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.8708879848588268e-05, |
|
"loss": 0.5185, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.6886371438379677, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.8677293893198976e-05, |
|
"loss": 0.5077, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.6913834534843804, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.864569058014361e-05, |
|
"loss": 0.4517, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.694129763130793, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 1.8614070058597014e-05, |
|
"loss": 0.4703, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.6968760727772056, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.8582432477815268e-05, |
|
"loss": 0.5061, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.6996223824236183, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.855077798713497e-05, |
|
"loss": 0.5413, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.702368692070031, |
|
"grad_norm": 0.05615234375, |
|
"learning_rate": 1.8519106735972535e-05, |
|
"loss": 0.4586, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.7051150017164436, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.84874188738235e-05, |
|
"loss": 0.5022, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.7078613113628562, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 1.8455714550261793e-05, |
|
"loss": 0.4945, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.7106076210092689, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.8423993914939063e-05, |
|
"loss": 0.5806, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.7133539306556815, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.8392257117583944e-05, |
|
"loss": 0.462, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.7161002403020942, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.836050430800135e-05, |
|
"loss": 0.4944, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.7188465499485068, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 1.83287356360718e-05, |
|
"loss": 1.1722, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7215928595949195, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.8296951251750667e-05, |
|
"loss": 0.3718, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.7243391692413321, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.8265151305067486e-05, |
|
"loss": 0.484, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.7270854788877446, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.8233335946125275e-05, |
|
"loss": 0.4783, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.7298317885341572, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.8201505325099782e-05, |
|
"loss": 0.5684, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.7325780981805698, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 1.8169659592238797e-05, |
|
"loss": 0.4518, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.7353244078269825, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.813779889786144e-05, |
|
"loss": 0.4535, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.7380707174733951, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.8105923392357464e-05, |
|
"loss": 0.503, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.7380707174733951, |
|
"eval_loss": 0.5051947832107544, |
|
"eval_runtime": 630.1537, |
|
"eval_samples_per_second": 14.546, |
|
"eval_steps_per_second": 14.546, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.7408170271198078, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.807403322618653e-05, |
|
"loss": 0.4961, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.7435633367662204, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 1.8042128549877483e-05, |
|
"loss": 0.519, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.7463096464126329, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.8010209514027687e-05, |
|
"loss": 0.4011, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7490559560590455, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.7978276269302275e-05, |
|
"loss": 0.3935, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.7518022657054582, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.794632896643343e-05, |
|
"loss": 0.6534, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.7545485753518708, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.7914367756219725e-05, |
|
"loss": 0.5715, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.7572948849982835, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 1.7882392789525358e-05, |
|
"loss": 0.5439, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.760041194644696, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.7850404217279467e-05, |
|
"loss": 0.5277, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.7627875042911088, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.781840219047541e-05, |
|
"loss": 0.586, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.7655338139375214, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.7786386860170054e-05, |
|
"loss": 0.5291, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.768280123583934, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.775435837748306e-05, |
|
"loss": 0.3863, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.7710264332303467, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.7722316893596176e-05, |
|
"loss": 0.5247, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.7737727428767593, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.7690262559752516e-05, |
|
"loss": 0.4046, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.776519052523172, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.7658195527255847e-05, |
|
"loss": 0.4744, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.7792653621695846, |
|
"grad_norm": 0.0625, |
|
"learning_rate": 1.7626115947469877e-05, |
|
"loss": 0.424, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.7820116718159973, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.759402397181754e-05, |
|
"loss": 0.4644, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.78475798146241, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.7561919751780278e-05, |
|
"loss": 0.6509, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.7875042911088226, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.7529803438897346e-05, |
|
"loss": 0.4544, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.7902506007552352, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.7497675184765064e-05, |
|
"loss": 0.4991, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.7929969104016479, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.746553514103611e-05, |
|
"loss": 0.5494, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.7957432200480605, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.743338345941883e-05, |
|
"loss": 0.4772, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.7984895296944732, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.74012202916765e-05, |
|
"loss": 0.5995, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.8012358393408858, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.7369045789626603e-05, |
|
"loss": 0.5156, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8039821489872985, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 1.7336860105140134e-05, |
|
"loss": 0.3329, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.806728458633711, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 1.730466339014086e-05, |
|
"loss": 0.4797, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.8094747682801235, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 1.7272455796604622e-05, |
|
"loss": 0.4494, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.8122210779265362, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.7240237476558615e-05, |
|
"loss": 0.5881, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.8149673875729488, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.7208008582080652e-05, |
|
"loss": 0.451, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.8177136972193615, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 1.7175769265298472e-05, |
|
"loss": 0.3846, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.8204600068657741, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.7143519678389004e-05, |
|
"loss": 0.4766, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.8232063165121868, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.7111259973577655e-05, |
|
"loss": 0.4932, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.8259526261585994, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.7078990303137584e-05, |
|
"loss": 0.4978, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.8286989358050119, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.7046710819389012e-05, |
|
"loss": 0.5164, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8314452454514245, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.7014421674698458e-05, |
|
"loss": 0.5542, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.8341915550978372, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.6982123021478046e-05, |
|
"loss": 0.3729, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.8369378647442498, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.6949815012184795e-05, |
|
"loss": 0.4723, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.8396841743906625, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.6917497799319876e-05, |
|
"loss": 0.5643, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.842430484037075, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 1.6885171535427913e-05, |
|
"loss": 0.4695, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.8451767936834877, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.685283637309623e-05, |
|
"loss": 0.4316, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.8479231033299004, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.6820492464954187e-05, |
|
"loss": 0.4624, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.850669412976313, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.67881399636724e-05, |
|
"loss": 0.4515, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.8534157226227257, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.6755779021962056e-05, |
|
"loss": 0.5498, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.8561620322691383, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 1.6723409792574185e-05, |
|
"loss": 0.4184, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.858908341915551, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.6691032428298934e-05, |
|
"loss": 0.437, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.8616546515619636, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.665864708196485e-05, |
|
"loss": 0.5498, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.8644009612083763, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.6626253906438148e-05, |
|
"loss": 0.4403, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.867147270854789, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.6593853054622016e-05, |
|
"loss": 0.5116, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.8698935805012016, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.6561444679455858e-05, |
|
"loss": 0.4179, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.8726398901476142, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.6529028933914604e-05, |
|
"loss": 0.4291, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.8753861997940269, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.649660597100797e-05, |
|
"loss": 0.4856, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.8781325094404395, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.646417594377973e-05, |
|
"loss": 0.5419, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.8808788190868522, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.6431739005307014e-05, |
|
"loss": 0.4287, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.8836251287332648, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 1.6399295308699572e-05, |
|
"loss": 0.4848, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.8863714383796775, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.636684500709905e-05, |
|
"loss": 0.3635, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.88911774802609, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.6334388253678285e-05, |
|
"loss": 0.5319, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.8918640576725025, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.6301925201640542e-05, |
|
"loss": 0.5852, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.8946103673189152, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 1.6269456004218844e-05, |
|
"loss": 0.5184, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.8973566769653278, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.6236980814675204e-05, |
|
"loss": 0.4528, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.9001029866117405, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.620449978629993e-05, |
|
"loss": 0.4608, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.9028492962581531, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.617201307241088e-05, |
|
"loss": 0.5007, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.9055956059045658, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.6139520826352765e-05, |
|
"loss": 0.5226, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.9083419155509782, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.6107023201496378e-05, |
|
"loss": 0.4345, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.9110882251973909, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 1.6074520351237947e-05, |
|
"loss": 0.4386, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9138345348438035, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.6042012428998325e-05, |
|
"loss": 0.4791, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.9165808444902162, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.6009499588222325e-05, |
|
"loss": 0.3982, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.9193271541366288, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.597698198237797e-05, |
|
"loss": 0.3487, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.9220734637830414, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.5944459764955784e-05, |
|
"loss": 0.3082, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.924819773429454, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.5911933089468048e-05, |
|
"loss": 0.4835, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.9275660830758667, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.5879402109448093e-05, |
|
"loss": 0.503, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.9303123927222794, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.584686697844956e-05, |
|
"loss": 0.5597, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.933058702368692, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.5814327850045697e-05, |
|
"loss": 0.5074, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.9358050120151047, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.5781784877828607e-05, |
|
"loss": 0.6022, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.9385513216615173, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.5749238215408548e-05, |
|
"loss": 0.5197, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.94129763130793, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.571668801641319e-05, |
|
"loss": 0.5147, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.9440439409543426, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.5684134434486893e-05, |
|
"loss": 0.5506, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.9467902506007553, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.565157762329e-05, |
|
"loss": 0.4598, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.949536560247168, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.5619017736498076e-05, |
|
"loss": 0.5802, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.9522828698935806, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.5586454927801223e-05, |
|
"loss": 0.567, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.9550291795399932, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 1.555388935090332e-05, |
|
"loss": 0.5956, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.9577754891864059, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 1.5521321159521326e-05, |
|
"loss": 0.4019, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.9605217988328185, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.548875050738453e-05, |
|
"loss": 0.4996, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.9632681084792312, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 1.545617754823384e-05, |
|
"loss": 0.4999, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.9660144181256438, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.5423602435821055e-05, |
|
"loss": 0.6049, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9687607277720565, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.5391025323908134e-05, |
|
"loss": 0.4799, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.9715070374184689, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 1.5358446366266483e-05, |
|
"loss": 0.4836, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.9742533470648815, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 1.532586571667621e-05, |
|
"loss": 1.1204, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.9769996567112942, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.5293283528925412e-05, |
|
"loss": 0.4647, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.9797459663577068, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.5260699956809456e-05, |
|
"loss": 0.4984, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.9824922760041195, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 1.522811515413023e-05, |
|
"loss": 0.4019, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.9852385856505321, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.5195529274695436e-05, |
|
"loss": 0.4028, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.9879848952969448, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.5162942472317858e-05, |
|
"loss": 0.5023, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.9879848952969448, |
|
"eval_loss": 0.5045637488365173, |
|
"eval_runtime": 616.8097, |
|
"eval_samples_per_second": 14.86, |
|
"eval_steps_per_second": 14.86, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.9907312049433572, |
|
"grad_norm": 0.056884765625, |
|
"learning_rate": 1.5130354900814643e-05, |
|
"loss": 0.4917, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.9934775145897699, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 1.5097766714006553e-05, |
|
"loss": 0.4892, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9962238242361825, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.5065178065717274e-05, |
|
"loss": 0.4261, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.9989701338825951, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.5032589109772655e-05, |
|
"loss": 0.5345, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.001716443529008, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.5285, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 2.0044627531754204, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.4967410890227347e-05, |
|
"loss": 0.4216, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 2.007209062821833, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.4934821934282728e-05, |
|
"loss": 0.5005, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.0099553724682457, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.4902233285993447e-05, |
|
"loss": 0.495, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 2.0127016821146584, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.4869645099185361e-05, |
|
"loss": 0.4652, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 2.0013731548232063, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 1.4837057527682142e-05, |
|
"loss": 0.621, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.004119464469619, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 1.4804470725304567e-05, |
|
"loss": 0.353, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 2.0068657741160316, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.4771884845869772e-05, |
|
"loss": 0.431, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.0096120837624443, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.4739300043190547e-05, |
|
"loss": 0.5136, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.012358393408857, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 1.470671647107459e-05, |
|
"loss": 0.3657, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 2.0151047030552696, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.4674134283323792e-05, |
|
"loss": 0.5771, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 2.017851012701682, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 1.4641553633733519e-05, |
|
"loss": 0.3684, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.020597322348095, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.460897467609187e-05, |
|
"loss": 0.5238, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.0233436319945075, |
|
"grad_norm": 0.0654296875, |
|
"learning_rate": 1.4576397564178951e-05, |
|
"loss": 0.451, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 2.02608994164092, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.4543822451766166e-05, |
|
"loss": 0.5708, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 2.028836251287333, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.4511249492615477e-05, |
|
"loss": 0.6172, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 2.0315825609337455, |
|
"grad_norm": 0.056884765625, |
|
"learning_rate": 1.447867884047868e-05, |
|
"loss": 0.6446, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 2.034328870580158, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.4446110649096683e-05, |
|
"loss": 0.5011, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0370751802265707, |
|
"grad_norm": 0.054443359375, |
|
"learning_rate": 1.4413545072198783e-05, |
|
"loss": 0.4862, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 2.0398214898729834, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 1.438098226350193e-05, |
|
"loss": 0.4955, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 2.0425677995193956, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.4348422376710009e-05, |
|
"loss": 0.3889, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.0453141091658082, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 1.4315865565513111e-05, |
|
"loss": 1.1553, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 2.048060418812221, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.4283311983586818e-05, |
|
"loss": 0.5192, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.0508067284586335, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.4250761784591451e-05, |
|
"loss": 0.4926, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.053553038105046, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.4218215122171392e-05, |
|
"loss": 0.5362, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 2.056299347751459, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 1.4185672149954304e-05, |
|
"loss": 0.4926, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 2.0590456573978715, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.4153133021550438e-05, |
|
"loss": 0.6137, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 2.061791967044284, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.4120597890551908e-05, |
|
"loss": 0.4648, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.064538276690697, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.4088066910531951e-05, |
|
"loss": 0.6486, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 2.0672845863371094, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 1.4055540235044213e-05, |
|
"loss": 0.4291, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 2.070030895983522, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.402301801762203e-05, |
|
"loss": 0.515, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 2.0727772056299347, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 1.3990500411777677e-05, |
|
"loss": 0.6079, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 2.0755235152763474, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 1.3957987571001676e-05, |
|
"loss": 0.5589, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.07826982492276, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.3925479648762055e-05, |
|
"loss": 0.6439, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 2.0810161345691727, |
|
"grad_norm": 0.053955078125, |
|
"learning_rate": 1.3892976798503621e-05, |
|
"loss": 0.4723, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 2.0837624442155853, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.3860479173647241e-05, |
|
"loss": 0.5328, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 2.086508753861998, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.3827986927589118e-05, |
|
"loss": 0.5182, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 2.0892550635084106, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.3795500213700072e-05, |
|
"loss": 0.4433, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.0920013731548233, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.3763019185324797e-05, |
|
"loss": 0.483, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 2.094747682801236, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.3730543995781158e-05, |
|
"loss": 0.4826, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 2.0974939924476486, |
|
"grad_norm": 0.054931640625, |
|
"learning_rate": 1.3698074798359458e-05, |
|
"loss": 0.5313, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 2.100240302094061, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.3665611746321718e-05, |
|
"loss": 0.4303, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 2.102986611740474, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 1.363315499290095e-05, |
|
"loss": 0.5252, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.1057329213868865, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.360070469130043e-05, |
|
"loss": 0.4501, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 2.108479231033299, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.3568260994692988e-05, |
|
"loss": 0.4423, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.111225540679712, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.3535824056220273e-05, |
|
"loss": 0.5341, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 2.1139718503261244, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.3503394028992032e-05, |
|
"loss": 0.4019, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 2.116718159972537, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 1.3470971066085395e-05, |
|
"loss": 0.5329, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.1194644696189497, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.3438555320544143e-05, |
|
"loss": 0.5412, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 2.1222107792653624, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.3406146945377987e-05, |
|
"loss": 0.4902, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 2.1249570889117746, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.3373746093561855e-05, |
|
"loss": 0.6356, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 2.1277033985581872, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.3341352918035156e-05, |
|
"loss": 0.4674, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 2.1304497082046, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.330896757170107e-05, |
|
"loss": 0.5155, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.1331960178510125, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 1.327659020742582e-05, |
|
"loss": 0.5342, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.135942327497425, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.3244220978037945e-05, |
|
"loss": 0.5219, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 2.138688637143838, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.3211860036327604e-05, |
|
"loss": 0.5404, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 2.1414349467902505, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 1.3179507535045819e-05, |
|
"loss": 0.4683, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 2.144181256436663, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.3147163626903774e-05, |
|
"loss": 0.4784, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1469275660830758, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.3114828464572096e-05, |
|
"loss": 0.5399, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 2.1496738757294884, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.3082502200680128e-05, |
|
"loss": 0.425, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.152420185375901, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.305018498781521e-05, |
|
"loss": 0.4779, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 2.1551664950223137, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.301787697852196e-05, |
|
"loss": 0.5025, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 2.1579128046687264, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.298557832530155e-05, |
|
"loss": 0.5144, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.160659114315139, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.2953289180610994e-05, |
|
"loss": 0.4723, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 2.1634054239615517, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.2921009696862419e-05, |
|
"loss": 0.5309, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 2.1661517336079643, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 1.2888740026422354e-05, |
|
"loss": 0.3767, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.168898043254377, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.2856480321611004e-05, |
|
"loss": 0.559, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 2.1716443529007896, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.2824230734701535e-05, |
|
"loss": 0.565, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1743906625472023, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.2791991417919347e-05, |
|
"loss": 0.5005, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 2.177136972193615, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.2759762523441386e-05, |
|
"loss": 0.4312, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 2.1798832818400276, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.2727544203395377e-05, |
|
"loss": 0.5526, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 2.18262959148644, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.269533660985914e-05, |
|
"loss": 0.4463, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.185375901132853, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.2663139894859867e-05, |
|
"loss": 0.4219, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.1881222107792655, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.2630954210373396e-05, |
|
"loss": 0.3865, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 2.190868520425678, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.2598779708323499e-05, |
|
"loss": 0.5792, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.193614830072091, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.2566616540581168e-05, |
|
"loss": 0.462, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 2.1963611397185034, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.2534464858963892e-05, |
|
"loss": 0.4869, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 2.199107449364916, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.2502324815234942e-05, |
|
"loss": 0.5559, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2018537590113283, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 1.2470196561102655e-05, |
|
"loss": 0.3752, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 2.204600068657741, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.2438080248219723e-05, |
|
"loss": 0.5054, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 2.2073463783041536, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.2405976028182464e-05, |
|
"loss": 0.3525, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 2.2100926879505662, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.2373884052530127e-05, |
|
"loss": 0.4951, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 2.212838997596979, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.2341804472744157e-05, |
|
"loss": 0.4484, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.2155853072433915, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.2309737440247486e-05, |
|
"loss": 0.5412, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.218331616889804, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 1.2277683106403826e-05, |
|
"loss": 0.6162, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 2.221077926536217, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.2245641622516943e-05, |
|
"loss": 0.4606, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 2.2238242361826295, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 1.2213613139829949e-05, |
|
"loss": 0.3737, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.2238242361826295, |
|
"eval_loss": 0.504136323928833, |
|
"eval_runtime": 615.1173, |
|
"eval_samples_per_second": 14.901, |
|
"eval_steps_per_second": 14.901, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.226570545829042, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.2181597809524594e-05, |
|
"loss": 0.3953, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2293168554754548, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.2149595782720537e-05, |
|
"loss": 0.4174, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 2.2320631651218674, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.2117607210474645e-05, |
|
"loss": 0.5269, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.23480947476828, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.2085632243780278e-05, |
|
"loss": 0.4668, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 2.2375557844146927, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.205367103356657e-05, |
|
"loss": 0.4565, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 2.2403020940611054, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.202172373069773e-05, |
|
"loss": 0.4427, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.243048403707518, |
|
"grad_norm": 0.06103515625, |
|
"learning_rate": 1.1989790485972312e-05, |
|
"loss": 0.4414, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 2.2457947133539307, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.1957871450122516e-05, |
|
"loss": 0.5547, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 2.2485410230003433, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.1925966773813476e-05, |
|
"loss": 0.5273, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.251287332646756, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.1894076607642537e-05, |
|
"loss": 0.5066, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 2.2540336422931686, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.1862201102138562e-05, |
|
"loss": 0.5397, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.2567799519395813, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.1830340407761207e-05, |
|
"loss": 0.4944, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 2.259526261585994, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 1.1798494674900222e-05, |
|
"loss": 0.4056, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 2.2622725712324065, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.1766664053874726e-05, |
|
"loss": 0.5453, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 2.265018880878819, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.1734848694932514e-05, |
|
"loss": 0.456, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 2.267765190525232, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.170304874824934e-05, |
|
"loss": 0.5696, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.2705115001716445, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.1671264363928205e-05, |
|
"loss": 0.4873, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 2.273257809818057, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.1639495691998653e-05, |
|
"loss": 0.5142, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 2.27600411946447, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.1607742882416064e-05, |
|
"loss": 0.4905, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 2.2787504291108824, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.1576006085060941e-05, |
|
"loss": 0.4352, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 2.281496738757295, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 1.1544285449738211e-05, |
|
"loss": 0.5675, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.2842430484037077, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.1512581126176508e-05, |
|
"loss": 0.4553, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 2.2869893580501204, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 1.1480893264027469e-05, |
|
"loss": 0.5391, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 2.289735667696533, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.1449222012865037e-05, |
|
"loss": 0.5003, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 2.2924819773429452, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 1.1417567522184738e-05, |
|
"loss": 0.5302, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 2.295228286989358, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 1.1385929941402993e-05, |
|
"loss": 0.571, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.2979745966357705, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.1354309419856392e-05, |
|
"loss": 0.5726, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 2.300720906282183, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.1322706106801025e-05, |
|
"loss": 0.5884, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 2.303467215928596, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.1291120151411731e-05, |
|
"loss": 0.5926, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 2.3062135255750085, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.1259551702781426e-05, |
|
"loss": 0.4487, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 2.308959835221421, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 1.1228000909920388e-05, |
|
"loss": 0.3924, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.3117061448678338, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 1.119646792175556e-05, |
|
"loss": 0.4218, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 2.3144524545142464, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 1.1164952887129836e-05, |
|
"loss": 1.1613, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 2.317198764160659, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.1133455954801372e-05, |
|
"loss": 0.4224, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 2.3199450738070717, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.1101977273442873e-05, |
|
"loss": 0.4405, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 2.3226913834534844, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 1.1070516991640894e-05, |
|
"loss": 0.4972, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.325437693099897, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.1039075257895146e-05, |
|
"loss": 0.5403, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 2.3281840027463097, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.1007652220617778e-05, |
|
"loss": 0.5295, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 2.3309303123927223, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.0976248028132705e-05, |
|
"loss": 0.5899, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 2.333676622039135, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 1.0944862828674872e-05, |
|
"loss": 0.4907, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 2.3364229316855476, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.0913496770389585e-05, |
|
"loss": 0.5142, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3391692413319602, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 1.088215000133179e-05, |
|
"loss": 0.5103, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 2.341915550978373, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.0850822669465392e-05, |
|
"loss": 0.5814, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 2.3446618606247855, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 1.081951492266254e-05, |
|
"loss": 0.5544, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 2.347408170271198, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 1.0788226908702945e-05, |
|
"loss": 1.1435, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 2.350154479917611, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.0756958775273169e-05, |
|
"loss": 0.4895, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.3529007895640235, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 1.0725710669965936e-05, |
|
"loss": 0.5886, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 2.355647099210436, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 1.0694482740279428e-05, |
|
"loss": 0.4469, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 2.358393408856849, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 1.0663275133616603e-05, |
|
"loss": 0.4049, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 2.3611397185032614, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 1.063208799728448e-05, |
|
"loss": 0.3659, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 2.3638860281496736, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 1.0600921478493455e-05, |
|
"loss": 0.5023, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3666323377960863, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.0569775724356611e-05, |
|
"loss": 0.4065, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 2.369378647442499, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.0538650881889013e-05, |
|
"loss": 0.4033, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 2.3721249570889116, |
|
"grad_norm": 0.039794921875, |
|
"learning_rate": 1.0507547098007015e-05, |
|
"loss": 0.4139, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 2.3748712667353242, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.0476464519527574e-05, |
|
"loss": 0.5499, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 2.377617576381737, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.0445403293167547e-05, |
|
"loss": 0.4546, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.3803638860281495, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 1.0414363565543016e-05, |
|
"loss": 0.4263, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 2.383110195674562, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 1.0383345483168573e-05, |
|
"loss": 0.446, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 2.385856505320975, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 1.0352349192456643e-05, |
|
"loss": 0.445, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 2.3886028149673875, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.032137483971679e-05, |
|
"loss": 0.5788, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 2.3913491246138, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 1.0290422571155024e-05, |
|
"loss": 0.4657, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.3940954342602128, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 1.0259492532873113e-05, |
|
"loss": 0.5917, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 2.3968417439066254, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 1.0228584870867896e-05, |
|
"loss": 0.4403, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 2.399588053553038, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.0197699731030584e-05, |
|
"loss": 0.4274, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 2.4023343631994507, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.016683725914609e-05, |
|
"loss": 0.5997, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 2.4050806728458634, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.0135997600892316e-05, |
|
"loss": 0.4612, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.407826982492276, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 1.0105180901839487e-05, |
|
"loss": 0.4969, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 2.4105732921386887, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.0074387307449452e-05, |
|
"loss": 0.4989, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 2.4133196017851013, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.0043616963075001e-05, |
|
"loss": 0.4879, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 2.416065911431514, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.0012870013959182e-05, |
|
"loss": 0.4521, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 2.4188122210779266, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 9.982146605234604e-06, |
|
"loss": 0.6295, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.4215585307243392, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 9.95144688192277e-06, |
|
"loss": 0.4266, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 2.424304840370752, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 9.920770988933366e-06, |
|
"loss": 0.3733, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 2.4270511500171645, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 9.890119071063624e-06, |
|
"loss": 0.5311, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 2.429797459663577, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 9.859491272997579e-06, |
|
"loss": 0.3879, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 2.43254376930999, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 9.828887739305423e-06, |
|
"loss": 0.4734, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 2.4352900789564025, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 9.798308614442822e-06, |
|
"loss": 0.4965, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 2.438036388602815, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 9.767754042750214e-06, |
|
"loss": 0.502, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 2.4407826982492278, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 9.737224168452154e-06, |
|
"loss": 0.4924, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 2.4435290078956404, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 9.706719135656613e-06, |
|
"loss": 0.5116, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 2.446275317542053, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 9.676239088354302e-06, |
|
"loss": 0.5165, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4490216271884657, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 9.645784170417996e-06, |
|
"loss": 0.4293, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 2.4517679368348784, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 9.615354525601859e-06, |
|
"loss": 0.5352, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 2.4545142464812906, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 9.584950297540759e-06, |
|
"loss": 0.474, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 2.457260556127703, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 9.554571629749585e-06, |
|
"loss": 0.495, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 2.460006865774116, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 9.524218665622578e-06, |
|
"loss": 0.4432, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 2.4627531754205285, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 9.493891548432654e-06, |
|
"loss": 0.4222, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 2.465499485066941, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 9.463590421330727e-06, |
|
"loss": 0.5396, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 2.468245794713354, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 9.433315427345028e-06, |
|
"loss": 0.4899, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 2.4709921043597665, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 9.403066709380432e-06, |
|
"loss": 0.6021, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 2.473738414006179, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 9.372844410217792e-06, |
|
"loss": 0.505, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.473738414006179, |
|
"eval_loss": 0.5039077997207642, |
|
"eval_runtime": 617.6957, |
|
"eval_samples_per_second": 14.839, |
|
"eval_steps_per_second": 14.839, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.4764847236525918, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 9.342648672513254e-06, |
|
"loss": 0.5927, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 2.4792310332990044, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 9.31247963879759e-06, |
|
"loss": 0.4284, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 2.481977342945417, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 9.28233745147552e-06, |
|
"loss": 0.4401, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 2.4847236525918297, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 9.252222252825043e-06, |
|
"loss": 0.5268, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 2.4874699622382423, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 9.222134184996769e-06, |
|
"loss": 0.5029, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.490216271884655, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 9.19207339001324e-06, |
|
"loss": 0.4947, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 2.4929625815310676, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 9.16204000976827e-06, |
|
"loss": 0.5041, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 2.4957088911774803, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 9.13203418602626e-06, |
|
"loss": 0.4975, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 2.498455200823893, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 9.102056060421545e-06, |
|
"loss": 0.4445, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 2.5012015104703056, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 9.07210577445772e-06, |
|
"loss": 0.4441, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.5039478201167182, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 9.042183469506964e-06, |
|
"loss": 0.5294, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 2.506694129763131, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 9.012289286809384e-06, |
|
"loss": 0.5546, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 2.5094404394095435, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 8.982423367472344e-06, |
|
"loss": 1.2097, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 2.512186749055956, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 8.95258585246979e-06, |
|
"loss": 0.5349, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 2.514933058702369, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 8.922776882641604e-06, |
|
"loss": 0.531, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.5176793683487815, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 8.892996598692928e-06, |
|
"loss": 0.4151, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 2.5204256779951937, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 8.863245141193487e-06, |
|
"loss": 0.4963, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 2.5231719876416063, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 8.833522650576955e-06, |
|
"loss": 0.5466, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 2.525918297288019, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 8.803829267140263e-06, |
|
"loss": 0.4034, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 2.5286646069344316, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 8.774165131042957e-06, |
|
"loss": 0.4094, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5314109165808443, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 8.744530382306528e-06, |
|
"loss": 0.47, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 2.534157226227257, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 8.714925160813752e-06, |
|
"loss": 0.4783, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 2.5369035358736696, |
|
"grad_norm": 0.0390625, |
|
"learning_rate": 8.68534960630802e-06, |
|
"loss": 0.4795, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 2.539649845520082, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 8.655803858392707e-06, |
|
"loss": 0.5637, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 2.542396155166495, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 8.626288056530474e-06, |
|
"loss": 0.5958, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 2.5451424648129075, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 8.596802340042648e-06, |
|
"loss": 0.5443, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 2.54788877445932, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 8.567346848108523e-06, |
|
"loss": 0.5042, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 2.550635084105733, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 8.53792171976476e-06, |
|
"loss": 0.3745, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 2.5533813937521455, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 8.508527093904663e-06, |
|
"loss": 0.4595, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 2.556127703398558, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 8.479163109277583e-06, |
|
"loss": 0.5502, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5588740130449708, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 8.449829904488216e-06, |
|
"loss": 0.4784, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 2.5616203226913834, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 8.42052761799599e-06, |
|
"loss": 0.5084, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 2.564366632337796, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 8.391256388114367e-06, |
|
"loss": 0.4844, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 2.5671129419842087, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 8.362016353010248e-06, |
|
"loss": 0.5863, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 2.5698592516306213, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 8.332807650703255e-06, |
|
"loss": 0.453, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 2.572605561277034, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 8.303630419065136e-06, |
|
"loss": 0.6364, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 2.5753518709234466, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 8.274484795819068e-06, |
|
"loss": 0.521, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 2.5780981805698593, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 8.245370918539057e-06, |
|
"loss": 0.47, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 2.580844490216272, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 8.216288924649233e-06, |
|
"loss": 0.516, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 2.5835907998626846, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 8.187238951423254e-06, |
|
"loss": 0.4951, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.5863371095090972, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 8.158221135983606e-06, |
|
"loss": 0.4366, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 2.58908341915551, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 8.129235615301012e-06, |
|
"loss": 0.5727, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 2.5918297288019225, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 8.10028252619373e-06, |
|
"loss": 0.601, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 2.594576038448335, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 8.07136200532695e-06, |
|
"loss": 0.474, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 2.597322348094748, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 8.042474189212133e-06, |
|
"loss": 0.3888, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 2.6000686577411605, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 8.013619214206353e-06, |
|
"loss": 0.4508, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 2.602814967387573, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 7.984797216511686e-06, |
|
"loss": 0.45, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 2.6055612770339858, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 7.956008332174523e-06, |
|
"loss": 0.4348, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 2.6083075866803984, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 7.927252697084976e-06, |
|
"loss": 0.4279, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 2.611053896326811, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 7.898530446976194e-06, |
|
"loss": 0.5555, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.6138002059732237, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 7.86984171742376e-06, |
|
"loss": 0.5695, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 2.6165465156196364, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 7.841186643845009e-06, |
|
"loss": 0.4705, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 2.619292825266049, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 7.81256536149844e-06, |
|
"loss": 0.486, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 2.6220391349124617, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 7.783978005483024e-06, |
|
"loss": 0.5018, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 2.624785444558874, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 7.75542471073761e-06, |
|
"loss": 0.4491, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.6275317542052865, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 7.726905612040257e-06, |
|
"loss": 0.4566, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 2.630278063851699, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 7.698420844007624e-06, |
|
"loss": 0.5227, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 2.633024373498112, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 7.669970541094304e-06, |
|
"loss": 0.4866, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 2.6357706831445245, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 7.64155483759223e-06, |
|
"loss": 0.4499, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 2.638516992790937, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 7.613173867629991e-06, |
|
"loss": 0.9577, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.6412633024373497, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 7.584827765172254e-06, |
|
"loss": 0.51, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 2.6440096120837624, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 7.5565166640190784e-06, |
|
"loss": 0.4697, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 2.646755921730175, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 7.528240697805321e-06, |
|
"loss": 0.4789, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 2.6495022313765877, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 7.500000000000004e-06, |
|
"loss": 0.5087, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 2.6522485410230003, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 7.471794703905647e-06, |
|
"loss": 0.5238, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.654994850669413, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 7.443624942657698e-06, |
|
"loss": 0.5521, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 2.6577411603158256, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 7.415490849223844e-06, |
|
"loss": 0.4471, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 2.6604874699622383, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 7.387392556403433e-06, |
|
"loss": 0.4795, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 2.663233779608651, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 7.359330196826808e-06, |
|
"loss": 0.4769, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 2.6659800892550636, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 7.33130390295472e-06, |
|
"loss": 0.3953, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6687263989014762, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 7.303313807077658e-06, |
|
"loss": 0.5334, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 2.671472708547889, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 7.275360041315263e-06, |
|
"loss": 0.512, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 2.6742190181943015, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 7.24744273761569e-06, |
|
"loss": 0.4317, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 2.676965327840714, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 7.219562027754985e-06, |
|
"loss": 0.5105, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 2.679711637487127, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 7.191718043336447e-06, |
|
"loss": 0.4319, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.6824579471335395, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 7.163910915790047e-06, |
|
"loss": 0.4596, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 2.6852042567799517, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 7.13614077637176e-06, |
|
"loss": 0.4915, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 2.6879505664263643, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 7.108407756162988e-06, |
|
"loss": 0.4317, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 2.690696876072777, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 7.080711986069905e-06, |
|
"loss": 0.5411, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 2.6934431857191896, |
|
"grad_norm": 0.040283203125, |
|
"learning_rate": 7.053053596822872e-06, |
|
"loss": 0.3315, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.6961894953656023, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 7.025432718975787e-06, |
|
"loss": 0.417, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 2.698935805012015, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 6.997849482905506e-06, |
|
"loss": 0.5751, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 2.7016821146584276, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 6.970304018811183e-06, |
|
"loss": 0.5515, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 2.70442842430484, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 6.942796456713706e-06, |
|
"loss": 0.553, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 2.707174733951253, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 6.915326926455029e-06, |
|
"loss": 0.4753, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.7099210435976655, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 6.887895557697614e-06, |
|
"loss": 0.4289, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 2.712667353244078, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 6.860502479923769e-06, |
|
"loss": 0.4171, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 2.715413662890491, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 6.833147822435075e-06, |
|
"loss": 0.4769, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 2.7181599725369034, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 6.8058317143517615e-06, |
|
"loss": 0.4042, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 2.720906282183316, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 6.778554284612078e-06, |
|
"loss": 0.5019, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7236525918297287, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 6.751315661971731e-06, |
|
"loss": 0.4833, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 2.7236525918297287, |
|
"eval_loss": 0.5037957429885864, |
|
"eval_runtime": 619.9243, |
|
"eval_samples_per_second": 14.786, |
|
"eval_steps_per_second": 14.786, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 2.7263989014761414, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 6.724115975003217e-06, |
|
"loss": 0.4036, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 2.729145211122554, |
|
"grad_norm": 0.053955078125, |
|
"learning_rate": 6.696955352095277e-06, |
|
"loss": 0.4995, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 2.7318915207689667, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 6.6698339214522374e-06, |
|
"loss": 0.39, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 2.7346378304153793, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 6.642751811093446e-06, |
|
"loss": 0.4771, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.737384140061792, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 6.6157091488526324e-06, |
|
"loss": 0.4343, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 2.7401304497082046, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 6.588706062377344e-06, |
|
"loss": 0.4141, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 2.7428767593546173, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 6.561742679128296e-06, |
|
"loss": 0.4756, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 2.74562306900103, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 6.534819126378821e-06, |
|
"loss": 0.6022, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 2.7483693786474426, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 6.507935531214218e-06, |
|
"loss": 0.5495, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.751115688293855, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 6.4810920205312006e-06, |
|
"loss": 0.4997, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 2.753861997940268, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 6.454288721037252e-06, |
|
"loss": 0.438, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 2.7566083075866805, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 6.427525759250071e-06, |
|
"loss": 0.5343, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 2.759354617233093, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 6.400803261496933e-06, |
|
"loss": 1.0934, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 2.762100926879506, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 6.374121353914132e-06, |
|
"loss": 0.4902, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 2.7648472365259185, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 6.347480162446349e-06, |
|
"loss": 0.6164, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 2.767593546172331, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 6.320879812846093e-06, |
|
"loss": 0.3764, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 2.7703398558187438, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 6.294320430673085e-06, |
|
"loss": 0.5365, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 2.7730861654651564, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 6.267802141293657e-06, |
|
"loss": 0.4324, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 2.775832475111569, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 6.241325069880198e-06, |
|
"loss": 0.367, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7785787847579817, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 6.214889341410512e-06, |
|
"loss": 0.4586, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 2.7813250944043943, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 6.188495080667278e-06, |
|
"loss": 0.5402, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 2.784071404050807, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 6.162142412237421e-06, |
|
"loss": 0.5498, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 2.786817713697219, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 6.135831460511555e-06, |
|
"loss": 0.4409, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 2.789564023343632, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 6.109562349683366e-06, |
|
"loss": 0.4341, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.7923103329900445, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 6.083335203749059e-06, |
|
"loss": 0.6233, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 2.795056642636457, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 6.057150146506732e-06, |
|
"loss": 0.5764, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 2.79780295228287, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 6.031007301555849e-06, |
|
"loss": 0.4758, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 2.8005492619292824, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 6.004906792296584e-06, |
|
"loss": 0.4903, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 2.803295571575695, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 5.978848741929308e-06, |
|
"loss": 0.5788, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.8060418812221077, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 5.952833273453953e-06, |
|
"loss": 0.4795, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 2.8087881908685204, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 5.926860509669474e-06, |
|
"loss": 0.4128, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 2.811534500514933, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 5.900930573173232e-06, |
|
"loss": 0.5129, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 2.8142808101613457, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 5.8750435863604515e-06, |
|
"loss": 0.5751, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 2.8170271198077583, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 5.849199671423609e-06, |
|
"loss": 0.4868, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 2.819773429454171, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 5.823398950351886e-06, |
|
"loss": 0.5558, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 2.8225197391005836, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 5.797641544930561e-06, |
|
"loss": 0.497, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 2.8252660487469963, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 5.771927576740476e-06, |
|
"loss": 0.4415, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 2.828012358393409, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 5.746257167157416e-06, |
|
"loss": 0.5724, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 2.8307586680398216, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 5.72063043735158e-06, |
|
"loss": 0.5275, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.833504977686234, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 5.6950475082869685e-06, |
|
"loss": 0.4577, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 2.836251287332647, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 5.669508500720849e-06, |
|
"loss": 0.5401, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 2.8389975969790595, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 5.6440135352031695e-06, |
|
"loss": 0.4133, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 2.841743906625472, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 5.618562732075969e-06, |
|
"loss": 0.4756, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 2.844490216271885, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 5.593156211472861e-06, |
|
"loss": 0.5736, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 2.847236525918297, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 5.567794093318403e-06, |
|
"loss": 0.5078, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 2.8499828355647097, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 5.542476497327591e-06, |
|
"loss": 0.5637, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 2.8527291452111223, |
|
"grad_norm": 0.054931640625, |
|
"learning_rate": 5.517203543005242e-06, |
|
"loss": 0.4383, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 2.855475454857535, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 5.491975349645479e-06, |
|
"loss": 0.5174, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 2.8582217645039476, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 5.466792036331117e-06, |
|
"loss": 0.4554, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.8609680741503603, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 5.44165372193315e-06, |
|
"loss": 0.537, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 2.863714383796773, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 5.416560525110149e-06, |
|
"loss": 0.4111, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 2.8664606934431855, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 5.391512564307737e-06, |
|
"loss": 0.5282, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 2.869207003089598, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 5.36650995775799e-06, |
|
"loss": 0.5688, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 2.871953312736011, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 5.341552823478929e-06, |
|
"loss": 0.3545, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 2.8746996223824235, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 5.316641279273909e-06, |
|
"loss": 0.3866, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 2.877445932028836, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 5.291775442731112e-06, |
|
"loss": 0.4777, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 2.880192241675249, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 5.266955431222949e-06, |
|
"loss": 0.498, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.8829385513216614, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 5.242181361905548e-06, |
|
"loss": 0.4791, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 2.885684860968074, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 5.217453351718155e-06, |
|
"loss": 0.435, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.8884311706144867, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 5.192771517382627e-06, |
|
"loss": 0.4513, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 2.8911774802608994, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 5.168135975402854e-06, |
|
"loss": 0.5548, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 2.893923789907312, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 5.143546842064209e-06, |
|
"loss": 0.4624, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 2.8966700995537247, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 5.1190042334330185e-06, |
|
"loss": 0.5901, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 2.8994164092001373, |
|
"grad_norm": 0.054931640625, |
|
"learning_rate": 5.094508265355983e-06, |
|
"loss": 0.5007, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 2.90216271884655, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 5.070059053459672e-06, |
|
"loss": 0.3924, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 2.9049090284929626, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 5.045656713149932e-06, |
|
"loss": 0.5346, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 2.9076553381393753, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 5.021301359611387e-06, |
|
"loss": 0.4761, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 2.910401647785788, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 4.996993107806853e-06, |
|
"loss": 0.4432, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 2.9131479574322006, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 4.972732072476831e-06, |
|
"loss": 0.4404, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.915894267078613, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 4.948518368138933e-06, |
|
"loss": 0.5556, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 2.918640576725026, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 4.9243521090873745e-06, |
|
"loss": 0.523, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 2.9213868863714385, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 4.900233409392409e-06, |
|
"loss": 0.5381, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 2.924133196017851, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 4.876162382899809e-06, |
|
"loss": 0.5505, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 2.926879505664264, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 4.852139143230296e-06, |
|
"loss": 1.176, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.9296258153106765, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 4.828163803779057e-06, |
|
"loss": 0.4169, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 2.932372124957089, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 4.804236477715152e-06, |
|
"loss": 0.6101, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 2.9351184346035017, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 4.780357277981027e-06, |
|
"loss": 0.4059, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 2.9378647442499144, |
|
"grad_norm": 0.0556640625, |
|
"learning_rate": 4.7565263172919415e-06, |
|
"loss": 0.5825, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 2.940611053896327, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 4.732743708135479e-06, |
|
"loss": 0.45, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.9433573635427397, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 4.709009562770971e-06, |
|
"loss": 0.4906, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 2.9461036731891523, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 4.685323993229005e-06, |
|
"loss": 0.5843, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 2.948849982835565, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 4.661687111310865e-06, |
|
"loss": 0.4679, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 2.951596292481977, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 4.638099028588034e-06, |
|
"loss": 0.5253, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 2.95434260212839, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 4.614559856401635e-06, |
|
"loss": 0.4255, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.9570889117748025, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 4.591069705861935e-06, |
|
"loss": 0.4591, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 2.959835221421215, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 4.567628687847808e-06, |
|
"loss": 0.4433, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 2.962581531067628, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 4.544236913006199e-06, |
|
"loss": 0.4516, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 2.9653278407140404, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 4.520894491751629e-06, |
|
"loss": 0.5292, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 2.968074150360453, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 4.497601534265641e-06, |
|
"loss": 0.5397, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9708204600068657, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 4.4743581504963206e-06, |
|
"loss": 0.5584, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 2.9735667696532784, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 4.451164450157729e-06, |
|
"loss": 0.4986, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 2.9735667696532784, |
|
"eval_loss": 0.5037021636962891, |
|
"eval_runtime": 614.9978, |
|
"eval_samples_per_second": 14.904, |
|
"eval_steps_per_second": 14.904, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 2.976313079299691, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 4.428020542729436e-06, |
|
"loss": 0.5396, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 2.9790593889461037, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 4.4049265374559536e-06, |
|
"loss": 0.4538, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 2.9818056985925163, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 4.381882543346262e-06, |
|
"loss": 0.3633, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.984552008238929, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 4.358888669173264e-06, |
|
"loss": 0.5483, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 2.9872983178853416, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 4.3359450234733e-06, |
|
"loss": 0.3848, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 2.9900446275317543, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 4.3130517145456e-06, |
|
"loss": 0.6011, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 2.992790937178167, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 4.29020885045182e-06, |
|
"loss": 0.4609, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 2.9955372468245796, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 4.267416539015474e-06, |
|
"loss": 0.3615, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.998283556470992, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 4.244674887821483e-06, |
|
"loss": 0.4688, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 3.0010298661174044, |
|
"grad_norm": 0.040771484375, |
|
"learning_rate": 4.221984004215623e-06, |
|
"loss": 0.3021, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 3.003776175763817, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 4.199343995304044e-06, |
|
"loss": 0.3841, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 3.0065224854102297, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 4.176754967952749e-06, |
|
"loss": 0.4316, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 3.0092687950566424, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 4.154217028787101e-06, |
|
"loss": 0.5092, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 3.012015104703055, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 4.131730284191321e-06, |
|
"loss": 0.4633, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 3.0020597322348093, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 4.109294840307966e-06, |
|
"loss": 0.4454, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 3.004806041881222, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 4.086910803037453e-06, |
|
"loss": 0.4654, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 3.0075523515276346, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 4.064578278037542e-06, |
|
"loss": 0.4323, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 3.010298661174047, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 4.042297370722851e-06, |
|
"loss": 0.4796, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.01304497082046, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 4.0200681862643355e-06, |
|
"loss": 0.4253, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 3.0157912804668725, |
|
"grad_norm": 0.04345703125, |
|
"learning_rate": 3.9978908295888285e-06, |
|
"loss": 0.4095, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 3.018537590113285, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 3.975765405378502e-06, |
|
"loss": 0.4575, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 3.021283899759698, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 3.953692018070417e-06, |
|
"loss": 0.4556, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 3.0240302094061104, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 3.931670771855986e-06, |
|
"loss": 0.5403, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 3.026776519052523, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 3.909701770680524e-06, |
|
"loss": 0.4718, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 3.0295228286989357, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 3.887785118242722e-06, |
|
"loss": 0.4633, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 3.0322691383453484, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 3.8659209179941804e-06, |
|
"loss": 0.5703, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 3.035015447991761, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 3.844109273138914e-06, |
|
"loss": 0.5709, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 3.0377617576381737, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 3.822350286632867e-06, |
|
"loss": 0.4592, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.0405080672845863, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 3.8006440611834103e-06, |
|
"loss": 0.4843, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 3.043254376930999, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 3.7789906992488875e-06, |
|
"loss": 0.4962, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 3.0460006865774116, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 3.7573903030381003e-06, |
|
"loss": 0.5629, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 3.0487469962238243, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 3.7358429745098525e-06, |
|
"loss": 0.5103, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 3.051493305870237, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 3.7143488153724454e-06, |
|
"loss": 0.4677, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.0542396155166496, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 3.692907927083217e-06, |
|
"loss": 0.423, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 3.0569859251630622, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.6715204108480473e-06, |
|
"loss": 0.4903, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 3.059732234809475, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 3.6501863676208984e-06, |
|
"loss": 0.9496, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 3.0624785444558875, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 3.6289058981033136e-06, |
|
"loss": 0.4253, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 3.0652248541023, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 3.607679102743968e-06, |
|
"loss": 0.5686, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.067971163748713, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.586506081738181e-06, |
|
"loss": 0.5278, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 3.0707174733951255, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 3.5653869350274357e-06, |
|
"loss": 0.4348, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 3.073463783041538, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 3.5443217622989294e-06, |
|
"loss": 0.5263, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 3.0762100926879508, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 3.5233106629850736e-06, |
|
"loss": 0.5263, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 3.0789564023343634, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 3.5023537362630605e-06, |
|
"loss": 0.4807, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 3.0817027119807756, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.4814510810543553e-06, |
|
"loss": 0.6053, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 3.0844490216271883, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 3.46060279602427e-06, |
|
"loss": 0.391, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 3.087195331273601, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 3.439808979581455e-06, |
|
"loss": 0.4525, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 3.0899416409200136, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 3.4190697298774772e-06, |
|
"loss": 0.532, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 3.092687950566426, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 3.398385144806318e-06, |
|
"loss": 0.5811, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.095434260212839, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.3777553220039455e-06, |
|
"loss": 0.5059, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 3.0981805698592515, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.357180358847822e-06, |
|
"loss": 0.4974, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 3.100926879505664, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 3.3366603524564736e-06, |
|
"loss": 0.52, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 3.103673189152077, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 3.316195399689007e-06, |
|
"loss": 0.4295, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 3.1064194987984894, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 3.2957855971446737e-06, |
|
"loss": 0.4381, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 3.109165808444902, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.2754310411623888e-06, |
|
"loss": 0.4879, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 3.1119121180913147, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 3.255131827820311e-06, |
|
"loss": 0.4444, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 3.1146584277377274, |
|
"grad_norm": 0.043212890625, |
|
"learning_rate": 3.2348880529353484e-06, |
|
"loss": 0.4969, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 3.11740473738414, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 3.21469981206274e-06, |
|
"loss": 0.5399, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 3.1201510470305527, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.194567200495593e-06, |
|
"loss": 0.3839, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.1228973566769653, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 3.1744903132644197e-06, |
|
"loss": 0.5803, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 3.125643666323378, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.1544692451367147e-06, |
|
"loss": 0.5422, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 3.1283899759697906, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 3.1345040906164787e-06, |
|
"loss": 0.4212, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 3.1311362856162033, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 3.1145949439438054e-06, |
|
"loss": 0.4019, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 3.133882595262616, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 3.094741899094399e-06, |
|
"loss": 0.3445, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 3.1366289049090286, |
|
"grad_norm": 0.04248046875, |
|
"learning_rate": 3.0749450497791693e-06, |
|
"loss": 0.518, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 3.139375214555441, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 3.055204489443753e-06, |
|
"loss": 0.4594, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 3.142121524201854, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 3.0355203112681063e-06, |
|
"loss": 0.5042, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 3.1448678338482665, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 3.0158926081660338e-06, |
|
"loss": 0.4187, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 3.147614143494679, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.9963214727847773e-06, |
|
"loss": 0.5166, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.150360453141092, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 2.976806997504555e-06, |
|
"loss": 0.4656, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 3.1531067627875045, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.9573492744381475e-06, |
|
"loss": 0.4555, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 3.155853072433917, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 2.9379483954304386e-06, |
|
"loss": 0.5357, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 3.1585993820803298, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 2.9186044520580145e-06, |
|
"loss": 1.2069, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 3.1613456917267424, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.8993175356286934e-06, |
|
"loss": 0.5469, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 3.164092001373155, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.8800877371811245e-06, |
|
"loss": 0.5138, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 3.1668383110195673, |
|
"grad_norm": 0.042724609375, |
|
"learning_rate": 2.8609151474843377e-06, |
|
"loss": 0.486, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 3.16958462066598, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.841799857037337e-06, |
|
"loss": 0.5253, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 3.1723309303123925, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 2.822741956068648e-06, |
|
"loss": 0.4386, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 3.175077239958805, |
|
"grad_norm": 0.04541015625, |
|
"learning_rate": 2.803741534535916e-06, |
|
"loss": 0.4587, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.177823549605218, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.7847986821254605e-06, |
|
"loss": 0.445, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 3.1805698592516305, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.7659134882518715e-06, |
|
"loss": 0.532, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 3.183316168898043, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 2.747086042057566e-06, |
|
"loss": 0.5697, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 3.186062478544456, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.7283164324123904e-06, |
|
"loss": 0.5351, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 3.1888087881908684, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.7096047479131848e-06, |
|
"loss": 0.5825, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 3.191555097837281, |
|
"grad_norm": 0.044189453125, |
|
"learning_rate": 2.6909510768833606e-06, |
|
"loss": 0.4346, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 3.1943014074836937, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 2.6723555073725125e-06, |
|
"loss": 0.4932, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 3.1970477171301064, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 2.653818127155959e-06, |
|
"loss": 0.5453, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 3.199794026776519, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 2.635339023734374e-06, |
|
"loss": 0.5351, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 3.2025403364229317, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.6169182843333334e-06, |
|
"loss": 0.4668, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.2052866460693443, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 2.5985559959029347e-06, |
|
"loss": 0.4176, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 3.208032955715757, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 2.5802522451173627e-06, |
|
"loss": 0.4717, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 3.2107792653621696, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 2.562007118374504e-06, |
|
"loss": 0.5227, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 3.2107792653621696, |
|
"eval_loss": 0.5037118196487427, |
|
"eval_runtime": 639.2982, |
|
"eval_samples_per_second": 14.338, |
|
"eval_steps_per_second": 14.338, |
|
"step": 1183 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1456, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 91, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3266302743887217e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|