{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 201,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014925373134328358,
      "grad_norm": 1.690108060836792,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 2.0929,
      "step": 1
    },
    {
      "epoch": 0.029850746268656716,
      "grad_norm": 1.8146634101867676,
      "learning_rate": 6.666666666666667e-05,
      "loss": 2.2396,
      "step": 2
    },
    {
      "epoch": 0.04477611940298507,
      "grad_norm": 1.9881230592727661,
      "learning_rate": 0.0001,
      "loss": 2.2041,
      "step": 3
    },
    {
      "epoch": 0.05970149253731343,
      "grad_norm": 2.016158103942871,
      "learning_rate": 0.00013333333333333334,
      "loss": 2.176,
      "step": 4
    },
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 1.7050724029541016,
      "learning_rate": 0.0001666666666666667,
      "loss": 2.0313,
      "step": 5
    },
    {
      "epoch": 0.08955223880597014,
      "grad_norm": 1.3839370012283325,
      "learning_rate": 0.0002,
      "loss": 1.818,
      "step": 6
    },
    {
      "epoch": 0.1044776119402985,
      "grad_norm": 1.2064529657363892,
      "learning_rate": 0.00019998832024742372,
      "loss": 1.6223,
      "step": 7
    },
    {
      "epoch": 0.11940298507462686,
      "grad_norm": 1.4260923862457275,
      "learning_rate": 0.00019995328402117537,
      "loss": 1.5066,
      "step": 8
    },
    {
      "epoch": 0.13432835820895522,
      "grad_norm": 1.4008389711380005,
      "learning_rate": 0.0001998949004149094,
      "loss": 1.2986,
      "step": 9
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 1.0532835721969604,
      "learning_rate": 0.00019981318458209423,
      "loss": 1.2209,
      "step": 10
    },
    {
      "epoch": 0.16417910447761194,
      "grad_norm": 1.0162363052368164,
      "learning_rate": 0.0001997081577320789,
      "loss": 1.1007,
      "step": 11
    },
    {
      "epoch": 0.1791044776119403,
      "grad_norm": 1.0316762924194336,
      "learning_rate": 0.00019957984712458838,
      "loss": 1.0087,
      "step": 12
    },
    {
      "epoch": 0.19402985074626866,
      "grad_norm": 1.4354606866836548,
      "learning_rate": 0.00019942828606264818,
      "loss": 1.2594,
      "step": 13
    },
    {
      "epoch": 0.208955223880597,
      "grad_norm": 2.1245648860931396,
      "learning_rate": 0.0001992535138839406,
      "loss": 1.4087,
      "step": 14
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 1.2807283401489258,
      "learning_rate": 0.0001990555759505946,
      "loss": 1.3497,
      "step": 15
    },
    {
      "epoch": 0.23880597014925373,
      "grad_norm": 0.7393147349357605,
      "learning_rate": 0.00019883452363741216,
      "loss": 1.263,
      "step": 16
    },
    {
      "epoch": 0.2537313432835821,
      "grad_norm": 0.9481880068778992,
      "learning_rate": 0.0001985904143185338,
      "loss": 1.2411,
      "step": 17
    },
    {
      "epoch": 0.26865671641791045,
      "grad_norm": 0.8055089116096497,
      "learning_rate": 0.00019832331135254724,
      "loss": 1.1185,
      "step": 18
    },
    {
      "epoch": 0.2835820895522388,
      "grad_norm": 0.5172561407089233,
      "learning_rate": 0.00019803328406604252,
      "loss": 1.1209,
      "step": 19
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 0.49716681241989136,
      "learning_rate": 0.00019772040773561854,
      "loss": 1.0457,
      "step": 20
    },
    {
      "epoch": 0.31343283582089554,
      "grad_norm": 0.6212261915206909,
      "learning_rate": 0.0001973847635683447,
      "loss": 1.0181,
      "step": 21
    },
    {
      "epoch": 0.3283582089552239,
      "grad_norm": 0.5520172119140625,
      "learning_rate": 0.0001970264386806839,
      "loss": 1.0484,
      "step": 22
    },
    {
      "epoch": 0.34328358208955223,
      "grad_norm": 0.4611297845840454,
      "learning_rate": 0.00019664552607588117,
      "loss": 0.9581,
      "step": 23
    },
    {
      "epoch": 0.3582089552238806,
      "grad_norm": 0.4654655158519745,
      "learning_rate": 0.00019624212461982497,
      "loss": 0.9156,
      "step": 24
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 0.6210163831710815,
      "learning_rate": 0.00019581633901538626,
      "loss": 0.8496,
      "step": 25
    },
    {
      "epoch": 0.3880597014925373,
      "grad_norm": 0.8395450711250305,
      "learning_rate": 0.0001953682797752431,
      "loss": 1.3176,
      "step": 26
    },
    {
      "epoch": 0.40298507462686567,
      "grad_norm": 0.695261538028717,
      "learning_rate": 0.00019489806319319687,
      "loss": 1.1809,
      "step": 27
    },
    {
      "epoch": 0.417910447761194,
      "grad_norm": 0.48507070541381836,
      "learning_rate": 0.0001944058113139884,
      "loss": 1.166,
      "step": 28
    },
    {
      "epoch": 0.43283582089552236,
      "grad_norm": 0.4849218726158142,
      "learning_rate": 0.00019389165190162114,
      "loss": 1.1374,
      "step": 29
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 0.5968400835990906,
      "learning_rate": 0.00019335571840619988,
      "loss": 1.1094,
      "step": 30
    },
    {
      "epoch": 0.4626865671641791,
      "grad_norm": 0.7121988534927368,
      "learning_rate": 0.00019279814992929418,
      "loss": 1.0958,
      "step": 31
    },
    {
      "epoch": 0.47761194029850745,
      "grad_norm": 0.5847436189651489,
      "learning_rate": 0.00019221909118783407,
      "loss": 0.9935,
      "step": 32
    },
    {
      "epoch": 0.4925373134328358,
      "grad_norm": 0.4457413852214813,
      "learning_rate": 0.0001916186924765491,
      "loss": 0.9889,
      "step": 33
    },
    {
      "epoch": 0.5074626865671642,
      "grad_norm": 0.3817737102508545,
      "learning_rate": 0.00019099710962895908,
      "loss": 0.9586,
      "step": 34
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 0.4596458077430725,
      "learning_rate": 0.0001903545039769278,
      "loss": 0.9506,
      "step": 35
    },
    {
      "epoch": 0.5373134328358209,
      "grad_norm": 0.45744407176971436,
      "learning_rate": 0.0001896910423087889,
      "loss": 0.8788,
      "step": 36
    },
    {
      "epoch": 0.5522388059701493,
      "grad_norm": 0.4180223345756531,
      "learning_rate": 0.00018900689682605642,
      "loss": 0.8059,
      "step": 37
    },
    {
      "epoch": 0.5671641791044776,
      "grad_norm": 0.5419154167175293,
      "learning_rate": 0.00018830224509872953,
      "loss": 1.0703,
      "step": 38
    },
    {
      "epoch": 0.582089552238806,
      "grad_norm": 0.8379240036010742,
      "learning_rate": 0.00018757727001920445,
      "loss": 1.2347,
      "step": 39
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.7408146262168884,
      "learning_rate": 0.00018683215975480452,
      "loss": 1.2051,
      "step": 40
    },
    {
      "epoch": 0.6119402985074627,
      "grad_norm": 0.4733333885669708,
      "learning_rate": 0.00018606710769894153,
      "loss": 1.112,
      "step": 41
    },
    {
      "epoch": 0.6268656716417911,
      "grad_norm": 0.3522099554538727,
      "learning_rate": 0.0001852823124209204,
      "loss": 1.0547,
      "step": 42
    },
    {
      "epoch": 0.6417910447761194,
      "grad_norm": 0.4999963939189911,
      "learning_rate": 0.00018447797761440051,
      "loss": 1.0523,
      "step": 43
    },
    {
      "epoch": 0.6567164179104478,
      "grad_norm": 0.6665084362030029,
      "learning_rate": 0.00018365431204452683,
      "loss": 1.0204,
      "step": 44
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 0.6554574370384216,
      "learning_rate": 0.00018281152949374527,
      "loss": 1.0063,
      "step": 45
    },
    {
      "epoch": 0.6865671641791045,
      "grad_norm": 0.6276643872261047,
      "learning_rate": 0.00018194984870631512,
      "loss": 0.9716,
      "step": 46
    },
    {
      "epoch": 0.7014925373134329,
      "grad_norm": 0.49094897508621216,
      "learning_rate": 0.00018106949333153405,
      "loss": 0.934,
      "step": 47
    },
    {
      "epoch": 0.7164179104477612,
      "grad_norm": 0.4001779556274414,
      "learning_rate": 0.00018017069186569001,
      "loss": 0.9578,
      "step": 48
    },
    {
      "epoch": 0.7313432835820896,
      "grad_norm": 0.40681931376457214,
      "learning_rate": 0.00017925367759275495,
      "loss": 0.8893,
      "step": 49
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 0.47569212317466736,
      "learning_rate": 0.00017831868852383583,
      "loss": 0.7641,
      "step": 50
    },
    {
      "epoch": 0.7611940298507462,
      "grad_norm": 0.6968297362327576,
      "learning_rate": 0.00017736596733539909,
      "loss": 1.2385,
      "step": 51
    },
    {
      "epoch": 0.7761194029850746,
      "grad_norm": 0.7256303429603577,
      "learning_rate": 0.00017639576130628376,
      "loss": 1.2058,
      "step": 52
    },
    {
      "epoch": 0.7910447761194029,
      "grad_norm": 0.6893110275268555,
      "learning_rate": 0.00017540832225352012,
      "loss": 1.2222,
      "step": 53
    },
    {
      "epoch": 0.8059701492537313,
      "grad_norm": 0.4945465326309204,
      "learning_rate": 0.0001744039064669709,
      "loss": 1.1218,
      "step": 54
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 0.4019485414028168,
      "learning_rate": 0.00017338277464281108,
      "loss": 1.0477,
      "step": 55
    },
    {
      "epoch": 0.835820895522388,
      "grad_norm": 0.3658827245235443,
      "learning_rate": 0.00017234519181586396,
      "loss": 1.0071,
      "step": 56
    },
    {
      "epoch": 0.8507462686567164,
      "grad_norm": 0.3937952220439911,
      "learning_rate": 0.00017129142729081177,
      "loss": 0.9923,
      "step": 57
    },
    {
      "epoch": 0.8656716417910447,
      "grad_norm": 0.4745718836784363,
      "learning_rate": 0.00017022175457229725,
      "loss": 0.9878,
      "step": 58
    },
    {
      "epoch": 0.8805970149253731,
      "grad_norm": 0.5112557411193848,
      "learning_rate": 0.00016913645129393578,
      "loss": 0.9179,
      "step": 59
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 0.5845867991447449,
      "learning_rate": 0.00016803579914625535,
      "loss": 0.9084,
      "step": 60
    },
    {
      "epoch": 0.9104477611940298,
      "grad_norm": 0.5979167222976685,
      "learning_rate": 0.00016692008380358395,
      "loss": 0.953,
      "step": 61
    },
    {
      "epoch": 0.9253731343283582,
      "grad_norm": 0.5538232922554016,
      "learning_rate": 0.00016578959484990263,
      "loss": 0.8545,
      "step": 62
    },
    {
      "epoch": 0.9402985074626866,
      "grad_norm": 0.3931979835033417,
      "learning_rate": 0.00016464462570368402,
      "loss": 0.9686,
      "step": 63
    },
    {
      "epoch": 0.9552238805970149,
      "grad_norm": 0.36252066493034363,
      "learning_rate": 0.00016348547354173558,
      "loss": 1.1047,
      "step": 64
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 0.4850124418735504,
      "learning_rate": 0.0001623124392220673,
      "loss": 1.0296,
      "step": 65
    },
    {
      "epoch": 0.9850746268656716,
      "grad_norm": 0.4648304879665375,
      "learning_rate": 0.00016112582720580402,
      "loss": 0.9287,
      "step": 66
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3998699486255646,
      "learning_rate": 0.0001599259454781625,
      "loss": 0.9156,
      "step": 67
    },
    {
      "epoch": 1.0149253731343284,
      "grad_norm": 0.421875536441803,
      "learning_rate": 0.00015871310546851383,
      "loss": 1.1974,
      "step": 68
    },
    {
      "epoch": 1.0298507462686568,
      "grad_norm": 0.39584335684776306,
      "learning_rate": 0.00015748762196955197,
      "loss": 1.1015,
      "step": 69
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 0.37235844135284424,
      "learning_rate": 0.00015624981305558918,
      "loss": 1.0992,
      "step": 70
    },
    {
      "epoch": 1.0597014925373134,
      "grad_norm": 0.3330038785934448,
      "learning_rate": 0.000155,
      "loss": 1.0614,
      "step": 71
    },
    {
      "epoch": 1.0746268656716418,
      "grad_norm": 0.3231871724128723,
      "learning_rate": 0.00015373850719183454,
      "loss": 0.9913,
      "step": 72
    },
    {
      "epoch": 1.0895522388059702,
      "grad_norm": 0.3235574960708618,
      "learning_rate": 0.0001524656620516234,
      "loss": 0.9477,
      "step": 73
    },
    {
      "epoch": 1.1044776119402986,
      "grad_norm": 0.37417852878570557,
      "learning_rate": 0.0001511817949463956,
      "loss": 0.9286,
      "step": 74
    },
    {
      "epoch": 1.1194029850746268,
      "grad_norm": 0.39474931359291077,
      "learning_rate": 0.00014988723910393175,
      "loss": 0.9042,
      "step": 75
    },
    {
      "epoch": 1.1343283582089552,
      "grad_norm": 0.485606849193573,
      "learning_rate": 0.00014858233052627488,
      "loss": 0.8646,
      "step": 76
    },
    {
      "epoch": 1.1492537313432836,
      "grad_norm": 0.49617916345596313,
      "learning_rate": 0.00014726740790252108,
      "loss": 0.8742,
      "step": 77
    },
    {
      "epoch": 1.164179104477612,
      "grad_norm": 0.46328800916671753,
      "learning_rate": 0.0001459428125209126,
      "loss": 0.8256,
      "step": 78
    },
    {
      "epoch": 1.1791044776119404,
      "grad_norm": 0.41910260915756226,
      "learning_rate": 0.0001446088881802566,
      "loss": 0.7242,
      "step": 79
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 0.3721103072166443,
      "learning_rate": 0.000143265981100692,
      "loss": 0.9349,
      "step": 80
    },
    {
      "epoch": 1.208955223880597,
      "grad_norm": 0.35940876603126526,
      "learning_rate": 0.00014191443983382822,
      "loss": 1.1032,
      "step": 81
    },
    {
      "epoch": 1.2238805970149254,
      "grad_norm": 0.4461018145084381,
      "learning_rate": 0.00014055461517227847,
      "loss": 1.0979,
      "step": 82
    },
    {
      "epoch": 1.2388059701492538,
      "grad_norm": 0.5155666470527649,
      "learning_rate": 0.00013918686005861145,
      "loss": 1.0231,
      "step": 83
    },
    {
      "epoch": 1.2537313432835822,
      "grad_norm": 0.5260865092277527,
      "learning_rate": 0.00013781152949374526,
      "loss": 1.0228,
      "step": 84
    },
    {
      "epoch": 1.2686567164179103,
      "grad_norm": 0.47857630252838135,
      "learning_rate": 0.0001364289804448068,
      "loss": 0.9576,
      "step": 85
    },
    {
      "epoch": 1.2835820895522387,
      "grad_norm": 0.4128516912460327,
      "learning_rate": 0.00013503957175248075,
      "loss": 0.9763,
      "step": 86
    },
    {
      "epoch": 1.2985074626865671,
      "grad_norm": 0.3800605535507202,
      "learning_rate": 0.00013364366403787283,
      "loss": 0.9272,
      "step": 87
    },
    {
      "epoch": 1.3134328358208955,
      "grad_norm": 0.3839597702026367,
      "learning_rate": 0.00013224161960891025,
      "loss": 0.8632,
      "step": 88
    },
    {
      "epoch": 1.328358208955224,
      "grad_norm": 0.4173763394355774,
      "learning_rate": 0.0001308338023663049,
      "loss": 0.8838,
      "step": 89
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 0.5093969106674194,
      "learning_rate": 0.00012942057770910255,
      "loss": 0.8444,
      "step": 90
    },
    {
      "epoch": 1.3582089552238805,
      "grad_norm": 0.5481818914413452,
      "learning_rate": 0.00012800231243984401,
      "loss": 0.7714,
      "step": 91
    },
    {
      "epoch": 1.373134328358209,
      "grad_norm": 0.6201927065849304,
      "learning_rate": 0.00012657937466936106,
      "loss": 0.707,
      "step": 92
    },
    {
      "epoch": 1.3880597014925373,
      "grad_norm": 0.5079281330108643,
      "learning_rate": 0.00012515213372123345,
      "loss": 1.1623,
      "step": 93
    },
    {
      "epoch": 1.4029850746268657,
      "grad_norm": 0.4763680398464203,
      "learning_rate": 0.0001237209600359311,
      "loss": 1.1181,
      "step": 94
    },
    {
      "epoch": 1.417910447761194,
      "grad_norm": 0.38111981749534607,
      "learning_rate": 0.00012228622507466587,
      "loss": 0.9978,
      "step": 95
    },
    {
      "epoch": 1.4328358208955223,
      "grad_norm": 0.38934579491615295,
      "learning_rate": 0.00012084830122297907,
      "loss": 1.0207,
      "step": 96
    },
    {
      "epoch": 1.4477611940298507,
      "grad_norm": 0.5346177816390991,
      "learning_rate": 0.00011940756169408881,
      "loss": 1.0358,
      "step": 97
    },
    {
      "epoch": 1.462686567164179,
      "grad_norm": 0.5502073764801025,
      "learning_rate": 0.00011796438043202227,
      "loss": 0.919,
      "step": 98
    },
    {
      "epoch": 1.4776119402985075,
      "grad_norm": 0.5650199055671692,
      "learning_rate": 0.00011651913201455864,
      "loss": 0.9101,
      "step": 99
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 0.46825388073921204,
      "learning_rate": 0.00011507219155600737,
      "loss": 0.9228,
      "step": 100
    },
    {
      "epoch": 1.5074626865671643,
      "grad_norm": 0.42388054728507996,
      "learning_rate": 0.00011362393460984737,
      "loss": 0.8535,
      "step": 101
    },
    {
      "epoch": 1.5223880597014925,
      "grad_norm": 0.39270514249801636,
      "learning_rate": 0.00011217473707125192,
      "loss": 0.8353,
      "step": 102
    },
    {
      "epoch": 1.537313432835821,
      "grad_norm": 0.42990821599960327,
      "learning_rate": 0.0001107249750795251,
      "loss": 0.7997,
      "step": 103
    },
    {
      "epoch": 1.5522388059701493,
      "grad_norm": 0.4879288375377655,
      "learning_rate": 0.00010927502492047492,
      "loss": 0.7087,
      "step": 104
    },
    {
      "epoch": 1.5671641791044775,
      "grad_norm": 0.4773794114589691,
      "learning_rate": 0.00010782526292874813,
      "loss": 0.9446,
      "step": 105
    },
    {
      "epoch": 1.582089552238806,
      "grad_norm": 0.3559342622756958,
      "learning_rate": 0.00010637606539015268,
      "loss": 1.0926,
      "step": 106
    },
    {
      "epoch": 1.5970149253731343,
      "grad_norm": 0.38176605105400085,
      "learning_rate": 0.00010492780844399264,
      "loss": 1.0961,
      "step": 107
    },
    {
      "epoch": 1.6119402985074627,
      "grad_norm": 0.37334877252578735,
      "learning_rate": 0.00010348086798544141,
      "loss": 1.0699,
      "step": 108
    },
    {
      "epoch": 1.626865671641791,
      "grad_norm": 0.34911951422691345,
      "learning_rate": 0.00010203561956797775,
      "loss": 0.9812,
      "step": 109
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 0.35348981618881226,
      "learning_rate": 0.00010059243830591121,
      "loss": 0.9875,
      "step": 110
    },
    {
      "epoch": 1.6567164179104479,
      "grad_norm": 0.36440154910087585,
      "learning_rate": 9.915169877702095e-05,
      "loss": 0.9164,
      "step": 111
    },
    {
      "epoch": 1.671641791044776,
      "grad_norm": 0.3861341178417206,
      "learning_rate": 9.771377492533418e-05,
      "loss": 0.865,
      "step": 112
    },
    {
      "epoch": 1.6865671641791045,
      "grad_norm": 0.4054892957210541,
      "learning_rate": 9.627903996406892e-05,
      "loss": 0.8351,
      "step": 113
    },
    {
      "epoch": 1.7014925373134329,
      "grad_norm": 0.4275088906288147,
      "learning_rate": 9.484786627876654e-05,
      "loss": 0.8281,
      "step": 114
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 0.420304536819458,
      "learning_rate": 9.342062533063898e-05,
      "loss": 0.8573,
      "step": 115
    },
    {
      "epoch": 1.7313432835820897,
      "grad_norm": 0.3914477527141571,
      "learning_rate": 9.199768756015603e-05,
      "loss": 0.7773,
      "step": 116
    },
    {
      "epoch": 1.7462686567164178,
      "grad_norm": 0.43206727504730225,
      "learning_rate": 9.057942229089747e-05,
      "loss": 0.7021,
      "step": 117
    },
    {
      "epoch": 1.7611940298507462,
      "grad_norm": 0.43827471137046814,
      "learning_rate": 8.916619763369516e-05,
      "loss": 1.1523,
      "step": 118
    },
    {
      "epoch": 1.7761194029850746,
      "grad_norm": 0.3688608407974243,
      "learning_rate": 8.775838039108974e-05,
      "loss": 1.0733,
      "step": 119
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 0.35826849937438965,
      "learning_rate": 8.635633596212718e-05,
      "loss": 1.0442,
      "step": 120
    },
    {
      "epoch": 1.8059701492537314,
      "grad_norm": 0.35874509811401367,
      "learning_rate": 8.496042824751926e-05,
      "loss": 1.0133,
      "step": 121
    },
    {
      "epoch": 1.8208955223880596,
      "grad_norm": 0.3662901818752289,
      "learning_rate": 8.357101955519324e-05,
      "loss": 0.9586,
      "step": 122
    },
    {
      "epoch": 1.835820895522388,
      "grad_norm": 0.3741875886917114,
      "learning_rate": 8.218847050625476e-05,
      "loss": 0.9409,
      "step": 123
    },
    {
      "epoch": 1.8507462686567164,
      "grad_norm": 0.378964364528656,
      "learning_rate": 8.081313994138857e-05,
      "loss": 0.8868,
      "step": 124
    },
    {
      "epoch": 1.8656716417910446,
      "grad_norm": 0.3909272849559784,
      "learning_rate": 7.944538482772156e-05,
      "loss": 0.8837,
      "step": 125
    },
    {
      "epoch": 1.8805970149253732,
      "grad_norm": 0.41188931465148926,
      "learning_rate": 7.808556016617178e-05,
      "loss": 0.8514,
      "step": 126
    },
    {
      "epoch": 1.8955223880597014,
      "grad_norm": 0.4490172863006592,
      "learning_rate": 7.673401889930802e-05,
      "loss": 0.8751,
      "step": 127
    },
    {
      "epoch": 1.9104477611940298,
      "grad_norm": 0.47430258989334106,
      "learning_rate": 7.539111181974343e-05,
      "loss": 0.8221,
      "step": 128
    },
    {
      "epoch": 1.9253731343283582,
      "grad_norm": 0.4374563992023468,
      "learning_rate": 7.405718747908743e-05,
      "loss": 0.7239,
      "step": 129
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 0.4078747034072876,
      "learning_rate": 7.273259209747896e-05,
      "loss": 0.8874,
      "step": 130
    },
    {
      "epoch": 1.955223880597015,
      "grad_norm": 0.42563360929489136,
      "learning_rate": 7.141766947372512e-05,
      "loss": 1.0419,
      "step": 131
    },
    {
      "epoch": 1.9701492537313432,
      "grad_norm": 0.4324108362197876,
      "learning_rate": 7.011276089606829e-05,
      "loss": 0.948,
      "step": 132
    },
    {
      "epoch": 1.9850746268656716,
      "grad_norm": 0.42789846658706665,
      "learning_rate": 6.881820505360443e-05,
      "loss": 0.852,
      "step": 133
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.39731353521347046,
      "learning_rate": 6.753433794837662e-05,
      "loss": 0.8727,
      "step": 134
    },
    {
      "epoch": 2.014925373134328,
      "grad_norm": 0.35240477323532104,
      "learning_rate": 6.626149280816546e-05,
      "loss": 1.0679,
      "step": 135
    },
    {
      "epoch": 2.029850746268657,
      "grad_norm": 0.33283913135528564,
      "learning_rate": 6.500000000000002e-05,
      "loss": 1.0181,
      "step": 136
    },
    {
      "epoch": 2.044776119402985,
      "grad_norm": 0.3615310788154602,
      "learning_rate": 6.375018694441084e-05,
      "loss": 0.9978,
      "step": 137
    },
    {
      "epoch": 2.0597014925373136,
      "grad_norm": 0.36938410997390747,
      "learning_rate": 6.251237803044805e-05,
      "loss": 0.9883,
      "step": 138
    },
    {
      "epoch": 2.074626865671642,
      "grad_norm": 0.3807987570762634,
      "learning_rate": 6.128689453148619e-05,
      "loss": 0.9572,
      "step": 139
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 0.37807610630989075,
      "learning_rate": 6.00740545218375e-05,
      "loss": 0.9174,
      "step": 140
    },
    {
      "epoch": 2.1044776119402986,
      "grad_norm": 0.37602487206459045,
      "learning_rate": 5.887417279419599e-05,
      "loss": 0.8229,
      "step": 141
    },
    {
      "epoch": 2.1194029850746268,
      "grad_norm": 0.35350197553634644,
      "learning_rate": 5.7687560777932735e-05,
      "loss": 0.8076,
      "step": 142
    },
    {
      "epoch": 2.1343283582089554,
      "grad_norm": 0.3940332233905792,
      "learning_rate": 5.651452645826445e-05,
      "loss": 0.788,
      "step": 143
    },
    {
      "epoch": 2.1492537313432836,
      "grad_norm": 0.46034398674964905,
      "learning_rate": 5.5355374296315995e-05,
      "loss": 0.7882,
      "step": 144
    },
    {
      "epoch": 2.1641791044776117,
      "grad_norm": 0.4225603938102722,
      "learning_rate": 5.421040515009737e-05,
      "loss": 0.7197,
      "step": 145
    },
    {
      "epoch": 2.1791044776119404,
      "grad_norm": 0.46008700132369995,
      "learning_rate": 5.3079916196416055e-05,
      "loss": 0.6569,
      "step": 146
    },
    {
      "epoch": 2.1940298507462686,
      "grad_norm": 0.41973477602005005,
      "learning_rate": 5.196420085374467e-05,
      "loss": 0.8682,
      "step": 147
    },
    {
      "epoch": 2.208955223880597,
      "grad_norm": 0.3677213191986084,
      "learning_rate": 5.0863548706064245e-05,
      "loss": 1.0353,
      "step": 148
    },
    {
      "epoch": 2.2238805970149254,
      "grad_norm": 0.37162861227989197,
      "learning_rate": 4.977824542770279e-05,
      "loss": 1.001,
      "step": 149
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 0.39737215638160706,
      "learning_rate": 4.870857270918825e-05,
      "loss": 0.9846,
      "step": 150
    },
    {
      "epoch": 2.253731343283582,
      "grad_norm": 0.38380125164985657,
      "learning_rate": 4.7654808184136064e-05,
      "loss": 0.9606,
      "step": 151
    },
    {
      "epoch": 2.2686567164179103,
      "grad_norm": 0.40244144201278687,
      "learning_rate": 4.6617225357188976e-05,
      "loss": 0.8571,
      "step": 152
    },
    {
      "epoch": 2.283582089552239,
      "grad_norm": 0.4329751431941986,
      "learning_rate": 4.5596093533029116e-05,
      "loss": 0.8531,
      "step": 153
    },
    {
      "epoch": 2.298507462686567,
      "grad_norm": 0.45405519008636475,
      "learning_rate": 4.459167774647993e-05,
      "loss": 0.8512,
      "step": 154
    },
    {
      "epoch": 2.3134328358208958,
      "grad_norm": 0.45590460300445557,
      "learning_rate": 4.360423869371629e-05,
      "loss": 0.8208,
      "step": 155
    },
    {
      "epoch": 2.328358208955224,
      "grad_norm": 0.4376915395259857,
      "learning_rate": 4.2634032664600895e-05,
      "loss": 0.7654,
      "step": 156
    },
    {
      "epoch": 2.343283582089552,
      "grad_norm": 0.45759543776512146,
      "learning_rate": 4.168131147616417e-05,
      "loss": 0.7857,
      "step": 157
    },
    {
      "epoch": 2.3582089552238807,
      "grad_norm": 0.4490528702735901,
      "learning_rate": 4.0746322407245066e-05,
      "loss": 0.7051,
      "step": 158
    },
    {
      "epoch": 2.373134328358209,
      "grad_norm": 0.4924563765525818,
      "learning_rate": 3.982930813430999e-05,
      "loss": 0.6348,
      "step": 159
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.35502833127975464,
      "learning_rate": 3.893050666846596e-05,
      "loss": 1.1142,
      "step": 160
    },
    {
      "epoch": 2.4029850746268657,
      "grad_norm": 0.3795003890991211,
      "learning_rate": 3.805015129368492e-05,
      "loss": 1.0387,
      "step": 161
    },
    {
      "epoch": 2.417910447761194,
      "grad_norm": 0.3922593593597412,
      "learning_rate": 3.718847050625475e-05,
      "loss": 1.0402,
      "step": 162
    },
    {
      "epoch": 2.4328358208955225,
      "grad_norm": 0.4245050251483917,
      "learning_rate": 3.6345687955473166e-05,
      "loss": 0.9854,
      "step": 163
    },
    {
      "epoch": 2.4477611940298507,
      "grad_norm": 0.39441049098968506,
      "learning_rate": 3.552202238559953e-05,
      "loss": 0.9561,
      "step": 164
    },
    {
      "epoch": 2.4626865671641793,
      "grad_norm": 0.39788442850112915,
      "learning_rate": 3.4717687579079596e-05,
      "loss": 0.9104,
      "step": 165
    },
    {
      "epoch": 2.4776119402985075,
      "grad_norm": 0.4182056784629822,
      "learning_rate": 3.393289230105849e-05,
      "loss": 0.8841,
      "step": 166
    },
    {
      "epoch": 2.4925373134328357,
      "grad_norm": 0.42861151695251465,
      "learning_rate": 3.316784024519553e-05,
      "loss": 0.8055,
      "step": 167
    },
    {
      "epoch": 2.5074626865671643,
      "grad_norm": 0.42246565222740173,
      "learning_rate": 3.242272998079557e-05,
      "loss": 0.7947,
      "step": 168
    },
    {
      "epoch": 2.5223880597014925,
      "grad_norm": 0.46474263072013855,
      "learning_rate": 3.1697754901270473e-05,
      "loss": 0.8153,
      "step": 169
    },
    {
      "epoch": 2.5373134328358207,
      "grad_norm": 0.4996289312839508,
      "learning_rate": 3.099310317394359e-05,
      "loss": 0.7579,
      "step": 170
    },
    {
      "epoch": 2.5522388059701493,
      "grad_norm": 0.47399628162384033,
      "learning_rate": 3.030895769121112e-05,
      "loss": 0.6813,
      "step": 171
    },
    {
      "epoch": 2.5671641791044775,
      "grad_norm": 0.4417833983898163,
      "learning_rate": 2.9645496023072244e-05,
      "loss": 0.8971,
      "step": 172
    },
    {
      "epoch": 2.582089552238806,
      "grad_norm": 0.3691651225090027,
      "learning_rate": 2.9002890371040918e-05,
      "loss": 1.0862,
      "step": 173
    },
    {
      "epoch": 2.5970149253731343,
      "grad_norm": 0.4065288007259369,
      "learning_rate": 2.8381307523450916e-05,
      "loss": 1.031,
      "step": 174
    },
    {
      "epoch": 2.611940298507463,
      "grad_norm": 0.3905118405818939,
      "learning_rate": 2.778090881216592e-05,
      "loss": 0.9701,
      "step": 175
    },
    {
      "epoch": 2.626865671641791,
      "grad_norm": 0.39984792470932007,
      "learning_rate": 2.7201850070705826e-05,
      "loss": 0.9493,
      "step": 176
    },
    {
      "epoch": 2.6417910447761193,
      "grad_norm": 0.415585994720459,
      "learning_rate": 2.664428159380013e-05,
      "loss": 0.9129,
      "step": 177
    },
    {
      "epoch": 2.656716417910448,
      "grad_norm": 0.42336076498031616,
      "learning_rate": 2.610834809837891e-05,
      "loss": 0.8791,
      "step": 178
    },
    {
      "epoch": 2.671641791044776,
      "grad_norm": 0.45662274956703186,
      "learning_rate": 2.5594188686011615e-05,
      "loss": 0.871,
      "step": 179
    },
    {
      "epoch": 2.6865671641791042,
      "grad_norm": 0.4160149395465851,
      "learning_rate": 2.5101936806803117e-05,
      "loss": 0.7626,
      "step": 180
    },
    {
      "epoch": 2.701492537313433,
      "grad_norm": 0.43893417716026306,
      "learning_rate": 2.463172022475691e-05,
      "loss": 0.8046,
      "step": 181
    },
    {
      "epoch": 2.716417910447761,
      "grad_norm": 0.4579525291919708,
      "learning_rate": 2.418366098461374e-05,
      "loss": 0.7713,
      "step": 182
    },
    {
      "epoch": 2.7313432835820897,
      "grad_norm": 0.4761490523815155,
      "learning_rate": 2.3757875380175044e-05,
      "loss": 0.69,
      "step": 183
    },
    {
      "epoch": 2.746268656716418,
      "grad_norm": 0.5591773986816406,
      "learning_rate": 2.3354473924118842e-05,
      "loss": 0.6075,
      "step": 184
    },
    {
      "epoch": 2.7611940298507465,
      "grad_norm": 0.3821795880794525,
      "learning_rate": 2.297356131931614e-05,
      "loss": 1.0839,
      "step": 185
    },
    {
      "epoch": 2.7761194029850746,
      "grad_norm": 0.37466174364089966,
      "learning_rate": 2.261523643165532e-05,
      "loss": 1.0221,
      "step": 186
    },
    {
      "epoch": 2.791044776119403,
      "grad_norm": 0.3825508654117584,
      "learning_rate": 2.22795922643815e-05,
      "loss": 1.0,
      "step": 187
    },
    {
      "epoch": 2.8059701492537314,
      "grad_norm": 0.41949060559272766,
      "learning_rate": 2.196671593395749e-05,
      "loss": 0.9473,
      "step": 188
    },
    {
      "epoch": 2.8208955223880596,
      "grad_norm": 0.42044076323509216,
      "learning_rate": 2.167668864745279e-05,
      "loss": 0.8887,
      "step": 189
    },
    {
      "epoch": 2.835820895522388,
      "grad_norm": 0.4112393856048584,
      "learning_rate": 2.1409585681466204e-05,
      "loss": 0.8724,
      "step": 190
    },
    {
      "epoch": 2.8507462686567164,
      "grad_norm": 0.45745235681533813,
      "learning_rate": 2.1165476362587846e-05,
      "loss": 0.8562,
      "step": 191
    },
    {
      "epoch": 2.8656716417910446,
      "grad_norm": 0.4491675794124603,
      "learning_rate": 2.09444240494054e-05,
      "loss": 0.8593,
      "step": 192
    },
    {
      "epoch": 2.8805970149253732,
      "grad_norm": 0.4317816197872162,
      "learning_rate": 2.0746486116059418e-05,
      "loss": 0.7933,
      "step": 193
    },
    {
      "epoch": 2.8955223880597014,
      "grad_norm": 0.46604618430137634,
      "learning_rate": 2.0571713937351834e-05,
      "loss": 0.7903,
      "step": 194
    },
    {
      "epoch": 2.91044776119403,
      "grad_norm": 0.48169732093811035,
      "learning_rate": 2.0420152875411624e-05,
      "loss": 0.7668,
      "step": 195
    },
    {
      "epoch": 2.925373134328358,
      "grad_norm": 0.4627380073070526,
      "learning_rate": 2.0291842267921108e-05,
      "loss": 0.6404,
      "step": 196
    },
    {
      "epoch": 2.9402985074626864,
      "grad_norm": 0.44821980595588684,
      "learning_rate": 2.0186815417905787e-05,
      "loss": 0.8672,
      "step": 197
    },
    {
      "epoch": 2.955223880597015,
      "grad_norm": 0.3909732699394226,
      "learning_rate": 2.0105099585090603e-05,
      "loss": 0.9487,
      "step": 198
    },
    {
      "epoch": 2.970149253731343,
      "grad_norm": 0.4137306213378906,
      "learning_rate": 2.0046715978824664e-05,
      "loss": 0.8438,
      "step": 199
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.4548170864582062,
      "learning_rate": 2.001167975257628e-05,
      "loss": 0.8052,
      "step": 200
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.4302070140838623,
      "learning_rate": 2e-05,
      "loss": 0.8199,
      "step": 201
    }
  ],
  "logging_steps": 1,
  "max_steps": 201,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.25183025013588e+17,
  "train_batch_size": 18,
  "trial_name": null,
  "trial_params": null
}