{ "best_metric": 0.5749279856681824, "best_model_checkpoint": "models/lusa_en/checkpoint-110", "epoch": 5.0, "eval_steps": 500, "global_step": 110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045454545454545456, "grad_norm": 4.094762325286865, "learning_rate": 1.9990909090909092e-05, "loss": 2.1525, "step": 1 }, { "epoch": 0.09090909090909091, "grad_norm": 4.10725212097168, "learning_rate": 1.9981818181818185e-05, "loss": 2.0303, "step": 2 }, { "epoch": 0.13636363636363635, "grad_norm": 4.080411434173584, "learning_rate": 1.9972727272727275e-05, "loss": 1.926, "step": 3 }, { "epoch": 0.18181818181818182, "grad_norm": 3.544022560119629, "learning_rate": 1.9963636363636365e-05, "loss": 1.8315, "step": 4 }, { "epoch": 0.22727272727272727, "grad_norm": 2.855754852294922, "learning_rate": 1.9954545454545455e-05, "loss": 1.7865, "step": 5 }, { "epoch": 0.2727272727272727, "grad_norm": 2.752898931503296, "learning_rate": 1.994545454545455e-05, "loss": 1.682, "step": 6 }, { "epoch": 0.3181818181818182, "grad_norm": 2.3124074935913086, "learning_rate": 1.993636363636364e-05, "loss": 1.6458, "step": 7 }, { "epoch": 0.36363636363636365, "grad_norm": 2.361560583114624, "learning_rate": 1.992727272727273e-05, "loss": 1.5782, "step": 8 }, { "epoch": 0.4090909090909091, "grad_norm": 2.4446144104003906, "learning_rate": 1.991818181818182e-05, "loss": 1.5693, "step": 9 }, { "epoch": 0.45454545454545453, "grad_norm": 2.2963356971740723, "learning_rate": 1.9909090909090913e-05, "loss": 1.5906, "step": 10 }, { "epoch": 0.5, "grad_norm": 1.9563844203948975, "learning_rate": 1.9900000000000003e-05, "loss": 1.4818, "step": 11 }, { "epoch": 0.5454545454545454, "grad_norm": 1.904161810874939, "learning_rate": 1.9890909090909093e-05, "loss": 1.4469, "step": 12 }, { "epoch": 0.5909090909090909, "grad_norm": 1.6757051944732666, "learning_rate": 1.9881818181818183e-05, "loss": 1.3387, "step": 13 }, { "epoch": 0.6363636363636364, "grad_norm": 1.670788049697876, "learning_rate": 1.9872727272727276e-05, "loss": 1.3976, "step": 14 }, { "epoch": 0.6818181818181818, "grad_norm": 1.7997887134552002, "learning_rate": 1.9863636363636366e-05, "loss": 1.2961, "step": 15 }, { "epoch": 0.7272727272727273, "grad_norm": 1.6230303049087524, "learning_rate": 1.9854545454545456e-05, "loss": 1.2923, "step": 16 }, { "epoch": 0.7727272727272727, "grad_norm": 1.810758352279663, "learning_rate": 1.9845454545454546e-05, "loss": 1.2559, "step": 17 }, { "epoch": 0.8181818181818182, "grad_norm": 2.564798593521118, "learning_rate": 1.9836363636363636e-05, "loss": 1.2239, "step": 18 }, { "epoch": 0.8636363636363636, "grad_norm": 1.6320985555648804, "learning_rate": 1.982727272727273e-05, "loss": 1.1662, "step": 19 }, { "epoch": 0.9090909090909091, "grad_norm": 1.5779709815979004, "learning_rate": 1.981818181818182e-05, "loss": 1.0843, "step": 20 }, { "epoch": 0.9545454545454546, "grad_norm": 1.7842929363250732, "learning_rate": 1.980909090909091e-05, "loss": 1.0595, "step": 21 }, { "epoch": 1.0, "grad_norm": 2.1895947456359863, "learning_rate": 1.98e-05, "loss": 1.2116, "step": 22 }, { "epoch": 1.0, "eval_accuracy": 0.7246807246807246, "eval_f1": 0.5378067100650976, "eval_loss": 0.9455885291099548, "eval_precision": 0.5496417604912999, "eval_recall": 0.5264705882352941, "eval_runtime": 5.3772, "eval_samples_per_second": 24.362, "eval_steps_per_second": 0.93, "step": 22 }, { "epoch": 1.0454545454545454, "grad_norm": 1.5596168041229248, "learning_rate": 1.979090909090909e-05, "loss": 1.1374, "step": 23 }, { "epoch": 1.0909090909090908, "grad_norm": 1.4826650619506836, "learning_rate": 1.9781818181818184e-05, "loss": 0.9918, "step": 24 }, { "epoch": 1.1363636363636362, "grad_norm": 1.6731046438217163, "learning_rate": 1.9772727272727274e-05, "loss": 0.8705, "step": 25 }, { "epoch": 1.1818181818181819, "grad_norm": 1.8360694646835327, "learning_rate": 1.9763636363636364e-05, "loss": 1.0117, "step": 26 }, { "epoch": 1.2272727272727273, "grad_norm": 1.6999843120574951, "learning_rate": 1.9754545454545454e-05, "loss": 0.9752, "step": 27 }, { "epoch": 1.2727272727272727, "grad_norm": 1.629109263420105, "learning_rate": 1.9745454545454547e-05, "loss": 0.9142, "step": 28 }, { "epoch": 1.3181818181818181, "grad_norm": 1.4835275411605835, "learning_rate": 1.9736363636363637e-05, "loss": 0.8984, "step": 29 }, { "epoch": 1.3636363636363638, "grad_norm": 1.3297476768493652, "learning_rate": 1.9727272727272728e-05, "loss": 0.9224, "step": 30 }, { "epoch": 1.4090909090909092, "grad_norm": 1.582480788230896, "learning_rate": 1.971818181818182e-05, "loss": 0.8557, "step": 31 }, { "epoch": 1.4545454545454546, "grad_norm": 1.3994060754776, "learning_rate": 1.970909090909091e-05, "loss": 0.748, "step": 32 }, { "epoch": 1.5, "grad_norm": 2.084137201309204, "learning_rate": 1.97e-05, "loss": 1.1522, "step": 33 }, { "epoch": 1.5454545454545454, "grad_norm": 1.8021726608276367, "learning_rate": 1.969090909090909e-05, "loss": 0.8038, "step": 34 }, { "epoch": 1.5909090909090908, "grad_norm": 1.3661516904830933, "learning_rate": 1.9681818181818185e-05, "loss": 0.8233, "step": 35 }, { "epoch": 1.6363636363636362, "grad_norm": 1.5308856964111328, "learning_rate": 1.9672727272727275e-05, "loss": 0.8182, "step": 36 }, { "epoch": 1.6818181818181817, "grad_norm": 1.5709282159805298, "learning_rate": 1.9663636363636365e-05, "loss": 0.7915, "step": 37 }, { "epoch": 1.7272727272727273, "grad_norm": 1.6001756191253662, "learning_rate": 1.9654545454545458e-05, "loss": 0.8671, "step": 38 }, { "epoch": 1.7727272727272727, "grad_norm": 1.243642807006836, "learning_rate": 1.964545454545455e-05, "loss": 0.8128, "step": 39 }, { "epoch": 1.8181818181818183, "grad_norm": 1.9662197828292847, "learning_rate": 1.963636363636364e-05, "loss": 0.7816, "step": 40 }, { "epoch": 1.8636363636363638, "grad_norm": 1.392984390258789, "learning_rate": 1.962727272727273e-05, "loss": 0.7299, "step": 41 }, { "epoch": 1.9090909090909092, "grad_norm": 1.7500344514846802, "learning_rate": 1.9618181818181822e-05, "loss": 0.8295, "step": 42 }, { "epoch": 1.9545454545454546, "grad_norm": 1.3700010776519775, "learning_rate": 1.9609090909090912e-05, "loss": 0.7227, "step": 43 }, { "epoch": 2.0, "grad_norm": 1.7891302108764648, "learning_rate": 1.9600000000000002e-05, "loss": 0.8371, "step": 44 }, { "epoch": 2.0, "eval_accuracy": 0.7840807840807841, "eval_f1": 0.6456470588235295, "eval_loss": 0.68104088306427, "eval_precision": 0.6208144796380091, "eval_recall": 0.6725490196078432, "eval_runtime": 5.7284, "eval_samples_per_second": 22.869, "eval_steps_per_second": 0.873, "step": 44 }, { "epoch": 2.0454545454545454, "grad_norm": 1.324338436126709, "learning_rate": 1.9590909090909092e-05, "loss": 0.6347, "step": 45 }, { "epoch": 2.090909090909091, "grad_norm": 1.5182204246520996, "learning_rate": 1.9581818181818186e-05, "loss": 0.713, "step": 46 }, { "epoch": 2.1363636363636362, "grad_norm": 1.2550334930419922, "learning_rate": 1.9572727272727276e-05, "loss": 0.7751, "step": 47 }, { "epoch": 2.1818181818181817, "grad_norm": 1.2773672342300415, "learning_rate": 1.9563636363636366e-05, "loss": 0.6857, "step": 48 }, { "epoch": 2.227272727272727, "grad_norm": 1.2220115661621094, "learning_rate": 1.9554545454545456e-05, "loss": 0.6481, "step": 49 }, { "epoch": 2.2727272727272725, "grad_norm": 1.2490642070770264, "learning_rate": 1.9545454545454546e-05, "loss": 0.6727, "step": 50 }, { "epoch": 2.3181818181818183, "grad_norm": 1.7800368070602417, "learning_rate": 1.953636363636364e-05, "loss": 0.7261, "step": 51 }, { "epoch": 2.3636363636363638, "grad_norm": 1.466561198234558, "learning_rate": 1.952727272727273e-05, "loss": 0.679, "step": 52 }, { "epoch": 2.409090909090909, "grad_norm": 1.6164387464523315, "learning_rate": 1.951818181818182e-05, "loss": 0.6693, "step": 53 }, { "epoch": 2.4545454545454546, "grad_norm": 1.4198397397994995, "learning_rate": 1.950909090909091e-05, "loss": 0.608, "step": 54 }, { "epoch": 2.5, "grad_norm": 1.4696757793426514, "learning_rate": 1.95e-05, "loss": 0.749, "step": 55 }, { "epoch": 2.5454545454545454, "grad_norm": 1.4631857872009277, "learning_rate": 1.9490909090909093e-05, "loss": 0.6256, "step": 56 }, { "epoch": 2.590909090909091, "grad_norm": 1.4575796127319336, "learning_rate": 1.9481818181818183e-05, "loss": 0.6964, "step": 57 }, { "epoch": 2.6363636363636362, "grad_norm": 1.2283692359924316, "learning_rate": 1.9472727272727273e-05, "loss": 0.7201, "step": 58 }, { "epoch": 2.6818181818181817, "grad_norm": 1.2565014362335205, "learning_rate": 1.9463636363636363e-05, "loss": 0.6112, "step": 59 }, { "epoch": 2.7272727272727275, "grad_norm": 1.3832677602767944, "learning_rate": 1.9454545454545457e-05, "loss": 0.5858, "step": 60 }, { "epoch": 2.7727272727272725, "grad_norm": 1.679886817932129, "learning_rate": 1.9445454545454547e-05, "loss": 0.8611, "step": 61 }, { "epoch": 2.8181818181818183, "grad_norm": 1.3422367572784424, "learning_rate": 1.9436363636363637e-05, "loss": 0.6522, "step": 62 }, { "epoch": 2.8636363636363638, "grad_norm": 1.2293131351470947, "learning_rate": 1.9427272727272727e-05, "loss": 0.5756, "step": 63 }, { "epoch": 2.909090909090909, "grad_norm": 1.7241827249526978, "learning_rate": 1.941818181818182e-05, "loss": 0.5771, "step": 64 }, { "epoch": 2.9545454545454546, "grad_norm": 1.3407361507415771, "learning_rate": 1.940909090909091e-05, "loss": 0.6396, "step": 65 }, { "epoch": 3.0, "grad_norm": 1.4412184953689575, "learning_rate": 1.94e-05, "loss": 0.7121, "step": 66 }, { "epoch": 3.0, "eval_accuracy": 0.809028809028809, "eval_f1": 0.6716697936210131, "eval_loss": 0.6103370189666748, "eval_precision": 0.6438848920863309, "eval_recall": 0.7019607843137254, "eval_runtime": 6.4253, "eval_samples_per_second": 20.388, "eval_steps_per_second": 0.778, "step": 66 }, { "epoch": 3.0454545454545454, "grad_norm": 1.5174870491027832, "learning_rate": 1.9390909090909094e-05, "loss": 0.5993, "step": 67 }, { "epoch": 3.090909090909091, "grad_norm": 1.2559189796447754, "learning_rate": 1.9381818181818184e-05, "loss": 0.599, "step": 68 }, { "epoch": 3.1363636363636362, "grad_norm": 1.3573830127716064, "learning_rate": 1.9372727272727274e-05, "loss": 0.6561, "step": 69 }, { "epoch": 3.1818181818181817, "grad_norm": 1.2237664461135864, "learning_rate": 1.9363636363636364e-05, "loss": 0.5644, "step": 70 }, { "epoch": 3.227272727272727, "grad_norm": 1.7508504390716553, "learning_rate": 1.9354545454545458e-05, "loss": 0.6542, "step": 71 }, { "epoch": 3.2727272727272725, "grad_norm": 1.35462486743927, "learning_rate": 1.9345454545454548e-05, "loss": 0.5616, "step": 72 }, { "epoch": 3.3181818181818183, "grad_norm": 1.3244951963424683, "learning_rate": 1.9336363636363638e-05, "loss": 0.6016, "step": 73 }, { "epoch": 3.3636363636363638, "grad_norm": 1.3806241750717163, "learning_rate": 1.9327272727272728e-05, "loss": 0.5168, "step": 74 }, { "epoch": 3.409090909090909, "grad_norm": 1.3947114944458008, "learning_rate": 1.931818181818182e-05, "loss": 0.5554, "step": 75 }, { "epoch": 3.4545454545454546, "grad_norm": 1.4679116010665894, "learning_rate": 1.930909090909091e-05, "loss": 0.5753, "step": 76 }, { "epoch": 3.5, "grad_norm": 1.2916615009307861, "learning_rate": 1.93e-05, "loss": 0.4835, "step": 77 }, { "epoch": 3.5454545454545454, "grad_norm": 1.713321328163147, "learning_rate": 1.9290909090909095e-05, "loss": 0.5753, "step": 78 }, { "epoch": 3.590909090909091, "grad_norm": 1.3171674013137817, "learning_rate": 1.9281818181818185e-05, "loss": 0.5477, "step": 79 }, { "epoch": 3.6363636363636362, "grad_norm": 1.6939257383346558, "learning_rate": 1.9272727272727275e-05, "loss": 0.7424, "step": 80 }, { "epoch": 3.6818181818181817, "grad_norm": 1.6177845001220703, "learning_rate": 1.9263636363636365e-05, "loss": 0.6225, "step": 81 }, { "epoch": 3.7272727272727275, "grad_norm": 1.7323453426361084, "learning_rate": 1.9254545454545455e-05, "loss": 0.4644, "step": 82 }, { "epoch": 3.7727272727272725, "grad_norm": 1.2443021535873413, "learning_rate": 1.924545454545455e-05, "loss": 0.524, "step": 83 }, { "epoch": 3.8181818181818183, "grad_norm": 1.6111009120941162, "learning_rate": 1.923636363636364e-05, "loss": 0.6301, "step": 84 }, { "epoch": 3.8636363636363638, "grad_norm": 1.3090589046478271, "learning_rate": 1.922727272727273e-05, "loss": 0.6209, "step": 85 }, { "epoch": 3.909090909090909, "grad_norm": 1.2143287658691406, "learning_rate": 1.921818181818182e-05, "loss": 0.523, "step": 86 }, { "epoch": 3.9545454545454546, "grad_norm": 1.551576018333435, "learning_rate": 1.920909090909091e-05, "loss": 0.6025, "step": 87 }, { "epoch": 4.0, "grad_norm": 1.606203317642212, "learning_rate": 1.9200000000000003e-05, "loss": 0.59, "step": 88 }, { "epoch": 4.0, "eval_accuracy": 0.8238788238788238, "eval_f1": 0.7027790861987753, "eval_loss": 0.5813168883323669, "eval_precision": 0.6763372620126926, "eval_recall": 0.7313725490196078, "eval_runtime": 5.433, "eval_samples_per_second": 24.112, "eval_steps_per_second": 0.92, "step": 88 }, { "epoch": 4.045454545454546, "grad_norm": 1.6170856952667236, "learning_rate": 1.9190909090909093e-05, "loss": 0.5609, "step": 89 }, { "epoch": 4.090909090909091, "grad_norm": 1.21828031539917, "learning_rate": 1.9181818181818183e-05, "loss": 0.5444, "step": 90 }, { "epoch": 4.136363636363637, "grad_norm": 1.3539512157440186, "learning_rate": 1.9172727272727273e-05, "loss": 0.5608, "step": 91 }, { "epoch": 4.181818181818182, "grad_norm": 1.2486921548843384, "learning_rate": 1.9163636363636363e-05, "loss": 0.4976, "step": 92 }, { "epoch": 4.2272727272727275, "grad_norm": 1.474204659461975, "learning_rate": 1.9154545454545456e-05, "loss": 0.4646, "step": 93 }, { "epoch": 4.2727272727272725, "grad_norm": 1.4018511772155762, "learning_rate": 1.9145454545454546e-05, "loss": 0.4138, "step": 94 }, { "epoch": 4.318181818181818, "grad_norm": 1.6630454063415527, "learning_rate": 1.9136363636363636e-05, "loss": 0.6184, "step": 95 }, { "epoch": 4.363636363636363, "grad_norm": 1.681337594985962, "learning_rate": 1.912727272727273e-05, "loss": 0.4985, "step": 96 }, { "epoch": 4.409090909090909, "grad_norm": 1.1836535930633545, "learning_rate": 1.911818181818182e-05, "loss": 0.4481, "step": 97 }, { "epoch": 4.454545454545454, "grad_norm": 1.2125182151794434, "learning_rate": 1.910909090909091e-05, "loss": 0.4427, "step": 98 }, { "epoch": 4.5, "grad_norm": 1.1907086372375488, "learning_rate": 1.91e-05, "loss": 0.4656, "step": 99 }, { "epoch": 4.545454545454545, "grad_norm": 1.6791960000991821, "learning_rate": 1.9090909090909094e-05, "loss": 0.5132, "step": 100 }, { "epoch": 4.590909090909091, "grad_norm": 2.0220723152160645, "learning_rate": 1.9081818181818184e-05, "loss": 0.7157, "step": 101 }, { "epoch": 4.636363636363637, "grad_norm": 1.3273769617080688, "learning_rate": 1.9072727272727274e-05, "loss": 0.5186, "step": 102 }, { "epoch": 4.681818181818182, "grad_norm": 1.971346378326416, "learning_rate": 1.9063636363636364e-05, "loss": 0.4861, "step": 103 }, { "epoch": 4.7272727272727275, "grad_norm": 1.4221469163894653, "learning_rate": 1.9054545454545457e-05, "loss": 0.5415, "step": 104 }, { "epoch": 4.7727272727272725, "grad_norm": 1.5409835577011108, "learning_rate": 1.9045454545454547e-05, "loss": 0.4741, "step": 105 }, { "epoch": 4.818181818181818, "grad_norm": 2.085914134979248, "learning_rate": 1.9036363636363637e-05, "loss": 0.5062, "step": 106 }, { "epoch": 4.863636363636363, "grad_norm": 1.571999430656433, "learning_rate": 1.902727272727273e-05, "loss": 0.6323, "step": 107 }, { "epoch": 4.909090909090909, "grad_norm": 1.3681639432907104, "learning_rate": 1.901818181818182e-05, "loss": 0.5369, "step": 108 }, { "epoch": 4.954545454545455, "grad_norm": 1.8829340934753418, "learning_rate": 1.900909090909091e-05, "loss": 0.4372, "step": 109 }, { "epoch": 5.0, "grad_norm": 1.9513062238693237, "learning_rate": 1.9e-05, "loss": 0.5141, "step": 110 }, { "epoch": 5.0, "eval_accuracy": 0.8241758241758241, "eval_f1": 0.6876172607879925, "eval_loss": 0.5749279856681824, "eval_precision": 0.6591726618705036, "eval_recall": 0.7186274509803922, "eval_runtime": 5.3919, "eval_samples_per_second": 24.296, "eval_steps_per_second": 0.927, "step": 110 } ], "logging_steps": 1, "max_steps": 2200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 156073447450800.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }