{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991245987744383, "eval_steps": 36, "global_step": 107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009337613072658301, "grad_norm": 1.3116651734261826, "learning_rate": 1.25e-06, "loss": 11.8241, "step": 1 }, { "epoch": 0.009337613072658301, "eval_loss": 11.651493072509766, "eval_runtime": 43.8088, "eval_samples_per_second": 6.3, "eval_steps_per_second": 0.799, "step": 1 }, { "epoch": 0.018675226145316602, "grad_norm": 1.3220486767280257, "learning_rate": 2.5e-06, "loss": 11.8134, "step": 2 }, { "epoch": 0.028012839217974907, "grad_norm": 1.323342086038301, "learning_rate": 3.7500000000000005e-06, "loss": 11.7915, "step": 3 }, { "epoch": 0.037350452290633204, "grad_norm": 1.32468270774218, "learning_rate": 5e-06, "loss": 11.7306, "step": 4 }, { "epoch": 0.04668806536329151, "grad_norm": 1.3580804351059697, "learning_rate": 6.25e-06, "loss": 11.4493, "step": 5 }, { "epoch": 0.05602567843594981, "grad_norm": 1.4657782226093532, "learning_rate": 7.500000000000001e-06, "loss": 10.5146, "step": 6 }, { "epoch": 0.06536329150860812, "grad_norm": 1.4570176977534868, "learning_rate": 8.750000000000001e-06, "loss": 7.5687, "step": 7 }, { "epoch": 0.07470090458126641, "grad_norm": 1.3101839933421826, "learning_rate": 1e-05, "loss": 6.6374, "step": 8 }, { "epoch": 0.08403851765392471, "grad_norm": 0.39797650661474604, "learning_rate": 1.125e-05, "loss": 4.0048, "step": 9 }, { "epoch": 0.09337613072658302, "grad_norm": 0.29538196193252036, "learning_rate": 1.25e-05, "loss": 3.7439, "step": 10 }, { "epoch": 0.10271374379924132, "grad_norm": 0.21371312404721451, "learning_rate": 1.375e-05, "loss": 3.5406, "step": 11 }, { "epoch": 0.11205135687189963, "grad_norm": 0.36485008261730323, "learning_rate": 1.5000000000000002e-05, "loss": 3.3211, "step": 12 }, { "epoch": 0.12138896994455792, "grad_norm": 0.2805852343578138, "learning_rate": 1.6250000000000002e-05, "loss": 3.1236, "step": 13 }, { "epoch": 0.13072658301721624, "grad_norm": 0.2021156853252266, "learning_rate": 1.7500000000000002e-05, "loss": 2.8668, "step": 14 }, { "epoch": 0.14006419608987453, "grad_norm": 0.15215118244217535, "learning_rate": 1.8750000000000002e-05, "loss": 2.6099, "step": 15 }, { "epoch": 0.14940180916253282, "grad_norm": 0.13687350701334702, "learning_rate": 2e-05, "loss": 2.3842, "step": 16 }, { "epoch": 0.15873942223519114, "grad_norm": 0.13750612099868206, "learning_rate": 1.9999469523400122e-05, "loss": 2.1991, "step": 17 }, { "epoch": 0.16807703530784943, "grad_norm": 0.12651196118653027, "learning_rate": 1.9997878149881576e-05, "loss": 2.0405, "step": 18 }, { "epoch": 0.17741464838050774, "grad_norm": 0.12217963693020709, "learning_rate": 1.999522604828164e-05, "loss": 1.867, "step": 19 }, { "epoch": 0.18675226145316604, "grad_norm": 0.11161741227643726, "learning_rate": 1.9991513499975883e-05, "loss": 1.7081, "step": 20 }, { "epoch": 0.19608987452582433, "grad_norm": 0.09431005577364608, "learning_rate": 1.9986740898848306e-05, "loss": 1.6022, "step": 21 }, { "epoch": 0.20542748759848264, "grad_norm": 0.07544503987145361, "learning_rate": 1.9980908751249556e-05, "loss": 1.5179, "step": 22 }, { "epoch": 0.21476510067114093, "grad_norm": 0.06853247429967851, "learning_rate": 1.997401767594319e-05, "loss": 1.4395, "step": 23 }, { "epoch": 0.22410271374379925, "grad_norm": 0.05892769062015809, "learning_rate": 1.996606840404006e-05, "loss": 1.3577, "step": 24 }, { "epoch": 0.23344032681645754, "grad_norm": 0.05337539608721018, "learning_rate": 1.9957061778920703e-05, "loss": 1.3032, "step": 25 }, { "epoch": 0.24277793988911583, "grad_norm": 0.05279602192024426, "learning_rate": 1.9946998756145894e-05, "loss": 1.2436, "step": 26 }, { "epoch": 0.2521155529617741, "grad_norm": 0.04928056172617358, "learning_rate": 1.9935880403355255e-05, "loss": 1.1964, "step": 27 }, { "epoch": 0.26145316603443247, "grad_norm": 0.046581318628423386, "learning_rate": 1.9923707900153984e-05, "loss": 1.1161, "step": 28 }, { "epoch": 0.27079077910709076, "grad_norm": 0.04485518736789617, "learning_rate": 1.9910482537987704e-05, "loss": 1.1, "step": 29 }, { "epoch": 0.28012839217974905, "grad_norm": 0.03813950781953075, "learning_rate": 1.989620572000544e-05, "loss": 1.0508, "step": 30 }, { "epoch": 0.28946600525240734, "grad_norm": 0.036622498559561, "learning_rate": 1.9880878960910772e-05, "loss": 1.0364, "step": 31 }, { "epoch": 0.29880361832506563, "grad_norm": 0.03579080814636808, "learning_rate": 1.9864503886801108e-05, "loss": 1.0058, "step": 32 }, { "epoch": 0.308141231397724, "grad_norm": 0.03146608814811976, "learning_rate": 1.9847082234995172e-05, "loss": 0.9954, "step": 33 }, { "epoch": 0.31747884447038227, "grad_norm": 0.027371561053018377, "learning_rate": 1.982861585384869e-05, "loss": 0.9445, "step": 34 }, { "epoch": 0.32681645754304056, "grad_norm": 0.024499632261339315, "learning_rate": 1.9809106702558277e-05, "loss": 0.938, "step": 35 }, { "epoch": 0.33615407061569885, "grad_norm": 0.022391297579903467, "learning_rate": 1.978855685095358e-05, "loss": 0.9141, "step": 36 }, { "epoch": 0.33615407061569885, "eval_loss": 0.8478350639343262, "eval_runtime": 43.6583, "eval_samples_per_second": 6.322, "eval_steps_per_second": 0.802, "step": 36 }, { "epoch": 0.34549168368835714, "grad_norm": 0.021886974051633205, "learning_rate": 1.9766968479277684e-05, "loss": 0.8907, "step": 37 }, { "epoch": 0.3548292967610155, "grad_norm": 0.023142495322344372, "learning_rate": 1.974434387795579e-05, "loss": 0.9146, "step": 38 }, { "epoch": 0.3641669098336738, "grad_norm": 0.021168627691228573, "learning_rate": 1.972068544735221e-05, "loss": 0.8564, "step": 39 }, { "epoch": 0.37350452290633207, "grad_norm": 0.019500940158346962, "learning_rate": 1.969599569751571e-05, "loss": 0.8691, "step": 40 }, { "epoch": 0.38284213597899036, "grad_norm": 0.017342490295839202, "learning_rate": 1.9670277247913205e-05, "loss": 0.8169, "step": 41 }, { "epoch": 0.39217974905164865, "grad_norm": 0.01834359475842193, "learning_rate": 1.964353282715183e-05, "loss": 0.8285, "step": 42 }, { "epoch": 0.401517362124307, "grad_norm": 0.01743926340900274, "learning_rate": 1.961576527268946e-05, "loss": 0.8185, "step": 43 }, { "epoch": 0.4108549751969653, "grad_norm": 0.015589382188249705, "learning_rate": 1.9586977530533677e-05, "loss": 0.7824, "step": 44 }, { "epoch": 0.4201925882696236, "grad_norm": 0.015465707621862244, "learning_rate": 1.95571726549292e-05, "loss": 0.7994, "step": 45 }, { "epoch": 0.42953020134228187, "grad_norm": 0.01348325969968343, "learning_rate": 1.9526353808033827e-05, "loss": 0.786, "step": 46 }, { "epoch": 0.43886781441494016, "grad_norm": 0.01248344639342667, "learning_rate": 1.9494524259582994e-05, "loss": 0.7841, "step": 47 }, { "epoch": 0.4482054274875985, "grad_norm": 0.012287865038500807, "learning_rate": 1.9461687386542826e-05, "loss": 0.7703, "step": 48 }, { "epoch": 0.4575430405602568, "grad_norm": 0.013137558740636913, "learning_rate": 1.9427846672751873e-05, "loss": 0.7522, "step": 49 }, { "epoch": 0.4668806536329151, "grad_norm": 0.012405785040338313, "learning_rate": 1.93930057085515e-05, "loss": 0.7513, "step": 50 }, { "epoch": 0.4762182667055734, "grad_norm": 0.012489962634362767, "learning_rate": 1.9357168190404937e-05, "loss": 0.743, "step": 51 }, { "epoch": 0.48555587977823167, "grad_norm": 0.011604499619002773, "learning_rate": 1.932033792050515e-05, "loss": 0.7377, "step": 52 }, { "epoch": 0.49489349285089, "grad_norm": 0.010226240702119492, "learning_rate": 1.928251880637141e-05, "loss": 0.723, "step": 53 }, { "epoch": 0.5042311059235483, "grad_norm": 0.010262055155701614, "learning_rate": 1.924371486043473e-05, "loss": 0.7371, "step": 54 }, { "epoch": 0.5135687189962066, "grad_norm": 0.009630481842938549, "learning_rate": 1.920393019961217e-05, "loss": 0.7295, "step": 55 }, { "epoch": 0.5229063320688649, "grad_norm": 0.0093680195601059, "learning_rate": 1.916316904487005e-05, "loss": 0.712, "step": 56 }, { "epoch": 0.5322439451415232, "grad_norm": 0.009110104257000179, "learning_rate": 1.9121435720776122e-05, "loss": 0.6959, "step": 57 }, { "epoch": 0.5415815582141815, "grad_norm": 0.008883791285743962, "learning_rate": 1.9078734655040763e-05, "loss": 0.7056, "step": 58 }, { "epoch": 0.5509191712868398, "grad_norm": 0.008661290168572833, "learning_rate": 1.9035070378047204e-05, "loss": 0.7031, "step": 59 }, { "epoch": 0.5602567843594981, "grad_norm": 0.008574085915641946, "learning_rate": 1.8990447522370886e-05, "loss": 0.6973, "step": 60 }, { "epoch": 0.5695943974321565, "grad_norm": 0.008014005266988974, "learning_rate": 1.8944870822287957e-05, "loss": 0.7075, "step": 61 }, { "epoch": 0.5789320105048147, "grad_norm": 0.007587538749978625, "learning_rate": 1.8898345113273e-05, "loss": 0.6736, "step": 62 }, { "epoch": 0.588269623577473, "grad_norm": 0.007458374217548868, "learning_rate": 1.8850875331485996e-05, "loss": 0.6811, "step": 63 }, { "epoch": 0.5976072366501313, "grad_norm": 0.007383373412606801, "learning_rate": 1.8802466513248635e-05, "loss": 0.682, "step": 64 }, { "epoch": 0.6069448497227896, "grad_norm": 0.007069434956330871, "learning_rate": 1.8753123794509974e-05, "loss": 0.6723, "step": 65 }, { "epoch": 0.616282462795448, "grad_norm": 0.007152713173479847, "learning_rate": 1.8702852410301556e-05, "loss": 0.6663, "step": 66 }, { "epoch": 0.6256200758681062, "grad_norm": 0.00764154365674685, "learning_rate": 1.865165769418196e-05, "loss": 0.6954, "step": 67 }, { "epoch": 0.6349576889407645, "grad_norm": 0.007184416838244711, "learning_rate": 1.8599545077670983e-05, "loss": 0.6738, "step": 68 }, { "epoch": 0.6442953020134228, "grad_norm": 0.006849637138186313, "learning_rate": 1.854652008967335e-05, "loss": 0.6679, "step": 69 }, { "epoch": 0.6536329150860811, "grad_norm": 0.007061882929419886, "learning_rate": 1.8492588355892125e-05, "loss": 0.6765, "step": 70 }, { "epoch": 0.6629705281587395, "grad_norm": 0.006338175952472431, "learning_rate": 1.8437755598231857e-05, "loss": 0.6695, "step": 71 }, { "epoch": 0.6723081412313977, "grad_norm": 0.0060579092781975295, "learning_rate": 1.8382027634191523e-05, "loss": 0.6484, "step": 72 }, { "epoch": 0.6723081412313977, "eval_loss": 0.6214176416397095, "eval_runtime": 43.7617, "eval_samples_per_second": 6.307, "eval_steps_per_second": 0.8, "step": 72 }, { "epoch": 0.681645754304056, "grad_norm": 0.006328927046317208, "learning_rate": 1.8325410376247295e-05, "loss": 0.6537, "step": 73 }, { "epoch": 0.6909833673767143, "grad_norm": 0.006322879070385963, "learning_rate": 1.826790983122527e-05, "loss": 0.6792, "step": 74 }, { "epoch": 0.7003209804493726, "grad_norm": 0.0059267041402792315, "learning_rate": 1.8209532099664177e-05, "loss": 0.6497, "step": 75 }, { "epoch": 0.709658593522031, "grad_norm": 0.006265125029941557, "learning_rate": 1.8150283375168112e-05, "loss": 0.6375, "step": 76 }, { "epoch": 0.7189962065946892, "grad_norm": 0.0057329086343299585, "learning_rate": 1.8090169943749477e-05, "loss": 0.6505, "step": 77 }, { "epoch": 0.7283338196673476, "grad_norm": 0.006315125437389684, "learning_rate": 1.8029198183162e-05, "loss": 0.641, "step": 78 }, { "epoch": 0.7376714327400058, "grad_norm": 0.00602643377500652, "learning_rate": 1.796737456222413e-05, "loss": 0.6702, "step": 79 }, { "epoch": 0.7470090458126641, "grad_norm": 0.00618264758307804, "learning_rate": 1.7904705640132717e-05, "loss": 0.6414, "step": 80 }, { "epoch": 0.7563466588853225, "grad_norm": 0.0058530581998478, "learning_rate": 1.7841198065767107e-05, "loss": 0.65, "step": 81 }, { "epoch": 0.7656842719579807, "grad_norm": 0.005741561897188635, "learning_rate": 1.7776858576983713e-05, "loss": 0.659, "step": 82 }, { "epoch": 0.7750218850306391, "grad_norm": 0.005350843870415206, "learning_rate": 1.771169399990119e-05, "loss": 0.6464, "step": 83 }, { "epoch": 0.7843594981032973, "grad_norm": 0.005628603669293866, "learning_rate": 1.7645711248176198e-05, "loss": 0.6347, "step": 84 }, { "epoch": 0.7936971111759556, "grad_norm": 0.006024684364906471, "learning_rate": 1.7578917322269885e-05, "loss": 0.6133, "step": 85 }, { "epoch": 0.803034724248614, "grad_norm": 0.005557283692065632, "learning_rate": 1.7511319308705198e-05, "loss": 0.6244, "step": 86 }, { "epoch": 0.8123723373212722, "grad_norm": 0.00553799083367584, "learning_rate": 1.744292437931502e-05, "loss": 0.6528, "step": 87 }, { "epoch": 0.8217099503939306, "grad_norm": 0.005384068308149398, "learning_rate": 1.7373739790481263e-05, "loss": 0.6337, "step": 88 }, { "epoch": 0.8310475634665888, "grad_norm": 0.005255083208402985, "learning_rate": 1.7303772882365018e-05, "loss": 0.6063, "step": 89 }, { "epoch": 0.8403851765392472, "grad_norm": 0.005656017120071371, "learning_rate": 1.723303107812779e-05, "loss": 0.6194, "step": 90 }, { "epoch": 0.8497227896119055, "grad_norm": 0.005727064114913842, "learning_rate": 1.7161521883143936e-05, "loss": 0.6359, "step": 91 }, { "epoch": 0.8590604026845637, "grad_norm": 0.005332007932524027, "learning_rate": 1.7089252884204376e-05, "loss": 0.6188, "step": 92 }, { "epoch": 0.8683980157572221, "grad_norm": 0.004895963878943312, "learning_rate": 1.701623174871168e-05, "loss": 0.6156, "step": 93 }, { "epoch": 0.8777356288298803, "grad_norm": 0.0053198813356673084, "learning_rate": 1.6942466223866582e-05, "loss": 0.6165, "step": 94 }, { "epoch": 0.8870732419025387, "grad_norm": 0.005384472223204587, "learning_rate": 1.6867964135846043e-05, "loss": 0.6182, "step": 95 }, { "epoch": 0.896410854975197, "grad_norm": 0.005145515044871062, "learning_rate": 1.679273338897293e-05, "loss": 0.622, "step": 96 }, { "epoch": 0.9057484680478552, "grad_norm": 0.005287999487944072, "learning_rate": 1.6716781964877413e-05, "loss": 0.6346, "step": 97 }, { "epoch": 0.9150860811205136, "grad_norm": 0.005190281800590422, "learning_rate": 1.664011792165012e-05, "loss": 0.6219, "step": 98 }, { "epoch": 0.9244236941931718, "grad_norm": 0.0051116136333854015, "learning_rate": 1.6562749392987255e-05, "loss": 0.6234, "step": 99 }, { "epoch": 0.9337613072658302, "grad_norm": 0.005528069555146713, "learning_rate": 1.648468458732762e-05, "loss": 0.6238, "step": 100 }, { "epoch": 0.9430989203384885, "grad_norm": 0.00505378298539984, "learning_rate": 1.6405931786981753e-05, "loss": 0.611, "step": 101 }, { "epoch": 0.9524365334111468, "grad_norm": 0.005251284958033568, "learning_rate": 1.6326499347253206e-05, "loss": 0.6016, "step": 102 }, { "epoch": 0.9617741464838051, "grad_norm": 0.0052283843925876325, "learning_rate": 1.6246395695552086e-05, "loss": 0.6224, "step": 103 }, { "epoch": 0.9711117595564633, "grad_norm": 0.005202512120139698, "learning_rate": 1.6165629330500952e-05, "loss": 0.6082, "step": 104 }, { "epoch": 0.9804493726291217, "grad_norm": 0.004915735280290327, "learning_rate": 1.6084208821033152e-05, "loss": 0.5931, "step": 105 }, { "epoch": 0.98978698570178, "grad_norm": 0.0050879698990566845, "learning_rate": 1.6002142805483686e-05, "loss": 0.5945, "step": 106 }, { "epoch": 0.9991245987744383, "grad_norm": 0.005267364898779865, "learning_rate": 1.591943999067273e-05, "loss": 0.6108, "step": 107 } ], "logging_steps": 1, "max_steps": 321, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 107, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6387261951962513e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }