{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9945945945945946, "eval_steps": 500, "global_step": 115, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008648648648648649, "grad_norm": 2.249741315841675, "learning_rate": 0.0001, "loss": 1.8319, "step": 1 }, { "epoch": 0.017297297297297298, "grad_norm": 2.1813502311706543, "learning_rate": 0.0002, "loss": 1.4027, "step": 2 }, { "epoch": 0.025945945945945945, "grad_norm": 0.8601759672164917, "learning_rate": 0.00019823008849557524, "loss": 1.1102, "step": 3 }, { "epoch": 0.034594594594594595, "grad_norm": 1.7297605276107788, "learning_rate": 0.00019646017699115044, "loss": 1.3774, "step": 4 }, { "epoch": 0.043243243243243246, "grad_norm": 1.0936262607574463, "learning_rate": 0.00019469026548672567, "loss": 0.895, "step": 5 }, { "epoch": 0.05189189189189189, "grad_norm": 0.6946480870246887, "learning_rate": 0.00019292035398230087, "loss": 0.7451, "step": 6 }, { "epoch": 0.06054054054054054, "grad_norm": 0.45863592624664307, "learning_rate": 0.00019115044247787613, "loss": 0.876, "step": 7 }, { "epoch": 0.06918918918918919, "grad_norm": 0.5447478890419006, "learning_rate": 0.00018938053097345133, "loss": 0.7719, "step": 8 }, { "epoch": 0.07783783783783783, "grad_norm": 0.45514124631881714, "learning_rate": 0.00018761061946902656, "loss": 0.5759, "step": 9 }, { "epoch": 0.08648648648648649, "grad_norm": 0.4590395987033844, "learning_rate": 0.0001858407079646018, "loss": 0.5838, "step": 10 }, { "epoch": 0.09513513513513513, "grad_norm": 0.5425634384155273, "learning_rate": 0.000184070796460177, "loss": 0.6641, "step": 11 }, { "epoch": 0.10378378378378378, "grad_norm": 1.0379027128219604, "learning_rate": 0.00018230088495575222, "loss": 0.9623, "step": 12 }, { "epoch": 0.11243243243243244, "grad_norm": 0.5286022424697876, "learning_rate": 0.00018053097345132742, "loss": 0.4761, "step": 13 }, { "epoch": 0.12108108108108108, "grad_norm": 0.6451830267906189, "learning_rate": 0.00017876106194690265, "loss": 0.547, "step": 14 }, { "epoch": 0.12972972972972974, "grad_norm": 0.6369953751564026, "learning_rate": 0.0001769911504424779, "loss": 0.5872, "step": 15 }, { "epoch": 0.13837837837837838, "grad_norm": 0.4720052182674408, "learning_rate": 0.0001752212389380531, "loss": 0.3248, "step": 16 }, { "epoch": 0.14702702702702702, "grad_norm": 0.5918360352516174, "learning_rate": 0.00017345132743362834, "loss": 0.6277, "step": 17 }, { "epoch": 0.15567567567567567, "grad_norm": 0.5242601037025452, "learning_rate": 0.00017168141592920354, "loss": 0.5645, "step": 18 }, { "epoch": 0.1643243243243243, "grad_norm": 0.474292129278183, "learning_rate": 0.00016991150442477877, "loss": 0.2115, "step": 19 }, { "epoch": 0.17297297297297298, "grad_norm": 0.6523647904396057, "learning_rate": 0.000168141592920354, "loss": 0.5803, "step": 20 }, { "epoch": 0.18162162162162163, "grad_norm": 0.521297812461853, "learning_rate": 0.0001663716814159292, "loss": 0.4483, "step": 21 }, { "epoch": 0.19027027027027027, "grad_norm": 0.5689568519592285, "learning_rate": 0.00016460176991150443, "loss": 0.6231, "step": 22 }, { "epoch": 0.1989189189189189, "grad_norm": 0.4570567011833191, "learning_rate": 0.00016283185840707966, "loss": 0.2368, "step": 23 }, { "epoch": 0.20756756756756756, "grad_norm": 0.414307564496994, "learning_rate": 0.0001610619469026549, "loss": 0.4674, "step": 24 }, { "epoch": 0.21621621621621623, "grad_norm": 0.5027227997779846, "learning_rate": 0.0001592920353982301, "loss": 0.3558, "step": 25 }, { "epoch": 0.22486486486486487, "grad_norm": 0.4441507160663605, "learning_rate": 0.00015752212389380532, "loss": 0.437, "step": 26 }, { "epoch": 0.23351351351351352, "grad_norm": 0.4098701477050781, "learning_rate": 0.00015575221238938055, "loss": 0.3553, "step": 27 }, { "epoch": 0.24216216216216216, "grad_norm": 0.3602244257926941, "learning_rate": 0.00015398230088495575, "loss": 0.3689, "step": 28 }, { "epoch": 0.2508108108108108, "grad_norm": 0.4340718984603882, "learning_rate": 0.00015221238938053098, "loss": 0.318, "step": 29 }, { "epoch": 0.2594594594594595, "grad_norm": 0.44470590353012085, "learning_rate": 0.00015044247787610618, "loss": 0.4992, "step": 30 }, { "epoch": 0.2681081081081081, "grad_norm": 0.43699413537979126, "learning_rate": 0.00014867256637168144, "loss": 0.3362, "step": 31 }, { "epoch": 0.27675675675675676, "grad_norm": 0.4950752258300781, "learning_rate": 0.00014690265486725664, "loss": 0.4464, "step": 32 }, { "epoch": 0.28540540540540543, "grad_norm": 0.4312315881252289, "learning_rate": 0.00014513274336283187, "loss": 0.4786, "step": 33 }, { "epoch": 0.29405405405405405, "grad_norm": 0.45234543085098267, "learning_rate": 0.0001433628318584071, "loss": 0.5572, "step": 34 }, { "epoch": 0.3027027027027027, "grad_norm": 0.4373219311237335, "learning_rate": 0.0001415929203539823, "loss": 0.3873, "step": 35 }, { "epoch": 0.31135135135135134, "grad_norm": 0.35862988233566284, "learning_rate": 0.00013982300884955753, "loss": 0.2902, "step": 36 }, { "epoch": 0.32, "grad_norm": 0.41014787554740906, "learning_rate": 0.00013805309734513276, "loss": 0.3806, "step": 37 }, { "epoch": 0.3286486486486486, "grad_norm": 0.4181463420391083, "learning_rate": 0.00013628318584070796, "loss": 0.3036, "step": 38 }, { "epoch": 0.3372972972972973, "grad_norm": 0.3663095235824585, "learning_rate": 0.00013451327433628321, "loss": 0.1979, "step": 39 }, { "epoch": 0.34594594594594597, "grad_norm": 0.46295005083084106, "learning_rate": 0.00013274336283185842, "loss": 0.4204, "step": 40 }, { "epoch": 0.3545945945945946, "grad_norm": 0.39596325159072876, "learning_rate": 0.00013097345132743365, "loss": 0.3512, "step": 41 }, { "epoch": 0.36324324324324325, "grad_norm": 0.7628335952758789, "learning_rate": 0.00012920353982300885, "loss": 0.4965, "step": 42 }, { "epoch": 0.37189189189189187, "grad_norm": 0.5216770172119141, "learning_rate": 0.00012743362831858408, "loss": 0.4658, "step": 43 }, { "epoch": 0.38054054054054054, "grad_norm": 0.38578447699546814, "learning_rate": 0.0001256637168141593, "loss": 0.2661, "step": 44 }, { "epoch": 0.3891891891891892, "grad_norm": 0.2811882197856903, "learning_rate": 0.0001238938053097345, "loss": 0.1545, "step": 45 }, { "epoch": 0.3978378378378378, "grad_norm": 0.3812131881713867, "learning_rate": 0.00012212389380530974, "loss": 0.3295, "step": 46 }, { "epoch": 0.4064864864864865, "grad_norm": 0.3791070878505707, "learning_rate": 0.00012035398230088497, "loss": 0.2472, "step": 47 }, { "epoch": 0.4151351351351351, "grad_norm": 0.38515138626098633, "learning_rate": 0.0001185840707964602, "loss": 0.4042, "step": 48 }, { "epoch": 0.4237837837837838, "grad_norm": 0.5093116164207458, "learning_rate": 0.00011681415929203541, "loss": 0.8376, "step": 49 }, { "epoch": 0.43243243243243246, "grad_norm": 0.2971178889274597, "learning_rate": 0.00011504424778761063, "loss": 0.4082, "step": 50 }, { "epoch": 0.4410810810810811, "grad_norm": 0.30018818378448486, "learning_rate": 0.00011327433628318584, "loss": 0.129, "step": 51 }, { "epoch": 0.44972972972972974, "grad_norm": 0.4631483256816864, "learning_rate": 0.00011150442477876106, "loss": 0.3752, "step": 52 }, { "epoch": 0.45837837837837836, "grad_norm": 0.3890452980995178, "learning_rate": 0.00010973451327433629, "loss": 0.4054, "step": 53 }, { "epoch": 0.46702702702702703, "grad_norm": 0.3566686511039734, "learning_rate": 0.0001079646017699115, "loss": 0.2452, "step": 54 }, { "epoch": 0.4756756756756757, "grad_norm": 0.4903372526168823, "learning_rate": 0.00010619469026548674, "loss": 0.4505, "step": 55 }, { "epoch": 0.4843243243243243, "grad_norm": 0.3836239278316498, "learning_rate": 0.00010442477876106196, "loss": 0.3952, "step": 56 }, { "epoch": 0.492972972972973, "grad_norm": 0.42047417163848877, "learning_rate": 0.00010265486725663717, "loss": 0.5074, "step": 57 }, { "epoch": 0.5016216216216216, "grad_norm": 0.24409635365009308, "learning_rate": 0.00010088495575221239, "loss": 0.1389, "step": 58 }, { "epoch": 0.5102702702702703, "grad_norm": 0.3819220960140228, "learning_rate": 9.911504424778762e-05, "loss": 0.3945, "step": 59 }, { "epoch": 0.518918918918919, "grad_norm": 0.31148406863212585, "learning_rate": 9.734513274336283e-05, "loss": 0.5203, "step": 60 }, { "epoch": 0.5275675675675676, "grad_norm": 0.3157011866569519, "learning_rate": 9.557522123893806e-05, "loss": 0.262, "step": 61 }, { "epoch": 0.5362162162162162, "grad_norm": 0.40180379152297974, "learning_rate": 9.380530973451328e-05, "loss": 0.2404, "step": 62 }, { "epoch": 0.5448648648648649, "grad_norm": 0.4064180552959442, "learning_rate": 9.20353982300885e-05, "loss": 0.6118, "step": 63 }, { "epoch": 0.5535135135135135, "grad_norm": 0.3912467956542969, "learning_rate": 9.026548672566371e-05, "loss": 0.271, "step": 64 }, { "epoch": 0.5621621621621622, "grad_norm": 0.31059980392456055, "learning_rate": 8.849557522123895e-05, "loss": 0.2373, "step": 65 }, { "epoch": 0.5708108108108109, "grad_norm": 0.30928152799606323, "learning_rate": 8.672566371681417e-05, "loss": 0.4169, "step": 66 }, { "epoch": 0.5794594594594594, "grad_norm": 0.40631791949272156, "learning_rate": 8.495575221238938e-05, "loss": 0.4175, "step": 67 }, { "epoch": 0.5881081081081081, "grad_norm": 0.40440961718559265, "learning_rate": 8.31858407079646e-05, "loss": 0.3269, "step": 68 }, { "epoch": 0.5967567567567568, "grad_norm": 0.4534294009208679, "learning_rate": 8.141592920353983e-05, "loss": 0.2242, "step": 69 }, { "epoch": 0.6054054054054054, "grad_norm": 0.41317978501319885, "learning_rate": 7.964601769911504e-05, "loss": 0.2633, "step": 70 }, { "epoch": 0.614054054054054, "grad_norm": 0.272535115480423, "learning_rate": 7.787610619469027e-05, "loss": 0.1455, "step": 71 }, { "epoch": 0.6227027027027027, "grad_norm": 0.4280416667461395, "learning_rate": 7.610619469026549e-05, "loss": 0.5289, "step": 72 }, { "epoch": 0.6313513513513513, "grad_norm": 0.4870530664920807, "learning_rate": 7.433628318584072e-05, "loss": 0.5633, "step": 73 }, { "epoch": 0.64, "grad_norm": 0.38074707984924316, "learning_rate": 7.256637168141593e-05, "loss": 0.4738, "step": 74 }, { "epoch": 0.6486486486486487, "grad_norm": 0.32775411009788513, "learning_rate": 7.079646017699115e-05, "loss": 0.2764, "step": 75 }, { "epoch": 0.6572972972972972, "grad_norm": 0.3663316071033478, "learning_rate": 6.902654867256638e-05, "loss": 0.4794, "step": 76 }, { "epoch": 0.6659459459459459, "grad_norm": 0.36854031682014465, "learning_rate": 6.725663716814161e-05, "loss": 0.1809, "step": 77 }, { "epoch": 0.6745945945945946, "grad_norm": 0.37296342849731445, "learning_rate": 6.548672566371682e-05, "loss": 0.4067, "step": 78 }, { "epoch": 0.6832432432432433, "grad_norm": 0.4202044606208801, "learning_rate": 6.371681415929204e-05, "loss": 0.2752, "step": 79 }, { "epoch": 0.6918918918918919, "grad_norm": 0.29250282049179077, "learning_rate": 6.194690265486725e-05, "loss": 0.1461, "step": 80 }, { "epoch": 0.7005405405405405, "grad_norm": 0.37763354182243347, "learning_rate": 6.017699115044248e-05, "loss": 0.2817, "step": 81 }, { "epoch": 0.7091891891891892, "grad_norm": 0.30031171441078186, "learning_rate": 5.8407079646017705e-05, "loss": 0.1572, "step": 82 }, { "epoch": 0.7178378378378378, "grad_norm": 0.4519175887107849, "learning_rate": 5.663716814159292e-05, "loss": 0.3046, "step": 83 }, { "epoch": 0.7264864864864865, "grad_norm": 0.3103352189064026, "learning_rate": 5.486725663716814e-05, "loss": 0.1347, "step": 84 }, { "epoch": 0.7351351351351352, "grad_norm": 0.7960600852966309, "learning_rate": 5.309734513274337e-05, "loss": 0.3168, "step": 85 }, { "epoch": 0.7437837837837837, "grad_norm": 0.3281419277191162, "learning_rate": 5.132743362831859e-05, "loss": 0.2045, "step": 86 }, { "epoch": 0.7524324324324324, "grad_norm": 0.35785752534866333, "learning_rate": 4.955752212389381e-05, "loss": 0.4077, "step": 87 }, { "epoch": 0.7610810810810811, "grad_norm": 0.37461650371551514, "learning_rate": 4.778761061946903e-05, "loss": 0.3227, "step": 88 }, { "epoch": 0.7697297297297298, "grad_norm": 0.3365744352340698, "learning_rate": 4.601769911504425e-05, "loss": 0.2306, "step": 89 }, { "epoch": 0.7783783783783784, "grad_norm": 0.29543980956077576, "learning_rate": 4.4247787610619477e-05, "loss": 0.3661, "step": 90 }, { "epoch": 0.787027027027027, "grad_norm": 0.3135324716567993, "learning_rate": 4.247787610619469e-05, "loss": 0.2503, "step": 91 }, { "epoch": 0.7956756756756757, "grad_norm": 0.23556429147720337, "learning_rate": 4.0707964601769914e-05, "loss": 0.1044, "step": 92 }, { "epoch": 0.8043243243243243, "grad_norm": 0.2718769907951355, "learning_rate": 3.893805309734514e-05, "loss": 0.1471, "step": 93 }, { "epoch": 0.812972972972973, "grad_norm": 0.25528448820114136, "learning_rate": 3.716814159292036e-05, "loss": 0.1126, "step": 94 }, { "epoch": 0.8216216216216217, "grad_norm": 0.514164388179779, "learning_rate": 3.5398230088495574e-05, "loss": 0.3423, "step": 95 }, { "epoch": 0.8302702702702702, "grad_norm": 0.33162716031074524, "learning_rate": 3.3628318584070804e-05, "loss": 0.3637, "step": 96 }, { "epoch": 0.8389189189189189, "grad_norm": 0.25161704421043396, "learning_rate": 3.185840707964602e-05, "loss": 0.1284, "step": 97 }, { "epoch": 0.8475675675675676, "grad_norm": 0.32825589179992676, "learning_rate": 3.008849557522124e-05, "loss": 0.2171, "step": 98 }, { "epoch": 0.8562162162162162, "grad_norm": 0.23435255885124207, "learning_rate": 2.831858407079646e-05, "loss": 0.16, "step": 99 }, { "epoch": 0.8648648648648649, "grad_norm": 0.2661581337451935, "learning_rate": 2.6548672566371686e-05, "loss": 0.2421, "step": 100 }, { "epoch": 0.8735135135135135, "grad_norm": 0.2724602222442627, "learning_rate": 2.4778761061946905e-05, "loss": 0.1246, "step": 101 }, { "epoch": 0.8821621621621621, "grad_norm": 0.47894561290740967, "learning_rate": 2.3008849557522124e-05, "loss": 0.4472, "step": 102 }, { "epoch": 0.8908108108108108, "grad_norm": 0.3064163327217102, "learning_rate": 2.1238938053097346e-05, "loss": 0.2987, "step": 103 }, { "epoch": 0.8994594594594595, "grad_norm": 0.4226900637149811, "learning_rate": 1.946902654867257e-05, "loss": 0.4185, "step": 104 }, { "epoch": 0.9081081081081082, "grad_norm": 0.34745219349861145, "learning_rate": 1.7699115044247787e-05, "loss": 0.2572, "step": 105 }, { "epoch": 0.9167567567567567, "grad_norm": 0.35236531496047974, "learning_rate": 1.592920353982301e-05, "loss": 0.3427, "step": 106 }, { "epoch": 0.9254054054054054, "grad_norm": 0.37095391750335693, "learning_rate": 1.415929203539823e-05, "loss": 0.4018, "step": 107 }, { "epoch": 0.9340540540540541, "grad_norm": 0.3331229090690613, "learning_rate": 1.2389380530973452e-05, "loss": 0.2038, "step": 108 }, { "epoch": 0.9427027027027027, "grad_norm": 0.2652183175086975, "learning_rate": 1.0619469026548673e-05, "loss": 0.1072, "step": 109 }, { "epoch": 0.9513513513513514, "grad_norm": 0.29123690724372864, "learning_rate": 8.849557522123894e-06, "loss": 0.1406, "step": 110 }, { "epoch": 0.96, "grad_norm": 0.3317340612411499, "learning_rate": 7.079646017699115e-06, "loss": 0.2202, "step": 111 }, { "epoch": 0.9686486486486486, "grad_norm": 0.47986647486686707, "learning_rate": 5.3097345132743365e-06, "loss": 0.3464, "step": 112 }, { "epoch": 0.9772972972972973, "grad_norm": 0.2612822949886322, "learning_rate": 3.5398230088495575e-06, "loss": 0.1271, "step": 113 }, { "epoch": 0.985945945945946, "grad_norm": 0.26845863461494446, "learning_rate": 1.7699115044247788e-06, "loss": 0.1044, "step": 114 }, { "epoch": 0.9945945945945946, "grad_norm": 0.2526237368583679, "learning_rate": 0.0, "loss": 0.1158, "step": 115 }, { "epoch": 0.9945945945945946, "step": 115, "total_flos": 1.3431114641260646e+17, "train_loss": 0.4029887131374815, "train_runtime": 1125.7865, "train_samples_per_second": 0.822, "train_steps_per_second": 0.102 } ], "logging_steps": 1, "max_steps": 115, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3431114641260646e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }