{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.13647698934482, "eval_steps": 187, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001813647698934482, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.5015, "step": 1 }, { "epoch": 0.001813647698934482, "eval_loss": 1.6466877460479736, "eval_runtime": 184.4231, "eval_samples_per_second": 5.422, "eval_steps_per_second": 5.422, "step": 1 }, { "epoch": 0.001813647698934482, "mmlu_eval_accuracy": 0.33621221873608415, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.5333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.4782608695652174, "mmlu_eval_accuracy_high_school_us_history": 0.45454545454545453, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.23076923076923078, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.5185185185185185, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.47368421052631576, "mmlu_loss": 3.0115479675482333, "step": 1 }, { "epoch": 0.003627295397868964, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.4377, "step": 2 }, { "epoch": 0.005440943096803446, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.3771, "step": 3 }, { "epoch": 0.007254590795737928, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.1892, "step": 4 }, { "epoch": 0.00906823849467241, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2242, "step": 5 }, { "epoch": 0.010881886193606891, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1738, "step": 6 }, { "epoch": 0.012695533892541374, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.2274, "step": 7 }, { "epoch": 0.014509181591475856, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.2588, "step": 8 }, { "epoch": 0.01632282929041034, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.341, "step": 9 }, { "epoch": 0.01813647698934482, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3113, "step": 10 }, { "epoch": 0.0199501246882793, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.4729, "step": 11 }, { "epoch": 0.021763772387213783, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2172, "step": 12 }, { "epoch": 0.023577420086148267, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.229, "step": 13 }, { "epoch": 0.02539106778508275, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.3288, "step": 14 }, { "epoch": 0.02720471548401723, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.2193, "step": 15 }, { "epoch": 0.02901836318295171, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1073, "step": 16 }, { "epoch": 0.030832010881886193, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1827, "step": 17 }, { "epoch": 0.03264565858082068, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.202, "step": 18 }, { "epoch": 0.03445930627975516, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1891, "step": 19 }, { "epoch": 0.03627295397868964, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2195, "step": 20 }, { "epoch": 0.03808660167762412, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3, "step": 21 }, { "epoch": 0.0399002493765586, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1547, "step": 22 }, { "epoch": 0.041713897075493084, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.4449, "step": 23 }, { "epoch": 0.043527544774427565, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.4047, "step": 24 }, { "epoch": 0.045341192473362046, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2845, "step": 25 }, { "epoch": 0.047154840172296535, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.1272, "step": 26 }, { "epoch": 0.048968487871231016, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.3677, "step": 27 }, { "epoch": 0.0507821355701655, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.1898, "step": 28 }, { "epoch": 0.05259578326909998, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.5051, "step": 29 }, { "epoch": 0.05440943096803446, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.3389, "step": 30 }, { "epoch": 0.05622307866696894, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.5376, "step": 31 }, { "epoch": 0.05803672636590342, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.2573, "step": 32 }, { "epoch": 0.059850374064837904, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.5189, "step": 33 }, { "epoch": 0.061664021763772385, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.4587, "step": 34 }, { "epoch": 0.06347766946270687, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.6342, "step": 35 }, { "epoch": 0.06529131716164135, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.4001, "step": 36 }, { "epoch": 0.06710496486057584, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.4888, "step": 37 }, { "epoch": 0.06891861255951032, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.4825, "step": 38 }, { "epoch": 0.0707322602584448, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.4546, "step": 39 }, { "epoch": 0.07254590795737928, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.5669, "step": 40 }, { "epoch": 0.07435955565631376, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.5562, "step": 41 }, { "epoch": 0.07617320335524824, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.5764, "step": 42 }, { "epoch": 0.07798685105418272, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.6263, "step": 43 }, { "epoch": 0.0798004987531172, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.398, "step": 44 }, { "epoch": 0.08161414645205169, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.7397, "step": 45 }, { "epoch": 0.08342779415098617, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 1.5049, "step": 46 }, { "epoch": 0.08524144184992065, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.4864, "step": 47 }, { "epoch": 0.08705508954885513, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.5446, "step": 48 }, { "epoch": 0.08886873724778961, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.6262, "step": 49 }, { "epoch": 0.09068238494672409, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 1.6154, "step": 50 }, { "epoch": 0.09249603264565857, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4766, "step": 51 }, { "epoch": 0.09430968034459307, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.227, "step": 52 }, { "epoch": 0.09612332804352755, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2837, "step": 53 }, { "epoch": 0.09793697574246203, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1381, "step": 54 }, { "epoch": 0.09975062344139651, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.3538, "step": 55 }, { "epoch": 0.101564271140331, "grad_norm": 0.21875, "learning_rate": 0.0002, "loss": 1.213, "step": 56 }, { "epoch": 0.10337791883926548, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.2243, "step": 57 }, { "epoch": 0.10519156653819996, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.2613, "step": 58 }, { "epoch": 0.10700521423713444, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1426, "step": 59 }, { "epoch": 0.10881886193606892, "grad_norm": 0.2294921875, "learning_rate": 0.0002, "loss": 1.1833, "step": 60 }, { "epoch": 0.1106325096350034, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1682, "step": 61 }, { "epoch": 0.11244615733393788, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.118, "step": 62 }, { "epoch": 0.11425980503287236, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.2574, "step": 63 }, { "epoch": 0.11607345273180684, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.0472, "step": 64 }, { "epoch": 0.11788710043074133, "grad_norm": 0.2353515625, "learning_rate": 0.0002, "loss": 1.1215, "step": 65 }, { "epoch": 0.11970074812967581, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.1675, "step": 66 }, { "epoch": 0.12151439582861029, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.3034, "step": 67 }, { "epoch": 0.12332804352754477, "grad_norm": 0.2412109375, "learning_rate": 0.0002, "loss": 1.169, "step": 68 }, { "epoch": 0.12514169122647925, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1797, "step": 69 }, { "epoch": 0.12695533892541375, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3288, "step": 70 }, { "epoch": 0.12876898662434821, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1534, "step": 71 }, { "epoch": 0.1305826343232827, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.187, "step": 72 }, { "epoch": 0.13239628202221718, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.3006, "step": 73 }, { "epoch": 0.13420992972115167, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.4077, "step": 74 }, { "epoch": 0.13602357742008614, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.3565, "step": 75 }, { "epoch": 0.13783722511902063, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.3168, "step": 76 }, { "epoch": 0.1396508728179551, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.4921, "step": 77 }, { "epoch": 0.1414645205168896, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.3016, "step": 78 }, { "epoch": 0.14327816821582406, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.2557, "step": 79 }, { "epoch": 0.14509181591475856, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.2711, "step": 80 }, { "epoch": 0.14690546361369303, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.2416, "step": 81 }, { "epoch": 0.14871911131262752, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.1167, "step": 82 }, { "epoch": 0.15053275901156202, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.6469, "step": 83 }, { "epoch": 0.15234640671049648, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.3807, "step": 84 }, { "epoch": 0.15416005440943098, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.5197, "step": 85 }, { "epoch": 0.15597370210836545, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.4334, "step": 86 }, { "epoch": 0.15778734980729994, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1994, "step": 87 }, { "epoch": 0.1596009975062344, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.4853, "step": 88 }, { "epoch": 0.1614146452051689, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.6212, "step": 89 }, { "epoch": 0.16322829290410337, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.5533, "step": 90 }, { "epoch": 0.16504194060303787, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.454, "step": 91 }, { "epoch": 0.16685558830197234, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.4495, "step": 92 }, { "epoch": 0.16866923600090683, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.5247, "step": 93 }, { "epoch": 0.1704828836998413, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.6935, "step": 94 }, { "epoch": 0.1722965313987758, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.4708, "step": 95 }, { "epoch": 0.17411017909771026, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.5192, "step": 96 }, { "epoch": 0.17592382679664476, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.4805, "step": 97 }, { "epoch": 0.17773747449557922, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.7075, "step": 98 }, { "epoch": 0.17955112219451372, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.4596, "step": 99 }, { "epoch": 0.18136476989344819, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 1.6481, "step": 100 }, { "epoch": 0.18317841759238268, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.2084, "step": 101 }, { "epoch": 0.18499206529131715, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.2892, "step": 102 }, { "epoch": 0.18680571299025164, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.357, "step": 103 }, { "epoch": 0.18861936068918614, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.1254, "step": 104 }, { "epoch": 0.1904330083881206, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.06, "step": 105 }, { "epoch": 0.1922466560870551, "grad_norm": 0.2177734375, "learning_rate": 0.0002, "loss": 1.2552, "step": 106 }, { "epoch": 0.19406030378598957, "grad_norm": 0.2138671875, "learning_rate": 0.0002, "loss": 1.0762, "step": 107 }, { "epoch": 0.19587395148492406, "grad_norm": 0.232421875, "learning_rate": 0.0002, "loss": 1.0012, "step": 108 }, { "epoch": 0.19768759918385853, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 1.104, "step": 109 }, { "epoch": 0.19950124688279303, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.1978, "step": 110 }, { "epoch": 0.2013148945817275, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 1.1635, "step": 111 }, { "epoch": 0.203128542280662, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.264, "step": 112 }, { "epoch": 0.20494218997959646, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3028, "step": 113 }, { "epoch": 0.20675583767853095, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1005, "step": 114 }, { "epoch": 0.20856948537746542, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0968, "step": 115 }, { "epoch": 0.21038313307639991, "grad_norm": 0.2236328125, "learning_rate": 0.0002, "loss": 1.0876, "step": 116 }, { "epoch": 0.21219678077533438, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.1072, "step": 117 }, { "epoch": 0.21401042847426888, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3396, "step": 118 }, { "epoch": 0.21582407617320334, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.4276, "step": 119 }, { "epoch": 0.21763772387213784, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.1021, "step": 120 }, { "epoch": 0.2194513715710723, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1605, "step": 121 }, { "epoch": 0.2212650192700068, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.195, "step": 122 }, { "epoch": 0.22307866696894127, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3376, "step": 123 }, { "epoch": 0.22489231466787576, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3084, "step": 124 }, { "epoch": 0.22670596236681026, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.3883, "step": 125 }, { "epoch": 0.22851961006574473, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2899, "step": 126 }, { "epoch": 0.23033325776467922, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2261, "step": 127 }, { "epoch": 0.2321469054636137, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1584, "step": 128 }, { "epoch": 0.23396055316254818, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2669, "step": 129 }, { "epoch": 0.23577420086148265, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2131, "step": 130 }, { "epoch": 0.23758784856041715, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.4114, "step": 131 }, { "epoch": 0.23940149625935161, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.6023, "step": 132 }, { "epoch": 0.2412151439582861, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.4415, "step": 133 }, { "epoch": 0.24302879165722058, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.461, "step": 134 }, { "epoch": 0.24484243935615507, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.4245, "step": 135 }, { "epoch": 0.24665608705508954, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.2804, "step": 136 }, { "epoch": 0.24846973475402404, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.4295, "step": 137 }, { "epoch": 0.2502833824529585, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.4614, "step": 138 }, { "epoch": 0.252097030151893, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.4262, "step": 139 }, { "epoch": 0.2539106778508275, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.4931, "step": 140 }, { "epoch": 0.25572432554976193, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.6645, "step": 141 }, { "epoch": 0.25753797324869643, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.6593, "step": 142 }, { "epoch": 0.2593516209476309, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.6971, "step": 143 }, { "epoch": 0.2611652686465654, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.496, "step": 144 }, { "epoch": 0.2629789163454999, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.655, "step": 145 }, { "epoch": 0.26479256404443435, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.5476, "step": 146 }, { "epoch": 0.26660621174336885, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.6437, "step": 147 }, { "epoch": 0.26841985944230334, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.6532, "step": 148 }, { "epoch": 0.27023350714123784, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.562, "step": 149 }, { "epoch": 0.2720471548401723, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 1.4171, "step": 150 }, { "epoch": 0.2738608025391068, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.1292, "step": 151 }, { "epoch": 0.27567445023804127, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.282, "step": 152 }, { "epoch": 0.27748809793697576, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2425, "step": 153 }, { "epoch": 0.2793017456359102, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1044, "step": 154 }, { "epoch": 0.2811153933348447, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.5104, "step": 155 }, { "epoch": 0.2829290410337792, "grad_norm": 0.2197265625, "learning_rate": 0.0002, "loss": 1.0225, "step": 156 }, { "epoch": 0.2847426887327137, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1639, "step": 157 }, { "epoch": 0.28655633643164813, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.0176, "step": 158 }, { "epoch": 0.2883699841305826, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2877, "step": 159 }, { "epoch": 0.2901836318295171, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 1.1055, "step": 160 }, { "epoch": 0.2919972795284516, "grad_norm": 0.23828125, "learning_rate": 0.0002, "loss": 1.1787, "step": 161 }, { "epoch": 0.29381092722738605, "grad_norm": 0.2431640625, "learning_rate": 0.0002, "loss": 1.1531, "step": 162 }, { "epoch": 0.29562457492632055, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.2722, "step": 163 }, { "epoch": 0.29743822262525504, "grad_norm": 0.2373046875, "learning_rate": 0.0002, "loss": 1.3058, "step": 164 }, { "epoch": 0.29925187032418954, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1056, "step": 165 }, { "epoch": 0.30106551802312403, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1966, "step": 166 }, { "epoch": 0.3028791657220585, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2526, "step": 167 }, { "epoch": 0.30469281342099297, "grad_norm": 0.228515625, "learning_rate": 0.0002, "loss": 0.9436, "step": 168 }, { "epoch": 0.30650646111992746, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2531, "step": 169 }, { "epoch": 0.30832010881886196, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.096, "step": 170 }, { "epoch": 0.3101337565177964, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.22, "step": 171 }, { "epoch": 0.3119474042167309, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.3183, "step": 172 }, { "epoch": 0.3137610519156654, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2699, "step": 173 }, { "epoch": 0.3155746996145999, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1881, "step": 174 }, { "epoch": 0.3173883473135343, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.3254, "step": 175 }, { "epoch": 0.3192019950124688, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2557, "step": 176 }, { "epoch": 0.3210156427114033, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.4129, "step": 177 }, { "epoch": 0.3228292904103378, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4197, "step": 178 }, { "epoch": 0.32464293810927225, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.3244, "step": 179 }, { "epoch": 0.32645658580820675, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2684, "step": 180 }, { "epoch": 0.32827023350714124, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.4048, "step": 181 }, { "epoch": 0.33008388120607574, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.1783, "step": 182 }, { "epoch": 0.3318975289050102, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.5841, "step": 183 }, { "epoch": 0.33371117660394467, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1746, "step": 184 }, { "epoch": 0.33552482430287917, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.5421, "step": 185 }, { "epoch": 0.33733847200181366, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.4201, "step": 186 }, { "epoch": 0.33915211970074816, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.5031, "step": 187 }, { "epoch": 0.33915211970074816, "eval_loss": 1.3133106231689453, "eval_runtime": 188.1934, "eval_samples_per_second": 5.314, "eval_steps_per_second": 5.314, "step": 187 }, { "epoch": 0.33915211970074816, "mmlu_eval_accuracy": 0.3312593434354807, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.0, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.0625, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.058823529411764705, "mmlu_eval_accuracy_high_school_psychology": 0.48333333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.22857142857142856, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.2235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.16666666666666666, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 1.7405756021698342, "step": 187 }, { "epoch": 0.3409657673996826, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.408, "step": 188 }, { "epoch": 0.3427794150986171, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.4726, "step": 189 }, { "epoch": 0.3445930627975516, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.4017, "step": 190 }, { "epoch": 0.3464067104964861, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.5738, "step": 191 }, { "epoch": 0.3482203581954205, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.5379, "step": 192 }, { "epoch": 0.350034005894355, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.5852, "step": 193 }, { "epoch": 0.3518476535932895, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.6836, "step": 194 }, { "epoch": 0.353661301292224, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.629, "step": 195 }, { "epoch": 0.35547494899115845, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.4376, "step": 196 }, { "epoch": 0.35728859669009294, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 1.5276, "step": 197 }, { "epoch": 0.35910224438902744, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.7393, "step": 198 }, { "epoch": 0.36091589208796193, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 1.6959, "step": 199 }, { "epoch": 0.36272953978689637, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 1.5106, "step": 200 }, { "epoch": 0.36454318748583087, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.3627, "step": 201 }, { "epoch": 0.36635683518476536, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.318, "step": 202 }, { "epoch": 0.36817048288369986, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.2995, "step": 203 }, { "epoch": 0.3699841305826343, "grad_norm": 0.2490234375, "learning_rate": 0.0002, "loss": 1.1392, "step": 204 }, { "epoch": 0.3717977782815688, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1414, "step": 205 }, { "epoch": 0.3736114259805033, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.2664, "step": 206 }, { "epoch": 0.3754250736794378, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2554, "step": 207 }, { "epoch": 0.3772387213783723, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.245, "step": 208 }, { "epoch": 0.3790523690773067, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.1193, "step": 209 }, { "epoch": 0.3808660167762412, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.2551, "step": 210 }, { "epoch": 0.3826796644751757, "grad_norm": 0.240234375, "learning_rate": 0.0002, "loss": 1.2059, "step": 211 }, { "epoch": 0.3844933121741102, "grad_norm": 0.2333984375, "learning_rate": 0.0002, "loss": 1.1943, "step": 212 }, { "epoch": 0.38630695987304464, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.3448, "step": 213 }, { "epoch": 0.38812060757197914, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1542, "step": 214 }, { "epoch": 0.38993425527091363, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.1178, "step": 215 }, { "epoch": 0.3917479029698481, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0975, "step": 216 }, { "epoch": 0.39356155066878257, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1133, "step": 217 }, { "epoch": 0.39537519836771706, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.2995, "step": 218 }, { "epoch": 0.39718884606665156, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.5084, "step": 219 }, { "epoch": 0.39900249376558605, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.2239, "step": 220 }, { "epoch": 0.4008161414645205, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.3434, "step": 221 }, { "epoch": 0.402629789163455, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2954, "step": 222 }, { "epoch": 0.4044434368623895, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.1402, "step": 223 }, { "epoch": 0.406257084561324, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2346, "step": 224 }, { "epoch": 0.4080707322602584, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 1.1126, "step": 225 }, { "epoch": 0.4098843799591929, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.611, "step": 226 }, { "epoch": 0.4116980276581274, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2428, "step": 227 }, { "epoch": 0.4135116753570619, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.5344, "step": 228 }, { "epoch": 0.4153253230559964, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.4064, "step": 229 }, { "epoch": 0.41713897075493084, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.4556, "step": 230 }, { "epoch": 0.41895261845386533, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.4315, "step": 231 }, { "epoch": 0.42076626615279983, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.6069, "step": 232 }, { "epoch": 0.4225799138517343, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.5175, "step": 233 }, { "epoch": 0.42439356155066876, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.3385, "step": 234 }, { "epoch": 0.42620720924960326, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.3146, "step": 235 }, { "epoch": 0.42802085694853775, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.2962, "step": 236 }, { "epoch": 0.42983450464747225, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.6408, "step": 237 }, { "epoch": 0.4316481523464067, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.7029, "step": 238 }, { "epoch": 0.4334618000453412, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.2781, "step": 239 }, { "epoch": 0.4352754477442757, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.5138, "step": 240 }, { "epoch": 0.4370890954432102, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 1.4857, "step": 241 }, { "epoch": 0.4389027431421446, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.5972, "step": 242 }, { "epoch": 0.4407163908410791, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.75, "step": 243 }, { "epoch": 0.4425300385400136, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.4883, "step": 244 }, { "epoch": 0.4443436862389481, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.7075, "step": 245 }, { "epoch": 0.44615733393788254, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.891, "step": 246 }, { "epoch": 0.44797098163681703, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.5443, "step": 247 }, { "epoch": 0.44978462933575153, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 1.751, "step": 248 }, { "epoch": 0.451598277034686, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.6492, "step": 249 }, { "epoch": 0.4534119247336205, "grad_norm": 1.5, "learning_rate": 0.0002, "loss": 1.5454, "step": 250 }, { "epoch": 0.45522557243255496, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2872, "step": 251 }, { "epoch": 0.45703922013148945, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.0105, "step": 252 }, { "epoch": 0.45885286783042395, "grad_norm": 0.30078125, "learning_rate": 0.0002, "loss": 1.1036, "step": 253 }, { "epoch": 0.46066651552935844, "grad_norm": 0.244140625, "learning_rate": 0.0002, "loss": 1.2631, "step": 254 }, { "epoch": 0.4624801632282929, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.0232, "step": 255 }, { "epoch": 0.4642938109272274, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.0303, "step": 256 }, { "epoch": 0.4661074586261619, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.2779, "step": 257 }, { "epoch": 0.46792110632509637, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.0825, "step": 258 }, { "epoch": 0.4697347540240308, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.1631, "step": 259 }, { "epoch": 0.4715484017229653, "grad_norm": 0.234375, "learning_rate": 0.0002, "loss": 1.1376, "step": 260 }, { "epoch": 0.4733620494218998, "grad_norm": 0.2314453125, "learning_rate": 0.0002, "loss": 1.0957, "step": 261 }, { "epoch": 0.4751756971208343, "grad_norm": 0.23046875, "learning_rate": 0.0002, "loss": 0.9679, "step": 262 }, { "epoch": 0.47698934481976873, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.0217, "step": 263 }, { "epoch": 0.47880299251870323, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.3054, "step": 264 }, { "epoch": 0.4806166402176377, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.0746, "step": 265 }, { "epoch": 0.4824302879165722, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1915, "step": 266 }, { "epoch": 0.4842439356155067, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.0596, "step": 267 }, { "epoch": 0.48605758331444116, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.245, "step": 268 }, { "epoch": 0.48787123101337565, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1634, "step": 269 }, { "epoch": 0.48968487871231015, "grad_norm": 0.251953125, "learning_rate": 0.0002, "loss": 1.0669, "step": 270 }, { "epoch": 0.49149852641124464, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.3429, "step": 271 }, { "epoch": 0.4933121741101791, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.2615, "step": 272 }, { "epoch": 0.4951258218091136, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.293, "step": 273 }, { "epoch": 0.49693946950804807, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.3402, "step": 274 }, { "epoch": 0.49875311720698257, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.2295, "step": 275 }, { "epoch": 0.500566764905917, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.4477, "step": 276 }, { "epoch": 0.5023804126048516, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 1.4013, "step": 277 }, { "epoch": 0.504194060303786, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.2456, "step": 278 }, { "epoch": 0.5060077080027204, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.3309, "step": 279 }, { "epoch": 0.507821355701655, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2746, "step": 280 }, { "epoch": 0.5096350034005894, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2293, "step": 281 }, { "epoch": 0.5114486510995239, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.426, "step": 282 }, { "epoch": 0.5132622987984584, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.4193, "step": 283 }, { "epoch": 0.5150759464973929, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.2746, "step": 284 }, { "epoch": 0.5168895941963274, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.5684, "step": 285 }, { "epoch": 0.5187032418952618, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.2694, "step": 286 }, { "epoch": 0.5205168895941963, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.4593, "step": 287 }, { "epoch": 0.5223305372931308, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.5412, "step": 288 }, { "epoch": 0.5241441849920653, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.5721, "step": 289 }, { "epoch": 0.5259578326909998, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.537, "step": 290 }, { "epoch": 0.5277714803899343, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.4568, "step": 291 }, { "epoch": 0.5295851280888687, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.5471, "step": 292 }, { "epoch": 0.5313987757878033, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.7897, "step": 293 }, { "epoch": 0.5332124234867377, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.6854, "step": 294 }, { "epoch": 0.5350260711856721, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.7163, "step": 295 }, { "epoch": 0.5368397188846067, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.7795, "step": 296 }, { "epoch": 0.5386533665835411, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.3912, "step": 297 }, { "epoch": 0.5404670142824757, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.5641, "step": 298 }, { "epoch": 0.5422806619814101, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 1.6289, "step": 299 }, { "epoch": 0.5440943096803446, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 1.1803, "step": 300 }, { "epoch": 0.5459079573792791, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2861, "step": 301 }, { "epoch": 0.5477216050782135, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.2247, "step": 302 }, { "epoch": 0.549535252777148, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2261, "step": 303 }, { "epoch": 0.5513489004760825, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2244, "step": 304 }, { "epoch": 0.553162548175017, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1726, "step": 305 }, { "epoch": 0.5549761958739515, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.0531, "step": 306 }, { "epoch": 0.556789843572886, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 1.2132, "step": 307 }, { "epoch": 0.5586034912718204, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0823, "step": 308 }, { "epoch": 0.560417138970755, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.3665, "step": 309 }, { "epoch": 0.5622307866696894, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.2339, "step": 310 }, { "epoch": 0.564044434368624, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.3419, "step": 311 }, { "epoch": 0.5658580820675584, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.2144, "step": 312 }, { "epoch": 0.5676717297664928, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1296, "step": 313 }, { "epoch": 0.5694853774654274, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.122, "step": 314 }, { "epoch": 0.5712990251643618, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.2799, "step": 315 }, { "epoch": 0.5731126728632963, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.2014, "step": 316 }, { "epoch": 0.5749263205622308, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 0.98, "step": 317 }, { "epoch": 0.5767399682611652, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1339, "step": 318 }, { "epoch": 0.5785536159600998, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.1581, "step": 319 }, { "epoch": 0.5803672636590342, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.139, "step": 320 }, { "epoch": 0.5821809113579687, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2013, "step": 321 }, { "epoch": 0.5839945590569032, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 1.2367, "step": 322 }, { "epoch": 0.5858082067558377, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.0817, "step": 323 }, { "epoch": 0.5876218544547721, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1253, "step": 324 }, { "epoch": 0.5894355021537067, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.2042, "step": 325 }, { "epoch": 0.5912491498526411, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.3985, "step": 326 }, { "epoch": 0.5930627975515756, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.2344, "step": 327 }, { "epoch": 0.5948764452505101, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.2887, "step": 328 }, { "epoch": 0.5966900929494445, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.453, "step": 329 }, { "epoch": 0.5985037406483791, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.3783, "step": 330 }, { "epoch": 0.6003173883473135, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1576, "step": 331 }, { "epoch": 0.6021310360462481, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.4443, "step": 332 }, { "epoch": 0.6039446837451825, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.7867, "step": 333 }, { "epoch": 0.605758331444117, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.4991, "step": 334 }, { "epoch": 0.6075719791430515, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.6615, "step": 335 }, { "epoch": 0.6093856268419859, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.5153, "step": 336 }, { "epoch": 0.6111992745409204, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.5653, "step": 337 }, { "epoch": 0.6130129222398549, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 1.6538, "step": 338 }, { "epoch": 0.6148265699387894, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.4496, "step": 339 }, { "epoch": 0.6166402176377239, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 1.4053, "step": 340 }, { "epoch": 0.6184538653366584, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.5777, "step": 341 }, { "epoch": 0.6202675130355928, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.5032, "step": 342 }, { "epoch": 0.6220811607345273, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 1.467, "step": 343 }, { "epoch": 0.6238948084334618, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.4448, "step": 344 }, { "epoch": 0.6257084561323962, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.777, "step": 345 }, { "epoch": 0.6275221038313308, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.6, "step": 346 }, { "epoch": 0.6293357515302652, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.7073, "step": 347 }, { "epoch": 0.6311493992291998, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.488, "step": 348 }, { "epoch": 0.6329630469281342, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 1.698, "step": 349 }, { "epoch": 0.6347766946270686, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 1.5281, "step": 350 }, { "epoch": 0.6365903423260032, "grad_norm": 0.333984375, "learning_rate": 0.0002, "loss": 1.1551, "step": 351 }, { "epoch": 0.6384039900249376, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.1898, "step": 352 }, { "epoch": 0.6402176377238722, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.403, "step": 353 }, { "epoch": 0.6420312854228066, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1012, "step": 354 }, { "epoch": 0.6438449331217411, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1918, "step": 355 }, { "epoch": 0.6456585808206756, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.1242, "step": 356 }, { "epoch": 0.6474722285196101, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.0429, "step": 357 }, { "epoch": 0.6492858762185445, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.3523, "step": 358 }, { "epoch": 0.651099523917479, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1875, "step": 359 }, { "epoch": 0.6529131716164135, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.2178, "step": 360 }, { "epoch": 0.654726819315348, "grad_norm": 0.236328125, "learning_rate": 0.0002, "loss": 1.2441, "step": 361 }, { "epoch": 0.6565404670142825, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1908, "step": 362 }, { "epoch": 0.6583541147132169, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2172, "step": 363 }, { "epoch": 0.6601677624121515, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.1093, "step": 364 }, { "epoch": 0.6619814101110859, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1313, "step": 365 }, { "epoch": 0.6637950578100204, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.2267, "step": 366 }, { "epoch": 0.6656087055089549, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.3657, "step": 367 }, { "epoch": 0.6674223532078893, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.1652, "step": 368 }, { "epoch": 0.6692360009068239, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1241, "step": 369 }, { "epoch": 0.6710496486057583, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.0701, "step": 370 }, { "epoch": 0.6728632963046928, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.1091, "step": 371 }, { "epoch": 0.6746769440036273, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.4014, "step": 372 }, { "epoch": 0.6764905917025618, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.2299, "step": 373 }, { "epoch": 0.6783042394014963, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.1305, "step": 374 }, { "epoch": 0.6783042394014963, "eval_loss": 1.3023930788040161, "eval_runtime": 186.0283, "eval_samples_per_second": 5.376, "eval_steps_per_second": 5.376, "step": 374 }, { "epoch": 0.6783042394014963, "mmlu_eval_accuracy": 0.3136796523053793, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.7272727272727273, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.5454545454545454, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.0625, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.25, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724, "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, "mmlu_eval_accuracy_high_school_physics": 0.17647058823529413, "mmlu_eval_accuracy_high_school_psychology": 0.4666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.21739130434782608, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.23076923076923078, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.4, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3333333333333333, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.22857142857142856, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.21764705882352942, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.25, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.47368421052631576, "mmlu_loss": 2.1696450489942425, "step": 374 }, { "epoch": 0.6801178871004308, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.1191, "step": 375 }, { "epoch": 0.6819315347993652, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.3652, "step": 376 }, { "epoch": 0.6837451824982997, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2818, "step": 377 }, { "epoch": 0.6855588301972342, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2144, "step": 378 }, { "epoch": 0.6873724778961686, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.2651, "step": 379 }, { "epoch": 0.6891861255951032, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.4844, "step": 380 }, { "epoch": 0.6909997732940376, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.4189, "step": 381 }, { "epoch": 0.6928134209929722, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.4348, "step": 382 }, { "epoch": 0.6946270686919066, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.441, "step": 383 }, { "epoch": 0.696440716390841, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.3865, "step": 384 }, { "epoch": 0.6982543640897756, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.3256, "step": 385 }, { "epoch": 0.70006801178871, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.4633, "step": 386 }, { "epoch": 0.7018816594876445, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.5899, "step": 387 }, { "epoch": 0.703695307186579, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.3027, "step": 388 }, { "epoch": 0.7055089548855135, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.3775, "step": 389 }, { "epoch": 0.707322602584448, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.5785, "step": 390 }, { "epoch": 0.7091362502833825, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.4817, "step": 391 }, { "epoch": 0.7109498979823169, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.5678, "step": 392 }, { "epoch": 0.7127635456812514, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.6544, "step": 393 }, { "epoch": 0.7145771933801859, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.7613, "step": 394 }, { "epoch": 0.7163908410791204, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.5232, "step": 395 }, { "epoch": 0.7182044887780549, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 1.4288, "step": 396 }, { "epoch": 0.7200181364769893, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.5251, "step": 397 }, { "epoch": 0.7218317841759239, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 1.5403, "step": 398 }, { "epoch": 0.7236454318748583, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.4416, "step": 399 }, { "epoch": 0.7254590795737927, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 1.2659, "step": 400 }, { "epoch": 0.7272727272727273, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 1.2122, "step": 401 }, { "epoch": 0.7290863749716617, "grad_norm": 0.294921875, "learning_rate": 0.0002, "loss": 1.4167, "step": 402 }, { "epoch": 0.7309000226705963, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1115, "step": 403 }, { "epoch": 0.7327136703695307, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.1862, "step": 404 }, { "epoch": 0.7345273180684652, "grad_norm": 0.2451171875, "learning_rate": 0.0002, "loss": 1.1375, "step": 405 }, { "epoch": 0.7363409657673997, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1284, "step": 406 }, { "epoch": 0.7381546134663342, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.0784, "step": 407 }, { "epoch": 0.7399682611652686, "grad_norm": 0.24609375, "learning_rate": 0.0002, "loss": 1.1046, "step": 408 }, { "epoch": 0.7417819088642031, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.0311, "step": 409 }, { "epoch": 0.7435955565631376, "grad_norm": 0.259765625, "learning_rate": 0.0002, "loss": 0.9772, "step": 410 }, { "epoch": 0.7454092042620721, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 0.9911, "step": 411 }, { "epoch": 0.7472228519610066, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2253, "step": 412 }, { "epoch": 0.749036499659941, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.0447, "step": 413 }, { "epoch": 0.7508501473588756, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.0139, "step": 414 }, { "epoch": 0.75266379505781, "grad_norm": 0.255859375, "learning_rate": 0.0002, "loss": 1.198, "step": 415 }, { "epoch": 0.7544774427567446, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1241, "step": 416 }, { "epoch": 0.756291090455679, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.1835, "step": 417 }, { "epoch": 0.7581047381546134, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.3203, "step": 418 }, { "epoch": 0.759918385853548, "grad_norm": 0.248046875, "learning_rate": 0.0002, "loss": 1.0885, "step": 419 }, { "epoch": 0.7617320335524824, "grad_norm": 0.302734375, "learning_rate": 0.0002, "loss": 1.2132, "step": 420 }, { "epoch": 0.7635456812514169, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 1.3443, "step": 421 }, { "epoch": 0.7653593289503514, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2337, "step": 422 }, { "epoch": 0.7671729766492859, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 1.1246, "step": 423 }, { "epoch": 0.7689866243482204, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.2807, "step": 424 }, { "epoch": 0.7708002720471548, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1381, "step": 425 }, { "epoch": 0.7726139197460893, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.2952, "step": 426 }, { "epoch": 0.7744275674450238, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.2337, "step": 427 }, { "epoch": 0.7762412151439583, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.2612, "step": 428 }, { "epoch": 0.7780548628428927, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.3941, "step": 429 }, { "epoch": 0.7798685105418273, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.2281, "step": 430 }, { "epoch": 0.7816821582407617, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.6503, "step": 431 }, { "epoch": 0.7834958059396963, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.5759, "step": 432 }, { "epoch": 0.7853094536386307, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.3439, "step": 433 }, { "epoch": 0.7871231013375651, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.4722, "step": 434 }, { "epoch": 0.7889367490364997, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.3159, "step": 435 }, { "epoch": 0.7907503967354341, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.401, "step": 436 }, { "epoch": 0.7925640444343687, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.5007, "step": 437 }, { "epoch": 0.7943776921333031, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.571, "step": 438 }, { "epoch": 0.7961913398322376, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 1.8376, "step": 439 }, { "epoch": 0.7980049875311721, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.5594, "step": 440 }, { "epoch": 0.7998186352301065, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.8833, "step": 441 }, { "epoch": 0.801632282929041, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.5904, "step": 442 }, { "epoch": 0.8034459306279755, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.6674, "step": 443 }, { "epoch": 0.80525957832691, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.7212, "step": 444 }, { "epoch": 0.8070732260258445, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 1.3363, "step": 445 }, { "epoch": 0.808886873724779, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.5612, "step": 446 }, { "epoch": 0.8107005214237134, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.7908, "step": 447 }, { "epoch": 0.812514169122648, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.5888, "step": 448 }, { "epoch": 0.8143278168215824, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 1.4739, "step": 449 }, { "epoch": 0.8161414645205168, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 1.4889, "step": 450 }, { "epoch": 0.8179551122194514, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.2179, "step": 451 }, { "epoch": 0.8197687599183858, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1862, "step": 452 }, { "epoch": 0.8215824076173204, "grad_norm": 0.2470703125, "learning_rate": 0.0002, "loss": 0.9227, "step": 453 }, { "epoch": 0.8233960553162548, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1819, "step": 454 }, { "epoch": 0.8252097030151893, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1065, "step": 455 }, { "epoch": 0.8270233507141238, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0868, "step": 456 }, { "epoch": 0.8288369984130582, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.3115, "step": 457 }, { "epoch": 0.8306506461119928, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0932, "step": 458 }, { "epoch": 0.8324642938109272, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.1924, "step": 459 }, { "epoch": 0.8342779415098617, "grad_norm": 0.25390625, "learning_rate": 0.0002, "loss": 1.1142, "step": 460 }, { "epoch": 0.8360915892087962, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.1131, "step": 461 }, { "epoch": 0.8379052369077307, "grad_norm": 0.2421875, "learning_rate": 0.0002, "loss": 1.0699, "step": 462 }, { "epoch": 0.8397188846066651, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.2745, "step": 463 }, { "epoch": 0.8415325323055997, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.2706, "step": 464 }, { "epoch": 0.8433461800045341, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.0591, "step": 465 }, { "epoch": 0.8451598277034686, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.0202, "step": 466 }, { "epoch": 0.8469734754024031, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.0544, "step": 467 }, { "epoch": 0.8487871231013375, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.0813, "step": 468 }, { "epoch": 0.8506007708002721, "grad_norm": 0.28125, "learning_rate": 0.0002, "loss": 1.1605, "step": 469 }, { "epoch": 0.8524144184992065, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1818, "step": 470 }, { "epoch": 0.854228066198141, "grad_norm": 0.29296875, "learning_rate": 0.0002, "loss": 1.1301, "step": 471 }, { "epoch": 0.8560417138970755, "grad_norm": 0.306640625, "learning_rate": 0.0002, "loss": 1.2639, "step": 472 }, { "epoch": 0.85785536159601, "grad_norm": 0.3046875, "learning_rate": 0.0002, "loss": 1.2298, "step": 473 }, { "epoch": 0.8596690092949445, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.3902, "step": 474 }, { "epoch": 0.8614826569938789, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1954, "step": 475 }, { "epoch": 0.8632963046928134, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.0341, "step": 476 }, { "epoch": 0.8651099523917479, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.5285, "step": 477 }, { "epoch": 0.8669236000906824, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.2754, "step": 478 }, { "epoch": 0.8687372477896169, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 1.206, "step": 479 }, { "epoch": 0.8705508954885514, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.3143, "step": 480 }, { "epoch": 0.8723645431874858, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.4437, "step": 481 }, { "epoch": 0.8741781908864203, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.246, "step": 482 }, { "epoch": 0.8759918385853548, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.3295, "step": 483 }, { "epoch": 0.8778054862842892, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.4447, "step": 484 }, { "epoch": 0.8796191339832238, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 1.4323, "step": 485 }, { "epoch": 0.8814327816821582, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.4802, "step": 486 }, { "epoch": 0.8832464293810928, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.5318, "step": 487 }, { "epoch": 0.8850600770800272, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.4659, "step": 488 }, { "epoch": 0.8868737247789616, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.4863, "step": 489 }, { "epoch": 0.8886873724778962, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 1.6515, "step": 490 }, { "epoch": 0.8905010201768306, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 1.4753, "step": 491 }, { "epoch": 0.8923146678757651, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.6447, "step": 492 }, { "epoch": 0.8941283155746996, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 1.8025, "step": 493 }, { "epoch": 0.8959419632736341, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.5968, "step": 494 }, { "epoch": 0.8977556109725686, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.7615, "step": 495 }, { "epoch": 0.8995692586715031, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.5668, "step": 496 }, { "epoch": 0.9013829063704375, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.5634, "step": 497 }, { "epoch": 0.903196554069372, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 1.5558, "step": 498 }, { "epoch": 0.9050102017683065, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 1.8796, "step": 499 }, { "epoch": 0.906823849467241, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 1.3844, "step": 500 }, { "epoch": 0.9086374971661755, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.2639, "step": 501 }, { "epoch": 0.9104511448651099, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.046, "step": 502 }, { "epoch": 0.9122647925640445, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.0359, "step": 503 }, { "epoch": 0.9140784402629789, "grad_norm": 0.2578125, "learning_rate": 0.0002, "loss": 1.0403, "step": 504 }, { "epoch": 0.9158920879619133, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.1221, "step": 505 }, { "epoch": 0.9177057356608479, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1616, "step": 506 }, { "epoch": 0.9195193833597823, "grad_norm": 0.275390625, "learning_rate": 0.0002, "loss": 1.1921, "step": 507 }, { "epoch": 0.9213330310587169, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.2143, "step": 508 }, { "epoch": 0.9231466787576513, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.1714, "step": 509 }, { "epoch": 0.9249603264565858, "grad_norm": 0.263671875, "learning_rate": 0.0002, "loss": 1.0514, "step": 510 }, { "epoch": 0.9267739741555203, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.3183, "step": 511 }, { "epoch": 0.9285876218544548, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.2968, "step": 512 }, { "epoch": 0.9304012695533893, "grad_norm": 0.265625, "learning_rate": 0.0002, "loss": 1.1773, "step": 513 }, { "epoch": 0.9322149172523237, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1384, "step": 514 }, { "epoch": 0.9340285649512582, "grad_norm": 0.25, "learning_rate": 0.0002, "loss": 1.1475, "step": 515 }, { "epoch": 0.9358422126501927, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.2867, "step": 516 }, { "epoch": 0.9376558603491272, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1377, "step": 517 }, { "epoch": 0.9394695080480616, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.0727, "step": 518 }, { "epoch": 0.9412831557469962, "grad_norm": 0.2890625, "learning_rate": 0.0002, "loss": 1.255, "step": 519 }, { "epoch": 0.9430968034459306, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 1.0452, "step": 520 }, { "epoch": 0.9449104511448652, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.4331, "step": 521 }, { "epoch": 0.9467240988437996, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.0905, "step": 522 }, { "epoch": 0.948537746542734, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.2676, "step": 523 }, { "epoch": 0.9503513942416686, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 1.1812, "step": 524 }, { "epoch": 0.952165041940603, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.1154, "step": 525 }, { "epoch": 0.9539786896395375, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 1.1382, "step": 526 }, { "epoch": 0.955792337338472, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.2664, "step": 527 }, { "epoch": 0.9576059850374065, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 1.4355, "step": 528 }, { "epoch": 0.959419632736341, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.62, "step": 529 }, { "epoch": 0.9612332804352755, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.3598, "step": 530 }, { "epoch": 0.9630469281342099, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.3033, "step": 531 }, { "epoch": 0.9648605758331444, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.4873, "step": 532 }, { "epoch": 0.9666742235320789, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.4179, "step": 533 }, { "epoch": 0.9684878712310134, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.2967, "step": 534 }, { "epoch": 0.9703015189299479, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.5159, "step": 535 }, { "epoch": 0.9721151666288823, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.5027, "step": 536 }, { "epoch": 0.9739288143278169, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.2173, "step": 537 }, { "epoch": 0.9757424620267513, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.5244, "step": 538 }, { "epoch": 0.9775561097256857, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.5312, "step": 539 }, { "epoch": 0.9793697574246203, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.5789, "step": 540 }, { "epoch": 0.9811834051235547, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.221, "step": 541 }, { "epoch": 0.9829970528224893, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.7273, "step": 542 }, { "epoch": 0.9848107005214237, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.3231, "step": 543 }, { "epoch": 0.9866243482203582, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.4912, "step": 544 }, { "epoch": 0.9884379959192927, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.4419, "step": 545 }, { "epoch": 0.9902516436182272, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.6741, "step": 546 }, { "epoch": 0.9920652913171616, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.5213, "step": 547 }, { "epoch": 0.9938789390160961, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.6728, "step": 548 }, { "epoch": 0.9956925867150306, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 1.4617, "step": 549 }, { "epoch": 0.9975062344139651, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 1.4499, "step": 550 }, { "epoch": 0.9993198821128996, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.4419, "step": 551 }, { "epoch": 1.001133529811834, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.2958, "step": 552 }, { "epoch": 1.0029471775107686, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 1.1073, "step": 553 }, { "epoch": 1.0047608252097031, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 0.959, "step": 554 }, { "epoch": 1.0065744729086374, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.0505, "step": 555 }, { "epoch": 1.008388120607572, "grad_norm": 0.26953125, "learning_rate": 0.0002, "loss": 1.0185, "step": 556 }, { "epoch": 1.0102017683065065, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.0867, "step": 557 }, { "epoch": 1.0120154160054409, "grad_norm": 0.26171875, "learning_rate": 0.0002, "loss": 1.207, "step": 558 }, { "epoch": 1.0138290637043754, "grad_norm": 0.267578125, "learning_rate": 0.0002, "loss": 1.1806, "step": 559 }, { "epoch": 1.01564271140331, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 1.1202, "step": 560 }, { "epoch": 1.0174563591022443, "grad_norm": 0.28515625, "learning_rate": 0.0002, "loss": 1.2813, "step": 561 }, { "epoch": 1.0174563591022443, "eval_loss": 1.3087825775146484, "eval_runtime": 186.0784, "eval_samples_per_second": 5.374, "eval_steps_per_second": 5.374, "step": 561 }, { "epoch": 1.0174563591022443, "mmlu_eval_accuracy": 0.3137830478164283, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.25, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.2727272727272727, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.0, "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.2413793103448276, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.11764705882352941, "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.45454545454545453, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.2571428571428571, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2235294117647059, "mmlu_eval_accuracy_professional_medicine": 0.3870967741935484, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.3684210526315789, "mmlu_loss": 1.895603528955573, "step": 561 }, { "epoch": 1.0192700068011789, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.1503, "step": 562 }, { "epoch": 1.0210836545001134, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.0253, "step": 563 }, { "epoch": 1.0228973021990477, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.0189, "step": 564 }, { "epoch": 1.0247109498979823, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 0.9643, "step": 565 }, { "epoch": 1.0265245975969168, "grad_norm": 0.283203125, "learning_rate": 0.0002, "loss": 0.9652, "step": 566 }, { "epoch": 1.0283382452958514, "grad_norm": 0.2734375, "learning_rate": 0.0002, "loss": 1.2185, "step": 567 }, { "epoch": 1.0301518929947857, "grad_norm": 0.271484375, "learning_rate": 0.0002, "loss": 0.8569, "step": 568 }, { "epoch": 1.0319655406937203, "grad_norm": 0.279296875, "learning_rate": 0.0002, "loss": 0.9985, "step": 569 }, { "epoch": 1.0337791883926548, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.9606, "step": 570 }, { "epoch": 1.0355928360915891, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.9466, "step": 571 }, { "epoch": 1.0374064837905237, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.0585, "step": 572 }, { "epoch": 1.0392201314894582, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 1.0081, "step": 573 }, { "epoch": 1.0410337791883926, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 1.0179, "step": 574 }, { "epoch": 1.0428474268873271, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0345, "step": 575 }, { "epoch": 1.0446610745862617, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.0002, "step": 576 }, { "epoch": 1.046474722285196, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.3126, "step": 577 }, { "epoch": 1.0482883699841306, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 1.3275, "step": 578 }, { "epoch": 1.050102017683065, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.9948, "step": 579 }, { "epoch": 1.0519156653819997, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0417, "step": 580 }, { "epoch": 1.053729313080934, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.8949, "step": 581 }, { "epoch": 1.0555429607798685, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.3989, "step": 582 }, { "epoch": 1.057356608478803, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.0548, "step": 583 }, { "epoch": 1.0591702561777374, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.0148, "step": 584 }, { "epoch": 1.060983903876672, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 1.336, "step": 585 }, { "epoch": 1.0627975515756065, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.947, "step": 586 }, { "epoch": 1.0646111992745408, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.0958, "step": 587 }, { "epoch": 1.0664248469734754, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.2236, "step": 588 }, { "epoch": 1.06823849467241, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.1424, "step": 589 }, { "epoch": 1.0700521423713443, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.1045, "step": 590 }, { "epoch": 1.0718657900702788, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.0649, "step": 591 }, { "epoch": 1.0736794377692134, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 1.3731, "step": 592 }, { "epoch": 1.075493085468148, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.1269, "step": 593 }, { "epoch": 1.0773067331670823, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.2366, "step": 594 }, { "epoch": 1.0791203808660168, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.3126, "step": 595 }, { "epoch": 1.0809340285649514, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.2801, "step": 596 }, { "epoch": 1.0827476762638857, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.1353, "step": 597 }, { "epoch": 1.0845613239628202, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.1172, "step": 598 }, { "epoch": 1.0863749716617548, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.414, "step": 599 }, { "epoch": 1.0881886193606891, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 1.0668, "step": 600 }, { "epoch": 1.0900022670596237, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 1.0065, "step": 601 }, { "epoch": 1.0918159147585582, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.1066, "step": 602 }, { "epoch": 1.0936295624574925, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.152, "step": 603 }, { "epoch": 1.095443210156427, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.0338, "step": 604 }, { "epoch": 1.0972568578553616, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1365, "step": 605 }, { "epoch": 1.099070505554296, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.055, "step": 606 }, { "epoch": 1.1008841532532305, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.002, "step": 607 }, { "epoch": 1.102697800952165, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.9918, "step": 608 }, { "epoch": 1.1045114486510996, "grad_norm": 0.322265625, "learning_rate": 0.0002, "loss": 1.0651, "step": 609 }, { "epoch": 1.106325096350034, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.999, "step": 610 }, { "epoch": 1.1081387440489685, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.968, "step": 611 }, { "epoch": 1.109952391747903, "grad_norm": 0.287109375, "learning_rate": 0.0002, "loss": 1.0446, "step": 612 }, { "epoch": 1.1117660394468374, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.996, "step": 613 }, { "epoch": 1.113579687145772, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.9761, "step": 614 }, { "epoch": 1.1153933348447065, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.9968, "step": 615 }, { "epoch": 1.1172069825436408, "grad_norm": 0.314453125, "learning_rate": 0.0002, "loss": 0.8517, "step": 616 }, { "epoch": 1.1190206302425754, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.9781, "step": 617 }, { "epoch": 1.12083427794151, "grad_norm": 0.296875, "learning_rate": 0.0002, "loss": 0.9407, "step": 618 }, { "epoch": 1.1226479256404442, "grad_norm": 0.30859375, "learning_rate": 0.0002, "loss": 1.0508, "step": 619 }, { "epoch": 1.1244615733393788, "grad_norm": 0.291015625, "learning_rate": 0.0002, "loss": 0.9679, "step": 620 }, { "epoch": 1.1262752210383133, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 1.0524, "step": 621 }, { "epoch": 1.128088868737248, "grad_norm": 0.318359375, "learning_rate": 0.0002, "loss": 0.8769, "step": 622 }, { "epoch": 1.1299025164361822, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.1003, "step": 623 }, { "epoch": 1.1317161641351168, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.9972, "step": 624 }, { "epoch": 1.1335298118340513, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 1.027, "step": 625 }, { "epoch": 1.1353434595329857, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.8696, "step": 626 }, { "epoch": 1.1371571072319202, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.9838, "step": 627 }, { "epoch": 1.1389707549308548, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.0411, "step": 628 }, { "epoch": 1.140784402629789, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.0912, "step": 629 }, { "epoch": 1.1425980503287236, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.0335, "step": 630 }, { "epoch": 1.1444116980276582, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.9602, "step": 631 }, { "epoch": 1.1462253457265925, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.0953, "step": 632 }, { "epoch": 1.148038993425527, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.0101, "step": 633 }, { "epoch": 1.1498526411244616, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.1159, "step": 634 }, { "epoch": 1.151666288823396, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2389, "step": 635 }, { "epoch": 1.1534799365223305, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.2099, "step": 636 }, { "epoch": 1.155293584221265, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.2122, "step": 637 }, { "epoch": 1.1571072319201996, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.172, "step": 638 }, { "epoch": 1.158920879619134, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.0683, "step": 639 }, { "epoch": 1.1607345273180685, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.0005, "step": 640 }, { "epoch": 1.162548175017003, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.1648, "step": 641 }, { "epoch": 1.1643618227159374, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.286, "step": 642 }, { "epoch": 1.166175470414872, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.0166, "step": 643 }, { "epoch": 1.1679891181138065, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.2449, "step": 644 }, { "epoch": 1.1698027658127408, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.337, "step": 645 }, { "epoch": 1.1716164135116753, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.1278, "step": 646 }, { "epoch": 1.1734300612106099, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.3293, "step": 647 }, { "epoch": 1.1752437089095444, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 1.1442, "step": 648 }, { "epoch": 1.1770573566084788, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.0162, "step": 649 }, { "epoch": 1.1788710043074133, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 1.1672, "step": 650 }, { "epoch": 1.1806846520063479, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 1.1817, "step": 651 }, { "epoch": 1.1824982997052822, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 1.0051, "step": 652 }, { "epoch": 1.1843119474042167, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.0073, "step": 653 }, { "epoch": 1.1861255951031513, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.9907, "step": 654 }, { "epoch": 1.1879392428020856, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.9565, "step": 655 }, { "epoch": 1.1897528905010202, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.0513, "step": 656 }, { "epoch": 1.1915665381999547, "grad_norm": 0.3125, "learning_rate": 0.0002, "loss": 0.9232, "step": 657 }, { "epoch": 1.193380185898889, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 0.9321, "step": 658 }, { "epoch": 1.1951938335978236, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 1.053, "step": 659 }, { "epoch": 1.1970074812967582, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.8885, "step": 660 }, { "epoch": 1.1988211289956925, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 1.0229, "step": 661 }, { "epoch": 1.200634776694627, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.9782, "step": 662 }, { "epoch": 1.2024484243935616, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.9612, "step": 663 }, { "epoch": 1.204262072092496, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.8867, "step": 664 }, { "epoch": 1.2060757197914305, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.0413, "step": 665 }, { "epoch": 1.207889367490365, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 1.0497, "step": 666 }, { "epoch": 1.2097030151892996, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.9435, "step": 667 }, { "epoch": 1.211516662888234, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 1.0361, "step": 668 }, { "epoch": 1.2133303105871684, "grad_norm": 0.298828125, "learning_rate": 0.0002, "loss": 0.82, "step": 669 }, { "epoch": 1.215143958286103, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.0616, "step": 670 }, { "epoch": 1.2169576059850373, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.8381, "step": 671 }, { "epoch": 1.2187712536839719, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.9432, "step": 672 }, { "epoch": 1.2205849013829064, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.0964, "step": 673 }, { "epoch": 1.2223985490818408, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.0581, "step": 674 }, { "epoch": 1.2242121967807753, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.2126, "step": 675 }, { "epoch": 1.2260258444797099, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.9664, "step": 676 }, { "epoch": 1.2278394921786444, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.9676, "step": 677 }, { "epoch": 1.2296531398775787, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0767, "step": 678 }, { "epoch": 1.2314667875765133, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.0624, "step": 679 }, { "epoch": 1.2332804352754478, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.0404, "step": 680 }, { "epoch": 1.2350940829743822, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.9588, "step": 681 }, { "epoch": 1.2369077306733167, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.1889, "step": 682 }, { "epoch": 1.2387213783722513, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.0662, "step": 683 }, { "epoch": 1.2405350260711856, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.1588, "step": 684 }, { "epoch": 1.2423486737701201, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 1.0837, "step": 685 }, { "epoch": 1.2441623214690547, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.2982, "step": 686 }, { "epoch": 1.245975969167989, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.0978, "step": 687 }, { "epoch": 1.2477896168669236, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.1693, "step": 688 }, { "epoch": 1.2496032645658581, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.2647, "step": 689 }, { "epoch": 1.2514169122647925, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.3017, "step": 690 }, { "epoch": 1.253230559963727, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.2823, "step": 691 }, { "epoch": 1.2550442076626616, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.195, "step": 692 }, { "epoch": 1.2568578553615959, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.1837, "step": 693 }, { "epoch": 1.2586715030605304, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.13, "step": 694 }, { "epoch": 1.260485150759465, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 1.5439, "step": 695 }, { "epoch": 1.2622987984583993, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 1.1141, "step": 696 }, { "epoch": 1.2641124461573339, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.4548, "step": 697 }, { "epoch": 1.2659260938562684, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.2669, "step": 698 }, { "epoch": 1.267739741555203, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 1.375, "step": 699 }, { "epoch": 1.2695533892541375, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.1015, "step": 700 }, { "epoch": 1.2713670369530718, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 1.1171, "step": 701 }, { "epoch": 1.2731806846520064, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 1.0236, "step": 702 }, { "epoch": 1.274994332350941, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.2365, "step": 703 }, { "epoch": 1.2768079800498753, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 1.1731, "step": 704 }, { "epoch": 1.2786216277488098, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.9808, "step": 705 }, { "epoch": 1.2804352754477444, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.9518, "step": 706 }, { "epoch": 1.2822489231466787, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.059, "step": 707 }, { "epoch": 1.2840625708456133, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 1.0339, "step": 708 }, { "epoch": 1.2858762185445478, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 1.1769, "step": 709 }, { "epoch": 1.2876898662434821, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.9517, "step": 710 }, { "epoch": 1.2895035139424167, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.0548, "step": 711 }, { "epoch": 1.2913171616413512, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.9448, "step": 712 }, { "epoch": 1.2931308093402856, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.975, "step": 713 }, { "epoch": 1.2949444570392201, "grad_norm": 0.310546875, "learning_rate": 0.0002, "loss": 0.8819, "step": 714 }, { "epoch": 1.2967581047381547, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.1451, "step": 715 }, { "epoch": 1.298571752437089, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.9, "step": 716 }, { "epoch": 1.3003854001360236, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.9173, "step": 717 }, { "epoch": 1.302199047834958, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.0065, "step": 718 }, { "epoch": 1.3040126955338924, "grad_norm": 0.31640625, "learning_rate": 0.0002, "loss": 0.8532, "step": 719 }, { "epoch": 1.305826343232827, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 1.033, "step": 720 }, { "epoch": 1.3076399909317615, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 1.1255, "step": 721 }, { "epoch": 1.3094536386306959, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.9544, "step": 722 }, { "epoch": 1.3112672863296304, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.1084, "step": 723 }, { "epoch": 1.313080934028565, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.1912, "step": 724 }, { "epoch": 1.3148945817274995, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 1.0042, "step": 725 }, { "epoch": 1.3167082294264338, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.0245, "step": 726 }, { "epoch": 1.3185218771253684, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.0397, "step": 727 }, { "epoch": 1.320335524824303, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.9238, "step": 728 }, { "epoch": 1.3221491725232375, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.9639, "step": 729 }, { "epoch": 1.3239628202221718, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.2548, "step": 730 }, { "epoch": 1.3257764679211064, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.3046, "step": 731 }, { "epoch": 1.327590115620041, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.0847, "step": 732 }, { "epoch": 1.3294037633189753, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.1524, "step": 733 }, { "epoch": 1.3312174110179098, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 1.2186, "step": 734 }, { "epoch": 1.3330310587168444, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.0869, "step": 735 }, { "epoch": 1.3348447064157787, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.1275, "step": 736 }, { "epoch": 1.3366583541147132, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.1573, "step": 737 }, { "epoch": 1.3384720018136478, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.0194, "step": 738 }, { "epoch": 1.3402856495125821, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.2154, "step": 739 }, { "epoch": 1.3420992972115167, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.1871, "step": 740 }, { "epoch": 1.3439129449104512, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.0981, "step": 741 }, { "epoch": 1.3457265926093855, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.1404, "step": 742 }, { "epoch": 1.34754024030832, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.2253, "step": 743 }, { "epoch": 1.3493538880072546, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 1.36, "step": 744 }, { "epoch": 1.351167535706189, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.2659, "step": 745 }, { "epoch": 1.3529811834051235, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 1.1625, "step": 746 }, { "epoch": 1.354794831104058, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.1521, "step": 747 }, { "epoch": 1.3566084788029924, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.1541, "step": 748 }, { "epoch": 1.3566084788029924, "eval_loss": 1.3332726955413818, "eval_runtime": 185.617, "eval_samples_per_second": 5.387, "eval_steps_per_second": 5.387, "step": 748 }, { "epoch": 1.3566084788029924, "mmlu_eval_accuracy": 0.323347968487159, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.1875, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.35714285714285715, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.4666666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667, "mmlu_eval_accuracy_international_law": 0.23076923076923078, "mmlu_eval_accuracy_jurisprudence": 0.09090909090909091, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.36363636363636365, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.2411764705882353, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.42105263157894735, "mmlu_loss": 1.453398456338338, "step": 748 }, { "epoch": 1.358422126501927, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 1.132, "step": 749 }, { "epoch": 1.3602357742008615, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 1.1479, "step": 750 }, { "epoch": 1.3620494218997958, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 1.163, "step": 751 }, { "epoch": 1.3638630695987304, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 1.1212, "step": 752 }, { "epoch": 1.365676717297665, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 1.0803, "step": 753 }, { "epoch": 1.3674903649965995, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.8941, "step": 754 }, { "epoch": 1.3693040126955338, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.0914, "step": 755 }, { "epoch": 1.3711176603944684, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.9574, "step": 756 }, { "epoch": 1.372931308093403, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1723, "step": 757 }, { "epoch": 1.3747449557923375, "grad_norm": 0.337890625, "learning_rate": 0.0002, "loss": 1.1145, "step": 758 }, { "epoch": 1.3765586034912718, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.1032, "step": 759 }, { "epoch": 1.3783722511902063, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.0776, "step": 760 }, { "epoch": 1.380185898889141, "grad_norm": 0.32421875, "learning_rate": 0.0002, "loss": 0.9636, "step": 761 }, { "epoch": 1.3819995465880752, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 1.1415, "step": 762 }, { "epoch": 1.3838131942870098, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.1594, "step": 763 }, { "epoch": 1.3856268419859443, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 1.0068, "step": 764 }, { "epoch": 1.3874404896848787, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.1015, "step": 765 }, { "epoch": 1.3892541373838132, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.0336, "step": 766 }, { "epoch": 1.3910677850827478, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 1.0387, "step": 767 }, { "epoch": 1.392881432781682, "grad_norm": 0.33203125, "learning_rate": 0.0002, "loss": 0.9446, "step": 768 }, { "epoch": 1.3946950804806166, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 1.0795, "step": 769 }, { "epoch": 1.3965087281795512, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.8576, "step": 770 }, { "epoch": 1.3983223758784855, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.9668, "step": 771 }, { "epoch": 1.40013602357742, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.936, "step": 772 }, { "epoch": 1.4019496712763546, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 1.0929, "step": 773 }, { "epoch": 1.403763318975289, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.087, "step": 774 }, { "epoch": 1.4055769666742235, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.9413, "step": 775 }, { "epoch": 1.407390614373158, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.0501, "step": 776 }, { "epoch": 1.4092042620720924, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.9804, "step": 777 }, { "epoch": 1.411017909771027, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 1.0712, "step": 778 }, { "epoch": 1.4128315574699615, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.1064, "step": 779 }, { "epoch": 1.414645205168896, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 1.1328, "step": 780 }, { "epoch": 1.4164588528678304, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 1.3042, "step": 781 }, { "epoch": 1.418272500566765, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 1.1637, "step": 782 }, { "epoch": 1.4200861482656995, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.1009, "step": 783 }, { "epoch": 1.421899795964634, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 1.0247, "step": 784 }, { "epoch": 1.4237134436635683, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 1.2177, "step": 785 }, { "epoch": 1.4255270913625029, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.2114, "step": 786 }, { "epoch": 1.4273407390614374, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.1233, "step": 787 }, { "epoch": 1.4291543867603718, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 1.1694, "step": 788 }, { "epoch": 1.4309680344593063, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.2571, "step": 789 }, { "epoch": 1.4327816821582409, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.1806, "step": 790 }, { "epoch": 1.4345953298571752, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.3243, "step": 791 }, { "epoch": 1.4364089775561097, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.2546, "step": 792 }, { "epoch": 1.4382226252550443, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 1.1647, "step": 793 }, { "epoch": 1.4400362729539786, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.0702, "step": 794 }, { "epoch": 1.4418499206529132, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.2722, "step": 795 }, { "epoch": 1.4436635683518477, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 1.1607, "step": 796 }, { "epoch": 1.445477216050782, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.3516, "step": 797 }, { "epoch": 1.4472908637497166, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.2323, "step": 798 }, { "epoch": 1.4491045114486512, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 1.1486, "step": 799 }, { "epoch": 1.4509181591475855, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 1.0743, "step": 800 }, { "epoch": 1.45273180684652, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 1.0043, "step": 801 }, { "epoch": 1.4545454545454546, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.9166, "step": 802 }, { "epoch": 1.456359102244389, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 1.1894, "step": 803 }, { "epoch": 1.4581727499433235, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0164, "step": 804 }, { "epoch": 1.459986397642258, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.057, "step": 805 }, { "epoch": 1.4618000453411923, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.8426, "step": 806 }, { "epoch": 1.463613693040127, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.0384, "step": 807 }, { "epoch": 1.4654273407390614, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.935, "step": 808 }, { "epoch": 1.467240988437996, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.9289, "step": 809 }, { "epoch": 1.4690546361369303, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.897, "step": 810 }, { "epoch": 1.4708682838358649, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.962, "step": 811 }, { "epoch": 1.4726819315347994, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.9371, "step": 812 }, { "epoch": 1.474495579233734, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 1.0029, "step": 813 }, { "epoch": 1.4763092269326683, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.9079, "step": 814 }, { "epoch": 1.4781228746316029, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.0711, "step": 815 }, { "epoch": 1.4799365223305374, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.9411, "step": 816 }, { "epoch": 1.4817501700294717, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0328, "step": 817 }, { "epoch": 1.4835638177284063, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 1.035, "step": 818 }, { "epoch": 1.4853774654273408, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.0472, "step": 819 }, { "epoch": 1.4871911131262752, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.8977, "step": 820 }, { "epoch": 1.4890047608252097, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.9769, "step": 821 }, { "epoch": 1.4908184085241443, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.954, "step": 822 }, { "epoch": 1.4926320562230786, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.0446, "step": 823 }, { "epoch": 1.4944457039220131, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.0384, "step": 824 }, { "epoch": 1.4962593516209477, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.9979, "step": 825 }, { "epoch": 1.498072999319882, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.1362, "step": 826 }, { "epoch": 1.4998866470188166, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.011, "step": 827 }, { "epoch": 1.5017002947177511, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.0002, "step": 828 }, { "epoch": 1.5035139424166855, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 1.0655, "step": 829 }, { "epoch": 1.50532759011562, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.9916, "step": 830 }, { "epoch": 1.5071412378145546, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 1.1136, "step": 831 }, { "epoch": 1.5089548855134889, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.2488, "step": 832 }, { "epoch": 1.5107685332124234, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.1293, "step": 833 }, { "epoch": 1.512582180911358, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 1.1062, "step": 834 }, { "epoch": 1.5143958286102923, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.2056, "step": 835 }, { "epoch": 1.516209476309227, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.995, "step": 836 }, { "epoch": 1.5180231240081614, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.2347, "step": 837 }, { "epoch": 1.5198367717070957, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.3434, "step": 838 }, { "epoch": 1.5216504194060305, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.2942, "step": 839 }, { "epoch": 1.5234640671049648, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.2702, "step": 840 }, { "epoch": 1.5252777148038992, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.0757, "step": 841 }, { "epoch": 1.527091362502834, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 1.2989, "step": 842 }, { "epoch": 1.5289050102017683, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.2326, "step": 843 }, { "epoch": 1.5307186579007028, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.307, "step": 844 }, { "epoch": 1.5325323055996374, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.2767, "step": 845 }, { "epoch": 1.5343459532985717, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 1.2491, "step": 846 }, { "epoch": 1.5361596009975063, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 1.3714, "step": 847 }, { "epoch": 1.5379732486964408, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 1.1377, "step": 848 }, { "epoch": 1.5397868963953751, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 1.0654, "step": 849 }, { "epoch": 1.5416005440943097, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 1.2524, "step": 850 }, { "epoch": 1.5434141917932442, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 1.1143, "step": 851 }, { "epoch": 1.5452278394921786, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.9904, "step": 852 }, { "epoch": 1.5470414871911131, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.9534, "step": 853 }, { "epoch": 1.5488551348900477, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.8924, "step": 854 }, { "epoch": 1.550668782588982, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 1.0201, "step": 855 }, { "epoch": 1.5524824302879165, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.1098, "step": 856 }, { "epoch": 1.554296077986851, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.9669, "step": 857 }, { "epoch": 1.5561097256857854, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.1127, "step": 858 }, { "epoch": 1.55792337338472, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.9041, "step": 859 }, { "epoch": 1.5597370210836545, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.0138, "step": 860 }, { "epoch": 1.5615506687825889, "grad_norm": 0.33984375, "learning_rate": 0.0002, "loss": 0.906, "step": 861 }, { "epoch": 1.5633643164815236, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.9815, "step": 862 }, { "epoch": 1.565177964180458, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 1.0134, "step": 863 }, { "epoch": 1.5669916118793923, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 1.0529, "step": 864 }, { "epoch": 1.568805259578327, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 1.0714, "step": 865 }, { "epoch": 1.5706189072772614, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.1258, "step": 866 }, { "epoch": 1.5724325549761957, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.9006, "step": 867 }, { "epoch": 1.5742462026751305, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.9467, "step": 868 }, { "epoch": 1.5760598503740648, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 1.1596, "step": 869 }, { "epoch": 1.5778734980729994, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.9521, "step": 870 }, { "epoch": 1.579687145771934, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 1.0002, "step": 871 }, { "epoch": 1.5815007934708682, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 1.0734, "step": 872 }, { "epoch": 1.5833144411698028, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 1.1564, "step": 873 }, { "epoch": 1.5851280888687374, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.975, "step": 874 }, { "epoch": 1.5869417365676717, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 1.0354, "step": 875 }, { "epoch": 1.5887553842666062, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.0049, "step": 876 }, { "epoch": 1.5905690319655408, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.1345, "step": 877 }, { "epoch": 1.592382679664475, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 1.0272, "step": 878 }, { "epoch": 1.5941963273634097, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 1.1008, "step": 879 }, { "epoch": 1.5960099750623442, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.2352, "step": 880 }, { "epoch": 1.5978236227612785, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 1.0289, "step": 881 }, { "epoch": 1.599637270460213, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 1.1604, "step": 882 }, { "epoch": 1.6014509181591476, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 1.286, "step": 883 }, { "epoch": 1.603264565858082, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2724, "step": 884 }, { "epoch": 1.6050782135570165, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.931, "step": 885 }, { "epoch": 1.606891861255951, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.0891, "step": 886 }, { "epoch": 1.6087055089548854, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.982, "step": 887 }, { "epoch": 1.61051915665382, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.061, "step": 888 }, { "epoch": 1.6123328043527545, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 1.1838, "step": 889 }, { "epoch": 1.6141464520516888, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.0621, "step": 890 }, { "epoch": 1.6159600997506236, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.1551, "step": 891 }, { "epoch": 1.617773747449558, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.0548, "step": 892 }, { "epoch": 1.6195873951484923, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 1.2508, "step": 893 }, { "epoch": 1.621401042847427, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.2455, "step": 894 }, { "epoch": 1.6232146905463614, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 1.2151, "step": 895 }, { "epoch": 1.6250283382452957, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 1.2464, "step": 896 }, { "epoch": 1.6268419859442305, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 1.184, "step": 897 }, { "epoch": 1.6286556336431648, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 1.2976, "step": 898 }, { "epoch": 1.6304692813420993, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.0167, "step": 899 }, { "epoch": 1.632282929041034, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 1.1079, "step": 900 }, { "epoch": 1.6340965767399682, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 1.2695, "step": 901 }, { "epoch": 1.6359102244389028, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 1.0757, "step": 902 }, { "epoch": 1.6377238721378373, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.883, "step": 903 }, { "epoch": 1.6395375198367717, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.0784, "step": 904 }, { "epoch": 1.6413511675357062, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 1.0178, "step": 905 }, { "epoch": 1.6431648152346408, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.1552, "step": 906 }, { "epoch": 1.644978462933575, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.1909, "step": 907 }, { "epoch": 1.6467921106325096, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.034, "step": 908 }, { "epoch": 1.6486057583314442, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.9236, "step": 909 }, { "epoch": 1.6504194060303785, "grad_norm": 0.349609375, "learning_rate": 0.0002, "loss": 0.9381, "step": 910 }, { "epoch": 1.652233053729313, "grad_norm": 0.34375, "learning_rate": 0.0002, "loss": 0.9001, "step": 911 }, { "epoch": 1.6540467014282476, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.9317, "step": 912 }, { "epoch": 1.655860349127182, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.9501, "step": 913 }, { "epoch": 1.6576739968261165, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 1.1684, "step": 914 }, { "epoch": 1.659487644525051, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.9256, "step": 915 }, { "epoch": 1.6613012922239854, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.0456, "step": 916 }, { "epoch": 1.66311493992292, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.8571, "step": 917 }, { "epoch": 1.6649285876218545, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.0265, "step": 918 }, { "epoch": 1.6667422353207888, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.8097, "step": 919 }, { "epoch": 1.6685558830197236, "grad_norm": 0.345703125, "learning_rate": 0.0002, "loss": 0.8663, "step": 920 }, { "epoch": 1.670369530718658, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.9378, "step": 921 }, { "epoch": 1.6721831784175922, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.9402, "step": 922 }, { "epoch": 1.673996826116527, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.069, "step": 923 }, { "epoch": 1.6758104738154613, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.9064, "step": 924 }, { "epoch": 1.6776241215143959, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.114, "step": 925 }, { "epoch": 1.6794377692133304, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.9939, "step": 926 }, { "epoch": 1.6812514169122648, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 1.0159, "step": 927 }, { "epoch": 1.6830650646111993, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.8874, "step": 928 }, { "epoch": 1.6848787123101339, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.8997, "step": 929 }, { "epoch": 1.6866923600090682, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.192, "step": 930 }, { "epoch": 1.6885060077080027, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 1.0925, "step": 931 }, { "epoch": 1.6903196554069373, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.9844, "step": 932 }, { "epoch": 1.6921333031058716, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.9969, "step": 933 }, { "epoch": 1.6939469508048062, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.0469, "step": 934 }, { "epoch": 1.6957605985037407, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.0665, "step": 935 }, { "epoch": 1.6957605985037407, "eval_loss": 1.3310478925704956, "eval_runtime": 186.2155, "eval_samples_per_second": 5.37, "eval_steps_per_second": 5.37, "step": 935 }, { "epoch": 1.6957605985037407, "mmlu_eval_accuracy": 0.3264374577697034, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.0, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.3125, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.5, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.30434782608695654, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.23076923076923078, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.09090909090909091, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.47674418604651164, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3333333333333333, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.22941176470588234, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.6363636363636364, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.3684210526315789, "mmlu_loss": 1.827259869259374, "step": 935 }, { "epoch": 1.697574246202675, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.2897, "step": 936 }, { "epoch": 1.6993878939016096, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.1101, "step": 937 }, { "epoch": 1.7012015416005442, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.1277, "step": 938 }, { "epoch": 1.7030151892994785, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 1.2044, "step": 939 }, { "epoch": 1.704828836998413, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.2564, "step": 940 }, { "epoch": 1.7066424846973476, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 1.1677, "step": 941 }, { "epoch": 1.708456132396282, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.2714, "step": 942 }, { "epoch": 1.7102697800952165, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.3053, "step": 943 }, { "epoch": 1.712083427794151, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 1.3784, "step": 944 }, { "epoch": 1.7138970754930853, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.1089, "step": 945 }, { "epoch": 1.7157107231920201, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 1.345, "step": 946 }, { "epoch": 1.7175243708909544, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.3185, "step": 947 }, { "epoch": 1.7193380185898888, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.2093, "step": 948 }, { "epoch": 1.7211516662888235, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 1.2966, "step": 949 }, { "epoch": 1.7229653139877579, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 1.0968, "step": 950 }, { "epoch": 1.7247789616866922, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 1.0479, "step": 951 }, { "epoch": 1.726592609385627, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 1.0872, "step": 952 }, { "epoch": 1.7284062570845613, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.2082, "step": 953 }, { "epoch": 1.7302199047834959, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.8991, "step": 954 }, { "epoch": 1.7320335524824304, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.0741, "step": 955 }, { "epoch": 1.7338472001813647, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.9182, "step": 956 }, { "epoch": 1.7356608478802993, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.9185, "step": 957 }, { "epoch": 1.7374744955792338, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.8024, "step": 958 }, { "epoch": 1.7392881432781682, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.8567, "step": 959 }, { "epoch": 1.7411017909771027, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.9807, "step": 960 }, { "epoch": 1.7429154386760373, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 1.1005, "step": 961 }, { "epoch": 1.7447290863749716, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.9461, "step": 962 }, { "epoch": 1.7465427340739061, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.9579, "step": 963 }, { "epoch": 1.7483563817728407, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.9263, "step": 964 }, { "epoch": 1.750170029471775, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.9587, "step": 965 }, { "epoch": 1.7519836771707096, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.7967, "step": 966 }, { "epoch": 1.7537973248696441, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.9873, "step": 967 }, { "epoch": 1.7556109725685785, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.8435, "step": 968 }, { "epoch": 1.757424620267513, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.987, "step": 969 }, { "epoch": 1.7592382679664476, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 1.2338, "step": 970 }, { "epoch": 1.7610519156653819, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.9796, "step": 971 }, { "epoch": 1.7628655633643164, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.1957, "step": 972 }, { "epoch": 1.764679211063251, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.9203, "step": 973 }, { "epoch": 1.7664928587621853, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 1.1501, "step": 974 }, { "epoch": 1.76830650646112, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 1.0063, "step": 975 }, { "epoch": 1.7701201541600544, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 1.1053, "step": 976 }, { "epoch": 1.7719338018589887, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.9871, "step": 977 }, { "epoch": 1.7737474495579235, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 1.349, "step": 978 }, { "epoch": 1.7755610972568578, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.0443, "step": 979 }, { "epoch": 1.7773747449557922, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.8922, "step": 980 }, { "epoch": 1.779188392654727, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.0508, "step": 981 }, { "epoch": 1.7810020403536613, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.185, "step": 982 }, { "epoch": 1.7828156880525958, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 1.1192, "step": 983 }, { "epoch": 1.7846293357515304, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 1.2335, "step": 984 }, { "epoch": 1.7864429834504647, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 1.226, "step": 985 }, { "epoch": 1.7882566311493993, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.1869, "step": 986 }, { "epoch": 1.7900702788483338, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 1.1249, "step": 987 }, { "epoch": 1.7918839265472681, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 1.1519, "step": 988 }, { "epoch": 1.7936975742462027, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.1811, "step": 989 }, { "epoch": 1.7955112219451372, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 1.0995, "step": 990 }, { "epoch": 1.7973248696440716, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 1.0938, "step": 991 }, { "epoch": 1.7991385173430061, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 1.2931, "step": 992 }, { "epoch": 1.8009521650419407, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 1.5371, "step": 993 }, { "epoch": 1.802765812740875, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 1.3052, "step": 994 }, { "epoch": 1.8045794604398095, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 1.2571, "step": 995 }, { "epoch": 1.806393108138744, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 1.2274, "step": 996 }, { "epoch": 1.8082067558376784, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 1.2686, "step": 997 }, { "epoch": 1.810020403536613, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.98, "step": 998 }, { "epoch": 1.8118340512355475, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 1.355, "step": 999 }, { "epoch": 1.8136476989344819, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 1.0669, "step": 1000 }, { "epoch": 1.8154613466334164, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 1.2389, "step": 1001 }, { "epoch": 1.817274994332351, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 1.2125, "step": 1002 }, { "epoch": 1.8190886420312853, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.856, "step": 1003 }, { "epoch": 1.82090228973022, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 1.0871, "step": 1004 }, { "epoch": 1.8227159374291544, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 1.0637, "step": 1005 }, { "epoch": 1.8245295851280887, "grad_norm": 0.3515625, "learning_rate": 0.0002, "loss": 0.9106, "step": 1006 }, { "epoch": 1.8263432328270235, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.9687, "step": 1007 }, { "epoch": 1.8281568805259578, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.9577, "step": 1008 }, { "epoch": 1.8299705282248924, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 1.0904, "step": 1009 }, { "epoch": 1.831784175923827, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.9006, "step": 1010 }, { "epoch": 1.8335978236227612, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 1.0433, "step": 1011 }, { "epoch": 1.8354114713216958, "grad_norm": 0.359375, "learning_rate": 0.0002, "loss": 0.9716, "step": 1012 }, { "epoch": 1.8372251190206303, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 1.205, "step": 1013 }, { "epoch": 1.8390387667195647, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 1.0396, "step": 1014 }, { "epoch": 1.8408524144184992, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.9822, "step": 1015 }, { "epoch": 1.8426660621174338, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.9133, "step": 1016 }, { "epoch": 1.844479709816368, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.092, "step": 1017 }, { "epoch": 1.8462933575153027, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.0417, "step": 1018 }, { "epoch": 1.8481070052142372, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 1.1685, "step": 1019 }, { "epoch": 1.8499206529131715, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.9152, "step": 1020 }, { "epoch": 1.851734300612106, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.9144, "step": 1021 }, { "epoch": 1.8535479483110406, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 1.0166, "step": 1022 }, { "epoch": 1.855361596009975, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 1.2926, "step": 1023 }, { "epoch": 1.8571752437089095, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.9926, "step": 1024 }, { "epoch": 1.858988891407844, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.9213, "step": 1025 }, { "epoch": 1.8608025391067784, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.9663, "step": 1026 }, { "epoch": 1.862616186805713, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 1.039, "step": 1027 }, { "epoch": 1.8644298345046475, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.917, "step": 1028 }, { "epoch": 1.8662434822035818, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 1.062, "step": 1029 }, { "epoch": 1.8680571299025166, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 1.1194, "step": 1030 }, { "epoch": 1.869870777601451, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.9947, "step": 1031 }, { "epoch": 1.8716844253003853, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 1.1242, "step": 1032 }, { "epoch": 1.87349807299932, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.1571, "step": 1033 }, { "epoch": 1.8753117206982544, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 1.0039, "step": 1034 }, { "epoch": 1.8771253683971887, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 1.0585, "step": 1035 }, { "epoch": 1.8789390160961235, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.9928, "step": 1036 }, { "epoch": 1.8807526637950578, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.0909, "step": 1037 }, { "epoch": 1.8825663114939923, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 1.3759, "step": 1038 }, { "epoch": 1.884379959192927, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 1.0947, "step": 1039 }, { "epoch": 1.8861936068918612, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 1.1204, "step": 1040 }, { "epoch": 1.8880072545907958, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 1.38, "step": 1041 }, { "epoch": 1.8898209022897303, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.1763, "step": 1042 }, { "epoch": 1.8916345499886646, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.0582, "step": 1043 }, { "epoch": 1.8934481976875992, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 1.2542, "step": 1044 }, { "epoch": 1.8952618453865338, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 1.2931, "step": 1045 }, { "epoch": 1.897075493085468, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 1.1239, "step": 1046 }, { "epoch": 1.8988891407844026, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.2628, "step": 1047 }, { "epoch": 1.9007027884833372, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 1.1644, "step": 1048 }, { "epoch": 1.9025164361822715, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 1.2183, "step": 1049 }, { "epoch": 1.904330083881206, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 1.1222, "step": 1050 }, { "epoch": 1.9061437315801406, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 1.1521, "step": 1051 }, { "epoch": 1.907957379279075, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.9587, "step": 1052 }, { "epoch": 1.9097710269780095, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.9704, "step": 1053 }, { "epoch": 1.911584674676944, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.9936, "step": 1054 }, { "epoch": 1.9133983223758784, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 1.0674, "step": 1055 }, { "epoch": 1.915211970074813, "grad_norm": 0.353515625, "learning_rate": 0.0002, "loss": 0.9639, "step": 1056 }, { "epoch": 1.9170256177737475, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.8781, "step": 1057 }, { "epoch": 1.9188392654726818, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.9669, "step": 1058 }, { "epoch": 1.9206529131716166, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 1.0363, "step": 1059 }, { "epoch": 1.922466560870551, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.8616, "step": 1060 }, { "epoch": 1.9242802085694852, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 1.0274, "step": 1061 }, { "epoch": 1.92609385626842, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.9495, "step": 1062 }, { "epoch": 1.9279075039673543, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 1.1934, "step": 1063 }, { "epoch": 1.9297211516662887, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.0885, "step": 1064 }, { "epoch": 1.9315347993652234, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 1.0724, "step": 1065 }, { "epoch": 1.9333484470641578, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.9793, "step": 1066 }, { "epoch": 1.9351620947630923, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.0264, "step": 1067 }, { "epoch": 1.9369757424620269, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.9737, "step": 1068 }, { "epoch": 1.9387893901609612, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.9725, "step": 1069 }, { "epoch": 1.9406030378598957, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.9427, "step": 1070 }, { "epoch": 1.9424166855588303, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.9189, "step": 1071 }, { "epoch": 1.9442303332577646, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.8971, "step": 1072 }, { "epoch": 1.9460439809566992, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 1.1025, "step": 1073 }, { "epoch": 1.9478576286556337, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.1456, "step": 1074 }, { "epoch": 1.949671276354568, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.9514, "step": 1075 }, { "epoch": 1.9514849240535026, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.9329, "step": 1076 }, { "epoch": 1.9532985717524372, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 1.1284, "step": 1077 }, { "epoch": 1.9551122194513715, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 1.0883, "step": 1078 }, { "epoch": 1.956925867150306, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.9165, "step": 1079 }, { "epoch": 1.9587395148492406, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.9499, "step": 1080 }, { "epoch": 1.960553162548175, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.2901, "step": 1081 }, { "epoch": 1.9623668102471095, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 1.2063, "step": 1082 }, { "epoch": 1.964180457946044, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 1.0528, "step": 1083 }, { "epoch": 1.9659941056449783, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 1.2891, "step": 1084 }, { "epoch": 1.967807753343913, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.4138, "step": 1085 }, { "epoch": 1.9696214010428474, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.094, "step": 1086 }, { "epoch": 1.9714350487417818, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 1.2854, "step": 1087 }, { "epoch": 1.9732486964407165, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.2548, "step": 1088 }, { "epoch": 1.9750623441396509, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.1538, "step": 1089 }, { "epoch": 1.9768759918385852, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 1.3796, "step": 1090 }, { "epoch": 1.97868963953752, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 1.0574, "step": 1091 }, { "epoch": 1.9805032872364543, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 1.1007, "step": 1092 }, { "epoch": 1.9823169349353889, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 1.3384, "step": 1093 }, { "epoch": 1.9841305826343234, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 1.0607, "step": 1094 }, { "epoch": 1.9859442303332577, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 1.188, "step": 1095 }, { "epoch": 1.9877578780321923, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 1.2736, "step": 1096 }, { "epoch": 1.9895715257311268, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.0757, "step": 1097 }, { "epoch": 1.9913851734300612, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 1.1525, "step": 1098 }, { "epoch": 1.9931988211289957, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 1.1116, "step": 1099 }, { "epoch": 1.9950124688279303, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 1.3349, "step": 1100 }, { "epoch": 1.9968261165268646, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 1.1574, "step": 1101 }, { "epoch": 1.9986397642257991, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.8847, "step": 1102 }, { "epoch": 2.0004534119247337, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 1.0232, "step": 1103 }, { "epoch": 2.002267059623668, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.9252, "step": 1104 }, { "epoch": 2.004080707322603, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.8367, "step": 1105 }, { "epoch": 2.005894355021537, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.8968, "step": 1106 }, { "epoch": 2.0077080027204715, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.6093, "step": 1107 }, { "epoch": 2.0095216504194062, "grad_norm": 0.3359375, "learning_rate": 0.0002, "loss": 0.6806, "step": 1108 }, { "epoch": 2.0113352981183406, "grad_norm": 0.341796875, "learning_rate": 0.0002, "loss": 0.655, "step": 1109 }, { "epoch": 2.013148945817275, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.8284, "step": 1110 }, { "epoch": 2.0149625935162097, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.7124, "step": 1111 }, { "epoch": 2.016776241215144, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.6792, "step": 1112 }, { "epoch": 2.0185898889140783, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.7572, "step": 1113 }, { "epoch": 2.020403536613013, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.8903, "step": 1114 }, { "epoch": 2.0222171843119474, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.7905, "step": 1115 }, { "epoch": 2.0240308320108817, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.6726, "step": 1116 }, { "epoch": 2.0258444797098165, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.8554, "step": 1117 }, { "epoch": 2.027658127408751, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.7315, "step": 1118 }, { "epoch": 2.029471775107685, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.8157, "step": 1119 }, { "epoch": 2.03128542280662, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.7904, "step": 1120 }, { "epoch": 2.0330990705055543, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.7374, "step": 1121 }, { "epoch": 2.0349127182044886, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.6245, "step": 1122 }, { "epoch": 2.0349127182044886, "eval_loss": 1.4156805276870728, "eval_runtime": 186.3245, "eval_samples_per_second": 5.367, "eval_steps_per_second": 5.367, "step": 1122 }, { "epoch": 2.0349127182044886, "mmlu_eval_accuracy": 0.34016491114964403, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.2857142857142857, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.6363636363636364, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.1, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.11538461538461539, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.48333333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.30434782608695654, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.09090909090909091, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.46511627906976744, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.5, "mmlu_eval_accuracy_prehistory": 0.2857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.22941176470588234, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.42105263157894735, "mmlu_loss": 1.983622645841246, "step": 1122 }, { "epoch": 2.0367263659034234, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.6042, "step": 1123 }, { "epoch": 2.0385400136023577, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.7414, "step": 1124 }, { "epoch": 2.040353661301292, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.7539, "step": 1125 }, { "epoch": 2.042167309000227, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.7233, "step": 1126 }, { "epoch": 2.043980956699161, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.6889, "step": 1127 }, { "epoch": 2.0457946043980955, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.8237, "step": 1128 }, { "epoch": 2.0476082520970302, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.7575, "step": 1129 }, { "epoch": 2.0494218997959646, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.7481, "step": 1130 }, { "epoch": 2.0512355474948993, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.7239, "step": 1131 }, { "epoch": 2.0530491951938337, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.816, "step": 1132 }, { "epoch": 2.054862842892768, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.5329, "step": 1133 }, { "epoch": 2.0566764905917028, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.5966, "step": 1134 }, { "epoch": 2.058490138290637, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.7259, "step": 1135 }, { "epoch": 2.0603037859895714, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.7786, "step": 1136 }, { "epoch": 2.062117433688506, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.9427, "step": 1137 }, { "epoch": 2.0639310813874405, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.8629, "step": 1138 }, { "epoch": 2.065744729086375, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.7241, "step": 1139 }, { "epoch": 2.0675583767853096, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.7563, "step": 1140 }, { "epoch": 2.069372024484244, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.8399, "step": 1141 }, { "epoch": 2.0711856721831783, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.6719, "step": 1142 }, { "epoch": 2.072999319882113, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.7751, "step": 1143 }, { "epoch": 2.0748129675810474, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.7446, "step": 1144 }, { "epoch": 2.0766266152799817, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.8919, "step": 1145 }, { "epoch": 2.0784402629789165, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 1.0338, "step": 1146 }, { "epoch": 2.080253910677851, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.7892, "step": 1147 }, { "epoch": 2.082067558376785, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.7774, "step": 1148 }, { "epoch": 2.08388120607572, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.6578, "step": 1149 }, { "epoch": 2.0856948537746542, "grad_norm": 1.5625, "learning_rate": 0.0002, "loss": 0.7487, "step": 1150 }, { "epoch": 2.0875085014735886, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.6472, "step": 1151 }, { "epoch": 2.0893221491725233, "grad_norm": 1.53125, "learning_rate": 0.0002, "loss": 0.6487, "step": 1152 }, { "epoch": 2.0911357968714577, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.8076, "step": 1153 }, { "epoch": 2.092949444570392, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 1.0193, "step": 1154 }, { "epoch": 2.0947630922693268, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.7507, "step": 1155 }, { "epoch": 2.096576739968261, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.7407, "step": 1156 }, { "epoch": 2.0983903876671954, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.8455, "step": 1157 }, { "epoch": 2.10020403536613, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.7653, "step": 1158 }, { "epoch": 2.1020176830650645, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.6976, "step": 1159 }, { "epoch": 2.1038313307639993, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.733, "step": 1160 }, { "epoch": 2.1056449784629336, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.6201, "step": 1161 }, { "epoch": 2.107458626161868, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.6487, "step": 1162 }, { "epoch": 2.1092722738608027, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.7657, "step": 1163 }, { "epoch": 2.111085921559737, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.7751, "step": 1164 }, { "epoch": 2.1128995692586714, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.7139, "step": 1165 }, { "epoch": 2.114713216957606, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.9633, "step": 1166 }, { "epoch": 2.1165268646565405, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.7313, "step": 1167 }, { "epoch": 2.118340512355475, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.6421, "step": 1168 }, { "epoch": 2.1201541600544096, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.7378, "step": 1169 }, { "epoch": 2.121967807753344, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7859, "step": 1170 }, { "epoch": 2.1237814554522783, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.833, "step": 1171 }, { "epoch": 2.125595103151213, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.6635, "step": 1172 }, { "epoch": 2.1274087508501474, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.9965, "step": 1173 }, { "epoch": 2.1292223985490817, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.7302, "step": 1174 }, { "epoch": 2.1310360462480165, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.7755, "step": 1175 }, { "epoch": 2.132849693946951, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.967, "step": 1176 }, { "epoch": 2.134663341645885, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.8846, "step": 1177 }, { "epoch": 2.13647698934482, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.6882, "step": 1178 }, { "epoch": 2.138290637043754, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.8257, "step": 1179 }, { "epoch": 2.1401042847426885, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.7863, "step": 1180 }, { "epoch": 2.1419179324416233, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.6873, "step": 1181 }, { "epoch": 2.1437315801405576, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.8097, "step": 1182 }, { "epoch": 2.145545227839492, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.8079, "step": 1183 }, { "epoch": 2.1473588755384267, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.9057, "step": 1184 }, { "epoch": 2.149172523237361, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.8119, "step": 1185 }, { "epoch": 2.150986170936296, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.7241, "step": 1186 }, { "epoch": 2.15279981863523, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.8518, "step": 1187 }, { "epoch": 2.1546134663341645, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.6611, "step": 1188 }, { "epoch": 2.1564271140330993, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.8582, "step": 1189 }, { "epoch": 2.1582407617320336, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.7817, "step": 1190 }, { "epoch": 2.160054409430968, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.7146, "step": 1191 }, { "epoch": 2.1618680571299027, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.7671, "step": 1192 }, { "epoch": 2.163681704828837, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.6767, "step": 1193 }, { "epoch": 2.1654953525277714, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.829, "step": 1194 }, { "epoch": 2.167309000226706, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.8236, "step": 1195 }, { "epoch": 2.1691226479256405, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.8942, "step": 1196 }, { "epoch": 2.170936295624575, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.7349, "step": 1197 }, { "epoch": 2.1727499433235096, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.9268, "step": 1198 }, { "epoch": 2.174563591022444, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.7323, "step": 1199 }, { "epoch": 2.1763772387213782, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.6531, "step": 1200 }, { "epoch": 2.178190886420313, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.6609, "step": 1201 }, { "epoch": 2.1800045341192473, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.6747, "step": 1202 }, { "epoch": 2.1818181818181817, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.6657, "step": 1203 }, { "epoch": 2.1836318295171164, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.6779, "step": 1204 }, { "epoch": 2.1854454772160508, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.6881, "step": 1205 }, { "epoch": 2.187259124914985, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.8846, "step": 1206 }, { "epoch": 2.18907277261392, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.8727, "step": 1207 }, { "epoch": 2.190886420312854, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.7713, "step": 1208 }, { "epoch": 2.1927000680117885, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.7995, "step": 1209 }, { "epoch": 2.1945137157107233, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.7911, "step": 1210 }, { "epoch": 2.1963273634096576, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.6902, "step": 1211 }, { "epoch": 2.198141011108592, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.9407, "step": 1212 }, { "epoch": 2.1999546588075267, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.8687, "step": 1213 }, { "epoch": 2.201768306506461, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.6513, "step": 1214 }, { "epoch": 2.2035819542053954, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.7447, "step": 1215 }, { "epoch": 2.20539560190433, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.7248, "step": 1216 }, { "epoch": 2.2072092496032645, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.7495, "step": 1217 }, { "epoch": 2.2090228973021993, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.6448, "step": 1218 }, { "epoch": 2.2108365450011336, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.6581, "step": 1219 }, { "epoch": 2.212650192700068, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.6267, "step": 1220 }, { "epoch": 2.2144638403990027, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.6254, "step": 1221 }, { "epoch": 2.216277488097937, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.729, "step": 1222 }, { "epoch": 2.2180911357968713, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.7175, "step": 1223 }, { "epoch": 2.219904783495806, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.7662, "step": 1224 }, { "epoch": 2.2217184311947404, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.7723, "step": 1225 }, { "epoch": 2.2235320788936748, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.5802, "step": 1226 }, { "epoch": 2.2253457265926095, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.6554, "step": 1227 }, { "epoch": 2.227159374291544, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.7236, "step": 1228 }, { "epoch": 2.228973021990478, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.7363, "step": 1229 }, { "epoch": 2.230786669689413, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.6888, "step": 1230 }, { "epoch": 2.2326003173883473, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.9558, "step": 1231 }, { "epoch": 2.2344139650872816, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.8325, "step": 1232 }, { "epoch": 2.2362276127862164, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.7763, "step": 1233 }, { "epoch": 2.2380412604851507, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.7593, "step": 1234 }, { "epoch": 2.239854908184085, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.8299, "step": 1235 }, { "epoch": 2.24166855588302, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.7616, "step": 1236 }, { "epoch": 2.243482203581954, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.7108, "step": 1237 }, { "epoch": 2.2452958512808885, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.7332, "step": 1238 }, { "epoch": 2.2471094989798233, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.6754, "step": 1239 }, { "epoch": 2.2489231466787576, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.8371, "step": 1240 }, { "epoch": 2.2507367943776924, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.7751, "step": 1241 }, { "epoch": 2.2525504420766267, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.8315, "step": 1242 }, { "epoch": 2.254364089775561, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.7467, "step": 1243 }, { "epoch": 2.256177737474496, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.6778, "step": 1244 }, { "epoch": 2.25799138517343, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.8362, "step": 1245 }, { "epoch": 2.2598050328723644, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 1.1029, "step": 1246 }, { "epoch": 2.2616186805712992, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.739, "step": 1247 }, { "epoch": 2.2634323282702336, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.7094, "step": 1248 }, { "epoch": 2.265245975969168, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.7852, "step": 1249 }, { "epoch": 2.2670596236681027, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.6742, "step": 1250 }, { "epoch": 2.268873271367037, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.6942, "step": 1251 }, { "epoch": 2.2706869190659713, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.6605, "step": 1252 }, { "epoch": 2.272500566764906, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.8144, "step": 1253 }, { "epoch": 2.2743142144638404, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.9138, "step": 1254 }, { "epoch": 2.2761278621627747, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.7712, "step": 1255 }, { "epoch": 2.2779415098617095, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.813, "step": 1256 }, { "epoch": 2.279755157560644, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.7306, "step": 1257 }, { "epoch": 2.281568805259578, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.7427, "step": 1258 }, { "epoch": 2.283382452958513, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.7266, "step": 1259 }, { "epoch": 2.2851961006574473, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.786, "step": 1260 }, { "epoch": 2.2870097483563816, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.8043, "step": 1261 }, { "epoch": 2.2888233960553164, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.9279, "step": 1262 }, { "epoch": 2.2906370437542507, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.685, "step": 1263 }, { "epoch": 2.292450691453185, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.7903, "step": 1264 }, { "epoch": 2.29426433915212, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.7476, "step": 1265 }, { "epoch": 2.296077986851054, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.6634, "step": 1266 }, { "epoch": 2.2978916345499885, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.6803, "step": 1267 }, { "epoch": 2.2997052822489232, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.701, "step": 1268 }, { "epoch": 2.3015189299478576, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.6426, "step": 1269 }, { "epoch": 2.303332577646792, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7536, "step": 1270 }, { "epoch": 2.3051462253457267, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.5627, "step": 1271 }, { "epoch": 2.306959873044661, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.5225, "step": 1272 }, { "epoch": 2.3087735207435953, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.7773, "step": 1273 }, { "epoch": 2.31058716844253, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.7058, "step": 1274 }, { "epoch": 2.3124008161414644, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.6455, "step": 1275 }, { "epoch": 2.314214463840399, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.7742, "step": 1276 }, { "epoch": 2.3160281115393335, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.6842, "step": 1277 }, { "epoch": 2.317841759238268, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.8503, "step": 1278 }, { "epoch": 2.3196554069372026, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.6905, "step": 1279 }, { "epoch": 2.321469054636137, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.7503, "step": 1280 }, { "epoch": 2.3232827023350713, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.6377, "step": 1281 }, { "epoch": 2.325096350034006, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.8142, "step": 1282 }, { "epoch": 2.3269099977329404, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.8101, "step": 1283 }, { "epoch": 2.3287236454318747, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.6955, "step": 1284 }, { "epoch": 2.3305372931308095, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.8859, "step": 1285 }, { "epoch": 2.332350940829744, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.7873, "step": 1286 }, { "epoch": 2.334164588528678, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.7453, "step": 1287 }, { "epoch": 2.335978236227613, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.7637, "step": 1288 }, { "epoch": 2.3377918839265472, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.7625, "step": 1289 }, { "epoch": 2.3396055316254816, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.8722, "step": 1290 }, { "epoch": 2.3414191793244163, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.8554, "step": 1291 }, { "epoch": 2.3432328270233507, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.6556, "step": 1292 }, { "epoch": 2.3450464747222854, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.8853, "step": 1293 }, { "epoch": 2.3468601224212198, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.7814, "step": 1294 }, { "epoch": 2.348673770120154, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.6787, "step": 1295 }, { "epoch": 2.350487417819089, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.891, "step": 1296 }, { "epoch": 2.352301065518023, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.9314, "step": 1297 }, { "epoch": 2.3541147132169575, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.7153, "step": 1298 }, { "epoch": 2.3559283609158923, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.6697, "step": 1299 }, { "epoch": 2.3577420086148266, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.8425, "step": 1300 }, { "epoch": 2.359555656313761, "grad_norm": 1.4140625, "learning_rate": 0.0002, "loss": 0.7561, "step": 1301 }, { "epoch": 2.3613693040126957, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.6737, "step": 1302 }, { "epoch": 2.36318295171163, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.7612, "step": 1303 }, { "epoch": 2.3649965994105644, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 1.0148, "step": 1304 }, { "epoch": 2.366810247109499, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.8529, "step": 1305 }, { "epoch": 2.3686238948084335, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.7331, "step": 1306 }, { "epoch": 2.370437542507368, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.8337, "step": 1307 }, { "epoch": 2.3722511902063026, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.6065, "step": 1308 }, { "epoch": 2.374064837905237, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.808, "step": 1309 }, { "epoch": 2.374064837905237, "eval_loss": 1.4322690963745117, "eval_runtime": 185.6753, "eval_samples_per_second": 5.386, "eval_steps_per_second": 5.386, "step": 1309 }, { "epoch": 2.374064837905237, "mmlu_eval_accuracy": 0.3048925814533493, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.2857142857142857, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.045454545454545456, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.15384615384615385, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.17073170731707318, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.1, "mmlu_eval_accuracy_high_school_biology": 0.5, "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.15384615384615385, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.48333333333333334, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.21739130434782608, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.30303030303030304, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.22857142857142856, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.2529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 1.368047034970146, "step": 1309 }, { "epoch": 2.3758784856041713, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.6671, "step": 1310 }, { "epoch": 2.377692133303106, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.7192, "step": 1311 }, { "epoch": 2.3795057810020404, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.6541, "step": 1312 }, { "epoch": 2.3813194287009747, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.9003, "step": 1313 }, { "epoch": 2.3831330763999095, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.9077, "step": 1314 }, { "epoch": 2.384946724098844, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.7373, "step": 1315 }, { "epoch": 2.386760371797778, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.8176, "step": 1316 }, { "epoch": 2.388574019496713, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.7814, "step": 1317 }, { "epoch": 2.390387667195647, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.5939, "step": 1318 }, { "epoch": 2.3922013148945815, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.6761, "step": 1319 }, { "epoch": 2.3940149625935163, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.6437, "step": 1320 }, { "epoch": 2.3958286102924506, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.7211, "step": 1321 }, { "epoch": 2.397642257991385, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.6734, "step": 1322 }, { "epoch": 2.3994559056903197, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.7187, "step": 1323 }, { "epoch": 2.401269553389254, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.6911, "step": 1324 }, { "epoch": 2.4030832010881884, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.6135, "step": 1325 }, { "epoch": 2.404896848787123, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.6557, "step": 1326 }, { "epoch": 2.4067104964860575, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.6279, "step": 1327 }, { "epoch": 2.408524144184992, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.671, "step": 1328 }, { "epoch": 2.4103377918839266, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.7283, "step": 1329 }, { "epoch": 2.412151439582861, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.8353, "step": 1330 }, { "epoch": 2.4139650872817953, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.7321, "step": 1331 }, { "epoch": 2.41577873498073, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.8127, "step": 1332 }, { "epoch": 2.4175923826796644, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.6468, "step": 1333 }, { "epoch": 2.419406030378599, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.6013, "step": 1334 }, { "epoch": 2.4212196780775335, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.6567, "step": 1335 }, { "epoch": 2.423033325776468, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.88, "step": 1336 }, { "epoch": 2.4248469734754026, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.6784, "step": 1337 }, { "epoch": 2.426660621174337, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.719, "step": 1338 }, { "epoch": 2.4284742688732712, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.6861, "step": 1339 }, { "epoch": 2.430287916572206, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.7835, "step": 1340 }, { "epoch": 2.4321015642711403, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.9276, "step": 1341 }, { "epoch": 2.4339152119700747, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.94, "step": 1342 }, { "epoch": 2.4357288596690094, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.8813, "step": 1343 }, { "epoch": 2.4375425073679438, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.7477, "step": 1344 }, { "epoch": 2.439356155066878, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.863, "step": 1345 }, { "epoch": 2.441169802765813, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.7105, "step": 1346 }, { "epoch": 2.442983450464747, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.7327, "step": 1347 }, { "epoch": 2.4447970981636815, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.7593, "step": 1348 }, { "epoch": 2.4466107458626163, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.7667, "step": 1349 }, { "epoch": 2.4484243935615506, "grad_norm": 1.3046875, "learning_rate": 0.0002, "loss": 0.7585, "step": 1350 }, { "epoch": 2.4502380412604854, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.6608, "step": 1351 }, { "epoch": 2.4520516889594197, "grad_norm": 1.4921875, "learning_rate": 0.0002, "loss": 0.7367, "step": 1352 }, { "epoch": 2.453865336658354, "grad_norm": 2.546875, "learning_rate": 0.0002, "loss": 0.7805, "step": 1353 }, { "epoch": 2.455678984357289, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.8664, "step": 1354 }, { "epoch": 2.457492632056223, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.9025, "step": 1355 }, { "epoch": 2.4593062797551575, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.7757, "step": 1356 }, { "epoch": 2.4611199274540922, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.7691, "step": 1357 }, { "epoch": 2.4629335751530266, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.8602, "step": 1358 }, { "epoch": 2.464747222851961, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.8575, "step": 1359 }, { "epoch": 2.4665608705508957, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.7623, "step": 1360 }, { "epoch": 2.46837451824983, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.8785, "step": 1361 }, { "epoch": 2.4701881659487643, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.7959, "step": 1362 }, { "epoch": 2.472001813647699, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.6367, "step": 1363 }, { "epoch": 2.4738154613466334, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.8154, "step": 1364 }, { "epoch": 2.4756291090455678, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.9087, "step": 1365 }, { "epoch": 2.4774427567445025, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.7169, "step": 1366 }, { "epoch": 2.479256404443437, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.6061, "step": 1367 }, { "epoch": 2.481070052142371, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.7614, "step": 1368 }, { "epoch": 2.482883699841306, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.8085, "step": 1369 }, { "epoch": 2.4846973475402403, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.5876, "step": 1370 }, { "epoch": 2.4865109952391746, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.7097, "step": 1371 }, { "epoch": 2.4883246429381094, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.7023, "step": 1372 }, { "epoch": 2.4901382906370437, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.53, "step": 1373 }, { "epoch": 2.491951938335978, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.7661, "step": 1374 }, { "epoch": 2.493765586034913, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.7228, "step": 1375 }, { "epoch": 2.495579233733847, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.6155, "step": 1376 }, { "epoch": 2.4973928814327815, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.5835, "step": 1377 }, { "epoch": 2.4992065291317163, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.7788, "step": 1378 }, { "epoch": 2.5010201768306506, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.7256, "step": 1379 }, { "epoch": 2.502833824529585, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.6738, "step": 1380 }, { "epoch": 2.5046474722285197, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.8898, "step": 1381 }, { "epoch": 2.506461119927454, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.877, "step": 1382 }, { "epoch": 2.5082747676263883, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.8086, "step": 1383 }, { "epoch": 2.510088415325323, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.9278, "step": 1384 }, { "epoch": 2.5119020630242574, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 1.1486, "step": 1385 }, { "epoch": 2.5137157107231918, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.9584, "step": 1386 }, { "epoch": 2.5155293584221265, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.7387, "step": 1387 }, { "epoch": 2.517343006121061, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.7446, "step": 1388 }, { "epoch": 2.519156653819995, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.6883, "step": 1389 }, { "epoch": 2.52097030151893, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.7939, "step": 1390 }, { "epoch": 2.5227839492178643, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.7188, "step": 1391 }, { "epoch": 2.5245975969167986, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.8039, "step": 1392 }, { "epoch": 2.5264112446157334, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.7837, "step": 1393 }, { "epoch": 2.5282248923146677, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.9178, "step": 1394 }, { "epoch": 2.5300385400136025, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.8081, "step": 1395 }, { "epoch": 2.531852187712537, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 1.0493, "step": 1396 }, { "epoch": 2.533665835411471, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.8363, "step": 1397 }, { "epoch": 2.535479483110406, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.7728, "step": 1398 }, { "epoch": 2.5372931308093403, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.8389, "step": 1399 }, { "epoch": 2.539106778508275, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.7281, "step": 1400 }, { "epoch": 2.5409204262072094, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.6656, "step": 1401 }, { "epoch": 2.5427340739061437, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.7143, "step": 1402 }, { "epoch": 2.5445477216050785, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.7592, "step": 1403 }, { "epoch": 2.546361369304013, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.8537, "step": 1404 }, { "epoch": 2.548175017002947, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.9253, "step": 1405 }, { "epoch": 2.549988664701882, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6732, "step": 1406 }, { "epoch": 2.5518023124008162, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.7883, "step": 1407 }, { "epoch": 2.5536159600997506, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.6605, "step": 1408 }, { "epoch": 2.5554296077986853, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.8294, "step": 1409 }, { "epoch": 2.5572432554976197, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.7053, "step": 1410 }, { "epoch": 2.559056903196554, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.7854, "step": 1411 }, { "epoch": 2.5608705508954888, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.6559, "step": 1412 }, { "epoch": 2.562684198594423, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.7825, "step": 1413 }, { "epoch": 2.5644978462933574, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.7533, "step": 1414 }, { "epoch": 2.566311493992292, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.7064, "step": 1415 }, { "epoch": 2.5681251416912265, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.7083, "step": 1416 }, { "epoch": 2.569938789390161, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.7584, "step": 1417 }, { "epoch": 2.5717524370890956, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7642, "step": 1418 }, { "epoch": 2.57356608478803, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.6215, "step": 1419 }, { "epoch": 2.5753797324869643, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.7491, "step": 1420 }, { "epoch": 2.577193380185899, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.5511, "step": 1421 }, { "epoch": 2.5790070278848334, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.7933, "step": 1422 }, { "epoch": 2.5808206755837677, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.994, "step": 1423 }, { "epoch": 2.5826343232827025, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.7887, "step": 1424 }, { "epoch": 2.584447970981637, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.7297, "step": 1425 }, { "epoch": 2.586261618680571, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.758, "step": 1426 }, { "epoch": 2.588075266379506, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.7274, "step": 1427 }, { "epoch": 2.5898889140784402, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.676, "step": 1428 }, { "epoch": 2.5917025617773746, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.5893, "step": 1429 }, { "epoch": 2.5935162094763093, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.8747, "step": 1430 }, { "epoch": 2.5953298571752437, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 1.0213, "step": 1431 }, { "epoch": 2.597143504874178, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.883, "step": 1432 }, { "epoch": 2.5989571525731128, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.8502, "step": 1433 }, { "epoch": 2.600770800272047, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.7192, "step": 1434 }, { "epoch": 2.6025844479709814, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.6886, "step": 1435 }, { "epoch": 2.604398095669916, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.644, "step": 1436 }, { "epoch": 2.6062117433688505, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.7452, "step": 1437 }, { "epoch": 2.608025391067785, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.7404, "step": 1438 }, { "epoch": 2.6098390387667196, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.7534, "step": 1439 }, { "epoch": 2.611652686465654, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.7091, "step": 1440 }, { "epoch": 2.6134663341645883, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.8488, "step": 1441 }, { "epoch": 2.615279981863523, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.781, "step": 1442 }, { "epoch": 2.6170936295624574, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.8016, "step": 1443 }, { "epoch": 2.6189072772613917, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.7065, "step": 1444 }, { "epoch": 2.6207209249603265, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.9411, "step": 1445 }, { "epoch": 2.622534572659261, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.8014, "step": 1446 }, { "epoch": 2.624348220358195, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.8403, "step": 1447 }, { "epoch": 2.62616186805713, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.9253, "step": 1448 }, { "epoch": 2.6279755157560643, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.7423, "step": 1449 }, { "epoch": 2.629789163454999, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.7751, "step": 1450 }, { "epoch": 2.6316028111539334, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.6493, "step": 1451 }, { "epoch": 2.6334164588528677, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.7168, "step": 1452 }, { "epoch": 2.6352301065518025, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.6675, "step": 1453 }, { "epoch": 2.637043754250737, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.8853, "step": 1454 }, { "epoch": 2.638857401949671, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.6991, "step": 1455 }, { "epoch": 2.640671049648606, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.9509, "step": 1456 }, { "epoch": 2.64248469734754, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.7185, "step": 1457 }, { "epoch": 2.644298345046475, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.8694, "step": 1458 }, { "epoch": 2.6461119927454093, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7356, "step": 1459 }, { "epoch": 2.6479256404443436, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.6547, "step": 1460 }, { "epoch": 2.6497392881432784, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.7023, "step": 1461 }, { "epoch": 2.6515529358422127, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.6661, "step": 1462 }, { "epoch": 2.653366583541147, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.8523, "step": 1463 }, { "epoch": 2.655180231240082, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.7061, "step": 1464 }, { "epoch": 2.656993878939016, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.8657, "step": 1465 }, { "epoch": 2.6588075266379505, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.7878, "step": 1466 }, { "epoch": 2.6606211743368853, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.7661, "step": 1467 }, { "epoch": 2.6624348220358196, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.6593, "step": 1468 }, { "epoch": 2.664248469734754, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.6774, "step": 1469 }, { "epoch": 2.6660621174336887, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.5882, "step": 1470 }, { "epoch": 2.667875765132623, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.7373, "step": 1471 }, { "epoch": 2.6696894128315574, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.8063, "step": 1472 }, { "epoch": 2.671503060530492, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.8606, "step": 1473 }, { "epoch": 2.6733167082294265, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7212, "step": 1474 }, { "epoch": 2.675130355928361, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.6764, "step": 1475 }, { "epoch": 2.6769440036272956, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.7447, "step": 1476 }, { "epoch": 2.67875765132623, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.8477, "step": 1477 }, { "epoch": 2.6805712990251642, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.7016, "step": 1478 }, { "epoch": 2.682384946724099, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.6619, "step": 1479 }, { "epoch": 2.6841985944230333, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.7196, "step": 1480 }, { "epoch": 2.6860122421219677, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.7806, "step": 1481 }, { "epoch": 2.6878258898209024, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.7405, "step": 1482 }, { "epoch": 2.6896395375198368, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.9716, "step": 1483 }, { "epoch": 2.691453185218771, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 1.0984, "step": 1484 }, { "epoch": 2.693266832917706, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.8496, "step": 1485 }, { "epoch": 2.69508048061664, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.7213, "step": 1486 }, { "epoch": 2.6968941283155745, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.8827, "step": 1487 }, { "epoch": 2.6987077760145093, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.6888, "step": 1488 }, { "epoch": 2.7005214237134436, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.8175, "step": 1489 }, { "epoch": 2.702335071412378, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.6942, "step": 1490 }, { "epoch": 2.7041487191113127, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.8116, "step": 1491 }, { "epoch": 2.705962366810247, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.8, "step": 1492 }, { "epoch": 2.7077760145091814, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.8114, "step": 1493 }, { "epoch": 2.709589662208116, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.7998, "step": 1494 }, { "epoch": 2.7114033099070505, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.66, "step": 1495 }, { "epoch": 2.713216957605985, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.7045, "step": 1496 }, { "epoch": 2.713216957605985, "eval_loss": 1.5096887350082397, "eval_runtime": 185.6701, "eval_samples_per_second": 5.386, "eval_steps_per_second": 5.386, "step": 1496 }, { "epoch": 2.713216957605985, "mmlu_eval_accuracy": 0.3229999266952241, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.6428571428571429, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.15384615384615385, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.2857142857142857, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.5, "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.3333333333333333, "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793, "mmlu_eval_accuracy_high_school_microeconomics": 0.19230769230769232, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.45, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.47368421052631576, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.2571428571428571, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 1.5884114981164814, "step": 1496 }, { "epoch": 2.7150306053049196, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.8011, "step": 1497 }, { "epoch": 2.716844253003854, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.9656, "step": 1498 }, { "epoch": 2.7186579007027882, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.7153, "step": 1499 }, { "epoch": 2.720471548401723, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.6009, "step": 1500 }, { "epoch": 2.7222851961006573, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.7672, "step": 1501 }, { "epoch": 2.7240988437995917, "grad_norm": 2.015625, "learning_rate": 0.0002, "loss": 0.7252, "step": 1502 }, { "epoch": 2.7259124914985264, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.7359, "step": 1503 }, { "epoch": 2.7277261391974608, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.9036, "step": 1504 }, { "epoch": 2.7295397868963955, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.8649, "step": 1505 }, { "epoch": 2.73135343459533, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.7268, "step": 1506 }, { "epoch": 2.733167082294264, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.7853, "step": 1507 }, { "epoch": 2.734980729993199, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.8551, "step": 1508 }, { "epoch": 2.7367943776921333, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.6664, "step": 1509 }, { "epoch": 2.7386080253910676, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.9457, "step": 1510 }, { "epoch": 2.7404216730900024, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.7911, "step": 1511 }, { "epoch": 2.7422353207889367, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.764, "step": 1512 }, { "epoch": 2.7440489684878715, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.756, "step": 1513 }, { "epoch": 2.745862616186806, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.965, "step": 1514 }, { "epoch": 2.74767626388574, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.6918, "step": 1515 }, { "epoch": 2.749489911584675, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.8003, "step": 1516 }, { "epoch": 2.7513035592836093, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.8251, "step": 1517 }, { "epoch": 2.7531172069825436, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.6798, "step": 1518 }, { "epoch": 2.7549308546814784, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.8147, "step": 1519 }, { "epoch": 2.7567445023804127, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.7718, "step": 1520 }, { "epoch": 2.758558150079347, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.5667, "step": 1521 }, { "epoch": 2.760371797778282, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.6512, "step": 1522 }, { "epoch": 2.762185445477216, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.7768, "step": 1523 }, { "epoch": 2.7639990931761504, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.7229, "step": 1524 }, { "epoch": 2.765812740875085, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.7096, "step": 1525 }, { "epoch": 2.7676263885740195, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.7946, "step": 1526 }, { "epoch": 2.769440036272954, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.8046, "step": 1527 }, { "epoch": 2.7712536839718886, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.7106, "step": 1528 }, { "epoch": 2.773067331670823, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.6599, "step": 1529 }, { "epoch": 2.7748809793697573, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.786, "step": 1530 }, { "epoch": 2.776694627068692, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.7343, "step": 1531 }, { "epoch": 2.7785082747676264, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.5622, "step": 1532 }, { "epoch": 2.7803219224665607, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.8755, "step": 1533 }, { "epoch": 2.7821355701654955, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.7946, "step": 1534 }, { "epoch": 2.78394921786443, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.8326, "step": 1535 }, { "epoch": 2.785762865563364, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.7726, "step": 1536 }, { "epoch": 2.787576513262299, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.9868, "step": 1537 }, { "epoch": 2.7893901609612333, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.7599, "step": 1538 }, { "epoch": 2.7912038086601676, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.7565, "step": 1539 }, { "epoch": 2.7930174563591024, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.7182, "step": 1540 }, { "epoch": 2.7948311040580367, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.9035, "step": 1541 }, { "epoch": 2.796644751756971, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.835, "step": 1542 }, { "epoch": 2.798458399455906, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.9128, "step": 1543 }, { "epoch": 2.80027204715484, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.8621, "step": 1544 }, { "epoch": 2.8020856948537745, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.7654, "step": 1545 }, { "epoch": 2.8038993425527092, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.7246, "step": 1546 }, { "epoch": 2.8057129902516436, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.8056, "step": 1547 }, { "epoch": 2.807526637950578, "grad_norm": 1.703125, "learning_rate": 0.0002, "loss": 0.7688, "step": 1548 }, { "epoch": 2.8093402856495127, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.643, "step": 1549 }, { "epoch": 2.811153933348447, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.6528, "step": 1550 }, { "epoch": 2.8129675810473813, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.6862, "step": 1551 }, { "epoch": 2.814781228746316, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.7092, "step": 1552 }, { "epoch": 2.8165948764452504, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.857, "step": 1553 }, { "epoch": 2.8184085241441847, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 1.0095, "step": 1554 }, { "epoch": 2.8202221718431195, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.8687, "step": 1555 }, { "epoch": 2.822035819542054, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.9661, "step": 1556 }, { "epoch": 2.823849467240988, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 1.0035, "step": 1557 }, { "epoch": 2.825663114939923, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.7098, "step": 1558 }, { "epoch": 2.8274767626388573, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7902, "step": 1559 }, { "epoch": 2.829290410337792, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.6299, "step": 1560 }, { "epoch": 2.8311040580367264, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.7999, "step": 1561 }, { "epoch": 2.8329177057356607, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.7236, "step": 1562 }, { "epoch": 2.8347313534345955, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.7391, "step": 1563 }, { "epoch": 2.83654500113353, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.7182, "step": 1564 }, { "epoch": 2.838358648832464, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.7748, "step": 1565 }, { "epoch": 2.840172296531399, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.5909, "step": 1566 }, { "epoch": 2.8419859442303332, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.766, "step": 1567 }, { "epoch": 2.843799591929268, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.6714, "step": 1568 }, { "epoch": 2.8456132396282023, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.6843, "step": 1569 }, { "epoch": 2.8474268873271367, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.8587, "step": 1570 }, { "epoch": 2.8492405350260714, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.7329, "step": 1571 }, { "epoch": 2.8510541827250058, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.9186, "step": 1572 }, { "epoch": 2.85286783042394, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.7435, "step": 1573 }, { "epoch": 2.854681478122875, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.7159, "step": 1574 }, { "epoch": 2.856495125821809, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.8404, "step": 1575 }, { "epoch": 2.8583087735207435, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.9556, "step": 1576 }, { "epoch": 2.8601224212196783, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.6242, "step": 1577 }, { "epoch": 2.8619360689186126, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6878, "step": 1578 }, { "epoch": 2.863749716617547, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.7249, "step": 1579 }, { "epoch": 2.8655633643164817, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.7595, "step": 1580 }, { "epoch": 2.867377012015416, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.6473, "step": 1581 }, { "epoch": 2.8691906597143504, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.8204, "step": 1582 }, { "epoch": 2.871004307413285, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.7689, "step": 1583 }, { "epoch": 2.8728179551122195, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.8689, "step": 1584 }, { "epoch": 2.874631602811154, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.6645, "step": 1585 }, { "epoch": 2.8764452505100886, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.8073, "step": 1586 }, { "epoch": 2.878258898209023, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.6955, "step": 1587 }, { "epoch": 2.8800725459079572, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.7682, "step": 1588 }, { "epoch": 2.881886193606892, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.9319, "step": 1589 }, { "epoch": 2.8836998413058264, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.7774, "step": 1590 }, { "epoch": 2.8855134890047607, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.7948, "step": 1591 }, { "epoch": 2.8873271367036955, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.7482, "step": 1592 }, { "epoch": 2.88914078440263, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.8381, "step": 1593 }, { "epoch": 2.890954432101564, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.8624, "step": 1594 }, { "epoch": 2.892768079800499, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.8111, "step": 1595 }, { "epoch": 2.894581727499433, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.8507, "step": 1596 }, { "epoch": 2.8963953751983675, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.7685, "step": 1597 }, { "epoch": 2.8982090228973023, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.7802, "step": 1598 }, { "epoch": 2.9000226705962366, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.902, "step": 1599 }, { "epoch": 2.901836318295171, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.7469, "step": 1600 }, { "epoch": 2.9036499659941057, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.9198, "step": 1601 }, { "epoch": 2.90546361369304, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.7258, "step": 1602 }, { "epoch": 2.9072772613919744, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.7238, "step": 1603 }, { "epoch": 2.909090909090909, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.8645, "step": 1604 }, { "epoch": 2.9109045567898435, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.763, "step": 1605 }, { "epoch": 2.912718204488778, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.7634, "step": 1606 }, { "epoch": 2.9145318521877126, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.7899, "step": 1607 }, { "epoch": 2.916345499886647, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.8022, "step": 1608 }, { "epoch": 2.9181591475855813, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.7029, "step": 1609 }, { "epoch": 2.919972795284516, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.7821, "step": 1610 }, { "epoch": 2.9217864429834504, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.7179, "step": 1611 }, { "epoch": 2.9236000906823847, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.7488, "step": 1612 }, { "epoch": 2.9254137383813195, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.9018, "step": 1613 }, { "epoch": 2.927227386080254, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.8234, "step": 1614 }, { "epoch": 2.929041033779188, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.6498, "step": 1615 }, { "epoch": 2.930854681478123, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.8833, "step": 1616 }, { "epoch": 2.932668329177057, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.6604, "step": 1617 }, { "epoch": 2.934481976875992, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.8496, "step": 1618 }, { "epoch": 2.9362956245749263, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.6432, "step": 1619 }, { "epoch": 2.9381092722738607, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.844, "step": 1620 }, { "epoch": 2.9399229199727954, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.7477, "step": 1621 }, { "epoch": 2.9417365676717298, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.8115, "step": 1622 }, { "epoch": 2.9435502153706645, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.8432, "step": 1623 }, { "epoch": 2.945363863069599, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6519, "step": 1624 }, { "epoch": 2.947177510768533, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.7142, "step": 1625 }, { "epoch": 2.948991158467468, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.6704, "step": 1626 }, { "epoch": 2.9508048061664023, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.8109, "step": 1627 }, { "epoch": 2.9526184538653366, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.5757, "step": 1628 }, { "epoch": 2.9544321015642714, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.6925, "step": 1629 }, { "epoch": 2.9562457492632057, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 1.0056, "step": 1630 }, { "epoch": 2.95805939696214, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.8941, "step": 1631 }, { "epoch": 2.959873044661075, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.8279, "step": 1632 }, { "epoch": 2.961686692360009, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.6158, "step": 1633 }, { "epoch": 2.9635003400589435, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.7789, "step": 1634 }, { "epoch": 2.9653139877578782, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.8719, "step": 1635 }, { "epoch": 2.9671276354568126, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.8926, "step": 1636 }, { "epoch": 2.968941283155747, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.8645, "step": 1637 }, { "epoch": 2.9707549308546817, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.7009, "step": 1638 }, { "epoch": 2.972568578553616, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.9466, "step": 1639 }, { "epoch": 2.9743822262525503, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.9983, "step": 1640 }, { "epoch": 2.976195873951485, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.8655, "step": 1641 }, { "epoch": 2.9780095216504194, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.7755, "step": 1642 }, { "epoch": 2.9798231693493538, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.7042, "step": 1643 }, { "epoch": 2.9816368170482885, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.8664, "step": 1644 }, { "epoch": 2.983450464747223, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.6967, "step": 1645 }, { "epoch": 2.985264112446157, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 1.0135, "step": 1646 }, { "epoch": 2.987077760145092, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.7669, "step": 1647 }, { "epoch": 2.9888914078440263, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.812, "step": 1648 }, { "epoch": 2.9907050555429606, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.7584, "step": 1649 }, { "epoch": 2.9925187032418954, "grad_norm": 1.453125, "learning_rate": 0.0002, "loss": 0.8122, "step": 1650 }, { "epoch": 2.9943323509408297, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.8925, "step": 1651 }, { "epoch": 2.996145998639764, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.8048, "step": 1652 }, { "epoch": 2.997959646338699, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.7362, "step": 1653 }, { "epoch": 2.999773294037633, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.8726, "step": 1654 }, { "epoch": 3.0015869417365675, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.6781, "step": 1655 }, { "epoch": 3.0034005894355023, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.5064, "step": 1656 }, { "epoch": 3.0052142371344366, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.4911, "step": 1657 }, { "epoch": 3.007027884833371, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.5256, "step": 1658 }, { "epoch": 3.0088415325323057, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.555, "step": 1659 }, { "epoch": 3.01065518023124, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.4604, "step": 1660 }, { "epoch": 3.0124688279301743, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.5095, "step": 1661 }, { "epoch": 3.014282475629109, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.5571, "step": 1662 }, { "epoch": 3.0160961233280434, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.4904, "step": 1663 }, { "epoch": 3.017909771026978, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.5121, "step": 1664 }, { "epoch": 3.0197234187259125, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.5369, "step": 1665 }, { "epoch": 3.021537066424847, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.5187, "step": 1666 }, { "epoch": 3.0233507141237816, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.4612, "step": 1667 }, { "epoch": 3.025164361822716, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.5179, "step": 1668 }, { "epoch": 3.0269780095216503, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.481, "step": 1669 }, { "epoch": 3.028791657220585, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.4055, "step": 1670 }, { "epoch": 3.0306053049195194, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.4574, "step": 1671 }, { "epoch": 3.0324189526184537, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.5085, "step": 1672 }, { "epoch": 3.0342326003173885, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.484, "step": 1673 }, { "epoch": 3.036046248016323, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.4714, "step": 1674 }, { "epoch": 3.037859895715257, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.4605, "step": 1675 }, { "epoch": 3.039673543414192, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4788, "step": 1676 }, { "epoch": 3.0414871911131263, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.3927, "step": 1677 }, { "epoch": 3.0433008388120606, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.5375, "step": 1678 }, { "epoch": 3.0451144865109954, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.5196, "step": 1679 }, { "epoch": 3.0469281342099297, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.4974, "step": 1680 }, { "epoch": 3.048741781908864, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.4041, "step": 1681 }, { "epoch": 3.050555429607799, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.5237, "step": 1682 }, { "epoch": 3.052369077306733, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.4423, "step": 1683 }, { "epoch": 3.052369077306733, "eval_loss": 1.6008633375167847, "eval_runtime": 186.0575, "eval_samples_per_second": 5.375, "eval_steps_per_second": 5.375, "step": 1683 }, { "epoch": 3.052369077306733, "mmlu_eval_accuracy": 0.3352774521766115, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.5, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.42857142857142855, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.43333333333333335, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.30434782608695654, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.5, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.6363636363636364, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.6363636363636364, "mmlu_eval_accuracy_miscellaneous": 0.43023255813953487, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.42424242424242425, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.24705882352941178, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5909090909090909, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 1.7692674889352724, "step": 1683 }, { "epoch": 3.0541827250056675, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.3699, "step": 1684 }, { "epoch": 3.0559963727046022, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.4646, "step": 1685 }, { "epoch": 3.0578100204035366, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.5002, "step": 1686 }, { "epoch": 3.059623668102471, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.4837, "step": 1687 }, { "epoch": 3.0614373158014057, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.4933, "step": 1688 }, { "epoch": 3.06325096350034, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.532, "step": 1689 }, { "epoch": 3.0650646111992748, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.5149, "step": 1690 }, { "epoch": 3.066878258898209, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.3592, "step": 1691 }, { "epoch": 3.0686919065971434, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.5222, "step": 1692 }, { "epoch": 3.070505554296078, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.4753, "step": 1693 }, { "epoch": 3.0723192019950125, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.4891, "step": 1694 }, { "epoch": 3.074132849693947, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.3951, "step": 1695 }, { "epoch": 3.0759464973928816, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.4321, "step": 1696 }, { "epoch": 3.077760145091816, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.433, "step": 1697 }, { "epoch": 3.0795737927907503, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.4954, "step": 1698 }, { "epoch": 3.081387440489685, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.6306, "step": 1699 }, { "epoch": 3.0832010881886194, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.3782, "step": 1700 }, { "epoch": 3.0850147358875537, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.4723, "step": 1701 }, { "epoch": 3.0868283835864885, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.4607, "step": 1702 }, { "epoch": 3.088642031285423, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.4662, "step": 1703 }, { "epoch": 3.090455678984357, "grad_norm": 1.34375, "learning_rate": 0.0002, "loss": 0.4884, "step": 1704 }, { "epoch": 3.092269326683292, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.7024, "step": 1705 }, { "epoch": 3.0940829743822262, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.6416, "step": 1706 }, { "epoch": 3.0958966220811606, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.5901, "step": 1707 }, { "epoch": 3.0977102697800953, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.4678, "step": 1708 }, { "epoch": 3.0995239174790297, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.5016, "step": 1709 }, { "epoch": 3.101337565177964, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.4354, "step": 1710 }, { "epoch": 3.1031512128768988, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.4269, "step": 1711 }, { "epoch": 3.104964860575833, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.4888, "step": 1712 }, { "epoch": 3.1067785082747674, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.5981, "step": 1713 }, { "epoch": 3.108592155973702, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.5245, "step": 1714 }, { "epoch": 3.1104058036726365, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.4337, "step": 1715 }, { "epoch": 3.112219451371571, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.5152, "step": 1716 }, { "epoch": 3.1140330990705056, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.5545, "step": 1717 }, { "epoch": 3.11584674676944, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.4646, "step": 1718 }, { "epoch": 3.1176603944683747, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.4206, "step": 1719 }, { "epoch": 3.119474042167309, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6276, "step": 1720 }, { "epoch": 3.1212876898662434, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.4795, "step": 1721 }, { "epoch": 3.123101337565178, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.4178, "step": 1722 }, { "epoch": 3.1249149852641125, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.5487, "step": 1723 }, { "epoch": 3.126728632963047, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.5457, "step": 1724 }, { "epoch": 3.1285422806619816, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.4152, "step": 1725 }, { "epoch": 3.130355928360916, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.4472, "step": 1726 }, { "epoch": 3.1321695760598502, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.3621, "step": 1727 }, { "epoch": 3.133983223758785, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.3981, "step": 1728 }, { "epoch": 3.1357968714577193, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.4365, "step": 1729 }, { "epoch": 3.1376105191566537, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.4719, "step": 1730 }, { "epoch": 3.1394241668555884, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.4722, "step": 1731 }, { "epoch": 3.141237814554523, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.537, "step": 1732 }, { "epoch": 3.143051462253457, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.3892, "step": 1733 }, { "epoch": 3.144865109952392, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.4311, "step": 1734 }, { "epoch": 3.146678757651326, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.472, "step": 1735 }, { "epoch": 3.1484924053502605, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.5701, "step": 1736 }, { "epoch": 3.1503060530491953, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.5156, "step": 1737 }, { "epoch": 3.1521197007481296, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.5554, "step": 1738 }, { "epoch": 3.153933348447064, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.5052, "step": 1739 }, { "epoch": 3.1557469961459987, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.3593, "step": 1740 }, { "epoch": 3.157560643844933, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.518, "step": 1741 }, { "epoch": 3.1593742915438674, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.5203, "step": 1742 }, { "epoch": 3.161187939242802, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.5742, "step": 1743 }, { "epoch": 3.1630015869417365, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.5114, "step": 1744 }, { "epoch": 3.1648152346406713, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.5045, "step": 1745 }, { "epoch": 3.1666288823396056, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.444, "step": 1746 }, { "epoch": 3.16844253003854, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.4844, "step": 1747 }, { "epoch": 3.1702561777374747, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.4894, "step": 1748 }, { "epoch": 3.172069825436409, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.3708, "step": 1749 }, { "epoch": 3.1738834731353434, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.4297, "step": 1750 }, { "epoch": 3.175697120834278, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.4897, "step": 1751 }, { "epoch": 3.1775107685332125, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.437, "step": 1752 }, { "epoch": 3.179324416232147, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.4832, "step": 1753 }, { "epoch": 3.1811380639310816, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.4484, "step": 1754 }, { "epoch": 3.182951711630016, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.8592, "step": 1755 }, { "epoch": 3.18476535932895, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.6183, "step": 1756 }, { "epoch": 3.186579007027885, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.5251, "step": 1757 }, { "epoch": 3.1883926547268193, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.5979, "step": 1758 }, { "epoch": 3.1902063024257536, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.5816, "step": 1759 }, { "epoch": 3.1920199501246884, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.494, "step": 1760 }, { "epoch": 3.1938335978236227, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.4116, "step": 1761 }, { "epoch": 3.195647245522557, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.4041, "step": 1762 }, { "epoch": 3.197460893221492, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.5564, "step": 1763 }, { "epoch": 3.199274540920426, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.5238, "step": 1764 }, { "epoch": 3.2010881886193605, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.5001, "step": 1765 }, { "epoch": 3.2029018363182953, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.5589, "step": 1766 }, { "epoch": 3.2047154840172296, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.5279, "step": 1767 }, { "epoch": 3.206529131716164, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.5342, "step": 1768 }, { "epoch": 3.2083427794150987, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.4334, "step": 1769 }, { "epoch": 3.210156427114033, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.4541, "step": 1770 }, { "epoch": 3.2119700748129674, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.4123, "step": 1771 }, { "epoch": 3.213783722511902, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.5014, "step": 1772 }, { "epoch": 3.2155973702108365, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.4386, "step": 1773 }, { "epoch": 3.217411017909771, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.4232, "step": 1774 }, { "epoch": 3.2192246656087056, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.5426, "step": 1775 }, { "epoch": 3.22103831330764, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.4889, "step": 1776 }, { "epoch": 3.2228519610065747, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.5589, "step": 1777 }, { "epoch": 3.224665608705509, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.4389, "step": 1778 }, { "epoch": 3.2264792564044433, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.5435, "step": 1779 }, { "epoch": 3.228292904103378, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.5554, "step": 1780 }, { "epoch": 3.2301065518023124, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.67, "step": 1781 }, { "epoch": 3.2319201995012468, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.4882, "step": 1782 }, { "epoch": 3.2337338472001815, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.4884, "step": 1783 }, { "epoch": 3.235547494899116, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.5314, "step": 1784 }, { "epoch": 3.23736114259805, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.5417, "step": 1785 }, { "epoch": 3.239174790296985, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.5305, "step": 1786 }, { "epoch": 3.2409884379959193, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.4965, "step": 1787 }, { "epoch": 3.2428020856948536, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.4792, "step": 1788 }, { "epoch": 3.2446157333937884, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.4827, "step": 1789 }, { "epoch": 3.2464293810927227, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.4815, "step": 1790 }, { "epoch": 3.248243028791657, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.4866, "step": 1791 }, { "epoch": 3.250056676490592, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.5372, "step": 1792 }, { "epoch": 3.251870324189526, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.5213, "step": 1793 }, { "epoch": 3.2536839718884605, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.5228, "step": 1794 }, { "epoch": 3.2554976195873953, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.5386, "step": 1795 }, { "epoch": 3.2573112672863296, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.524, "step": 1796 }, { "epoch": 3.2591249149852644, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.483, "step": 1797 }, { "epoch": 3.2609385626841987, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.5355, "step": 1798 }, { "epoch": 3.262752210383133, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.4124, "step": 1799 }, { "epoch": 3.264565858082068, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.4749, "step": 1800 }, { "epoch": 3.266379505781002, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.4941, "step": 1801 }, { "epoch": 3.2681931534799364, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.4347, "step": 1802 }, { "epoch": 3.270006801178871, "grad_norm": 1.4296875, "learning_rate": 0.0002, "loss": 0.456, "step": 1803 }, { "epoch": 3.2718204488778055, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.5124, "step": 1804 }, { "epoch": 3.27363409657674, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.691, "step": 1805 }, { "epoch": 3.2754477442756746, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.4971, "step": 1806 }, { "epoch": 3.277261391974609, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.6091, "step": 1807 }, { "epoch": 3.2790750396735433, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.6068, "step": 1808 }, { "epoch": 3.280888687372478, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.5746, "step": 1809 }, { "epoch": 3.2827023350714124, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.5971, "step": 1810 }, { "epoch": 3.2845159827703467, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.6126, "step": 1811 }, { "epoch": 3.2863296304692815, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.4691, "step": 1812 }, { "epoch": 3.288143278168216, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.5085, "step": 1813 }, { "epoch": 3.28995692586715, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.5759, "step": 1814 }, { "epoch": 3.291770573566085, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.4838, "step": 1815 }, { "epoch": 3.2935842212650193, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.5905, "step": 1816 }, { "epoch": 3.2953978689639536, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.5319, "step": 1817 }, { "epoch": 3.2972115166628884, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.5706, "step": 1818 }, { "epoch": 3.2990251643618227, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.5264, "step": 1819 }, { "epoch": 3.300838812060757, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.4418, "step": 1820 }, { "epoch": 3.302652459759692, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.5327, "step": 1821 }, { "epoch": 3.304466107458626, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.3494, "step": 1822 }, { "epoch": 3.3062797551575605, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.435, "step": 1823 }, { "epoch": 3.3080934028564952, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.4033, "step": 1824 }, { "epoch": 3.3099070505554296, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.4516, "step": 1825 }, { "epoch": 3.311720698254364, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.5069, "step": 1826 }, { "epoch": 3.3135343459532987, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.4486, "step": 1827 }, { "epoch": 3.315347993652233, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4645, "step": 1828 }, { "epoch": 3.3171616413511673, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.5161, "step": 1829 }, { "epoch": 3.318975289050102, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.5204, "step": 1830 }, { "epoch": 3.3207889367490364, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.4536, "step": 1831 }, { "epoch": 3.3226025844479707, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.4846, "step": 1832 }, { "epoch": 3.3244162321469055, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.4192, "step": 1833 }, { "epoch": 3.32622987984584, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.4727, "step": 1834 }, { "epoch": 3.3280435275447746, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.4201, "step": 1835 }, { "epoch": 3.329857175243709, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.4876, "step": 1836 }, { "epoch": 3.3316708229426433, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.5732, "step": 1837 }, { "epoch": 3.333484470641578, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.5549, "step": 1838 }, { "epoch": 3.3352981183405124, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.4655, "step": 1839 }, { "epoch": 3.3371117660394467, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.526, "step": 1840 }, { "epoch": 3.3389254137383815, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.4532, "step": 1841 }, { "epoch": 3.340739061437316, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.466, "step": 1842 }, { "epoch": 3.34255270913625, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.4073, "step": 1843 }, { "epoch": 3.344366356835185, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.52, "step": 1844 }, { "epoch": 3.3461800045341192, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.367, "step": 1845 }, { "epoch": 3.3479936522330536, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.5402, "step": 1846 }, { "epoch": 3.3498072999319883, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 0.4122, "step": 1847 }, { "epoch": 3.3516209476309227, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.4545, "step": 1848 }, { "epoch": 3.353434595329857, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.4938, "step": 1849 }, { "epoch": 3.3552482430287918, "grad_norm": 1.453125, "learning_rate": 0.0002, "loss": 0.5484, "step": 1850 }, { "epoch": 3.357061890727726, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.4852, "step": 1851 }, { "epoch": 3.358875538426661, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.3711, "step": 1852 }, { "epoch": 3.360689186125595, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.501, "step": 1853 }, { "epoch": 3.3625028338245295, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.4909, "step": 1854 }, { "epoch": 3.3643164815234643, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.6792, "step": 1855 }, { "epoch": 3.3661301292223986, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.7313, "step": 1856 }, { "epoch": 3.367943776921333, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.4352, "step": 1857 }, { "epoch": 3.3697574246202677, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.6231, "step": 1858 }, { "epoch": 3.371571072319202, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.4951, "step": 1859 }, { "epoch": 3.3733847200181364, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.5991, "step": 1860 }, { "epoch": 3.375198367717071, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.4503, "step": 1861 }, { "epoch": 3.3770120154160055, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.4836, "step": 1862 }, { "epoch": 3.37882566311494, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.5874, "step": 1863 }, { "epoch": 3.3806393108138746, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.3869, "step": 1864 }, { "epoch": 3.382452958512809, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.6038, "step": 1865 }, { "epoch": 3.3842666062117432, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.5394, "step": 1866 }, { "epoch": 3.386080253910678, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.5372, "step": 1867 }, { "epoch": 3.3878939016096123, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.5765, "step": 1868 }, { "epoch": 3.3897075493085467, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.4063, "step": 1869 }, { "epoch": 3.3915211970074814, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.4282, "step": 1870 }, { "epoch": 3.3915211970074814, "eval_loss": 1.5678393840789795, "eval_runtime": 185.9417, "eval_samples_per_second": 5.378, "eval_steps_per_second": 5.378, "step": 1870 }, { "epoch": 3.3915211970074814, "mmlu_eval_accuracy": 0.31439500286743083, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.38095238095238093, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.4166666666666667, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.2608695652173913, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.6363636363636364, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3333333333333333, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2823529411764706, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.16666666666666666, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 1.755542906571805, "step": 1870 }, { "epoch": 3.3933348447064158, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.4482, "step": 1871 }, { "epoch": 3.39514849240535, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.4408, "step": 1872 }, { "epoch": 3.396962140104285, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.4235, "step": 1873 }, { "epoch": 3.398775787803219, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.436, "step": 1874 }, { "epoch": 3.4005894355021535, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.4906, "step": 1875 }, { "epoch": 3.4024030832010883, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.4891, "step": 1876 }, { "epoch": 3.4042167309000226, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.5688, "step": 1877 }, { "epoch": 3.406030378598957, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.3877, "step": 1878 }, { "epoch": 3.4078440262978917, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.4992, "step": 1879 }, { "epoch": 3.409657673996826, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.61, "step": 1880 }, { "epoch": 3.4114713216957604, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.4636, "step": 1881 }, { "epoch": 3.413284969394695, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.4462, "step": 1882 }, { "epoch": 3.4150986170936295, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.4211, "step": 1883 }, { "epoch": 3.416912264792564, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.4567, "step": 1884 }, { "epoch": 3.4187259124914986, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.4835, "step": 1885 }, { "epoch": 3.420539560190433, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.5668, "step": 1886 }, { "epoch": 3.4223532078893673, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.6476, "step": 1887 }, { "epoch": 3.424166855588302, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.4449, "step": 1888 }, { "epoch": 3.4259805032872364, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.4857, "step": 1889 }, { "epoch": 3.427794150986171, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.5826, "step": 1890 }, { "epoch": 3.4296077986851055, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.4664, "step": 1891 }, { "epoch": 3.43142144638404, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.5282, "step": 1892 }, { "epoch": 3.4332350940829746, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.4794, "step": 1893 }, { "epoch": 3.435048741781909, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.3955, "step": 1894 }, { "epoch": 3.436862389480843, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.4055, "step": 1895 }, { "epoch": 3.438676037179778, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.3937, "step": 1896 }, { "epoch": 3.4404896848787123, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.5467, "step": 1897 }, { "epoch": 3.4423033325776466, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.6508, "step": 1898 }, { "epoch": 3.4441169802765814, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.4785, "step": 1899 }, { "epoch": 3.4459306279755157, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.4902, "step": 1900 }, { "epoch": 3.44774427567445, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.3916, "step": 1901 }, { "epoch": 3.449557923373385, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.4802, "step": 1902 }, { "epoch": 3.451371571072319, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.5062, "step": 1903 }, { "epoch": 3.4531852187712535, "grad_norm": 1.8515625, "learning_rate": 0.0002, "loss": 0.5535, "step": 1904 }, { "epoch": 3.4549988664701883, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.7742, "step": 1905 }, { "epoch": 3.4568125141691226, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.6357, "step": 1906 }, { "epoch": 3.458626161868057, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.4765, "step": 1907 }, { "epoch": 3.4604398095669917, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.5327, "step": 1908 }, { "epoch": 3.462253457265926, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.5231, "step": 1909 }, { "epoch": 3.464067104964861, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.4076, "step": 1910 }, { "epoch": 3.465880752663795, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.6423, "step": 1911 }, { "epoch": 3.4676944003627295, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.5476, "step": 1912 }, { "epoch": 3.4695080480616642, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.5294, "step": 1913 }, { "epoch": 3.4713216957605986, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.446, "step": 1914 }, { "epoch": 3.473135343459533, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.4647, "step": 1915 }, { "epoch": 3.4749489911584677, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.5575, "step": 1916 }, { "epoch": 3.476762638857402, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.5456, "step": 1917 }, { "epoch": 3.4785762865563363, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6151, "step": 1918 }, { "epoch": 3.480389934255271, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.4952, "step": 1919 }, { "epoch": 3.4822035819542054, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.4365, "step": 1920 }, { "epoch": 3.4840172296531398, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.4243, "step": 1921 }, { "epoch": 3.4858308773520745, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.4045, "step": 1922 }, { "epoch": 3.487644525051009, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.4155, "step": 1923 }, { "epoch": 3.489458172749943, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.5411, "step": 1924 }, { "epoch": 3.491271820448878, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.5173, "step": 1925 }, { "epoch": 3.4930854681478123, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.5042, "step": 1926 }, { "epoch": 3.4948991158467466, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.5632, "step": 1927 }, { "epoch": 3.4967127635456814, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.4437, "step": 1928 }, { "epoch": 3.4985264112446157, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.4804, "step": 1929 }, { "epoch": 3.50034005894355, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.4311, "step": 1930 }, { "epoch": 3.502153706642485, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.444, "step": 1931 }, { "epoch": 3.503967354341419, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.4757, "step": 1932 }, { "epoch": 3.5057810020403535, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3869, "step": 1933 }, { "epoch": 3.5075946497392883, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.6791, "step": 1934 }, { "epoch": 3.5094082974382226, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.4685, "step": 1935 }, { "epoch": 3.511221945137157, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.5249, "step": 1936 }, { "epoch": 3.5130355928360917, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.5557, "step": 1937 }, { "epoch": 3.514849240535026, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.4059, "step": 1938 }, { "epoch": 3.5166628882339603, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.4084, "step": 1939 }, { "epoch": 3.518476535932895, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.3634, "step": 1940 }, { "epoch": 3.5202901836318294, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.3574, "step": 1941 }, { "epoch": 3.5221038313307638, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.5928, "step": 1942 }, { "epoch": 3.5239174790296985, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.4324, "step": 1943 }, { "epoch": 3.525731126728633, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.4199, "step": 1944 }, { "epoch": 3.527544774427567, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.4196, "step": 1945 }, { "epoch": 3.529358422126502, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.5212, "step": 1946 }, { "epoch": 3.5311720698254363, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.4895, "step": 1947 }, { "epoch": 3.5329857175243706, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.4729, "step": 1948 }, { "epoch": 3.5347993652233054, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.5106, "step": 1949 }, { "epoch": 3.5366130129222397, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.6181, "step": 1950 }, { "epoch": 3.5384266606211745, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.4578, "step": 1951 }, { "epoch": 3.540240308320109, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.4387, "step": 1952 }, { "epoch": 3.542053956019043, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.4692, "step": 1953 }, { "epoch": 3.543867603717978, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.4818, "step": 1954 }, { "epoch": 3.5456812514169123, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.7295, "step": 1955 }, { "epoch": 3.5474948991158466, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.6078, "step": 1956 }, { "epoch": 3.5493085468147814, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.5549, "step": 1957 }, { "epoch": 3.5511221945137157, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.6204, "step": 1958 }, { "epoch": 3.5529358422126505, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.5452, "step": 1959 }, { "epoch": 3.554749489911585, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.5034, "step": 1960 }, { "epoch": 3.556563137610519, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.5669, "step": 1961 }, { "epoch": 3.558376785309454, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.5791, "step": 1962 }, { "epoch": 3.5601904330083882, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.6439, "step": 1963 }, { "epoch": 3.5620040807073226, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.6472, "step": 1964 }, { "epoch": 3.5638177284062573, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.5765, "step": 1965 }, { "epoch": 3.5656313761051917, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.6333, "step": 1966 }, { "epoch": 3.567445023804126, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.455, "step": 1967 }, { "epoch": 3.5692586715030608, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.6569, "step": 1968 }, { "epoch": 3.571072319201995, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.5325, "step": 1969 }, { "epoch": 3.5728859669009294, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.5204, "step": 1970 }, { "epoch": 3.574699614599864, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.5541, "step": 1971 }, { "epoch": 3.5765132622987985, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.5943, "step": 1972 }, { "epoch": 3.578326909997733, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.4496, "step": 1973 }, { "epoch": 3.5801405576966676, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.5393, "step": 1974 }, { "epoch": 3.581954205395602, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4725, "step": 1975 }, { "epoch": 3.5837678530945363, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.3682, "step": 1976 }, { "epoch": 3.585581500793471, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.4065, "step": 1977 }, { "epoch": 3.5873951484924054, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.3748, "step": 1978 }, { "epoch": 3.5892087961913397, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.5236, "step": 1979 }, { "epoch": 3.5910224438902745, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.4925, "step": 1980 }, { "epoch": 3.592836091589209, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.5416, "step": 1981 }, { "epoch": 3.594649739288143, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.5261, "step": 1982 }, { "epoch": 3.596463386987078, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.4942, "step": 1983 }, { "epoch": 3.5982770346860122, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.5972, "step": 1984 }, { "epoch": 3.6000906823849466, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.5165, "step": 1985 }, { "epoch": 3.6019043300838813, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.5656, "step": 1986 }, { "epoch": 3.6037179777828157, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.521, "step": 1987 }, { "epoch": 3.60553162548175, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.5349, "step": 1988 }, { "epoch": 3.6073452731806848, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.4853, "step": 1989 }, { "epoch": 3.609158920879619, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.4559, "step": 1990 }, { "epoch": 3.6109725685785534, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.4938, "step": 1991 }, { "epoch": 3.612786216277488, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.5641, "step": 1992 }, { "epoch": 3.6145998639764225, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.5668, "step": 1993 }, { "epoch": 3.616413511675357, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.4099, "step": 1994 }, { "epoch": 3.6182271593742916, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.4269, "step": 1995 }, { "epoch": 3.620040807073226, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.5807, "step": 1996 }, { "epoch": 3.6218544547721603, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.4576, "step": 1997 }, { "epoch": 3.623668102471095, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.5142, "step": 1998 }, { "epoch": 3.6254817501700294, "grad_norm": 1.4921875, "learning_rate": 0.0002, "loss": 0.5009, "step": 1999 }, { "epoch": 3.6272953978689637, "grad_norm": 1.4140625, "learning_rate": 0.0002, "loss": 0.4833, "step": 2000 }, { "epoch": 3.6291090455678985, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.4463, "step": 2001 }, { "epoch": 3.630922693266833, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.4262, "step": 2002 }, { "epoch": 3.632736340965767, "grad_norm": 1.3046875, "learning_rate": 0.0002, "loss": 0.4438, "step": 2003 }, { "epoch": 3.634549988664702, "grad_norm": 1.546875, "learning_rate": 0.0002, "loss": 0.4708, "step": 2004 }, { "epoch": 3.6363636363636362, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.7408, "step": 2005 }, { "epoch": 3.6381772840625706, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.768, "step": 2006 }, { "epoch": 3.6399909317615053, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.6181, "step": 2007 }, { "epoch": 3.6418045794604397, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.7146, "step": 2008 }, { "epoch": 3.6436182271593744, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.491, "step": 2009 }, { "epoch": 3.6454318748583088, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.5871, "step": 2010 }, { "epoch": 3.647245522557243, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.6286, "step": 2011 }, { "epoch": 3.649059170256178, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.4404, "step": 2012 }, { "epoch": 3.650872817955112, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.4984, "step": 2013 }, { "epoch": 3.652686465654047, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.5419, "step": 2014 }, { "epoch": 3.6545001133529813, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.556, "step": 2015 }, { "epoch": 3.6563137610519156, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4452, "step": 2016 }, { "epoch": 3.6581274087508504, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.4272, "step": 2017 }, { "epoch": 3.6599410564497847, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.4879, "step": 2018 }, { "epoch": 3.661754704148719, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.4669, "step": 2019 }, { "epoch": 3.663568351847654, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.5104, "step": 2020 }, { "epoch": 3.665381999546588, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.4626, "step": 2021 }, { "epoch": 3.6671956472455225, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.483, "step": 2022 }, { "epoch": 3.6690092949444573, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.46, "step": 2023 }, { "epoch": 3.6708229426433916, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.5955, "step": 2024 }, { "epoch": 3.672636590342326, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.5862, "step": 2025 }, { "epoch": 3.6744502380412607, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4234, "step": 2026 }, { "epoch": 3.676263885740195, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.5384, "step": 2027 }, { "epoch": 3.6780775334391294, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.6655, "step": 2028 }, { "epoch": 3.679891181138064, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.5449, "step": 2029 }, { "epoch": 3.6817048288369985, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.5406, "step": 2030 }, { "epoch": 3.683518476535933, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.4287, "step": 2031 }, { "epoch": 3.6853321242348676, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.5354, "step": 2032 }, { "epoch": 3.687145771933802, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.4416, "step": 2033 }, { "epoch": 3.688959419632736, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.8169, "step": 2034 }, { "epoch": 3.690773067331671, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.5222, "step": 2035 }, { "epoch": 3.6925867150306053, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.4924, "step": 2036 }, { "epoch": 3.6944003627295396, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.4336, "step": 2037 }, { "epoch": 3.6962140104284744, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.672, "step": 2038 }, { "epoch": 3.6980276581274087, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.4529, "step": 2039 }, { "epoch": 3.699841305826343, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.4828, "step": 2040 }, { "epoch": 3.701654953525278, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.5254, "step": 2041 }, { "epoch": 3.703468601224212, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.4783, "step": 2042 }, { "epoch": 3.7052822489231465, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.4028, "step": 2043 }, { "epoch": 3.7070958966220813, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.54, "step": 2044 }, { "epoch": 3.7089095443210156, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.4931, "step": 2045 }, { "epoch": 3.71072319201995, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.5333, "step": 2046 }, { "epoch": 3.7125368397188847, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.5407, "step": 2047 }, { "epoch": 3.714350487417819, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.5216, "step": 2048 }, { "epoch": 3.7161641351167534, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.4734, "step": 2049 }, { "epoch": 3.717977782815688, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.5389, "step": 2050 }, { "epoch": 3.7197914305146225, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.5207, "step": 2051 }, { "epoch": 3.721605078213557, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.4492, "step": 2052 }, { "epoch": 3.7234187259124916, "grad_norm": 1.453125, "learning_rate": 0.0002, "loss": 0.4856, "step": 2053 }, { "epoch": 3.725232373611426, "grad_norm": 1.578125, "learning_rate": 0.0002, "loss": 0.4821, "step": 2054 }, { "epoch": 3.7270460213103602, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.7761, "step": 2055 }, { "epoch": 3.728859669009295, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.6029, "step": 2056 }, { "epoch": 3.7306733167082293, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.6079, "step": 2057 }, { "epoch": 3.7306733167082293, "eval_loss": 1.6936880350112915, "eval_runtime": 186.2178, "eval_samples_per_second": 5.37, "eval_steps_per_second": 5.37, "step": 2057 }, { "epoch": 3.7306733167082293, "mmlu_eval_accuracy": 0.31220310361784703, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.6428571428571429, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.6363636363636364, "mmlu_eval_accuracy_conceptual_physics": 0.15384615384615385, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.45, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, "mmlu_eval_accuracy_human_aging": 0.30434782608695654, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.32, "mmlu_eval_accuracy_nutrition": 0.3333333333333333, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.29411764705882354, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 1.4699823759024164, "step": 2057 }, { "epoch": 3.7324869644071637, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.5203, "step": 2058 }, { "epoch": 3.7343006121060984, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.5513, "step": 2059 }, { "epoch": 3.7361142598050328, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.5158, "step": 2060 }, { "epoch": 3.737927907503967, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.5995, "step": 2061 }, { "epoch": 3.739741555202902, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.548, "step": 2062 }, { "epoch": 3.741555202901836, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.6186, "step": 2063 }, { "epoch": 3.743368850600771, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.5333, "step": 2064 }, { "epoch": 3.7451824982997053, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.5276, "step": 2065 }, { "epoch": 3.7469961459986396, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.5383, "step": 2066 }, { "epoch": 3.7488097936975744, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.4888, "step": 2067 }, { "epoch": 3.7506234413965087, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.5158, "step": 2068 }, { "epoch": 3.752437089095443, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.6198, "step": 2069 }, { "epoch": 3.754250736794378, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.5514, "step": 2070 }, { "epoch": 3.756064384493312, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.5683, "step": 2071 }, { "epoch": 3.757878032192247, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.521, "step": 2072 }, { "epoch": 3.7596916798911812, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.4558, "step": 2073 }, { "epoch": 3.7615053275901156, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.4513, "step": 2074 }, { "epoch": 3.7633189752890503, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.4387, "step": 2075 }, { "epoch": 3.7651326229879847, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.4774, "step": 2076 }, { "epoch": 3.766946270686919, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.484, "step": 2077 }, { "epoch": 3.768759918385854, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.5018, "step": 2078 }, { "epoch": 3.770573566084788, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.6721, "step": 2079 }, { "epoch": 3.7723872137837224, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.3489, "step": 2080 }, { "epoch": 3.774200861482657, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.4632, "step": 2081 }, { "epoch": 3.7760145091815915, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.515, "step": 2082 }, { "epoch": 3.777828156880526, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.6069, "step": 2083 }, { "epoch": 3.7796418045794606, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.51, "step": 2084 }, { "epoch": 3.781455452278395, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.6225, "step": 2085 }, { "epoch": 3.7832690999773293, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.5586, "step": 2086 }, { "epoch": 3.785082747676264, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.5127, "step": 2087 }, { "epoch": 3.7868963953751984, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.5515, "step": 2088 }, { "epoch": 3.7887100430741327, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.4979, "step": 2089 }, { "epoch": 3.7905236907730675, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.4542, "step": 2090 }, { "epoch": 3.792337338472002, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.486, "step": 2091 }, { "epoch": 3.794150986170936, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.5664, "step": 2092 }, { "epoch": 3.795964633869871, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.5796, "step": 2093 }, { "epoch": 3.7977782815688053, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.4389, "step": 2094 }, { "epoch": 3.7995919292677396, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.6171, "step": 2095 }, { "epoch": 3.8014055769666744, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.5798, "step": 2096 }, { "epoch": 3.8032192246656087, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.6899, "step": 2097 }, { "epoch": 3.805032872364543, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.4588, "step": 2098 }, { "epoch": 3.806846520063478, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.5765, "step": 2099 }, { "epoch": 3.808660167762412, "grad_norm": 1.4765625, "learning_rate": 0.0002, "loss": 0.5294, "step": 2100 }, { "epoch": 3.8104738154613464, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.4472, "step": 2101 }, { "epoch": 3.812287463160281, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.4715, "step": 2102 }, { "epoch": 3.8141011108592155, "grad_norm": 1.328125, "learning_rate": 0.0002, "loss": 0.4925, "step": 2103 }, { "epoch": 3.81591475855815, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.4179, "step": 2104 }, { "epoch": 3.8177284062570847, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.6075, "step": 2105 }, { "epoch": 3.819542053956019, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.7754, "step": 2106 }, { "epoch": 3.8213557016549533, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.6315, "step": 2107 }, { "epoch": 3.823169349353888, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.5765, "step": 2108 }, { "epoch": 3.8249829970528224, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.5878, "step": 2109 }, { "epoch": 3.8267966447517567, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.5921, "step": 2110 }, { "epoch": 3.8286102924506915, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.6315, "step": 2111 }, { "epoch": 3.830423940149626, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.4689, "step": 2112 }, { "epoch": 3.83223758784856, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.5205, "step": 2113 }, { "epoch": 3.834051235547495, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.4688, "step": 2114 }, { "epoch": 3.8358648832464293, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.5453, "step": 2115 }, { "epoch": 3.8376785309453636, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.6167, "step": 2116 }, { "epoch": 3.8394921786442984, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.5765, "step": 2117 }, { "epoch": 3.8413058263432327, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.5949, "step": 2118 }, { "epoch": 3.8431194740421675, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.5339, "step": 2119 }, { "epoch": 3.844933121741102, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.6172, "step": 2120 }, { "epoch": 3.846746769440036, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.5778, "step": 2121 }, { "epoch": 3.848560417138971, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.4831, "step": 2122 }, { "epoch": 3.8503740648379052, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.5508, "step": 2123 }, { "epoch": 3.8521877125368396, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.4883, "step": 2124 }, { "epoch": 3.8540013602357743, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.5546, "step": 2125 }, { "epoch": 3.8558150079347087, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.4838, "step": 2126 }, { "epoch": 3.8576286556336434, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.4749, "step": 2127 }, { "epoch": 3.8594423033325778, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.6901, "step": 2128 }, { "epoch": 3.861255951031512, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.5382, "step": 2129 }, { "epoch": 3.863069598730447, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.4617, "step": 2130 }, { "epoch": 3.864883246429381, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.4167, "step": 2131 }, { "epoch": 3.8666968941283155, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.5677, "step": 2132 }, { "epoch": 3.8685105418272503, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.489, "step": 2133 }, { "epoch": 3.8703241895261846, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.4429, "step": 2134 }, { "epoch": 3.872137837225119, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.6874, "step": 2135 }, { "epoch": 3.8739514849240537, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.5061, "step": 2136 }, { "epoch": 3.875765132622988, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.5592, "step": 2137 }, { "epoch": 3.8775787803219224, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.4844, "step": 2138 }, { "epoch": 3.879392428020857, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.5534, "step": 2139 }, { "epoch": 3.8812060757197915, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.4633, "step": 2140 }, { "epoch": 3.883019723418726, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.3925, "step": 2141 }, { "epoch": 3.8848333711176606, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.4933, "step": 2142 }, { "epoch": 3.886647018816595, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.5488, "step": 2143 }, { "epoch": 3.8884606665155292, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.6722, "step": 2144 }, { "epoch": 3.890274314214464, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.6316, "step": 2145 }, { "epoch": 3.8920879619133983, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.5293, "step": 2146 }, { "epoch": 3.8939016096123327, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.601, "step": 2147 }, { "epoch": 3.8957152573112674, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.5224, "step": 2148 }, { "epoch": 3.8975289050102018, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.4994, "step": 2149 }, { "epoch": 3.899342552709136, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.5432, "step": 2150 }, { "epoch": 3.901156200408071, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.4151, "step": 2151 }, { "epoch": 3.902969848107005, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.4759, "step": 2152 }, { "epoch": 3.9047834958059395, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.4619, "step": 2153 }, { "epoch": 3.9065971435048743, "grad_norm": 1.3984375, "learning_rate": 0.0002, "loss": 0.5407, "step": 2154 }, { "epoch": 3.9084107912038086, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.8615, "step": 2155 }, { "epoch": 3.910224438902743, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.6851, "step": 2156 }, { "epoch": 3.9120380866016777, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.531, "step": 2157 }, { "epoch": 3.913851734300612, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.5666, "step": 2158 }, { "epoch": 3.9156653819995464, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.5662, "step": 2159 }, { "epoch": 3.917479029698481, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.6373, "step": 2160 }, { "epoch": 3.9192926773974155, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.5341, "step": 2161 }, { "epoch": 3.92110632509635, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.5691, "step": 2162 }, { "epoch": 3.9229199727952846, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.667, "step": 2163 }, { "epoch": 3.924733620494219, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.5824, "step": 2164 }, { "epoch": 3.9265472681931533, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.6121, "step": 2165 }, { "epoch": 3.928360915892088, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.5028, "step": 2166 }, { "epoch": 3.9301745635910224, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.4924, "step": 2167 }, { "epoch": 3.9319882112899567, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.4943, "step": 2168 }, { "epoch": 3.9338018589888915, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.5293, "step": 2169 }, { "epoch": 3.935615506687826, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.5068, "step": 2170 }, { "epoch": 3.93742915438676, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.4712, "step": 2171 }, { "epoch": 3.939242802085695, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.511, "step": 2172 }, { "epoch": 3.941056449784629, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.4839, "step": 2173 }, { "epoch": 3.9428700974835635, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.4707, "step": 2174 }, { "epoch": 3.9446837451824983, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.5202, "step": 2175 }, { "epoch": 3.9464973928814326, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.5899, "step": 2176 }, { "epoch": 3.9483110405803674, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.5027, "step": 2177 }, { "epoch": 3.9501246882793017, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.5331, "step": 2178 }, { "epoch": 3.951938335978236, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.5341, "step": 2179 }, { "epoch": 3.953751983677171, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.395, "step": 2180 }, { "epoch": 3.955565631376105, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.5651, "step": 2181 }, { "epoch": 3.95737927907504, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.6368, "step": 2182 }, { "epoch": 3.9591929267739743, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.543, "step": 2183 }, { "epoch": 3.9610065744729086, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.4859, "step": 2184 }, { "epoch": 3.9628202221718434, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.4277, "step": 2185 }, { "epoch": 3.9646338698707777, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.6458, "step": 2186 }, { "epoch": 3.966447517569712, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.5273, "step": 2187 }, { "epoch": 3.968261165268647, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.6272, "step": 2188 }, { "epoch": 3.970074812967581, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.5275, "step": 2189 }, { "epoch": 3.9718884606665155, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.597, "step": 2190 }, { "epoch": 3.9737021083654502, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.4836, "step": 2191 }, { "epoch": 3.9755157560643846, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.4984, "step": 2192 }, { "epoch": 3.977329403763319, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.5156, "step": 2193 }, { "epoch": 3.9791430514622537, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.4877, "step": 2194 }, { "epoch": 3.980956699161188, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.5417, "step": 2195 }, { "epoch": 3.9827703468601223, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.5238, "step": 2196 }, { "epoch": 3.984583994559057, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.4577, "step": 2197 }, { "epoch": 3.9863976422579914, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.5376, "step": 2198 }, { "epoch": 3.9882112899569258, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.5196, "step": 2199 }, { "epoch": 3.9900249376558605, "grad_norm": 1.4296875, "learning_rate": 0.0002, "loss": 0.4403, "step": 2200 }, { "epoch": 3.991838585354795, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.4276, "step": 2201 }, { "epoch": 3.993652233053729, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.6232, "step": 2202 }, { "epoch": 3.995465880752664, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.4736, "step": 2203 }, { "epoch": 3.9972795284515983, "grad_norm": 1.5, "learning_rate": 0.0002, "loss": 0.5145, "step": 2204 }, { "epoch": 3.9990931761505326, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.6992, "step": 2205 }, { "epoch": 4.000906823849467, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.5409, "step": 2206 }, { "epoch": 4.002720471548402, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.4211, "step": 2207 }, { "epoch": 4.004534119247336, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.4131, "step": 2208 }, { "epoch": 4.006347766946271, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.4112, "step": 2209 }, { "epoch": 4.008161414645206, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.3425, "step": 2210 }, { "epoch": 4.0099750623441395, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.2896, "step": 2211 }, { "epoch": 4.011788710043074, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.272, "step": 2212 }, { "epoch": 4.013602357742009, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.3855, "step": 2213 }, { "epoch": 4.015416005440943, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.3259, "step": 2214 }, { "epoch": 4.017229653139878, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.329, "step": 2215 }, { "epoch": 4.0190433008388124, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.2705, "step": 2216 }, { "epoch": 4.020856948537746, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.3281, "step": 2217 }, { "epoch": 4.022670596236681, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.2929, "step": 2218 }, { "epoch": 4.024484243935616, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.2506, "step": 2219 }, { "epoch": 4.02629789163455, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.3009, "step": 2220 }, { "epoch": 4.0281115393334845, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.318, "step": 2221 }, { "epoch": 4.029925187032419, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2764, "step": 2222 }, { "epoch": 4.031738834731353, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.254, "step": 2223 }, { "epoch": 4.033552482430288, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.2798, "step": 2224 }, { "epoch": 4.035366130129223, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2852, "step": 2225 }, { "epoch": 4.037179777828157, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2645, "step": 2226 }, { "epoch": 4.038993425527091, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.311, "step": 2227 }, { "epoch": 4.040807073226026, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.265, "step": 2228 }, { "epoch": 4.04262072092496, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.2723, "step": 2229 }, { "epoch": 4.044434368623895, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.2317, "step": 2230 }, { "epoch": 4.04624801632283, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2922, "step": 2231 }, { "epoch": 4.0480616640217635, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.4089, "step": 2232 }, { "epoch": 4.049875311720698, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.3067, "step": 2233 }, { "epoch": 4.051688959419633, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.3066, "step": 2234 }, { "epoch": 4.053502607118567, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.3035, "step": 2235 }, { "epoch": 4.055316254817502, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.3312, "step": 2236 }, { "epoch": 4.0571299025164365, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2893, "step": 2237 }, { "epoch": 4.05894355021537, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2686, "step": 2238 }, { "epoch": 4.060757197914305, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.3628, "step": 2239 }, { "epoch": 4.06257084561324, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.3644, "step": 2240 }, { "epoch": 4.064384493312174, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2979, "step": 2241 }, { "epoch": 4.0661981410111085, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.3037, "step": 2242 }, { "epoch": 4.068011788710043, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.2855, "step": 2243 }, { "epoch": 4.069825436408977, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2882, "step": 2244 }, { "epoch": 4.069825436408977, "eval_loss": 1.804315209388733, "eval_runtime": 185.2035, "eval_samples_per_second": 5.399, "eval_steps_per_second": 5.399, "step": 2244 }, { "epoch": 4.069825436408977, "mmlu_eval_accuracy": 0.2972479355271409, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.7142857142857143, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.0625, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.5454545454545454, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.7218648528523262, "step": 2244 }, { "epoch": 4.071639084107912, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.3239, "step": 2245 }, { "epoch": 4.073452731806847, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2774, "step": 2246 }, { "epoch": 4.075266379505781, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.3176, "step": 2247 }, { "epoch": 4.077080027204715, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.2854, "step": 2248 }, { "epoch": 4.07889367490365, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.2375, "step": 2249 }, { "epoch": 4.080707322602584, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.3933, "step": 2250 }, { "epoch": 4.082520970301519, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.22, "step": 2251 }, { "epoch": 4.084334618000454, "grad_norm": 1.53125, "learning_rate": 0.0002, "loss": 0.3475, "step": 2252 }, { "epoch": 4.0861482656993875, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.292, "step": 2253 }, { "epoch": 4.087961913398322, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.3037, "step": 2254 }, { "epoch": 4.089775561097257, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.3314, "step": 2255 }, { "epoch": 4.091589208796191, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.3984, "step": 2256 }, { "epoch": 4.093402856495126, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3672, "step": 2257 }, { "epoch": 4.0952165041940605, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3608, "step": 2258 }, { "epoch": 4.097030151892994, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.4293, "step": 2259 }, { "epoch": 4.098843799591929, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.279, "step": 2260 }, { "epoch": 4.100657447290864, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.3028, "step": 2261 }, { "epoch": 4.102471094989799, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2893, "step": 2262 }, { "epoch": 4.104284742688733, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.2675, "step": 2263 }, { "epoch": 4.106098390387667, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.2993, "step": 2264 }, { "epoch": 4.107912038086602, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3349, "step": 2265 }, { "epoch": 4.109725685785536, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.3115, "step": 2266 }, { "epoch": 4.111539333484471, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.352, "step": 2267 }, { "epoch": 4.1133529811834055, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.2274, "step": 2268 }, { "epoch": 4.115166628882339, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3256, "step": 2269 }, { "epoch": 4.116980276581274, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.4924, "step": 2270 }, { "epoch": 4.118793924280209, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.2573, "step": 2271 }, { "epoch": 4.120607571979143, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.2972, "step": 2272 }, { "epoch": 4.122421219678078, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.2826, "step": 2273 }, { "epoch": 4.124234867377012, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.283, "step": 2274 }, { "epoch": 4.126048515075946, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.379, "step": 2275 }, { "epoch": 4.127862162774881, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.2754, "step": 2276 }, { "epoch": 4.129675810473816, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.268, "step": 2277 }, { "epoch": 4.13148945817275, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.3404, "step": 2278 }, { "epoch": 4.1333031058716845, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2696, "step": 2279 }, { "epoch": 4.135116753570619, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.296, "step": 2280 }, { "epoch": 4.136930401269553, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.3213, "step": 2281 }, { "epoch": 4.138744048968488, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.3747, "step": 2282 }, { "epoch": 4.140557696667423, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.2565, "step": 2283 }, { "epoch": 4.142371344366357, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2953, "step": 2284 }, { "epoch": 4.144184992065291, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.2946, "step": 2285 }, { "epoch": 4.145998639764226, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2422, "step": 2286 }, { "epoch": 4.14781228746316, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.3714, "step": 2287 }, { "epoch": 4.149625935162095, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.3559, "step": 2288 }, { "epoch": 4.1514395828610295, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.2239, "step": 2289 }, { "epoch": 4.153253230559963, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.3569, "step": 2290 }, { "epoch": 4.155066878258898, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.3043, "step": 2291 }, { "epoch": 4.156880525957833, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.2726, "step": 2292 }, { "epoch": 4.158694173656767, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.3021, "step": 2293 }, { "epoch": 4.160507821355702, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.239, "step": 2294 }, { "epoch": 4.162321469054636, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.2377, "step": 2295 }, { "epoch": 4.16413511675357, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.2588, "step": 2296 }, { "epoch": 4.165948764452505, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.3018, "step": 2297 }, { "epoch": 4.16776241215144, "grad_norm": 1.6953125, "learning_rate": 0.0002, "loss": 0.3913, "step": 2298 }, { "epoch": 4.169576059850374, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2184, "step": 2299 }, { "epoch": 4.1713897075493085, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.2648, "step": 2300 }, { "epoch": 4.173203355248243, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.3336, "step": 2301 }, { "epoch": 4.175017002947177, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.326, "step": 2302 }, { "epoch": 4.176830650646112, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3579, "step": 2303 }, { "epoch": 4.178644298345047, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.3088, "step": 2304 }, { "epoch": 4.180457946043981, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.3015, "step": 2305 }, { "epoch": 4.182271593742915, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.41, "step": 2306 }, { "epoch": 4.18408524144185, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3527, "step": 2307 }, { "epoch": 4.185898889140784, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.5012, "step": 2308 }, { "epoch": 4.187712536839719, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.3779, "step": 2309 }, { "epoch": 4.1895261845386536, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.309, "step": 2310 }, { "epoch": 4.191339832237587, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.2891, "step": 2311 }, { "epoch": 4.193153479936522, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3526, "step": 2312 }, { "epoch": 4.194967127635457, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.4176, "step": 2313 }, { "epoch": 4.196780775334391, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3144, "step": 2314 }, { "epoch": 4.198594423033326, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.343, "step": 2315 }, { "epoch": 4.20040807073226, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2817, "step": 2316 }, { "epoch": 4.202221718431195, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.2745, "step": 2317 }, { "epoch": 4.204035366130129, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3825, "step": 2318 }, { "epoch": 4.205849013829064, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.2586, "step": 2319 }, { "epoch": 4.207662661527999, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.4035, "step": 2320 }, { "epoch": 4.2094763092269325, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.3068, "step": 2321 }, { "epoch": 4.211289956925867, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.3283, "step": 2322 }, { "epoch": 4.213103604624802, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.2737, "step": 2323 }, { "epoch": 4.214917252323736, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2905, "step": 2324 }, { "epoch": 4.216730900022671, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.2689, "step": 2325 }, { "epoch": 4.2185445477216055, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2698, "step": 2326 }, { "epoch": 4.220358195420539, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3823, "step": 2327 }, { "epoch": 4.222171843119474, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.2422, "step": 2328 }, { "epoch": 4.223985490818409, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.3256, "step": 2329 }, { "epoch": 4.225799138517343, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.265, "step": 2330 }, { "epoch": 4.227612786216278, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.3665, "step": 2331 }, { "epoch": 4.229426433915212, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.319, "step": 2332 }, { "epoch": 4.231240081614146, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.3417, "step": 2333 }, { "epoch": 4.233053729313081, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.3051, "step": 2334 }, { "epoch": 4.234867377012016, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.4986, "step": 2335 }, { "epoch": 4.23668102471095, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.299, "step": 2336 }, { "epoch": 4.238494672409884, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.3171, "step": 2337 }, { "epoch": 4.240308320108819, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.4118, "step": 2338 }, { "epoch": 4.242121967807753, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2378, "step": 2339 }, { "epoch": 4.243935615506688, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.3342, "step": 2340 }, { "epoch": 4.245749263205623, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.3071, "step": 2341 }, { "epoch": 4.2475629109045565, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.23, "step": 2342 }, { "epoch": 4.249376558603491, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.29, "step": 2343 }, { "epoch": 4.251190206302426, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.28, "step": 2344 }, { "epoch": 4.25300385400136, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.3592, "step": 2345 }, { "epoch": 4.254817501700295, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.3234, "step": 2346 }, { "epoch": 4.2566311493992295, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.2899, "step": 2347 }, { "epoch": 4.258444797098163, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.3805, "step": 2348 }, { "epoch": 4.260258444797098, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.3224, "step": 2349 }, { "epoch": 4.262072092496033, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2416, "step": 2350 }, { "epoch": 4.263885740194967, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.4244, "step": 2351 }, { "epoch": 4.265699387893902, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.3064, "step": 2352 }, { "epoch": 4.267513035592836, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.3252, "step": 2353 }, { "epoch": 4.26932668329177, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.2854, "step": 2354 }, { "epoch": 4.271140330990705, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.3263, "step": 2355 }, { "epoch": 4.27295397868964, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.649, "step": 2356 }, { "epoch": 4.274767626388574, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.4069, "step": 2357 }, { "epoch": 4.276581274087508, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.367, "step": 2358 }, { "epoch": 4.278394921786443, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.4638, "step": 2359 }, { "epoch": 4.280208569485377, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.462, "step": 2360 }, { "epoch": 4.282022217184312, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.2647, "step": 2361 }, { "epoch": 4.283835864883247, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.317, "step": 2362 }, { "epoch": 4.2856495125821805, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.3479, "step": 2363 }, { "epoch": 4.287463160281115, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.3432, "step": 2364 }, { "epoch": 4.28927680798005, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.3278, "step": 2365 }, { "epoch": 4.291090455678984, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.3387, "step": 2366 }, { "epoch": 4.292904103377919, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.3803, "step": 2367 }, { "epoch": 4.2947177510768535, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3616, "step": 2368 }, { "epoch": 4.296531398775787, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.3792, "step": 2369 }, { "epoch": 4.298345046474722, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.3396, "step": 2370 }, { "epoch": 4.300158694173657, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.2769, "step": 2371 }, { "epoch": 4.301972341872592, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.2511, "step": 2372 }, { "epoch": 4.303785989571526, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.2643, "step": 2373 }, { "epoch": 4.30559963727046, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.3134, "step": 2374 }, { "epoch": 4.307413284969394, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.3345, "step": 2375 }, { "epoch": 4.309226932668329, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3739, "step": 2376 }, { "epoch": 4.311040580367264, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.341, "step": 2377 }, { "epoch": 4.312854228066199, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.3853, "step": 2378 }, { "epoch": 4.314667875765132, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.3588, "step": 2379 }, { "epoch": 4.316481523464067, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3149, "step": 2380 }, { "epoch": 4.318295171163002, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.398, "step": 2381 }, { "epoch": 4.320108818861936, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.327, "step": 2382 }, { "epoch": 4.321922466560871, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.2979, "step": 2383 }, { "epoch": 4.323736114259805, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.3049, "step": 2384 }, { "epoch": 4.325549761958739, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2349, "step": 2385 }, { "epoch": 4.327363409657674, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.3115, "step": 2386 }, { "epoch": 4.329177057356609, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2287, "step": 2387 }, { "epoch": 4.330990705055543, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.3246, "step": 2388 }, { "epoch": 4.3328043527544775, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.3419, "step": 2389 }, { "epoch": 4.334618000453412, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.3425, "step": 2390 }, { "epoch": 4.336431648152346, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.3635, "step": 2391 }, { "epoch": 4.338245295851281, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2691, "step": 2392 }, { "epoch": 4.340058943550216, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.2798, "step": 2393 }, { "epoch": 4.34187259124915, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.2597, "step": 2394 }, { "epoch": 4.343686238948084, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.2443, "step": 2395 }, { "epoch": 4.345499886647019, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.2969, "step": 2396 }, { "epoch": 4.347313534345953, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.3356, "step": 2397 }, { "epoch": 4.349127182044888, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3008, "step": 2398 }, { "epoch": 4.350940829743823, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.4134, "step": 2399 }, { "epoch": 4.3527544774427565, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.3001, "step": 2400 }, { "epoch": 4.354568125141691, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.3593, "step": 2401 }, { "epoch": 4.356381772840626, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2984, "step": 2402 }, { "epoch": 4.35819542053956, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.3218, "step": 2403 }, { "epoch": 4.360009068238495, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.3806, "step": 2404 }, { "epoch": 4.361822715937429, "grad_norm": 1.53125, "learning_rate": 0.0002, "loss": 0.4039, "step": 2405 }, { "epoch": 4.363636363636363, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.5352, "step": 2406 }, { "epoch": 4.365450011335298, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.517, "step": 2407 }, { "epoch": 4.367263659034233, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.3534, "step": 2408 }, { "epoch": 4.369077306733167, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3967, "step": 2409 }, { "epoch": 4.3708909544321015, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.3299, "step": 2410 }, { "epoch": 4.372704602131036, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.4103, "step": 2411 }, { "epoch": 4.37451824982997, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3615, "step": 2412 }, { "epoch": 4.376331897528905, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.34, "step": 2413 }, { "epoch": 4.37814554522784, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3199, "step": 2414 }, { "epoch": 4.379959192926774, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2964, "step": 2415 }, { "epoch": 4.381772840625708, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.287, "step": 2416 }, { "epoch": 4.383586488324643, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.3983, "step": 2417 }, { "epoch": 4.385400136023577, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3935, "step": 2418 }, { "epoch": 4.387213783722512, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.329, "step": 2419 }, { "epoch": 4.389027431421447, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.3689, "step": 2420 }, { "epoch": 4.3908410791203805, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3819, "step": 2421 }, { "epoch": 4.392654726819315, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.3451, "step": 2422 }, { "epoch": 4.39446837451825, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.3547, "step": 2423 }, { "epoch": 4.396282022217184, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.3251, "step": 2424 }, { "epoch": 4.398095669916119, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.2995, "step": 2425 }, { "epoch": 4.399909317615053, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.2708, "step": 2426 }, { "epoch": 4.401722965313988, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.3333, "step": 2427 }, { "epoch": 4.403536613012922, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.4108, "step": 2428 }, { "epoch": 4.405350260711857, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.3744, "step": 2429 }, { "epoch": 4.407163908410791, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.2518, "step": 2430 }, { "epoch": 4.4089775561097255, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.2995, "step": 2431 }, { "epoch": 4.4089775561097255, "eval_loss": 1.7990649938583374, "eval_runtime": 186.2757, "eval_samples_per_second": 5.368, "eval_steps_per_second": 5.368, "step": 2431 }, { "epoch": 4.4089775561097255, "mmlu_eval_accuracy": 0.29638433445731327, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.35, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.30303030303030304, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.9570787402635148, "step": 2431 }, { "epoch": 4.41079120380866, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2833, "step": 2432 }, { "epoch": 4.412604851507595, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.2727, "step": 2433 }, { "epoch": 4.414418499206529, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.3142, "step": 2434 }, { "epoch": 4.416232146905464, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.2999, "step": 2435 }, { "epoch": 4.4180457946043985, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.3584, "step": 2436 }, { "epoch": 4.419859442303332, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.3197, "step": 2437 }, { "epoch": 4.421673090002267, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.378, "step": 2438 }, { "epoch": 4.423486737701202, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.3302, "step": 2439 }, { "epoch": 4.425300385400136, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.3535, "step": 2440 }, { "epoch": 4.427114033099071, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.3771, "step": 2441 }, { "epoch": 4.428927680798005, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.2303, "step": 2442 }, { "epoch": 4.430741328496939, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.2961, "step": 2443 }, { "epoch": 4.432554976195874, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.2788, "step": 2444 }, { "epoch": 4.434368623894809, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2793, "step": 2445 }, { "epoch": 4.436182271593743, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.32, "step": 2446 }, { "epoch": 4.4379959192926774, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.4725, "step": 2447 }, { "epoch": 4.439809566991612, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.3274, "step": 2448 }, { "epoch": 4.441623214690546, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.3437, "step": 2449 }, { "epoch": 4.443436862389481, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.4504, "step": 2450 }, { "epoch": 4.445250510088416, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.3089, "step": 2451 }, { "epoch": 4.4470641577873495, "grad_norm": 1.359375, "learning_rate": 0.0002, "loss": 0.3199, "step": 2452 }, { "epoch": 4.448877805486284, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.3588, "step": 2453 }, { "epoch": 4.450691453185219, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.341, "step": 2454 }, { "epoch": 4.452505100884153, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.3667, "step": 2455 }, { "epoch": 4.454318748583088, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.474, "step": 2456 }, { "epoch": 4.4561323962820225, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.4691, "step": 2457 }, { "epoch": 4.457946043980956, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3806, "step": 2458 }, { "epoch": 4.459759691679891, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.3128, "step": 2459 }, { "epoch": 4.461573339378826, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.3359, "step": 2460 }, { "epoch": 4.46338698707776, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.3556, "step": 2461 }, { "epoch": 4.465200634776695, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.3135, "step": 2462 }, { "epoch": 4.467014282475629, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.2842, "step": 2463 }, { "epoch": 4.468827930174563, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.3734, "step": 2464 }, { "epoch": 4.470641577873498, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.3557, "step": 2465 }, { "epoch": 4.472455225572433, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.3837, "step": 2466 }, { "epoch": 4.474268873271367, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.336, "step": 2467 }, { "epoch": 4.4760825209703015, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3974, "step": 2468 }, { "epoch": 4.477896168669236, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.3458, "step": 2469 }, { "epoch": 4.47970981636817, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.2551, "step": 2470 }, { "epoch": 4.481523464067105, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3662, "step": 2471 }, { "epoch": 4.48333711176604, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.3082, "step": 2472 }, { "epoch": 4.4851507594649735, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.3077, "step": 2473 }, { "epoch": 4.486964407163908, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.2745, "step": 2474 }, { "epoch": 4.488778054862843, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.3028, "step": 2475 }, { "epoch": 4.490591702561777, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.3526, "step": 2476 }, { "epoch": 4.492405350260712, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.2297, "step": 2477 }, { "epoch": 4.4942189979596465, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2934, "step": 2478 }, { "epoch": 4.49603264565858, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.396, "step": 2479 }, { "epoch": 4.497846293357515, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2954, "step": 2480 }, { "epoch": 4.49965994105645, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.3299, "step": 2481 }, { "epoch": 4.501473588755385, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2612, "step": 2482 }, { "epoch": 4.503287236454319, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.3379, "step": 2483 }, { "epoch": 4.505100884153253, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.3485, "step": 2484 }, { "epoch": 4.506914531852187, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.272, "step": 2485 }, { "epoch": 4.508728179551122, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.3771, "step": 2486 }, { "epoch": 4.510541827250057, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.3142, "step": 2487 }, { "epoch": 4.512355474948992, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.3368, "step": 2488 }, { "epoch": 4.5141691226479255, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.3329, "step": 2489 }, { "epoch": 4.51598277034686, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.3872, "step": 2490 }, { "epoch": 4.517796418045794, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.2981, "step": 2491 }, { "epoch": 4.519610065744729, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.2977, "step": 2492 }, { "epoch": 4.521423713443664, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.2875, "step": 2493 }, { "epoch": 4.5232373611425984, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2713, "step": 2494 }, { "epoch": 4.525051008841532, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2896, "step": 2495 }, { "epoch": 4.526864656540467, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.4328, "step": 2496 }, { "epoch": 4.528678304239402, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.3324, "step": 2497 }, { "epoch": 4.530491951938336, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.3244, "step": 2498 }, { "epoch": 4.5323055996372705, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.3256, "step": 2499 }, { "epoch": 4.534119247336205, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.3937, "step": 2500 }, { "epoch": 4.535932895035139, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.3402, "step": 2501 }, { "epoch": 4.537746542734074, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3239, "step": 2502 }, { "epoch": 4.539560190433009, "grad_norm": 1.390625, "learning_rate": 0.0002, "loss": 0.3459, "step": 2503 }, { "epoch": 4.541373838131943, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.3024, "step": 2504 }, { "epoch": 4.543187485830877, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.3812, "step": 2505 }, { "epoch": 4.545001133529812, "grad_norm": 1.390625, "learning_rate": 0.0002, "loss": 0.4885, "step": 2506 }, { "epoch": 4.546814781228746, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.3975, "step": 2507 }, { "epoch": 4.548628428927681, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.4093, "step": 2508 }, { "epoch": 4.550442076626616, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3434, "step": 2509 }, { "epoch": 4.5522557243255495, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.4358, "step": 2510 }, { "epoch": 4.554069372024484, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.305, "step": 2511 }, { "epoch": 4.555883019723419, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3777, "step": 2512 }, { "epoch": 4.557696667422353, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.3072, "step": 2513 }, { "epoch": 4.559510315121288, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.3799, "step": 2514 }, { "epoch": 4.5613239628202225, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3461, "step": 2515 }, { "epoch": 4.563137610519156, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3633, "step": 2516 }, { "epoch": 4.564951258218091, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.4255, "step": 2517 }, { "epoch": 4.566764905917026, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3186, "step": 2518 }, { "epoch": 4.56857855361596, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.32, "step": 2519 }, { "epoch": 4.5703922013148945, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3014, "step": 2520 }, { "epoch": 4.572205849013829, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3098, "step": 2521 }, { "epoch": 4.574019496712763, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.3424, "step": 2522 }, { "epoch": 4.575833144411698, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3395, "step": 2523 }, { "epoch": 4.577646792110633, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.2719, "step": 2524 }, { "epoch": 4.579460439809567, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.4128, "step": 2525 }, { "epoch": 4.581274087508501, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3491, "step": 2526 }, { "epoch": 4.583087735207436, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.3858, "step": 2527 }, { "epoch": 4.58490138290637, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3538, "step": 2528 }, { "epoch": 4.586715030605305, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.2744, "step": 2529 }, { "epoch": 4.58852867830424, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2639, "step": 2530 }, { "epoch": 4.590342326003174, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.3508, "step": 2531 }, { "epoch": 4.592155973702108, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.3383, "step": 2532 }, { "epoch": 4.593969621401043, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2948, "step": 2533 }, { "epoch": 4.595783269099977, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.4346, "step": 2534 }, { "epoch": 4.597596916798912, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.378, "step": 2535 }, { "epoch": 4.5994105644978465, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.3703, "step": 2536 }, { "epoch": 4.601224212196781, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.3411, "step": 2537 }, { "epoch": 4.603037859895715, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.3048, "step": 2538 }, { "epoch": 4.60485150759465, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.2886, "step": 2539 }, { "epoch": 4.606665155293584, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.2986, "step": 2540 }, { "epoch": 4.6084788029925186, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.2617, "step": 2541 }, { "epoch": 4.610292450691453, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3317, "step": 2542 }, { "epoch": 4.612106098390388, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.4017, "step": 2543 }, { "epoch": 4.613919746089322, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.332, "step": 2544 }, { "epoch": 4.615733393788257, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.3376, "step": 2545 }, { "epoch": 4.617547041487191, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.3216, "step": 2546 }, { "epoch": 4.619360689186125, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.4019, "step": 2547 }, { "epoch": 4.62117433688506, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.3542, "step": 2548 }, { "epoch": 4.622987984583995, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.3581, "step": 2549 }, { "epoch": 4.624801632282929, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.291, "step": 2550 }, { "epoch": 4.626615279981864, "grad_norm": 1.3125, "learning_rate": 0.0002, "loss": 0.4495, "step": 2551 }, { "epoch": 4.628428927680798, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.3471, "step": 2552 }, { "epoch": 4.630242575379732, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.3115, "step": 2553 }, { "epoch": 4.632056223078667, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.3825, "step": 2554 }, { "epoch": 4.633869870777602, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.3365, "step": 2555 }, { "epoch": 4.635683518476536, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.5022, "step": 2556 }, { "epoch": 4.6374971661754705, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.4711, "step": 2557 }, { "epoch": 4.639310813874405, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.4588, "step": 2558 }, { "epoch": 4.641124461573339, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3585, "step": 2559 }, { "epoch": 4.642938109272274, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.3255, "step": 2560 }, { "epoch": 4.644751756971209, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3942, "step": 2561 }, { "epoch": 4.646565404670143, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.429, "step": 2562 }, { "epoch": 4.648379052369077, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.2937, "step": 2563 }, { "epoch": 4.650192700068012, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3541, "step": 2564 }, { "epoch": 4.652006347766946, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.3672, "step": 2565 }, { "epoch": 4.653819995465881, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.4425, "step": 2566 }, { "epoch": 4.6556336431648155, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3374, "step": 2567 }, { "epoch": 4.657447290863749, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3301, "step": 2568 }, { "epoch": 4.659260938562684, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.3442, "step": 2569 }, { "epoch": 4.661074586261619, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.3093, "step": 2570 }, { "epoch": 4.662888233960553, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.3223, "step": 2571 }, { "epoch": 4.664701881659488, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.2749, "step": 2572 }, { "epoch": 4.666515529358422, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.3644, "step": 2573 }, { "epoch": 4.668329177057356, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.3178, "step": 2574 }, { "epoch": 4.670142824756291, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.3421, "step": 2575 }, { "epoch": 4.671956472455226, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3567, "step": 2576 }, { "epoch": 4.67377012015416, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.3939, "step": 2577 }, { "epoch": 4.6755837678530945, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.3456, "step": 2578 }, { "epoch": 4.677397415552029, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.358, "step": 2579 }, { "epoch": 4.679211063250963, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.3337, "step": 2580 }, { "epoch": 4.681024710949898, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.4627, "step": 2581 }, { "epoch": 4.682838358648833, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.337, "step": 2582 }, { "epoch": 4.684652006347767, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.3145, "step": 2583 }, { "epoch": 4.686465654046701, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2712, "step": 2584 }, { "epoch": 4.688279301745636, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.3691, "step": 2585 }, { "epoch": 4.690092949444571, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.368, "step": 2586 }, { "epoch": 4.691906597143505, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.3485, "step": 2587 }, { "epoch": 4.6937202448424395, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3683, "step": 2588 }, { "epoch": 4.695533892541373, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.2767, "step": 2589 }, { "epoch": 4.697347540240308, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.3427, "step": 2590 }, { "epoch": 4.699161187939243, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.293, "step": 2591 }, { "epoch": 4.700974835638178, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.4394, "step": 2592 }, { "epoch": 4.702788483337112, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.3757, "step": 2593 }, { "epoch": 4.704602131036046, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.293, "step": 2594 }, { "epoch": 4.70641577873498, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.347, "step": 2595 }, { "epoch": 4.708229426433915, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.3512, "step": 2596 }, { "epoch": 4.71004307413285, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.4128, "step": 2597 }, { "epoch": 4.711856721831785, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.3909, "step": 2598 }, { "epoch": 4.7136703695307185, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.4046, "step": 2599 }, { "epoch": 4.715484017229653, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2992, "step": 2600 }, { "epoch": 4.717297664928587, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.3472, "step": 2601 }, { "epoch": 4.719111312627522, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.3504, "step": 2602 }, { "epoch": 4.720924960326457, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3293, "step": 2603 }, { "epoch": 4.7227386080253915, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.3301, "step": 2604 }, { "epoch": 4.724552255724325, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.4112, "step": 2605 }, { "epoch": 4.72636590342326, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.7103, "step": 2606 }, { "epoch": 4.728179551122195, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.4562, "step": 2607 }, { "epoch": 4.729993198821129, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.408, "step": 2608 }, { "epoch": 4.731806846520064, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3343, "step": 2609 }, { "epoch": 4.733620494218998, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.4009, "step": 2610 }, { "epoch": 4.735434141917932, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3748, "step": 2611 }, { "epoch": 4.737247789616867, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3304, "step": 2612 }, { "epoch": 4.739061437315802, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.3452, "step": 2613 }, { "epoch": 4.740875085014736, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.3787, "step": 2614 }, { "epoch": 4.74268873271367, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.3485, "step": 2615 }, { "epoch": 4.744502380412605, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3841, "step": 2616 }, { "epoch": 4.746316028111539, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.4901, "step": 2617 }, { "epoch": 4.748129675810474, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.4094, "step": 2618 }, { "epoch": 4.748129675810474, "eval_loss": 1.7383798360824585, "eval_runtime": 185.934, "eval_samples_per_second": 5.378, "eval_steps_per_second": 5.378, "step": 2618 }, { "epoch": 4.748129675810474, "mmlu_eval_accuracy": 0.3070590647723487, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.5454545454545454, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.21951219512195122, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.5, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.3333333333333333, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.34285714285714286, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.2222222222222222, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.015977403683105, "step": 2618 }, { "epoch": 4.749943323509409, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.3926, "step": 2619 }, { "epoch": 4.7517569712083425, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.4077, "step": 2620 }, { "epoch": 4.753570618907277, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.3213, "step": 2621 }, { "epoch": 4.755384266606212, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.3033, "step": 2622 }, { "epoch": 4.757197914305146, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.3939, "step": 2623 }, { "epoch": 4.759011562004081, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2969, "step": 2624 }, { "epoch": 4.7608252097030155, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.3508, "step": 2625 }, { "epoch": 4.762638857401949, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3881, "step": 2626 }, { "epoch": 4.764452505100884, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.3727, "step": 2627 }, { "epoch": 4.766266152799819, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3232, "step": 2628 }, { "epoch": 4.768079800498753, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3341, "step": 2629 }, { "epoch": 4.769893448197688, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.3022, "step": 2630 }, { "epoch": 4.771707095896622, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2584, "step": 2631 }, { "epoch": 4.773520743595556, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.2158, "step": 2632 }, { "epoch": 4.775334391294491, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.4019, "step": 2633 }, { "epoch": 4.777148038993426, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.3558, "step": 2634 }, { "epoch": 4.77896168669236, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.5516, "step": 2635 }, { "epoch": 4.780775334391294, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.3811, "step": 2636 }, { "epoch": 4.782588982090229, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.3723, "step": 2637 }, { "epoch": 4.784402629789163, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.3697, "step": 2638 }, { "epoch": 4.786216277488098, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.343, "step": 2639 }, { "epoch": 4.788029925187033, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.4039, "step": 2640 }, { "epoch": 4.7898435728859665, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.2641, "step": 2641 }, { "epoch": 4.791657220584901, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.3561, "step": 2642 }, { "epoch": 4.793470868283836, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.363, "step": 2643 }, { "epoch": 4.79528451598277, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.35, "step": 2644 }, { "epoch": 4.797098163681705, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.2699, "step": 2645 }, { "epoch": 4.7989118113806395, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.4118, "step": 2646 }, { "epoch": 4.800725459079574, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2826, "step": 2647 }, { "epoch": 4.802539106778508, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.3479, "step": 2648 }, { "epoch": 4.804352754477443, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.3526, "step": 2649 }, { "epoch": 4.806166402176377, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.3851, "step": 2650 }, { "epoch": 4.807980049875312, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.3911, "step": 2651 }, { "epoch": 4.809793697574246, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.3629, "step": 2652 }, { "epoch": 4.811607345273181, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.2615, "step": 2653 }, { "epoch": 4.813420992972115, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.3738, "step": 2654 }, { "epoch": 4.81523464067105, "grad_norm": 1.703125, "learning_rate": 0.0002, "loss": 0.4606, "step": 2655 }, { "epoch": 4.817048288369984, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.4466, "step": 2656 }, { "epoch": 4.818861936068918, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.5082, "step": 2657 }, { "epoch": 4.820675583767853, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3682, "step": 2658 }, { "epoch": 4.822489231466788, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.4018, "step": 2659 }, { "epoch": 4.824302879165722, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.4481, "step": 2660 }, { "epoch": 4.826116526864657, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.3332, "step": 2661 }, { "epoch": 4.8279301745635905, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.4501, "step": 2662 }, { "epoch": 4.829743822262525, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3826, "step": 2663 }, { "epoch": 4.83155746996146, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.3827, "step": 2664 }, { "epoch": 4.833371117660395, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.4366, "step": 2665 }, { "epoch": 4.835184765359329, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.3709, "step": 2666 }, { "epoch": 4.8369984130582635, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.3157, "step": 2667 }, { "epoch": 4.838812060757198, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3788, "step": 2668 }, { "epoch": 4.840625708456132, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.3768, "step": 2669 }, { "epoch": 4.842439356155067, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.3486, "step": 2670 }, { "epoch": 4.844253003854002, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3679, "step": 2671 }, { "epoch": 4.846066651552936, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.3482, "step": 2672 }, { "epoch": 4.84788029925187, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.3825, "step": 2673 }, { "epoch": 4.849693946950805, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.3125, "step": 2674 }, { "epoch": 4.851507594649739, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.4065, "step": 2675 }, { "epoch": 4.853321242348674, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.329, "step": 2676 }, { "epoch": 4.855134890047609, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.3253, "step": 2677 }, { "epoch": 4.8569485377465424, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.3653, "step": 2678 }, { "epoch": 4.858762185445477, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.4221, "step": 2679 }, { "epoch": 4.860575833144412, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.3949, "step": 2680 }, { "epoch": 4.862389480843346, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2478, "step": 2681 }, { "epoch": 4.864203128542281, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.2774, "step": 2682 }, { "epoch": 4.866016776241215, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.3498, "step": 2683 }, { "epoch": 4.867830423940149, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.3352, "step": 2684 }, { "epoch": 4.869644071639084, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.3493, "step": 2685 }, { "epoch": 4.871457719338019, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.3264, "step": 2686 }, { "epoch": 4.873271367036953, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.3295, "step": 2687 }, { "epoch": 4.8750850147358875, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2551, "step": 2688 }, { "epoch": 4.876898662434822, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.3741, "step": 2689 }, { "epoch": 4.878712310133756, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2906, "step": 2690 }, { "epoch": 4.880525957832691, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.2762, "step": 2691 }, { "epoch": 4.882339605531626, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.3425, "step": 2692 }, { "epoch": 4.88415325323056, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3566, "step": 2693 }, { "epoch": 4.885966900929494, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.3453, "step": 2694 }, { "epoch": 4.887780548628429, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.3644, "step": 2695 }, { "epoch": 4.889594196327363, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.2862, "step": 2696 }, { "epoch": 4.891407844026298, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.4059, "step": 2697 }, { "epoch": 4.893221491725233, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.4193, "step": 2698 }, { "epoch": 4.8950351394241665, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.325, "step": 2699 }, { "epoch": 4.896848787123101, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.3864, "step": 2700 }, { "epoch": 4.898662434822036, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.3398, "step": 2701 }, { "epoch": 4.900476082520971, "grad_norm": 1.3984375, "learning_rate": 0.0002, "loss": 0.3877, "step": 2702 }, { "epoch": 4.902289730219905, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.3615, "step": 2703 }, { "epoch": 4.904103377918839, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.3871, "step": 2704 }, { "epoch": 4.905917025617773, "grad_norm": 1.4921875, "learning_rate": 0.0002, "loss": 0.3893, "step": 2705 }, { "epoch": 4.907730673316708, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.5571, "step": 2706 }, { "epoch": 4.909544321015643, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.6111, "step": 2707 }, { "epoch": 4.911357968714578, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.3828, "step": 2708 }, { "epoch": 4.9131716164135115, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.5735, "step": 2709 }, { "epoch": 4.914985264112446, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.3697, "step": 2710 }, { "epoch": 4.91679891181138, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.4242, "step": 2711 }, { "epoch": 4.918612559510315, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.46, "step": 2712 }, { "epoch": 4.92042620720925, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.4574, "step": 2713 }, { "epoch": 4.9222398549081845, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.3934, "step": 2714 }, { "epoch": 4.924053502607118, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.3964, "step": 2715 }, { "epoch": 4.925867150306053, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.3634, "step": 2716 }, { "epoch": 4.927680798004987, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.4092, "step": 2717 }, { "epoch": 4.929494445703922, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.3658, "step": 2718 }, { "epoch": 4.931308093402857, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.3866, "step": 2719 }, { "epoch": 4.933121741101791, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2991, "step": 2720 }, { "epoch": 4.934935388800725, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.3263, "step": 2721 }, { "epoch": 4.93674903649966, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.4701, "step": 2722 }, { "epoch": 4.938562684198595, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.4195, "step": 2723 }, { "epoch": 4.940376331897529, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.3328, "step": 2724 }, { "epoch": 4.9421899795964634, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.3628, "step": 2725 }, { "epoch": 4.944003627295398, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.3361, "step": 2726 }, { "epoch": 4.945817274994332, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.3757, "step": 2727 }, { "epoch": 4.947630922693267, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2819, "step": 2728 }, { "epoch": 4.949444570392202, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.364, "step": 2729 }, { "epoch": 4.9512582180911355, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.4053, "step": 2730 }, { "epoch": 4.95307186579007, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.3332, "step": 2731 }, { "epoch": 4.954885513489005, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.4814, "step": 2732 }, { "epoch": 4.956699161187939, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.3546, "step": 2733 }, { "epoch": 4.958512808886874, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.3839, "step": 2734 }, { "epoch": 4.9603264565858085, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.3408, "step": 2735 }, { "epoch": 4.962140104284742, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.3443, "step": 2736 }, { "epoch": 4.963953751983677, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.2979, "step": 2737 }, { "epoch": 4.965767399682612, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.3699, "step": 2738 }, { "epoch": 4.967581047381546, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.3935, "step": 2739 }, { "epoch": 4.969394695080481, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.3887, "step": 2740 }, { "epoch": 4.971208342779415, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.3059, "step": 2741 }, { "epoch": 4.973021990478349, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.309, "step": 2742 }, { "epoch": 4.974835638177284, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.3685, "step": 2743 }, { "epoch": 4.976649285876219, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.3217, "step": 2744 }, { "epoch": 4.978462933575153, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.451, "step": 2745 }, { "epoch": 4.9802765812740875, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.3375, "step": 2746 }, { "epoch": 4.982090228973022, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.3046, "step": 2747 }, { "epoch": 4.983903876671956, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2901, "step": 2748 }, { "epoch": 4.985717524370891, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.41, "step": 2749 }, { "epoch": 4.987531172069826, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.3227, "step": 2750 }, { "epoch": 4.9893448197687595, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3418, "step": 2751 }, { "epoch": 4.991158467467694, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.3755, "step": 2752 }, { "epoch": 4.992972115166629, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2986, "step": 2753 }, { "epoch": 4.994785762865563, "grad_norm": 1.453125, "learning_rate": 0.0002, "loss": 0.4175, "step": 2754 }, { "epoch": 4.996599410564498, "grad_norm": 1.7578125, "learning_rate": 0.0002, "loss": 0.4124, "step": 2755 }, { "epoch": 4.9984130582634325, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.4546, "step": 2756 }, { "epoch": 5.000226705962366, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.3936, "step": 2757 }, { "epoch": 5.002040353661301, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2923, "step": 2758 }, { "epoch": 5.003854001360236, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.2506, "step": 2759 }, { "epoch": 5.00566764905917, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.2216, "step": 2760 }, { "epoch": 5.007481296758105, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.3365, "step": 2761 }, { "epoch": 5.009294944457039, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.1851, "step": 2762 }, { "epoch": 5.011108592155973, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1659, "step": 2763 }, { "epoch": 5.012922239854908, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1631, "step": 2764 }, { "epoch": 5.014735887553843, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1613, "step": 2765 }, { "epoch": 5.016549535252778, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1744, "step": 2766 }, { "epoch": 5.0183631829517115, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2331, "step": 2767 }, { "epoch": 5.020176830650646, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2163, "step": 2768 }, { "epoch": 5.021990478349581, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2514, "step": 2769 }, { "epoch": 5.023804126048515, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1836, "step": 2770 }, { "epoch": 5.02561777374745, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2274, "step": 2771 }, { "epoch": 5.027431421446384, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1551, "step": 2772 }, { "epoch": 5.029245069145318, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1998, "step": 2773 }, { "epoch": 5.031058716844253, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1868, "step": 2774 }, { "epoch": 5.032872364543188, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.2059, "step": 2775 }, { "epoch": 5.034686012242122, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2043, "step": 2776 }, { "epoch": 5.0364996599410565, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1623, "step": 2777 }, { "epoch": 5.038313307639991, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.1701, "step": 2778 }, { "epoch": 5.040126955338925, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.2046, "step": 2779 }, { "epoch": 5.04194060303786, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1687, "step": 2780 }, { "epoch": 5.043754250736795, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2323, "step": 2781 }, { "epoch": 5.045567898435729, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1672, "step": 2782 }, { "epoch": 5.047381546134663, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2034, "step": 2783 }, { "epoch": 5.049195193833598, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2347, "step": 2784 }, { "epoch": 5.051008841532532, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1678, "step": 2785 }, { "epoch": 5.052822489231467, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1604, "step": 2786 }, { "epoch": 5.054636136930402, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1963, "step": 2787 }, { "epoch": 5.0564497846293355, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1647, "step": 2788 }, { "epoch": 5.05826343232827, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1772, "step": 2789 }, { "epoch": 5.060077080027205, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1776, "step": 2790 }, { "epoch": 5.061890727726139, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.173, "step": 2791 }, { "epoch": 5.063704375425074, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1841, "step": 2792 }, { "epoch": 5.0655180231240085, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1892, "step": 2793 }, { "epoch": 5.067331670822942, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1816, "step": 2794 }, { "epoch": 5.069145318521877, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1684, "step": 2795 }, { "epoch": 5.070958966220812, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3019, "step": 2796 }, { "epoch": 5.072772613919746, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1757, "step": 2797 }, { "epoch": 5.0745862616186805, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1995, "step": 2798 }, { "epoch": 5.076399909317615, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1764, "step": 2799 }, { "epoch": 5.078213557016549, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.2832, "step": 2800 }, { "epoch": 5.080027204715484, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1956, "step": 2801 }, { "epoch": 5.081840852414419, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.2109, "step": 2802 }, { "epoch": 5.083654500113353, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.201, "step": 2803 }, { "epoch": 5.085468147812287, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2322, "step": 2804 }, { "epoch": 5.087281795511222, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.2117, "step": 2805 }, { "epoch": 5.087281795511222, "eval_loss": 1.9782476425170898, "eval_runtime": 185.8029, "eval_samples_per_second": 5.382, "eval_steps_per_second": 5.382, "step": 2805 }, { "epoch": 5.087281795511222, "mmlu_eval_accuracy": 0.29221673055810937, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.125, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.30303030303030304, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.37142857142857144, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.28823529411764703, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.18518518518518517, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 1.84882434957328, "step": 2805 }, { "epoch": 5.089095443210156, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.2895, "step": 2806 }, { "epoch": 5.090909090909091, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.3347, "step": 2807 }, { "epoch": 5.092722738608026, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2532, "step": 2808 }, { "epoch": 5.0945363863069595, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2739, "step": 2809 }, { "epoch": 5.096350034005894, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.2249, "step": 2810 }, { "epoch": 5.098163681704829, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.2009, "step": 2811 }, { "epoch": 5.099977329403763, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2693, "step": 2812 }, { "epoch": 5.101790977102698, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.267, "step": 2813 }, { "epoch": 5.1036046248016325, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1957, "step": 2814 }, { "epoch": 5.105418272500566, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2135, "step": 2815 }, { "epoch": 5.107231920199501, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1988, "step": 2816 }, { "epoch": 5.109045567898436, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2584, "step": 2817 }, { "epoch": 5.11085921559737, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1986, "step": 2818 }, { "epoch": 5.1126728632963045, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1761, "step": 2819 }, { "epoch": 5.114486510995239, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2162, "step": 2820 }, { "epoch": 5.116300158694174, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2517, "step": 2821 }, { "epoch": 5.118113806393108, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1832, "step": 2822 }, { "epoch": 5.119927454092043, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1565, "step": 2823 }, { "epoch": 5.1217411017909775, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2271, "step": 2824 }, { "epoch": 5.123554749489911, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.186, "step": 2825 }, { "epoch": 5.125368397188846, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.2105, "step": 2826 }, { "epoch": 5.127182044887781, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2061, "step": 2827 }, { "epoch": 5.128995692586715, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1783, "step": 2828 }, { "epoch": 5.13080934028565, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2156, "step": 2829 }, { "epoch": 5.132622987984584, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1852, "step": 2830 }, { "epoch": 5.134436635683518, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.208, "step": 2831 }, { "epoch": 5.136250283382453, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1694, "step": 2832 }, { "epoch": 5.138063931081388, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1993, "step": 2833 }, { "epoch": 5.139877578780322, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.2497, "step": 2834 }, { "epoch": 5.1416912264792565, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2131, "step": 2835 }, { "epoch": 5.143504874178191, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2029, "step": 2836 }, { "epoch": 5.145318521877125, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.2551, "step": 2837 }, { "epoch": 5.14713216957606, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2256, "step": 2838 }, { "epoch": 5.148945817274995, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1865, "step": 2839 }, { "epoch": 5.150759464973929, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1962, "step": 2840 }, { "epoch": 5.152573112672863, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.2495, "step": 2841 }, { "epoch": 5.154386760371798, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.2128, "step": 2842 }, { "epoch": 5.156200408070732, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1722, "step": 2843 }, { "epoch": 5.158014055769667, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1783, "step": 2844 }, { "epoch": 5.1598277034686015, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.183, "step": 2845 }, { "epoch": 5.161641351167535, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.2523, "step": 2846 }, { "epoch": 5.16345499886647, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2275, "step": 2847 }, { "epoch": 5.165268646565405, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.2511, "step": 2848 }, { "epoch": 5.167082294264339, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.2375, "step": 2849 }, { "epoch": 5.168895941963274, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.185, "step": 2850 }, { "epoch": 5.170709589662208, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.3063, "step": 2851 }, { "epoch": 5.172523237361142, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.2332, "step": 2852 }, { "epoch": 5.174336885060077, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.2399, "step": 2853 }, { "epoch": 5.176150532759012, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.2423, "step": 2854 }, { "epoch": 5.177964180457946, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2652, "step": 2855 }, { "epoch": 5.1797778281568805, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.252, "step": 2856 }, { "epoch": 5.181591475855815, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.3365, "step": 2857 }, { "epoch": 5.183405123554749, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.3889, "step": 2858 }, { "epoch": 5.185218771253684, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.286, "step": 2859 }, { "epoch": 5.187032418952619, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2108, "step": 2860 }, { "epoch": 5.188846066651553, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2173, "step": 2861 }, { "epoch": 5.190659714350487, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2429, "step": 2862 }, { "epoch": 5.192473362049422, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1987, "step": 2863 }, { "epoch": 5.194287009748356, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2139, "step": 2864 }, { "epoch": 5.196100657447291, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1587, "step": 2865 }, { "epoch": 5.1979143051462255, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2818, "step": 2866 }, { "epoch": 5.199727952845159, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.2121, "step": 2867 }, { "epoch": 5.201541600544094, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2362, "step": 2868 }, { "epoch": 5.203355248243029, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2325, "step": 2869 }, { "epoch": 5.205168895941963, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2207, "step": 2870 }, { "epoch": 5.206982543640898, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1958, "step": 2871 }, { "epoch": 5.208796191339832, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1868, "step": 2872 }, { "epoch": 5.210609839038767, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1603, "step": 2873 }, { "epoch": 5.212423486737701, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.2002, "step": 2874 }, { "epoch": 5.214237134436636, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1558, "step": 2875 }, { "epoch": 5.21605078213557, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1988, "step": 2876 }, { "epoch": 5.2178644298345045, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1541, "step": 2877 }, { "epoch": 5.219678077533439, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1796, "step": 2878 }, { "epoch": 5.221491725232374, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1922, "step": 2879 }, { "epoch": 5.223305372931308, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.2094, "step": 2880 }, { "epoch": 5.225119020630243, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2375, "step": 2881 }, { "epoch": 5.2269326683291775, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2088, "step": 2882 }, { "epoch": 5.228746316028111, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1888, "step": 2883 }, { "epoch": 5.230559963727046, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2081, "step": 2884 }, { "epoch": 5.232373611425981, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2138, "step": 2885 }, { "epoch": 5.234187259124915, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2753, "step": 2886 }, { "epoch": 5.2360009068238496, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.2249, "step": 2887 }, { "epoch": 5.237814554522784, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1974, "step": 2888 }, { "epoch": 5.239628202221718, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.2265, "step": 2889 }, { "epoch": 5.241441849920653, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1885, "step": 2890 }, { "epoch": 5.243255497619588, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.262, "step": 2891 }, { "epoch": 5.245069145318522, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1719, "step": 2892 }, { "epoch": 5.246882793017456, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1762, "step": 2893 }, { "epoch": 5.248696440716391, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.234, "step": 2894 }, { "epoch": 5.250510088415325, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.2784, "step": 2895 }, { "epoch": 5.25232373611426, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.2142, "step": 2896 }, { "epoch": 5.254137383813195, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2332, "step": 2897 }, { "epoch": 5.2559510315121285, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2209, "step": 2898 }, { "epoch": 5.257764679211063, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1885, "step": 2899 }, { "epoch": 5.259578326909998, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2041, "step": 2900 }, { "epoch": 5.261391974608932, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.2516, "step": 2901 }, { "epoch": 5.263205622307867, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.2337, "step": 2902 }, { "epoch": 5.2650192700068015, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2593, "step": 2903 }, { "epoch": 5.266832917705735, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.245, "step": 2904 }, { "epoch": 5.26864656540467, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.2906, "step": 2905 }, { "epoch": 5.270460213103605, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.249, "step": 2906 }, { "epoch": 5.272273860802539, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.3421, "step": 2907 }, { "epoch": 5.274087508501474, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2945, "step": 2908 }, { "epoch": 5.275901156200408, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.416, "step": 2909 }, { "epoch": 5.277714803899342, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2466, "step": 2910 }, { "epoch": 5.279528451598277, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.2909, "step": 2911 }, { "epoch": 5.281342099297212, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2604, "step": 2912 }, { "epoch": 5.283155746996146, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2351, "step": 2913 }, { "epoch": 5.28496939469508, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.2016, "step": 2914 }, { "epoch": 5.286783042394015, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2553, "step": 2915 }, { "epoch": 5.288596690092949, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.219, "step": 2916 }, { "epoch": 5.290410337791884, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2409, "step": 2917 }, { "epoch": 5.292223985490819, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.2343, "step": 2918 }, { "epoch": 5.2940376331897525, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1973, "step": 2919 }, { "epoch": 5.295851280888687, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.2101, "step": 2920 }, { "epoch": 5.297664928587622, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2153, "step": 2921 }, { "epoch": 5.299478576286556, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2441, "step": 2922 }, { "epoch": 5.301292223985491, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1971, "step": 2923 }, { "epoch": 5.3031058716844255, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2129, "step": 2924 }, { "epoch": 5.304919519383359, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.2113, "step": 2925 }, { "epoch": 5.306733167082294, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1893, "step": 2926 }, { "epoch": 5.308546814781229, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1901, "step": 2927 }, { "epoch": 5.310360462480164, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1977, "step": 2928 }, { "epoch": 5.312174110179098, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2253, "step": 2929 }, { "epoch": 5.313987757878032, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.226, "step": 2930 }, { "epoch": 5.315801405576966, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2188, "step": 2931 }, { "epoch": 5.317615053275901, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1885, "step": 2932 }, { "epoch": 5.319428700974836, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2411, "step": 2933 }, { "epoch": 5.3212423486737706, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2126, "step": 2934 }, { "epoch": 5.323055996372704, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1805, "step": 2935 }, { "epoch": 5.324869644071639, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.27, "step": 2936 }, { "epoch": 5.326683291770574, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.3706, "step": 2937 }, { "epoch": 5.328496939469508, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1865, "step": 2938 }, { "epoch": 5.330310587168443, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2125, "step": 2939 }, { "epoch": 5.332124234867377, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.2177, "step": 2940 }, { "epoch": 5.333937882566311, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.2188, "step": 2941 }, { "epoch": 5.335751530265246, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1829, "step": 2942 }, { "epoch": 5.337565177964181, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1505, "step": 2943 }, { "epoch": 5.339378825663115, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2542, "step": 2944 }, { "epoch": 5.3411924733620495, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1838, "step": 2945 }, { "epoch": 5.343006121060984, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2651, "step": 2946 }, { "epoch": 5.344819768759918, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.2089, "step": 2947 }, { "epoch": 5.346633416458853, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.3511, "step": 2948 }, { "epoch": 5.348447064157788, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.2186, "step": 2949 }, { "epoch": 5.350260711856722, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.2309, "step": 2950 }, { "epoch": 5.352074359555656, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.277, "step": 2951 }, { "epoch": 5.353888007254591, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2343, "step": 2952 }, { "epoch": 5.355701654953525, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.2177, "step": 2953 }, { "epoch": 5.35751530265246, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.2904, "step": 2954 }, { "epoch": 5.359328950351395, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2523, "step": 2955 }, { "epoch": 5.3611425980503284, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.2456, "step": 2956 }, { "epoch": 5.362956245749263, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.3646, "step": 2957 }, { "epoch": 5.364769893448198, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.3554, "step": 2958 }, { "epoch": 5.366583541147132, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2831, "step": 2959 }, { "epoch": 5.368397188846067, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2523, "step": 2960 }, { "epoch": 5.370210836545001, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2654, "step": 2961 }, { "epoch": 5.372024484243935, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2648, "step": 2962 }, { "epoch": 5.37383813194287, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2042, "step": 2963 }, { "epoch": 5.375651779641805, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.2242, "step": 2964 }, { "epoch": 5.377465427340739, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1923, "step": 2965 }, { "epoch": 5.3792790750396735, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2542, "step": 2966 }, { "epoch": 5.381092722738608, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2298, "step": 2967 }, { "epoch": 5.382906370437542, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2572, "step": 2968 }, { "epoch": 5.384720018136477, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.2434, "step": 2969 }, { "epoch": 5.386533665835412, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2291, "step": 2970 }, { "epoch": 5.388347313534346, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.2195, "step": 2971 }, { "epoch": 5.39016096123328, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2063, "step": 2972 }, { "epoch": 5.391974608932215, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2028, "step": 2973 }, { "epoch": 5.393788256631149, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.197, "step": 2974 }, { "epoch": 5.395601904330084, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1761, "step": 2975 }, { "epoch": 5.397415552029019, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.2065, "step": 2976 }, { "epoch": 5.3992291997279525, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1954, "step": 2977 }, { "epoch": 5.401042847426887, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.2387, "step": 2978 }, { "epoch": 5.402856495125822, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2813, "step": 2979 }, { "epoch": 5.404670142824756, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2253, "step": 2980 }, { "epoch": 5.406483790523691, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2008, "step": 2981 }, { "epoch": 5.408297438222625, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1843, "step": 2982 }, { "epoch": 5.41011108592156, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.188, "step": 2983 }, { "epoch": 5.411924733620494, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.285, "step": 2984 }, { "epoch": 5.413738381319429, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2102, "step": 2985 }, { "epoch": 5.415552029018363, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2239, "step": 2986 }, { "epoch": 5.4173656767172975, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.2867, "step": 2987 }, { "epoch": 5.419179324416232, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.25, "step": 2988 }, { "epoch": 5.420992972115167, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.2706, "step": 2989 }, { "epoch": 5.422806619814101, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.2387, "step": 2990 }, { "epoch": 5.424620267513036, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2283, "step": 2991 }, { "epoch": 5.42643391521197, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2571, "step": 2992 }, { "epoch": 5.42643391521197, "eval_loss": 1.9985148906707764, "eval_runtime": 185.7194, "eval_samples_per_second": 5.384, "eval_steps_per_second": 5.384, "step": 2992 }, { "epoch": 5.42643391521197, "mmlu_eval_accuracy": 0.2904643588605415, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.3793103448275862, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.5454545454545454, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.5, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.28823529411764703, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.18518518518518517, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.8757129754216588, "step": 2992 }, { "epoch": 5.428247562910904, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.2005, "step": 2993 }, { "epoch": 5.430061210609839, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2097, "step": 2994 }, { "epoch": 5.431874858308774, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2151, "step": 2995 }, { "epoch": 5.433688506007708, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.2334, "step": 2996 }, { "epoch": 5.435502153706643, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1939, "step": 2997 }, { "epoch": 5.437315801405577, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.194, "step": 2998 }, { "epoch": 5.439129449104511, "grad_norm": 1.8046875, "learning_rate": 0.0002, "loss": 0.2798, "step": 2999 }, { "epoch": 5.440943096803446, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.242, "step": 3000 }, { "epoch": 5.442756744502381, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.253, "step": 3001 }, { "epoch": 5.444570392201315, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2355, "step": 3002 }, { "epoch": 5.446384039900249, "grad_norm": 1.5390625, "learning_rate": 0.0002, "loss": 0.3418, "step": 3003 }, { "epoch": 5.448197687599184, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.3121, "step": 3004 }, { "epoch": 5.450011335298118, "grad_norm": 1.40625, "learning_rate": 0.0002, "loss": 0.277, "step": 3005 }, { "epoch": 5.451824982997053, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.3053, "step": 3006 }, { "epoch": 5.453638630695988, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.4156, "step": 3007 }, { "epoch": 5.4554522783949215, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.3292, "step": 3008 }, { "epoch": 5.457265926093856, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2201, "step": 3009 }, { "epoch": 5.459079573792791, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.2416, "step": 3010 }, { "epoch": 5.460893221491725, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2794, "step": 3011 }, { "epoch": 5.46270686919066, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1936, "step": 3012 }, { "epoch": 5.4645205168895945, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.2511, "step": 3013 }, { "epoch": 5.466334164588528, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2832, "step": 3014 }, { "epoch": 5.468147812287463, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2178, "step": 3015 }, { "epoch": 5.469961459986398, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2009, "step": 3016 }, { "epoch": 5.471775107685332, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2413, "step": 3017 }, { "epoch": 5.473588755384267, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.2538, "step": 3018 }, { "epoch": 5.475402403083201, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2828, "step": 3019 }, { "epoch": 5.477216050782135, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2655, "step": 3020 }, { "epoch": 5.47902969848107, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.2231, "step": 3021 }, { "epoch": 5.480843346180005, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1882, "step": 3022 }, { "epoch": 5.482656993878939, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2085, "step": 3023 }, { "epoch": 5.4844706415778735, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2228, "step": 3024 }, { "epoch": 5.486284289276808, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.2004, "step": 3025 }, { "epoch": 5.488097936975742, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.212, "step": 3026 }, { "epoch": 5.489911584674677, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2062, "step": 3027 }, { "epoch": 5.491725232373612, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2001, "step": 3028 }, { "epoch": 5.4935388800725455, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1858, "step": 3029 }, { "epoch": 5.49535252777148, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.2283, "step": 3030 }, { "epoch": 5.497166175470415, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2138, "step": 3031 }, { "epoch": 5.498979823169349, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1698, "step": 3032 }, { "epoch": 5.500793470868284, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2132, "step": 3033 }, { "epoch": 5.5026071185672185, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.2559, "step": 3034 }, { "epoch": 5.504420766266152, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1915, "step": 3035 }, { "epoch": 5.506234413965087, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2216, "step": 3036 }, { "epoch": 5.508048061664022, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.2733, "step": 3037 }, { "epoch": 5.509861709362957, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.2522, "step": 3038 }, { "epoch": 5.511675357061891, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2652, "step": 3039 }, { "epoch": 5.513489004760825, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.3558, "step": 3040 }, { "epoch": 5.515302652459759, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1823, "step": 3041 }, { "epoch": 5.517116300158694, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.205, "step": 3042 }, { "epoch": 5.518929947857629, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.2207, "step": 3043 }, { "epoch": 5.520743595556564, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.2054, "step": 3044 }, { "epoch": 5.5225572432554975, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.192, "step": 3045 }, { "epoch": 5.524370890954432, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2412, "step": 3046 }, { "epoch": 5.526184538653366, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1841, "step": 3047 }, { "epoch": 5.527998186352301, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2454, "step": 3048 }, { "epoch": 5.529811834051236, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2172, "step": 3049 }, { "epoch": 5.53162548175017, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.2225, "step": 3050 }, { "epoch": 5.533439129449104, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.2204, "step": 3051 }, { "epoch": 5.535252777148039, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2928, "step": 3052 }, { "epoch": 5.537066424846973, "grad_norm": 1.3984375, "learning_rate": 0.0002, "loss": 0.2524, "step": 3053 }, { "epoch": 5.538880072545908, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.2435, "step": 3054 }, { "epoch": 5.5406937202448425, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.2633, "step": 3055 }, { "epoch": 5.542507367943777, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.2965, "step": 3056 }, { "epoch": 5.544321015642711, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.3271, "step": 3057 }, { "epoch": 5.546134663341646, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.3055, "step": 3058 }, { "epoch": 5.547948311040581, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2782, "step": 3059 }, { "epoch": 5.549761958739515, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2737, "step": 3060 }, { "epoch": 5.551575606438449, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2603, "step": 3061 }, { "epoch": 5.553389254137384, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.2735, "step": 3062 }, { "epoch": 5.555202901836318, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2617, "step": 3063 }, { "epoch": 5.557016549535253, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.287, "step": 3064 }, { "epoch": 5.558830197234188, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2405, "step": 3065 }, { "epoch": 5.5606438449331215, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.3195, "step": 3066 }, { "epoch": 5.562457492632056, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.253, "step": 3067 }, { "epoch": 5.564271140330991, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2271, "step": 3068 }, { "epoch": 5.566084788029925, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2843, "step": 3069 }, { "epoch": 5.56789843572886, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2074, "step": 3070 }, { "epoch": 5.5697120834277944, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2489, "step": 3071 }, { "epoch": 5.571525731126728, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2091, "step": 3072 }, { "epoch": 5.573339378825663, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1896, "step": 3073 }, { "epoch": 5.575153026524598, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2163, "step": 3074 }, { "epoch": 5.576966674223532, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1853, "step": 3075 }, { "epoch": 5.5787803219224665, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2022, "step": 3076 }, { "epoch": 5.580593969621401, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2096, "step": 3077 }, { "epoch": 5.582407617320335, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2009, "step": 3078 }, { "epoch": 5.58422126501927, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.231, "step": 3079 }, { "epoch": 5.586034912718205, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2308, "step": 3080 }, { "epoch": 5.587848560417139, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2487, "step": 3081 }, { "epoch": 5.589662208116073, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1893, "step": 3082 }, { "epoch": 5.591475855815008, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2452, "step": 3083 }, { "epoch": 5.593289503513942, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.228, "step": 3084 }, { "epoch": 5.595103151212877, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1865, "step": 3085 }, { "epoch": 5.596916798911812, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2118, "step": 3086 }, { "epoch": 5.5987304466107455, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2045, "step": 3087 }, { "epoch": 5.60054409430968, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1888, "step": 3088 }, { "epoch": 5.602357742008615, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.3246, "step": 3089 }, { "epoch": 5.604171389707549, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.2002, "step": 3090 }, { "epoch": 5.605985037406484, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.2391, "step": 3091 }, { "epoch": 5.6077986851054185, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2486, "step": 3092 }, { "epoch": 5.609612332804353, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2067, "step": 3093 }, { "epoch": 5.611425980503287, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.2194, "step": 3094 }, { "epoch": 5.613239628202222, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.2279, "step": 3095 }, { "epoch": 5.615053275901156, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2242, "step": 3096 }, { "epoch": 5.6168669236000905, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.2444, "step": 3097 }, { "epoch": 5.618680571299025, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.2192, "step": 3098 }, { "epoch": 5.62049421899796, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2287, "step": 3099 }, { "epoch": 5.622307866696894, "grad_norm": 1.5390625, "learning_rate": 0.0002, "loss": 0.3556, "step": 3100 }, { "epoch": 5.624121514395829, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.2266, "step": 3101 }, { "epoch": 5.625935162094763, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2535, "step": 3102 }, { "epoch": 5.627748809793697, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.2076, "step": 3103 }, { "epoch": 5.629562457492632, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.2911, "step": 3104 }, { "epoch": 5.631376105191567, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2844, "step": 3105 }, { "epoch": 5.633189752890501, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.3458, "step": 3106 }, { "epoch": 5.635003400589436, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.4024, "step": 3107 }, { "epoch": 5.6368170482883695, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.34, "step": 3108 }, { "epoch": 5.638630695987304, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.2832, "step": 3109 }, { "epoch": 5.640444343686239, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.2198, "step": 3110 }, { "epoch": 5.642257991385174, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.217, "step": 3111 }, { "epoch": 5.644071639084108, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2181, "step": 3112 }, { "epoch": 5.6458852867830425, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.2732, "step": 3113 }, { "epoch": 5.647698934481977, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2679, "step": 3114 }, { "epoch": 5.649512582180911, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2285, "step": 3115 }, { "epoch": 5.651326229879846, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1911, "step": 3116 }, { "epoch": 5.653139877578781, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2164, "step": 3117 }, { "epoch": 5.6549535252777146, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.278, "step": 3118 }, { "epoch": 5.656767172976649, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.212, "step": 3119 }, { "epoch": 5.658580820675584, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2618, "step": 3120 }, { "epoch": 5.660394468374518, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2544, "step": 3121 }, { "epoch": 5.662208116073453, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2231, "step": 3122 }, { "epoch": 5.6640217637723875, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.2368, "step": 3123 }, { "epoch": 5.665835411471321, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2268, "step": 3124 }, { "epoch": 5.667649059170256, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.218, "step": 3125 }, { "epoch": 5.669462706869191, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2589, "step": 3126 }, { "epoch": 5.671276354568125, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.3165, "step": 3127 }, { "epoch": 5.67309000226706, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2587, "step": 3128 }, { "epoch": 5.674903649965994, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2547, "step": 3129 }, { "epoch": 5.676717297664928, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2297, "step": 3130 }, { "epoch": 5.678530945363863, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.2035, "step": 3131 }, { "epoch": 5.680344593062798, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.3295, "step": 3132 }, { "epoch": 5.682158240761732, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2981, "step": 3133 }, { "epoch": 5.6839718884606665, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.2091, "step": 3134 }, { "epoch": 5.685785536159601, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2325, "step": 3135 }, { "epoch": 5.687599183858535, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.2829, "step": 3136 }, { "epoch": 5.68941283155747, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.2446, "step": 3137 }, { "epoch": 5.691226479256405, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2585, "step": 3138 }, { "epoch": 5.693040126955339, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.2365, "step": 3139 }, { "epoch": 5.694853774654273, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1824, "step": 3140 }, { "epoch": 5.696667422353208, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.2794, "step": 3141 }, { "epoch": 5.698481070052142, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2024, "step": 3142 }, { "epoch": 5.700294717751077, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2505, "step": 3143 }, { "epoch": 5.7021083654500115, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.228, "step": 3144 }, { "epoch": 5.703922013148945, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2183, "step": 3145 }, { "epoch": 5.70573566084788, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.2266, "step": 3146 }, { "epoch": 5.707549308546815, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.2963, "step": 3147 }, { "epoch": 5.70936295624575, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.221, "step": 3148 }, { "epoch": 5.711176603944684, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.271, "step": 3149 }, { "epoch": 5.712990251643618, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.3057, "step": 3150 }, { "epoch": 5.714803899342552, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.2563, "step": 3151 }, { "epoch": 5.716617547041487, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.2606, "step": 3152 }, { "epoch": 5.718431194740422, "grad_norm": 1.390625, "learning_rate": 0.0002, "loss": 0.2433, "step": 3153 }, { "epoch": 5.720244842439357, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.3027, "step": 3154 }, { "epoch": 5.7220584901382905, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.2409, "step": 3155 }, { "epoch": 5.723872137837225, "grad_norm": 1.671875, "learning_rate": 0.0002, "loss": 0.3214, "step": 3156 }, { "epoch": 5.725685785536159, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.3303, "step": 3157 }, { "epoch": 5.727499433235094, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.4146, "step": 3158 }, { "epoch": 5.729313080934029, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.3529, "step": 3159 }, { "epoch": 5.7311267286329635, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.3143, "step": 3160 }, { "epoch": 5.732940376331897, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2276, "step": 3161 }, { "epoch": 5.734754024030832, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.236, "step": 3162 }, { "epoch": 5.736567671729766, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.2685, "step": 3163 }, { "epoch": 5.738381319428701, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.2611, "step": 3164 }, { "epoch": 5.7401949671276356, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2339, "step": 3165 }, { "epoch": 5.74200861482657, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2682, "step": 3166 }, { "epoch": 5.743822262525504, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2132, "step": 3167 }, { "epoch": 5.745635910224439, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.3008, "step": 3168 }, { "epoch": 5.747449557923374, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1923, "step": 3169 }, { "epoch": 5.749263205622308, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.3159, "step": 3170 }, { "epoch": 5.751076853321242, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2176, "step": 3171 }, { "epoch": 5.752890501020177, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.2031, "step": 3172 }, { "epoch": 5.754704148719111, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2215, "step": 3173 }, { "epoch": 5.756517796418046, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2458, "step": 3174 }, { "epoch": 5.758331444116981, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2014, "step": 3175 }, { "epoch": 5.7601450918159145, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1902, "step": 3176 }, { "epoch": 5.761958739514849, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2472, "step": 3177 }, { "epoch": 5.763772387213784, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.2481, "step": 3178 }, { "epoch": 5.765586034912718, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.3058, "step": 3179 }, { "epoch": 5.765586034912718, "eval_loss": 1.8565202951431274, "eval_runtime": 185.8468, "eval_samples_per_second": 5.381, "eval_steps_per_second": 5.381, "step": 3179 }, { "epoch": 5.765586034912718, "mmlu_eval_accuracy": 0.2957830236184773, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.34375, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.4, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.2608695652173913, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.45348837209302323, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 2.007021735300043, "step": 3179 }, { "epoch": 5.767399682611653, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.2731, "step": 3180 }, { "epoch": 5.7692133303105875, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2722, "step": 3181 }, { "epoch": 5.771026978009521, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1865, "step": 3182 }, { "epoch": 5.772840625708456, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.3341, "step": 3183 }, { "epoch": 5.774654273407391, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.3095, "step": 3184 }, { "epoch": 5.776467921106325, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2119, "step": 3185 }, { "epoch": 5.77828156880526, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1974, "step": 3186 }, { "epoch": 5.780095216504194, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2467, "step": 3187 }, { "epoch": 5.781908864203128, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.204, "step": 3188 }, { "epoch": 5.783722511902063, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.2313, "step": 3189 }, { "epoch": 5.785536159600998, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.2844, "step": 3190 }, { "epoch": 5.787349807299932, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.223, "step": 3191 }, { "epoch": 5.789163454998866, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.2703, "step": 3192 }, { "epoch": 5.790977102697801, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2264, "step": 3193 }, { "epoch": 5.792790750396735, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2275, "step": 3194 }, { "epoch": 5.79460439809567, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.242, "step": 3195 }, { "epoch": 5.796418045794605, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1761, "step": 3196 }, { "epoch": 5.7982316934935385, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2194, "step": 3197 }, { "epoch": 5.800045341192473, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.3412, "step": 3198 }, { "epoch": 5.801858988891408, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.3036, "step": 3199 }, { "epoch": 5.803672636590342, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.2607, "step": 3200 }, { "epoch": 5.805486284289277, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.3602, "step": 3201 }, { "epoch": 5.8072999319882115, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2634, "step": 3202 }, { "epoch": 5.809113579687146, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.2624, "step": 3203 }, { "epoch": 5.81092722738608, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2398, "step": 3204 }, { "epoch": 5.812740875085015, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2715, "step": 3205 }, { "epoch": 5.814554522783949, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2908, "step": 3206 }, { "epoch": 5.816368170482884, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.4199, "step": 3207 }, { "epoch": 5.818181818181818, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.5615, "step": 3208 }, { "epoch": 5.819995465880753, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2895, "step": 3209 }, { "epoch": 5.821809113579687, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2981, "step": 3210 }, { "epoch": 5.823622761278622, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2435, "step": 3211 }, { "epoch": 5.825436408977556, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2461, "step": 3212 }, { "epoch": 5.82725005667649, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.275, "step": 3213 }, { "epoch": 5.829063704375425, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2406, "step": 3214 }, { "epoch": 5.83087735207436, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.2569, "step": 3215 }, { "epoch": 5.832690999773294, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2589, "step": 3216 }, { "epoch": 5.834504647472229, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2118, "step": 3217 }, { "epoch": 5.8363182951711625, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2404, "step": 3218 }, { "epoch": 5.838131942870097, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2057, "step": 3219 }, { "epoch": 5.839945590569032, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2645, "step": 3220 }, { "epoch": 5.841759238267967, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.2352, "step": 3221 }, { "epoch": 5.843572885966901, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.2041, "step": 3222 }, { "epoch": 5.8453865336658355, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.262, "step": 3223 }, { "epoch": 5.84720018136477, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2131, "step": 3224 }, { "epoch": 5.849013829063704, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1754, "step": 3225 }, { "epoch": 5.850827476762639, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1921, "step": 3226 }, { "epoch": 5.852641124461574, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2948, "step": 3227 }, { "epoch": 5.854454772160508, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2409, "step": 3228 }, { "epoch": 5.856268419859442, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.2345, "step": 3229 }, { "epoch": 5.858082067558377, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1938, "step": 3230 }, { "epoch": 5.859895715257311, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.253, "step": 3231 }, { "epoch": 5.861709362956246, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1888, "step": 3232 }, { "epoch": 5.863523010655181, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2211, "step": 3233 }, { "epoch": 5.865336658354114, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.2567, "step": 3234 }, { "epoch": 5.867150306053049, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2331, "step": 3235 }, { "epoch": 5.868963953751984, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.227, "step": 3236 }, { "epoch": 5.870777601450918, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2477, "step": 3237 }, { "epoch": 5.872591249149853, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.2062, "step": 3238 }, { "epoch": 5.874404896848787, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2216, "step": 3239 }, { "epoch": 5.876218544547721, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.2817, "step": 3240 }, { "epoch": 5.878032192246656, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1952, "step": 3241 }, { "epoch": 5.879845839945591, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2657, "step": 3242 }, { "epoch": 5.881659487644525, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2605, "step": 3243 }, { "epoch": 5.8834731353434595, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.2511, "step": 3244 }, { "epoch": 5.885286783042394, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.2671, "step": 3245 }, { "epoch": 5.887100430741328, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.2368, "step": 3246 }, { "epoch": 5.888914078440263, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2465, "step": 3247 }, { "epoch": 5.890727726139198, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2175, "step": 3248 }, { "epoch": 5.892541373838132, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2673, "step": 3249 }, { "epoch": 5.894355021537066, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.2443, "step": 3250 }, { "epoch": 5.896168669236001, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2557, "step": 3251 }, { "epoch": 5.897982316934935, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.2588, "step": 3252 }, { "epoch": 5.89979596463387, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.3019, "step": 3253 }, { "epoch": 5.901609612332805, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.2783, "step": 3254 }, { "epoch": 5.9034232600317385, "grad_norm": 1.625, "learning_rate": 0.0002, "loss": 0.3273, "step": 3255 }, { "epoch": 5.905236907730673, "grad_norm": 1.5, "learning_rate": 0.0002, "loss": 0.3539, "step": 3256 }, { "epoch": 5.907050555429608, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.359, "step": 3257 }, { "epoch": 5.908864203128543, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.4534, "step": 3258 }, { "epoch": 5.910677850827477, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.3307, "step": 3259 }, { "epoch": 5.912491498526411, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.3545, "step": 3260 }, { "epoch": 5.914305146225345, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.2954, "step": 3261 }, { "epoch": 5.91611879392428, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.3186, "step": 3262 }, { "epoch": 5.917932441623215, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2442, "step": 3263 }, { "epoch": 5.91974608932215, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.2792, "step": 3264 }, { "epoch": 5.9215597370210835, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.2327, "step": 3265 }, { "epoch": 5.923373384720018, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2337, "step": 3266 }, { "epoch": 5.925187032418952, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2459, "step": 3267 }, { "epoch": 5.927000680117887, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2413, "step": 3268 }, { "epoch": 5.928814327816822, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.2818, "step": 3269 }, { "epoch": 5.9306279755157565, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2331, "step": 3270 }, { "epoch": 5.93244162321469, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2944, "step": 3271 }, { "epoch": 5.934255270913625, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2695, "step": 3272 }, { "epoch": 5.936068918612559, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2116, "step": 3273 }, { "epoch": 5.937882566311494, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1908, "step": 3274 }, { "epoch": 5.939696214010429, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2378, "step": 3275 }, { "epoch": 5.941509861709363, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2066, "step": 3276 }, { "epoch": 5.943323509408297, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.2163, "step": 3277 }, { "epoch": 5.945137157107232, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2038, "step": 3278 }, { "epoch": 5.946950804806167, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2084, "step": 3279 }, { "epoch": 5.948764452505101, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2387, "step": 3280 }, { "epoch": 5.950578100204035, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2147, "step": 3281 }, { "epoch": 5.95239174790297, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2001, "step": 3282 }, { "epoch": 5.954205395601904, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.2338, "step": 3283 }, { "epoch": 5.956019043300839, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2788, "step": 3284 }, { "epoch": 5.957832690999774, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.2728, "step": 3285 }, { "epoch": 5.9596463386987075, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.217, "step": 3286 }, { "epoch": 5.961459986397642, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2609, "step": 3287 }, { "epoch": 5.963273634096577, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2122, "step": 3288 }, { "epoch": 5.965087281795511, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.2295, "step": 3289 }, { "epoch": 5.966900929494446, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2759, "step": 3290 }, { "epoch": 5.9687145771933805, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.2171, "step": 3291 }, { "epoch": 5.970528224892314, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2475, "step": 3292 }, { "epoch": 5.972341872591249, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2614, "step": 3293 }, { "epoch": 5.974155520290184, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2633, "step": 3294 }, { "epoch": 5.975969167989118, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2797, "step": 3295 }, { "epoch": 5.977782815688053, "grad_norm": 1.3671875, "learning_rate": 0.0002, "loss": 0.2878, "step": 3296 }, { "epoch": 5.979596463386987, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.3308, "step": 3297 }, { "epoch": 5.981410111085921, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2663, "step": 3298 }, { "epoch": 5.983223758784856, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3711, "step": 3299 }, { "epoch": 5.985037406483791, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.3547, "step": 3300 }, { "epoch": 5.986851054182725, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.305, "step": 3301 }, { "epoch": 5.9886647018816594, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2541, "step": 3302 }, { "epoch": 5.990478349580594, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.3366, "step": 3303 }, { "epoch": 5.992291997279528, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.2513, "step": 3304 }, { "epoch": 5.994105644978463, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.273, "step": 3305 }, { "epoch": 5.995919292677398, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.3191, "step": 3306 }, { "epoch": 5.9977329403763315, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.3721, "step": 3307 }, { "epoch": 5.999546588075266, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2886, "step": 3308 }, { "epoch": 6.001360235774201, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.4488, "step": 3309 }, { "epoch": 6.003173883473135, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.152, "step": 3310 }, { "epoch": 6.00498753117207, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.1445, "step": 3311 }, { "epoch": 6.0068011788710045, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1578, "step": 3312 }, { "epoch": 6.008614826569938, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1535, "step": 3313 }, { "epoch": 6.010428474268873, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.1146, "step": 3314 }, { "epoch": 6.012242121967808, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.1246, "step": 3315 }, { "epoch": 6.014055769666742, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1406, "step": 3316 }, { "epoch": 6.015869417365677, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.12, "step": 3317 }, { "epoch": 6.017683065064611, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1833, "step": 3318 }, { "epoch": 6.019496712763545, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1205, "step": 3319 }, { "epoch": 6.02131036046248, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1414, "step": 3320 }, { "epoch": 6.023124008161415, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1351, "step": 3321 }, { "epoch": 6.024937655860349, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1262, "step": 3322 }, { "epoch": 6.0267513035592835, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1137, "step": 3323 }, { "epoch": 6.028564951258218, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1512, "step": 3324 }, { "epoch": 6.030378598957153, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1053, "step": 3325 }, { "epoch": 6.032192246656087, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1186, "step": 3326 }, { "epoch": 6.034005894355022, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1261, "step": 3327 }, { "epoch": 6.035819542053956, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1298, "step": 3328 }, { "epoch": 6.03763318975289, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1343, "step": 3329 }, { "epoch": 6.039446837451825, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1262, "step": 3330 }, { "epoch": 6.04126048515076, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1299, "step": 3331 }, { "epoch": 6.043074132849694, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1093, "step": 3332 }, { "epoch": 6.0448877805486285, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1403, "step": 3333 }, { "epoch": 6.046701428247563, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1308, "step": 3334 }, { "epoch": 6.048515075946497, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1225, "step": 3335 }, { "epoch": 6.050328723645432, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1381, "step": 3336 }, { "epoch": 6.052142371344367, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1811, "step": 3337 }, { "epoch": 6.053956019043301, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1284, "step": 3338 }, { "epoch": 6.055769666742235, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2448, "step": 3339 }, { "epoch": 6.05758331444117, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1523, "step": 3340 }, { "epoch": 6.059396962140104, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1487, "step": 3341 }, { "epoch": 6.061210609839039, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1803, "step": 3342 }, { "epoch": 6.063024257537974, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1386, "step": 3343 }, { "epoch": 6.0648379052369075, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1371, "step": 3344 }, { "epoch": 6.066651552935842, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1284, "step": 3345 }, { "epoch": 6.068465200634777, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1094, "step": 3346 }, { "epoch": 6.070278848333711, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.134, "step": 3347 }, { "epoch": 6.072092496032646, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1539, "step": 3348 }, { "epoch": 6.07390614373158, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.18, "step": 3349 }, { "epoch": 6.075719791430514, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1635, "step": 3350 }, { "epoch": 6.077533439129449, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2534, "step": 3351 }, { "epoch": 6.079347086828384, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1937, "step": 3352 }, { "epoch": 6.081160734527318, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1848, "step": 3353 }, { "epoch": 6.0829743822262525, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1773, "step": 3354 }, { "epoch": 6.084788029925187, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1597, "step": 3355 }, { "epoch": 6.086601677624121, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.1935, "step": 3356 }, { "epoch": 6.088415325323056, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2134, "step": 3357 }, { "epoch": 6.090228973021991, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.2398, "step": 3358 }, { "epoch": 6.092042620720925, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.3233, "step": 3359 }, { "epoch": 6.093856268419859, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1916, "step": 3360 }, { "epoch": 6.095669916118794, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1811, "step": 3361 }, { "epoch": 6.097483563817728, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1576, "step": 3362 }, { "epoch": 6.099297211516663, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1868, "step": 3363 }, { "epoch": 6.101110859215598, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1368, "step": 3364 }, { "epoch": 6.1029245069145315, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.16, "step": 3365 }, { "epoch": 6.104738154613466, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.154, "step": 3366 }, { "epoch": 6.104738154613466, "eval_loss": 2.078160047531128, "eval_runtime": 185.9369, "eval_samples_per_second": 5.378, "eval_steps_per_second": 5.378, "step": 3366 }, { "epoch": 6.104738154613466, "mmlu_eval_accuracy": 0.296329577021802, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.5142857142857142, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2823529411764706, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.7521704462600798, "step": 3366 }, { "epoch": 6.106551802312401, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.137, "step": 3367 }, { "epoch": 6.108365450011335, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1429, "step": 3368 }, { "epoch": 6.11017909771027, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.1173, "step": 3369 }, { "epoch": 6.1119927454092045, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1248, "step": 3370 }, { "epoch": 6.113806393108138, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1617, "step": 3371 }, { "epoch": 6.115620040807073, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1488, "step": 3372 }, { "epoch": 6.117433688506008, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1469, "step": 3373 }, { "epoch": 6.119247336204942, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1677, "step": 3374 }, { "epoch": 6.1210609839038765, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1146, "step": 3375 }, { "epoch": 6.122874631602811, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1332, "step": 3376 }, { "epoch": 6.124688279301745, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.1179, "step": 3377 }, { "epoch": 6.12650192700068, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1156, "step": 3378 }, { "epoch": 6.128315574699615, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1147, "step": 3379 }, { "epoch": 6.1301292223985495, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1276, "step": 3380 }, { "epoch": 6.131942870097483, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1523, "step": 3381 }, { "epoch": 6.133756517796418, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1631, "step": 3382 }, { "epoch": 6.135570165495353, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1587, "step": 3383 }, { "epoch": 6.137383813194287, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1326, "step": 3384 }, { "epoch": 6.139197460893222, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1212, "step": 3385 }, { "epoch": 6.141011108592156, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.171, "step": 3386 }, { "epoch": 6.14282475629109, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1689, "step": 3387 }, { "epoch": 6.144638403990025, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1409, "step": 3388 }, { "epoch": 6.14645205168896, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1481, "step": 3389 }, { "epoch": 6.148265699387894, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1304, "step": 3390 }, { "epoch": 6.1500793470868285, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1254, "step": 3391 }, { "epoch": 6.151892994785763, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1613, "step": 3392 }, { "epoch": 6.153706642484697, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1458, "step": 3393 }, { "epoch": 6.155520290183632, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1567, "step": 3394 }, { "epoch": 6.157333937882567, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.184, "step": 3395 }, { "epoch": 6.1591475855815006, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1885, "step": 3396 }, { "epoch": 6.160961233280435, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1369, "step": 3397 }, { "epoch": 6.16277488097937, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1842, "step": 3398 }, { "epoch": 6.164588528678304, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1953, "step": 3399 }, { "epoch": 6.166402176377239, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1685, "step": 3400 }, { "epoch": 6.1682158240761735, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1501, "step": 3401 }, { "epoch": 6.170029471775107, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.1979, "step": 3402 }, { "epoch": 6.171843119474042, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2154, "step": 3403 }, { "epoch": 6.173656767172977, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.224, "step": 3404 }, { "epoch": 6.175470414871911, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.2122, "step": 3405 }, { "epoch": 6.177284062570846, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.1811, "step": 3406 }, { "epoch": 6.17909771026978, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2332, "step": 3407 }, { "epoch": 6.180911357968714, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.272, "step": 3408 }, { "epoch": 6.182725005667649, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.2986, "step": 3409 }, { "epoch": 6.184538653366584, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2181, "step": 3410 }, { "epoch": 6.186352301065518, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2054, "step": 3411 }, { "epoch": 6.1881659487644525, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.201, "step": 3412 }, { "epoch": 6.189979596463387, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1607, "step": 3413 }, { "epoch": 6.191793244162321, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1558, "step": 3414 }, { "epoch": 6.193606891861256, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1431, "step": 3415 }, { "epoch": 6.195420539560191, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1474, "step": 3416 }, { "epoch": 6.197234187259125, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.172, "step": 3417 }, { "epoch": 6.199047834958059, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1578, "step": 3418 }, { "epoch": 6.200861482656994, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1376, "step": 3419 }, { "epoch": 6.202675130355928, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1515, "step": 3420 }, { "epoch": 6.204488778054863, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1909, "step": 3421 }, { "epoch": 6.2063024257537975, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1361, "step": 3422 }, { "epoch": 6.208116073452731, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1647, "step": 3423 }, { "epoch": 6.209929721151666, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1681, "step": 3424 }, { "epoch": 6.211743368850601, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1108, "step": 3425 }, { "epoch": 6.213557016549535, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1614, "step": 3426 }, { "epoch": 6.21537066424847, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1534, "step": 3427 }, { "epoch": 6.217184311947404, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1359, "step": 3428 }, { "epoch": 6.218997959646338, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1556, "step": 3429 }, { "epoch": 6.220811607345273, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1569, "step": 3430 }, { "epoch": 6.222625255044208, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.132, "step": 3431 }, { "epoch": 6.224438902743142, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1583, "step": 3432 }, { "epoch": 6.2262525504420765, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1166, "step": 3433 }, { "epoch": 6.228066198141011, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1764, "step": 3434 }, { "epoch": 6.229879845839946, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1399, "step": 3435 }, { "epoch": 6.23169349353888, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1705, "step": 3436 }, { "epoch": 6.233507141237815, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1436, "step": 3437 }, { "epoch": 6.2353207889367495, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1664, "step": 3438 }, { "epoch": 6.237134436635683, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1514, "step": 3439 }, { "epoch": 6.238948084334618, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1379, "step": 3440 }, { "epoch": 6.240761732033553, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1607, "step": 3441 }, { "epoch": 6.242575379732487, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1256, "step": 3442 }, { "epoch": 6.2443890274314215, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1744, "step": 3443 }, { "epoch": 6.246202675130356, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1599, "step": 3444 }, { "epoch": 6.24801632282929, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.201, "step": 3445 }, { "epoch": 6.249829970528225, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.146, "step": 3446 }, { "epoch": 6.25164361822716, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1996, "step": 3447 }, { "epoch": 6.253457265926094, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.1546, "step": 3448 }, { "epoch": 6.255270913625028, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.2255, "step": 3449 }, { "epoch": 6.257084561323963, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1747, "step": 3450 }, { "epoch": 6.258898209022897, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.184, "step": 3451 }, { "epoch": 6.260711856721832, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.2078, "step": 3452 }, { "epoch": 6.262525504420767, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1698, "step": 3453 }, { "epoch": 6.2643391521197005, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.2349, "step": 3454 }, { "epoch": 6.266152799818635, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.2176, "step": 3455 }, { "epoch": 6.26796644751757, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2143, "step": 3456 }, { "epoch": 6.269780095216504, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.2532, "step": 3457 }, { "epoch": 6.271593742915439, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.3537, "step": 3458 }, { "epoch": 6.2734073906143735, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.2499, "step": 3459 }, { "epoch": 6.275221038313307, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1837, "step": 3460 }, { "epoch": 6.277034686012242, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1455, "step": 3461 }, { "epoch": 6.278848333711177, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.192, "step": 3462 }, { "epoch": 6.280661981410111, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1726, "step": 3463 }, { "epoch": 6.282475629109046, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1896, "step": 3464 }, { "epoch": 6.28428927680798, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1644, "step": 3465 }, { "epoch": 6.286102924506914, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1514, "step": 3466 }, { "epoch": 6.287916572205849, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1548, "step": 3467 }, { "epoch": 6.289730219904784, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1456, "step": 3468 }, { "epoch": 6.291543867603718, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1595, "step": 3469 }, { "epoch": 6.293357515302652, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1631, "step": 3470 }, { "epoch": 6.295171163001587, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1723, "step": 3471 }, { "epoch": 6.296984810700521, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.162, "step": 3472 }, { "epoch": 6.298798458399456, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1318, "step": 3473 }, { "epoch": 6.300612106098391, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1488, "step": 3474 }, { "epoch": 6.3024257537973245, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1912, "step": 3475 }, { "epoch": 6.304239401496259, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1404, "step": 3476 }, { "epoch": 6.306053049195194, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1413, "step": 3477 }, { "epoch": 6.307866696894128, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1496, "step": 3478 }, { "epoch": 6.309680344593063, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1422, "step": 3479 }, { "epoch": 6.3114939922919975, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1355, "step": 3480 }, { "epoch": 6.313307639990931, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1525, "step": 3481 }, { "epoch": 6.315121287689866, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1636, "step": 3482 }, { "epoch": 6.316934935388801, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1376, "step": 3483 }, { "epoch": 6.318748583087735, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1694, "step": 3484 }, { "epoch": 6.32056223078667, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1302, "step": 3485 }, { "epoch": 6.322375878485604, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1547, "step": 3486 }, { "epoch": 6.324189526184538, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1401, "step": 3487 }, { "epoch": 6.326003173883473, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2089, "step": 3488 }, { "epoch": 6.327816821582408, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1519, "step": 3489 }, { "epoch": 6.3296304692813425, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1573, "step": 3490 }, { "epoch": 6.331444116980276, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1934, "step": 3491 }, { "epoch": 6.333257764679211, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1865, "step": 3492 }, { "epoch": 6.335071412378145, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1725, "step": 3493 }, { "epoch": 6.33688506007708, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1551, "step": 3494 }, { "epoch": 6.338698707776015, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1498, "step": 3495 }, { "epoch": 6.340512355474949, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1343, "step": 3496 }, { "epoch": 6.342326003173883, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.1919, "step": 3497 }, { "epoch": 6.344139650872818, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1846, "step": 3498 }, { "epoch": 6.345953298571753, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.224, "step": 3499 }, { "epoch": 6.347766946270687, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1798, "step": 3500 }, { "epoch": 6.3495805939696215, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.2652, "step": 3501 }, { "epoch": 6.351394241668556, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.246, "step": 3502 }, { "epoch": 6.35320788936749, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2114, "step": 3503 }, { "epoch": 6.355021537066425, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2071, "step": 3504 }, { "epoch": 6.35683518476536, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.2298, "step": 3505 }, { "epoch": 6.358648832464294, "grad_norm": 1.3125, "learning_rate": 0.0002, "loss": 0.2317, "step": 3506 }, { "epoch": 6.360462480163228, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2204, "step": 3507 }, { "epoch": 6.362276127862163, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.2605, "step": 3508 }, { "epoch": 6.364089775561097, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.3016, "step": 3509 }, { "epoch": 6.365903423260032, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2236, "step": 3510 }, { "epoch": 6.3677170709589666, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1969, "step": 3511 }, { "epoch": 6.3695307186579, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1934, "step": 3512 }, { "epoch": 6.371344366356835, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1603, "step": 3513 }, { "epoch": 6.37315801405577, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.1422, "step": 3514 }, { "epoch": 6.374971661754704, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1735, "step": 3515 }, { "epoch": 6.376785309453639, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1758, "step": 3516 }, { "epoch": 6.378598957152573, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1384, "step": 3517 }, { "epoch": 6.380412604851507, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1594, "step": 3518 }, { "epoch": 6.382226252550442, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1612, "step": 3519 }, { "epoch": 6.384039900249377, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.2633, "step": 3520 }, { "epoch": 6.385853547948311, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1617, "step": 3521 }, { "epoch": 6.3876671956472455, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1528, "step": 3522 }, { "epoch": 6.38948084334618, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.134, "step": 3523 }, { "epoch": 6.391294491045114, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1439, "step": 3524 }, { "epoch": 6.393108138744049, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1648, "step": 3525 }, { "epoch": 6.394921786442984, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1286, "step": 3526 }, { "epoch": 6.396735434141918, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1344, "step": 3527 }, { "epoch": 6.398549081840852, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1324, "step": 3528 }, { "epoch": 6.400362729539787, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1214, "step": 3529 }, { "epoch": 6.402176377238721, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1549, "step": 3530 }, { "epoch": 6.403990024937656, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1517, "step": 3531 }, { "epoch": 6.405803672636591, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1582, "step": 3532 }, { "epoch": 6.4076173203355244, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1465, "step": 3533 }, { "epoch": 6.409430968034459, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.201, "step": 3534 }, { "epoch": 6.411244615733394, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1957, "step": 3535 }, { "epoch": 6.413058263432328, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1452, "step": 3536 }, { "epoch": 6.414871911131263, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1694, "step": 3537 }, { "epoch": 6.416685558830197, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.185, "step": 3538 }, { "epoch": 6.418499206529131, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1651, "step": 3539 }, { "epoch": 6.420312854228066, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.175, "step": 3540 }, { "epoch": 6.422126501927001, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1841, "step": 3541 }, { "epoch": 6.423940149625935, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1701, "step": 3542 }, { "epoch": 6.4257537973248695, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1539, "step": 3543 }, { "epoch": 6.427567445023804, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1763, "step": 3544 }, { "epoch": 6.429381092722739, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1938, "step": 3545 }, { "epoch": 6.431194740421673, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1571, "step": 3546 }, { "epoch": 6.433008388120608, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2028, "step": 3547 }, { "epoch": 6.434822035819542, "grad_norm": 1.4921875, "learning_rate": 0.0002, "loss": 0.1703, "step": 3548 }, { "epoch": 6.436635683518476, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.24, "step": 3549 }, { "epoch": 6.438449331217411, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1733, "step": 3550 }, { "epoch": 6.440262978916346, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2311, "step": 3551 }, { "epoch": 6.44207662661528, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.169, "step": 3552 }, { "epoch": 6.443890274314215, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.2155, "step": 3553 }, { "epoch": 6.443890274314215, "eval_loss": 2.1139395236968994, "eval_runtime": 185.8936, "eval_samples_per_second": 5.379, "eval_steps_per_second": 5.379, "step": 3553 }, { "epoch": 6.443890274314215, "mmlu_eval_accuracy": 0.3015329881779265, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.4, "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.29411764705882354, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.4074074074074074, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.9459492966951224, "step": 3553 }, { "epoch": 6.445703922013149, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.226, "step": 3554 }, { "epoch": 6.447517569712083, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2324, "step": 3555 }, { "epoch": 6.449331217411018, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.2297, "step": 3556 }, { "epoch": 6.451144865109953, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2344, "step": 3557 }, { "epoch": 6.452958512808887, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.2981, "step": 3558 }, { "epoch": 6.454772160507821, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.2781, "step": 3559 }, { "epoch": 6.456585808206756, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.2138, "step": 3560 }, { "epoch": 6.45839945590569, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1753, "step": 3561 }, { "epoch": 6.460213103604625, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1985, "step": 3562 }, { "epoch": 6.46202675130356, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.213, "step": 3563 }, { "epoch": 6.4638403990024935, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1818, "step": 3564 }, { "epoch": 6.465654046701428, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.2121, "step": 3565 }, { "epoch": 6.467467694400363, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1721, "step": 3566 }, { "epoch": 6.469281342099297, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.2459, "step": 3567 }, { "epoch": 6.471094989798232, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1399, "step": 3568 }, { "epoch": 6.4729086374971665, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.166, "step": 3569 }, { "epoch": 6.4747222851961, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1595, "step": 3570 }, { "epoch": 6.476535932895035, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.144, "step": 3571 }, { "epoch": 6.47834958059397, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1753, "step": 3572 }, { "epoch": 6.480163228292904, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.177, "step": 3573 }, { "epoch": 6.481976875991839, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1521, "step": 3574 }, { "epoch": 6.483790523690773, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1534, "step": 3575 }, { "epoch": 6.485604171389707, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1426, "step": 3576 }, { "epoch": 6.487417819088642, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1512, "step": 3577 }, { "epoch": 6.489231466787577, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.144, "step": 3578 }, { "epoch": 6.491045114486511, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.149, "step": 3579 }, { "epoch": 6.492858762185445, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.158, "step": 3580 }, { "epoch": 6.49467240988438, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.156, "step": 3581 }, { "epoch": 6.496486057583314, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1475, "step": 3582 }, { "epoch": 6.498299705282249, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1309, "step": 3583 }, { "epoch": 6.500113352981184, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2009, "step": 3584 }, { "epoch": 6.5019270006801175, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1435, "step": 3585 }, { "epoch": 6.503740648379052, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1299, "step": 3586 }, { "epoch": 6.505554296077987, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1637, "step": 3587 }, { "epoch": 6.507367943776921, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.172, "step": 3588 }, { "epoch": 6.509181591475856, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1591, "step": 3589 }, { "epoch": 6.5109952391747905, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1388, "step": 3590 }, { "epoch": 6.512808886873724, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1819, "step": 3591 }, { "epoch": 6.514622534572659, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1749, "step": 3592 }, { "epoch": 6.516436182271594, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1533, "step": 3593 }, { "epoch": 6.518249829970529, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1834, "step": 3594 }, { "epoch": 6.520063477669463, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1723, "step": 3595 }, { "epoch": 6.521877125368397, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1824, "step": 3596 }, { "epoch": 6.523690773067331, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1605, "step": 3597 }, { "epoch": 6.525504420766266, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1545, "step": 3598 }, { "epoch": 6.527318068465201, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2321, "step": 3599 }, { "epoch": 6.529131716164136, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1962, "step": 3600 }, { "epoch": 6.5309453638630695, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1564, "step": 3601 }, { "epoch": 6.532759011562004, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1917, "step": 3602 }, { "epoch": 6.534572659260938, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.1969, "step": 3603 }, { "epoch": 6.536386306959873, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.2246, "step": 3604 }, { "epoch": 6.538199954658808, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.2516, "step": 3605 }, { "epoch": 6.540013602357742, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2035, "step": 3606 }, { "epoch": 6.541827250056676, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2626, "step": 3607 }, { "epoch": 6.543640897755611, "grad_norm": 1.9453125, "learning_rate": 0.0002, "loss": 0.335, "step": 3608 }, { "epoch": 6.545454545454545, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2147, "step": 3609 }, { "epoch": 6.54726819315348, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2371, "step": 3610 }, { "epoch": 6.5490818408524145, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1776, "step": 3611 }, { "epoch": 6.550895488551349, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1638, "step": 3612 }, { "epoch": 6.552709136250283, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1665, "step": 3613 }, { "epoch": 6.554522783949218, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1789, "step": 3614 }, { "epoch": 6.556336431648153, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.2241, "step": 3615 }, { "epoch": 6.558150079347087, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1375, "step": 3616 }, { "epoch": 6.559963727046021, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1314, "step": 3617 }, { "epoch": 6.561777374744956, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1717, "step": 3618 }, { "epoch": 6.56359102244389, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1978, "step": 3619 }, { "epoch": 6.565404670142825, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1808, "step": 3620 }, { "epoch": 6.56721831784176, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1759, "step": 3621 }, { "epoch": 6.5690319655406935, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1873, "step": 3622 }, { "epoch": 6.570845613239628, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1466, "step": 3623 }, { "epoch": 6.572659260938563, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.1394, "step": 3624 }, { "epoch": 6.574472908637497, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1737, "step": 3625 }, { "epoch": 6.576286556336432, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1495, "step": 3626 }, { "epoch": 6.578100204035366, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1362, "step": 3627 }, { "epoch": 6.5799138517343, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1497, "step": 3628 }, { "epoch": 6.581727499433235, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1502, "step": 3629 }, { "epoch": 6.58354114713217, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1421, "step": 3630 }, { "epoch": 6.585354794831104, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.176, "step": 3631 }, { "epoch": 6.5871684425300385, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1656, "step": 3632 }, { "epoch": 6.588982090228973, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1328, "step": 3633 }, { "epoch": 6.590795737927907, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1426, "step": 3634 }, { "epoch": 6.592609385626842, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1307, "step": 3635 }, { "epoch": 6.594423033325777, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1995, "step": 3636 }, { "epoch": 6.596236681024711, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1946, "step": 3637 }, { "epoch": 6.598050328723645, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1857, "step": 3638 }, { "epoch": 6.59986397642258, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1552, "step": 3639 }, { "epoch": 6.601677624121514, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.2101, "step": 3640 }, { "epoch": 6.603491271820449, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.1982, "step": 3641 }, { "epoch": 6.605304919519384, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2406, "step": 3642 }, { "epoch": 6.6071185672183175, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1484, "step": 3643 }, { "epoch": 6.608932214917252, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1726, "step": 3644 }, { "epoch": 6.610745862616187, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.188, "step": 3645 }, { "epoch": 6.612559510315121, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.22, "step": 3646 }, { "epoch": 6.614373158014056, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.1823, "step": 3647 }, { "epoch": 6.6161868057129904, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1812, "step": 3648 }, { "epoch": 6.618000453411925, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1775, "step": 3649 }, { "epoch": 6.619814101110859, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.2237, "step": 3650 }, { "epoch": 6.621627748809794, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2408, "step": 3651 }, { "epoch": 6.623441396508728, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1699, "step": 3652 }, { "epoch": 6.6252550442076625, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2326, "step": 3653 }, { "epoch": 6.627068691906597, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2259, "step": 3654 }, { "epoch": 6.628882339605532, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1806, "step": 3655 }, { "epoch": 6.630695987304466, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1962, "step": 3656 }, { "epoch": 6.632509635003401, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2128, "step": 3657 }, { "epoch": 6.634323282702335, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.3259, "step": 3658 }, { "epoch": 6.636136930401269, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.3368, "step": 3659 }, { "epoch": 6.637950578100204, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2184, "step": 3660 }, { "epoch": 6.639764225799139, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.2327, "step": 3661 }, { "epoch": 6.641577873498073, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1583, "step": 3662 }, { "epoch": 6.643391521197008, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1955, "step": 3663 }, { "epoch": 6.6452051688959415, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1711, "step": 3664 }, { "epoch": 6.647018816594876, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1672, "step": 3665 }, { "epoch": 6.648832464293811, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.2026, "step": 3666 }, { "epoch": 6.650646111992746, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1441, "step": 3667 }, { "epoch": 6.65245975969168, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1726, "step": 3668 }, { "epoch": 6.6542734073906145, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1706, "step": 3669 }, { "epoch": 6.656087055089549, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1819, "step": 3670 }, { "epoch": 6.657900702788483, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1965, "step": 3671 }, { "epoch": 6.659714350487418, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1735, "step": 3672 }, { "epoch": 6.661527998186353, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1395, "step": 3673 }, { "epoch": 6.6633416458852865, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1585, "step": 3674 }, { "epoch": 6.665155293584221, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1723, "step": 3675 }, { "epoch": 6.666968941283156, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.148, "step": 3676 }, { "epoch": 6.66878258898209, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1514, "step": 3677 }, { "epoch": 6.670596236681025, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.191, "step": 3678 }, { "epoch": 6.6724098843799595, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.2128, "step": 3679 }, { "epoch": 6.674223532078893, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1488, "step": 3680 }, { "epoch": 6.676037179777828, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1612, "step": 3681 }, { "epoch": 6.677850827476763, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1645, "step": 3682 }, { "epoch": 6.679664475175697, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1539, "step": 3683 }, { "epoch": 6.681478122874632, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1672, "step": 3684 }, { "epoch": 6.683291770573566, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1599, "step": 3685 }, { "epoch": 6.6851054182725, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1783, "step": 3686 }, { "epoch": 6.686919065971435, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1904, "step": 3687 }, { "epoch": 6.68873271367037, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.194, "step": 3688 }, { "epoch": 6.690546361369304, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1579, "step": 3689 }, { "epoch": 6.6923600090682385, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1877, "step": 3690 }, { "epoch": 6.694173656767173, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1808, "step": 3691 }, { "epoch": 6.695987304466107, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1589, "step": 3692 }, { "epoch": 6.697800952165042, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1989, "step": 3693 }, { "epoch": 6.699614599863977, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1734, "step": 3694 }, { "epoch": 6.701428247562911, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.1539, "step": 3695 }, { "epoch": 6.703241895261845, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.1838, "step": 3696 }, { "epoch": 6.70505554296078, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1955, "step": 3697 }, { "epoch": 6.706869190659714, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.1629, "step": 3698 }, { "epoch": 6.708682838358649, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1978, "step": 3699 }, { "epoch": 6.7104964860575835, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.2382, "step": 3700 }, { "epoch": 6.712310133756517, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2042, "step": 3701 }, { "epoch": 6.714123781455452, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.2475, "step": 3702 }, { "epoch": 6.715937429154387, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.2747, "step": 3703 }, { "epoch": 6.717751076853322, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.2156, "step": 3704 }, { "epoch": 6.719564724552256, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.2331, "step": 3705 }, { "epoch": 6.72137837225119, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.2623, "step": 3706 }, { "epoch": 6.723192019950124, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2947, "step": 3707 }, { "epoch": 6.725005667649059, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.3019, "step": 3708 }, { "epoch": 6.726819315347994, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.3331, "step": 3709 }, { "epoch": 6.728632963046929, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.2483, "step": 3710 }, { "epoch": 6.7304466107458625, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2224, "step": 3711 }, { "epoch": 6.732260258444797, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.2239, "step": 3712 }, { "epoch": 6.734073906143731, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2002, "step": 3713 }, { "epoch": 6.735887553842666, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.2036, "step": 3714 }, { "epoch": 6.737701201541601, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1947, "step": 3715 }, { "epoch": 6.7395148492405355, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1724, "step": 3716 }, { "epoch": 6.741328496939469, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1778, "step": 3717 }, { "epoch": 6.743142144638404, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.181, "step": 3718 }, { "epoch": 6.744955792337338, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.203, "step": 3719 }, { "epoch": 6.746769440036273, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1788, "step": 3720 }, { "epoch": 6.7485830877352075, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1989, "step": 3721 }, { "epoch": 6.750396735434142, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1978, "step": 3722 }, { "epoch": 6.752210383133076, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1518, "step": 3723 }, { "epoch": 6.754024030832011, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1667, "step": 3724 }, { "epoch": 6.755837678530946, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1771, "step": 3725 }, { "epoch": 6.75765132622988, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1684, "step": 3726 }, { "epoch": 6.759464973928814, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1466, "step": 3727 }, { "epoch": 6.761278621627749, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.154, "step": 3728 }, { "epoch": 6.763092269326683, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.2466, "step": 3729 }, { "epoch": 6.764905917025618, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1886, "step": 3730 }, { "epoch": 6.766719564724553, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1656, "step": 3731 }, { "epoch": 6.7685332124234865, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1721, "step": 3732 }, { "epoch": 6.770346860122421, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.2044, "step": 3733 }, { "epoch": 6.772160507821356, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.2048, "step": 3734 }, { "epoch": 6.77397415552029, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1708, "step": 3735 }, { "epoch": 6.775787803219225, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1594, "step": 3736 }, { "epoch": 6.7776014509181595, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1554, "step": 3737 }, { "epoch": 6.779415098617093, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1681, "step": 3738 }, { "epoch": 6.781228746316028, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1877, "step": 3739 }, { "epoch": 6.783042394014963, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1508, "step": 3740 }, { "epoch": 6.783042394014963, "eval_loss": 2.0257790088653564, "eval_runtime": 186.0801, "eval_samples_per_second": 5.374, "eval_steps_per_second": 5.374, "step": 3740 }, { "epoch": 6.783042394014963, "mmlu_eval_accuracy": 0.28886478977209173, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.15384615384615385, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.09090909090909091, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.23076923076923078, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.36, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3488372093023256, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.21764705882352942, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.25, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.1585750586452397, "step": 3740 }, { "epoch": 6.784856041713897, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.2158, "step": 3741 }, { "epoch": 6.7866696894128316, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.191, "step": 3742 }, { "epoch": 6.788483337111766, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1501, "step": 3743 }, { "epoch": 6.7902969848107, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1744, "step": 3744 }, { "epoch": 6.792110632509635, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.166, "step": 3745 }, { "epoch": 6.79392428020857, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1818, "step": 3746 }, { "epoch": 6.795737927907504, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1864, "step": 3747 }, { "epoch": 6.797551575606438, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.1842, "step": 3748 }, { "epoch": 6.799365223305373, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2256, "step": 3749 }, { "epoch": 6.801178871004307, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1797, "step": 3750 }, { "epoch": 6.802992518703242, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2461, "step": 3751 }, { "epoch": 6.804806166402177, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2174, "step": 3752 }, { "epoch": 6.8066198141011105, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2174, "step": 3753 }, { "epoch": 6.808433461800045, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.2218, "step": 3754 }, { "epoch": 6.81024710949898, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1964, "step": 3755 }, { "epoch": 6.812060757197914, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2294, "step": 3756 }, { "epoch": 6.813874404896849, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 0.2787, "step": 3757 }, { "epoch": 6.8156880525957835, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.3492, "step": 3758 }, { "epoch": 6.817501700294718, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2985, "step": 3759 }, { "epoch": 6.819315347993652, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2378, "step": 3760 }, { "epoch": 6.821128995692587, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2069, "step": 3761 }, { "epoch": 6.822942643391521, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1662, "step": 3762 }, { "epoch": 6.824756291090456, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.174, "step": 3763 }, { "epoch": 6.82656993878939, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.214, "step": 3764 }, { "epoch": 6.828383586488325, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.2062, "step": 3765 }, { "epoch": 6.830197234187259, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1924, "step": 3766 }, { "epoch": 6.832010881886194, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1539, "step": 3767 }, { "epoch": 6.833824529585128, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1756, "step": 3768 }, { "epoch": 6.835638177284062, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.2218, "step": 3769 }, { "epoch": 6.837451824982997, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1891, "step": 3770 }, { "epoch": 6.839265472681932, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1879, "step": 3771 }, { "epoch": 6.841079120380866, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.2353, "step": 3772 }, { "epoch": 6.842892768079801, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1729, "step": 3773 }, { "epoch": 6.8447064157787345, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1872, "step": 3774 }, { "epoch": 6.846520063477669, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.2257, "step": 3775 }, { "epoch": 6.848333711176604, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1575, "step": 3776 }, { "epoch": 6.850147358875539, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1425, "step": 3777 }, { "epoch": 6.851961006574473, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1496, "step": 3778 }, { "epoch": 6.8537746542734075, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1596, "step": 3779 }, { "epoch": 6.855588301972342, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.2061, "step": 3780 }, { "epoch": 6.857401949671276, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1298, "step": 3781 }, { "epoch": 6.859215597370211, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.198, "step": 3782 }, { "epoch": 6.861029245069146, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1614, "step": 3783 }, { "epoch": 6.86284289276808, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1832, "step": 3784 }, { "epoch": 6.864656540467014, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1955, "step": 3785 }, { "epoch": 6.866470188165949, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1872, "step": 3786 }, { "epoch": 6.868283835864883, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.3287, "step": 3787 }, { "epoch": 6.870097483563818, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1849, "step": 3788 }, { "epoch": 6.8719111312627525, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1531, "step": 3789 }, { "epoch": 6.873724778961686, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1592, "step": 3790 }, { "epoch": 6.875538426660621, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.2778, "step": 3791 }, { "epoch": 6.877352074359556, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1626, "step": 3792 }, { "epoch": 6.87916572205849, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1793, "step": 3793 }, { "epoch": 6.880979369757425, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1956, "step": 3794 }, { "epoch": 6.882793017456359, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.2228, "step": 3795 }, { "epoch": 6.884606665155293, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1929, "step": 3796 }, { "epoch": 6.886420312854228, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.2118, "step": 3797 }, { "epoch": 6.888233960553163, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.2007, "step": 3798 }, { "epoch": 6.890047608252097, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2248, "step": 3799 }, { "epoch": 6.8918612559510315, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2142, "step": 3800 }, { "epoch": 6.893674903649966, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.2545, "step": 3801 }, { "epoch": 6.8954885513489, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.2132, "step": 3802 }, { "epoch": 6.897302199047835, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.283, "step": 3803 }, { "epoch": 6.89911584674677, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.2219, "step": 3804 }, { "epoch": 6.900929494445704, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2722, "step": 3805 }, { "epoch": 6.902743142144638, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.2447, "step": 3806 }, { "epoch": 6.904556789843573, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2148, "step": 3807 }, { "epoch": 6.906370437542507, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.2842, "step": 3808 }, { "epoch": 6.908184085241442, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.3918, "step": 3809 }, { "epoch": 6.909997732940377, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.221, "step": 3810 }, { "epoch": 6.91181138063931, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.23, "step": 3811 }, { "epoch": 6.913625028338245, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.2142, "step": 3812 }, { "epoch": 6.91543867603718, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1739, "step": 3813 }, { "epoch": 6.917252323736114, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1823, "step": 3814 }, { "epoch": 6.919065971435049, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1809, "step": 3815 }, { "epoch": 6.920879619133983, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1872, "step": 3816 }, { "epoch": 6.922693266832917, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1929, "step": 3817 }, { "epoch": 6.924506914531852, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.2159, "step": 3818 }, { "epoch": 6.926320562230787, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1902, "step": 3819 }, { "epoch": 6.928134209929722, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.2128, "step": 3820 }, { "epoch": 6.9299478576286555, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1936, "step": 3821 }, { "epoch": 6.93176150532759, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1801, "step": 3822 }, { "epoch": 6.933575153026524, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.2062, "step": 3823 }, { "epoch": 6.935388800725459, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1737, "step": 3824 }, { "epoch": 6.937202448424394, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1799, "step": 3825 }, { "epoch": 6.9390160961233285, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1659, "step": 3826 }, { "epoch": 6.940829743822262, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1576, "step": 3827 }, { "epoch": 6.942643391521197, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1887, "step": 3828 }, { "epoch": 6.944457039220131, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1574, "step": 3829 }, { "epoch": 6.946270686919066, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1636, "step": 3830 }, { "epoch": 6.948084334618001, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.161, "step": 3831 }, { "epoch": 6.949897982316935, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1629, "step": 3832 }, { "epoch": 6.951711630015869, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1702, "step": 3833 }, { "epoch": 6.953525277714804, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.182, "step": 3834 }, { "epoch": 6.955338925413738, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1785, "step": 3835 }, { "epoch": 6.957152573112673, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1741, "step": 3836 }, { "epoch": 6.958966220811607, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1691, "step": 3837 }, { "epoch": 6.960779868510542, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.183, "step": 3838 }, { "epoch": 6.962593516209476, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1962, "step": 3839 }, { "epoch": 6.964407163908411, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1483, "step": 3840 }, { "epoch": 6.966220811607346, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1742, "step": 3841 }, { "epoch": 6.9680344593062795, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1685, "step": 3842 }, { "epoch": 6.969848107005214, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1932, "step": 3843 }, { "epoch": 6.971661754704149, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.2323, "step": 3844 }, { "epoch": 6.973475402403083, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.2192, "step": 3845 }, { "epoch": 6.975289050102018, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2317, "step": 3846 }, { "epoch": 6.9771026978009525, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.2303, "step": 3847 }, { "epoch": 6.978916345499886, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2092, "step": 3848 }, { "epoch": 6.980729993198821, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1726, "step": 3849 }, { "epoch": 6.982543640897756, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1731, "step": 3850 }, { "epoch": 6.98435728859669, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2216, "step": 3851 }, { "epoch": 6.986170936295625, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.2152, "step": 3852 }, { "epoch": 6.987984583994559, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2676, "step": 3853 }, { "epoch": 6.989798231693493, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2289, "step": 3854 }, { "epoch": 6.991611879392428, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.2898, "step": 3855 }, { "epoch": 6.993425527091363, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.2153, "step": 3856 }, { "epoch": 6.995239174790297, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.295, "step": 3857 }, { "epoch": 6.997052822489231, "grad_norm": 1.34375, "learning_rate": 0.0002, "loss": 0.3243, "step": 3858 }, { "epoch": 6.998866470188166, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.2451, "step": 3859 }, { "epoch": 7.0006801178871, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.216, "step": 3860 }, { "epoch": 7.002493765586035, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.1083, "step": 3861 }, { "epoch": 7.00430741328497, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.1098, "step": 3862 }, { "epoch": 7.0061210609839035, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.1133, "step": 3863 }, { "epoch": 7.007934708682838, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.1224, "step": 3864 }, { "epoch": 7.009748356381773, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.1, "step": 3865 }, { "epoch": 7.011562004080707, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0933, "step": 3866 }, { "epoch": 7.013375651779642, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.084, "step": 3867 }, { "epoch": 7.0151892994785765, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0915, "step": 3868 }, { "epoch": 7.01700294717751, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1155, "step": 3869 }, { "epoch": 7.018816594876445, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1176, "step": 3870 }, { "epoch": 7.02063024257538, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1296, "step": 3871 }, { "epoch": 7.022443890274314, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1034, "step": 3872 }, { "epoch": 7.024257537973249, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1054, "step": 3873 }, { "epoch": 7.026071185672183, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.088, "step": 3874 }, { "epoch": 7.027884833371117, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1149, "step": 3875 }, { "epoch": 7.029698481070052, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1167, "step": 3876 }, { "epoch": 7.031512128768987, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0986, "step": 3877 }, { "epoch": 7.033325776467921, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0918, "step": 3878 }, { "epoch": 7.0351394241668554, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0946, "step": 3879 }, { "epoch": 7.03695307186579, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0898, "step": 3880 }, { "epoch": 7.038766719564725, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0978, "step": 3881 }, { "epoch": 7.040580367263659, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1046, "step": 3882 }, { "epoch": 7.042394014962594, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1127, "step": 3883 }, { "epoch": 7.044207662661528, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1251, "step": 3884 }, { "epoch": 7.046021310360462, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0913, "step": 3885 }, { "epoch": 7.047834958059397, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1089, "step": 3886 }, { "epoch": 7.049648605758332, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1319, "step": 3887 }, { "epoch": 7.051462253457266, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1287, "step": 3888 }, { "epoch": 7.0532759011562005, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1112, "step": 3889 }, { "epoch": 7.055089548855135, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1191, "step": 3890 }, { "epoch": 7.056903196554069, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1091, "step": 3891 }, { "epoch": 7.058716844253004, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1103, "step": 3892 }, { "epoch": 7.060530491951939, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1297, "step": 3893 }, { "epoch": 7.062344139650873, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1037, "step": 3894 }, { "epoch": 7.064157787349807, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1367, "step": 3895 }, { "epoch": 7.065971435048742, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1242, "step": 3896 }, { "epoch": 7.067785082747676, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.1366, "step": 3897 }, { "epoch": 7.069598730446611, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1716, "step": 3898 }, { "epoch": 7.071412378145546, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1344, "step": 3899 }, { "epoch": 7.0732260258444795, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1425, "step": 3900 }, { "epoch": 7.075039673543414, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.155, "step": 3901 }, { "epoch": 7.076853321242349, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1277, "step": 3902 }, { "epoch": 7.078666968941283, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1456, "step": 3903 }, { "epoch": 7.080480616640218, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.1959, "step": 3904 }, { "epoch": 7.082294264339152, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1806, "step": 3905 }, { "epoch": 7.084107912038086, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.1681, "step": 3906 }, { "epoch": 7.085921559737021, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1971, "step": 3907 }, { "epoch": 7.087735207435956, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2175, "step": 3908 }, { "epoch": 7.08954885513489, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.2002, "step": 3909 }, { "epoch": 7.0913625028338245, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.2514, "step": 3910 }, { "epoch": 7.093176150532759, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1511, "step": 3911 }, { "epoch": 7.094989798231693, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1273, "step": 3912 }, { "epoch": 7.096803445930628, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.127, "step": 3913 }, { "epoch": 7.098617093629563, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1185, "step": 3914 }, { "epoch": 7.100430741328497, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1133, "step": 3915 }, { "epoch": 7.102244389027431, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1412, "step": 3916 }, { "epoch": 7.104058036726366, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1131, "step": 3917 }, { "epoch": 7.1058716844253, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1324, "step": 3918 }, { "epoch": 7.107685332124235, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1152, "step": 3919 }, { "epoch": 7.10949897982317, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.1031, "step": 3920 }, { "epoch": 7.1113126275221035, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1077, "step": 3921 }, { "epoch": 7.113126275221038, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.107, "step": 3922 }, { "epoch": 7.114939922919973, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1358, "step": 3923 }, { "epoch": 7.116753570618907, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1013, "step": 3924 }, { "epoch": 7.118567218317842, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1085, "step": 3925 }, { "epoch": 7.1203808660167764, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.105, "step": 3926 }, { "epoch": 7.12219451371571, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0964, "step": 3927 }, { "epoch": 7.12219451371571, "eval_loss": 2.126635789871216, "eval_runtime": 187.539, "eval_samples_per_second": 5.332, "eval_steps_per_second": 5.332, "step": 3927 }, { "epoch": 7.12219451371571, "mmlu_eval_accuracy": 0.2879483372234327, "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.32558139534883723, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.37681159420289856, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.14814814814814814, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.9978969186096702, "step": 3927 }, { "epoch": 7.124008161414645, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.11, "step": 3928 }, { "epoch": 7.12582180911358, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.109, "step": 3929 }, { "epoch": 7.127635456812514, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1016, "step": 3930 }, { "epoch": 7.1294491045114485, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1316, "step": 3931 }, { "epoch": 7.131262752210383, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1535, "step": 3932 }, { "epoch": 7.133076399909317, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1081, "step": 3933 }, { "epoch": 7.134890047608252, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1121, "step": 3934 }, { "epoch": 7.136703695307187, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1136, "step": 3935 }, { "epoch": 7.1385173430061215, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1467, "step": 3936 }, { "epoch": 7.140330990705055, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1118, "step": 3937 }, { "epoch": 7.14214463840399, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1086, "step": 3938 }, { "epoch": 7.143958286102924, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1229, "step": 3939 }, { "epoch": 7.145771933801859, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1254, "step": 3940 }, { "epoch": 7.147585581500794, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1313, "step": 3941 }, { "epoch": 7.149399229199728, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1259, "step": 3942 }, { "epoch": 7.151212876898662, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1463, "step": 3943 }, { "epoch": 7.153026524597597, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0988, "step": 3944 }, { "epoch": 7.154840172296532, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1208, "step": 3945 }, { "epoch": 7.156653819995466, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1128, "step": 3946 }, { "epoch": 7.1584674676944005, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1573, "step": 3947 }, { "epoch": 7.160281115393335, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1346, "step": 3948 }, { "epoch": 7.162094763092269, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.2157, "step": 3949 }, { "epoch": 7.163908410791204, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1854, "step": 3950 }, { "epoch": 7.165722058490139, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1382, "step": 3951 }, { "epoch": 7.1675357061890725, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.159, "step": 3952 }, { "epoch": 7.169349353888007, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.22, "step": 3953 }, { "epoch": 7.171163001586942, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.162, "step": 3954 }, { "epoch": 7.172976649285876, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1882, "step": 3955 }, { "epoch": 7.174790296984811, "grad_norm": 1.4609375, "learning_rate": 0.0002, "loss": 0.197, "step": 3956 }, { "epoch": 7.1766039446837455, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.1795, "step": 3957 }, { "epoch": 7.178417592382679, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2184, "step": 3958 }, { "epoch": 7.180231240081614, "grad_norm": 1.1875, "learning_rate": 0.0002, "loss": 0.2586, "step": 3959 }, { "epoch": 7.182044887780549, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.307, "step": 3960 }, { "epoch": 7.183858535479483, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1573, "step": 3961 }, { "epoch": 7.185672183178418, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1427, "step": 3962 }, { "epoch": 7.187485830877352, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1576, "step": 3963 }, { "epoch": 7.189299478576286, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1162, "step": 3964 }, { "epoch": 7.191113126275221, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1116, "step": 3965 }, { "epoch": 7.192926773974156, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.127, "step": 3966 }, { "epoch": 7.19474042167309, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1194, "step": 3967 }, { "epoch": 7.1965540693720245, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1243, "step": 3968 }, { "epoch": 7.198367717070959, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1162, "step": 3969 }, { "epoch": 7.200181364769893, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1203, "step": 3970 }, { "epoch": 7.201995012468828, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1219, "step": 3971 }, { "epoch": 7.203808660167763, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1244, "step": 3972 }, { "epoch": 7.2056223078666966, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1093, "step": 3973 }, { "epoch": 7.207435955565631, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.114, "step": 3974 }, { "epoch": 7.209249603264566, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1012, "step": 3975 }, { "epoch": 7.2110632509635, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0942, "step": 3976 }, { "epoch": 7.212876898662435, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1106, "step": 3977 }, { "epoch": 7.2146905463613695, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1089, "step": 3978 }, { "epoch": 7.216504194060303, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1042, "step": 3979 }, { "epoch": 7.218317841759238, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1056, "step": 3980 }, { "epoch": 7.220131489458173, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1085, "step": 3981 }, { "epoch": 7.221945137157107, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.117, "step": 3982 }, { "epoch": 7.223758784856042, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1087, "step": 3983 }, { "epoch": 7.225572432554976, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0961, "step": 3984 }, { "epoch": 7.22738608025391, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1464, "step": 3985 }, { "epoch": 7.229199727952845, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1235, "step": 3986 }, { "epoch": 7.23101337565178, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1227, "step": 3987 }, { "epoch": 7.232827023350714, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1168, "step": 3988 }, { "epoch": 7.2346406710496485, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1166, "step": 3989 }, { "epoch": 7.236454318748583, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.113, "step": 3990 }, { "epoch": 7.238267966447518, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1086, "step": 3991 }, { "epoch": 7.240081614146452, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1254, "step": 3992 }, { "epoch": 7.241895261845387, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1553, "step": 3993 }, { "epoch": 7.243708909544321, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1187, "step": 3994 }, { "epoch": 7.245522557243255, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1229, "step": 3995 }, { "epoch": 7.24733620494219, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1262, "step": 3996 }, { "epoch": 7.249149852641125, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1646, "step": 3997 }, { "epoch": 7.250963500340059, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1307, "step": 3998 }, { "epoch": 7.2527771480389935, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1389, "step": 3999 }, { "epoch": 7.254590795737928, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.137, "step": 4000 }, { "epoch": 7.256404443436862, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1438, "step": 4001 }, { "epoch": 7.258218091135797, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1498, "step": 4002 }, { "epoch": 7.260031738834732, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.1686, "step": 4003 }, { "epoch": 7.261845386533666, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1861, "step": 4004 }, { "epoch": 7.2636590342326, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.188, "step": 4005 }, { "epoch": 7.265472681931535, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.216, "step": 4006 }, { "epoch": 7.267286329630469, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1663, "step": 4007 }, { "epoch": 7.269099977329404, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.228, "step": 4008 }, { "epoch": 7.270913625028339, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.2176, "step": 4009 }, { "epoch": 7.2727272727272725, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2932, "step": 4010 }, { "epoch": 7.274540920426207, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1716, "step": 4011 }, { "epoch": 7.276354568125142, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1494, "step": 4012 }, { "epoch": 7.278168215824076, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1294, "step": 4013 }, { "epoch": 7.279981863523011, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1293, "step": 4014 }, { "epoch": 7.2817955112219455, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1175, "step": 4015 }, { "epoch": 7.283609158920879, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1083, "step": 4016 }, { "epoch": 7.285422806619814, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1277, "step": 4017 }, { "epoch": 7.287236454318749, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1286, "step": 4018 }, { "epoch": 7.289050102017683, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1068, "step": 4019 }, { "epoch": 7.2908637497166175, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1211, "step": 4020 }, { "epoch": 7.292677397415552, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1157, "step": 4021 }, { "epoch": 7.294491045114486, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1239, "step": 4022 }, { "epoch": 7.296304692813421, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1367, "step": 4023 }, { "epoch": 7.298118340512356, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1485, "step": 4024 }, { "epoch": 7.29993198821129, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1112, "step": 4025 }, { "epoch": 7.301745635910224, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1017, "step": 4026 }, { "epoch": 7.303559283609159, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1063, "step": 4027 }, { "epoch": 7.305372931308093, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1123, "step": 4028 }, { "epoch": 7.307186579007028, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1204, "step": 4029 }, { "epoch": 7.309000226705963, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1163, "step": 4030 }, { "epoch": 7.3108138744048965, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1114, "step": 4031 }, { "epoch": 7.312627522103831, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1306, "step": 4032 }, { "epoch": 7.314441169802766, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1212, "step": 4033 }, { "epoch": 7.3162548175017, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.111, "step": 4034 }, { "epoch": 7.318068465200635, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1122, "step": 4035 }, { "epoch": 7.3198821128995695, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1362, "step": 4036 }, { "epoch": 7.321695760598503, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1033, "step": 4037 }, { "epoch": 7.323509408297438, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1108, "step": 4038 }, { "epoch": 7.325323055996373, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1306, "step": 4039 }, { "epoch": 7.327136703695307, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.117, "step": 4040 }, { "epoch": 7.328950351394242, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1847, "step": 4041 }, { "epoch": 7.330763999093176, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1212, "step": 4042 }, { "epoch": 7.33257764679211, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1245, "step": 4043 }, { "epoch": 7.334391294491045, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1134, "step": 4044 }, { "epoch": 7.33620494218998, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1098, "step": 4045 }, { "epoch": 7.3380185898889145, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1347, "step": 4046 }, { "epoch": 7.339832237587848, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1233, "step": 4047 }, { "epoch": 7.341645885286783, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1424, "step": 4048 }, { "epoch": 7.343459532985717, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1551, "step": 4049 }, { "epoch": 7.345273180684652, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1556, "step": 4050 }, { "epoch": 7.347086828383587, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.1558, "step": 4051 }, { "epoch": 7.348900476082521, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1785, "step": 4052 }, { "epoch": 7.350714123781455, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.1571, "step": 4053 }, { "epoch": 7.35252777148039, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1912, "step": 4054 }, { "epoch": 7.354341419179325, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1602, "step": 4055 }, { "epoch": 7.356155066878259, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.19, "step": 4056 }, { "epoch": 7.3579687145771935, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1886, "step": 4057 }, { "epoch": 7.359782362276128, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.2478, "step": 4058 }, { "epoch": 7.361596009975062, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.2856, "step": 4059 }, { "epoch": 7.363409657673997, "grad_norm": 1.4296875, "learning_rate": 0.0002, "loss": 0.299, "step": 4060 }, { "epoch": 7.365223305372932, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1307, "step": 4061 }, { "epoch": 7.367036953071866, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1574, "step": 4062 }, { "epoch": 7.3688506007708, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1491, "step": 4063 }, { "epoch": 7.370664248469735, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1252, "step": 4064 }, { "epoch": 7.372477896168669, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1288, "step": 4065 }, { "epoch": 7.374291543867604, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1194, "step": 4066 }, { "epoch": 7.3761051915665385, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1331, "step": 4067 }, { "epoch": 7.377918839265472, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1247, "step": 4068 }, { "epoch": 7.379732486964407, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1242, "step": 4069 }, { "epoch": 7.381546134663342, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1341, "step": 4070 }, { "epoch": 7.383359782362276, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1481, "step": 4071 }, { "epoch": 7.385173430061211, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1338, "step": 4072 }, { "epoch": 7.386987077760145, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0986, "step": 4073 }, { "epoch": 7.388800725459079, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1161, "step": 4074 }, { "epoch": 7.390614373158014, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1157, "step": 4075 }, { "epoch": 7.392428020856949, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0986, "step": 4076 }, { "epoch": 7.394241668555883, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1124, "step": 4077 }, { "epoch": 7.3960553162548175, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.121, "step": 4078 }, { "epoch": 7.397868963953752, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1222, "step": 4079 }, { "epoch": 7.399682611652686, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1388, "step": 4080 }, { "epoch": 7.401496259351621, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1349, "step": 4081 }, { "epoch": 7.403309907050556, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1268, "step": 4082 }, { "epoch": 7.40512355474949, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1192, "step": 4083 }, { "epoch": 7.406937202448424, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1218, "step": 4084 }, { "epoch": 7.408750850147359, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1366, "step": 4085 }, { "epoch": 7.410564497846293, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1388, "step": 4086 }, { "epoch": 7.412378145545228, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1181, "step": 4087 }, { "epoch": 7.4141917932441626, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1552, "step": 4088 }, { "epoch": 7.416005440943096, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1325, "step": 4089 }, { "epoch": 7.417819088642031, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1355, "step": 4090 }, { "epoch": 7.419632736340966, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1155, "step": 4091 }, { "epoch": 7.4214463840399, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1483, "step": 4092 }, { "epoch": 7.423260031738835, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1309, "step": 4093 }, { "epoch": 7.425073679437769, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1274, "step": 4094 }, { "epoch": 7.426887327136703, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1334, "step": 4095 }, { "epoch": 7.428700974835638, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1171, "step": 4096 }, { "epoch": 7.430514622534573, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1289, "step": 4097 }, { "epoch": 7.432328270233507, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1431, "step": 4098 }, { "epoch": 7.4341419179324415, "grad_norm": 1.4921875, "learning_rate": 0.0002, "loss": 0.1603, "step": 4099 }, { "epoch": 7.435955565631376, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1387, "step": 4100 }, { "epoch": 7.437769213330311, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.152, "step": 4101 }, { "epoch": 7.439582861029245, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.2267, "step": 4102 }, { "epoch": 7.44139650872818, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.2161, "step": 4103 }, { "epoch": 7.443210156427114, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.2134, "step": 4104 }, { "epoch": 7.445023804126048, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.155, "step": 4105 }, { "epoch": 7.446837451824983, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2062, "step": 4106 }, { "epoch": 7.448651099523918, "grad_norm": 1.328125, "learning_rate": 0.0002, "loss": 0.2255, "step": 4107 }, { "epoch": 7.450464747222852, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.1998, "step": 4108 }, { "epoch": 7.452278394921787, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.255, "step": 4109 }, { "epoch": 7.454092042620721, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.3425, "step": 4110 }, { "epoch": 7.455905690319655, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1844, "step": 4111 }, { "epoch": 7.45771933801859, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1298, "step": 4112 }, { "epoch": 7.459532985717525, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1434, "step": 4113 }, { "epoch": 7.461346633416459, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1315, "step": 4114 }, { "epoch": 7.461346633416459, "eval_loss": 2.1537978649139404, "eval_runtime": 185.9196, "eval_samples_per_second": 5.379, "eval_steps_per_second": 5.379, "step": 4114 }, { "epoch": 7.461346633416459, "mmlu_eval_accuracy": 0.2905122179037528, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.2727272727272727, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.5, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.0, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.21, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.16129032258064516, "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.8992808467418365, "step": 4114 }, { "epoch": 7.463160281115393, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1379, "step": 4115 }, { "epoch": 7.464973928814328, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.158, "step": 4116 }, { "epoch": 7.466787576513262, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1489, "step": 4117 }, { "epoch": 7.468601224212197, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1788, "step": 4118 }, { "epoch": 7.470414871911132, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1354, "step": 4119 }, { "epoch": 7.4722285196100655, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1217, "step": 4120 }, { "epoch": 7.474042167309, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1235, "step": 4121 }, { "epoch": 7.475855815007935, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1171, "step": 4122 }, { "epoch": 7.477669462706869, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1258, "step": 4123 }, { "epoch": 7.479483110405804, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1206, "step": 4124 }, { "epoch": 7.4812967581047385, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1376, "step": 4125 }, { "epoch": 7.483110405803672, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1104, "step": 4126 }, { "epoch": 7.484924053502607, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1273, "step": 4127 }, { "epoch": 7.486737701201542, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1442, "step": 4128 }, { "epoch": 7.488551348900476, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1207, "step": 4129 }, { "epoch": 7.490364996599411, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1131, "step": 4130 }, { "epoch": 7.492178644298345, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1093, "step": 4131 }, { "epoch": 7.493992291997279, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1276, "step": 4132 }, { "epoch": 7.495805939696214, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1214, "step": 4133 }, { "epoch": 7.497619587395149, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1184, "step": 4134 }, { "epoch": 7.499433235094083, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1239, "step": 4135 }, { "epoch": 7.501246882793017, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1198, "step": 4136 }, { "epoch": 7.503060530491952, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1255, "step": 4137 }, { "epoch": 7.504874178190886, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1242, "step": 4138 }, { "epoch": 7.506687825889821, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1161, "step": 4139 }, { "epoch": 7.508501473588756, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1697, "step": 4140 }, { "epoch": 7.5103151212876895, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1211, "step": 4141 }, { "epoch": 7.512128768986624, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1291, "step": 4142 }, { "epoch": 7.513942416685559, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.151, "step": 4143 }, { "epoch": 7.515756064384493, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1264, "step": 4144 }, { "epoch": 7.517569712083428, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1455, "step": 4145 }, { "epoch": 7.5193833597823625, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1393, "step": 4146 }, { "epoch": 7.521197007481296, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1491, "step": 4147 }, { "epoch": 7.523010655180231, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1653, "step": 4148 }, { "epoch": 7.524824302879166, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1707, "step": 4149 }, { "epoch": 7.526637950578101, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1377, "step": 4150 }, { "epoch": 7.528451598277035, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1518, "step": 4151 }, { "epoch": 7.530265245975969, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1775, "step": 4152 }, { "epoch": 7.532078893674903, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.1964, "step": 4153 }, { "epoch": 7.533892541373838, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.1901, "step": 4154 }, { "epoch": 7.535706189072773, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1816, "step": 4155 }, { "epoch": 7.537519836771708, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1683, "step": 4156 }, { "epoch": 7.5393334844706414, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.195, "step": 4157 }, { "epoch": 7.541147132169576, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1987, "step": 4158 }, { "epoch": 7.54296077986851, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.2506, "step": 4159 }, { "epoch": 7.544774427567445, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.2743, "step": 4160 }, { "epoch": 7.54658807526638, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1748, "step": 4161 }, { "epoch": 7.548401722965314, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1585, "step": 4162 }, { "epoch": 7.550215370664248, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1408, "step": 4163 }, { "epoch": 7.552029018363183, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1523, "step": 4164 }, { "epoch": 7.553842666062117, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1424, "step": 4165 }, { "epoch": 7.555656313761052, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.13, "step": 4166 }, { "epoch": 7.5574699614599865, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1314, "step": 4167 }, { "epoch": 7.559283609158921, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1449, "step": 4168 }, { "epoch": 7.561097256857855, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1392, "step": 4169 }, { "epoch": 7.56291090455679, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1205, "step": 4170 }, { "epoch": 7.564724552255725, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1394, "step": 4171 }, { "epoch": 7.566538199954659, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.132, "step": 4172 }, { "epoch": 7.568351847653593, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1411, "step": 4173 }, { "epoch": 7.570165495352528, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1561, "step": 4174 }, { "epoch": 7.571979143051462, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1192, "step": 4175 }, { "epoch": 7.573792790750397, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1322, "step": 4176 }, { "epoch": 7.575606438449332, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1369, "step": 4177 }, { "epoch": 7.5774200861482655, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1217, "step": 4178 }, { "epoch": 7.5792337338472, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1087, "step": 4179 }, { "epoch": 7.581047381546135, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1303, "step": 4180 }, { "epoch": 7.582861029245069, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1194, "step": 4181 }, { "epoch": 7.584674676944004, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1294, "step": 4182 }, { "epoch": 7.586488324642938, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1287, "step": 4183 }, { "epoch": 7.588301972341872, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1272, "step": 4184 }, { "epoch": 7.590115620040807, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1149, "step": 4185 }, { "epoch": 7.591929267739742, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1197, "step": 4186 }, { "epoch": 7.593742915438676, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1192, "step": 4187 }, { "epoch": 7.5955565631376105, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1134, "step": 4188 }, { "epoch": 7.597370210836545, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1357, "step": 4189 }, { "epoch": 7.599183858535479, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1139, "step": 4190 }, { "epoch": 7.600997506234414, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1795, "step": 4191 }, { "epoch": 7.602811153933349, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1315, "step": 4192 }, { "epoch": 7.604624801632283, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1519, "step": 4193 }, { "epoch": 7.606438449331217, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1749, "step": 4194 }, { "epoch": 7.608252097030152, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1413, "step": 4195 }, { "epoch": 7.610065744729086, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1664, "step": 4196 }, { "epoch": 7.611879392428021, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1121, "step": 4197 }, { "epoch": 7.613693040126956, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1272, "step": 4198 }, { "epoch": 7.6155066878258895, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1499, "step": 4199 }, { "epoch": 7.617320335524824, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1264, "step": 4200 }, { "epoch": 7.619133983223759, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1834, "step": 4201 }, { "epoch": 7.620947630922693, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1544, "step": 4202 }, { "epoch": 7.622761278621628, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1762, "step": 4203 }, { "epoch": 7.624574926320562, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.1611, "step": 4204 }, { "epoch": 7.626388574019496, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.1775, "step": 4205 }, { "epoch": 7.628202221718431, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.2007, "step": 4206 }, { "epoch": 7.630015869417366, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.2543, "step": 4207 }, { "epoch": 7.6318295171163, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1971, "step": 4208 }, { "epoch": 7.6336431648152345, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.246, "step": 4209 }, { "epoch": 7.635456812514169, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.3864, "step": 4210 }, { "epoch": 7.637270460213104, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1505, "step": 4211 }, { "epoch": 7.639084107912038, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.122, "step": 4212 }, { "epoch": 7.640897755610973, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1557, "step": 4213 }, { "epoch": 7.642711403309907, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1307, "step": 4214 }, { "epoch": 7.644525051008841, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1414, "step": 4215 }, { "epoch": 7.646338698707776, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1306, "step": 4216 }, { "epoch": 7.648152346406711, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1202, "step": 4217 }, { "epoch": 7.649965994105645, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1291, "step": 4218 }, { "epoch": 7.65177964180458, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1456, "step": 4219 }, { "epoch": 7.6535932895035135, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1337, "step": 4220 }, { "epoch": 7.655406937202448, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1478, "step": 4221 }, { "epoch": 7.657220584901383, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1421, "step": 4222 }, { "epoch": 7.659034232600318, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1454, "step": 4223 }, { "epoch": 7.660847880299252, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1647, "step": 4224 }, { "epoch": 7.6626615279981865, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1401, "step": 4225 }, { "epoch": 7.66447517569712, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.124, "step": 4226 }, { "epoch": 7.666288823396055, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1497, "step": 4227 }, { "epoch": 7.66810247109499, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1151, "step": 4228 }, { "epoch": 7.669916118793925, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1407, "step": 4229 }, { "epoch": 7.6717297664928585, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1481, "step": 4230 }, { "epoch": 7.673543414191793, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1435, "step": 4231 }, { "epoch": 7.675357061890728, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1094, "step": 4232 }, { "epoch": 7.677170709589662, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1539, "step": 4233 }, { "epoch": 7.678984357288597, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.129, "step": 4234 }, { "epoch": 7.6807980049875315, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1343, "step": 4235 }, { "epoch": 7.682611652686465, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1398, "step": 4236 }, { "epoch": 7.6844253003854, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.13, "step": 4237 }, { "epoch": 7.686238948084335, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1146, "step": 4238 }, { "epoch": 7.688052595783269, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1711, "step": 4239 }, { "epoch": 7.689866243482204, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1457, "step": 4240 }, { "epoch": 7.691679891181138, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1289, "step": 4241 }, { "epoch": 7.693493538880072, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1684, "step": 4242 }, { "epoch": 7.695307186579007, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1179, "step": 4243 }, { "epoch": 7.697120834277942, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1364, "step": 4244 }, { "epoch": 7.698934481976876, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1332, "step": 4245 }, { "epoch": 7.7007481296758105, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1682, "step": 4246 }, { "epoch": 7.702561777374745, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1503, "step": 4247 }, { "epoch": 7.704375425073679, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1865, "step": 4248 }, { "epoch": 7.706189072772614, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1459, "step": 4249 }, { "epoch": 7.708002720471549, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.161, "step": 4250 }, { "epoch": 7.7098163681704825, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1597, "step": 4251 }, { "epoch": 7.711630015869417, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2142, "step": 4252 }, { "epoch": 7.713443663568352, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1451, "step": 4253 }, { "epoch": 7.715257311267286, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.1747, "step": 4254 }, { "epoch": 7.717070958966221, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1641, "step": 4255 }, { "epoch": 7.7188846066651555, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.2326, "step": 4256 }, { "epoch": 7.720698254364089, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.2055, "step": 4257 }, { "epoch": 7.722511902063024, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1901, "step": 4258 }, { "epoch": 7.724325549761959, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.2572, "step": 4259 }, { "epoch": 7.726139197460893, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2643, "step": 4260 }, { "epoch": 7.727952845159828, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1673, "step": 4261 }, { "epoch": 7.729766492858762, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.153, "step": 4262 }, { "epoch": 7.731580140557696, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1669, "step": 4263 }, { "epoch": 7.733393788256631, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1529, "step": 4264 }, { "epoch": 7.735207435955566, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1298, "step": 4265 }, { "epoch": 7.737021083654501, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1183, "step": 4266 }, { "epoch": 7.7388347313534345, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1515, "step": 4267 }, { "epoch": 7.740648379052369, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1352, "step": 4268 }, { "epoch": 7.742462026751303, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.144, "step": 4269 }, { "epoch": 7.744275674450238, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1594, "step": 4270 }, { "epoch": 7.746089322149173, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1272, "step": 4271 }, { "epoch": 7.7479029698481074, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2172, "step": 4272 }, { "epoch": 7.749716617547041, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1243, "step": 4273 }, { "epoch": 7.751530265245976, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1412, "step": 4274 }, { "epoch": 7.75334391294491, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1507, "step": 4275 }, { "epoch": 7.755157560643845, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1279, "step": 4276 }, { "epoch": 7.7569712083427795, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1246, "step": 4277 }, { "epoch": 7.758784856041714, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1359, "step": 4278 }, { "epoch": 7.760598503740648, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1218, "step": 4279 }, { "epoch": 7.762412151439583, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1427, "step": 4280 }, { "epoch": 7.764225799138517, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.142, "step": 4281 }, { "epoch": 7.766039446837452, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1327, "step": 4282 }, { "epoch": 7.767853094536386, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1761, "step": 4283 }, { "epoch": 7.769666742235321, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.116, "step": 4284 }, { "epoch": 7.771480389934255, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1058, "step": 4285 }, { "epoch": 7.77329403763319, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1612, "step": 4286 }, { "epoch": 7.775107685332125, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1359, "step": 4287 }, { "epoch": 7.7769213330310585, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1511, "step": 4288 }, { "epoch": 7.778734980729993, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1377, "step": 4289 }, { "epoch": 7.780548628428928, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1179, "step": 4290 }, { "epoch": 7.782362276127862, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1428, "step": 4291 }, { "epoch": 7.784175923826797, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.129, "step": 4292 }, { "epoch": 7.7859895715257315, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1636, "step": 4293 }, { "epoch": 7.787803219224665, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1198, "step": 4294 }, { "epoch": 7.7896168669236, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1818, "step": 4295 }, { "epoch": 7.791430514622535, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1424, "step": 4296 }, { "epoch": 7.793244162321469, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1586, "step": 4297 }, { "epoch": 7.7950578100204035, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1313, "step": 4298 }, { "epoch": 7.796871457719338, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1603, "step": 4299 }, { "epoch": 7.798685105418272, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.2173, "step": 4300 }, { "epoch": 7.800498753117207, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1598, "step": 4301 }, { "epoch": 7.800498753117207, "eval_loss": 2.138075590133667, "eval_runtime": 185.782, "eval_samples_per_second": 5.383, "eval_steps_per_second": 5.383, "step": 4301 }, { "epoch": 7.800498753117207, "mmlu_eval_accuracy": 0.3005029442764854, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.45454545454545453, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.3058823529411765, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.18518518518518517, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.16666666666666666, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 2.1129618068678875, "step": 4301 }, { "epoch": 7.802312400816142, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.2143, "step": 4302 }, { "epoch": 7.804126048515076, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1848, "step": 4303 }, { "epoch": 7.80593969621401, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1994, "step": 4304 }, { "epoch": 7.807753343912945, "grad_norm": 1.28125, "learning_rate": 0.0002, "loss": 0.2067, "step": 4305 }, { "epoch": 7.809566991611879, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.1951, "step": 4306 }, { "epoch": 7.811380639310814, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.2179, "step": 4307 }, { "epoch": 7.813194287009749, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.2172, "step": 4308 }, { "epoch": 7.8150079347086825, "grad_norm": 1.3671875, "learning_rate": 0.0002, "loss": 0.2832, "step": 4309 }, { "epoch": 7.816821582407617, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.3325, "step": 4310 }, { "epoch": 7.818635230106552, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1829, "step": 4311 }, { "epoch": 7.820448877805486, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1808, "step": 4312 }, { "epoch": 7.822262525504421, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1706, "step": 4313 }, { "epoch": 7.8240761732033555, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1428, "step": 4314 }, { "epoch": 7.825889820902289, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1616, "step": 4315 }, { "epoch": 7.827703468601224, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1503, "step": 4316 }, { "epoch": 7.829517116300159, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1447, "step": 4317 }, { "epoch": 7.831330763999093, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1532, "step": 4318 }, { "epoch": 7.8331444116980276, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1424, "step": 4319 }, { "epoch": 7.834958059396962, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1433, "step": 4320 }, { "epoch": 7.836771707095897, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.135, "step": 4321 }, { "epoch": 7.838585354794831, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1424, "step": 4322 }, { "epoch": 7.840399002493766, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1398, "step": 4323 }, { "epoch": 7.8422126501927, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1471, "step": 4324 }, { "epoch": 7.844026297891634, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1356, "step": 4325 }, { "epoch": 7.845839945590569, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1582, "step": 4326 }, { "epoch": 7.847653593289504, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1354, "step": 4327 }, { "epoch": 7.849467240988438, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1296, "step": 4328 }, { "epoch": 7.851280888687373, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1334, "step": 4329 }, { "epoch": 7.8530945363863065, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.118, "step": 4330 }, { "epoch": 7.854908184085241, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1428, "step": 4331 }, { "epoch": 7.856721831784176, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1353, "step": 4332 }, { "epoch": 7.858535479483111, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1489, "step": 4333 }, { "epoch": 7.860349127182045, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1202, "step": 4334 }, { "epoch": 7.8621627748809795, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1332, "step": 4335 }, { "epoch": 7.863976422579913, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1417, "step": 4336 }, { "epoch": 7.865790070278848, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1567, "step": 4337 }, { "epoch": 7.867603717977783, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1448, "step": 4338 }, { "epoch": 7.869417365676718, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1315, "step": 4339 }, { "epoch": 7.871231013375652, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2403, "step": 4340 }, { "epoch": 7.873044661074586, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1433, "step": 4341 }, { "epoch": 7.874858308773521, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1512, "step": 4342 }, { "epoch": 7.876671956472455, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1186, "step": 4343 }, { "epoch": 7.87848560417139, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1289, "step": 4344 }, { "epoch": 7.8802992518703245, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1709, "step": 4345 }, { "epoch": 7.882112899569258, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1141, "step": 4346 }, { "epoch": 7.883926547268193, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1747, "step": 4347 }, { "epoch": 7.885740194967128, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1601, "step": 4348 }, { "epoch": 7.887553842666062, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1431, "step": 4349 }, { "epoch": 7.889367490364997, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1534, "step": 4350 }, { "epoch": 7.891181138063931, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.2609, "step": 4351 }, { "epoch": 7.892994785762865, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.1723, "step": 4352 }, { "epoch": 7.8948084334618, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1777, "step": 4353 }, { "epoch": 7.896622081160735, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.1665, "step": 4354 }, { "epoch": 7.898435728859669, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.2161, "step": 4355 }, { "epoch": 7.9002493765586035, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2063, "step": 4356 }, { "epoch": 7.902063024257538, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2028, "step": 4357 }, { "epoch": 7.903876671956472, "grad_norm": 1.3046875, "learning_rate": 0.0002, "loss": 0.2467, "step": 4358 }, { "epoch": 7.905690319655407, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.2504, "step": 4359 }, { "epoch": 7.907503967354342, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.3335, "step": 4360 }, { "epoch": 7.909317615053276, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2314, "step": 4361 }, { "epoch": 7.91113126275221, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.154, "step": 4362 }, { "epoch": 7.912944910451145, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1713, "step": 4363 }, { "epoch": 7.914758558150079, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1798, "step": 4364 }, { "epoch": 7.916572205849014, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1576, "step": 4365 }, { "epoch": 7.9183858535479485, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1484, "step": 4366 }, { "epoch": 7.920199501246882, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1659, "step": 4367 }, { "epoch": 7.922013148945817, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1558, "step": 4368 }, { "epoch": 7.923826796644752, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1456, "step": 4369 }, { "epoch": 7.925640444343686, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1534, "step": 4370 }, { "epoch": 7.927454092042621, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1827, "step": 4371 }, { "epoch": 7.929267739741555, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1363, "step": 4372 }, { "epoch": 7.931081387440489, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1741, "step": 4373 }, { "epoch": 7.932895035139424, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1231, "step": 4374 }, { "epoch": 7.934708682838359, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1445, "step": 4375 }, { "epoch": 7.936522330537294, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1451, "step": 4376 }, { "epoch": 7.9383359782362275, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.153, "step": 4377 }, { "epoch": 7.940149625935162, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1499, "step": 4378 }, { "epoch": 7.941963273634096, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1351, "step": 4379 }, { "epoch": 7.943776921333031, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1504, "step": 4380 }, { "epoch": 7.945590569031966, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1103, "step": 4381 }, { "epoch": 7.9474042167309005, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1381, "step": 4382 }, { "epoch": 7.949217864429834, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1234, "step": 4383 }, { "epoch": 7.951031512128769, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1101, "step": 4384 }, { "epoch": 7.952845159827703, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1434, "step": 4385 }, { "epoch": 7.954658807526638, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1577, "step": 4386 }, { "epoch": 7.956472455225573, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1485, "step": 4387 }, { "epoch": 7.958286102924507, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.166, "step": 4388 }, { "epoch": 7.960099750623441, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1341, "step": 4389 }, { "epoch": 7.961913398322376, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1797, "step": 4390 }, { "epoch": 7.96372704602131, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1396, "step": 4391 }, { "epoch": 7.965540693720245, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1715, "step": 4392 }, { "epoch": 7.967354341419179, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1814, "step": 4393 }, { "epoch": 7.969167989118114, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1645, "step": 4394 }, { "epoch": 7.970981636817048, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.2271, "step": 4395 }, { "epoch": 7.972795284515983, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1577, "step": 4396 }, { "epoch": 7.974608932214918, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1473, "step": 4397 }, { "epoch": 7.9764225799138515, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.219, "step": 4398 }, { "epoch": 7.978236227612786, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.2036, "step": 4399 }, { "epoch": 7.980049875311721, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.1682, "step": 4400 }, { "epoch": 7.981863523010655, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1811, "step": 4401 }, { "epoch": 7.98367717070959, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.186, "step": 4402 }, { "epoch": 7.9854908184085245, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1984, "step": 4403 }, { "epoch": 7.987304466107458, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1988, "step": 4404 }, { "epoch": 7.989118113806393, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.2116, "step": 4405 }, { "epoch": 7.990931761505328, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.2076, "step": 4406 }, { "epoch": 7.992745409204262, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.206, "step": 4407 }, { "epoch": 7.994559056903197, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2152, "step": 4408 }, { "epoch": 7.996372704602131, "grad_norm": 1.5078125, "learning_rate": 0.0002, "loss": 0.2938, "step": 4409 }, { "epoch": 7.998186352301065, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.2606, "step": 4410 }, { "epoch": 8.0, "grad_norm": 2.15625, "learning_rate": 0.0002, "loss": 0.2788, "step": 4411 }, { "epoch": 8.001813647698935, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1385, "step": 4412 }, { "epoch": 8.00362729539787, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.1047, "step": 4413 }, { "epoch": 8.005440943096804, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0837, "step": 4414 }, { "epoch": 8.007254590795737, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0819, "step": 4415 }, { "epoch": 8.009068238494672, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.1015, "step": 4416 }, { "epoch": 8.010881886193607, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0924, "step": 4417 }, { "epoch": 8.012695533892542, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1194, "step": 4418 }, { "epoch": 8.014509181591476, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0843, "step": 4419 }, { "epoch": 8.016322829290411, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0903, "step": 4420 }, { "epoch": 8.018136476989344, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0904, "step": 4421 }, { "epoch": 8.019950124688279, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0905, "step": 4422 }, { "epoch": 8.021763772387214, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0927, "step": 4423 }, { "epoch": 8.023577420086148, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0986, "step": 4424 }, { "epoch": 8.025391067785083, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0854, "step": 4425 }, { "epoch": 8.027204715484018, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0769, "step": 4426 }, { "epoch": 8.029018363182951, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0876, "step": 4427 }, { "epoch": 8.030832010881886, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0812, "step": 4428 }, { "epoch": 8.03264565858082, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0822, "step": 4429 }, { "epoch": 8.034459306279755, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0761, "step": 4430 }, { "epoch": 8.03627295397869, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0879, "step": 4431 }, { "epoch": 8.038086601677625, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.089, "step": 4432 }, { "epoch": 8.039900249376558, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0956, "step": 4433 }, { "epoch": 8.041713897075493, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0907, "step": 4434 }, { "epoch": 8.043527544774427, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0942, "step": 4435 }, { "epoch": 8.045341192473362, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0829, "step": 4436 }, { "epoch": 8.047154840172297, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1061, "step": 4437 }, { "epoch": 8.048968487871232, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0826, "step": 4438 }, { "epoch": 8.050782135570165, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0934, "step": 4439 }, { "epoch": 8.0525957832691, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0998, "step": 4440 }, { "epoch": 8.054409430968034, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1012, "step": 4441 }, { "epoch": 8.056223078666969, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1112, "step": 4442 }, { "epoch": 8.058036726365904, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1181, "step": 4443 }, { "epoch": 8.059850374064839, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1144, "step": 4444 }, { "epoch": 8.061664021763772, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.113, "step": 4445 }, { "epoch": 8.063477669462706, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1142, "step": 4446 }, { "epoch": 8.065291317161641, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1046, "step": 4447 }, { "epoch": 8.067104964860576, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1228, "step": 4448 }, { "epoch": 8.06891861255951, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1196, "step": 4449 }, { "epoch": 8.070732260258445, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1162, "step": 4450 }, { "epoch": 8.072545907957378, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1364, "step": 4451 }, { "epoch": 8.074359555656313, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1411, "step": 4452 }, { "epoch": 8.076173203355248, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1578, "step": 4453 }, { "epoch": 8.077986851054183, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1501, "step": 4454 }, { "epoch": 8.079800498753118, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1314, "step": 4455 }, { "epoch": 8.081614146452052, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1457, "step": 4456 }, { "epoch": 8.083427794150985, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1432, "step": 4457 }, { "epoch": 8.08524144184992, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.1841, "step": 4458 }, { "epoch": 8.087055089548855, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1557, "step": 4459 }, { "epoch": 8.08886873724779, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1957, "step": 4460 }, { "epoch": 8.090682384946724, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2483, "step": 4461 }, { "epoch": 8.09249603264566, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1309, "step": 4462 }, { "epoch": 8.094309680344592, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.1065, "step": 4463 }, { "epoch": 8.096123328043527, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.083, "step": 4464 }, { "epoch": 8.097936975742462, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1208, "step": 4465 }, { "epoch": 8.099750623441397, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0973, "step": 4466 }, { "epoch": 8.101564271140331, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0943, "step": 4467 }, { "epoch": 8.103377918839266, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0892, "step": 4468 }, { "epoch": 8.1051915665382, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0924, "step": 4469 }, { "epoch": 8.107005214237134, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1231, "step": 4470 }, { "epoch": 8.108818861936069, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0944, "step": 4471 }, { "epoch": 8.110632509635003, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0895, "step": 4472 }, { "epoch": 8.112446157333938, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1035, "step": 4473 }, { "epoch": 8.114259805032873, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.121, "step": 4474 }, { "epoch": 8.116073452731808, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0927, "step": 4475 }, { "epoch": 8.11788710043074, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0876, "step": 4476 }, { "epoch": 8.119700748129675, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0795, "step": 4477 }, { "epoch": 8.12151439582861, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0991, "step": 4478 }, { "epoch": 8.123328043527545, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0874, "step": 4479 }, { "epoch": 8.12514169122648, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.096, "step": 4480 }, { "epoch": 8.126955338925415, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1028, "step": 4481 }, { "epoch": 8.128768986624348, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0937, "step": 4482 }, { "epoch": 8.130582634323282, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0829, "step": 4483 }, { "epoch": 8.132396282022217, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1318, "step": 4484 }, { "epoch": 8.134209929721152, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1148, "step": 4485 }, { "epoch": 8.136023577420087, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0916, "step": 4486 }, { "epoch": 8.137837225119021, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.085, "step": 4487 }, { "epoch": 8.139650872817954, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1106, "step": 4488 }, { "epoch": 8.139650872817954, "eval_loss": 2.149118661880493, "eval_runtime": 185.4981, "eval_samples_per_second": 5.391, "eval_steps_per_second": 5.391, "step": 4488 }, { "epoch": 8.139650872817954, "mmlu_eval_accuracy": 0.2985351760273182, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.4782608695652174, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.5142857142857142, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.16129032258064516, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.15789473684210525, "mmlu_loss": 2.091872192836524, "step": 4488 }, { "epoch": 8.14146452051689, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0981, "step": 4489 }, { "epoch": 8.143278168215824, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0948, "step": 4490 }, { "epoch": 8.145091815914759, "grad_norm": 1.7109375, "learning_rate": 0.0002, "loss": 0.2221, "step": 4491 }, { "epoch": 8.146905463613693, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1054, "step": 4492 }, { "epoch": 8.148719111312628, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1152, "step": 4493 }, { "epoch": 8.150532759011561, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1052, "step": 4494 }, { "epoch": 8.152346406710496, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1425, "step": 4495 }, { "epoch": 8.15416005440943, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1258, "step": 4496 }, { "epoch": 8.155973702108366, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1026, "step": 4497 }, { "epoch": 8.1577873498073, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1381, "step": 4498 }, { "epoch": 8.159600997506235, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1358, "step": 4499 }, { "epoch": 8.161414645205168, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1351, "step": 4500 }, { "epoch": 8.163228292904103, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1183, "step": 4501 }, { "epoch": 8.165041940603038, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.15, "step": 4502 }, { "epoch": 8.166855588301972, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1327, "step": 4503 }, { "epoch": 8.168669236000907, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1399, "step": 4504 }, { "epoch": 8.170482883699842, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1738, "step": 4505 }, { "epoch": 8.172296531398775, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1827, "step": 4506 }, { "epoch": 8.17411017909771, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1637, "step": 4507 }, { "epoch": 8.175923826796645, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.184, "step": 4508 }, { "epoch": 8.17773747449558, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1745, "step": 4509 }, { "epoch": 8.179551122194514, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2149, "step": 4510 }, { "epoch": 8.181364769893449, "grad_norm": 1.5703125, "learning_rate": 0.0002, "loss": 0.2894, "step": 4511 }, { "epoch": 8.183178417592382, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.2584, "step": 4512 }, { "epoch": 8.184992065291317, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1027, "step": 4513 }, { "epoch": 8.186805712990251, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.107, "step": 4514 }, { "epoch": 8.188619360689186, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1072, "step": 4515 }, { "epoch": 8.190433008388121, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.1035, "step": 4516 }, { "epoch": 8.192246656087056, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1057, "step": 4517 }, { "epoch": 8.194060303785989, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1028, "step": 4518 }, { "epoch": 8.195873951484923, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0972, "step": 4519 }, { "epoch": 8.197687599183858, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0963, "step": 4520 }, { "epoch": 8.199501246882793, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1101, "step": 4521 }, { "epoch": 8.201314894581728, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.109, "step": 4522 }, { "epoch": 8.203128542280663, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0925, "step": 4523 }, { "epoch": 8.204942189979597, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1035, "step": 4524 }, { "epoch": 8.20675583767853, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1181, "step": 4525 }, { "epoch": 8.208569485377465, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0972, "step": 4526 }, { "epoch": 8.2103831330764, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0894, "step": 4527 }, { "epoch": 8.212196780775335, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1194, "step": 4528 }, { "epoch": 8.21401042847427, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0915, "step": 4529 }, { "epoch": 8.215824076173204, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0974, "step": 4530 }, { "epoch": 8.217637723872137, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0847, "step": 4531 }, { "epoch": 8.219451371571072, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0916, "step": 4532 }, { "epoch": 8.221265019270007, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.107, "step": 4533 }, { "epoch": 8.223078666968942, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0891, "step": 4534 }, { "epoch": 8.224892314667876, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1051, "step": 4535 }, { "epoch": 8.226705962366811, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1012, "step": 4536 }, { "epoch": 8.228519610065744, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0972, "step": 4537 }, { "epoch": 8.230333257764679, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1025, "step": 4538 }, { "epoch": 8.232146905463614, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0894, "step": 4539 }, { "epoch": 8.233960553162548, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1258, "step": 4540 }, { "epoch": 8.235774200861483, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0993, "step": 4541 }, { "epoch": 8.237587848560418, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0927, "step": 4542 }, { "epoch": 8.239401496259351, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1177, "step": 4543 }, { "epoch": 8.241215143958286, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.178, "step": 4544 }, { "epoch": 8.24302879165722, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1188, "step": 4545 }, { "epoch": 8.244842439356155, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1084, "step": 4546 }, { "epoch": 8.24665608705509, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1111, "step": 4547 }, { "epoch": 8.248469734754025, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1168, "step": 4548 }, { "epoch": 8.250283382452958, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1171, "step": 4549 }, { "epoch": 8.252097030151893, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1367, "step": 4550 }, { "epoch": 8.253910677850827, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1163, "step": 4551 }, { "epoch": 8.255724325549762, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1289, "step": 4552 }, { "epoch": 8.257537973248697, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1538, "step": 4553 }, { "epoch": 8.259351620947632, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1522, "step": 4554 }, { "epoch": 8.261165268646565, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1494, "step": 4555 }, { "epoch": 8.2629789163455, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1445, "step": 4556 }, { "epoch": 8.264792564044434, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1766, "step": 4557 }, { "epoch": 8.266606211743369, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2119, "step": 4558 }, { "epoch": 8.268419859442304, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.177, "step": 4559 }, { "epoch": 8.270233507141239, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1931, "step": 4560 }, { "epoch": 8.272047154840172, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.2245, "step": 4561 }, { "epoch": 8.273860802539106, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1352, "step": 4562 }, { "epoch": 8.275674450238041, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.1076, "step": 4563 }, { "epoch": 8.277488097936976, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1119, "step": 4564 }, { "epoch": 8.27930174563591, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1044, "step": 4565 }, { "epoch": 8.281115393334845, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1051, "step": 4566 }, { "epoch": 8.282929041033778, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.096, "step": 4567 }, { "epoch": 8.284742688732713, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0997, "step": 4568 }, { "epoch": 8.286556336431648, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1307, "step": 4569 }, { "epoch": 8.288369984130583, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1122, "step": 4570 }, { "epoch": 8.290183631829517, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0898, "step": 4571 }, { "epoch": 8.291997279528452, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1154, "step": 4572 }, { "epoch": 8.293810927227385, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1135, "step": 4573 }, { "epoch": 8.29562457492632, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1029, "step": 4574 }, { "epoch": 8.297438222625255, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0961, "step": 4575 }, { "epoch": 8.29925187032419, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1035, "step": 4576 }, { "epoch": 8.301065518023124, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1176, "step": 4577 }, { "epoch": 8.302879165722059, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0905, "step": 4578 }, { "epoch": 8.304692813420992, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0922, "step": 4579 }, { "epoch": 8.306506461119927, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0952, "step": 4580 }, { "epoch": 8.308320108818862, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0955, "step": 4581 }, { "epoch": 8.310133756517796, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0998, "step": 4582 }, { "epoch": 8.311947404216731, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1115, "step": 4583 }, { "epoch": 8.313761051915666, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1017, "step": 4584 }, { "epoch": 8.3155746996146, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1057, "step": 4585 }, { "epoch": 8.317388347313534, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0985, "step": 4586 }, { "epoch": 8.319201995012468, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1238, "step": 4587 }, { "epoch": 8.321015642711403, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0977, "step": 4588 }, { "epoch": 8.322829290410338, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0996, "step": 4589 }, { "epoch": 8.324642938109273, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1113, "step": 4590 }, { "epoch": 8.326456585808208, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1132, "step": 4591 }, { "epoch": 8.32827023350714, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1033, "step": 4592 }, { "epoch": 8.330083881206075, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.144, "step": 4593 }, { "epoch": 8.33189752890501, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1009, "step": 4594 }, { "epoch": 8.333711176603945, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1298, "step": 4595 }, { "epoch": 8.33552482430288, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1256, "step": 4596 }, { "epoch": 8.337338472001814, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1277, "step": 4597 }, { "epoch": 8.339152119700747, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1152, "step": 4598 }, { "epoch": 8.340965767399682, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1257, "step": 4599 }, { "epoch": 8.342779415098617, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1222, "step": 4600 }, { "epoch": 8.344593062797552, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1558, "step": 4601 }, { "epoch": 8.346406710496487, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1379, "step": 4602 }, { "epoch": 8.348220358195421, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1606, "step": 4603 }, { "epoch": 8.350034005894354, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1726, "step": 4604 }, { "epoch": 8.351847653593289, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1736, "step": 4605 }, { "epoch": 8.353661301292224, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2206, "step": 4606 }, { "epoch": 8.355474948991159, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1723, "step": 4607 }, { "epoch": 8.357288596690093, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.1826, "step": 4608 }, { "epoch": 8.359102244389028, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.2142, "step": 4609 }, { "epoch": 8.360915892087961, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.2308, "step": 4610 }, { "epoch": 8.362729539786896, "grad_norm": 1.5234375, "learning_rate": 0.0002, "loss": 0.3047, "step": 4611 }, { "epoch": 8.36454318748583, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1981, "step": 4612 }, { "epoch": 8.366356835184765, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1167, "step": 4613 }, { "epoch": 8.3681704828837, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.116, "step": 4614 }, { "epoch": 8.369984130582635, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1441, "step": 4615 }, { "epoch": 8.371797778281568, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1063, "step": 4616 }, { "epoch": 8.373611425980503, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1031, "step": 4617 }, { "epoch": 8.375425073679438, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1166, "step": 4618 }, { "epoch": 8.377238721378372, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1042, "step": 4619 }, { "epoch": 8.379052369077307, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0938, "step": 4620 }, { "epoch": 8.380866016776242, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0947, "step": 4621 }, { "epoch": 8.382679664475175, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1266, "step": 4622 }, { "epoch": 8.38449331217411, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0991, "step": 4623 }, { "epoch": 8.386306959873044, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1016, "step": 4624 }, { "epoch": 8.38812060757198, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1532, "step": 4625 }, { "epoch": 8.389934255270914, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1042, "step": 4626 }, { "epoch": 8.391747902969849, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1321, "step": 4627 }, { "epoch": 8.393561550668782, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0948, "step": 4628 }, { "epoch": 8.395375198367717, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0996, "step": 4629 }, { "epoch": 8.397188846066651, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1185, "step": 4630 }, { "epoch": 8.399002493765586, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1117, "step": 4631 }, { "epoch": 8.40081614146452, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1133, "step": 4632 }, { "epoch": 8.402629789163456, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1012, "step": 4633 }, { "epoch": 8.40444343686239, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1116, "step": 4634 }, { "epoch": 8.406257084561323, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1121, "step": 4635 }, { "epoch": 8.408070732260258, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0948, "step": 4636 }, { "epoch": 8.409884379959193, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0974, "step": 4637 }, { "epoch": 8.411698027658128, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1124, "step": 4638 }, { "epoch": 8.413511675357062, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1249, "step": 4639 }, { "epoch": 8.415325323055997, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1114, "step": 4640 }, { "epoch": 8.41713897075493, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1544, "step": 4641 }, { "epoch": 8.418952618453865, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0993, "step": 4642 }, { "epoch": 8.4207662661528, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1037, "step": 4643 }, { "epoch": 8.422579913851735, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1372, "step": 4644 }, { "epoch": 8.42439356155067, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1107, "step": 4645 }, { "epoch": 8.426207209249604, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1144, "step": 4646 }, { "epoch": 8.428020856948537, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1098, "step": 4647 }, { "epoch": 8.429834504647472, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1334, "step": 4648 }, { "epoch": 8.431648152346407, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1264, "step": 4649 }, { "epoch": 8.433461800045341, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1349, "step": 4650 }, { "epoch": 8.435275447744276, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1618, "step": 4651 }, { "epoch": 8.437089095443211, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.2306, "step": 4652 }, { "epoch": 8.438902743142144, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1375, "step": 4653 }, { "epoch": 8.440716390841079, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1974, "step": 4654 }, { "epoch": 8.442530038540013, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1616, "step": 4655 }, { "epoch": 8.444343686238948, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1546, "step": 4656 }, { "epoch": 8.446157333937883, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1615, "step": 4657 }, { "epoch": 8.447970981636818, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.1894, "step": 4658 }, { "epoch": 8.44978462933575, "grad_norm": 1.3046875, "learning_rate": 0.0002, "loss": 0.2118, "step": 4659 }, { "epoch": 8.451598277034686, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.1996, "step": 4660 }, { "epoch": 8.45341192473362, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.3369, "step": 4661 }, { "epoch": 8.455225572432555, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.137, "step": 4662 }, { "epoch": 8.45703922013149, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1434, "step": 4663 }, { "epoch": 8.458852867830425, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1182, "step": 4664 }, { "epoch": 8.460666515529358, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1216, "step": 4665 }, { "epoch": 8.462480163228292, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1269, "step": 4666 }, { "epoch": 8.464293810927227, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1178, "step": 4667 }, { "epoch": 8.466107458626162, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1113, "step": 4668 }, { "epoch": 8.467921106325097, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1211, "step": 4669 }, { "epoch": 8.469734754024032, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0986, "step": 4670 }, { "epoch": 8.471548401722965, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1109, "step": 4671 }, { "epoch": 8.4733620494219, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1099, "step": 4672 }, { "epoch": 8.475175697120834, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1025, "step": 4673 }, { "epoch": 8.476989344819769, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1177, "step": 4674 }, { "epoch": 8.478802992518704, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0935, "step": 4675 }, { "epoch": 8.478802992518704, "eval_loss": 2.2178797721862793, "eval_runtime": 189.8023, "eval_samples_per_second": 5.269, "eval_steps_per_second": 5.269, "step": 4675 }, { "epoch": 8.478802992518704, "mmlu_eval_accuracy": 0.31353095552782667, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.3076923076923077, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.3176470588235294, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.45454545454545453, "mmlu_eval_accuracy_virology": 0.16666666666666666, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 1.8772569160536174, "step": 4675 }, { "epoch": 8.480616640217638, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1113, "step": 4676 }, { "epoch": 8.482430287916571, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1019, "step": 4677 }, { "epoch": 8.484243935615506, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1328, "step": 4678 }, { "epoch": 8.486057583314441, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1083, "step": 4679 }, { "epoch": 8.487871231013376, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1001, "step": 4680 }, { "epoch": 8.48968487871231, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1011, "step": 4681 }, { "epoch": 8.491498526411245, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1049, "step": 4682 }, { "epoch": 8.493312174110178, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1044, "step": 4683 }, { "epoch": 8.495125821809113, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1028, "step": 4684 }, { "epoch": 8.496939469508048, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1177, "step": 4685 }, { "epoch": 8.498753117206983, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0983, "step": 4686 }, { "epoch": 8.500566764905917, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1236, "step": 4687 }, { "epoch": 8.502380412604852, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1311, "step": 4688 }, { "epoch": 8.504194060303785, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1095, "step": 4689 }, { "epoch": 8.50600770800272, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1093, "step": 4690 }, { "epoch": 8.507821355701655, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1244, "step": 4691 }, { "epoch": 8.50963500340059, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1018, "step": 4692 }, { "epoch": 8.511448651099524, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.109, "step": 4693 }, { "epoch": 8.513262298798459, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1324, "step": 4694 }, { "epoch": 8.515075946497394, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1186, "step": 4695 }, { "epoch": 8.516889594196327, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1257, "step": 4696 }, { "epoch": 8.518703241895262, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1148, "step": 4697 }, { "epoch": 8.520516889594196, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1266, "step": 4698 }, { "epoch": 8.522330537293131, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1408, "step": 4699 }, { "epoch": 8.524144184992066, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1423, "step": 4700 }, { "epoch": 8.525957832691, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1684, "step": 4701 }, { "epoch": 8.527771480389934, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1187, "step": 4702 }, { "epoch": 8.529585128088868, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1851, "step": 4703 }, { "epoch": 8.531398775787803, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.138, "step": 4704 }, { "epoch": 8.533212423486738, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1515, "step": 4705 }, { "epoch": 8.535026071185673, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1599, "step": 4706 }, { "epoch": 8.536839718884607, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1978, "step": 4707 }, { "epoch": 8.53865336658354, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1731, "step": 4708 }, { "epoch": 8.540467014282475, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.2024, "step": 4709 }, { "epoch": 8.54228066198141, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2021, "step": 4710 }, { "epoch": 8.544094309680345, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.2932, "step": 4711 }, { "epoch": 8.54590795737928, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1583, "step": 4712 }, { "epoch": 8.547721605078214, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.128, "step": 4713 }, { "epoch": 8.549535252777147, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1465, "step": 4714 }, { "epoch": 8.551348900476082, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1114, "step": 4715 }, { "epoch": 8.553162548175017, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.124, "step": 4716 }, { "epoch": 8.554976195873952, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1176, "step": 4717 }, { "epoch": 8.556789843572886, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1071, "step": 4718 }, { "epoch": 8.558603491271821, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.113, "step": 4719 }, { "epoch": 8.560417138970754, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1126, "step": 4720 }, { "epoch": 8.562230786669689, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1187, "step": 4721 }, { "epoch": 8.564044434368624, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1284, "step": 4722 }, { "epoch": 8.565858082067558, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1102, "step": 4723 }, { "epoch": 8.567671729766493, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1196, "step": 4724 }, { "epoch": 8.569485377465428, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1214, "step": 4725 }, { "epoch": 8.571299025164361, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1068, "step": 4726 }, { "epoch": 8.573112672863296, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0943, "step": 4727 }, { "epoch": 8.57492632056223, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1144, "step": 4728 }, { "epoch": 8.576739968261165, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1063, "step": 4729 }, { "epoch": 8.5785536159601, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0922, "step": 4730 }, { "epoch": 8.580367263659035, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1199, "step": 4731 }, { "epoch": 8.582180911357968, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1076, "step": 4732 }, { "epoch": 8.583994559056903, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1048, "step": 4733 }, { "epoch": 8.585808206755837, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1205, "step": 4734 }, { "epoch": 8.587621854454772, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1015, "step": 4735 }, { "epoch": 8.589435502153707, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1228, "step": 4736 }, { "epoch": 8.591249149852642, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1241, "step": 4737 }, { "epoch": 8.593062797551575, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1158, "step": 4738 }, { "epoch": 8.59487644525051, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1105, "step": 4739 }, { "epoch": 8.596690092949444, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1258, "step": 4740 }, { "epoch": 8.598503740648379, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1087, "step": 4741 }, { "epoch": 8.600317388347314, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1252, "step": 4742 }, { "epoch": 8.602131036046249, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.125, "step": 4743 }, { "epoch": 8.603944683745183, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1238, "step": 4744 }, { "epoch": 8.605758331444116, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1456, "step": 4745 }, { "epoch": 8.607571979143051, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1268, "step": 4746 }, { "epoch": 8.609385626841986, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1233, "step": 4747 }, { "epoch": 8.61119927454092, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1225, "step": 4748 }, { "epoch": 8.613012922239855, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.2282, "step": 4749 }, { "epoch": 8.614826569938788, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1414, "step": 4750 }, { "epoch": 8.616640217637723, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1593, "step": 4751 }, { "epoch": 8.618453865336658, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1682, "step": 4752 }, { "epoch": 8.620267513035593, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1654, "step": 4753 }, { "epoch": 8.622081160734528, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.184, "step": 4754 }, { "epoch": 8.623894808433462, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.158, "step": 4755 }, { "epoch": 8.625708456132397, "grad_norm": 1.359375, "learning_rate": 0.0002, "loss": 0.2024, "step": 4756 }, { "epoch": 8.62752210383133, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1642, "step": 4757 }, { "epoch": 8.629335751530265, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.2089, "step": 4758 }, { "epoch": 8.6311493992292, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1988, "step": 4759 }, { "epoch": 8.632963046928134, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.2849, "step": 4760 }, { "epoch": 8.63477669462707, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.2875, "step": 4761 }, { "epoch": 8.636590342326004, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.134, "step": 4762 }, { "epoch": 8.638403990024937, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1293, "step": 4763 }, { "epoch": 8.640217637723872, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1198, "step": 4764 }, { "epoch": 8.642031285422807, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1116, "step": 4765 }, { "epoch": 8.643844933121741, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1051, "step": 4766 }, { "epoch": 8.645658580820676, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1239, "step": 4767 }, { "epoch": 8.64747222851961, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1208, "step": 4768 }, { "epoch": 8.649285876218544, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1164, "step": 4769 }, { "epoch": 8.651099523917479, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1074, "step": 4770 }, { "epoch": 8.652913171616413, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1148, "step": 4771 }, { "epoch": 8.654726819315348, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1387, "step": 4772 }, { "epoch": 8.656540467014283, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1194, "step": 4773 }, { "epoch": 8.658354114713218, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1332, "step": 4774 }, { "epoch": 8.66016776241215, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1173, "step": 4775 }, { "epoch": 8.661981410111085, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1081, "step": 4776 }, { "epoch": 8.66379505781002, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1107, "step": 4777 }, { "epoch": 8.665608705508955, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1101, "step": 4778 }, { "epoch": 8.66742235320789, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0947, "step": 4779 }, { "epoch": 8.669236000906825, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1248, "step": 4780 }, { "epoch": 8.671049648605758, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0962, "step": 4781 }, { "epoch": 8.672863296304692, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.128, "step": 4782 }, { "epoch": 8.674676944003627, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0999, "step": 4783 }, { "epoch": 8.676490591702562, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1049, "step": 4784 }, { "epoch": 8.678304239401497, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1, "step": 4785 }, { "epoch": 8.680117887100431, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1367, "step": 4786 }, { "epoch": 8.681931534799364, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1216, "step": 4787 }, { "epoch": 8.6837451824983, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1021, "step": 4788 }, { "epoch": 8.685558830197234, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1356, "step": 4789 }, { "epoch": 8.687372477896169, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1335, "step": 4790 }, { "epoch": 8.689186125595104, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1101, "step": 4791 }, { "epoch": 8.690999773294038, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1238, "step": 4792 }, { "epoch": 8.692813420992973, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1288, "step": 4793 }, { "epoch": 8.694627068691906, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1257, "step": 4794 }, { "epoch": 8.69644071639084, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1392, "step": 4795 }, { "epoch": 8.698254364089776, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1501, "step": 4796 }, { "epoch": 8.70006801178871, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.124, "step": 4797 }, { "epoch": 8.701881659487645, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1239, "step": 4798 }, { "epoch": 8.703695307186578, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1472, "step": 4799 }, { "epoch": 8.705508954885513, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1388, "step": 4800 }, { "epoch": 8.707322602584448, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1591, "step": 4801 }, { "epoch": 8.709136250283382, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1493, "step": 4802 }, { "epoch": 8.710949897982317, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1354, "step": 4803 }, { "epoch": 8.712763545681252, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1781, "step": 4804 }, { "epoch": 8.714577193380187, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1558, "step": 4805 }, { "epoch": 8.71639084107912, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1538, "step": 4806 }, { "epoch": 8.718204488778055, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1619, "step": 4807 }, { "epoch": 8.72001813647699, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.1783, "step": 4808 }, { "epoch": 8.721831784175924, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.2006, "step": 4809 }, { "epoch": 8.723645431874859, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.2, "step": 4810 }, { "epoch": 8.725459079573792, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.3011, "step": 4811 }, { "epoch": 8.727272727272727, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1704, "step": 4812 }, { "epoch": 8.729086374971661, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1252, "step": 4813 }, { "epoch": 8.730900022670596, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1543, "step": 4814 }, { "epoch": 8.732713670369531, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1163, "step": 4815 }, { "epoch": 8.734527318068466, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1266, "step": 4816 }, { "epoch": 8.7363409657674, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1088, "step": 4817 }, { "epoch": 8.738154613466333, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1203, "step": 4818 }, { "epoch": 8.739968261165268, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1251, "step": 4819 }, { "epoch": 8.741781908864203, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.129, "step": 4820 }, { "epoch": 8.743595556563138, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1048, "step": 4821 }, { "epoch": 8.745409204262073, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1179, "step": 4822 }, { "epoch": 8.747222851961007, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1188, "step": 4823 }, { "epoch": 8.74903649965994, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1273, "step": 4824 }, { "epoch": 8.750850147358875, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1693, "step": 4825 }, { "epoch": 8.75266379505781, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1003, "step": 4826 }, { "epoch": 8.754477442756745, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1242, "step": 4827 }, { "epoch": 8.75629109045568, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1051, "step": 4828 }, { "epoch": 8.758104738154614, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1122, "step": 4829 }, { "epoch": 8.759918385853547, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1143, "step": 4830 }, { "epoch": 8.761732033552482, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1259, "step": 4831 }, { "epoch": 8.763545681251417, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1349, "step": 4832 }, { "epoch": 8.765359328950352, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1298, "step": 4833 }, { "epoch": 8.767172976649286, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1227, "step": 4834 }, { "epoch": 8.768986624348221, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1211, "step": 4835 }, { "epoch": 8.770800272047154, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1015, "step": 4836 }, { "epoch": 8.772613919746089, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.11, "step": 4837 }, { "epoch": 8.774427567445024, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1082, "step": 4838 }, { "epoch": 8.776241215143958, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1444, "step": 4839 }, { "epoch": 8.778054862842893, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1168, "step": 4840 }, { "epoch": 8.779868510541828, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1464, "step": 4841 }, { "epoch": 8.781682158240761, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1353, "step": 4842 }, { "epoch": 8.783495805939696, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1104, "step": 4843 }, { "epoch": 8.78530945363863, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1248, "step": 4844 }, { "epoch": 8.787123101337565, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1309, "step": 4845 }, { "epoch": 8.7889367490365, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1224, "step": 4846 }, { "epoch": 8.790750396735435, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1427, "step": 4847 }, { "epoch": 8.792564044434368, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1124, "step": 4848 }, { "epoch": 8.794377692133303, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1288, "step": 4849 }, { "epoch": 8.796191339832237, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1611, "step": 4850 }, { "epoch": 8.798004987531172, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1402, "step": 4851 }, { "epoch": 8.799818635230107, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1426, "step": 4852 }, { "epoch": 8.801632282929042, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.162, "step": 4853 }, { "epoch": 8.803445930627976, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1718, "step": 4854 }, { "epoch": 8.80525957832691, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1702, "step": 4855 }, { "epoch": 8.807073226025844, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1788, "step": 4856 }, { "epoch": 8.808886873724779, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1835, "step": 4857 }, { "epoch": 8.810700521423714, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1676, "step": 4858 }, { "epoch": 8.812514169122649, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1719, "step": 4859 }, { "epoch": 8.814327816821582, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2272, "step": 4860 }, { "epoch": 8.816141464520516, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.3016, "step": 4861 }, { "epoch": 8.817955112219451, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1957, "step": 4862 }, { "epoch": 8.817955112219451, "eval_loss": 2.1993396282196045, "eval_runtime": 189.8153, "eval_samples_per_second": 5.268, "eval_steps_per_second": 5.268, "step": 4862 }, { "epoch": 8.817955112219451, "mmlu_eval_accuracy": 0.3095585725823937, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.4444444444444444, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.2, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2647058823529412, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.43478260869565216, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.3684210526315789, "mmlu_loss": 2.146956494962523, "step": 4862 }, { "epoch": 8.819768759918386, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.151, "step": 4863 }, { "epoch": 8.82158240761732, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1125, "step": 4864 }, { "epoch": 8.823396055316255, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1119, "step": 4865 }, { "epoch": 8.82520970301519, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1283, "step": 4866 }, { "epoch": 8.827023350714123, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1365, "step": 4867 }, { "epoch": 8.828836998413058, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1238, "step": 4868 }, { "epoch": 8.830650646111993, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1293, "step": 4869 }, { "epoch": 8.832464293810927, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1137, "step": 4870 }, { "epoch": 8.834277941509862, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1097, "step": 4871 }, { "epoch": 8.836091589208797, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1139, "step": 4872 }, { "epoch": 8.83790523690773, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1075, "step": 4873 }, { "epoch": 8.839718884606665, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1615, "step": 4874 }, { "epoch": 8.8415325323056, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1198, "step": 4875 }, { "epoch": 8.843346180004534, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1267, "step": 4876 }, { "epoch": 8.845159827703469, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1118, "step": 4877 }, { "epoch": 8.846973475402404, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.102, "step": 4878 }, { "epoch": 8.848787123101337, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.109, "step": 4879 }, { "epoch": 8.850600770800272, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1136, "step": 4880 }, { "epoch": 8.852414418499206, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1036, "step": 4881 }, { "epoch": 8.854228066198141, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0994, "step": 4882 }, { "epoch": 8.856041713897076, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.115, "step": 4883 }, { "epoch": 8.85785536159601, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1217, "step": 4884 }, { "epoch": 8.859669009294944, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1135, "step": 4885 }, { "epoch": 8.861482656993878, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1147, "step": 4886 }, { "epoch": 8.863296304692813, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1214, "step": 4887 }, { "epoch": 8.865109952391748, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1739, "step": 4888 }, { "epoch": 8.866923600090683, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1402, "step": 4889 }, { "epoch": 8.868737247789618, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1209, "step": 4890 }, { "epoch": 8.87055089548855, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1191, "step": 4891 }, { "epoch": 8.872364543187485, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1044, "step": 4892 }, { "epoch": 8.87417819088642, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1263, "step": 4893 }, { "epoch": 8.875991838585355, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1363, "step": 4894 }, { "epoch": 8.87780548628429, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1151, "step": 4895 }, { "epoch": 8.879619133983224, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1346, "step": 4896 }, { "epoch": 8.881432781682157, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1354, "step": 4897 }, { "epoch": 8.883246429381092, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1408, "step": 4898 }, { "epoch": 8.885060077080027, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1646, "step": 4899 }, { "epoch": 8.886873724778962, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1532, "step": 4900 }, { "epoch": 8.888687372477897, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1432, "step": 4901 }, { "epoch": 8.890501020176831, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1551, "step": 4902 }, { "epoch": 8.892314667875764, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1513, "step": 4903 }, { "epoch": 8.894128315574699, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1659, "step": 4904 }, { "epoch": 8.895941963273634, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.1778, "step": 4905 }, { "epoch": 8.897755610972569, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1575, "step": 4906 }, { "epoch": 8.899569258671503, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1925, "step": 4907 }, { "epoch": 8.901382906370438, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1859, "step": 4908 }, { "epoch": 8.903196554069371, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1851, "step": 4909 }, { "epoch": 8.905010201768306, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.2551, "step": 4910 }, { "epoch": 8.90682384946724, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2936, "step": 4911 }, { "epoch": 8.908637497166175, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1556, "step": 4912 }, { "epoch": 8.91045114486511, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1593, "step": 4913 }, { "epoch": 8.912264792564045, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1454, "step": 4914 }, { "epoch": 8.91407844026298, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1084, "step": 4915 }, { "epoch": 8.915892087961913, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1337, "step": 4916 }, { "epoch": 8.917705735660848, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1233, "step": 4917 }, { "epoch": 8.919519383359782, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1367, "step": 4918 }, { "epoch": 8.921333031058717, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1134, "step": 4919 }, { "epoch": 8.923146678757652, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.132, "step": 4920 }, { "epoch": 8.924960326456585, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1264, "step": 4921 }, { "epoch": 8.92677397415552, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1188, "step": 4922 }, { "epoch": 8.928587621854454, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1352, "step": 4923 }, { "epoch": 8.93040126955339, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1204, "step": 4924 }, { "epoch": 8.932214917252324, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1296, "step": 4925 }, { "epoch": 8.934028564951259, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.125, "step": 4926 }, { "epoch": 8.935842212650194, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1293, "step": 4927 }, { "epoch": 8.937655860349127, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1121, "step": 4928 }, { "epoch": 8.939469508048061, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0987, "step": 4929 }, { "epoch": 8.941283155746996, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1212, "step": 4930 }, { "epoch": 8.94309680344593, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1217, "step": 4931 }, { "epoch": 8.944910451144866, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1368, "step": 4932 }, { "epoch": 8.9467240988438, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1396, "step": 4933 }, { "epoch": 8.948537746542733, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.105, "step": 4934 }, { "epoch": 8.950351394241668, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1263, "step": 4935 }, { "epoch": 8.952165041940603, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1138, "step": 4936 }, { "epoch": 8.953978689639538, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1067, "step": 4937 }, { "epoch": 8.955792337338472, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1106, "step": 4938 }, { "epoch": 8.957605985037407, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1094, "step": 4939 }, { "epoch": 8.95941963273634, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1401, "step": 4940 }, { "epoch": 8.961233280435275, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1479, "step": 4941 }, { "epoch": 8.96304692813421, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1258, "step": 4942 }, { "epoch": 8.964860575833145, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1251, "step": 4943 }, { "epoch": 8.96667422353208, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1437, "step": 4944 }, { "epoch": 8.968487871231014, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1161, "step": 4945 }, { "epoch": 8.970301518929947, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1363, "step": 4946 }, { "epoch": 8.972115166628882, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1235, "step": 4947 }, { "epoch": 8.973928814327817, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1377, "step": 4948 }, { "epoch": 8.975742462026751, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1571, "step": 4949 }, { "epoch": 8.977556109725686, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1266, "step": 4950 }, { "epoch": 8.979369757424621, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1378, "step": 4951 }, { "epoch": 8.981183405123554, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1873, "step": 4952 }, { "epoch": 8.982997052822489, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1655, "step": 4953 }, { "epoch": 8.984810700521423, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.1755, "step": 4954 }, { "epoch": 8.986624348220358, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1775, "step": 4955 }, { "epoch": 8.988437995919293, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1839, "step": 4956 }, { "epoch": 8.990251643618228, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2046, "step": 4957 }, { "epoch": 8.99206529131716, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.2174, "step": 4958 }, { "epoch": 8.993878939016096, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1946, "step": 4959 }, { "epoch": 8.99569258671503, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 0.2505, "step": 4960 }, { "epoch": 8.997506234413965, "grad_norm": 1.328125, "learning_rate": 0.0002, "loss": 0.3479, "step": 4961 }, { "epoch": 8.9993198821129, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1333, "step": 4962 }, { "epoch": 9.001133529811835, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1402, "step": 4963 }, { "epoch": 9.002947177510768, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0871, "step": 4964 }, { "epoch": 9.004760825209702, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0827, "step": 4965 }, { "epoch": 9.006574472908637, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0843, "step": 4966 }, { "epoch": 9.008388120607572, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0794, "step": 4967 }, { "epoch": 9.010201768306507, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0861, "step": 4968 }, { "epoch": 9.012015416005442, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0805, "step": 4969 }, { "epoch": 9.013829063704376, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0705, "step": 4970 }, { "epoch": 9.01564271140331, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.072, "step": 4971 }, { "epoch": 9.017456359102244, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0832, "step": 4972 }, { "epoch": 9.019270006801179, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0762, "step": 4973 }, { "epoch": 9.021083654500114, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.084, "step": 4974 }, { "epoch": 9.022897302199048, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0741, "step": 4975 }, { "epoch": 9.024710949897983, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0817, "step": 4976 }, { "epoch": 9.026524597596916, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0753, "step": 4977 }, { "epoch": 9.028338245295851, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0807, "step": 4978 }, { "epoch": 9.030151892994786, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0824, "step": 4979 }, { "epoch": 9.03196554069372, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0658, "step": 4980 }, { "epoch": 9.033779188392655, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0733, "step": 4981 }, { "epoch": 9.03559283609159, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0813, "step": 4982 }, { "epoch": 9.037406483790523, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.084, "step": 4983 }, { "epoch": 9.039220131489458, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0865, "step": 4984 }, { "epoch": 9.041033779188393, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0827, "step": 4985 }, { "epoch": 9.042847426887327, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0831, "step": 4986 }, { "epoch": 9.044661074586262, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.078, "step": 4987 }, { "epoch": 9.046474722285197, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1081, "step": 4988 }, { "epoch": 9.04828836998413, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0849, "step": 4989 }, { "epoch": 9.050102017683065, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0928, "step": 4990 }, { "epoch": 9.051915665382, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0772, "step": 4991 }, { "epoch": 9.053729313080934, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.0703, "step": 4992 }, { "epoch": 9.055542960779869, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0787, "step": 4993 }, { "epoch": 9.057356608478804, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0923, "step": 4994 }, { "epoch": 9.059170256177737, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0961, "step": 4995 }, { "epoch": 9.060983903876672, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0962, "step": 4996 }, { "epoch": 9.062797551575606, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0983, "step": 4997 }, { "epoch": 9.064611199274541, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1083, "step": 4998 }, { "epoch": 9.066424846973476, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1013, "step": 4999 }, { "epoch": 9.06823849467241, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0995, "step": 5000 }, { "epoch": 9.070052142371344, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1106, "step": 5001 }, { "epoch": 9.071865790070278, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1088, "step": 5002 }, { "epoch": 9.073679437769213, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1812, "step": 5003 }, { "epoch": 9.075493085468148, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0915, "step": 5004 }, { "epoch": 9.077306733167083, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1275, "step": 5005 }, { "epoch": 9.079120380866017, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1205, "step": 5006 }, { "epoch": 9.08093402856495, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1351, "step": 5007 }, { "epoch": 9.082747676263885, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1573, "step": 5008 }, { "epoch": 9.08456132396282, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1569, "step": 5009 }, { "epoch": 9.086374971661755, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1573, "step": 5010 }, { "epoch": 9.08818861936069, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1485, "step": 5011 }, { "epoch": 9.090002267059624, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.2451, "step": 5012 }, { "epoch": 9.091815914758557, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1688, "step": 5013 }, { "epoch": 9.093629562457492, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0914, "step": 5014 }, { "epoch": 9.095443210156427, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0976, "step": 5015 }, { "epoch": 9.097256857855362, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0964, "step": 5016 }, { "epoch": 9.099070505554296, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0844, "step": 5017 }, { "epoch": 9.100884153253231, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0767, "step": 5018 }, { "epoch": 9.102697800952164, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1127, "step": 5019 }, { "epoch": 9.104511448651099, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0797, "step": 5020 }, { "epoch": 9.106325096350034, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0729, "step": 5021 }, { "epoch": 9.108138744048969, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0923, "step": 5022 }, { "epoch": 9.109952391747903, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0954, "step": 5023 }, { "epoch": 9.111766039446838, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0859, "step": 5024 }, { "epoch": 9.113579687145773, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0832, "step": 5025 }, { "epoch": 9.115393334844706, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0979, "step": 5026 }, { "epoch": 9.11720698254364, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0811, "step": 5027 }, { "epoch": 9.119020630242575, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0877, "step": 5028 }, { "epoch": 9.12083427794151, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0774, "step": 5029 }, { "epoch": 9.122647925640445, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0847, "step": 5030 }, { "epoch": 9.12446157333938, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0747, "step": 5031 }, { "epoch": 9.126275221038313, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0812, "step": 5032 }, { "epoch": 9.128088868737247, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0961, "step": 5033 }, { "epoch": 9.129902516436182, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0804, "step": 5034 }, { "epoch": 9.131716164135117, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0764, "step": 5035 }, { "epoch": 9.133529811834052, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0847, "step": 5036 }, { "epoch": 9.135343459532987, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0856, "step": 5037 }, { "epoch": 9.13715710723192, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1032, "step": 5038 }, { "epoch": 9.138970754930854, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0854, "step": 5039 }, { "epoch": 9.140784402629789, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0921, "step": 5040 }, { "epoch": 9.142598050328724, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1009, "step": 5041 }, { "epoch": 9.144411698027659, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0932, "step": 5042 }, { "epoch": 9.146225345726593, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1028, "step": 5043 }, { "epoch": 9.148038993425526, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0957, "step": 5044 }, { "epoch": 9.149852641124461, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1294, "step": 5045 }, { "epoch": 9.151666288823396, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0974, "step": 5046 }, { "epoch": 9.15347993652233, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1208, "step": 5047 }, { "epoch": 9.155293584221265, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1129, "step": 5048 }, { "epoch": 9.1571072319202, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1017, "step": 5049 }, { "epoch": 9.1571072319202, "eval_loss": 2.1894032955169678, "eval_runtime": 185.2699, "eval_samples_per_second": 5.398, "eval_steps_per_second": 5.398, "step": 5049 }, { "epoch": 9.1571072319202, "mmlu_eval_accuracy": 0.29299080719861853, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.5142857142857142, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.2222222222222222, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.15789473684210525, "mmlu_loss": 2.134528558601514, "step": 5049 }, { "epoch": 9.158920879619133, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1155, "step": 5050 }, { "epoch": 9.160734527318068, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1308, "step": 5051 }, { "epoch": 9.162548175017003, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1399, "step": 5052 }, { "epoch": 9.164361822715938, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1352, "step": 5053 }, { "epoch": 9.166175470414872, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1516, "step": 5054 }, { "epoch": 9.167989118113807, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1381, "step": 5055 }, { "epoch": 9.16980276581274, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1529, "step": 5056 }, { "epoch": 9.171616413511675, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1279, "step": 5057 }, { "epoch": 9.17343006121061, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.1458, "step": 5058 }, { "epoch": 9.175243708909544, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1488, "step": 5059 }, { "epoch": 9.17705735660848, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.1814, "step": 5060 }, { "epoch": 9.178871004307414, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1788, "step": 5061 }, { "epoch": 9.180684652006347, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2454, "step": 5062 }, { "epoch": 9.182498299705282, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.2319, "step": 5063 }, { "epoch": 9.184311947404217, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0992, "step": 5064 }, { "epoch": 9.186125595103151, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0966, "step": 5065 }, { "epoch": 9.187939242802086, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0985, "step": 5066 }, { "epoch": 9.18975289050102, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0903, "step": 5067 }, { "epoch": 9.191566538199954, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0976, "step": 5068 }, { "epoch": 9.193380185898889, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1007, "step": 5069 }, { "epoch": 9.195193833597823, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.087, "step": 5070 }, { "epoch": 9.197007481296758, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0892, "step": 5071 }, { "epoch": 9.198821128995693, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0836, "step": 5072 }, { "epoch": 9.200634776694628, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0895, "step": 5073 }, { "epoch": 9.20244842439356, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0836, "step": 5074 }, { "epoch": 9.204262072092495, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1065, "step": 5075 }, { "epoch": 9.20607571979143, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0899, "step": 5076 }, { "epoch": 9.207889367490365, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0817, "step": 5077 }, { "epoch": 9.2097030151893, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0946, "step": 5078 }, { "epoch": 9.211516662888235, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0807, "step": 5079 }, { "epoch": 9.213330310587168, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0827, "step": 5080 }, { "epoch": 9.215143958286102, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0939, "step": 5081 }, { "epoch": 9.216957605985037, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0764, "step": 5082 }, { "epoch": 9.218771253683972, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0933, "step": 5083 }, { "epoch": 9.220584901382907, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0849, "step": 5084 }, { "epoch": 9.222398549081841, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0923, "step": 5085 }, { "epoch": 9.224212196780776, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.094, "step": 5086 }, { "epoch": 9.22602584447971, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.088, "step": 5087 }, { "epoch": 9.227839492178644, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0987, "step": 5088 }, { "epoch": 9.229653139877579, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0824, "step": 5089 }, { "epoch": 9.231466787576514, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1185, "step": 5090 }, { "epoch": 9.233280435275448, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0943, "step": 5091 }, { "epoch": 9.235094082974383, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.093, "step": 5092 }, { "epoch": 9.236907730673316, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1287, "step": 5093 }, { "epoch": 9.23872137837225, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0968, "step": 5094 }, { "epoch": 9.240535026071186, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1009, "step": 5095 }, { "epoch": 9.24234867377012, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.096, "step": 5096 }, { "epoch": 9.244162321469055, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1055, "step": 5097 }, { "epoch": 9.24597596916799, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.12, "step": 5098 }, { "epoch": 9.247789616866923, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1107, "step": 5099 }, { "epoch": 9.249603264565858, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1275, "step": 5100 }, { "epoch": 9.251416912264792, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1048, "step": 5101 }, { "epoch": 9.253230559963727, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1293, "step": 5102 }, { "epoch": 9.255044207662662, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1437, "step": 5103 }, { "epoch": 9.256857855361597, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.138, "step": 5104 }, { "epoch": 9.25867150306053, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1387, "step": 5105 }, { "epoch": 9.260485150759465, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1407, "step": 5106 }, { "epoch": 9.2622987984584, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.1702, "step": 5107 }, { "epoch": 9.264112446157334, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1374, "step": 5108 }, { "epoch": 9.265926093856269, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.1549, "step": 5109 }, { "epoch": 9.267739741555204, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1875, "step": 5110 }, { "epoch": 9.269553389254137, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1931, "step": 5111 }, { "epoch": 9.271367036953071, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1835, "step": 5112 }, { "epoch": 9.273180684652006, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2057, "step": 5113 }, { "epoch": 9.274994332350941, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1053, "step": 5114 }, { "epoch": 9.276807980049876, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0959, "step": 5115 }, { "epoch": 9.27862162774881, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1057, "step": 5116 }, { "epoch": 9.280435275447743, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0938, "step": 5117 }, { "epoch": 9.282248923146678, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0906, "step": 5118 }, { "epoch": 9.284062570845613, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0904, "step": 5119 }, { "epoch": 9.285876218544548, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0926, "step": 5120 }, { "epoch": 9.287689866243483, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1028, "step": 5121 }, { "epoch": 9.289503513942417, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.088, "step": 5122 }, { "epoch": 9.29131716164135, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0865, "step": 5123 }, { "epoch": 9.293130809340285, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0905, "step": 5124 }, { "epoch": 9.29494445703922, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1018, "step": 5125 }, { "epoch": 9.296758104738155, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0863, "step": 5126 }, { "epoch": 9.29857175243709, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0832, "step": 5127 }, { "epoch": 9.300385400136024, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.088, "step": 5128 }, { "epoch": 9.302199047834957, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0995, "step": 5129 }, { "epoch": 9.304012695533892, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0789, "step": 5130 }, { "epoch": 9.305826343232827, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0878, "step": 5131 }, { "epoch": 9.307639990931762, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.082, "step": 5132 }, { "epoch": 9.309453638630696, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.104, "step": 5133 }, { "epoch": 9.311267286329631, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1025, "step": 5134 }, { "epoch": 9.313080934028566, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1042, "step": 5135 }, { "epoch": 9.314894581727499, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0957, "step": 5136 }, { "epoch": 9.316708229426434, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.083, "step": 5137 }, { "epoch": 9.318521877125368, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0978, "step": 5138 }, { "epoch": 9.320335524824303, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1191, "step": 5139 }, { "epoch": 9.322149172523238, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1117, "step": 5140 }, { "epoch": 9.323962820222171, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1079, "step": 5141 }, { "epoch": 9.325776467921106, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0974, "step": 5142 }, { "epoch": 9.32759011562004, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1011, "step": 5143 }, { "epoch": 9.329403763318975, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0906, "step": 5144 }, { "epoch": 9.33121741101791, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1024, "step": 5145 }, { "epoch": 9.333031058716845, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.092, "step": 5146 }, { "epoch": 9.33484470641578, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1086, "step": 5147 }, { "epoch": 9.336658354114713, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1165, "step": 5148 }, { "epoch": 9.338472001813647, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1407, "step": 5149 }, { "epoch": 9.340285649512582, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1054, "step": 5150 }, { "epoch": 9.342099297211517, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1114, "step": 5151 }, { "epoch": 9.343912944910452, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1293, "step": 5152 }, { "epoch": 9.345726592609386, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1323, "step": 5153 }, { "epoch": 9.34754024030832, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1369, "step": 5154 }, { "epoch": 9.349353888007254, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1318, "step": 5155 }, { "epoch": 9.351167535706189, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.137, "step": 5156 }, { "epoch": 9.352981183405124, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1319, "step": 5157 }, { "epoch": 9.354794831104059, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1437, "step": 5158 }, { "epoch": 9.356608478802993, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1427, "step": 5159 }, { "epoch": 9.358422126501926, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1672, "step": 5160 }, { "epoch": 9.360235774200861, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2073, "step": 5161 }, { "epoch": 9.362049421899796, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.2671, "step": 5162 }, { "epoch": 9.36386306959873, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1808, "step": 5163 }, { "epoch": 9.365676717297665, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0933, "step": 5164 }, { "epoch": 9.3674903649966, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.113, "step": 5165 }, { "epoch": 9.369304012695533, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0876, "step": 5166 }, { "epoch": 9.371117660394468, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1029, "step": 5167 }, { "epoch": 9.372931308093403, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0901, "step": 5168 }, { "epoch": 9.374744955792337, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.09, "step": 5169 }, { "epoch": 9.376558603491272, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1012, "step": 5170 }, { "epoch": 9.378372251190207, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0886, "step": 5171 }, { "epoch": 9.38018589888914, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1062, "step": 5172 }, { "epoch": 9.381999546588075, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0948, "step": 5173 }, { "epoch": 9.38381319428701, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1031, "step": 5174 }, { "epoch": 9.385626841985944, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0883, "step": 5175 }, { "epoch": 9.387440489684879, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1044, "step": 5176 }, { "epoch": 9.389254137383814, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0898, "step": 5177 }, { "epoch": 9.391067785082747, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0954, "step": 5178 }, { "epoch": 9.392881432781682, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.09, "step": 5179 }, { "epoch": 9.394695080480616, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0866, "step": 5180 }, { "epoch": 9.396508728179551, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0844, "step": 5181 }, { "epoch": 9.398322375878486, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0962, "step": 5182 }, { "epoch": 9.40013602357742, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0948, "step": 5183 }, { "epoch": 9.401949671276354, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1001, "step": 5184 }, { "epoch": 9.403763318975288, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0976, "step": 5185 }, { "epoch": 9.405576966674223, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0966, "step": 5186 }, { "epoch": 9.407390614373158, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0951, "step": 5187 }, { "epoch": 9.409204262072093, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1047, "step": 5188 }, { "epoch": 9.411017909771028, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1044, "step": 5189 }, { "epoch": 9.41283155746996, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1042, "step": 5190 }, { "epoch": 9.414645205168895, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0846, "step": 5191 }, { "epoch": 9.41645885286783, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1058, "step": 5192 }, { "epoch": 9.418272500566765, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0903, "step": 5193 }, { "epoch": 9.4200861482657, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0966, "step": 5194 }, { "epoch": 9.421899795964634, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1045, "step": 5195 }, { "epoch": 9.42371344366357, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1213, "step": 5196 }, { "epoch": 9.425527091362502, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1121, "step": 5197 }, { "epoch": 9.427340739061437, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1083, "step": 5198 }, { "epoch": 9.429154386760372, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1031, "step": 5199 }, { "epoch": 9.430968034459307, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1223, "step": 5200 }, { "epoch": 9.432781682158241, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1196, "step": 5201 }, { "epoch": 9.434595329857176, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1203, "step": 5202 }, { "epoch": 9.436408977556109, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1391, "step": 5203 }, { "epoch": 9.438222625255044, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1403, "step": 5204 }, { "epoch": 9.440036272953979, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1277, "step": 5205 }, { "epoch": 9.441849920652913, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1513, "step": 5206 }, { "epoch": 9.443663568351848, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.1635, "step": 5207 }, { "epoch": 9.445477216050783, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.1594, "step": 5208 }, { "epoch": 9.447290863749716, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1939, "step": 5209 }, { "epoch": 9.44910451144865, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1619, "step": 5210 }, { "epoch": 9.450918159147585, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1705, "step": 5211 }, { "epoch": 9.45273180684652, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.242, "step": 5212 }, { "epoch": 9.454545454545455, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.224, "step": 5213 }, { "epoch": 9.45635910224439, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0954, "step": 5214 }, { "epoch": 9.458172749943323, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0952, "step": 5215 }, { "epoch": 9.459986397642258, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.09, "step": 5216 }, { "epoch": 9.461800045341192, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0983, "step": 5217 }, { "epoch": 9.463613693040127, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1031, "step": 5218 }, { "epoch": 9.465427340739062, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0923, "step": 5219 }, { "epoch": 9.467240988437997, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0906, "step": 5220 }, { "epoch": 9.46905463613693, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0887, "step": 5221 }, { "epoch": 9.470868283835864, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1035, "step": 5222 }, { "epoch": 9.4726819315348, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0945, "step": 5223 }, { "epoch": 9.474495579233734, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.094, "step": 5224 }, { "epoch": 9.476309226932669, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0903, "step": 5225 }, { "epoch": 9.478122874631604, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0993, "step": 5226 }, { "epoch": 9.479936522330537, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1145, "step": 5227 }, { "epoch": 9.481750170029471, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0948, "step": 5228 }, { "epoch": 9.483563817728406, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.093, "step": 5229 }, { "epoch": 9.48537746542734, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1008, "step": 5230 }, { "epoch": 9.487191113126276, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0985, "step": 5231 }, { "epoch": 9.48900476082521, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0888, "step": 5232 }, { "epoch": 9.490818408524143, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1075, "step": 5233 }, { "epoch": 9.492632056223078, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0775, "step": 5234 }, { "epoch": 9.494445703922013, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0908, "step": 5235 }, { "epoch": 9.496259351620948, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1121, "step": 5236 }, { "epoch": 9.496259351620948, "eval_loss": 2.2151801586151123, "eval_runtime": 186.0264, "eval_samples_per_second": 5.376, "eval_steps_per_second": 5.376, "step": 5236 }, { "epoch": 9.496259351620948, "mmlu_eval_accuracy": 0.31320476233938155, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.1724137931034483, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.53125, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.45454545454545453, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2647058823529412, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3333333333333333, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.201744149881278, "step": 5236 }, { "epoch": 9.498072999319882, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0827, "step": 5237 }, { "epoch": 9.499886647018817, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0962, "step": 5238 }, { "epoch": 9.50170029471775, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0882, "step": 5239 }, { "epoch": 9.503513942416685, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1158, "step": 5240 }, { "epoch": 9.50532759011562, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1125, "step": 5241 }, { "epoch": 9.507141237814555, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1025, "step": 5242 }, { "epoch": 9.50895488551349, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1003, "step": 5243 }, { "epoch": 9.510768533212424, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.115, "step": 5244 }, { "epoch": 9.512582180911359, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1101, "step": 5245 }, { "epoch": 9.514395828610292, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.118, "step": 5246 }, { "epoch": 9.516209476309227, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1041, "step": 5247 }, { "epoch": 9.518023124008161, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1122, "step": 5248 }, { "epoch": 9.519836771707096, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1051, "step": 5249 }, { "epoch": 9.521650419406031, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1113, "step": 5250 }, { "epoch": 9.523464067104964, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1107, "step": 5251 }, { "epoch": 9.525277714803899, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1365, "step": 5252 }, { "epoch": 9.527091362502834, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1163, "step": 5253 }, { "epoch": 9.528905010201768, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1346, "step": 5254 }, { "epoch": 9.530718657900703, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1347, "step": 5255 }, { "epoch": 9.532532305599638, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1276, "step": 5256 }, { "epoch": 9.534345953298573, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1472, "step": 5257 }, { "epoch": 9.536159600997506, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1624, "step": 5258 }, { "epoch": 9.53797324869644, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1662, "step": 5259 }, { "epoch": 9.539786896395375, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1545, "step": 5260 }, { "epoch": 9.54160054409431, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1852, "step": 5261 }, { "epoch": 9.543414191793245, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.2868, "step": 5262 }, { "epoch": 9.54522783949218, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.3444, "step": 5263 }, { "epoch": 9.547041487191112, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1065, "step": 5264 }, { "epoch": 9.548855134890047, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1018, "step": 5265 }, { "epoch": 9.550668782588982, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1144, "step": 5266 }, { "epoch": 9.552482430287917, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1047, "step": 5267 }, { "epoch": 9.554296077986852, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0972, "step": 5268 }, { "epoch": 9.556109725685786, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1055, "step": 5269 }, { "epoch": 9.55792337338472, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0914, "step": 5270 }, { "epoch": 9.559737021083654, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1051, "step": 5271 }, { "epoch": 9.561550668782589, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1105, "step": 5272 }, { "epoch": 9.563364316481524, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0925, "step": 5273 }, { "epoch": 9.565177964180458, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0947, "step": 5274 }, { "epoch": 9.566991611879393, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1157, "step": 5275 }, { "epoch": 9.568805259578326, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0972, "step": 5276 }, { "epoch": 9.570618907277261, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1162, "step": 5277 }, { "epoch": 9.572432554976196, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0889, "step": 5278 }, { "epoch": 9.57424620267513, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0949, "step": 5279 }, { "epoch": 9.576059850374065, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0918, "step": 5280 }, { "epoch": 9.577873498073, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.111, "step": 5281 }, { "epoch": 9.579687145771933, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1022, "step": 5282 }, { "epoch": 9.581500793470868, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0843, "step": 5283 }, { "epoch": 9.583314441169803, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1039, "step": 5284 }, { "epoch": 9.585128088868737, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0842, "step": 5285 }, { "epoch": 9.586941736567672, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0987, "step": 5286 }, { "epoch": 9.588755384266607, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0941, "step": 5287 }, { "epoch": 9.59056903196554, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1001, "step": 5288 }, { "epoch": 9.592382679664475, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0937, "step": 5289 }, { "epoch": 9.59419632736341, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1084, "step": 5290 }, { "epoch": 9.596009975062344, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1242, "step": 5291 }, { "epoch": 9.597823622761279, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1047, "step": 5292 }, { "epoch": 9.599637270460214, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0959, "step": 5293 }, { "epoch": 9.601450918159149, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.17, "step": 5294 }, { "epoch": 9.603264565858082, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1057, "step": 5295 }, { "epoch": 9.605078213557016, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1063, "step": 5296 }, { "epoch": 9.606891861255951, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1172, "step": 5297 }, { "epoch": 9.608705508954886, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0997, "step": 5298 }, { "epoch": 9.61051915665382, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1267, "step": 5299 }, { "epoch": 9.612332804352754, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1068, "step": 5300 }, { "epoch": 9.614146452051688, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.123, "step": 5301 }, { "epoch": 9.615960099750623, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1132, "step": 5302 }, { "epoch": 9.617773747449558, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.114, "step": 5303 }, { "epoch": 9.619587395148493, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.122, "step": 5304 }, { "epoch": 9.621401042847427, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1257, "step": 5305 }, { "epoch": 9.623214690546362, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1543, "step": 5306 }, { "epoch": 9.625028338245295, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.1793, "step": 5307 }, { "epoch": 9.62684198594423, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.149, "step": 5308 }, { "epoch": 9.628655633643165, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1624, "step": 5309 }, { "epoch": 9.6304692813421, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.188, "step": 5310 }, { "epoch": 9.632282929041034, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.2352, "step": 5311 }, { "epoch": 9.634096576739967, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.2937, "step": 5312 }, { "epoch": 9.635910224438902, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.2034, "step": 5313 }, { "epoch": 9.637723872137837, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1108, "step": 5314 }, { "epoch": 9.639537519836772, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.126, "step": 5315 }, { "epoch": 9.641351167535706, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0978, "step": 5316 }, { "epoch": 9.643164815234641, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1019, "step": 5317 }, { "epoch": 9.644978462933576, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0898, "step": 5318 }, { "epoch": 9.646792110632509, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1149, "step": 5319 }, { "epoch": 9.648605758331444, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1059, "step": 5320 }, { "epoch": 9.650419406030379, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.133, "step": 5321 }, { "epoch": 9.652233053729313, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1004, "step": 5322 }, { "epoch": 9.654046701428248, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1016, "step": 5323 }, { "epoch": 9.655860349127183, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1366, "step": 5324 }, { "epoch": 9.657673996826116, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0999, "step": 5325 }, { "epoch": 9.65948764452505, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1448, "step": 5326 }, { "epoch": 9.661301292223985, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1036, "step": 5327 }, { "epoch": 9.66311493992292, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0917, "step": 5328 }, { "epoch": 9.664928587621855, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1051, "step": 5329 }, { "epoch": 9.66674223532079, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1064, "step": 5330 }, { "epoch": 9.668555883019723, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1022, "step": 5331 }, { "epoch": 9.670369530718657, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.089, "step": 5332 }, { "epoch": 9.672183178417592, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1189, "step": 5333 }, { "epoch": 9.673996826116527, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0973, "step": 5334 }, { "epoch": 9.675810473815462, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1052, "step": 5335 }, { "epoch": 9.677624121514397, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1081, "step": 5336 }, { "epoch": 9.67943776921333, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1001, "step": 5337 }, { "epoch": 9.681251416912264, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1013, "step": 5338 }, { "epoch": 9.683065064611199, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0977, "step": 5339 }, { "epoch": 9.684878712310134, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1097, "step": 5340 }, { "epoch": 9.686692360009069, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1076, "step": 5341 }, { "epoch": 9.688506007708003, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0924, "step": 5342 }, { "epoch": 9.690319655406936, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.107, "step": 5343 }, { "epoch": 9.692133303105871, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1161, "step": 5344 }, { "epoch": 9.693946950804806, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1075, "step": 5345 }, { "epoch": 9.69576059850374, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1406, "step": 5346 }, { "epoch": 9.697574246202675, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1101, "step": 5347 }, { "epoch": 9.69938789390161, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0965, "step": 5348 }, { "epoch": 9.701201541600543, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1309, "step": 5349 }, { "epoch": 9.703015189299478, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1178, "step": 5350 }, { "epoch": 9.704828836998413, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1231, "step": 5351 }, { "epoch": 9.706642484697348, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1383, "step": 5352 }, { "epoch": 9.708456132396282, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1254, "step": 5353 }, { "epoch": 9.710269780095217, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1406, "step": 5354 }, { "epoch": 9.712083427794152, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1556, "step": 5355 }, { "epoch": 9.713897075493085, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1552, "step": 5356 }, { "epoch": 9.71571072319202, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1628, "step": 5357 }, { "epoch": 9.717524370890954, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.1744, "step": 5358 }, { "epoch": 9.71933801858989, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.1946, "step": 5359 }, { "epoch": 9.721151666288824, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1756, "step": 5360 }, { "epoch": 9.722965313987757, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1817, "step": 5361 }, { "epoch": 9.724778961686692, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.2548, "step": 5362 }, { "epoch": 9.726592609385627, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.2304, "step": 5363 }, { "epoch": 9.728406257084561, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1032, "step": 5364 }, { "epoch": 9.730219904783496, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1012, "step": 5365 }, { "epoch": 9.73203355248243, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1208, "step": 5366 }, { "epoch": 9.733847200181366, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1028, "step": 5367 }, { "epoch": 9.735660847880299, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0995, "step": 5368 }, { "epoch": 9.737474495579233, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1002, "step": 5369 }, { "epoch": 9.739288143278168, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1033, "step": 5370 }, { "epoch": 9.741101790977103, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1141, "step": 5371 }, { "epoch": 9.742915438676038, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1042, "step": 5372 }, { "epoch": 9.744729086374972, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1002, "step": 5373 }, { "epoch": 9.746542734073905, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1144, "step": 5374 }, { "epoch": 9.74835638177284, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1151, "step": 5375 }, { "epoch": 9.750170029471775, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1474, "step": 5376 }, { "epoch": 9.75198367717071, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.097, "step": 5377 }, { "epoch": 9.753797324869645, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0925, "step": 5378 }, { "epoch": 9.75561097256858, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1166, "step": 5379 }, { "epoch": 9.757424620267512, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1122, "step": 5380 }, { "epoch": 9.759238267966447, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.111, "step": 5381 }, { "epoch": 9.761051915665382, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1083, "step": 5382 }, { "epoch": 9.762865563364317, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1022, "step": 5383 }, { "epoch": 9.764679211063251, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0929, "step": 5384 }, { "epoch": 9.766492858762186, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1122, "step": 5385 }, { "epoch": 9.76830650646112, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1095, "step": 5386 }, { "epoch": 9.770120154160054, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1036, "step": 5387 }, { "epoch": 9.771933801858989, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1018, "step": 5388 }, { "epoch": 9.773747449557924, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1054, "step": 5389 }, { "epoch": 9.775561097256858, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.102, "step": 5390 }, { "epoch": 9.777374744955793, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1021, "step": 5391 }, { "epoch": 9.779188392654726, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.1002, "step": 5392 }, { "epoch": 9.78100204035366, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1095, "step": 5393 }, { "epoch": 9.782815688052596, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1109, "step": 5394 }, { "epoch": 9.78462933575153, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1102, "step": 5395 }, { "epoch": 9.786442983450465, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1272, "step": 5396 }, { "epoch": 9.7882566311494, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1046, "step": 5397 }, { "epoch": 9.790070278848333, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1234, "step": 5398 }, { "epoch": 9.791883926547268, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.14, "step": 5399 }, { "epoch": 9.793697574246202, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1159, "step": 5400 }, { "epoch": 9.795511221945137, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2006, "step": 5401 }, { "epoch": 9.797324869644072, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1389, "step": 5402 }, { "epoch": 9.799138517343007, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1553, "step": 5403 }, { "epoch": 9.80095216504194, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1324, "step": 5404 }, { "epoch": 9.802765812740875, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.158, "step": 5405 }, { "epoch": 9.80457946043981, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1538, "step": 5406 }, { "epoch": 9.806393108138744, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.1659, "step": 5407 }, { "epoch": 9.808206755837679, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1435, "step": 5408 }, { "epoch": 9.810020403536614, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1648, "step": 5409 }, { "epoch": 9.811834051235547, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.2093, "step": 5410 }, { "epoch": 9.813647698934481, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1931, "step": 5411 }, { "epoch": 9.815461346633416, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1975, "step": 5412 }, { "epoch": 9.817274994332351, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.2146, "step": 5413 }, { "epoch": 9.819088642031286, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1161, "step": 5414 }, { "epoch": 9.82090228973022, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1258, "step": 5415 }, { "epoch": 9.822715937429155, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1041, "step": 5416 }, { "epoch": 9.824529585128088, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1201, "step": 5417 }, { "epoch": 9.826343232827023, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0968, "step": 5418 }, { "epoch": 9.828156880525958, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1184, "step": 5419 }, { "epoch": 9.829970528224893, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1252, "step": 5420 }, { "epoch": 9.831784175923827, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1097, "step": 5421 }, { "epoch": 9.83359782362276, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.125, "step": 5422 }, { "epoch": 9.835411471321695, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1047, "step": 5423 }, { "epoch": 9.835411471321695, "eval_loss": 2.2571909427642822, "eval_runtime": 185.9446, "eval_samples_per_second": 5.378, "eval_steps_per_second": 5.378, "step": 5423 }, { "epoch": 9.835411471321695, "mmlu_eval_accuracy": 0.3041783540601528, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.26666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.5428571428571428, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.0039477425899324, "step": 5423 }, { "epoch": 9.83722511902063, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1045, "step": 5424 }, { "epoch": 9.839038766719565, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1153, "step": 5425 }, { "epoch": 9.8408524144185, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1136, "step": 5426 }, { "epoch": 9.842666062117434, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1085, "step": 5427 }, { "epoch": 9.844479709816369, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0997, "step": 5428 }, { "epoch": 9.846293357515302, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.099, "step": 5429 }, { "epoch": 9.848107005214237, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1211, "step": 5430 }, { "epoch": 9.849920652913172, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1009, "step": 5431 }, { "epoch": 9.851734300612106, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1105, "step": 5432 }, { "epoch": 9.853547948311041, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1063, "step": 5433 }, { "epoch": 9.855361596009976, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1272, "step": 5434 }, { "epoch": 9.857175243708909, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0844, "step": 5435 }, { "epoch": 9.858988891407844, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1127, "step": 5436 }, { "epoch": 9.860802539106778, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1025, "step": 5437 }, { "epoch": 9.862616186805713, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1126, "step": 5438 }, { "epoch": 9.864429834504648, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1095, "step": 5439 }, { "epoch": 9.866243482203583, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1045, "step": 5440 }, { "epoch": 9.868057129902516, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1136, "step": 5441 }, { "epoch": 9.86987077760145, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1193, "step": 5442 }, { "epoch": 9.871684425300385, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1234, "step": 5443 }, { "epoch": 9.87349807299932, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1347, "step": 5444 }, { "epoch": 9.875311720698255, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.145, "step": 5445 }, { "epoch": 9.87712536839719, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1132, "step": 5446 }, { "epoch": 9.878939016096123, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1228, "step": 5447 }, { "epoch": 9.880752663795057, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1288, "step": 5448 }, { "epoch": 9.882566311493992, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.117, "step": 5449 }, { "epoch": 9.884379959192927, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1304, "step": 5450 }, { "epoch": 9.886193606891862, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1316, "step": 5451 }, { "epoch": 9.888007254590796, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1211, "step": 5452 }, { "epoch": 9.88982090228973, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1282, "step": 5453 }, { "epoch": 9.891634549988664, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1408, "step": 5454 }, { "epoch": 9.893448197687599, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1481, "step": 5455 }, { "epoch": 9.895261845386534, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.147, "step": 5456 }, { "epoch": 9.897075493085469, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1634, "step": 5457 }, { "epoch": 9.898889140784403, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1588, "step": 5458 }, { "epoch": 9.900702788483336, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 0.2032, "step": 5459 }, { "epoch": 9.902516436182271, "grad_norm": 1.8046875, "learning_rate": 0.0002, "loss": 0.2343, "step": 5460 }, { "epoch": 9.904330083881206, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.2394, "step": 5461 }, { "epoch": 9.90614373158014, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.2441, "step": 5462 }, { "epoch": 9.907957379279075, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.257, "step": 5463 }, { "epoch": 9.90977102697801, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1244, "step": 5464 }, { "epoch": 9.911584674676945, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1466, "step": 5465 }, { "epoch": 9.913398322375878, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1043, "step": 5466 }, { "epoch": 9.915211970074813, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1019, "step": 5467 }, { "epoch": 9.917025617773747, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1245, "step": 5468 }, { "epoch": 9.918839265472682, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1245, "step": 5469 }, { "epoch": 9.920652913171617, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1078, "step": 5470 }, { "epoch": 9.92246656087055, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1126, "step": 5471 }, { "epoch": 9.924280208569485, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1211, "step": 5472 }, { "epoch": 9.92609385626842, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1084, "step": 5473 }, { "epoch": 9.927907503967354, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1087, "step": 5474 }, { "epoch": 9.929721151666289, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1031, "step": 5475 }, { "epoch": 9.931534799365224, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1246, "step": 5476 }, { "epoch": 9.933348447064159, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1212, "step": 5477 }, { "epoch": 9.935162094763092, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1055, "step": 5478 }, { "epoch": 9.936975742462026, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1083, "step": 5479 }, { "epoch": 9.938789390160961, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1062, "step": 5480 }, { "epoch": 9.940603037859896, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0969, "step": 5481 }, { "epoch": 9.94241668555883, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0866, "step": 5482 }, { "epoch": 9.944230333257766, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0975, "step": 5483 }, { "epoch": 9.946043980956699, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1285, "step": 5484 }, { "epoch": 9.947857628655633, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1145, "step": 5485 }, { "epoch": 9.949671276354568, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.113, "step": 5486 }, { "epoch": 9.951484924053503, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.107, "step": 5487 }, { "epoch": 9.953298571752438, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1041, "step": 5488 }, { "epoch": 9.955112219451372, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1268, "step": 5489 }, { "epoch": 9.956925867150305, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1183, "step": 5490 }, { "epoch": 9.95873951484924, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1164, "step": 5491 }, { "epoch": 9.960553162548175, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1991, "step": 5492 }, { "epoch": 9.96236681024711, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1124, "step": 5493 }, { "epoch": 9.964180457946044, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1053, "step": 5494 }, { "epoch": 9.96599410564498, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1199, "step": 5495 }, { "epoch": 9.967807753343912, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1155, "step": 5496 }, { "epoch": 9.969621401042847, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1215, "step": 5497 }, { "epoch": 9.971435048741782, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1253, "step": 5498 }, { "epoch": 9.973248696440717, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1353, "step": 5499 }, { "epoch": 9.975062344139651, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1382, "step": 5500 }, { "epoch": 9.976875991838586, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1347, "step": 5501 }, { "epoch": 9.978689639537519, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1215, "step": 5502 }, { "epoch": 9.980503287236454, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1329, "step": 5503 }, { "epoch": 9.982316934935389, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1322, "step": 5504 }, { "epoch": 9.984130582634323, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1295, "step": 5505 }, { "epoch": 9.985944230333258, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1638, "step": 5506 }, { "epoch": 9.987757878032193, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1533, "step": 5507 }, { "epoch": 9.989571525731126, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1619, "step": 5508 }, { "epoch": 9.99138517343006, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1574, "step": 5509 }, { "epoch": 9.993198821128995, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1995, "step": 5510 }, { "epoch": 9.99501246882793, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.2158, "step": 5511 }, { "epoch": 9.996826116526865, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.2294, "step": 5512 }, { "epoch": 9.9986397642258, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1647, "step": 5513 }, { "epoch": 10.000453411924733, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1655, "step": 5514 }, { "epoch": 10.002267059623668, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0842, "step": 5515 }, { "epoch": 10.004080707322602, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0657, "step": 5516 }, { "epoch": 10.005894355021537, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0701, "step": 5517 }, { "epoch": 10.007708002720472, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.0679, "step": 5518 }, { "epoch": 10.009521650419407, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0645, "step": 5519 }, { "epoch": 10.01133529811834, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.076, "step": 5520 }, { "epoch": 10.013148945817274, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0682, "step": 5521 }, { "epoch": 10.01496259351621, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0679, "step": 5522 }, { "epoch": 10.016776241215144, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0707, "step": 5523 }, { "epoch": 10.018589888914079, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0712, "step": 5524 }, { "epoch": 10.020403536613014, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0711, "step": 5525 }, { "epoch": 10.022217184311947, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.075, "step": 5526 }, { "epoch": 10.024030832010881, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0679, "step": 5527 }, { "epoch": 10.025844479709816, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0709, "step": 5528 }, { "epoch": 10.02765812740875, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0821, "step": 5529 }, { "epoch": 10.029471775107686, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0709, "step": 5530 }, { "epoch": 10.03128542280662, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0712, "step": 5531 }, { "epoch": 10.033099070505555, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0705, "step": 5532 }, { "epoch": 10.034912718204488, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0892, "step": 5533 }, { "epoch": 10.036726365903423, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0757, "step": 5534 }, { "epoch": 10.038540013602358, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0745, "step": 5535 }, { "epoch": 10.040353661301292, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0739, "step": 5536 }, { "epoch": 10.042167309000227, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0698, "step": 5537 }, { "epoch": 10.043980956699162, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0734, "step": 5538 }, { "epoch": 10.045794604398095, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0823, "step": 5539 }, { "epoch": 10.04760825209703, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0689, "step": 5540 }, { "epoch": 10.049421899795965, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.075, "step": 5541 }, { "epoch": 10.0512355474949, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0769, "step": 5542 }, { "epoch": 10.053049195193834, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0866, "step": 5543 }, { "epoch": 10.054862842892769, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.089, "step": 5544 }, { "epoch": 10.056676490591702, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1035, "step": 5545 }, { "epoch": 10.058490138290637, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1041, "step": 5546 }, { "epoch": 10.060303785989571, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0888, "step": 5547 }, { "epoch": 10.062117433688506, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0839, "step": 5548 }, { "epoch": 10.063931081387441, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0852, "step": 5549 }, { "epoch": 10.065744729086376, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0824, "step": 5550 }, { "epoch": 10.067558376785309, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0944, "step": 5551 }, { "epoch": 10.069372024484244, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0897, "step": 5552 }, { "epoch": 10.071185672183178, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0974, "step": 5553 }, { "epoch": 10.072999319882113, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0978, "step": 5554 }, { "epoch": 10.074812967581048, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0998, "step": 5555 }, { "epoch": 10.076626615279983, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1333, "step": 5556 }, { "epoch": 10.078440262978916, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1341, "step": 5557 }, { "epoch": 10.08025391067785, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1483, "step": 5558 }, { "epoch": 10.082067558376785, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1426, "step": 5559 }, { "epoch": 10.08388120607572, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.144, "step": 5560 }, { "epoch": 10.085694853774655, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1565, "step": 5561 }, { "epoch": 10.08750850147359, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1591, "step": 5562 }, { "epoch": 10.089322149172522, "grad_norm": 1.5234375, "learning_rate": 0.0002, "loss": 0.2245, "step": 5563 }, { "epoch": 10.091135796871457, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.2326, "step": 5564 }, { "epoch": 10.092949444570392, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0896, "step": 5565 }, { "epoch": 10.094763092269327, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0812, "step": 5566 }, { "epoch": 10.096576739968262, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0925, "step": 5567 }, { "epoch": 10.098390387667196, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0903, "step": 5568 }, { "epoch": 10.10020403536613, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0878, "step": 5569 }, { "epoch": 10.102017683065064, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0784, "step": 5570 }, { "epoch": 10.103831330763999, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0998, "step": 5571 }, { "epoch": 10.105644978462934, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.071, "step": 5572 }, { "epoch": 10.107458626161868, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.073, "step": 5573 }, { "epoch": 10.109272273860803, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0719, "step": 5574 }, { "epoch": 10.111085921559736, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.066, "step": 5575 }, { "epoch": 10.112899569258671, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0929, "step": 5576 }, { "epoch": 10.114713216957606, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0779, "step": 5577 }, { "epoch": 10.11652686465654, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0787, "step": 5578 }, { "epoch": 10.118340512355475, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0795, "step": 5579 }, { "epoch": 10.12015416005441, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0857, "step": 5580 }, { "epoch": 10.121967807753343, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0782, "step": 5581 }, { "epoch": 10.123781455452278, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0734, "step": 5582 }, { "epoch": 10.125595103151213, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0849, "step": 5583 }, { "epoch": 10.127408750850147, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0997, "step": 5584 }, { "epoch": 10.129222398549082, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0852, "step": 5585 }, { "epoch": 10.131036046248017, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0958, "step": 5586 }, { "epoch": 10.132849693946952, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0708, "step": 5587 }, { "epoch": 10.134663341645885, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0756, "step": 5588 }, { "epoch": 10.13647698934482, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1002, "step": 5589 }, { "epoch": 10.138290637043754, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0708, "step": 5590 }, { "epoch": 10.140104284742689, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0726, "step": 5591 }, { "epoch": 10.141917932441624, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0903, "step": 5592 }, { "epoch": 10.143731580140559, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1028, "step": 5593 }, { "epoch": 10.145545227839492, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0866, "step": 5594 }, { "epoch": 10.147358875538426, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.102, "step": 5595 }, { "epoch": 10.149172523237361, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0831, "step": 5596 }, { "epoch": 10.150986170936296, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1, "step": 5597 }, { "epoch": 10.15279981863523, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0834, "step": 5598 }, { "epoch": 10.154613466334165, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1099, "step": 5599 }, { "epoch": 10.156427114033098, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0931, "step": 5600 }, { "epoch": 10.158240761732033, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1067, "step": 5601 }, { "epoch": 10.160054409430968, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1267, "step": 5602 }, { "epoch": 10.161868057129903, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1114, "step": 5603 }, { "epoch": 10.163681704828837, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.1143, "step": 5604 }, { "epoch": 10.165495352527772, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1403, "step": 5605 }, { "epoch": 10.167309000226705, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1144, "step": 5606 }, { "epoch": 10.16912264792564, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1267, "step": 5607 }, { "epoch": 10.170936295624575, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.1503, "step": 5608 }, { "epoch": 10.17274994332351, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.1714, "step": 5609 }, { "epoch": 10.174563591022444, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1359, "step": 5610 }, { "epoch": 10.174563591022444, "eval_loss": 2.228809356689453, "eval_runtime": 185.3682, "eval_samples_per_second": 5.395, "eval_steps_per_second": 5.395, "step": 5610 }, { "epoch": 10.174563591022444, "mmlu_eval_accuracy": 0.2983685955770338, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.15789473684210525, "mmlu_loss": 2.7170363353951004, "step": 5610 }, { "epoch": 10.17637723872138, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.1789, "step": 5611 }, { "epoch": 10.178190886420312, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1475, "step": 5612 }, { "epoch": 10.180004534119247, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.2365, "step": 5613 }, { "epoch": 10.181818181818182, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.2451, "step": 5614 }, { "epoch": 10.183631829517116, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0909, "step": 5615 }, { "epoch": 10.185445477216051, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1171, "step": 5616 }, { "epoch": 10.187259124914986, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0796, "step": 5617 }, { "epoch": 10.189072772613919, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0788, "step": 5618 }, { "epoch": 10.190886420312854, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0727, "step": 5619 }, { "epoch": 10.192700068011789, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0799, "step": 5620 }, { "epoch": 10.194513715710723, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0811, "step": 5621 }, { "epoch": 10.196327363409658, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0844, "step": 5622 }, { "epoch": 10.198141011108593, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1008, "step": 5623 }, { "epoch": 10.199954658807526, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0809, "step": 5624 }, { "epoch": 10.20176830650646, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0808, "step": 5625 }, { "epoch": 10.203581954205395, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0844, "step": 5626 }, { "epoch": 10.20539560190433, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0817, "step": 5627 }, { "epoch": 10.207209249603265, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0852, "step": 5628 }, { "epoch": 10.2090228973022, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0812, "step": 5629 }, { "epoch": 10.210836545001133, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.072, "step": 5630 }, { "epoch": 10.212650192700067, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0722, "step": 5631 }, { "epoch": 10.214463840399002, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0734, "step": 5632 }, { "epoch": 10.216277488097937, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0833, "step": 5633 }, { "epoch": 10.218091135796872, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0804, "step": 5634 }, { "epoch": 10.219904783495807, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0733, "step": 5635 }, { "epoch": 10.22171843119474, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0655, "step": 5636 }, { "epoch": 10.223532078893674, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0851, "step": 5637 }, { "epoch": 10.225345726592609, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0945, "step": 5638 }, { "epoch": 10.227159374291544, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0779, "step": 5639 }, { "epoch": 10.228973021990479, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0928, "step": 5640 }, { "epoch": 10.230786669689413, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0907, "step": 5641 }, { "epoch": 10.232600317388348, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0936, "step": 5642 }, { "epoch": 10.234413965087281, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0849, "step": 5643 }, { "epoch": 10.236227612786216, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0819, "step": 5644 }, { "epoch": 10.23804126048515, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0985, "step": 5645 }, { "epoch": 10.239854908184086, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.111, "step": 5646 }, { "epoch": 10.24166855588302, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0888, "step": 5647 }, { "epoch": 10.243482203581955, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0982, "step": 5648 }, { "epoch": 10.245295851280888, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1209, "step": 5649 }, { "epoch": 10.247109498979823, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1081, "step": 5650 }, { "epoch": 10.248923146678758, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1153, "step": 5651 }, { "epoch": 10.250736794377692, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1147, "step": 5652 }, { "epoch": 10.252550442076627, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1068, "step": 5653 }, { "epoch": 10.254364089775562, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1169, "step": 5654 }, { "epoch": 10.256177737474495, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1173, "step": 5655 }, { "epoch": 10.25799138517343, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1065, "step": 5656 }, { "epoch": 10.259805032872364, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1366, "step": 5657 }, { "epoch": 10.2616186805713, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1346, "step": 5658 }, { "epoch": 10.263432328270234, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1452, "step": 5659 }, { "epoch": 10.265245975969169, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.157, "step": 5660 }, { "epoch": 10.267059623668102, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.162, "step": 5661 }, { "epoch": 10.268873271367037, "grad_norm": 1.4296875, "learning_rate": 0.0002, "loss": 0.1942, "step": 5662 }, { "epoch": 10.270686919065971, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1777, "step": 5663 }, { "epoch": 10.272500566764906, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2156, "step": 5664 }, { "epoch": 10.27431421446384, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0931, "step": 5665 }, { "epoch": 10.276127862162776, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0894, "step": 5666 }, { "epoch": 10.277941509861709, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0865, "step": 5667 }, { "epoch": 10.279755157560643, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0864, "step": 5668 }, { "epoch": 10.281568805259578, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0838, "step": 5669 }, { "epoch": 10.283382452958513, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0744, "step": 5670 }, { "epoch": 10.285196100657448, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1024, "step": 5671 }, { "epoch": 10.287009748356382, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0851, "step": 5672 }, { "epoch": 10.288823396055315, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0911, "step": 5673 }, { "epoch": 10.29063704375425, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0932, "step": 5674 }, { "epoch": 10.292450691453185, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0886, "step": 5675 }, { "epoch": 10.29426433915212, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0826, "step": 5676 }, { "epoch": 10.296077986851055, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0808, "step": 5677 }, { "epoch": 10.29789163454999, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1028, "step": 5678 }, { "epoch": 10.299705282248922, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.079, "step": 5679 }, { "epoch": 10.301518929947857, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0876, "step": 5680 }, { "epoch": 10.303332577646792, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0827, "step": 5681 }, { "epoch": 10.305146225345727, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.085, "step": 5682 }, { "epoch": 10.306959873044661, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0874, "step": 5683 }, { "epoch": 10.308773520743596, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0778, "step": 5684 }, { "epoch": 10.31058716844253, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0809, "step": 5685 }, { "epoch": 10.312400816141464, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0919, "step": 5686 }, { "epoch": 10.314214463840399, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0994, "step": 5687 }, { "epoch": 10.316028111539334, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0892, "step": 5688 }, { "epoch": 10.317841759238268, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0777, "step": 5689 }, { "epoch": 10.319655406937203, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.073, "step": 5690 }, { "epoch": 10.321469054636136, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1117, "step": 5691 }, { "epoch": 10.32328270233507, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0906, "step": 5692 }, { "epoch": 10.325096350034006, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.094, "step": 5693 }, { "epoch": 10.32690999773294, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0957, "step": 5694 }, { "epoch": 10.328723645431875, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0833, "step": 5695 }, { "epoch": 10.33053729313081, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1101, "step": 5696 }, { "epoch": 10.332350940829745, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1049, "step": 5697 }, { "epoch": 10.334164588528678, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0885, "step": 5698 }, { "epoch": 10.335978236227612, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0861, "step": 5699 }, { "epoch": 10.337791883926547, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1003, "step": 5700 }, { "epoch": 10.339605531625482, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0907, "step": 5701 }, { "epoch": 10.341419179324417, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1074, "step": 5702 }, { "epoch": 10.343232827023352, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1042, "step": 5703 }, { "epoch": 10.345046474722285, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1178, "step": 5704 }, { "epoch": 10.34686012242122, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1052, "step": 5705 }, { "epoch": 10.348673770120154, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1402, "step": 5706 }, { "epoch": 10.350487417819089, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1434, "step": 5707 }, { "epoch": 10.352301065518024, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1203, "step": 5708 }, { "epoch": 10.354114713216958, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.1404, "step": 5709 }, { "epoch": 10.355928360915891, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1409, "step": 5710 }, { "epoch": 10.357742008614826, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1635, "step": 5711 }, { "epoch": 10.359555656313761, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1775, "step": 5712 }, { "epoch": 10.361369304012696, "grad_norm": 1.203125, "learning_rate": 0.0002, "loss": 0.2456, "step": 5713 }, { "epoch": 10.36318295171163, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.3025, "step": 5714 }, { "epoch": 10.364996599410565, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1014, "step": 5715 }, { "epoch": 10.366810247109498, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0891, "step": 5716 }, { "epoch": 10.368623894808433, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.089, "step": 5717 }, { "epoch": 10.370437542507368, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0898, "step": 5718 }, { "epoch": 10.372251190206303, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0923, "step": 5719 }, { "epoch": 10.374064837905237, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0816, "step": 5720 }, { "epoch": 10.375878485604172, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0922, "step": 5721 }, { "epoch": 10.377692133303105, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0805, "step": 5722 }, { "epoch": 10.37950578100204, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0917, "step": 5723 }, { "epoch": 10.381319428700975, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.092, "step": 5724 }, { "epoch": 10.38313307639991, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0981, "step": 5725 }, { "epoch": 10.384946724098844, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0988, "step": 5726 }, { "epoch": 10.386760371797779, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0956, "step": 5727 }, { "epoch": 10.388574019496712, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0752, "step": 5728 }, { "epoch": 10.390387667195647, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0918, "step": 5729 }, { "epoch": 10.392201314894582, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0881, "step": 5730 }, { "epoch": 10.394014962593516, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.089, "step": 5731 }, { "epoch": 10.395828610292451, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.082, "step": 5732 }, { "epoch": 10.397642257991386, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0873, "step": 5733 }, { "epoch": 10.399455905690319, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0895, "step": 5734 }, { "epoch": 10.401269553389254, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0952, "step": 5735 }, { "epoch": 10.403083201088188, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1005, "step": 5736 }, { "epoch": 10.404896848787123, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0878, "step": 5737 }, { "epoch": 10.406710496486058, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0794, "step": 5738 }, { "epoch": 10.408524144184993, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0819, "step": 5739 }, { "epoch": 10.410337791883926, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0994, "step": 5740 }, { "epoch": 10.41215143958286, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1005, "step": 5741 }, { "epoch": 10.413965087281795, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0902, "step": 5742 }, { "epoch": 10.41577873498073, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0872, "step": 5743 }, { "epoch": 10.417592382679665, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0939, "step": 5744 }, { "epoch": 10.4194060303786, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0875, "step": 5745 }, { "epoch": 10.421219678077534, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1041, "step": 5746 }, { "epoch": 10.423033325776467, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0915, "step": 5747 }, { "epoch": 10.424846973475402, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1225, "step": 5748 }, { "epoch": 10.426660621174337, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0902, "step": 5749 }, { "epoch": 10.428474268873272, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1198, "step": 5750 }, { "epoch": 10.430287916572206, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1081, "step": 5751 }, { "epoch": 10.43210156427114, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1157, "step": 5752 }, { "epoch": 10.433915211970074, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1329, "step": 5753 }, { "epoch": 10.435728859669009, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1257, "step": 5754 }, { "epoch": 10.437542507367944, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1177, "step": 5755 }, { "epoch": 10.439356155066879, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1563, "step": 5756 }, { "epoch": 10.441169802765813, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1338, "step": 5757 }, { "epoch": 10.442983450464748, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1434, "step": 5758 }, { "epoch": 10.444797098163681, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1275, "step": 5759 }, { "epoch": 10.446610745862616, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1472, "step": 5760 }, { "epoch": 10.44842439356155, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1788, "step": 5761 }, { "epoch": 10.450238041260485, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.1981, "step": 5762 }, { "epoch": 10.45205168895942, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.2298, "step": 5763 }, { "epoch": 10.453865336658355, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.2856, "step": 5764 }, { "epoch": 10.455678984357288, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0931, "step": 5765 }, { "epoch": 10.457492632056223, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0971, "step": 5766 }, { "epoch": 10.459306279755157, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.096, "step": 5767 }, { "epoch": 10.461119927454092, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0925, "step": 5768 }, { "epoch": 10.462933575153027, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1134, "step": 5769 }, { "epoch": 10.464747222851962, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0994, "step": 5770 }, { "epoch": 10.466560870550895, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0951, "step": 5771 }, { "epoch": 10.46837451824983, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0956, "step": 5772 }, { "epoch": 10.470188165948764, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0833, "step": 5773 }, { "epoch": 10.472001813647699, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0959, "step": 5774 }, { "epoch": 10.473815461346634, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0963, "step": 5775 }, { "epoch": 10.475629109045569, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0827, "step": 5776 }, { "epoch": 10.477442756744502, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1458, "step": 5777 }, { "epoch": 10.479256404443436, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.08, "step": 5778 }, { "epoch": 10.481070052142371, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0843, "step": 5779 }, { "epoch": 10.482883699841306, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0766, "step": 5780 }, { "epoch": 10.48469734754024, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0888, "step": 5781 }, { "epoch": 10.486510995239176, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.084, "step": 5782 }, { "epoch": 10.488324642938109, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0836, "step": 5783 }, { "epoch": 10.490138290637043, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0974, "step": 5784 }, { "epoch": 10.491951938335978, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0777, "step": 5785 }, { "epoch": 10.493765586034913, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0971, "step": 5786 }, { "epoch": 10.495579233733848, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0972, "step": 5787 }, { "epoch": 10.497392881432782, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0907, "step": 5788 }, { "epoch": 10.499206529131715, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0995, "step": 5789 }, { "epoch": 10.50102017683065, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1013, "step": 5790 }, { "epoch": 10.502833824529585, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0967, "step": 5791 }, { "epoch": 10.50464747222852, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0886, "step": 5792 }, { "epoch": 10.506461119927454, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0929, "step": 5793 }, { "epoch": 10.50827476762639, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1013, "step": 5794 }, { "epoch": 10.510088415325322, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1084, "step": 5795 }, { "epoch": 10.511902063024257, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1016, "step": 5796 }, { "epoch": 10.513715710723192, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0988, "step": 5797 }, { "epoch": 10.513715710723192, "eval_loss": 2.2109897136688232, "eval_runtime": 193.0694, "eval_samples_per_second": 5.179, "eval_steps_per_second": 5.179, "step": 5797 }, { "epoch": 10.513715710723192, "mmlu_eval_accuracy": 0.3115067566025124, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.53125, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.18181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.4782608695652174, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.41935483870967744, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.2222222222222222, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.1460641602688098, "step": 5797 }, { "epoch": 10.515529358422127, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1066, "step": 5798 }, { "epoch": 10.517343006121061, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.096, "step": 5799 }, { "epoch": 10.519156653819996, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1061, "step": 5800 }, { "epoch": 10.520970301518929, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0999, "step": 5801 }, { "epoch": 10.522783949217864, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1116, "step": 5802 }, { "epoch": 10.524597596916799, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1334, "step": 5803 }, { "epoch": 10.526411244615733, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1319, "step": 5804 }, { "epoch": 10.528224892314668, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1341, "step": 5805 }, { "epoch": 10.530038540013603, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1372, "step": 5806 }, { "epoch": 10.531852187712538, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1702, "step": 5807 }, { "epoch": 10.53366583541147, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1478, "step": 5808 }, { "epoch": 10.535479483110405, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.2009, "step": 5809 }, { "epoch": 10.53729313080934, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1692, "step": 5810 }, { "epoch": 10.539106778508275, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1746, "step": 5811 }, { "epoch": 10.54092042620721, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1857, "step": 5812 }, { "epoch": 10.542734073906143, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2667, "step": 5813 }, { "epoch": 10.544547721605078, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.3016, "step": 5814 }, { "epoch": 10.546361369304012, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1165, "step": 5815 }, { "epoch": 10.548175017002947, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0922, "step": 5816 }, { "epoch": 10.549988664701882, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.103, "step": 5817 }, { "epoch": 10.551802312400817, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0966, "step": 5818 }, { "epoch": 10.553615960099751, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.089, "step": 5819 }, { "epoch": 10.555429607798684, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0815, "step": 5820 }, { "epoch": 10.55724325549762, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0993, "step": 5821 }, { "epoch": 10.559056903196554, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0898, "step": 5822 }, { "epoch": 10.560870550895489, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0878, "step": 5823 }, { "epoch": 10.562684198594424, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1023, "step": 5824 }, { "epoch": 10.564497846293358, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0938, "step": 5825 }, { "epoch": 10.566311493992291, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0963, "step": 5826 }, { "epoch": 10.568125141691226, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0868, "step": 5827 }, { "epoch": 10.56993878939016, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0907, "step": 5828 }, { "epoch": 10.571752437089096, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1068, "step": 5829 }, { "epoch": 10.57356608478803, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0974, "step": 5830 }, { "epoch": 10.575379732486965, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0883, "step": 5831 }, { "epoch": 10.577193380185898, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1014, "step": 5832 }, { "epoch": 10.579007027884833, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0916, "step": 5833 }, { "epoch": 10.580820675583768, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0879, "step": 5834 }, { "epoch": 10.582634323282702, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0845, "step": 5835 }, { "epoch": 10.584447970981637, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0871, "step": 5836 }, { "epoch": 10.586261618680572, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0854, "step": 5837 }, { "epoch": 10.588075266379505, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0946, "step": 5838 }, { "epoch": 10.58988891407844, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0767, "step": 5839 }, { "epoch": 10.591702561777375, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0782, "step": 5840 }, { "epoch": 10.59351620947631, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.0966, "step": 5841 }, { "epoch": 10.595329857175244, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0913, "step": 5842 }, { "epoch": 10.597143504874179, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0951, "step": 5843 }, { "epoch": 10.598957152573112, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1002, "step": 5844 }, { "epoch": 10.600770800272047, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0931, "step": 5845 }, { "epoch": 10.602584447970981, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1179, "step": 5846 }, { "epoch": 10.604398095669916, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1115, "step": 5847 }, { "epoch": 10.606211743368851, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1083, "step": 5848 }, { "epoch": 10.608025391067786, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1024, "step": 5849 }, { "epoch": 10.609839038766719, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1114, "step": 5850 }, { "epoch": 10.611652686465654, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1126, "step": 5851 }, { "epoch": 10.613466334164588, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1186, "step": 5852 }, { "epoch": 10.615279981863523, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1281, "step": 5853 }, { "epoch": 10.617093629562458, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1199, "step": 5854 }, { "epoch": 10.618907277261393, "grad_norm": 1.1015625, "learning_rate": 0.0002, "loss": 0.149, "step": 5855 }, { "epoch": 10.620720924960327, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1512, "step": 5856 }, { "epoch": 10.62253457265926, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1296, "step": 5857 }, { "epoch": 10.624348220358195, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1368, "step": 5858 }, { "epoch": 10.62616186805713, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.14, "step": 5859 }, { "epoch": 10.627975515756065, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1742, "step": 5860 }, { "epoch": 10.629789163455, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1684, "step": 5861 }, { "epoch": 10.631602811153932, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1966, "step": 5862 }, { "epoch": 10.633416458852867, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.2215, "step": 5863 }, { "epoch": 10.635230106551802, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.2622, "step": 5864 }, { "epoch": 10.637043754250737, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0851, "step": 5865 }, { "epoch": 10.638857401949672, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1087, "step": 5866 }, { "epoch": 10.640671049648606, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.095, "step": 5867 }, { "epoch": 10.642484697347541, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0865, "step": 5868 }, { "epoch": 10.644298345046474, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0874, "step": 5869 }, { "epoch": 10.646111992745409, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0986, "step": 5870 }, { "epoch": 10.647925640444344, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0821, "step": 5871 }, { "epoch": 10.649739288143278, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1019, "step": 5872 }, { "epoch": 10.651552935842213, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0928, "step": 5873 }, { "epoch": 10.653366583541148, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0996, "step": 5874 }, { "epoch": 10.655180231240081, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0932, "step": 5875 }, { "epoch": 10.656993878939016, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1015, "step": 5876 }, { "epoch": 10.65880752663795, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0988, "step": 5877 }, { "epoch": 10.660621174336885, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0879, "step": 5878 }, { "epoch": 10.66243482203582, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1028, "step": 5879 }, { "epoch": 10.664248469734755, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0941, "step": 5880 }, { "epoch": 10.666062117433688, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1, "step": 5881 }, { "epoch": 10.667875765132623, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1009, "step": 5882 }, { "epoch": 10.669689412831557, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.113, "step": 5883 }, { "epoch": 10.671503060530492, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0921, "step": 5884 }, { "epoch": 10.673316708229427, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0888, "step": 5885 }, { "epoch": 10.675130355928362, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0843, "step": 5886 }, { "epoch": 10.676944003627295, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0891, "step": 5887 }, { "epoch": 10.67875765132623, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1034, "step": 5888 }, { "epoch": 10.680571299025164, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0842, "step": 5889 }, { "epoch": 10.682384946724099, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0845, "step": 5890 }, { "epoch": 10.684198594423034, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0909, "step": 5891 }, { "epoch": 10.686012242121969, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0941, "step": 5892 }, { "epoch": 10.687825889820902, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.0973, "step": 5893 }, { "epoch": 10.689639537519836, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0998, "step": 5894 }, { "epoch": 10.691453185218771, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1084, "step": 5895 }, { "epoch": 10.693266832917706, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0966, "step": 5896 }, { "epoch": 10.69508048061664, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1144, "step": 5897 }, { "epoch": 10.696894128315575, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1132, "step": 5898 }, { "epoch": 10.698707776014508, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1221, "step": 5899 }, { "epoch": 10.700521423713443, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1165, "step": 5900 }, { "epoch": 10.702335071412378, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1111, "step": 5901 }, { "epoch": 10.704148719111313, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1177, "step": 5902 }, { "epoch": 10.705962366810247, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1325, "step": 5903 }, { "epoch": 10.707776014509182, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1276, "step": 5904 }, { "epoch": 10.709589662208115, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1211, "step": 5905 }, { "epoch": 10.71140330990705, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1362, "step": 5906 }, { "epoch": 10.713216957605985, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1381, "step": 5907 }, { "epoch": 10.71503060530492, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.143, "step": 5908 }, { "epoch": 10.716844253003854, "grad_norm": 1.1640625, "learning_rate": 0.0002, "loss": 0.1831, "step": 5909 }, { "epoch": 10.71865790070279, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1487, "step": 5910 }, { "epoch": 10.720471548401722, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.1841, "step": 5911 }, { "epoch": 10.722285196100657, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1553, "step": 5912 }, { "epoch": 10.724098843799592, "grad_norm": 1.3984375, "learning_rate": 0.0002, "loss": 0.2037, "step": 5913 }, { "epoch": 10.725912491498526, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.259, "step": 5914 }, { "epoch": 10.727726139197461, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1193, "step": 5915 }, { "epoch": 10.729539786896396, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0989, "step": 5916 }, { "epoch": 10.73135343459533, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1085, "step": 5917 }, { "epoch": 10.733167082294264, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1088, "step": 5918 }, { "epoch": 10.734980729993199, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1121, "step": 5919 }, { "epoch": 10.736794377692133, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0998, "step": 5920 }, { "epoch": 10.738608025391068, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.09, "step": 5921 }, { "epoch": 10.740421673090003, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1036, "step": 5922 }, { "epoch": 10.742235320788936, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0749, "step": 5923 }, { "epoch": 10.74404896848787, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1139, "step": 5924 }, { "epoch": 10.745862616186805, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1194, "step": 5925 }, { "epoch": 10.74767626388574, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1048, "step": 5926 }, { "epoch": 10.749489911584675, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1137, "step": 5927 }, { "epoch": 10.75130355928361, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0808, "step": 5928 }, { "epoch": 10.753117206982544, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0907, "step": 5929 }, { "epoch": 10.754930854681477, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0966, "step": 5930 }, { "epoch": 10.756744502380412, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0872, "step": 5931 }, { "epoch": 10.758558150079347, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1073, "step": 5932 }, { "epoch": 10.760371797778282, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0919, "step": 5933 }, { "epoch": 10.762185445477217, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1002, "step": 5934 }, { "epoch": 10.763999093176151, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0934, "step": 5935 }, { "epoch": 10.765812740875084, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0932, "step": 5936 }, { "epoch": 10.767626388574019, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1013, "step": 5937 }, { "epoch": 10.769440036272954, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1, "step": 5938 }, { "epoch": 10.771253683971889, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0946, "step": 5939 }, { "epoch": 10.773067331670823, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1074, "step": 5940 }, { "epoch": 10.774880979369758, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1019, "step": 5941 }, { "epoch": 10.776694627068691, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1397, "step": 5942 }, { "epoch": 10.778508274767626, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1335, "step": 5943 }, { "epoch": 10.78032192246656, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0968, "step": 5944 }, { "epoch": 10.782135570165496, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1224, "step": 5945 }, { "epoch": 10.78394921786443, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1108, "step": 5946 }, { "epoch": 10.785762865563365, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1128, "step": 5947 }, { "epoch": 10.787576513262298, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1126, "step": 5948 }, { "epoch": 10.789390160961233, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0999, "step": 5949 }, { "epoch": 10.791203808660168, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1414, "step": 5950 }, { "epoch": 10.793017456359102, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1152, "step": 5951 }, { "epoch": 10.794831104058037, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1309, "step": 5952 }, { "epoch": 10.796644751756972, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1205, "step": 5953 }, { "epoch": 10.798458399455905, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1436, "step": 5954 }, { "epoch": 10.80027204715484, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1185, "step": 5955 }, { "epoch": 10.802085694853774, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1246, "step": 5956 }, { "epoch": 10.80389934255271, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1551, "step": 5957 }, { "epoch": 10.805712990251644, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1564, "step": 5958 }, { "epoch": 10.807526637950579, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.153, "step": 5959 }, { "epoch": 10.809340285649512, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1468, "step": 5960 }, { "epoch": 10.811153933348447, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1986, "step": 5961 }, { "epoch": 10.812967581047381, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1728, "step": 5962 }, { "epoch": 10.814781228746316, "grad_norm": 1.34375, "learning_rate": 0.0002, "loss": 0.2402, "step": 5963 }, { "epoch": 10.81659487644525, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.2581, "step": 5964 }, { "epoch": 10.818408524144186, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1087, "step": 5965 }, { "epoch": 10.82022217184312, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0871, "step": 5966 }, { "epoch": 10.822035819542053, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1233, "step": 5967 }, { "epoch": 10.823849467240988, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1205, "step": 5968 }, { "epoch": 10.825663114939923, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0878, "step": 5969 }, { "epoch": 10.827476762638858, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0966, "step": 5970 }, { "epoch": 10.829290410337792, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0873, "step": 5971 }, { "epoch": 10.831104058036725, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1051, "step": 5972 }, { "epoch": 10.83291770573566, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1072, "step": 5973 }, { "epoch": 10.834731353434595, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1076, "step": 5974 }, { "epoch": 10.83654500113353, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0955, "step": 5975 }, { "epoch": 10.838358648832465, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1099, "step": 5976 }, { "epoch": 10.8401722965314, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1074, "step": 5977 }, { "epoch": 10.841985944230334, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0803, "step": 5978 }, { "epoch": 10.843799591929267, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1077, "step": 5979 }, { "epoch": 10.845613239628202, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1042, "step": 5980 }, { "epoch": 10.847426887327137, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0979, "step": 5981 }, { "epoch": 10.849240535026071, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0974, "step": 5982 }, { "epoch": 10.851054182725006, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1002, "step": 5983 }, { "epoch": 10.85286783042394, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0964, "step": 5984 }, { "epoch": 10.85286783042394, "eval_loss": 2.2737178802490234, "eval_runtime": 185.3052, "eval_samples_per_second": 5.397, "eval_steps_per_second": 5.397, "step": 5984 }, { "epoch": 10.85286783042394, "mmlu_eval_accuracy": 0.30872361451614183, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.3170731707317073, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.53125, "mmlu_eval_accuracy_high_school_chemistry": 0.36363636363636365, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.23529411764705882, "mmlu_eval_accuracy_high_school_psychology": 0.26666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.38461538461538464, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.29411764705882354, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.130901400518137, "step": 5984 }, { "epoch": 10.854681478122874, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0989, "step": 5985 }, { "epoch": 10.856495125821809, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1114, "step": 5986 }, { "epoch": 10.858308773520744, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1102, "step": 5987 }, { "epoch": 10.860122421219678, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0988, "step": 5988 }, { "epoch": 10.861936068918613, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1102, "step": 5989 }, { "epoch": 10.863749716617548, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1329, "step": 5990 }, { "epoch": 10.86556336431648, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1313, "step": 5991 }, { "epoch": 10.867377012015416, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1116, "step": 5992 }, { "epoch": 10.86919065971435, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1105, "step": 5993 }, { "epoch": 10.871004307413285, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0935, "step": 5994 }, { "epoch": 10.87281795511222, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1052, "step": 5995 }, { "epoch": 10.874631602811155, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0992, "step": 5996 }, { "epoch": 10.876445250510088, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1177, "step": 5997 }, { "epoch": 10.878258898209022, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1292, "step": 5998 }, { "epoch": 10.880072545907957, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.095, "step": 5999 }, { "epoch": 10.881886193606892, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1072, "step": 6000 }, { "epoch": 10.883699841305827, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1116, "step": 6001 }, { "epoch": 10.885513489004762, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1227, "step": 6002 }, { "epoch": 10.887327136703695, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1355, "step": 6003 }, { "epoch": 10.88914078440263, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1707, "step": 6004 }, { "epoch": 10.890954432101564, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1253, "step": 6005 }, { "epoch": 10.892768079800499, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1343, "step": 6006 }, { "epoch": 10.894581727499434, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1532, "step": 6007 }, { "epoch": 10.896395375198368, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1565, "step": 6008 }, { "epoch": 10.898209022897301, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1682, "step": 6009 }, { "epoch": 10.900022670596236, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1733, "step": 6010 }, { "epoch": 10.901836318295171, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1517, "step": 6011 }, { "epoch": 10.903649965994106, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1973, "step": 6012 }, { "epoch": 10.90546361369304, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.258, "step": 6013 }, { "epoch": 10.907277261391975, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.2162, "step": 6014 }, { "epoch": 10.909090909090908, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.1052, "step": 6015 }, { "epoch": 10.910904556789843, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1173, "step": 6016 }, { "epoch": 10.912718204488778, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1052, "step": 6017 }, { "epoch": 10.914531852187713, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.098, "step": 6018 }, { "epoch": 10.916345499886647, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1036, "step": 6019 }, { "epoch": 10.918159147585582, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1041, "step": 6020 }, { "epoch": 10.919972795284515, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1074, "step": 6021 }, { "epoch": 10.92178644298345, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1028, "step": 6022 }, { "epoch": 10.923600090682385, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1028, "step": 6023 }, { "epoch": 10.92541373838132, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.086, "step": 6024 }, { "epoch": 10.927227386080254, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1143, "step": 6025 }, { "epoch": 10.929041033779189, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1008, "step": 6026 }, { "epoch": 10.930854681478124, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1024, "step": 6027 }, { "epoch": 10.932668329177057, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0963, "step": 6028 }, { "epoch": 10.934481976875992, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0977, "step": 6029 }, { "epoch": 10.936295624574926, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0923, "step": 6030 }, { "epoch": 10.938109272273861, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0973, "step": 6031 }, { "epoch": 10.939922919972796, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0965, "step": 6032 }, { "epoch": 10.941736567671729, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0936, "step": 6033 }, { "epoch": 10.943550215370664, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1072, "step": 6034 }, { "epoch": 10.945363863069598, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1031, "step": 6035 }, { "epoch": 10.947177510768533, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1064, "step": 6036 }, { "epoch": 10.948991158467468, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0985, "step": 6037 }, { "epoch": 10.950804806166403, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1138, "step": 6038 }, { "epoch": 10.952618453865338, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1018, "step": 6039 }, { "epoch": 10.95443210156427, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1095, "step": 6040 }, { "epoch": 10.956245749263205, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1004, "step": 6041 }, { "epoch": 10.95805939696214, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1043, "step": 6042 }, { "epoch": 10.959873044661075, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.116, "step": 6043 }, { "epoch": 10.96168669236001, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0931, "step": 6044 }, { "epoch": 10.963500340058944, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.102, "step": 6045 }, { "epoch": 10.965313987757877, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1015, "step": 6046 }, { "epoch": 10.967127635456812, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1097, "step": 6047 }, { "epoch": 10.968941283155747, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0919, "step": 6048 }, { "epoch": 10.970754930854682, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1104, "step": 6049 }, { "epoch": 10.972568578553616, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1119, "step": 6050 }, { "epoch": 10.974382226252551, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1189, "step": 6051 }, { "epoch": 10.976195873951484, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1326, "step": 6052 }, { "epoch": 10.978009521650419, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1454, "step": 6053 }, { "epoch": 10.979823169349354, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.126, "step": 6054 }, { "epoch": 10.981636817048289, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1287, "step": 6055 }, { "epoch": 10.983450464747223, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1503, "step": 6056 }, { "epoch": 10.985264112446158, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1261, "step": 6057 }, { "epoch": 10.987077760145091, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1516, "step": 6058 }, { "epoch": 10.988891407844026, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1482, "step": 6059 }, { "epoch": 10.99070505554296, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.143, "step": 6060 }, { "epoch": 10.992518703241895, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1424, "step": 6061 }, { "epoch": 10.99433235094083, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1682, "step": 6062 }, { "epoch": 10.996145998639765, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.1787, "step": 6063 }, { "epoch": 10.997959646338698, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.259, "step": 6064 }, { "epoch": 10.999773294037633, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1182, "step": 6065 }, { "epoch": 11.001586941736567, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0809, "step": 6066 }, { "epoch": 11.003400589435502, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0647, "step": 6067 }, { "epoch": 11.005214237134437, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0758, "step": 6068 }, { "epoch": 11.007027884833372, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0691, "step": 6069 }, { "epoch": 11.008841532532305, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.0609, "step": 6070 }, { "epoch": 11.01065518023124, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.0682, "step": 6071 }, { "epoch": 11.012468827930174, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0652, "step": 6072 }, { "epoch": 11.01428247562911, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.073, "step": 6073 }, { "epoch": 11.016096123328044, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0665, "step": 6074 }, { "epoch": 11.017909771026979, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0568, "step": 6075 }, { "epoch": 11.019723418725912, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.069, "step": 6076 }, { "epoch": 11.021537066424846, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.065, "step": 6077 }, { "epoch": 11.023350714123781, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0672, "step": 6078 }, { "epoch": 11.025164361822716, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0678, "step": 6079 }, { "epoch": 11.02697800952165, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0593, "step": 6080 }, { "epoch": 11.028791657220586, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.0598, "step": 6081 }, { "epoch": 11.030605304919519, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0704, "step": 6082 }, { "epoch": 11.032418952618453, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0683, "step": 6083 }, { "epoch": 11.034232600317388, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0612, "step": 6084 }, { "epoch": 11.036046248016323, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0671, "step": 6085 }, { "epoch": 11.037859895715258, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0598, "step": 6086 }, { "epoch": 11.039673543414192, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0767, "step": 6087 }, { "epoch": 11.041487191113127, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0734, "step": 6088 }, { "epoch": 11.04330083881206, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0761, "step": 6089 }, { "epoch": 11.045114486510995, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.078, "step": 6090 }, { "epoch": 11.04692813420993, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0779, "step": 6091 }, { "epoch": 11.048741781908864, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0801, "step": 6092 }, { "epoch": 11.0505554296078, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0708, "step": 6093 }, { "epoch": 11.052369077306734, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0737, "step": 6094 }, { "epoch": 11.054182725005667, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.1495, "step": 6095 }, { "epoch": 11.055996372704602, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0896, "step": 6096 }, { "epoch": 11.057810020403537, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0853, "step": 6097 }, { "epoch": 11.059623668102471, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0896, "step": 6098 }, { "epoch": 11.061437315801406, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0933, "step": 6099 }, { "epoch": 11.06325096350034, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0875, "step": 6100 }, { "epoch": 11.065064611199274, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0974, "step": 6101 }, { "epoch": 11.066878258898209, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.077, "step": 6102 }, { "epoch": 11.068691906597143, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0933, "step": 6103 }, { "epoch": 11.070505554296078, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0969, "step": 6104 }, { "epoch": 11.072319201995013, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0982, "step": 6105 }, { "epoch": 11.074132849693948, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0873, "step": 6106 }, { "epoch": 11.07594649739288, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1068, "step": 6107 }, { "epoch": 11.077760145091816, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1112, "step": 6108 }, { "epoch": 11.07957379279075, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.1016, "step": 6109 }, { "epoch": 11.081387440489685, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1404, "step": 6110 }, { "epoch": 11.08320108818862, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1281, "step": 6111 }, { "epoch": 11.085014735887555, "grad_norm": 1.25, "learning_rate": 0.0002, "loss": 0.2041, "step": 6112 }, { "epoch": 11.086828383586488, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1376, "step": 6113 }, { "epoch": 11.088642031285422, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1941, "step": 6114 }, { "epoch": 11.090455678984357, "grad_norm": 1.15625, "learning_rate": 0.0002, "loss": 0.3033, "step": 6115 }, { "epoch": 11.092269326683292, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.1212, "step": 6116 }, { "epoch": 11.094082974382227, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0718, "step": 6117 }, { "epoch": 11.095896622081161, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.0625, "step": 6118 }, { "epoch": 11.097710269780094, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0685, "step": 6119 }, { "epoch": 11.09952391747903, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0801, "step": 6120 }, { "epoch": 11.101337565177964, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0719, "step": 6121 }, { "epoch": 11.103151212876899, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0675, "step": 6122 }, { "epoch": 11.104964860575834, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0757, "step": 6123 }, { "epoch": 11.106778508274768, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0686, "step": 6124 }, { "epoch": 11.108592155973701, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0732, "step": 6125 }, { "epoch": 11.110405803672636, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0753, "step": 6126 }, { "epoch": 11.11221945137157, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0775, "step": 6127 }, { "epoch": 11.114033099070506, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0827, "step": 6128 }, { "epoch": 11.11584674676944, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0809, "step": 6129 }, { "epoch": 11.117660394468375, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.079, "step": 6130 }, { "epoch": 11.119474042167308, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0667, "step": 6131 }, { "epoch": 11.121287689866243, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0712, "step": 6132 }, { "epoch": 11.123101337565178, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0694, "step": 6133 }, { "epoch": 11.124914985264112, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0806, "step": 6134 }, { "epoch": 11.126728632963047, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0724, "step": 6135 }, { "epoch": 11.128542280661982, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0863, "step": 6136 }, { "epoch": 11.130355928360915, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0779, "step": 6137 }, { "epoch": 11.13216957605985, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0759, "step": 6138 }, { "epoch": 11.133983223758785, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0717, "step": 6139 }, { "epoch": 11.13579687145772, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0699, "step": 6140 }, { "epoch": 11.137610519156654, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0795, "step": 6141 }, { "epoch": 11.139424166855589, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0831, "step": 6142 }, { "epoch": 11.141237814554522, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0786, "step": 6143 }, { "epoch": 11.143051462253457, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0714, "step": 6144 }, { "epoch": 11.144865109952391, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0815, "step": 6145 }, { "epoch": 11.146678757651326, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0675, "step": 6146 }, { "epoch": 11.148492405350261, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0939, "step": 6147 }, { "epoch": 11.150306053049196, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0888, "step": 6148 }, { "epoch": 11.15211970074813, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1046, "step": 6149 }, { "epoch": 11.153933348447064, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0962, "step": 6150 }, { "epoch": 11.155746996145998, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0928, "step": 6151 }, { "epoch": 11.157560643844933, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.099, "step": 6152 }, { "epoch": 11.159374291543868, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0971, "step": 6153 }, { "epoch": 11.161187939242803, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1273, "step": 6154 }, { "epoch": 11.163001586941737, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0984, "step": 6155 }, { "epoch": 11.16481523464067, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1151, "step": 6156 }, { "epoch": 11.166628882339605, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1154, "step": 6157 }, { "epoch": 11.16844253003854, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0969, "step": 6158 }, { "epoch": 11.170256177737475, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1294, "step": 6159 }, { "epoch": 11.17206982543641, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1577, "step": 6160 }, { "epoch": 11.173883473135344, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1298, "step": 6161 }, { "epoch": 11.175697120834277, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1448, "step": 6162 }, { "epoch": 11.177510768533212, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1351, "step": 6163 }, { "epoch": 11.179324416232147, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1844, "step": 6164 }, { "epoch": 11.181138063931082, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.2158, "step": 6165 }, { "epoch": 11.182951711630016, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1179, "step": 6166 }, { "epoch": 11.184765359328951, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0822, "step": 6167 }, { "epoch": 11.186579007027884, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0699, "step": 6168 }, { "epoch": 11.188392654726819, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0771, "step": 6169 }, { "epoch": 11.190206302425754, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0862, "step": 6170 }, { "epoch": 11.192019950124688, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0723, "step": 6171 }, { "epoch": 11.192019950124688, "eval_loss": 2.3120412826538086, "eval_runtime": 185.732, "eval_samples_per_second": 5.384, "eval_steps_per_second": 5.384, "step": 6171 }, { "epoch": 11.192019950124688, "mmlu_eval_accuracy": 0.30092808403365046, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.38461538461538464, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.4, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2823529411764706, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.2608695652173913, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.45454545454545453, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.15789473684210525, "mmlu_loss": 2.7458994509074364, "step": 6171 }, { "epoch": 11.193833597823623, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0761, "step": 6172 }, { "epoch": 11.195647245522558, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0769, "step": 6173 }, { "epoch": 11.197460893221491, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0798, "step": 6174 }, { "epoch": 11.199274540920426, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0808, "step": 6175 }, { "epoch": 11.20108818861936, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0755, "step": 6176 }, { "epoch": 11.202901836318295, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0792, "step": 6177 }, { "epoch": 11.20471548401723, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1259, "step": 6178 }, { "epoch": 11.206529131716165, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0846, "step": 6179 }, { "epoch": 11.208342779415098, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0811, "step": 6180 }, { "epoch": 11.210156427114033, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0773, "step": 6181 }, { "epoch": 11.211970074812967, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0796, "step": 6182 }, { "epoch": 11.213783722511902, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0679, "step": 6183 }, { "epoch": 11.215597370210837, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0805, "step": 6184 }, { "epoch": 11.217411017909772, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0805, "step": 6185 }, { "epoch": 11.219224665608705, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0754, "step": 6186 }, { "epoch": 11.22103831330764, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0829, "step": 6187 }, { "epoch": 11.222851961006574, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0803, "step": 6188 }, { "epoch": 11.224665608705509, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0875, "step": 6189 }, { "epoch": 11.226479256404444, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0809, "step": 6190 }, { "epoch": 11.228292904103379, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0959, "step": 6191 }, { "epoch": 11.230106551802312, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0813, "step": 6192 }, { "epoch": 11.231920199501246, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0848, "step": 6193 }, { "epoch": 11.233733847200181, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.088, "step": 6194 }, { "epoch": 11.235547494899116, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0836, "step": 6195 }, { "epoch": 11.23736114259805, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0877, "step": 6196 }, { "epoch": 11.239174790296985, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0961, "step": 6197 }, { "epoch": 11.24098843799592, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0906, "step": 6198 }, { "epoch": 11.242802085694853, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0989, "step": 6199 }, { "epoch": 11.244615733393788, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1035, "step": 6200 }, { "epoch": 11.246429381092723, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1, "step": 6201 }, { "epoch": 11.248243028791657, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.093, "step": 6202 }, { "epoch": 11.250056676490592, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1122, "step": 6203 }, { "epoch": 11.251870324189527, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0971, "step": 6204 }, { "epoch": 11.25368397188846, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1081, "step": 6205 }, { "epoch": 11.255497619587395, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.1014, "step": 6206 }, { "epoch": 11.25731126728633, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1301, "step": 6207 }, { "epoch": 11.259124914985264, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.14, "step": 6208 }, { "epoch": 11.2609385626842, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1472, "step": 6209 }, { "epoch": 11.262752210383134, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1328, "step": 6210 }, { "epoch": 11.264565858082067, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1341, "step": 6211 }, { "epoch": 11.266379505781002, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1649, "step": 6212 }, { "epoch": 11.268193153479936, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1673, "step": 6213 }, { "epoch": 11.270006801178871, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1982, "step": 6214 }, { "epoch": 11.271820448877806, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.2689, "step": 6215 }, { "epoch": 11.27363409657674, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1376, "step": 6216 }, { "epoch": 11.275447744275674, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0783, "step": 6217 }, { "epoch": 11.277261391974609, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0771, "step": 6218 }, { "epoch": 11.279075039673543, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0761, "step": 6219 }, { "epoch": 11.280888687372478, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0817, "step": 6220 }, { "epoch": 11.282702335071413, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0819, "step": 6221 }, { "epoch": 11.284515982770348, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0742, "step": 6222 }, { "epoch": 11.28632963046928, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0804, "step": 6223 }, { "epoch": 11.288143278168215, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0826, "step": 6224 }, { "epoch": 11.28995692586715, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0871, "step": 6225 }, { "epoch": 11.291770573566085, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0738, "step": 6226 }, { "epoch": 11.29358422126502, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0826, "step": 6227 }, { "epoch": 11.295397868963954, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0774, "step": 6228 }, { "epoch": 11.297211516662887, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.09, "step": 6229 }, { "epoch": 11.299025164361822, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0878, "step": 6230 }, { "epoch": 11.300838812060757, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0792, "step": 6231 }, { "epoch": 11.302652459759692, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0797, "step": 6232 }, { "epoch": 11.304466107458627, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0862, "step": 6233 }, { "epoch": 11.306279755157561, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0866, "step": 6234 }, { "epoch": 11.308093402856494, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0718, "step": 6235 }, { "epoch": 11.309907050555429, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0889, "step": 6236 }, { "epoch": 11.311720698254364, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0746, "step": 6237 }, { "epoch": 11.313534345953299, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0742, "step": 6238 }, { "epoch": 11.315347993652233, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0754, "step": 6239 }, { "epoch": 11.317161641351168, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0805, "step": 6240 }, { "epoch": 11.318975289050101, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0796, "step": 6241 }, { "epoch": 11.320788936749036, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0875, "step": 6242 }, { "epoch": 11.32260258444797, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1015, "step": 6243 }, { "epoch": 11.324416232146906, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0741, "step": 6244 }, { "epoch": 11.32622987984584, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0896, "step": 6245 }, { "epoch": 11.328043527544775, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0841, "step": 6246 }, { "epoch": 11.329857175243708, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0906, "step": 6247 }, { "epoch": 11.331670822942643, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1028, "step": 6248 }, { "epoch": 11.333484470641578, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0985, "step": 6249 }, { "epoch": 11.335298118340512, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.094, "step": 6250 }, { "epoch": 11.337111766039447, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0827, "step": 6251 }, { "epoch": 11.338925413738382, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0921, "step": 6252 }, { "epoch": 11.340739061437315, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1211, "step": 6253 }, { "epoch": 11.34255270913625, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.106, "step": 6254 }, { "epoch": 11.344366356835184, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1425, "step": 6255 }, { "epoch": 11.34618000453412, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1225, "step": 6256 }, { "epoch": 11.347993652233054, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1308, "step": 6257 }, { "epoch": 11.349807299931989, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1387, "step": 6258 }, { "epoch": 11.351620947630924, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1383, "step": 6259 }, { "epoch": 11.353434595329857, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.145, "step": 6260 }, { "epoch": 11.355248243028791, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1514, "step": 6261 }, { "epoch": 11.357061890727726, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.1766, "step": 6262 }, { "epoch": 11.35887553842666, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1644, "step": 6263 }, { "epoch": 11.360689186125596, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.1868, "step": 6264 }, { "epoch": 11.36250283382453, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.2684, "step": 6265 }, { "epoch": 11.364316481523463, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1261, "step": 6266 }, { "epoch": 11.366130129222398, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0877, "step": 6267 }, { "epoch": 11.367943776921333, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0836, "step": 6268 }, { "epoch": 11.369757424620268, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0886, "step": 6269 }, { "epoch": 11.371571072319203, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0825, "step": 6270 }, { "epoch": 11.373384720018137, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0708, "step": 6271 }, { "epoch": 11.37519836771707, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0827, "step": 6272 }, { "epoch": 11.377012015416005, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0789, "step": 6273 }, { "epoch": 11.37882566311494, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0787, "step": 6274 }, { "epoch": 11.380639310813875, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0835, "step": 6275 }, { "epoch": 11.38245295851281, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0833, "step": 6276 }, { "epoch": 11.384266606211744, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0863, "step": 6277 }, { "epoch": 11.386080253910677, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0989, "step": 6278 }, { "epoch": 11.387893901609612, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0806, "step": 6279 }, { "epoch": 11.389707549308547, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0839, "step": 6280 }, { "epoch": 11.391521197007481, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0949, "step": 6281 }, { "epoch": 11.393334844706416, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0832, "step": 6282 }, { "epoch": 11.395148492405351, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0853, "step": 6283 }, { "epoch": 11.396962140104284, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0825, "step": 6284 }, { "epoch": 11.398775787803219, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0864, "step": 6285 }, { "epoch": 11.400589435502154, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0748, "step": 6286 }, { "epoch": 11.402403083201088, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.072, "step": 6287 }, { "epoch": 11.404216730900023, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0865, "step": 6288 }, { "epoch": 11.406030378598958, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0854, "step": 6289 }, { "epoch": 11.40784402629789, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0812, "step": 6290 }, { "epoch": 11.409657673996826, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0709, "step": 6291 }, { "epoch": 11.41147132169576, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0774, "step": 6292 }, { "epoch": 11.413284969394695, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.095, "step": 6293 }, { "epoch": 11.41509861709363, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0962, "step": 6294 }, { "epoch": 11.416912264792565, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0772, "step": 6295 }, { "epoch": 11.418725912491498, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0968, "step": 6296 }, { "epoch": 11.420539560190432, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0823, "step": 6297 }, { "epoch": 11.422353207889367, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0992, "step": 6298 }, { "epoch": 11.424166855588302, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.113, "step": 6299 }, { "epoch": 11.425980503287237, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0982, "step": 6300 }, { "epoch": 11.427794150986172, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0925, "step": 6301 }, { "epoch": 11.429607798685105, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.097, "step": 6302 }, { "epoch": 11.43142144638404, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0981, "step": 6303 }, { "epoch": 11.433235094082974, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0987, "step": 6304 }, { "epoch": 11.435048741781909, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1044, "step": 6305 }, { "epoch": 11.436862389480844, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1004, "step": 6306 }, { "epoch": 11.438676037179778, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1597, "step": 6307 }, { "epoch": 11.440489684878713, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1202, "step": 6308 }, { "epoch": 11.442303332577646, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1397, "step": 6309 }, { "epoch": 11.444116980276581, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1315, "step": 6310 }, { "epoch": 11.445930627975516, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1303, "step": 6311 }, { "epoch": 11.44774427567445, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1425, "step": 6312 }, { "epoch": 11.449557923373385, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1696, "step": 6313 }, { "epoch": 11.451371571072318, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.1973, "step": 6314 }, { "epoch": 11.453185218771253, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.2269, "step": 6315 }, { "epoch": 11.454998866470188, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1258, "step": 6316 }, { "epoch": 11.456812514169123, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0959, "step": 6317 }, { "epoch": 11.458626161868057, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0898, "step": 6318 }, { "epoch": 11.460439809566992, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0748, "step": 6319 }, { "epoch": 11.462253457265927, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0823, "step": 6320 }, { "epoch": 11.46406710496486, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0757, "step": 6321 }, { "epoch": 11.465880752663795, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0844, "step": 6322 }, { "epoch": 11.46769440036273, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0763, "step": 6323 }, { "epoch": 11.469508048061664, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0981, "step": 6324 }, { "epoch": 11.471321695760599, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0884, "step": 6325 }, { "epoch": 11.473135343459534, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0822, "step": 6326 }, { "epoch": 11.474948991158467, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0944, "step": 6327 }, { "epoch": 11.476762638857402, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0951, "step": 6328 }, { "epoch": 11.478576286556336, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0928, "step": 6329 }, { "epoch": 11.480389934255271, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0776, "step": 6330 }, { "epoch": 11.482203581954206, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.084, "step": 6331 }, { "epoch": 11.48401722965314, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0902, "step": 6332 }, { "epoch": 11.485830877352074, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0871, "step": 6333 }, { "epoch": 11.487644525051008, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0802, "step": 6334 }, { "epoch": 11.489458172749943, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.075, "step": 6335 }, { "epoch": 11.491271820448878, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0877, "step": 6336 }, { "epoch": 11.493085468147813, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0897, "step": 6337 }, { "epoch": 11.494899115846748, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.077, "step": 6338 }, { "epoch": 11.49671276354568, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0772, "step": 6339 }, { "epoch": 11.498526411244615, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0921, "step": 6340 }, { "epoch": 11.50034005894355, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0804, "step": 6341 }, { "epoch": 11.502153706642485, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0915, "step": 6342 }, { "epoch": 11.50396735434142, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1053, "step": 6343 }, { "epoch": 11.505781002040354, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0856, "step": 6344 }, { "epoch": 11.507594649739287, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0911, "step": 6345 }, { "epoch": 11.509408297438222, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0954, "step": 6346 }, { "epoch": 11.511221945137157, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0955, "step": 6347 }, { "epoch": 11.513035592836092, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1082, "step": 6348 }, { "epoch": 11.514849240535026, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1001, "step": 6349 }, { "epoch": 11.516662888233961, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1007, "step": 6350 }, { "epoch": 11.518476535932894, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1013, "step": 6351 }, { "epoch": 11.520290183631829, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0932, "step": 6352 }, { "epoch": 11.522103831330764, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1032, "step": 6353 }, { "epoch": 11.523917479029699, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1094, "step": 6354 }, { "epoch": 11.525731126728633, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1113, "step": 6355 }, { "epoch": 11.527544774427568, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1253, "step": 6356 }, { "epoch": 11.529358422126503, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1193, "step": 6357 }, { "epoch": 11.531172069825436, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1423, "step": 6358 }, { "epoch": 11.531172069825436, "eval_loss": 2.2633256912231445, "eval_runtime": 185.8082, "eval_samples_per_second": 5.382, "eval_steps_per_second": 5.382, "step": 6358 }, { "epoch": 11.531172069825436, "mmlu_eval_accuracy": 0.30531437902665426, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.2727272727272727, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.32558139534883723, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.4444444444444444, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.4, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.4444444444444444, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.772678396306265, "step": 6358 }, { "epoch": 11.53298571752437, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1347, "step": 6359 }, { "epoch": 11.534799365223305, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1447, "step": 6360 }, { "epoch": 11.53661301292224, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.1603, "step": 6361 }, { "epoch": 11.538426660621175, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1688, "step": 6362 }, { "epoch": 11.540240308320108, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1976, "step": 6363 }, { "epoch": 11.542053956019043, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.2019, "step": 6364 }, { "epoch": 11.543867603717977, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.2678, "step": 6365 }, { "epoch": 11.545681251416912, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1172, "step": 6366 }, { "epoch": 11.547494899115847, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0818, "step": 6367 }, { "epoch": 11.549308546814782, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1033, "step": 6368 }, { "epoch": 11.551122194513717, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0919, "step": 6369 }, { "epoch": 11.55293584221265, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0838, "step": 6370 }, { "epoch": 11.554749489911584, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0934, "step": 6371 }, { "epoch": 11.55656313761052, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.083, "step": 6372 }, { "epoch": 11.558376785309454, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0834, "step": 6373 }, { "epoch": 11.560190433008389, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0826, "step": 6374 }, { "epoch": 11.562004080707322, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0863, "step": 6375 }, { "epoch": 11.563817728406256, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0845, "step": 6376 }, { "epoch": 11.565631376105191, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0917, "step": 6377 }, { "epoch": 11.567445023804126, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0852, "step": 6378 }, { "epoch": 11.56925867150306, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0831, "step": 6379 }, { "epoch": 11.571072319201996, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.096, "step": 6380 }, { "epoch": 11.57288596690093, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0844, "step": 6381 }, { "epoch": 11.574699614599863, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0858, "step": 6382 }, { "epoch": 11.576513262298798, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0836, "step": 6383 }, { "epoch": 11.578326909997733, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0855, "step": 6384 }, { "epoch": 11.580140557696668, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.083, "step": 6385 }, { "epoch": 11.581954205395602, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0797, "step": 6386 }, { "epoch": 11.583767853094537, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0855, "step": 6387 }, { "epoch": 11.58558150079347, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0783, "step": 6388 }, { "epoch": 11.587395148492405, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0875, "step": 6389 }, { "epoch": 11.58920879619134, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0741, "step": 6390 }, { "epoch": 11.591022443890274, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1025, "step": 6391 }, { "epoch": 11.59283609158921, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0874, "step": 6392 }, { "epoch": 11.594649739288144, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0871, "step": 6393 }, { "epoch": 11.596463386987077, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0927, "step": 6394 }, { "epoch": 11.598277034686012, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.093, "step": 6395 }, { "epoch": 11.600090682384947, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0808, "step": 6396 }, { "epoch": 11.601904330083881, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1013, "step": 6397 }, { "epoch": 11.603717977782816, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0971, "step": 6398 }, { "epoch": 11.60553162548175, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.091, "step": 6399 }, { "epoch": 11.607345273180684, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1118, "step": 6400 }, { "epoch": 11.609158920879619, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0995, "step": 6401 }, { "epoch": 11.610972568578553, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1034, "step": 6402 }, { "epoch": 11.612786216277488, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1201, "step": 6403 }, { "epoch": 11.614599863976423, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1018, "step": 6404 }, { "epoch": 11.616413511675358, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1114, "step": 6405 }, { "epoch": 11.61822715937429, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1236, "step": 6406 }, { "epoch": 11.620040807073226, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1076, "step": 6407 }, { "epoch": 11.62185445477216, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1397, "step": 6408 }, { "epoch": 11.623668102471095, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1299, "step": 6409 }, { "epoch": 11.62548175017003, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1238, "step": 6410 }, { "epoch": 11.627295397868965, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1497, "step": 6411 }, { "epoch": 11.629109045567898, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1457, "step": 6412 }, { "epoch": 11.630922693266832, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1849, "step": 6413 }, { "epoch": 11.632736340965767, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2142, "step": 6414 }, { "epoch": 11.634549988664702, "grad_norm": 1.609375, "learning_rate": 0.0002, "loss": 0.2729, "step": 6415 }, { "epoch": 11.636363636363637, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.1148, "step": 6416 }, { "epoch": 11.638177284062571, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1107, "step": 6417 }, { "epoch": 11.639990931761506, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0891, "step": 6418 }, { "epoch": 11.64180457946044, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0781, "step": 6419 }, { "epoch": 11.643618227159374, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0881, "step": 6420 }, { "epoch": 11.645431874858309, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0824, "step": 6421 }, { "epoch": 11.647245522557244, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0877, "step": 6422 }, { "epoch": 11.649059170256178, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0907, "step": 6423 }, { "epoch": 11.650872817955111, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0861, "step": 6424 }, { "epoch": 11.652686465654046, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0889, "step": 6425 }, { "epoch": 11.65450011335298, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0916, "step": 6426 }, { "epoch": 11.656313761051916, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0946, "step": 6427 }, { "epoch": 11.65812740875085, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0859, "step": 6428 }, { "epoch": 11.659941056449785, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.091, "step": 6429 }, { "epoch": 11.66175470414872, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0891, "step": 6430 }, { "epoch": 11.663568351847653, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0818, "step": 6431 }, { "epoch": 11.665381999546588, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0801, "step": 6432 }, { "epoch": 11.667195647245522, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0856, "step": 6433 }, { "epoch": 11.669009294944457, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0767, "step": 6434 }, { "epoch": 11.670822942643392, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0873, "step": 6435 }, { "epoch": 11.672636590342327, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0773, "step": 6436 }, { "epoch": 11.67445023804126, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0894, "step": 6437 }, { "epoch": 11.676263885740195, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0895, "step": 6438 }, { "epoch": 11.67807753343913, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1103, "step": 6439 }, { "epoch": 11.679891181138064, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0905, "step": 6440 }, { "epoch": 11.681704828836999, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0954, "step": 6441 }, { "epoch": 11.683518476535934, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0846, "step": 6442 }, { "epoch": 11.685332124234867, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0933, "step": 6443 }, { "epoch": 11.687145771933801, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0869, "step": 6444 }, { "epoch": 11.688959419632736, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0905, "step": 6445 }, { "epoch": 11.690773067331671, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1422, "step": 6446 }, { "epoch": 11.692586715030606, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.093, "step": 6447 }, { "epoch": 11.69440036272954, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0927, "step": 6448 }, { "epoch": 11.696214010428474, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0983, "step": 6449 }, { "epoch": 11.698027658127408, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1035, "step": 6450 }, { "epoch": 11.699841305826343, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1049, "step": 6451 }, { "epoch": 11.701654953525278, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0888, "step": 6452 }, { "epoch": 11.703468601224213, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1004, "step": 6453 }, { "epoch": 11.705282248923147, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1087, "step": 6454 }, { "epoch": 11.70709589662208, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1217, "step": 6455 }, { "epoch": 11.708909544321015, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.1326, "step": 6456 }, { "epoch": 11.71072319201995, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1337, "step": 6457 }, { "epoch": 11.712536839718885, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1207, "step": 6458 }, { "epoch": 11.71435048741782, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1249, "step": 6459 }, { "epoch": 11.716164135116754, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1249, "step": 6460 }, { "epoch": 11.717977782815687, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1386, "step": 6461 }, { "epoch": 11.719791430514622, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1601, "step": 6462 }, { "epoch": 11.721605078213557, "grad_norm": 1.046875, "learning_rate": 0.0002, "loss": 0.1711, "step": 6463 }, { "epoch": 11.723418725912492, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.2033, "step": 6464 }, { "epoch": 11.725232373611426, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2654, "step": 6465 }, { "epoch": 11.727046021310361, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1478, "step": 6466 }, { "epoch": 11.728859669009296, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0791, "step": 6467 }, { "epoch": 11.730673316708229, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0961, "step": 6468 }, { "epoch": 11.732486964407164, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0844, "step": 6469 }, { "epoch": 11.734300612106098, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0835, "step": 6470 }, { "epoch": 11.736114259805033, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0842, "step": 6471 }, { "epoch": 11.737927907503968, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0933, "step": 6472 }, { "epoch": 11.739741555202901, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.094, "step": 6473 }, { "epoch": 11.741555202901836, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0966, "step": 6474 }, { "epoch": 11.74336885060077, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0849, "step": 6475 }, { "epoch": 11.745182498299705, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0898, "step": 6476 }, { "epoch": 11.74699614599864, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0968, "step": 6477 }, { "epoch": 11.748809793697575, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0802, "step": 6478 }, { "epoch": 11.75062344139651, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0926, "step": 6479 }, { "epoch": 11.752437089095443, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.118, "step": 6480 }, { "epoch": 11.754250736794377, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0892, "step": 6481 }, { "epoch": 11.756064384493312, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0894, "step": 6482 }, { "epoch": 11.757878032192247, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0895, "step": 6483 }, { "epoch": 11.759691679891182, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0918, "step": 6484 }, { "epoch": 11.761505327590115, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0788, "step": 6485 }, { "epoch": 11.76331897528905, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0984, "step": 6486 }, { "epoch": 11.765132622987984, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0867, "step": 6487 }, { "epoch": 11.766946270686919, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0932, "step": 6488 }, { "epoch": 11.768759918385854, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0963, "step": 6489 }, { "epoch": 11.770573566084789, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0881, "step": 6490 }, { "epoch": 11.772387213783723, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0831, "step": 6491 }, { "epoch": 11.774200861482656, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0744, "step": 6492 }, { "epoch": 11.776014509181591, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0999, "step": 6493 }, { "epoch": 11.777828156880526, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0887, "step": 6494 }, { "epoch": 11.77964180457946, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0965, "step": 6495 }, { "epoch": 11.781455452278395, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0787, "step": 6496 }, { "epoch": 11.78326909997733, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1078, "step": 6497 }, { "epoch": 11.785082747676263, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0967, "step": 6498 }, { "epoch": 11.786896395375198, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1097, "step": 6499 }, { "epoch": 11.788710043074133, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0981, "step": 6500 }, { "epoch": 11.790523690773068, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.091, "step": 6501 }, { "epoch": 11.792337338472002, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.104, "step": 6502 }, { "epoch": 11.794150986170937, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1103, "step": 6503 }, { "epoch": 11.79596463386987, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1442, "step": 6504 }, { "epoch": 11.797778281568805, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.1349, "step": 6505 }, { "epoch": 11.79959192926774, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1161, "step": 6506 }, { "epoch": 11.801405576966674, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1398, "step": 6507 }, { "epoch": 11.80321922466561, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1193, "step": 6508 }, { "epoch": 11.805032872364544, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1377, "step": 6509 }, { "epoch": 11.806846520063477, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1345, "step": 6510 }, { "epoch": 11.808660167762412, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1374, "step": 6511 }, { "epoch": 11.810473815461346, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.146, "step": 6512 }, { "epoch": 11.812287463160281, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1689, "step": 6513 }, { "epoch": 11.814101110859216, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1659, "step": 6514 }, { "epoch": 11.81591475855815, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.2614, "step": 6515 }, { "epoch": 11.817728406257084, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1441, "step": 6516 }, { "epoch": 11.819542053956019, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1218, "step": 6517 }, { "epoch": 11.821355701654953, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0982, "step": 6518 }, { "epoch": 11.823169349353888, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1041, "step": 6519 }, { "epoch": 11.824982997052823, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0838, "step": 6520 }, { "epoch": 11.826796644751758, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0911, "step": 6521 }, { "epoch": 11.82861029245069, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0908, "step": 6522 }, { "epoch": 11.830423940149625, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0898, "step": 6523 }, { "epoch": 11.83223758784856, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0892, "step": 6524 }, { "epoch": 11.834051235547495, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0822, "step": 6525 }, { "epoch": 11.83586488324643, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0923, "step": 6526 }, { "epoch": 11.837678530945364, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0978, "step": 6527 }, { "epoch": 11.8394921786443, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0893, "step": 6528 }, { "epoch": 11.841305826343232, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1019, "step": 6529 }, { "epoch": 11.843119474042167, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0876, "step": 6530 }, { "epoch": 11.844933121741102, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0856, "step": 6531 }, { "epoch": 11.846746769440037, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0841, "step": 6532 }, { "epoch": 11.848560417138971, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0995, "step": 6533 }, { "epoch": 11.850374064837904, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0888, "step": 6534 }, { "epoch": 11.85218771253684, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1035, "step": 6535 }, { "epoch": 11.854001360235774, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0945, "step": 6536 }, { "epoch": 11.855815007934709, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0925, "step": 6537 }, { "epoch": 11.857628655633643, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.095, "step": 6538 }, { "epoch": 11.859442303332578, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0843, "step": 6539 }, { "epoch": 11.861255951031513, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0923, "step": 6540 }, { "epoch": 11.863069598730446, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1157, "step": 6541 }, { "epoch": 11.86488324642938, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.0984, "step": 6542 }, { "epoch": 11.866696894128316, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0865, "step": 6543 }, { "epoch": 11.86851054182725, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1102, "step": 6544 }, { "epoch": 11.870324189526185, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0945, "step": 6545 }, { "epoch": 11.870324189526185, "eval_loss": 2.2472355365753174, "eval_runtime": 186.1224, "eval_samples_per_second": 5.373, "eval_steps_per_second": 5.373, "step": 6545 }, { "epoch": 11.870324189526185, "mmlu_eval_accuracy": 0.3051747914392692, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.5217391304347826, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.2894736842105263, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 2.8529180990645187, "step": 6545 }, { "epoch": 11.87213783722512, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0996, "step": 6546 }, { "epoch": 11.873951484924053, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1143, "step": 6547 }, { "epoch": 11.875765132622988, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0954, "step": 6548 }, { "epoch": 11.877578780321922, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0861, "step": 6549 }, { "epoch": 11.879392428020857, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1057, "step": 6550 }, { "epoch": 11.881206075719792, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1134, "step": 6551 }, { "epoch": 11.883019723418727, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1043, "step": 6552 }, { "epoch": 11.88483337111766, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1299, "step": 6553 }, { "epoch": 11.886647018816594, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1237, "step": 6554 }, { "epoch": 11.88846066651553, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1048, "step": 6555 }, { "epoch": 11.890274314214464, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1106, "step": 6556 }, { "epoch": 11.892087961913399, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1272, "step": 6557 }, { "epoch": 11.893901609612334, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1423, "step": 6558 }, { "epoch": 11.895715257311267, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1364, "step": 6559 }, { "epoch": 11.897528905010201, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.154, "step": 6560 }, { "epoch": 11.899342552709136, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1504, "step": 6561 }, { "epoch": 11.90115620040807, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.156, "step": 6562 }, { "epoch": 11.902969848107006, "grad_norm": 1.46875, "learning_rate": 0.0002, "loss": 0.1817, "step": 6563 }, { "epoch": 11.90478349580594, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1928, "step": 6564 }, { "epoch": 11.906597143504873, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.235, "step": 6565 }, { "epoch": 11.908410791203808, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1928, "step": 6566 }, { "epoch": 11.910224438902743, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0939, "step": 6567 }, { "epoch": 11.912038086601678, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0794, "step": 6568 }, { "epoch": 11.913851734300613, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0938, "step": 6569 }, { "epoch": 11.915665381999547, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.098, "step": 6570 }, { "epoch": 11.91747902969848, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0842, "step": 6571 }, { "epoch": 11.919292677397415, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1076, "step": 6572 }, { "epoch": 11.92110632509635, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0999, "step": 6573 }, { "epoch": 11.922919972795285, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0976, "step": 6574 }, { "epoch": 11.92473362049422, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0852, "step": 6575 }, { "epoch": 11.926547268193154, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0902, "step": 6576 }, { "epoch": 11.928360915892087, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1068, "step": 6577 }, { "epoch": 11.930174563591022, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1002, "step": 6578 }, { "epoch": 11.931988211289957, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1015, "step": 6579 }, { "epoch": 11.933801858988891, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0958, "step": 6580 }, { "epoch": 11.935615506687826, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0897, "step": 6581 }, { "epoch": 11.937429154386761, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0918, "step": 6582 }, { "epoch": 11.939242802085694, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0965, "step": 6583 }, { "epoch": 11.941056449784629, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0865, "step": 6584 }, { "epoch": 11.942870097483564, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0905, "step": 6585 }, { "epoch": 11.944683745182498, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0964, "step": 6586 }, { "epoch": 11.946497392881433, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0966, "step": 6587 }, { "epoch": 11.948311040580368, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0869, "step": 6588 }, { "epoch": 11.950124688279303, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0862, "step": 6589 }, { "epoch": 11.951938335978236, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0927, "step": 6590 }, { "epoch": 11.95375198367717, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0905, "step": 6591 }, { "epoch": 11.955565631376105, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.0949, "step": 6592 }, { "epoch": 11.95737927907504, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.0968, "step": 6593 }, { "epoch": 11.959192926773975, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0975, "step": 6594 }, { "epoch": 11.961006574472908, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.106, "step": 6595 }, { "epoch": 11.962820222171842, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0886, "step": 6596 }, { "epoch": 11.964633869870777, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1196, "step": 6597 }, { "epoch": 11.966447517569712, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1096, "step": 6598 }, { "epoch": 11.968261165268647, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0929, "step": 6599 }, { "epoch": 11.970074812967582, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.109, "step": 6600 }, { "epoch": 11.971888460666516, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1207, "step": 6601 }, { "epoch": 11.97370210836545, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1148, "step": 6602 }, { "epoch": 11.975515756064384, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1239, "step": 6603 }, { "epoch": 11.977329403763319, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1243, "step": 6604 }, { "epoch": 11.979143051462254, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1338, "step": 6605 }, { "epoch": 11.980956699161188, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1245, "step": 6606 }, { "epoch": 11.982770346860123, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1344, "step": 6607 }, { "epoch": 11.984583994559056, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1403, "step": 6608 }, { "epoch": 11.986397642257991, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1393, "step": 6609 }, { "epoch": 11.988211289956926, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1523, "step": 6610 }, { "epoch": 11.99002493765586, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1452, "step": 6611 }, { "epoch": 11.991838585354795, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.1653, "step": 6612 }, { "epoch": 11.99365223305373, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1796, "step": 6613 }, { "epoch": 11.995465880752663, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.2324, "step": 6614 }, { "epoch": 11.997279528451598, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2921, "step": 6615 }, { "epoch": 11.999093176150533, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1531, "step": 6616 }, { "epoch": 12.000906823849467, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1422, "step": 6617 }, { "epoch": 12.002720471548402, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.0663, "step": 6618 }, { "epoch": 12.004534119247337, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.0697, "step": 6619 }, { "epoch": 12.00634776694627, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.058, "step": 6620 }, { "epoch": 12.008161414645205, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0627, "step": 6621 }, { "epoch": 12.00997506234414, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.0598, "step": 6622 }, { "epoch": 12.011788710043074, "grad_norm": 0.3203125, "learning_rate": 0.0002, "loss": 0.0534, "step": 6623 }, { "epoch": 12.013602357742009, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.059, "step": 6624 }, { "epoch": 12.015416005440944, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0634, "step": 6625 }, { "epoch": 12.017229653139877, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0629, "step": 6626 }, { "epoch": 12.019043300838812, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0644, "step": 6627 }, { "epoch": 12.020856948537746, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0672, "step": 6628 }, { "epoch": 12.022670596236681, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0544, "step": 6629 }, { "epoch": 12.024484243935616, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.067, "step": 6630 }, { "epoch": 12.02629789163455, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0722, "step": 6631 }, { "epoch": 12.028111539333484, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0652, "step": 6632 }, { "epoch": 12.029925187032418, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0589, "step": 6633 }, { "epoch": 12.031738834731353, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0713, "step": 6634 }, { "epoch": 12.033552482430288, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.062, "step": 6635 }, { "epoch": 12.035366130129223, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0732, "step": 6636 }, { "epoch": 12.037179777828158, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0693, "step": 6637 }, { "epoch": 12.03899342552709, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0665, "step": 6638 }, { "epoch": 12.040807073226025, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0731, "step": 6639 }, { "epoch": 12.04262072092496, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0655, "step": 6640 }, { "epoch": 12.044434368623895, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0777, "step": 6641 }, { "epoch": 12.04624801632283, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0661, "step": 6642 }, { "epoch": 12.048061664021764, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.065, "step": 6643 }, { "epoch": 12.049875311720697, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0812, "step": 6644 }, { "epoch": 12.051688959419632, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0751, "step": 6645 }, { "epoch": 12.053502607118567, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0812, "step": 6646 }, { "epoch": 12.055316254817502, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0771, "step": 6647 }, { "epoch": 12.057129902516436, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0848, "step": 6648 }, { "epoch": 12.058943550215371, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0781, "step": 6649 }, { "epoch": 12.060757197914306, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.09, "step": 6650 }, { "epoch": 12.062570845613239, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1205, "step": 6651 }, { "epoch": 12.064384493312174, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0735, "step": 6652 }, { "epoch": 12.066198141011109, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0867, "step": 6653 }, { "epoch": 12.068011788710043, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0911, "step": 6654 }, { "epoch": 12.069825436408978, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.086, "step": 6655 }, { "epoch": 12.071639084107913, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0948, "step": 6656 }, { "epoch": 12.073452731806846, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0981, "step": 6657 }, { "epoch": 12.07526637950578, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0974, "step": 6658 }, { "epoch": 12.077080027204715, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1075, "step": 6659 }, { "epoch": 12.07889367490365, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1056, "step": 6660 }, { "epoch": 12.080707322602585, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1278, "step": 6661 }, { "epoch": 12.08252097030152, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.122, "step": 6662 }, { "epoch": 12.084334618000453, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.1411, "step": 6663 }, { "epoch": 12.086148265699387, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1413, "step": 6664 }, { "epoch": 12.087961913398322, "grad_norm": 0.97265625, "learning_rate": 0.0002, "loss": 0.1681, "step": 6665 }, { "epoch": 12.089775561097257, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1517, "step": 6666 }, { "epoch": 12.091589208796192, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.1604, "step": 6667 }, { "epoch": 12.093402856495127, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0698, "step": 6668 }, { "epoch": 12.09521650419406, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0797, "step": 6669 }, { "epoch": 12.097030151892994, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.0613, "step": 6670 }, { "epoch": 12.09884379959193, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0663, "step": 6671 }, { "epoch": 12.100657447290864, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.06, "step": 6672 }, { "epoch": 12.102471094989799, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0612, "step": 6673 }, { "epoch": 12.104284742688733, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0795, "step": 6674 }, { "epoch": 12.106098390387666, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0723, "step": 6675 }, { "epoch": 12.107912038086601, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0664, "step": 6676 }, { "epoch": 12.109725685785536, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0677, "step": 6677 }, { "epoch": 12.11153933348447, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0676, "step": 6678 }, { "epoch": 12.113352981183406, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0702, "step": 6679 }, { "epoch": 12.11516662888234, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0649, "step": 6680 }, { "epoch": 12.116980276581273, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0752, "step": 6681 }, { "epoch": 12.118793924280208, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0792, "step": 6682 }, { "epoch": 12.120607571979143, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0648, "step": 6683 }, { "epoch": 12.122421219678078, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0793, "step": 6684 }, { "epoch": 12.124234867377012, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0718, "step": 6685 }, { "epoch": 12.126048515075947, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0728, "step": 6686 }, { "epoch": 12.12786216277488, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0701, "step": 6687 }, { "epoch": 12.129675810473815, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0683, "step": 6688 }, { "epoch": 12.13148945817275, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0714, "step": 6689 }, { "epoch": 12.133303105871684, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0708, "step": 6690 }, { "epoch": 12.13511675357062, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0632, "step": 6691 }, { "epoch": 12.136930401269554, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0788, "step": 6692 }, { "epoch": 12.138744048968487, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0691, "step": 6693 }, { "epoch": 12.140557696667422, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0731, "step": 6694 }, { "epoch": 12.142371344366357, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0872, "step": 6695 }, { "epoch": 12.144184992065291, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0735, "step": 6696 }, { "epoch": 12.145998639764226, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.069, "step": 6697 }, { "epoch": 12.14781228746316, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1168, "step": 6698 }, { "epoch": 12.149625935162096, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0888, "step": 6699 }, { "epoch": 12.151439582861029, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0802, "step": 6700 }, { "epoch": 12.153253230559963, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0792, "step": 6701 }, { "epoch": 12.155066878258898, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0898, "step": 6702 }, { "epoch": 12.156880525957833, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0878, "step": 6703 }, { "epoch": 12.158694173656768, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0893, "step": 6704 }, { "epoch": 12.160507821355703, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0994, "step": 6705 }, { "epoch": 12.162321469054636, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0989, "step": 6706 }, { "epoch": 12.16413511675357, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1174, "step": 6707 }, { "epoch": 12.165948764452505, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1078, "step": 6708 }, { "epoch": 12.16776241215144, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1211, "step": 6709 }, { "epoch": 12.169576059850375, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1112, "step": 6710 }, { "epoch": 12.17138970754931, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1288, "step": 6711 }, { "epoch": 12.173203355248242, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1253, "step": 6712 }, { "epoch": 12.175017002947177, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1563, "step": 6713 }, { "epoch": 12.176830650646112, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1554, "step": 6714 }, { "epoch": 12.178644298345047, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1811, "step": 6715 }, { "epoch": 12.180457946043981, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2175, "step": 6716 }, { "epoch": 12.182271593742916, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1763, "step": 6717 }, { "epoch": 12.18408524144185, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0741, "step": 6718 }, { "epoch": 12.185898889140784, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0633, "step": 6719 }, { "epoch": 12.187712536839719, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0745, "step": 6720 }, { "epoch": 12.189526184538654, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0782, "step": 6721 }, { "epoch": 12.191339832237588, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0691, "step": 6722 }, { "epoch": 12.193153479936523, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0715, "step": 6723 }, { "epoch": 12.194967127635456, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0663, "step": 6724 }, { "epoch": 12.19678077533439, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0713, "step": 6725 }, { "epoch": 12.198594423033326, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0763, "step": 6726 }, { "epoch": 12.20040807073226, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0841, "step": 6727 }, { "epoch": 12.202221718431195, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0704, "step": 6728 }, { "epoch": 12.20403536613013, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1108, "step": 6729 }, { "epoch": 12.205849013829063, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0763, "step": 6730 }, { "epoch": 12.207662661527998, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0634, "step": 6731 }, { "epoch": 12.209476309226932, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0719, "step": 6732 }, { "epoch": 12.209476309226932, "eval_loss": 2.3867170810699463, "eval_runtime": 185.758, "eval_samples_per_second": 5.383, "eval_steps_per_second": 5.383, "step": 6732 }, { "epoch": 12.209476309226932, "mmlu_eval_accuracy": 0.2877297484739115, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.25, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.26666666666666666, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.15384615384615385, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.18181818181818182, "mmlu_eval_accuracy_miscellaneous": 0.4186046511627907, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.23, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.5142857142857142, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.2727272727272727, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.0009652245753267, "step": 6732 }, { "epoch": 12.211289956925867, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0639, "step": 6733 }, { "epoch": 12.213103604624802, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0761, "step": 6734 }, { "epoch": 12.214917252323737, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0707, "step": 6735 }, { "epoch": 12.21673090002267, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0762, "step": 6736 }, { "epoch": 12.218544547721605, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0807, "step": 6737 }, { "epoch": 12.22035819542054, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0776, "step": 6738 }, { "epoch": 12.222171843119474, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0849, "step": 6739 }, { "epoch": 12.223985490818409, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0714, "step": 6740 }, { "epoch": 12.225799138517344, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0763, "step": 6741 }, { "epoch": 12.227612786216277, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0752, "step": 6742 }, { "epoch": 12.229426433915211, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.077, "step": 6743 }, { "epoch": 12.231240081614146, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0836, "step": 6744 }, { "epoch": 12.233053729313081, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0738, "step": 6745 }, { "epoch": 12.234867377012016, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0696, "step": 6746 }, { "epoch": 12.23668102471095, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0807, "step": 6747 }, { "epoch": 12.238494672409884, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0857, "step": 6748 }, { "epoch": 12.240308320108818, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.086, "step": 6749 }, { "epoch": 12.242121967807753, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0846, "step": 6750 }, { "epoch": 12.243935615506688, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0934, "step": 6751 }, { "epoch": 12.245749263205623, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.094, "step": 6752 }, { "epoch": 12.247562910904557, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0976, "step": 6753 }, { "epoch": 12.24937655860349, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.096, "step": 6754 }, { "epoch": 12.251190206302425, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1113, "step": 6755 }, { "epoch": 12.25300385400136, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0921, "step": 6756 }, { "epoch": 12.254817501700295, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1162, "step": 6757 }, { "epoch": 12.25663114939923, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1066, "step": 6758 }, { "epoch": 12.258444797098164, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0938, "step": 6759 }, { "epoch": 12.260258444797099, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1086, "step": 6760 }, { "epoch": 12.262072092496032, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1149, "step": 6761 }, { "epoch": 12.263885740194967, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.1145, "step": 6762 }, { "epoch": 12.265699387893902, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1358, "step": 6763 }, { "epoch": 12.267513035592836, "grad_norm": 1.0703125, "learning_rate": 0.0002, "loss": 0.2011, "step": 6764 }, { "epoch": 12.269326683291771, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.149, "step": 6765 }, { "epoch": 12.271140330990706, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.2044, "step": 6766 }, { "epoch": 12.272953978689639, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1965, "step": 6767 }, { "epoch": 12.274767626388574, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0748, "step": 6768 }, { "epoch": 12.276581274087508, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0712, "step": 6769 }, { "epoch": 12.278394921786443, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0775, "step": 6770 }, { "epoch": 12.280208569485378, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0737, "step": 6771 }, { "epoch": 12.282022217184313, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0752, "step": 6772 }, { "epoch": 12.283835864883246, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0696, "step": 6773 }, { "epoch": 12.28564951258218, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0773, "step": 6774 }, { "epoch": 12.287463160281115, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0781, "step": 6775 }, { "epoch": 12.28927680798005, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0755, "step": 6776 }, { "epoch": 12.291090455678985, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0758, "step": 6777 }, { "epoch": 12.29290410337792, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0765, "step": 6778 }, { "epoch": 12.294717751076853, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0866, "step": 6779 }, { "epoch": 12.296531398775787, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0672, "step": 6780 }, { "epoch": 12.298345046474722, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0794, "step": 6781 }, { "epoch": 12.300158694173657, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0731, "step": 6782 }, { "epoch": 12.301972341872592, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0768, "step": 6783 }, { "epoch": 12.303785989571526, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0842, "step": 6784 }, { "epoch": 12.30559963727046, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0869, "step": 6785 }, { "epoch": 12.307413284969394, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.073, "step": 6786 }, { "epoch": 12.309226932668329, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0828, "step": 6787 }, { "epoch": 12.311040580367264, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0781, "step": 6788 }, { "epoch": 12.312854228066199, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0847, "step": 6789 }, { "epoch": 12.314667875765133, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0771, "step": 6790 }, { "epoch": 12.316481523464066, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0774, "step": 6791 }, { "epoch": 12.318295171163001, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0741, "step": 6792 }, { "epoch": 12.320108818861936, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0764, "step": 6793 }, { "epoch": 12.32192246656087, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0886, "step": 6794 }, { "epoch": 12.323736114259805, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1363, "step": 6795 }, { "epoch": 12.32554976195874, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0856, "step": 6796 }, { "epoch": 12.327363409657673, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.084, "step": 6797 }, { "epoch": 12.329177057356608, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0853, "step": 6798 }, { "epoch": 12.330990705055543, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.092, "step": 6799 }, { "epoch": 12.332804352754478, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0872, "step": 6800 }, { "epoch": 12.334618000453412, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0994, "step": 6801 }, { "epoch": 12.336431648152347, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.0997, "step": 6802 }, { "epoch": 12.33824529585128, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0974, "step": 6803 }, { "epoch": 12.340058943550215, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0999, "step": 6804 }, { "epoch": 12.34187259124915, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1124, "step": 6805 }, { "epoch": 12.343686238948084, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0988, "step": 6806 }, { "epoch": 12.34549988664702, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1161, "step": 6807 }, { "epoch": 12.347313534345954, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1159, "step": 6808 }, { "epoch": 12.349127182044889, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1157, "step": 6809 }, { "epoch": 12.350940829743822, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1254, "step": 6810 }, { "epoch": 12.352754477442756, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1402, "step": 6811 }, { "epoch": 12.354568125141691, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1532, "step": 6812 }, { "epoch": 12.356381772840626, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1272, "step": 6813 }, { "epoch": 12.35819542053956, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1462, "step": 6814 }, { "epoch": 12.360009068238494, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.2002, "step": 6815 }, { "epoch": 12.361822715937429, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.2163, "step": 6816 }, { "epoch": 12.363636363636363, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.1702, "step": 6817 }, { "epoch": 12.365450011335298, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0794, "step": 6818 }, { "epoch": 12.367263659034233, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0676, "step": 6819 }, { "epoch": 12.369077306733168, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0762, "step": 6820 }, { "epoch": 12.370890954432102, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0807, "step": 6821 }, { "epoch": 12.372704602131035, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0733, "step": 6822 }, { "epoch": 12.37451824982997, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0822, "step": 6823 }, { "epoch": 12.376331897528905, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0666, "step": 6824 }, { "epoch": 12.37814554522784, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0721, "step": 6825 }, { "epoch": 12.379959192926774, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0731, "step": 6826 }, { "epoch": 12.38177284062571, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0787, "step": 6827 }, { "epoch": 12.383586488324642, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0756, "step": 6828 }, { "epoch": 12.385400136023577, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0805, "step": 6829 }, { "epoch": 12.387213783722512, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0751, "step": 6830 }, { "epoch": 12.389027431421447, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0785, "step": 6831 }, { "epoch": 12.390841079120381, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0757, "step": 6832 }, { "epoch": 12.392654726819316, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0868, "step": 6833 }, { "epoch": 12.39446837451825, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.081, "step": 6834 }, { "epoch": 12.396282022217184, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.071, "step": 6835 }, { "epoch": 12.398095669916119, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.078, "step": 6836 }, { "epoch": 12.399909317615053, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0806, "step": 6837 }, { "epoch": 12.401722965313988, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0737, "step": 6838 }, { "epoch": 12.403536613012923, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0757, "step": 6839 }, { "epoch": 12.405350260711856, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0717, "step": 6840 }, { "epoch": 12.40716390841079, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.073, "step": 6841 }, { "epoch": 12.408977556109726, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0974, "step": 6842 }, { "epoch": 12.41079120380866, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1177, "step": 6843 }, { "epoch": 12.412604851507595, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0796, "step": 6844 }, { "epoch": 12.41441849920653, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0858, "step": 6845 }, { "epoch": 12.416232146905463, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1028, "step": 6846 }, { "epoch": 12.418045794604398, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0963, "step": 6847 }, { "epoch": 12.419859442303332, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0941, "step": 6848 }, { "epoch": 12.421673090002267, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1095, "step": 6849 }, { "epoch": 12.423486737701202, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0977, "step": 6850 }, { "epoch": 12.425300385400137, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1006, "step": 6851 }, { "epoch": 12.42711403309907, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0943, "step": 6852 }, { "epoch": 12.428927680798004, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1001, "step": 6853 }, { "epoch": 12.43074132849694, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.092, "step": 6854 }, { "epoch": 12.432554976195874, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1052, "step": 6855 }, { "epoch": 12.434368623894809, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1265, "step": 6856 }, { "epoch": 12.436182271593744, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0985, "step": 6857 }, { "epoch": 12.437995919292677, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1163, "step": 6858 }, { "epoch": 12.439809566991611, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1037, "step": 6859 }, { "epoch": 12.441623214690546, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1329, "step": 6860 }, { "epoch": 12.44343686238948, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1257, "step": 6861 }, { "epoch": 12.445250510088416, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1482, "step": 6862 }, { "epoch": 12.44706415778735, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.136, "step": 6863 }, { "epoch": 12.448877805486283, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1648, "step": 6864 }, { "epoch": 12.450691453185218, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.154, "step": 6865 }, { "epoch": 12.452505100884153, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.2146, "step": 6866 }, { "epoch": 12.454318748583088, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.214, "step": 6867 }, { "epoch": 12.456132396282023, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0765, "step": 6868 }, { "epoch": 12.457946043980957, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0769, "step": 6869 }, { "epoch": 12.459759691679892, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0711, "step": 6870 }, { "epoch": 12.461573339378825, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0849, "step": 6871 }, { "epoch": 12.46338698707776, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0817, "step": 6872 }, { "epoch": 12.465200634776695, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0809, "step": 6873 }, { "epoch": 12.46701428247563, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0752, "step": 6874 }, { "epoch": 12.468827930174564, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0855, "step": 6875 }, { "epoch": 12.470641577873499, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0794, "step": 6876 }, { "epoch": 12.472455225572432, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0789, "step": 6877 }, { "epoch": 12.474268873271367, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0874, "step": 6878 }, { "epoch": 12.476082520970301, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0862, "step": 6879 }, { "epoch": 12.477896168669236, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0919, "step": 6880 }, { "epoch": 12.479709816368171, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0795, "step": 6881 }, { "epoch": 12.481523464067106, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0842, "step": 6882 }, { "epoch": 12.483337111766039, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0734, "step": 6883 }, { "epoch": 12.485150759464974, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0802, "step": 6884 }, { "epoch": 12.486964407163908, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0882, "step": 6885 }, { "epoch": 12.488778054862843, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0793, "step": 6886 }, { "epoch": 12.490591702561778, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0773, "step": 6887 }, { "epoch": 12.492405350260713, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0747, "step": 6888 }, { "epoch": 12.494218997959646, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.09, "step": 6889 }, { "epoch": 12.49603264565858, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0826, "step": 6890 }, { "epoch": 12.497846293357515, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0801, "step": 6891 }, { "epoch": 12.49965994105645, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.076, "step": 6892 }, { "epoch": 12.501473588755385, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0969, "step": 6893 }, { "epoch": 12.50328723645432, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.09, "step": 6894 }, { "epoch": 12.505100884153252, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0812, "step": 6895 }, { "epoch": 12.506914531852187, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0821, "step": 6896 }, { "epoch": 12.508728179551122, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0875, "step": 6897 }, { "epoch": 12.510541827250057, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0849, "step": 6898 }, { "epoch": 12.512355474948992, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0893, "step": 6899 }, { "epoch": 12.514169122647926, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0869, "step": 6900 }, { "epoch": 12.51598277034686, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0951, "step": 6901 }, { "epoch": 12.517796418045794, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0931, "step": 6902 }, { "epoch": 12.519610065744729, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0956, "step": 6903 }, { "epoch": 12.521423713443664, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1131, "step": 6904 }, { "epoch": 12.523237361142598, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1074, "step": 6905 }, { "epoch": 12.525051008841533, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0962, "step": 6906 }, { "epoch": 12.526864656540466, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1083, "step": 6907 }, { "epoch": 12.528678304239401, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1393, "step": 6908 }, { "epoch": 12.530491951938336, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1098, "step": 6909 }, { "epoch": 12.53230559963727, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1111, "step": 6910 }, { "epoch": 12.534119247336205, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1247, "step": 6911 }, { "epoch": 12.53593289503514, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1244, "step": 6912 }, { "epoch": 12.537746542734073, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1407, "step": 6913 }, { "epoch": 12.539560190433008, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1616, "step": 6914 }, { "epoch": 12.541373838131943, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1609, "step": 6915 }, { "epoch": 12.543187485830877, "grad_norm": 1.875, "learning_rate": 0.0002, "loss": 0.3249, "step": 6916 }, { "epoch": 12.545001133529812, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.2259, "step": 6917 }, { "epoch": 12.546814781228747, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0753, "step": 6918 }, { "epoch": 12.548628428927682, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0735, "step": 6919 }, { "epoch": 12.548628428927682, "eval_loss": 2.309109687805176, "eval_runtime": 185.9764, "eval_samples_per_second": 5.377, "eval_steps_per_second": 5.377, "step": 6919 }, { "epoch": 12.548628428927682, "mmlu_eval_accuracy": 0.2968325561564201, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.1875, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.13636363636363635, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.1875, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.38461538461538464, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.19230769230769232, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.3157894736842105, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.38235294117647056, "mmlu_eval_accuracy_prehistory": 0.5428571428571428, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 2.822694471932018, "step": 6919 }, { "epoch": 12.550442076626615, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0825, "step": 6920 }, { "epoch": 12.55225572432555, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0842, "step": 6921 }, { "epoch": 12.554069372024484, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0705, "step": 6922 }, { "epoch": 12.555883019723419, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0905, "step": 6923 }, { "epoch": 12.557696667422354, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0785, "step": 6924 }, { "epoch": 12.559510315121287, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0768, "step": 6925 }, { "epoch": 12.561323962820222, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0761, "step": 6926 }, { "epoch": 12.563137610519156, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0873, "step": 6927 }, { "epoch": 12.564951258218091, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0793, "step": 6928 }, { "epoch": 12.566764905917026, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1084, "step": 6929 }, { "epoch": 12.56857855361596, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.075, "step": 6930 }, { "epoch": 12.570392201314895, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.075, "step": 6931 }, { "epoch": 12.572205849013828, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0821, "step": 6932 }, { "epoch": 12.574019496712763, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.085, "step": 6933 }, { "epoch": 12.575833144411698, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.091, "step": 6934 }, { "epoch": 12.577646792110633, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0808, "step": 6935 }, { "epoch": 12.579460439809568, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0874, "step": 6936 }, { "epoch": 12.581274087508502, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0881, "step": 6937 }, { "epoch": 12.583087735207435, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0784, "step": 6938 }, { "epoch": 12.58490138290637, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0739, "step": 6939 }, { "epoch": 12.586715030605305, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0851, "step": 6940 }, { "epoch": 12.58852867830424, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0942, "step": 6941 }, { "epoch": 12.590342326003174, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.0948, "step": 6942 }, { "epoch": 12.59215597370211, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0905, "step": 6943 }, { "epoch": 12.593969621401042, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0927, "step": 6944 }, { "epoch": 12.595783269099977, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0811, "step": 6945 }, { "epoch": 12.597596916798912, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1018, "step": 6946 }, { "epoch": 12.599410564497846, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0877, "step": 6947 }, { "epoch": 12.601224212196781, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0854, "step": 6948 }, { "epoch": 12.603037859895716, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0882, "step": 6949 }, { "epoch": 12.604851507594649, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1103, "step": 6950 }, { "epoch": 12.606665155293584, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0953, "step": 6951 }, { "epoch": 12.608478802992519, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1034, "step": 6952 }, { "epoch": 12.610292450691453, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1097, "step": 6953 }, { "epoch": 12.612106098390388, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1006, "step": 6954 }, { "epoch": 12.613919746089323, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0992, "step": 6955 }, { "epoch": 12.615733393788256, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1047, "step": 6956 }, { "epoch": 12.61754704148719, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1244, "step": 6957 }, { "epoch": 12.619360689186125, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1113, "step": 6958 }, { "epoch": 12.62117433688506, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.123, "step": 6959 }, { "epoch": 12.622987984583995, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1357, "step": 6960 }, { "epoch": 12.62480163228293, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1266, "step": 6961 }, { "epoch": 12.626615279981863, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1391, "step": 6962 }, { "epoch": 12.628428927680797, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1372, "step": 6963 }, { "epoch": 12.630242575379732, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1461, "step": 6964 }, { "epoch": 12.632056223078667, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1844, "step": 6965 }, { "epoch": 12.633869870777602, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.2265, "step": 6966 }, { "epoch": 12.635683518476537, "grad_norm": 1.3828125, "learning_rate": 0.0002, "loss": 0.2264, "step": 6967 }, { "epoch": 12.63749716617547, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0932, "step": 6968 }, { "epoch": 12.639310813874404, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0821, "step": 6969 }, { "epoch": 12.64112446157334, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0818, "step": 6970 }, { "epoch": 12.642938109272274, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0842, "step": 6971 }, { "epoch": 12.644751756971209, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0757, "step": 6972 }, { "epoch": 12.646565404670143, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0865, "step": 6973 }, { "epoch": 12.648379052369076, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0937, "step": 6974 }, { "epoch": 12.650192700068011, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0844, "step": 6975 }, { "epoch": 12.652006347766946, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0858, "step": 6976 }, { "epoch": 12.65381999546588, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0859, "step": 6977 }, { "epoch": 12.655633643164816, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0909, "step": 6978 }, { "epoch": 12.65744729086375, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0821, "step": 6979 }, { "epoch": 12.659260938562685, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0808, "step": 6980 }, { "epoch": 12.661074586261618, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0773, "step": 6981 }, { "epoch": 12.662888233960553, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0846, "step": 6982 }, { "epoch": 12.664701881659488, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0861, "step": 6983 }, { "epoch": 12.666515529358422, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0912, "step": 6984 }, { "epoch": 12.668329177057357, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0928, "step": 6985 }, { "epoch": 12.67014282475629, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0888, "step": 6986 }, { "epoch": 12.671956472455225, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0804, "step": 6987 }, { "epoch": 12.67377012015416, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0861, "step": 6988 }, { "epoch": 12.675583767853094, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0891, "step": 6989 }, { "epoch": 12.67739741555203, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0919, "step": 6990 }, { "epoch": 12.679211063250964, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.098, "step": 6991 }, { "epoch": 12.681024710949899, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.0898, "step": 6992 }, { "epoch": 12.682838358648832, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0949, "step": 6993 }, { "epoch": 12.684652006347767, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0819, "step": 6994 }, { "epoch": 12.686465654046701, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0858, "step": 6995 }, { "epoch": 12.688279301745636, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0831, "step": 6996 }, { "epoch": 12.690092949444571, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0798, "step": 6997 }, { "epoch": 12.691906597143506, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0886, "step": 6998 }, { "epoch": 12.693720244842439, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1102, "step": 6999 }, { "epoch": 12.695533892541373, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1074, "step": 7000 }, { "epoch": 12.697347540240308, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.097, "step": 7001 }, { "epoch": 12.699161187939243, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1069, "step": 7002 }, { "epoch": 12.700974835638178, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0929, "step": 7003 }, { "epoch": 12.702788483337113, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0976, "step": 7004 }, { "epoch": 12.704602131036046, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1069, "step": 7005 }, { "epoch": 12.70641577873498, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1163, "step": 7006 }, { "epoch": 12.708229426433915, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1173, "step": 7007 }, { "epoch": 12.71004307413285, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.1089, "step": 7008 }, { "epoch": 12.711856721831785, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1368, "step": 7009 }, { "epoch": 12.71367036953072, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.1201, "step": 7010 }, { "epoch": 12.715484017229652, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1444, "step": 7011 }, { "epoch": 12.717297664928587, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1456, "step": 7012 }, { "epoch": 12.719111312627522, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.172, "step": 7013 }, { "epoch": 12.720924960326457, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.168, "step": 7014 }, { "epoch": 12.722738608025391, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.2153, "step": 7015 }, { "epoch": 12.724552255724326, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.2409, "step": 7016 }, { "epoch": 12.72636590342326, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.2049, "step": 7017 }, { "epoch": 12.728179551122194, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0851, "step": 7018 }, { "epoch": 12.729993198821129, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1066, "step": 7019 }, { "epoch": 12.731806846520064, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0846, "step": 7020 }, { "epoch": 12.733620494218998, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0843, "step": 7021 }, { "epoch": 12.735434141917933, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.076, "step": 7022 }, { "epoch": 12.737247789616866, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0799, "step": 7023 }, { "epoch": 12.7390614373158, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1001, "step": 7024 }, { "epoch": 12.740875085014736, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0948, "step": 7025 }, { "epoch": 12.74268873271367, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0943, "step": 7026 }, { "epoch": 12.744502380412605, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0801, "step": 7027 }, { "epoch": 12.74631602811154, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0846, "step": 7028 }, { "epoch": 12.748129675810475, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0874, "step": 7029 }, { "epoch": 12.749943323509408, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0836, "step": 7030 }, { "epoch": 12.751756971208343, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0892, "step": 7031 }, { "epoch": 12.753570618907277, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.084, "step": 7032 }, { "epoch": 12.755384266606212, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.091, "step": 7033 }, { "epoch": 12.757197914305147, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0911, "step": 7034 }, { "epoch": 12.75901156200408, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0853, "step": 7035 }, { "epoch": 12.760825209703015, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0904, "step": 7036 }, { "epoch": 12.76263885740195, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0964, "step": 7037 }, { "epoch": 12.764452505100884, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0919, "step": 7038 }, { "epoch": 12.766266152799819, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0908, "step": 7039 }, { "epoch": 12.768079800498754, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.0921, "step": 7040 }, { "epoch": 12.769893448197688, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.087, "step": 7041 }, { "epoch": 12.771707095896621, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0966, "step": 7042 }, { "epoch": 12.773520743595556, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0942, "step": 7043 }, { "epoch": 12.775334391294491, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0879, "step": 7044 }, { "epoch": 12.777148038993426, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1043, "step": 7045 }, { "epoch": 12.77896168669236, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1024, "step": 7046 }, { "epoch": 12.780775334391295, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.092, "step": 7047 }, { "epoch": 12.782588982090228, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0949, "step": 7048 }, { "epoch": 12.784402629789163, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0918, "step": 7049 }, { "epoch": 12.786216277488098, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1056, "step": 7050 }, { "epoch": 12.788029925187033, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1103, "step": 7051 }, { "epoch": 12.789843572885967, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1012, "step": 7052 }, { "epoch": 12.791657220584902, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1014, "step": 7053 }, { "epoch": 12.793470868283835, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1037, "step": 7054 }, { "epoch": 12.79528451598277, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.111, "step": 7055 }, { "epoch": 12.797098163681705, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1155, "step": 7056 }, { "epoch": 12.79891181138064, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1177, "step": 7057 }, { "epoch": 12.800725459079574, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1287, "step": 7058 }, { "epoch": 12.802539106778509, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1233, "step": 7059 }, { "epoch": 12.804352754477442, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1303, "step": 7060 }, { "epoch": 12.806166402176377, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1456, "step": 7061 }, { "epoch": 12.807980049875312, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1501, "step": 7062 }, { "epoch": 12.809793697574246, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.166, "step": 7063 }, { "epoch": 12.811607345273181, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1959, "step": 7064 }, { "epoch": 12.813420992972116, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.2106, "step": 7065 }, { "epoch": 12.815234640671049, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2346, "step": 7066 }, { "epoch": 12.817048288369984, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.2426, "step": 7067 }, { "epoch": 12.818861936068918, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.086, "step": 7068 }, { "epoch": 12.820675583767853, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0943, "step": 7069 }, { "epoch": 12.822489231466788, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.08, "step": 7070 }, { "epoch": 12.824302879165723, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.088, "step": 7071 }, { "epoch": 12.826116526864656, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.079, "step": 7072 }, { "epoch": 12.82793017456359, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0771, "step": 7073 }, { "epoch": 12.829743822262525, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0918, "step": 7074 }, { "epoch": 12.83155746996146, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0916, "step": 7075 }, { "epoch": 12.833371117660395, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0896, "step": 7076 }, { "epoch": 12.83518476535933, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0858, "step": 7077 }, { "epoch": 12.836998413058263, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0917, "step": 7078 }, { "epoch": 12.838812060757197, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.0881, "step": 7079 }, { "epoch": 12.840625708456132, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.088, "step": 7080 }, { "epoch": 12.842439356155067, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1021, "step": 7081 }, { "epoch": 12.844253003854002, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1035, "step": 7082 }, { "epoch": 12.846066651552936, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0954, "step": 7083 }, { "epoch": 12.84788029925187, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0853, "step": 7084 }, { "epoch": 12.849693946950804, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0898, "step": 7085 }, { "epoch": 12.851507594649739, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0803, "step": 7086 }, { "epoch": 12.853321242348674, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0893, "step": 7087 }, { "epoch": 12.855134890047609, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0785, "step": 7088 }, { "epoch": 12.856948537746543, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0856, "step": 7089 }, { "epoch": 12.858762185445478, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.095, "step": 7090 }, { "epoch": 12.860575833144411, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0838, "step": 7091 }, { "epoch": 12.862389480843346, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0835, "step": 7092 }, { "epoch": 12.86420312854228, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0803, "step": 7093 }, { "epoch": 12.866016776241215, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0828, "step": 7094 }, { "epoch": 12.86783042394015, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0881, "step": 7095 }, { "epoch": 12.869644071639083, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1068, "step": 7096 }, { "epoch": 12.871457719338018, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0929, "step": 7097 }, { "epoch": 12.873271367036953, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0916, "step": 7098 }, { "epoch": 12.875085014735888, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.109, "step": 7099 }, { "epoch": 12.876898662434822, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1048, "step": 7100 }, { "epoch": 12.878712310133757, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1047, "step": 7101 }, { "epoch": 12.880525957832692, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.0978, "step": 7102 }, { "epoch": 12.882339605531625, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1084, "step": 7103 }, { "epoch": 12.88415325323056, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1008, "step": 7104 }, { "epoch": 12.885966900929494, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1097, "step": 7105 }, { "epoch": 12.88778054862843, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1106, "step": 7106 }, { "epoch": 12.88778054862843, "eval_loss": 2.2936649322509766, "eval_runtime": 185.7487, "eval_samples_per_second": 5.384, "eval_steps_per_second": 5.384, "step": 7106 }, { "epoch": 12.88778054862843, "mmlu_eval_accuracy": 0.3085966446922163, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.35714285714285715, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.6363636363636364, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.18181818181818182, "mmlu_eval_accuracy_conceptual_physics": 0.4230769230769231, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.5, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.19047619047619047, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.35294117647058826, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.4782608695652174, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.2727272727272727, "mmlu_eval_accuracy_miscellaneous": 0.3953488372093023, "mmlu_eval_accuracy_moral_disputes": 0.34210526315789475, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.2647058823529412, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.27647058823529413, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 3.5965848019277247, "step": 7106 }, { "epoch": 12.889594196327364, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1342, "step": 7107 }, { "epoch": 12.891407844026299, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1283, "step": 7108 }, { "epoch": 12.893221491725232, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1236, "step": 7109 }, { "epoch": 12.895035139424166, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1275, "step": 7110 }, { "epoch": 12.896848787123101, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1427, "step": 7111 }, { "epoch": 12.898662434822036, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1315, "step": 7112 }, { "epoch": 12.90047608252097, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1473, "step": 7113 }, { "epoch": 12.902289730219906, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1474, "step": 7114 }, { "epoch": 12.904103377918839, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1657, "step": 7115 }, { "epoch": 12.905917025617773, "grad_norm": 1.125, "learning_rate": 0.0002, "loss": 0.2563, "step": 7116 }, { "epoch": 12.907730673316708, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1952, "step": 7117 }, { "epoch": 12.909544321015643, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0843, "step": 7118 }, { "epoch": 12.911357968714578, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0859, "step": 7119 }, { "epoch": 12.913171616413512, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1036, "step": 7120 }, { "epoch": 12.914985264112445, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0951, "step": 7121 }, { "epoch": 12.91679891181138, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0866, "step": 7122 }, { "epoch": 12.918612559510315, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0873, "step": 7123 }, { "epoch": 12.92042620720925, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0926, "step": 7124 }, { "epoch": 12.922239854908184, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0869, "step": 7125 }, { "epoch": 12.92405350260712, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0888, "step": 7126 }, { "epoch": 12.925867150306052, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0896, "step": 7127 }, { "epoch": 12.927680798004987, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1067, "step": 7128 }, { "epoch": 12.929494445703922, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0875, "step": 7129 }, { "epoch": 12.931308093402857, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0825, "step": 7130 }, { "epoch": 12.933121741101791, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0911, "step": 7131 }, { "epoch": 12.934935388800726, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0788, "step": 7132 }, { "epoch": 12.93674903649966, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0908, "step": 7133 }, { "epoch": 12.938562684198594, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0877, "step": 7134 }, { "epoch": 12.940376331897529, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0873, "step": 7135 }, { "epoch": 12.942189979596463, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0958, "step": 7136 }, { "epoch": 12.944003627295398, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0939, "step": 7137 }, { "epoch": 12.945817274994333, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0822, "step": 7138 }, { "epoch": 12.947630922693268, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.0994, "step": 7139 }, { "epoch": 12.9494445703922, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0832, "step": 7140 }, { "epoch": 12.951258218091136, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0902, "step": 7141 }, { "epoch": 12.95307186579007, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0918, "step": 7142 }, { "epoch": 12.954885513489005, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0825, "step": 7143 }, { "epoch": 12.95669916118794, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0902, "step": 7144 }, { "epoch": 12.958512808886873, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0903, "step": 7145 }, { "epoch": 12.960326456585808, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0937, "step": 7146 }, { "epoch": 12.962140104284742, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0942, "step": 7147 }, { "epoch": 12.963953751983677, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0963, "step": 7148 }, { "epoch": 12.965767399682612, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1038, "step": 7149 }, { "epoch": 12.967581047381547, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0996, "step": 7150 }, { "epoch": 12.969394695080481, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1158, "step": 7151 }, { "epoch": 12.971208342779414, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1002, "step": 7152 }, { "epoch": 12.97302199047835, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.104, "step": 7153 }, { "epoch": 12.974835638177284, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1074, "step": 7154 }, { "epoch": 12.976649285876219, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1195, "step": 7155 }, { "epoch": 12.978462933575154, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1095, "step": 7156 }, { "epoch": 12.980276581274087, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1175, "step": 7157 }, { "epoch": 12.982090228973021, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1372, "step": 7158 }, { "epoch": 12.983903876671956, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1175, "step": 7159 }, { "epoch": 12.98571752437089, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1497, "step": 7160 }, { "epoch": 12.987531172069826, "grad_norm": 1.2890625, "learning_rate": 0.0002, "loss": 0.1619, "step": 7161 }, { "epoch": 12.98934481976876, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.145, "step": 7162 }, { "epoch": 12.991158467467695, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.163, "step": 7163 }, { "epoch": 12.992972115166628, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.1787, "step": 7164 }, { "epoch": 12.994785762865563, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1658, "step": 7165 }, { "epoch": 12.996599410564498, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.2074, "step": 7166 }, { "epoch": 12.998413058263433, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.2005, "step": 7167 }, { "epoch": 13.000226705962367, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1058, "step": 7168 }, { "epoch": 13.002040353661302, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0665, "step": 7169 }, { "epoch": 13.003854001360235, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0711, "step": 7170 }, { "epoch": 13.00566764905917, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0558, "step": 7171 }, { "epoch": 13.007481296758105, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0624, "step": 7172 }, { "epoch": 13.00929494445704, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0641, "step": 7173 }, { "epoch": 13.011108592155974, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.0566, "step": 7174 }, { "epoch": 13.012922239854909, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0626, "step": 7175 }, { "epoch": 13.014735887553842, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0594, "step": 7176 }, { "epoch": 13.016549535252777, "grad_norm": 0.357421875, "learning_rate": 0.0002, "loss": 0.0496, "step": 7177 }, { "epoch": 13.018363182951711, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.057, "step": 7178 }, { "epoch": 13.020176830650646, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.057, "step": 7179 }, { "epoch": 13.021990478349581, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0687, "step": 7180 }, { "epoch": 13.023804126048516, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0624, "step": 7181 }, { "epoch": 13.025617773747449, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0701, "step": 7182 }, { "epoch": 13.027431421446384, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0677, "step": 7183 }, { "epoch": 13.029245069145318, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0692, "step": 7184 }, { "epoch": 13.031058716844253, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0648, "step": 7185 }, { "epoch": 13.032872364543188, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0617, "step": 7186 }, { "epoch": 13.034686012242123, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0609, "step": 7187 }, { "epoch": 13.036499659941056, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0605, "step": 7188 }, { "epoch": 13.03831330763999, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0581, "step": 7189 }, { "epoch": 13.040126955338925, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0683, "step": 7190 }, { "epoch": 13.04194060303786, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0562, "step": 7191 }, { "epoch": 13.043754250736795, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0678, "step": 7192 }, { "epoch": 13.04556789843573, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0706, "step": 7193 }, { "epoch": 13.047381546134662, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0645, "step": 7194 }, { "epoch": 13.049195193833597, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0639, "step": 7195 }, { "epoch": 13.051008841532532, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0746, "step": 7196 }, { "epoch": 13.052822489231467, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0702, "step": 7197 }, { "epoch": 13.054636136930402, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0752, "step": 7198 }, { "epoch": 13.056449784629336, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0802, "step": 7199 }, { "epoch": 13.05826343232827, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.0933, "step": 7200 }, { "epoch": 13.060077080027204, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0787, "step": 7201 }, { "epoch": 13.061890727726139, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0729, "step": 7202 }, { "epoch": 13.063704375425074, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0692, "step": 7203 }, { "epoch": 13.065518023124008, "grad_norm": 1.1171875, "learning_rate": 0.0002, "loss": 0.0809, "step": 7204 }, { "epoch": 13.067331670822943, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.108, "step": 7205 }, { "epoch": 13.069145318521878, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0979, "step": 7206 }, { "epoch": 13.070958966220811, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1189, "step": 7207 }, { "epoch": 13.072772613919746, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0981, "step": 7208 }, { "epoch": 13.07458626161868, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0917, "step": 7209 }, { "epoch": 13.076399909317615, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0916, "step": 7210 }, { "epoch": 13.07821355701655, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0958, "step": 7211 }, { "epoch": 13.080027204715485, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1221, "step": 7212 }, { "epoch": 13.081840852414418, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1148, "step": 7213 }, { "epoch": 13.083654500113353, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1262, "step": 7214 }, { "epoch": 13.085468147812287, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1587, "step": 7215 }, { "epoch": 13.087281795511222, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1342, "step": 7216 }, { "epoch": 13.089095443210157, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1994, "step": 7217 }, { "epoch": 13.090909090909092, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1868, "step": 7218 }, { "epoch": 13.092722738608025, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0756, "step": 7219 }, { "epoch": 13.09453638630696, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.07, "step": 7220 }, { "epoch": 13.096350034005894, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0691, "step": 7221 }, { "epoch": 13.098163681704829, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0673, "step": 7222 }, { "epoch": 13.099977329403764, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.0537, "step": 7223 }, { "epoch": 13.101790977102699, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0611, "step": 7224 }, { "epoch": 13.103604624801632, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0664, "step": 7225 }, { "epoch": 13.105418272500566, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0643, "step": 7226 }, { "epoch": 13.107231920199501, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0664, "step": 7227 }, { "epoch": 13.109045567898436, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0704, "step": 7228 }, { "epoch": 13.11085921559737, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0694, "step": 7229 }, { "epoch": 13.112672863296305, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0802, "step": 7230 }, { "epoch": 13.114486510995238, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0705, "step": 7231 }, { "epoch": 13.116300158694173, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0674, "step": 7232 }, { "epoch": 13.118113806393108, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.062, "step": 7233 }, { "epoch": 13.119927454092043, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0746, "step": 7234 }, { "epoch": 13.121741101790978, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0748, "step": 7235 }, { "epoch": 13.123554749489912, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0647, "step": 7236 }, { "epoch": 13.125368397188845, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0697, "step": 7237 }, { "epoch": 13.12718204488778, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0677, "step": 7238 }, { "epoch": 13.128995692586715, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0708, "step": 7239 }, { "epoch": 13.13080934028565, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0663, "step": 7240 }, { "epoch": 13.132622987984584, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0774, "step": 7241 }, { "epoch": 13.13443663568352, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0776, "step": 7242 }, { "epoch": 13.136250283382452, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0651, "step": 7243 }, { "epoch": 13.138063931081387, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0693, "step": 7244 }, { "epoch": 13.139877578780322, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1197, "step": 7245 }, { "epoch": 13.141691226479256, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0794, "step": 7246 }, { "epoch": 13.143504874178191, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0857, "step": 7247 }, { "epoch": 13.145318521877126, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0871, "step": 7248 }, { "epoch": 13.147132169576059, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0714, "step": 7249 }, { "epoch": 13.148945817274994, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0783, "step": 7250 }, { "epoch": 13.150759464973929, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0835, "step": 7251 }, { "epoch": 13.152573112672863, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0899, "step": 7252 }, { "epoch": 13.154386760371798, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0901, "step": 7253 }, { "epoch": 13.156200408070733, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.086, "step": 7254 }, { "epoch": 13.158014055769666, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1031, "step": 7255 }, { "epoch": 13.1598277034686, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0928, "step": 7256 }, { "epoch": 13.161641351167535, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.0984, "step": 7257 }, { "epoch": 13.16345499886647, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.1003, "step": 7258 }, { "epoch": 13.165268646565405, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1034, "step": 7259 }, { "epoch": 13.16708229426434, "grad_norm": 1.140625, "learning_rate": 0.0002, "loss": 0.1259, "step": 7260 }, { "epoch": 13.168895941963275, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0935, "step": 7261 }, { "epoch": 13.170709589662208, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1254, "step": 7262 }, { "epoch": 13.172523237361142, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1101, "step": 7263 }, { "epoch": 13.174336885060077, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.1303, "step": 7264 }, { "epoch": 13.176150532759012, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1347, "step": 7265 }, { "epoch": 13.177964180457947, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.1567, "step": 7266 }, { "epoch": 13.179777828156881, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1945, "step": 7267 }, { "epoch": 13.181591475855814, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.1961, "step": 7268 }, { "epoch": 13.18340512355475, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0636, "step": 7269 }, { "epoch": 13.185218771253684, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0718, "step": 7270 }, { "epoch": 13.187032418952619, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0716, "step": 7271 }, { "epoch": 13.188846066651553, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0739, "step": 7272 }, { "epoch": 13.190659714350488, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0786, "step": 7273 }, { "epoch": 13.192473362049421, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0746, "step": 7274 }, { "epoch": 13.194287009748356, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0595, "step": 7275 }, { "epoch": 13.19610065744729, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0655, "step": 7276 }, { "epoch": 13.197914305146226, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0658, "step": 7277 }, { "epoch": 13.19972795284516, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0713, "step": 7278 }, { "epoch": 13.201541600544095, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0681, "step": 7279 }, { "epoch": 13.203355248243028, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0774, "step": 7280 }, { "epoch": 13.205168895941963, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0651, "step": 7281 }, { "epoch": 13.206982543640898, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0807, "step": 7282 }, { "epoch": 13.208796191339832, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0698, "step": 7283 }, { "epoch": 13.210609839038767, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0619, "step": 7284 }, { "epoch": 13.212423486737702, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0765, "step": 7285 }, { "epoch": 13.214237134436635, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0773, "step": 7286 }, { "epoch": 13.21605078213557, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0643, "step": 7287 }, { "epoch": 13.217864429834504, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0709, "step": 7288 }, { "epoch": 13.21967807753344, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0708, "step": 7289 }, { "epoch": 13.221491725232374, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0727, "step": 7290 }, { "epoch": 13.223305372931309, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0805, "step": 7291 }, { "epoch": 13.225119020630242, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0705, "step": 7292 }, { "epoch": 13.226932668329177, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0695, "step": 7293 }, { "epoch": 13.226932668329177, "eval_loss": 2.3759872913360596, "eval_runtime": 185.8931, "eval_samples_per_second": 5.379, "eval_steps_per_second": 5.379, "step": 7293 }, { "epoch": 13.226932668329177, "mmlu_eval_accuracy": 0.3135813797247376, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.5454545454545454, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.4230769230769231, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.35, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.56, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.2, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.29411764705882354, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.23529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.4885214835035496, "step": 7293 }, { "epoch": 13.228746316028111, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0657, "step": 7294 }, { "epoch": 13.230559963727046, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0728, "step": 7295 }, { "epoch": 13.232373611425981, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0851, "step": 7296 }, { "epoch": 13.234187259124916, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1469, "step": 7297 }, { "epoch": 13.236000906823849, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0821, "step": 7298 }, { "epoch": 13.237814554522783, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.083, "step": 7299 }, { "epoch": 13.239628202221718, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0773, "step": 7300 }, { "epoch": 13.241441849920653, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0921, "step": 7301 }, { "epoch": 13.243255497619588, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0838, "step": 7302 }, { "epoch": 13.245069145318523, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0856, "step": 7303 }, { "epoch": 13.246882793017456, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0798, "step": 7304 }, { "epoch": 13.24869644071639, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1014, "step": 7305 }, { "epoch": 13.250510088415325, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0924, "step": 7306 }, { "epoch": 13.25232373611426, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1005, "step": 7307 }, { "epoch": 13.254137383813195, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0967, "step": 7308 }, { "epoch": 13.25595103151213, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1102, "step": 7309 }, { "epoch": 13.257764679211064, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1174, "step": 7310 }, { "epoch": 13.259578326909997, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1183, "step": 7311 }, { "epoch": 13.261391974608932, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1384, "step": 7312 }, { "epoch": 13.263205622307867, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1319, "step": 7313 }, { "epoch": 13.265019270006801, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1331, "step": 7314 }, { "epoch": 13.266832917705736, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1393, "step": 7315 }, { "epoch": 13.26864656540467, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1682, "step": 7316 }, { "epoch": 13.270460213103604, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.2038, "step": 7317 }, { "epoch": 13.272273860802539, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 0.2391, "step": 7318 }, { "epoch": 13.274087508501474, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.085, "step": 7319 }, { "epoch": 13.275901156200408, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0817, "step": 7320 }, { "epoch": 13.277714803899343, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0698, "step": 7321 }, { "epoch": 13.279528451598278, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0736, "step": 7322 }, { "epoch": 13.28134209929721, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0687, "step": 7323 }, { "epoch": 13.283155746996146, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0642, "step": 7324 }, { "epoch": 13.28496939469508, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0691, "step": 7325 }, { "epoch": 13.286783042394015, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0647, "step": 7326 }, { "epoch": 13.28859669009295, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0826, "step": 7327 }, { "epoch": 13.290410337791885, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0716, "step": 7328 }, { "epoch": 13.292223985490818, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0724, "step": 7329 }, { "epoch": 13.294037633189753, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.0717, "step": 7330 }, { "epoch": 13.295851280888687, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0602, "step": 7331 }, { "epoch": 13.297664928587622, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.081, "step": 7332 }, { "epoch": 13.299478576286557, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.08, "step": 7333 }, { "epoch": 13.301292223985492, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.065, "step": 7334 }, { "epoch": 13.303105871684425, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0706, "step": 7335 }, { "epoch": 13.30491951938336, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0712, "step": 7336 }, { "epoch": 13.306733167082294, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0766, "step": 7337 }, { "epoch": 13.308546814781229, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0651, "step": 7338 }, { "epoch": 13.310360462480164, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0764, "step": 7339 }, { "epoch": 13.312174110179098, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0694, "step": 7340 }, { "epoch": 13.313987757878031, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0765, "step": 7341 }, { "epoch": 13.315801405576966, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0804, "step": 7342 }, { "epoch": 13.317615053275901, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0674, "step": 7343 }, { "epoch": 13.319428700974836, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0714, "step": 7344 }, { "epoch": 13.32124234867377, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0842, "step": 7345 }, { "epoch": 13.323055996372705, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0814, "step": 7346 }, { "epoch": 13.324869644071638, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0681, "step": 7347 }, { "epoch": 13.326683291770573, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0777, "step": 7348 }, { "epoch": 13.328496939469508, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0859, "step": 7349 }, { "epoch": 13.330310587168443, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0757, "step": 7350 }, { "epoch": 13.332124234867377, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0868, "step": 7351 }, { "epoch": 13.333937882566312, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0871, "step": 7352 }, { "epoch": 13.335751530265245, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0901, "step": 7353 }, { "epoch": 13.33756517796418, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0795, "step": 7354 }, { "epoch": 13.339378825663115, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.0994, "step": 7355 }, { "epoch": 13.34119247336205, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0778, "step": 7356 }, { "epoch": 13.343006121060984, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0928, "step": 7357 }, { "epoch": 13.344819768759919, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0914, "step": 7358 }, { "epoch": 13.346633416458852, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.0949, "step": 7359 }, { "epoch": 13.348447064157787, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0889, "step": 7360 }, { "epoch": 13.350260711856722, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1069, "step": 7361 }, { "epoch": 13.352074359555656, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1031, "step": 7362 }, { "epoch": 13.353888007254591, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1284, "step": 7363 }, { "epoch": 13.355701654953526, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1297, "step": 7364 }, { "epoch": 13.357515302652459, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1324, "step": 7365 }, { "epoch": 13.359328950351394, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1382, "step": 7366 }, { "epoch": 13.361142598050328, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 0.2181, "step": 7367 }, { "epoch": 13.362956245749263, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.1925, "step": 7368 }, { "epoch": 13.364769893448198, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0685, "step": 7369 }, { "epoch": 13.366583541147133, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0672, "step": 7370 }, { "epoch": 13.368397188846068, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0767, "step": 7371 }, { "epoch": 13.370210836545, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0689, "step": 7372 }, { "epoch": 13.372024484243935, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.0649, "step": 7373 }, { "epoch": 13.37383813194287, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0654, "step": 7374 }, { "epoch": 13.375651779641805, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0725, "step": 7375 }, { "epoch": 13.37746542734074, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.067, "step": 7376 }, { "epoch": 13.379279075039674, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0749, "step": 7377 }, { "epoch": 13.381092722738607, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0725, "step": 7378 }, { "epoch": 13.382906370437542, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0733, "step": 7379 }, { "epoch": 13.384720018136477, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.071, "step": 7380 }, { "epoch": 13.386533665835412, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1144, "step": 7381 }, { "epoch": 13.388347313534346, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0722, "step": 7382 }, { "epoch": 13.390160961233281, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.067, "step": 7383 }, { "epoch": 13.391974608932214, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.085, "step": 7384 }, { "epoch": 13.393788256631149, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0727, "step": 7385 }, { "epoch": 13.395601904330084, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0699, "step": 7386 }, { "epoch": 13.397415552029019, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0694, "step": 7387 }, { "epoch": 13.399229199727953, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0722, "step": 7388 }, { "epoch": 13.401042847426888, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0834, "step": 7389 }, { "epoch": 13.402856495125821, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.082, "step": 7390 }, { "epoch": 13.404670142824756, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0874, "step": 7391 }, { "epoch": 13.40648379052369, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.079, "step": 7392 }, { "epoch": 13.408297438222625, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0723, "step": 7393 }, { "epoch": 13.41011108592156, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0798, "step": 7394 }, { "epoch": 13.411924733620495, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0838, "step": 7395 }, { "epoch": 13.413738381319428, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.092, "step": 7396 }, { "epoch": 13.415552029018363, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0884, "step": 7397 }, { "epoch": 13.417365676717298, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.089, "step": 7398 }, { "epoch": 13.419179324416232, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.073, "step": 7399 }, { "epoch": 13.420992972115167, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0811, "step": 7400 }, { "epoch": 13.422806619814102, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0808, "step": 7401 }, { "epoch": 13.424620267513035, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0934, "step": 7402 }, { "epoch": 13.42643391521197, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0901, "step": 7403 }, { "epoch": 13.428247562910904, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1024, "step": 7404 }, { "epoch": 13.43006121060984, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1055, "step": 7405 }, { "epoch": 13.431874858308774, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1176, "step": 7406 }, { "epoch": 13.433688506007709, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1276, "step": 7407 }, { "epoch": 13.435502153706642, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1075, "step": 7408 }, { "epoch": 13.437315801405576, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1245, "step": 7409 }, { "epoch": 13.439129449104511, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.106, "step": 7410 }, { "epoch": 13.440943096803446, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1127, "step": 7411 }, { "epoch": 13.44275674450238, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.126, "step": 7412 }, { "epoch": 13.444570392201316, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1359, "step": 7413 }, { "epoch": 13.446384039900249, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1637, "step": 7414 }, { "epoch": 13.448197687599183, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1491, "step": 7415 }, { "epoch": 13.450011335298118, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1714, "step": 7416 }, { "epoch": 13.451824982997053, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.1733, "step": 7417 }, { "epoch": 13.453638630695988, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.2466, "step": 7418 }, { "epoch": 13.455452278394922, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0852, "step": 7419 }, { "epoch": 13.457265926093855, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.077, "step": 7420 }, { "epoch": 13.45907957379279, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0789, "step": 7421 }, { "epoch": 13.460893221491725, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0718, "step": 7422 }, { "epoch": 13.46270686919066, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0668, "step": 7423 }, { "epoch": 13.464520516889595, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0667, "step": 7424 }, { "epoch": 13.46633416458853, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0812, "step": 7425 }, { "epoch": 13.468147812287462, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0757, "step": 7426 }, { "epoch": 13.469961459986397, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0687, "step": 7427 }, { "epoch": 13.471775107685332, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0789, "step": 7428 }, { "epoch": 13.473588755384267, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0817, "step": 7429 }, { "epoch": 13.475402403083201, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0881, "step": 7430 }, { "epoch": 13.477216050782136, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0743, "step": 7431 }, { "epoch": 13.479029698481071, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.075, "step": 7432 }, { "epoch": 13.480843346180004, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0841, "step": 7433 }, { "epoch": 13.482656993878939, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.066, "step": 7434 }, { "epoch": 13.484470641577873, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0805, "step": 7435 }, { "epoch": 13.486284289276808, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0789, "step": 7436 }, { "epoch": 13.488097936975743, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0702, "step": 7437 }, { "epoch": 13.489911584674678, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0763, "step": 7438 }, { "epoch": 13.49172523237361, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0648, "step": 7439 }, { "epoch": 13.493538880072546, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0862, "step": 7440 }, { "epoch": 13.49535252777148, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0772, "step": 7441 }, { "epoch": 13.497166175470415, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0806, "step": 7442 }, { "epoch": 13.49897982316935, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.083, "step": 7443 }, { "epoch": 13.500793470868285, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.091, "step": 7444 }, { "epoch": 13.502607118567218, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0934, "step": 7445 }, { "epoch": 13.504420766266152, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0851, "step": 7446 }, { "epoch": 13.506234413965087, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.079, "step": 7447 }, { "epoch": 13.508048061664022, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0753, "step": 7448 }, { "epoch": 13.509861709362957, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0834, "step": 7449 }, { "epoch": 13.511675357061891, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1026, "step": 7450 }, { "epoch": 13.513489004760824, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1032, "step": 7451 }, { "epoch": 13.51530265245976, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0924, "step": 7452 }, { "epoch": 13.517116300158694, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0948, "step": 7453 }, { "epoch": 13.518929947857629, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0873, "step": 7454 }, { "epoch": 13.520743595556564, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0965, "step": 7455 }, { "epoch": 13.522557243255498, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1015, "step": 7456 }, { "epoch": 13.524370890954431, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1015, "step": 7457 }, { "epoch": 13.526184538653366, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1483, "step": 7458 }, { "epoch": 13.527998186352301, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.1019, "step": 7459 }, { "epoch": 13.529811834051236, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1311, "step": 7460 }, { "epoch": 13.53162548175017, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1092, "step": 7461 }, { "epoch": 13.533439129449105, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1376, "step": 7462 }, { "epoch": 13.535252777148038, "grad_norm": 0.87890625, "learning_rate": 0.0002, "loss": 0.1471, "step": 7463 }, { "epoch": 13.537066424846973, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1503, "step": 7464 }, { "epoch": 13.538880072545908, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1541, "step": 7465 }, { "epoch": 13.540693720244843, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.1749, "step": 7466 }, { "epoch": 13.542507367943777, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.2254, "step": 7467 }, { "epoch": 13.544321015642712, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.2646, "step": 7468 }, { "epoch": 13.546134663341645, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0832, "step": 7469 }, { "epoch": 13.54794831104058, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0828, "step": 7470 }, { "epoch": 13.549761958739515, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0721, "step": 7471 }, { "epoch": 13.55157560643845, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0775, "step": 7472 }, { "epoch": 13.553389254137384, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0735, "step": 7473 }, { "epoch": 13.555202901836319, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0755, "step": 7474 }, { "epoch": 13.557016549535252, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0759, "step": 7475 }, { "epoch": 13.558830197234187, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0723, "step": 7476 }, { "epoch": 13.560643844933121, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0685, "step": 7477 }, { "epoch": 13.562457492632056, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0757, "step": 7478 }, { "epoch": 13.564271140330991, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0702, "step": 7479 }, { "epoch": 13.566084788029926, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.078, "step": 7480 }, { "epoch": 13.566084788029926, "eval_loss": 2.3705649375915527, "eval_runtime": 185.3813, "eval_samples_per_second": 5.394, "eval_steps_per_second": 5.394, "step": 7480 }, { "epoch": 13.566084788029926, "mmlu_eval_accuracy": 0.2997078567675707, "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.1111111111111111, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.047619047619047616, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.2833333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.08333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.4418604651162791, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.26, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.29411764705882354, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.24705882352941178, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.09090909090909091, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.8279581049312417, "step": 7480 }, { "epoch": 13.56789843572886, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0852, "step": 7481 }, { "epoch": 13.569712083427794, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0819, "step": 7482 }, { "epoch": 13.571525731126728, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0763, "step": 7483 }, { "epoch": 13.573339378825663, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0793, "step": 7484 }, { "epoch": 13.575153026524598, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0777, "step": 7485 }, { "epoch": 13.576966674223533, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0787, "step": 7486 }, { "epoch": 13.578780321922466, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0777, "step": 7487 }, { "epoch": 13.5805939696214, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0812, "step": 7488 }, { "epoch": 13.582407617320335, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0778, "step": 7489 }, { "epoch": 13.58422126501927, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0841, "step": 7490 }, { "epoch": 13.586034912718205, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0878, "step": 7491 }, { "epoch": 13.58784856041714, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0729, "step": 7492 }, { "epoch": 13.589662208116074, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0789, "step": 7493 }, { "epoch": 13.591475855815007, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0929, "step": 7494 }, { "epoch": 13.593289503513942, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0906, "step": 7495 }, { "epoch": 13.595103151212877, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0799, "step": 7496 }, { "epoch": 13.596916798911812, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1019, "step": 7497 }, { "epoch": 13.598730446610746, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0781, "step": 7498 }, { "epoch": 13.600544094309681, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0825, "step": 7499 }, { "epoch": 13.602357742008614, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0869, "step": 7500 }, { "epoch": 13.604171389707549, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1144, "step": 7501 }, { "epoch": 13.605985037406484, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0922, "step": 7502 }, { "epoch": 13.607798685105418, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0884, "step": 7503 }, { "epoch": 13.609612332804353, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0953, "step": 7504 }, { "epoch": 13.611425980503288, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.101, "step": 7505 }, { "epoch": 13.613239628202221, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0984, "step": 7506 }, { "epoch": 13.615053275901156, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1094, "step": 7507 }, { "epoch": 13.61686692360009, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.113, "step": 7508 }, { "epoch": 13.618680571299025, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1038, "step": 7509 }, { "epoch": 13.62049421899796, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1135, "step": 7510 }, { "epoch": 13.622307866696895, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1215, "step": 7511 }, { "epoch": 13.624121514395828, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1296, "step": 7512 }, { "epoch": 13.625935162094763, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.134, "step": 7513 }, { "epoch": 13.627748809793697, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1333, "step": 7514 }, { "epoch": 13.629562457492632, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.1465, "step": 7515 }, { "epoch": 13.631376105191567, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1886, "step": 7516 }, { "epoch": 13.633189752890502, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.2425, "step": 7517 }, { "epoch": 13.635003400589435, "grad_norm": 1.3203125, "learning_rate": 0.0002, "loss": 0.2833, "step": 7518 }, { "epoch": 13.63681704828837, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0843, "step": 7519 }, { "epoch": 13.638630695987304, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0893, "step": 7520 }, { "epoch": 13.640444343686239, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0732, "step": 7521 }, { "epoch": 13.642257991385174, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0792, "step": 7522 }, { "epoch": 13.644071639084109, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0788, "step": 7523 }, { "epoch": 13.645885286783042, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0859, "step": 7524 }, { "epoch": 13.647698934481976, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0774, "step": 7525 }, { "epoch": 13.649512582180911, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0797, "step": 7526 }, { "epoch": 13.651326229879846, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0816, "step": 7527 }, { "epoch": 13.65313987757878, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0786, "step": 7528 }, { "epoch": 13.654953525277715, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0811, "step": 7529 }, { "epoch": 13.65676717297665, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0828, "step": 7530 }, { "epoch": 13.658580820675583, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0911, "step": 7531 }, { "epoch": 13.660394468374518, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0734, "step": 7532 }, { "epoch": 13.662208116073453, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0848, "step": 7533 }, { "epoch": 13.664021763772388, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0746, "step": 7534 }, { "epoch": 13.665835411471322, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0847, "step": 7535 }, { "epoch": 13.667649059170255, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0798, "step": 7536 }, { "epoch": 13.66946270686919, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.086, "step": 7537 }, { "epoch": 13.671276354568125, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0766, "step": 7538 }, { "epoch": 13.67309000226706, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.071, "step": 7539 }, { "epoch": 13.674903649965994, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0843, "step": 7540 }, { "epoch": 13.67671729766493, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0693, "step": 7541 }, { "epoch": 13.678530945363864, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0795, "step": 7542 }, { "epoch": 13.680344593062797, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0752, "step": 7543 }, { "epoch": 13.682158240761732, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0818, "step": 7544 }, { "epoch": 13.683971888460666, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0753, "step": 7545 }, { "epoch": 13.685785536159601, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0792, "step": 7546 }, { "epoch": 13.687599183858536, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.098, "step": 7547 }, { "epoch": 13.689412831557469, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0867, "step": 7548 }, { "epoch": 13.691226479256404, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0879, "step": 7549 }, { "epoch": 13.693040126955339, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0964, "step": 7550 }, { "epoch": 13.694853774654273, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0875, "step": 7551 }, { "epoch": 13.696667422353208, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.098, "step": 7552 }, { "epoch": 13.698481070052143, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0858, "step": 7553 }, { "epoch": 13.700294717751078, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.11, "step": 7554 }, { "epoch": 13.70210836545001, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.095, "step": 7555 }, { "epoch": 13.703922013148945, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0982, "step": 7556 }, { "epoch": 13.70573566084788, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1133, "step": 7557 }, { "epoch": 13.707549308546815, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.1036, "step": 7558 }, { "epoch": 13.70936295624575, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1005, "step": 7559 }, { "epoch": 13.711176603944685, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1228, "step": 7560 }, { "epoch": 13.712990251643618, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1419, "step": 7561 }, { "epoch": 13.714803899342552, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1306, "step": 7562 }, { "epoch": 13.716617547041487, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1425, "step": 7563 }, { "epoch": 13.718431194740422, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.156, "step": 7564 }, { "epoch": 13.720244842439357, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.144, "step": 7565 }, { "epoch": 13.722058490138291, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1631, "step": 7566 }, { "epoch": 13.723872137837224, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.2094, "step": 7567 }, { "epoch": 13.72568578553616, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2925, "step": 7568 }, { "epoch": 13.727499433235094, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0884, "step": 7569 }, { "epoch": 13.729313080934029, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0882, "step": 7570 }, { "epoch": 13.731126728632963, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0769, "step": 7571 }, { "epoch": 13.732940376331898, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0755, "step": 7572 }, { "epoch": 13.734754024030831, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0769, "step": 7573 }, { "epoch": 13.736567671729766, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0844, "step": 7574 }, { "epoch": 13.7383813194287, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0942, "step": 7575 }, { "epoch": 13.740194967127636, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.071, "step": 7576 }, { "epoch": 13.74200861482657, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0869, "step": 7577 }, { "epoch": 13.743822262525505, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0775, "step": 7578 }, { "epoch": 13.745635910224438, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0858, "step": 7579 }, { "epoch": 13.747449557923373, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0851, "step": 7580 }, { "epoch": 13.749263205622308, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0909, "step": 7581 }, { "epoch": 13.751076853321242, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0834, "step": 7582 }, { "epoch": 13.752890501020177, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0784, "step": 7583 }, { "epoch": 13.754704148719112, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0887, "step": 7584 }, { "epoch": 13.756517796418045, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0942, "step": 7585 }, { "epoch": 13.75833144411698, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0975, "step": 7586 }, { "epoch": 13.760145091815914, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.075, "step": 7587 }, { "epoch": 13.76195873951485, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0837, "step": 7588 }, { "epoch": 13.763772387213784, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0906, "step": 7589 }, { "epoch": 13.765586034912719, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0786, "step": 7590 }, { "epoch": 13.767399682611654, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0832, "step": 7591 }, { "epoch": 13.769213330310587, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0783, "step": 7592 }, { "epoch": 13.771026978009521, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.102, "step": 7593 }, { "epoch": 13.772840625708456, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0784, "step": 7594 }, { "epoch": 13.774654273407391, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0882, "step": 7595 }, { "epoch": 13.776467921106326, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0905, "step": 7596 }, { "epoch": 13.778281568805259, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0905, "step": 7597 }, { "epoch": 13.780095216504193, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0973, "step": 7598 }, { "epoch": 13.781908864203128, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0968, "step": 7599 }, { "epoch": 13.783722511902063, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0978, "step": 7600 }, { "epoch": 13.785536159600998, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0941, "step": 7601 }, { "epoch": 13.787349807299933, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0843, "step": 7602 }, { "epoch": 13.789163454998867, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0933, "step": 7603 }, { "epoch": 13.7909771026978, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0905, "step": 7604 }, { "epoch": 13.792790750396735, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0972, "step": 7605 }, { "epoch": 13.79460439809567, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.123, "step": 7606 }, { "epoch": 13.796418045794605, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1012, "step": 7607 }, { "epoch": 13.79823169349354, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1167, "step": 7608 }, { "epoch": 13.800045341192474, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1186, "step": 7609 }, { "epoch": 13.801858988891407, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1033, "step": 7610 }, { "epoch": 13.803672636590342, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.123, "step": 7611 }, { "epoch": 13.805486284289277, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1352, "step": 7612 }, { "epoch": 13.807299931988211, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.1236, "step": 7613 }, { "epoch": 13.809113579687146, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1468, "step": 7614 }, { "epoch": 13.810927227386081, "grad_norm": 1.265625, "learning_rate": 0.0002, "loss": 0.1978, "step": 7615 }, { "epoch": 13.812740875085014, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1595, "step": 7616 }, { "epoch": 13.814554522783949, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.221, "step": 7617 }, { "epoch": 13.816368170482884, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.2628, "step": 7618 }, { "epoch": 13.818181818181818, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0822, "step": 7619 }, { "epoch": 13.819995465880753, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0788, "step": 7620 }, { "epoch": 13.821809113579688, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0771, "step": 7621 }, { "epoch": 13.82362276127862, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.101, "step": 7622 }, { "epoch": 13.825436408977556, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0728, "step": 7623 }, { "epoch": 13.82725005667649, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0819, "step": 7624 }, { "epoch": 13.829063704375425, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0759, "step": 7625 }, { "epoch": 13.83087735207436, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0806, "step": 7626 }, { "epoch": 13.832690999773295, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0836, "step": 7627 }, { "epoch": 13.834504647472228, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0886, "step": 7628 }, { "epoch": 13.836318295171163, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.091, "step": 7629 }, { "epoch": 13.838131942870097, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0801, "step": 7630 }, { "epoch": 13.839945590569032, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0873, "step": 7631 }, { "epoch": 13.841759238267967, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0905, "step": 7632 }, { "epoch": 13.843572885966902, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0795, "step": 7633 }, { "epoch": 13.845386533665835, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0753, "step": 7634 }, { "epoch": 13.84720018136477, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0774, "step": 7635 }, { "epoch": 13.849013829063704, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0854, "step": 7636 }, { "epoch": 13.850827476762639, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0885, "step": 7637 }, { "epoch": 13.852641124461574, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0907, "step": 7638 }, { "epoch": 13.854454772160508, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0882, "step": 7639 }, { "epoch": 13.856268419859443, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0877, "step": 7640 }, { "epoch": 13.858082067558376, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0878, "step": 7641 }, { "epoch": 13.859895715257311, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0855, "step": 7642 }, { "epoch": 13.861709362956246, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0794, "step": 7643 }, { "epoch": 13.86352301065518, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0778, "step": 7644 }, { "epoch": 13.865336658354115, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0816, "step": 7645 }, { "epoch": 13.867150306053048, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0828, "step": 7646 }, { "epoch": 13.868963953751983, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0879, "step": 7647 }, { "epoch": 13.870777601450918, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0867, "step": 7648 }, { "epoch": 13.872591249149853, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0907, "step": 7649 }, { "epoch": 13.874404896848787, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0836, "step": 7650 }, { "epoch": 13.876218544547722, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0867, "step": 7651 }, { "epoch": 13.878032192246657, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1115, "step": 7652 }, { "epoch": 13.87984583994559, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1097, "step": 7653 }, { "epoch": 13.881659487644525, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1056, "step": 7654 }, { "epoch": 13.88347313534346, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.104, "step": 7655 }, { "epoch": 13.885286783042394, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.11, "step": 7656 }, { "epoch": 13.887100430741329, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1148, "step": 7657 }, { "epoch": 13.888914078440262, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1192, "step": 7658 }, { "epoch": 13.890727726139197, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0997, "step": 7659 }, { "epoch": 13.892541373838132, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1221, "step": 7660 }, { "epoch": 13.894355021537066, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.1286, "step": 7661 }, { "epoch": 13.896168669236001, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1328, "step": 7662 }, { "epoch": 13.897982316934936, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.127, "step": 7663 }, { "epoch": 13.89979596463387, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1352, "step": 7664 }, { "epoch": 13.901609612332804, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1448, "step": 7665 }, { "epoch": 13.903423260031738, "grad_norm": 1.6015625, "learning_rate": 0.0002, "loss": 0.1978, "step": 7666 }, { "epoch": 13.905236907730673, "grad_norm": 1.4609375, "learning_rate": 0.0002, "loss": 0.2388, "step": 7667 }, { "epoch": 13.905236907730673, "eval_loss": 2.281301975250244, "eval_runtime": 189.376, "eval_samples_per_second": 5.281, "eval_steps_per_second": 5.281, "step": 7667 }, { "epoch": 13.905236907730673, "mmlu_eval_accuracy": 0.31113214219032886, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.1875, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.5, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.36363636363636365, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.375, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3023255813953488, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.13043478260869565, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.3488372093023256, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.24705882352941178, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.6062466321545905, "step": 7667 }, { "epoch": 13.907050555429608, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.2524, "step": 7668 }, { "epoch": 13.908864203128543, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1003, "step": 7669 }, { "epoch": 13.910677850827478, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0886, "step": 7670 }, { "epoch": 13.91249149852641, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0858, "step": 7671 }, { "epoch": 13.914305146225345, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.082, "step": 7672 }, { "epoch": 13.91611879392428, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0778, "step": 7673 }, { "epoch": 13.917932441623215, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0764, "step": 7674 }, { "epoch": 13.91974608932215, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0859, "step": 7675 }, { "epoch": 13.921559737021084, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1136, "step": 7676 }, { "epoch": 13.923373384720017, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0805, "step": 7677 }, { "epoch": 13.925187032418952, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0849, "step": 7678 }, { "epoch": 13.927000680117887, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0773, "step": 7679 }, { "epoch": 13.928814327816822, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0915, "step": 7680 }, { "epoch": 13.930627975515756, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0763, "step": 7681 }, { "epoch": 13.932441623214691, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0829, "step": 7682 }, { "epoch": 13.934255270913624, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0799, "step": 7683 }, { "epoch": 13.936068918612559, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0918, "step": 7684 }, { "epoch": 13.937882566311494, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0767, "step": 7685 }, { "epoch": 13.939696214010429, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0786, "step": 7686 }, { "epoch": 13.941509861709363, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0805, "step": 7687 }, { "epoch": 13.943323509408298, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0801, "step": 7688 }, { "epoch": 13.945137157107231, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0858, "step": 7689 }, { "epoch": 13.946950804806166, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0935, "step": 7690 }, { "epoch": 13.9487644525051, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0771, "step": 7691 }, { "epoch": 13.950578100204035, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0863, "step": 7692 }, { "epoch": 13.95239174790297, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.09, "step": 7693 }, { "epoch": 13.954205395601905, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0891, "step": 7694 }, { "epoch": 13.956019043300838, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0788, "step": 7695 }, { "epoch": 13.957832690999773, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1116, "step": 7696 }, { "epoch": 13.959646338698708, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0893, "step": 7697 }, { "epoch": 13.961459986397642, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0864, "step": 7698 }, { "epoch": 13.963273634096577, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0833, "step": 7699 }, { "epoch": 13.965087281795512, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0916, "step": 7700 }, { "epoch": 13.966900929494447, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0908, "step": 7701 }, { "epoch": 13.96871457719338, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0933, "step": 7702 }, { "epoch": 13.970528224892314, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0845, "step": 7703 }, { "epoch": 13.97234187259125, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1195, "step": 7704 }, { "epoch": 13.974155520290184, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1016, "step": 7705 }, { "epoch": 13.975969167989119, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1055, "step": 7706 }, { "epoch": 13.977782815688052, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1149, "step": 7707 }, { "epoch": 13.979596463386986, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.109, "step": 7708 }, { "epoch": 13.981410111085921, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.113, "step": 7709 }, { "epoch": 13.983223758784856, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1137, "step": 7710 }, { "epoch": 13.98503740648379, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1315, "step": 7711 }, { "epoch": 13.986851054182726, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1472, "step": 7712 }, { "epoch": 13.98866470188166, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1263, "step": 7713 }, { "epoch": 13.990478349580593, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1417, "step": 7714 }, { "epoch": 13.992291997279528, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1465, "step": 7715 }, { "epoch": 13.994105644978463, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1627, "step": 7716 }, { "epoch": 13.995919292677398, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1898, "step": 7717 }, { "epoch": 13.997732940376332, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.2597, "step": 7718 }, { "epoch": 13.999546588075267, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0966, "step": 7719 }, { "epoch": 14.0013602357742, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0753, "step": 7720 }, { "epoch": 14.003173883473135, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0666, "step": 7721 }, { "epoch": 14.00498753117207, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0603, "step": 7722 }, { "epoch": 14.006801178871005, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0613, "step": 7723 }, { "epoch": 14.00861482656994, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.063, "step": 7724 }, { "epoch": 14.010428474268874, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0649, "step": 7725 }, { "epoch": 14.012242121967807, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.0509, "step": 7726 }, { "epoch": 14.014055769666742, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.0556, "step": 7727 }, { "epoch": 14.015869417365677, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0563, "step": 7728 }, { "epoch": 14.017683065064611, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.0625, "step": 7729 }, { "epoch": 14.019496712763546, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0619, "step": 7730 }, { "epoch": 14.021310360462481, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.068, "step": 7731 }, { "epoch": 14.023124008161414, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1114, "step": 7732 }, { "epoch": 14.024937655860349, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0607, "step": 7733 }, { "epoch": 14.026751303559283, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0629, "step": 7734 }, { "epoch": 14.028564951258218, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0587, "step": 7735 }, { "epoch": 14.030378598957153, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0644, "step": 7736 }, { "epoch": 14.032192246656088, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0579, "step": 7737 }, { "epoch": 14.03400589435502, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.06, "step": 7738 }, { "epoch": 14.035819542053956, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0691, "step": 7739 }, { "epoch": 14.03763318975289, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0597, "step": 7740 }, { "epoch": 14.039446837451825, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0602, "step": 7741 }, { "epoch": 14.04126048515076, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0588, "step": 7742 }, { "epoch": 14.043074132849695, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0693, "step": 7743 }, { "epoch": 14.044887780548628, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0688, "step": 7744 }, { "epoch": 14.046701428247562, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0662, "step": 7745 }, { "epoch": 14.048515075946497, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0835, "step": 7746 }, { "epoch": 14.050328723645432, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0742, "step": 7747 }, { "epoch": 14.052142371344367, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0687, "step": 7748 }, { "epoch": 14.053956019043301, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0728, "step": 7749 }, { "epoch": 14.055769666742234, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0752, "step": 7750 }, { "epoch": 14.05758331444117, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.091, "step": 7751 }, { "epoch": 14.059396962140104, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0925, "step": 7752 }, { "epoch": 14.061210609839039, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.079, "step": 7753 }, { "epoch": 14.063024257537974, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.079, "step": 7754 }, { "epoch": 14.064837905236908, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0787, "step": 7755 }, { "epoch": 14.066651552935841, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0903, "step": 7756 }, { "epoch": 14.068465200634776, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.089, "step": 7757 }, { "epoch": 14.070278848333711, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0981, "step": 7758 }, { "epoch": 14.072092496032646, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0891, "step": 7759 }, { "epoch": 14.07390614373158, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1121, "step": 7760 }, { "epoch": 14.075719791430515, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1086, "step": 7761 }, { "epoch": 14.07753343912945, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1249, "step": 7762 }, { "epoch": 14.079347086828383, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.1011, "step": 7763 }, { "epoch": 14.081160734527318, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.1035, "step": 7764 }, { "epoch": 14.082974382226253, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1412, "step": 7765 }, { "epoch": 14.084788029925187, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1324, "step": 7766 }, { "epoch": 14.086601677624122, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.1567, "step": 7767 }, { "epoch": 14.088415325323057, "grad_norm": 1.3515625, "learning_rate": 0.0002, "loss": 0.1653, "step": 7768 }, { "epoch": 14.09022897302199, "grad_norm": 1.7265625, "learning_rate": 0.0002, "loss": 0.2698, "step": 7769 }, { "epoch": 14.092042620720925, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.1245, "step": 7770 }, { "epoch": 14.09385626841986, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.0588, "step": 7771 }, { "epoch": 14.095669916118794, "grad_norm": 0.330078125, "learning_rate": 0.0002, "loss": 0.0507, "step": 7772 }, { "epoch": 14.097483563817729, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.059, "step": 7773 }, { "epoch": 14.099297211516664, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.0592, "step": 7774 }, { "epoch": 14.101110859215597, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0585, "step": 7775 }, { "epoch": 14.102924506914531, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0731, "step": 7776 }, { "epoch": 14.104738154613466, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0652, "step": 7777 }, { "epoch": 14.106551802312401, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0585, "step": 7778 }, { "epoch": 14.108365450011336, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0592, "step": 7779 }, { "epoch": 14.11017909771027, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0648, "step": 7780 }, { "epoch": 14.111992745409204, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.061, "step": 7781 }, { "epoch": 14.113806393108138, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.067, "step": 7782 }, { "epoch": 14.115620040807073, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0751, "step": 7783 }, { "epoch": 14.117433688506008, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0801, "step": 7784 }, { "epoch": 14.119247336204943, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0628, "step": 7785 }, { "epoch": 14.121060983903877, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.0543, "step": 7786 }, { "epoch": 14.12287463160281, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0782, "step": 7787 }, { "epoch": 14.124688279301745, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0661, "step": 7788 }, { "epoch": 14.12650192700068, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0669, "step": 7789 }, { "epoch": 14.128315574699615, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.066, "step": 7790 }, { "epoch": 14.13012922239855, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0673, "step": 7791 }, { "epoch": 14.131942870097484, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0698, "step": 7792 }, { "epoch": 14.133756517796417, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0664, "step": 7793 }, { "epoch": 14.135570165495352, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0643, "step": 7794 }, { "epoch": 14.137383813194287, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0668, "step": 7795 }, { "epoch": 14.139197460893222, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0742, "step": 7796 }, { "epoch": 14.141011108592156, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0681, "step": 7797 }, { "epoch": 14.142824756291091, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0716, "step": 7798 }, { "epoch": 14.144638403990024, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0726, "step": 7799 }, { "epoch": 14.146452051688959, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0764, "step": 7800 }, { "epoch": 14.148265699387894, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0761, "step": 7801 }, { "epoch": 14.150079347086828, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0796, "step": 7802 }, { "epoch": 14.151892994785763, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0763, "step": 7803 }, { "epoch": 14.153706642484698, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0792, "step": 7804 }, { "epoch": 14.155520290183631, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0794, "step": 7805 }, { "epoch": 14.157333937882566, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0831, "step": 7806 }, { "epoch": 14.1591475855815, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0947, "step": 7807 }, { "epoch": 14.160961233280435, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0886, "step": 7808 }, { "epoch": 14.16277488097937, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0897, "step": 7809 }, { "epoch": 14.164588528678305, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0974, "step": 7810 }, { "epoch": 14.166402176377238, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.094, "step": 7811 }, { "epoch": 14.168215824076173, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1068, "step": 7812 }, { "epoch": 14.170029471775107, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1131, "step": 7813 }, { "epoch": 14.171843119474042, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.1151, "step": 7814 }, { "epoch": 14.173656767172977, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1364, "step": 7815 }, { "epoch": 14.175470414871912, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1257, "step": 7816 }, { "epoch": 14.177284062570845, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1469, "step": 7817 }, { "epoch": 14.17909771026978, "grad_norm": 1.171875, "learning_rate": 0.0002, "loss": 0.2104, "step": 7818 }, { "epoch": 14.180911357968714, "grad_norm": 0.35546875, "learning_rate": 0.0002, "loss": 0.2017, "step": 7819 }, { "epoch": 14.182725005667649, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.1112, "step": 7820 }, { "epoch": 14.184538653366584, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0619, "step": 7821 }, { "epoch": 14.186352301065519, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0763, "step": 7822 }, { "epoch": 14.188165948764453, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0698, "step": 7823 }, { "epoch": 14.189979596463386, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0603, "step": 7824 }, { "epoch": 14.191793244162321, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.073, "step": 7825 }, { "epoch": 14.193606891861256, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0643, "step": 7826 }, { "epoch": 14.19542053956019, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0613, "step": 7827 }, { "epoch": 14.197234187259125, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0732, "step": 7828 }, { "epoch": 14.19904783495806, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0605, "step": 7829 }, { "epoch": 14.200861482656993, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0746, "step": 7830 }, { "epoch": 14.202675130355928, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0671, "step": 7831 }, { "epoch": 14.204488778054863, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0738, "step": 7832 }, { "epoch": 14.206302425753798, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0661, "step": 7833 }, { "epoch": 14.208116073452732, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0666, "step": 7834 }, { "epoch": 14.209929721151667, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0725, "step": 7835 }, { "epoch": 14.2117433688506, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0682, "step": 7836 }, { "epoch": 14.213557016549535, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.068, "step": 7837 }, { "epoch": 14.21537066424847, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0773, "step": 7838 }, { "epoch": 14.217184311947404, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0674, "step": 7839 }, { "epoch": 14.21899795964634, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0761, "step": 7840 }, { "epoch": 14.220811607345274, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0742, "step": 7841 }, { "epoch": 14.222625255044207, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0706, "step": 7842 }, { "epoch": 14.224438902743142, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.067, "step": 7843 }, { "epoch": 14.226252550442076, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0691, "step": 7844 }, { "epoch": 14.228066198141011, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0786, "step": 7845 }, { "epoch": 14.229879845839946, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0719, "step": 7846 }, { "epoch": 14.23169349353888, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0745, "step": 7847 }, { "epoch": 14.233507141237814, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0732, "step": 7848 }, { "epoch": 14.235320788936749, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0987, "step": 7849 }, { "epoch": 14.237134436635683, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0841, "step": 7850 }, { "epoch": 14.238948084334618, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0769, "step": 7851 }, { "epoch": 14.240761732033553, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0777, "step": 7852 }, { "epoch": 14.242575379732488, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.0915, "step": 7853 }, { "epoch": 14.24438902743142, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0911, "step": 7854 }, { "epoch": 14.24438902743142, "eval_loss": 2.344208002090454, "eval_runtime": 186.8469, "eval_samples_per_second": 5.352, "eval_steps_per_second": 5.352, "step": 7854 }, { "epoch": 14.24438902743142, "mmlu_eval_accuracy": 0.3236434899813899, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3103448275862069, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.375, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.24390243902439024, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.45454545454545453, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.47058823529411764, "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.4069767441860465, "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735, "mmlu_eval_accuracy_moral_scenarios": 0.29, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.29411764705882354, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.2529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.3225806451612903, "mmlu_eval_accuracy_professional_psychology": 0.391304347826087, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3888888888888889, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.4053492866101256, "step": 7854 }, { "epoch": 14.246202675130355, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0801, "step": 7855 }, { "epoch": 14.24801632282929, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0857, "step": 7856 }, { "epoch": 14.249829970528225, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1033, "step": 7857 }, { "epoch": 14.25164361822716, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1051, "step": 7858 }, { "epoch": 14.253457265926095, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0945, "step": 7859 }, { "epoch": 14.255270913625028, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1038, "step": 7860 }, { "epoch": 14.257084561323962, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.102, "step": 7861 }, { "epoch": 14.258898209022897, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1015, "step": 7862 }, { "epoch": 14.260711856721832, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1059, "step": 7863 }, { "epoch": 14.262525504420767, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.1195, "step": 7864 }, { "epoch": 14.264339152119701, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1411, "step": 7865 }, { "epoch": 14.266152799818634, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.134, "step": 7866 }, { "epoch": 14.26796644751757, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1422, "step": 7867 }, { "epoch": 14.269780095216504, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1777, "step": 7868 }, { "epoch": 14.271593742915439, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2419, "step": 7869 }, { "epoch": 14.273407390614373, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1375, "step": 7870 }, { "epoch": 14.275221038313308, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0657, "step": 7871 }, { "epoch": 14.277034686012243, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0635, "step": 7872 }, { "epoch": 14.278848333711176, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0659, "step": 7873 }, { "epoch": 14.28066198141011, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0669, "step": 7874 }, { "epoch": 14.282475629109046, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0663, "step": 7875 }, { "epoch": 14.28428927680798, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0664, "step": 7876 }, { "epoch": 14.286102924506915, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0893, "step": 7877 }, { "epoch": 14.287916572205848, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0713, "step": 7878 }, { "epoch": 14.289730219904783, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0755, "step": 7879 }, { "epoch": 14.291543867603718, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0702, "step": 7880 }, { "epoch": 14.293357515302652, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0674, "step": 7881 }, { "epoch": 14.295171163001587, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0689, "step": 7882 }, { "epoch": 14.296984810700522, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0681, "step": 7883 }, { "epoch": 14.298798458399457, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0839, "step": 7884 }, { "epoch": 14.30061210609839, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0841, "step": 7885 }, { "epoch": 14.302425753797325, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0669, "step": 7886 }, { "epoch": 14.30423940149626, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0763, "step": 7887 }, { "epoch": 14.306053049195194, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0664, "step": 7888 }, { "epoch": 14.307866696894129, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0751, "step": 7889 }, { "epoch": 14.309680344593064, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0672, "step": 7890 }, { "epoch": 14.311493992291997, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0723, "step": 7891 }, { "epoch": 14.313307639990931, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0774, "step": 7892 }, { "epoch": 14.315121287689866, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0781, "step": 7893 }, { "epoch": 14.316934935388801, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0816, "step": 7894 }, { "epoch": 14.318748583087736, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0798, "step": 7895 }, { "epoch": 14.32056223078667, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.069, "step": 7896 }, { "epoch": 14.322375878485603, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0832, "step": 7897 }, { "epoch": 14.324189526184538, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0908, "step": 7898 }, { "epoch": 14.326003173883473, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0786, "step": 7899 }, { "epoch": 14.327816821582408, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0775, "step": 7900 }, { "epoch": 14.329630469281343, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.076, "step": 7901 }, { "epoch": 14.331444116980277, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0911, "step": 7902 }, { "epoch": 14.33325776467921, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0802, "step": 7903 }, { "epoch": 14.335071412378145, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0741, "step": 7904 }, { "epoch": 14.33688506007708, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0868, "step": 7905 }, { "epoch": 14.338698707776015, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.084, "step": 7906 }, { "epoch": 14.34051235547495, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0979, "step": 7907 }, { "epoch": 14.342326003173884, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0975, "step": 7908 }, { "epoch": 14.344139650872817, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1027, "step": 7909 }, { "epoch": 14.345953298571752, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.106, "step": 7910 }, { "epoch": 14.347766946270687, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1094, "step": 7911 }, { "epoch": 14.349580593969621, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1048, "step": 7912 }, { "epoch": 14.351394241668556, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1155, "step": 7913 }, { "epoch": 14.353207889367491, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1218, "step": 7914 }, { "epoch": 14.355021537066424, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1177, "step": 7915 }, { "epoch": 14.356835184765359, "grad_norm": 0.984375, "learning_rate": 0.0002, "loss": 0.1352, "step": 7916 }, { "epoch": 14.358648832464294, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1579, "step": 7917 }, { "epoch": 14.360462480163228, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1846, "step": 7918 }, { "epoch": 14.362276127862163, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.2175, "step": 7919 }, { "epoch": 14.364089775561098, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.137, "step": 7920 }, { "epoch": 14.365903423260031, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0701, "step": 7921 }, { "epoch": 14.367717070958966, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0684, "step": 7922 }, { "epoch": 14.3695307186579, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0712, "step": 7923 }, { "epoch": 14.371344366356835, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0819, "step": 7924 }, { "epoch": 14.37315801405577, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0624, "step": 7925 }, { "epoch": 14.374971661754705, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0673, "step": 7926 }, { "epoch": 14.376785309453638, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0679, "step": 7927 }, { "epoch": 14.378598957152573, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0655, "step": 7928 }, { "epoch": 14.380412604851507, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0767, "step": 7929 }, { "epoch": 14.382226252550442, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0739, "step": 7930 }, { "epoch": 14.384039900249377, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0681, "step": 7931 }, { "epoch": 14.385853547948312, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0688, "step": 7932 }, { "epoch": 14.387667195647246, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0782, "step": 7933 }, { "epoch": 14.38948084334618, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0787, "step": 7934 }, { "epoch": 14.391294491045114, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0777, "step": 7935 }, { "epoch": 14.393108138744049, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0637, "step": 7936 }, { "epoch": 14.394921786442984, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.067, "step": 7937 }, { "epoch": 14.396735434141918, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0711, "step": 7938 }, { "epoch": 14.398549081840853, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0734, "step": 7939 }, { "epoch": 14.400362729539786, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0694, "step": 7940 }, { "epoch": 14.402176377238721, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0741, "step": 7941 }, { "epoch": 14.403990024937656, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0734, "step": 7942 }, { "epoch": 14.40580367263659, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0685, "step": 7943 }, { "epoch": 14.407617320335525, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0784, "step": 7944 }, { "epoch": 14.40943096803446, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0667, "step": 7945 }, { "epoch": 14.411244615733393, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0734, "step": 7946 }, { "epoch": 14.413058263432328, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0902, "step": 7947 }, { "epoch": 14.414871911131263, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0778, "step": 7948 }, { "epoch": 14.416685558830197, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0834, "step": 7949 }, { "epoch": 14.418499206529132, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0863, "step": 7950 }, { "epoch": 14.420312854228067, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0724, "step": 7951 }, { "epoch": 14.422126501927, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0779, "step": 7952 }, { "epoch": 14.423940149625935, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1022, "step": 7953 }, { "epoch": 14.42575379732487, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0891, "step": 7954 }, { "epoch": 14.427567445023804, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.084, "step": 7955 }, { "epoch": 14.429381092722739, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.093, "step": 7956 }, { "epoch": 14.431194740421674, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0935, "step": 7957 }, { "epoch": 14.433008388120607, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0932, "step": 7958 }, { "epoch": 14.434822035819542, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1101, "step": 7959 }, { "epoch": 14.436635683518476, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1105, "step": 7960 }, { "epoch": 14.438449331217411, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1088, "step": 7961 }, { "epoch": 14.440262978916346, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1309, "step": 7962 }, { "epoch": 14.44207662661528, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1224, "step": 7963 }, { "epoch": 14.443890274314214, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1281, "step": 7964 }, { "epoch": 14.445703922013148, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.1391, "step": 7965 }, { "epoch": 14.447517569712083, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1483, "step": 7966 }, { "epoch": 14.449331217411018, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1648, "step": 7967 }, { "epoch": 14.451144865109953, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1947, "step": 7968 }, { "epoch": 14.452958512808888, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2261, "step": 7969 }, { "epoch": 14.45477216050782, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.1613, "step": 7970 }, { "epoch": 14.456585808206755, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0659, "step": 7971 }, { "epoch": 14.45839945590569, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0632, "step": 7972 }, { "epoch": 14.460213103604625, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0812, "step": 7973 }, { "epoch": 14.46202675130356, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0648, "step": 7974 }, { "epoch": 14.463840399002494, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0655, "step": 7975 }, { "epoch": 14.465654046701427, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0702, "step": 7976 }, { "epoch": 14.467467694400362, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0711, "step": 7977 }, { "epoch": 14.469281342099297, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0717, "step": 7978 }, { "epoch": 14.471094989798232, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0689, "step": 7979 }, { "epoch": 14.472908637497166, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0747, "step": 7980 }, { "epoch": 14.474722285196101, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0831, "step": 7981 }, { "epoch": 14.476535932895036, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0704, "step": 7982 }, { "epoch": 14.478349580593969, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0757, "step": 7983 }, { "epoch": 14.480163228292904, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0748, "step": 7984 }, { "epoch": 14.481976875991839, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0757, "step": 7985 }, { "epoch": 14.483790523690773, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0867, "step": 7986 }, { "epoch": 14.485604171389708, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0677, "step": 7987 }, { "epoch": 14.487417819088641, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.068, "step": 7988 }, { "epoch": 14.489231466787576, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0791, "step": 7989 }, { "epoch": 14.49104511448651, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0797, "step": 7990 }, { "epoch": 14.492858762185445, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0746, "step": 7991 }, { "epoch": 14.49467240988438, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.0789, "step": 7992 }, { "epoch": 14.496486057583315, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0854, "step": 7993 }, { "epoch": 14.49829970528225, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0737, "step": 7994 }, { "epoch": 14.500113352981183, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.072, "step": 7995 }, { "epoch": 14.501927000680118, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0796, "step": 7996 }, { "epoch": 14.503740648379052, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0835, "step": 7997 }, { "epoch": 14.505554296077987, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.077, "step": 7998 }, { "epoch": 14.507367943776922, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0896, "step": 7999 }, { "epoch": 14.509181591475857, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0767, "step": 8000 }, { "epoch": 14.51099523917479, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1019, "step": 8001 }, { "epoch": 14.512808886873724, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.09, "step": 8002 }, { "epoch": 14.51462253457266, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.091, "step": 8003 }, { "epoch": 14.516436182271594, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.093, "step": 8004 }, { "epoch": 14.518249829970529, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0922, "step": 8005 }, { "epoch": 14.520063477669463, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0821, "step": 8006 }, { "epoch": 14.521877125368396, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1068, "step": 8007 }, { "epoch": 14.523690773067331, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1032, "step": 8008 }, { "epoch": 14.525504420766266, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1077, "step": 8009 }, { "epoch": 14.5273180684652, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1227, "step": 8010 }, { "epoch": 14.529131716164136, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1107, "step": 8011 }, { "epoch": 14.53094536386307, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1295, "step": 8012 }, { "epoch": 14.532759011562003, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.132, "step": 8013 }, { "epoch": 14.534572659260938, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.142, "step": 8014 }, { "epoch": 14.536386306959873, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.1275, "step": 8015 }, { "epoch": 14.538199954658808, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1518, "step": 8016 }, { "epoch": 14.540013602357742, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1596, "step": 8017 }, { "epoch": 14.541827250056677, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1935, "step": 8018 }, { "epoch": 14.54364089775561, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1996, "step": 8019 }, { "epoch": 14.545454545454545, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1355, "step": 8020 }, { "epoch": 14.54726819315348, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0732, "step": 8021 }, { "epoch": 14.549081840852415, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0908, "step": 8022 }, { "epoch": 14.55089548855135, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0751, "step": 8023 }, { "epoch": 14.552709136250284, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0671, "step": 8024 }, { "epoch": 14.554522783949217, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0689, "step": 8025 }, { "epoch": 14.556336431648152, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0775, "step": 8026 }, { "epoch": 14.558150079347087, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0728, "step": 8027 }, { "epoch": 14.559963727046021, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0679, "step": 8028 }, { "epoch": 14.561777374744956, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0689, "step": 8029 }, { "epoch": 14.563591022443891, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.076, "step": 8030 }, { "epoch": 14.565404670142826, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0734, "step": 8031 }, { "epoch": 14.567218317841759, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0675, "step": 8032 }, { "epoch": 14.569031965540693, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0656, "step": 8033 }, { "epoch": 14.570845613239628, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.077, "step": 8034 }, { "epoch": 14.572659260938563, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0677, "step": 8035 }, { "epoch": 14.574472908637498, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0688, "step": 8036 }, { "epoch": 14.57628655633643, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0748, "step": 8037 }, { "epoch": 14.578100204035366, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0765, "step": 8038 }, { "epoch": 14.5799138517343, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0771, "step": 8039 }, { "epoch": 14.581727499433235, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0651, "step": 8040 }, { "epoch": 14.58354114713217, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0833, "step": 8041 }, { "epoch": 14.58354114713217, "eval_loss": 2.3907833099365234, "eval_runtime": 186.9122, "eval_samples_per_second": 5.35, "eval_steps_per_second": 5.35, "step": 8041 }, { "epoch": 14.58354114713217, "mmlu_eval_accuracy": 0.3138842107380362, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5714285714285714, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.3181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.4, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.2692307692307692, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.5454545454545454, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.32558139534883723, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.2727272727272727, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.25806451612903225, "mmlu_eval_accuracy_professional_law": 0.2647058823529412, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.5, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 3.4294643637637234, "step": 8041 }, { "epoch": 14.585354794831105, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0718, "step": 8042 }, { "epoch": 14.58716844253004, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0797, "step": 8043 }, { "epoch": 14.588982090228972, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0781, "step": 8044 }, { "epoch": 14.590795737927907, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0757, "step": 8045 }, { "epoch": 14.592609385626842, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0817, "step": 8046 }, { "epoch": 14.594423033325777, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.0866, "step": 8047 }, { "epoch": 14.596236681024712, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0847, "step": 8048 }, { "epoch": 14.598050328723644, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0917, "step": 8049 }, { "epoch": 14.59986397642258, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0878, "step": 8050 }, { "epoch": 14.601677624121514, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0876, "step": 8051 }, { "epoch": 14.603491271820449, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0877, "step": 8052 }, { "epoch": 14.605304919519384, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0891, "step": 8053 }, { "epoch": 14.607118567218318, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0909, "step": 8054 }, { "epoch": 14.608932214917253, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0888, "step": 8055 }, { "epoch": 14.610745862616186, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0833, "step": 8056 }, { "epoch": 14.612559510315121, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.099, "step": 8057 }, { "epoch": 14.614373158014056, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0994, "step": 8058 }, { "epoch": 14.61618680571299, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.118, "step": 8059 }, { "epoch": 14.618000453411925, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0934, "step": 8060 }, { "epoch": 14.61981410111086, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1072, "step": 8061 }, { "epoch": 14.621627748809793, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1127, "step": 8062 }, { "epoch": 14.623441396508728, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1051, "step": 8063 }, { "epoch": 14.625255044207663, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.135, "step": 8064 }, { "epoch": 14.627068691906597, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.128, "step": 8065 }, { "epoch": 14.628882339605532, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.156, "step": 8066 }, { "epoch": 14.630695987304467, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1544, "step": 8067 }, { "epoch": 14.6325096350034, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.1713, "step": 8068 }, { "epoch": 14.634323282702335, "grad_norm": 1.2578125, "learning_rate": 0.0002, "loss": 0.2878, "step": 8069 }, { "epoch": 14.63613693040127, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1283, "step": 8070 }, { "epoch": 14.637950578100204, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0771, "step": 8071 }, { "epoch": 14.639764225799139, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0781, "step": 8072 }, { "epoch": 14.641577873498074, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0793, "step": 8073 }, { "epoch": 14.643391521197007, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0714, "step": 8074 }, { "epoch": 14.645205168895941, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0677, "step": 8075 }, { "epoch": 14.647018816594876, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0693, "step": 8076 }, { "epoch": 14.648832464293811, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0807, "step": 8077 }, { "epoch": 14.650646111992746, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0756, "step": 8078 }, { "epoch": 14.65245975969168, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0737, "step": 8079 }, { "epoch": 14.654273407390614, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0701, "step": 8080 }, { "epoch": 14.656087055089548, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0737, "step": 8081 }, { "epoch": 14.657900702788483, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0851, "step": 8082 }, { "epoch": 14.659714350487418, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0758, "step": 8083 }, { "epoch": 14.661527998186353, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0759, "step": 8084 }, { "epoch": 14.663341645885287, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.086, "step": 8085 }, { "epoch": 14.66515529358422, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0834, "step": 8086 }, { "epoch": 14.666968941283155, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0853, "step": 8087 }, { "epoch": 14.66878258898209, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0757, "step": 8088 }, { "epoch": 14.670596236681025, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0867, "step": 8089 }, { "epoch": 14.67240988437996, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0913, "step": 8090 }, { "epoch": 14.674223532078894, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0869, "step": 8091 }, { "epoch": 14.676037179777829, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0737, "step": 8092 }, { "epoch": 14.677850827476762, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0754, "step": 8093 }, { "epoch": 14.679664475175697, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0719, "step": 8094 }, { "epoch": 14.681478122874632, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0924, "step": 8095 }, { "epoch": 14.683291770573566, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0801, "step": 8096 }, { "epoch": 14.685105418272501, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1522, "step": 8097 }, { "epoch": 14.686919065971434, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0841, "step": 8098 }, { "epoch": 14.688732713670369, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0856, "step": 8099 }, { "epoch": 14.690546361369304, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0919, "step": 8100 }, { "epoch": 14.692360009068238, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0828, "step": 8101 }, { "epoch": 14.694173656767173, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.0861, "step": 8102 }, { "epoch": 14.695987304466108, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0888, "step": 8103 }, { "epoch": 14.697800952165043, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0884, "step": 8104 }, { "epoch": 14.699614599863976, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0892, "step": 8105 }, { "epoch": 14.70142824756291, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1051, "step": 8106 }, { "epoch": 14.703241895261845, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0933, "step": 8107 }, { "epoch": 14.70505554296078, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1155, "step": 8108 }, { "epoch": 14.706869190659715, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1025, "step": 8109 }, { "epoch": 14.70868283835865, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1039, "step": 8110 }, { "epoch": 14.710496486057583, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1033, "step": 8111 }, { "epoch": 14.712310133756517, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1296, "step": 8112 }, { "epoch": 14.714123781455452, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1244, "step": 8113 }, { "epoch": 14.715937429154387, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1343, "step": 8114 }, { "epoch": 14.717751076853322, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1334, "step": 8115 }, { "epoch": 14.719564724552257, "grad_norm": 0.96875, "learning_rate": 0.0002, "loss": 0.154, "step": 8116 }, { "epoch": 14.72137837225119, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1442, "step": 8117 }, { "epoch": 14.723192019950124, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1877, "step": 8118 }, { "epoch": 14.725005667649059, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1944, "step": 8119 }, { "epoch": 14.726819315347994, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1788, "step": 8120 }, { "epoch": 14.728632963046929, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0872, "step": 8121 }, { "epoch": 14.730446610745863, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0799, "step": 8122 }, { "epoch": 14.732260258444796, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0713, "step": 8123 }, { "epoch": 14.734073906143731, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0707, "step": 8124 }, { "epoch": 14.735887553842666, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0784, "step": 8125 }, { "epoch": 14.7377012015416, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0681, "step": 8126 }, { "epoch": 14.739514849240535, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0733, "step": 8127 }, { "epoch": 14.74132849693947, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0697, "step": 8128 }, { "epoch": 14.743142144638403, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0801, "step": 8129 }, { "epoch": 14.744955792337338, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0754, "step": 8130 }, { "epoch": 14.746769440036273, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0816, "step": 8131 }, { "epoch": 14.748583087735208, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0744, "step": 8132 }, { "epoch": 14.750396735434142, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0832, "step": 8133 }, { "epoch": 14.752210383133077, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0799, "step": 8134 }, { "epoch": 14.75402403083201, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0773, "step": 8135 }, { "epoch": 14.755837678530945, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0824, "step": 8136 }, { "epoch": 14.75765132622988, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0821, "step": 8137 }, { "epoch": 14.759464973928814, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0794, "step": 8138 }, { "epoch": 14.76127862162775, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0816, "step": 8139 }, { "epoch": 14.763092269326684, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0806, "step": 8140 }, { "epoch": 14.764905917025617, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0932, "step": 8141 }, { "epoch": 14.766719564724552, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.0921, "step": 8142 }, { "epoch": 14.768533212423486, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0702, "step": 8143 }, { "epoch": 14.770346860122421, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0824, "step": 8144 }, { "epoch": 14.772160507821356, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0738, "step": 8145 }, { "epoch": 14.77397415552029, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0989, "step": 8146 }, { "epoch": 14.775787803219224, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0802, "step": 8147 }, { "epoch": 14.777601450918159, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0803, "step": 8148 }, { "epoch": 14.779415098617093, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0904, "step": 8149 }, { "epoch": 14.781228746316028, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.083, "step": 8150 }, { "epoch": 14.783042394014963, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0939, "step": 8151 }, { "epoch": 14.784856041713898, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0927, "step": 8152 }, { "epoch": 14.786669689412832, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0961, "step": 8153 }, { "epoch": 14.788483337111765, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0853, "step": 8154 }, { "epoch": 14.7902969848107, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0943, "step": 8155 }, { "epoch": 14.792110632509635, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.092, "step": 8156 }, { "epoch": 14.79392428020857, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1093, "step": 8157 }, { "epoch": 14.795737927907505, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.112, "step": 8158 }, { "epoch": 14.797551575606438, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.11, "step": 8159 }, { "epoch": 14.799365223305372, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.103, "step": 8160 }, { "epoch": 14.801178871004307, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.1295, "step": 8161 }, { "epoch": 14.802992518703242, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1213, "step": 8162 }, { "epoch": 14.804806166402177, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1191, "step": 8163 }, { "epoch": 14.806619814101111, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1477, "step": 8164 }, { "epoch": 14.808433461800046, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.1333, "step": 8165 }, { "epoch": 14.81024710949898, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1607, "step": 8166 }, { "epoch": 14.812060757197914, "grad_norm": 1.3359375, "learning_rate": 0.0002, "loss": 0.1745, "step": 8167 }, { "epoch": 14.813874404896849, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.192, "step": 8168 }, { "epoch": 14.815688052595783, "grad_norm": 0.96484375, "learning_rate": 0.0002, "loss": 0.2671, "step": 8169 }, { "epoch": 14.817501700294718, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1496, "step": 8170 }, { "epoch": 14.819315347993653, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.08, "step": 8171 }, { "epoch": 14.821128995692586, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.072, "step": 8172 }, { "epoch": 14.82294264339152, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0756, "step": 8173 }, { "epoch": 14.824756291090456, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0761, "step": 8174 }, { "epoch": 14.82656993878939, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0765, "step": 8175 }, { "epoch": 14.828383586488325, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0674, "step": 8176 }, { "epoch": 14.83019723418726, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.077, "step": 8177 }, { "epoch": 14.832010881886193, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0812, "step": 8178 }, { "epoch": 14.833824529585128, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.071, "step": 8179 }, { "epoch": 14.835638177284062, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.079, "step": 8180 }, { "epoch": 14.837451824982997, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0768, "step": 8181 }, { "epoch": 14.839265472681932, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0734, "step": 8182 }, { "epoch": 14.841079120380867, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0771, "step": 8183 }, { "epoch": 14.8428927680798, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0784, "step": 8184 }, { "epoch": 14.844706415778735, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0879, "step": 8185 }, { "epoch": 14.84652006347767, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0781, "step": 8186 }, { "epoch": 14.848333711176604, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0797, "step": 8187 }, { "epoch": 14.850147358875539, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0862, "step": 8188 }, { "epoch": 14.851961006574474, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0733, "step": 8189 }, { "epoch": 14.853774654273407, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0805, "step": 8190 }, { "epoch": 14.855588301972341, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0862, "step": 8191 }, { "epoch": 14.857401949671276, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0742, "step": 8192 }, { "epoch": 14.859215597370211, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0814, "step": 8193 }, { "epoch": 14.861029245069146, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.072, "step": 8194 }, { "epoch": 14.86284289276808, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0764, "step": 8195 }, { "epoch": 14.864656540467013, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0742, "step": 8196 }, { "epoch": 14.866470188165948, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.086, "step": 8197 }, { "epoch": 14.868283835864883, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0793, "step": 8198 }, { "epoch": 14.870097483563818, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0871, "step": 8199 }, { "epoch": 14.871911131262753, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0815, "step": 8200 }, { "epoch": 14.873724778961687, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.0945, "step": 8201 }, { "epoch": 14.875538426660622, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.099, "step": 8202 }, { "epoch": 14.877352074359555, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1047, "step": 8203 }, { "epoch": 14.87916572205849, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0993, "step": 8204 }, { "epoch": 14.880979369757425, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0897, "step": 8205 }, { "epoch": 14.88279301745636, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1216, "step": 8206 }, { "epoch": 14.884606665155294, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0885, "step": 8207 }, { "epoch": 14.886420312854227, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1001, "step": 8208 }, { "epoch": 14.888233960553162, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1254, "step": 8209 }, { "epoch": 14.890047608252097, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1158, "step": 8210 }, { "epoch": 14.891861255951031, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.134, "step": 8211 }, { "epoch": 14.893674903649966, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1211, "step": 8212 }, { "epoch": 14.895488551348901, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1262, "step": 8213 }, { "epoch": 14.897302199047836, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1348, "step": 8214 }, { "epoch": 14.899115846746769, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1331, "step": 8215 }, { "epoch": 14.900929494445704, "grad_norm": 1.3046875, "learning_rate": 0.0002, "loss": 0.179, "step": 8216 }, { "epoch": 14.902743142144638, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1498, "step": 8217 }, { "epoch": 14.904556789843573, "grad_norm": 1.4609375, "learning_rate": 0.0002, "loss": 0.1883, "step": 8218 }, { "epoch": 14.906370437542508, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.2419, "step": 8219 }, { "epoch": 14.908184085241443, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.1764, "step": 8220 }, { "epoch": 14.909997732940376, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0836, "step": 8221 }, { "epoch": 14.91181138063931, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0818, "step": 8222 }, { "epoch": 14.913625028338245, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0778, "step": 8223 }, { "epoch": 14.91543867603718, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0704, "step": 8224 }, { "epoch": 14.917252323736115, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0719, "step": 8225 }, { "epoch": 14.91906597143505, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0768, "step": 8226 }, { "epoch": 14.920879619133983, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0754, "step": 8227 }, { "epoch": 14.922693266832917, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0894, "step": 8228 }, { "epoch": 14.922693266832917, "eval_loss": 2.331159830093384, "eval_runtime": 187.1468, "eval_samples_per_second": 5.343, "eval_steps_per_second": 5.343, "step": 8228 }, { "epoch": 14.922693266832917, "mmlu_eval_accuracy": 0.297491825840311, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.1875, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.3448275862068966, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.2727272727272727, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.07142857142857142, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3372093023255814, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.28823529411764703, "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.09090909090909091, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 2.6120894521494304, "step": 8228 }, { "epoch": 14.924506914531852, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0747, "step": 8229 }, { "epoch": 14.926320562230787, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0854, "step": 8230 }, { "epoch": 14.928134209929722, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0742, "step": 8231 }, { "epoch": 14.929947857628656, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.093, "step": 8232 }, { "epoch": 14.93176150532759, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.074, "step": 8233 }, { "epoch": 14.933575153026524, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.076, "step": 8234 }, { "epoch": 14.935388800725459, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0861, "step": 8235 }, { "epoch": 14.937202448424394, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0811, "step": 8236 }, { "epoch": 14.939016096123328, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.071, "step": 8237 }, { "epoch": 14.940829743822263, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0783, "step": 8238 }, { "epoch": 14.942643391521196, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0795, "step": 8239 }, { "epoch": 14.944457039220131, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0825, "step": 8240 }, { "epoch": 14.946270686919066, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0857, "step": 8241 }, { "epoch": 14.948084334618, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.089, "step": 8242 }, { "epoch": 14.949897982316935, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.08, "step": 8243 }, { "epoch": 14.95171163001587, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0902, "step": 8244 }, { "epoch": 14.953525277714803, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0929, "step": 8245 }, { "epoch": 14.955338925413738, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0853, "step": 8246 }, { "epoch": 14.957152573112673, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0758, "step": 8247 }, { "epoch": 14.958966220811607, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.0916, "step": 8248 }, { "epoch": 14.960779868510542, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0792, "step": 8249 }, { "epoch": 14.962593516209477, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0962, "step": 8250 }, { "epoch": 14.96440716390841, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0871, "step": 8251 }, { "epoch": 14.966220811607345, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0981, "step": 8252 }, { "epoch": 14.96803445930628, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1019, "step": 8253 }, { "epoch": 14.969848107005214, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1039, "step": 8254 }, { "epoch": 14.971661754704149, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1081, "step": 8255 }, { "epoch": 14.973475402403084, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 0.1001, "step": 8256 }, { "epoch": 14.975289050102017, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1063, "step": 8257 }, { "epoch": 14.977102697800952, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.12, "step": 8258 }, { "epoch": 14.978916345499886, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1187, "step": 8259 }, { "epoch": 14.980729993198821, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1153, "step": 8260 }, { "epoch": 14.982543640897756, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1124, "step": 8261 }, { "epoch": 14.98435728859669, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.1348, "step": 8262 }, { "epoch": 14.986170936295625, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.1031, "step": 8263 }, { "epoch": 14.987984583994558, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1288, "step": 8264 }, { "epoch": 14.989798231693493, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1388, "step": 8265 }, { "epoch": 14.991611879392428, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1562, "step": 8266 }, { "epoch": 14.993425527091363, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1679, "step": 8267 }, { "epoch": 14.995239174790298, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1969, "step": 8268 }, { "epoch": 14.99705282248923, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.2472, "step": 8269 }, { "epoch": 14.998866470188165, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1568, "step": 8270 }, { "epoch": 15.0006801178871, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0917, "step": 8271 }, { "epoch": 15.002493765586035, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0625, "step": 8272 }, { "epoch": 15.00430741328497, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0585, "step": 8273 }, { "epoch": 15.006121060983904, "grad_norm": 0.3671875, "learning_rate": 0.0002, "loss": 0.049, "step": 8274 }, { "epoch": 15.00793470868284, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.06, "step": 8275 }, { "epoch": 15.009748356381772, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.0494, "step": 8276 }, { "epoch": 15.011562004080707, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.0529, "step": 8277 }, { "epoch": 15.013375651779642, "grad_norm": 0.37109375, "learning_rate": 0.0002, "loss": 0.055, "step": 8278 }, { "epoch": 15.015189299478577, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0642, "step": 8279 }, { "epoch": 15.017002947177511, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.0519, "step": 8280 }, { "epoch": 15.018816594876446, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.056, "step": 8281 }, { "epoch": 15.020630242575379, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0633, "step": 8282 }, { "epoch": 15.022443890274314, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0677, "step": 8283 }, { "epoch": 15.024257537973249, "grad_norm": 0.419921875, "learning_rate": 0.0002, "loss": 0.0569, "step": 8284 }, { "epoch": 15.026071185672183, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0569, "step": 8285 }, { "epoch": 15.027884833371118, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0524, "step": 8286 }, { "epoch": 15.029698481070053, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0661, "step": 8287 }, { "epoch": 15.031512128768986, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0538, "step": 8288 }, { "epoch": 15.03332577646792, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0647, "step": 8289 }, { "epoch": 15.035139424166855, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.067, "step": 8290 }, { "epoch": 15.03695307186579, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0688, "step": 8291 }, { "epoch": 15.038766719564725, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.057, "step": 8292 }, { "epoch": 15.04058036726366, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8293 }, { "epoch": 15.042394014962593, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.061, "step": 8294 }, { "epoch": 15.044207662661528, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0648, "step": 8295 }, { "epoch": 15.046021310360462, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0617, "step": 8296 }, { "epoch": 15.047834958059397, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0751, "step": 8297 }, { "epoch": 15.049648605758332, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0653, "step": 8298 }, { "epoch": 15.051462253457267, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0771, "step": 8299 }, { "epoch": 15.0532759011562, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0759, "step": 8300 }, { "epoch": 15.055089548855134, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0592, "step": 8301 }, { "epoch": 15.05690319655407, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0693, "step": 8302 }, { "epoch": 15.058716844253004, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0699, "step": 8303 }, { "epoch": 15.060530491951939, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0758, "step": 8304 }, { "epoch": 15.062344139650873, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0637, "step": 8305 }, { "epoch": 15.064157787349806, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.09, "step": 8306 }, { "epoch": 15.065971435048741, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0854, "step": 8307 }, { "epoch": 15.067785082747676, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.0991, "step": 8308 }, { "epoch": 15.06959873044661, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0868, "step": 8309 }, { "epoch": 15.071412378145546, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.0997, "step": 8310 }, { "epoch": 15.07322602584448, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.109, "step": 8311 }, { "epoch": 15.075039673543413, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1053, "step": 8312 }, { "epoch": 15.076853321242348, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0881, "step": 8313 }, { "epoch": 15.078666968941283, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0904, "step": 8314 }, { "epoch": 15.080480616640218, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1201, "step": 8315 }, { "epoch": 15.082294264339152, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1292, "step": 8316 }, { "epoch": 15.084107912038087, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1486, "step": 8317 }, { "epoch": 15.08592155973702, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1557, "step": 8318 }, { "epoch": 15.087735207435955, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.153, "step": 8319 }, { "epoch": 15.08954885513489, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.173, "step": 8320 }, { "epoch": 15.091362502833825, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1599, "step": 8321 }, { "epoch": 15.09317615053276, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0566, "step": 8322 }, { "epoch": 15.094989798231694, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0693, "step": 8323 }, { "epoch": 15.096803445930629, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.0615, "step": 8324 }, { "epoch": 15.098617093629562, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.062, "step": 8325 }, { "epoch": 15.100430741328497, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.0593, "step": 8326 }, { "epoch": 15.102244389027431, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0611, "step": 8327 }, { "epoch": 15.104058036726366, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0546, "step": 8328 }, { "epoch": 15.105871684425301, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0585, "step": 8329 }, { "epoch": 15.107685332124236, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0684, "step": 8330 }, { "epoch": 15.109498979823169, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0703, "step": 8331 }, { "epoch": 15.111312627522103, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0652, "step": 8332 }, { "epoch": 15.113126275221038, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0741, "step": 8333 }, { "epoch": 15.114939922919973, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.065, "step": 8334 }, { "epoch": 15.116753570618908, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0703, "step": 8335 }, { "epoch": 15.118567218317843, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0594, "step": 8336 }, { "epoch": 15.120380866016776, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0587, "step": 8337 }, { "epoch": 15.12219451371571, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0687, "step": 8338 }, { "epoch": 15.124008161414645, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.074, "step": 8339 }, { "epoch": 15.12582180911358, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0624, "step": 8340 }, { "epoch": 15.127635456812515, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0629, "step": 8341 }, { "epoch": 15.12944910451145, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0652, "step": 8342 }, { "epoch": 15.131262752210382, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0731, "step": 8343 }, { "epoch": 15.133076399909317, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0722, "step": 8344 }, { "epoch": 15.134890047608252, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0612, "step": 8345 }, { "epoch": 15.136703695307187, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0681, "step": 8346 }, { "epoch": 15.138517343006122, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0621, "step": 8347 }, { "epoch": 15.140330990705056, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0784, "step": 8348 }, { "epoch": 15.14214463840399, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0702, "step": 8349 }, { "epoch": 15.143958286102924, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0679, "step": 8350 }, { "epoch": 15.145771933801859, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0759, "step": 8351 }, { "epoch": 15.147585581500794, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.074, "step": 8352 }, { "epoch": 15.149399229199728, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0742, "step": 8353 }, { "epoch": 15.151212876898663, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0888, "step": 8354 }, { "epoch": 15.153026524597596, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0767, "step": 8355 }, { "epoch": 15.154840172296531, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0841, "step": 8356 }, { "epoch": 15.156653819995466, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0875, "step": 8357 }, { "epoch": 15.1584674676944, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0755, "step": 8358 }, { "epoch": 15.160281115393335, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0854, "step": 8359 }, { "epoch": 15.16209476309227, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.094, "step": 8360 }, { "epoch": 15.163908410791203, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1044, "step": 8361 }, { "epoch": 15.165722058490138, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1268, "step": 8362 }, { "epoch": 15.167535706189073, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1209, "step": 8363 }, { "epoch": 15.169349353888007, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0943, "step": 8364 }, { "epoch": 15.171163001586942, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.1027, "step": 8365 }, { "epoch": 15.172976649285877, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1228, "step": 8366 }, { "epoch": 15.17479029698481, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1434, "step": 8367 }, { "epoch": 15.176603944683745, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1317, "step": 8368 }, { "epoch": 15.17841759238268, "grad_norm": 0.9765625, "learning_rate": 0.0002, "loss": 0.1483, "step": 8369 }, { "epoch": 15.180231240081614, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1997, "step": 8370 }, { "epoch": 15.182044887780549, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1861, "step": 8371 }, { "epoch": 15.183858535479484, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0708, "step": 8372 }, { "epoch": 15.185672183178418, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0692, "step": 8373 }, { "epoch": 15.187485830877351, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0644, "step": 8374 }, { "epoch": 15.189299478576286, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0571, "step": 8375 }, { "epoch": 15.191113126275221, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0682, "step": 8376 }, { "epoch": 15.192926773974156, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0702, "step": 8377 }, { "epoch": 15.19474042167309, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0639, "step": 8378 }, { "epoch": 15.196554069372025, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0688, "step": 8379 }, { "epoch": 15.198367717070958, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0676, "step": 8380 }, { "epoch": 15.200181364769893, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0653, "step": 8381 }, { "epoch": 15.201995012468828, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0719, "step": 8382 }, { "epoch": 15.203808660167763, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0679, "step": 8383 }, { "epoch": 15.205622307866697, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.072, "step": 8384 }, { "epoch": 15.207435955565632, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0653, "step": 8385 }, { "epoch": 15.209249603264565, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0752, "step": 8386 }, { "epoch": 15.2110632509635, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0631, "step": 8387 }, { "epoch": 15.212876898662435, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0754, "step": 8388 }, { "epoch": 15.21469054636137, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0699, "step": 8389 }, { "epoch": 15.216504194060304, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0701, "step": 8390 }, { "epoch": 15.218317841759239, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0633, "step": 8391 }, { "epoch": 15.220131489458172, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0753, "step": 8392 }, { "epoch": 15.221945137157107, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0651, "step": 8393 }, { "epoch": 15.223758784856042, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0773, "step": 8394 }, { "epoch": 15.225572432554976, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0722, "step": 8395 }, { "epoch": 15.227386080253911, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0803, "step": 8396 }, { "epoch": 15.229199727952846, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0698, "step": 8397 }, { "epoch": 15.231013375651779, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.084, "step": 8398 }, { "epoch": 15.232827023350714, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0669, "step": 8399 }, { "epoch": 15.234640671049648, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0836, "step": 8400 }, { "epoch": 15.236454318748583, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0765, "step": 8401 }, { "epoch": 15.238267966447518, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0768, "step": 8402 }, { "epoch": 15.240081614146453, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0841, "step": 8403 }, { "epoch": 15.241895261845386, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0708, "step": 8404 }, { "epoch": 15.24370890954432, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0794, "step": 8405 }, { "epoch": 15.245522557243255, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0946, "step": 8406 }, { "epoch": 15.24733620494219, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0961, "step": 8407 }, { "epoch": 15.249149852641125, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0949, "step": 8408 }, { "epoch": 15.25096350034006, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.0989, "step": 8409 }, { "epoch": 15.252777148038993, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1132, "step": 8410 }, { "epoch": 15.254590795737927, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1189, "step": 8411 }, { "epoch": 15.256404443436862, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1007, "step": 8412 }, { "epoch": 15.258218091135797, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.1023, "step": 8413 }, { "epoch": 15.260031738834732, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1104, "step": 8414 }, { "epoch": 15.261845386533667, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1405, "step": 8415 }, { "epoch": 15.261845386533667, "eval_loss": 2.299604892730713, "eval_runtime": 186.2601, "eval_samples_per_second": 5.369, "eval_steps_per_second": 5.369, "step": 8415 }, { "epoch": 15.261845386533667, "mmlu_eval_accuracy": 0.30591688789148563, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.4090909090909091, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.3488372093023256, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.35, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3488372093023256, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.3917416742547206, "step": 8415 }, { "epoch": 15.2636590342326, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1516, "step": 8416 }, { "epoch": 15.265472681931534, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.1356, "step": 8417 }, { "epoch": 15.267286329630469, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1204, "step": 8418 }, { "epoch": 15.269099977329404, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.1297, "step": 8419 }, { "epoch": 15.270913625028339, "grad_norm": 1.296875, "learning_rate": 0.0002, "loss": 0.2051, "step": 8420 }, { "epoch": 15.272727272727273, "grad_norm": 1.109375, "learning_rate": 0.0002, "loss": 0.2186, "step": 8421 }, { "epoch": 15.274540920426206, "grad_norm": 0.392578125, "learning_rate": 0.0002, "loss": 0.0627, "step": 8422 }, { "epoch": 15.276354568125141, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.063, "step": 8423 }, { "epoch": 15.278168215824076, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0627, "step": 8424 }, { "epoch": 15.27998186352301, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.065, "step": 8425 }, { "epoch": 15.281795511221945, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0655, "step": 8426 }, { "epoch": 15.28360915892088, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.062, "step": 8427 }, { "epoch": 15.285422806619813, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0658, "step": 8428 }, { "epoch": 15.287236454318748, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0667, "step": 8429 }, { "epoch": 15.289050102017683, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0655, "step": 8430 }, { "epoch": 15.290863749716618, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0655, "step": 8431 }, { "epoch": 15.292677397415552, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0749, "step": 8432 }, { "epoch": 15.294491045114487, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0639, "step": 8433 }, { "epoch": 15.296304692813422, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0637, "step": 8434 }, { "epoch": 15.298118340512355, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0726, "step": 8435 }, { "epoch": 15.29993198821129, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0705, "step": 8436 }, { "epoch": 15.301745635910224, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.073, "step": 8437 }, { "epoch": 15.30355928360916, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0694, "step": 8438 }, { "epoch": 15.305372931308094, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0675, "step": 8439 }, { "epoch": 15.307186579007029, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0757, "step": 8440 }, { "epoch": 15.309000226705962, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0633, "step": 8441 }, { "epoch": 15.310813874404896, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0677, "step": 8442 }, { "epoch": 15.312627522103831, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0742, "step": 8443 }, { "epoch": 15.314441169802766, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0739, "step": 8444 }, { "epoch": 15.3162548175017, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0678, "step": 8445 }, { "epoch": 15.318068465200636, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0812, "step": 8446 }, { "epoch": 15.319882112899569, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.126, "step": 8447 }, { "epoch": 15.321695760598503, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0766, "step": 8448 }, { "epoch": 15.323509408297438, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0865, "step": 8449 }, { "epoch": 15.325323055996373, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.081, "step": 8450 }, { "epoch": 15.327136703695308, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0755, "step": 8451 }, { "epoch": 15.328950351394242, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0958, "step": 8452 }, { "epoch": 15.330763999093175, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0813, "step": 8453 }, { "epoch": 15.33257764679211, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0882, "step": 8454 }, { "epoch": 15.334391294491045, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0934, "step": 8455 }, { "epoch": 15.33620494218998, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0792, "step": 8456 }, { "epoch": 15.338018589888915, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.091, "step": 8457 }, { "epoch": 15.33983223758785, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0827, "step": 8458 }, { "epoch": 15.341645885286782, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0875, "step": 8459 }, { "epoch": 15.343459532985717, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.095, "step": 8460 }, { "epoch": 15.345273180684652, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1123, "step": 8461 }, { "epoch": 15.347086828383587, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1078, "step": 8462 }, { "epoch": 15.348900476082521, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0988, "step": 8463 }, { "epoch": 15.350714123781456, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1036, "step": 8464 }, { "epoch": 15.35252777148039, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1123, "step": 8465 }, { "epoch": 15.354341419179324, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1306, "step": 8466 }, { "epoch": 15.356155066878259, "grad_norm": 1.3671875, "learning_rate": 0.0002, "loss": 0.1386, "step": 8467 }, { "epoch": 15.357968714577193, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1348, "step": 8468 }, { "epoch": 15.359782362276128, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1615, "step": 8469 }, { "epoch": 15.361596009975063, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2141, "step": 8470 }, { "epoch": 15.363409657673996, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1782, "step": 8471 }, { "epoch": 15.36522330537293, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0758, "step": 8472 }, { "epoch": 15.367036953071866, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0627, "step": 8473 }, { "epoch": 15.3688506007708, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0699, "step": 8474 }, { "epoch": 15.370664248469735, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0665, "step": 8475 }, { "epoch": 15.37247789616867, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0778, "step": 8476 }, { "epoch": 15.374291543867603, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0611, "step": 8477 }, { "epoch": 15.376105191566538, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0808, "step": 8478 }, { "epoch": 15.377918839265472, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0653, "step": 8479 }, { "epoch": 15.379732486964407, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0684, "step": 8480 }, { "epoch": 15.381546134663342, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0671, "step": 8481 }, { "epoch": 15.383359782362277, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0617, "step": 8482 }, { "epoch": 15.385173430061212, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0696, "step": 8483 }, { "epoch": 15.386987077760145, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0699, "step": 8484 }, { "epoch": 15.38880072545908, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0741, "step": 8485 }, { "epoch": 15.390614373158014, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0817, "step": 8486 }, { "epoch": 15.392428020856949, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0588, "step": 8487 }, { "epoch": 15.394241668555884, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0682, "step": 8488 }, { "epoch": 15.396055316254817, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0634, "step": 8489 }, { "epoch": 15.397868963953751, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0627, "step": 8490 }, { "epoch": 15.399682611652686, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0708, "step": 8491 }, { "epoch": 15.401496259351621, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0744, "step": 8492 }, { "epoch": 15.403309907050556, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0708, "step": 8493 }, { "epoch": 15.40512355474949, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0648, "step": 8494 }, { "epoch": 15.406937202448425, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0695, "step": 8495 }, { "epoch": 15.408750850147358, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0731, "step": 8496 }, { "epoch": 15.410564497846293, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.09, "step": 8497 }, { "epoch": 15.412378145545228, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0707, "step": 8498 }, { "epoch": 15.414191793244163, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0754, "step": 8499 }, { "epoch": 15.416005440943097, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0815, "step": 8500 }, { "epoch": 15.417819088642032, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1452, "step": 8501 }, { "epoch": 15.419632736340965, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0833, "step": 8502 }, { "epoch": 15.4214463840399, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1003, "step": 8503 }, { "epoch": 15.423260031738835, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0898, "step": 8504 }, { "epoch": 15.42507367943777, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0789, "step": 8505 }, { "epoch": 15.426887327136704, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0844, "step": 8506 }, { "epoch": 15.428700974835639, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0927, "step": 8507 }, { "epoch": 15.430514622534572, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1045, "step": 8508 }, { "epoch": 15.432328270233507, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0804, "step": 8509 }, { "epoch": 15.434141917932442, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0977, "step": 8510 }, { "epoch": 15.435955565631376, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.105, "step": 8511 }, { "epoch": 15.437769213330311, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1039, "step": 8512 }, { "epoch": 15.439582861029246, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1176, "step": 8513 }, { "epoch": 15.441396508728179, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0996, "step": 8514 }, { "epoch": 15.443210156427114, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1169, "step": 8515 }, { "epoch": 15.445023804126048, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1369, "step": 8516 }, { "epoch": 15.446837451824983, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1365, "step": 8517 }, { "epoch": 15.448651099523918, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.135, "step": 8518 }, { "epoch": 15.450464747222853, "grad_norm": 1.0859375, "learning_rate": 0.0002, "loss": 0.1999, "step": 8519 }, { "epoch": 15.452278394921786, "grad_norm": 1.8515625, "learning_rate": 0.0002, "loss": 0.2386, "step": 8520 }, { "epoch": 15.45409204262072, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.2233, "step": 8521 }, { "epoch": 15.455905690319655, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0714, "step": 8522 }, { "epoch": 15.45771933801859, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0657, "step": 8523 }, { "epoch": 15.459532985717525, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8524 }, { "epoch": 15.46134663341646, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0746, "step": 8525 }, { "epoch": 15.463160281115393, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.0655, "step": 8526 }, { "epoch": 15.464973928814327, "grad_norm": 0.412109375, "learning_rate": 0.0002, "loss": 0.0584, "step": 8527 }, { "epoch": 15.466787576513262, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.0605, "step": 8528 }, { "epoch": 15.468601224212197, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0736, "step": 8529 }, { "epoch": 15.470414871911132, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0734, "step": 8530 }, { "epoch": 15.472228519610066, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0677, "step": 8531 }, { "epoch": 15.474042167309, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0693, "step": 8532 }, { "epoch": 15.475855815007934, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.066, "step": 8533 }, { "epoch": 15.477669462706869, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0685, "step": 8534 }, { "epoch": 15.479483110405804, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0689, "step": 8535 }, { "epoch": 15.481296758104738, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0774, "step": 8536 }, { "epoch": 15.483110405803673, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0664, "step": 8537 }, { "epoch": 15.484924053502606, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0667, "step": 8538 }, { "epoch": 15.486737701201541, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0739, "step": 8539 }, { "epoch": 15.488551348900476, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0829, "step": 8540 }, { "epoch": 15.49036499659941, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0747, "step": 8541 }, { "epoch": 15.492178644298345, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0714, "step": 8542 }, { "epoch": 15.49399229199728, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0775, "step": 8543 }, { "epoch": 15.495805939696215, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0728, "step": 8544 }, { "epoch": 15.497619587395148, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.078, "step": 8545 }, { "epoch": 15.499433235094083, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0738, "step": 8546 }, { "epoch": 15.501246882793017, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0707, "step": 8547 }, { "epoch": 15.503060530491952, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0808, "step": 8548 }, { "epoch": 15.504874178190887, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.078, "step": 8549 }, { "epoch": 15.50668782588982, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0767, "step": 8550 }, { "epoch": 15.508501473588755, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0834, "step": 8551 }, { "epoch": 15.51031512128769, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.0855, "step": 8552 }, { "epoch": 15.512128768986624, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0946, "step": 8553 }, { "epoch": 15.513942416685559, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.096, "step": 8554 }, { "epoch": 15.515756064384494, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0903, "step": 8555 }, { "epoch": 15.517569712083429, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0987, "step": 8556 }, { "epoch": 15.519383359782362, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0956, "step": 8557 }, { "epoch": 15.521197007481296, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0899, "step": 8558 }, { "epoch": 15.523010655180231, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0931, "step": 8559 }, { "epoch": 15.524824302879166, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0967, "step": 8560 }, { "epoch": 15.5266379505781, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1042, "step": 8561 }, { "epoch": 15.528451598277035, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0966, "step": 8562 }, { "epoch": 15.530265245975968, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1088, "step": 8563 }, { "epoch": 15.532078893674903, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1493, "step": 8564 }, { "epoch": 15.533892541373838, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1142, "step": 8565 }, { "epoch": 15.535706189072773, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1426, "step": 8566 }, { "epoch": 15.537519836771708, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1517, "step": 8567 }, { "epoch": 15.539333484470642, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.1611, "step": 8568 }, { "epoch": 15.541147132169575, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1703, "step": 8569 }, { "epoch": 15.54296077986851, "grad_norm": 0.8359375, "learning_rate": 0.0002, "loss": 0.2212, "step": 8570 }, { "epoch": 15.544774427567445, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.2176, "step": 8571 }, { "epoch": 15.54658807526638, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0735, "step": 8572 }, { "epoch": 15.548401722965314, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0607, "step": 8573 }, { "epoch": 15.55021537066425, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0721, "step": 8574 }, { "epoch": 15.552029018363182, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.071, "step": 8575 }, { "epoch": 15.553842666062117, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0757, "step": 8576 }, { "epoch": 15.555656313761052, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0701, "step": 8577 }, { "epoch": 15.557469961459987, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0696, "step": 8578 }, { "epoch": 15.559283609158921, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0751, "step": 8579 }, { "epoch": 15.561097256857856, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0748, "step": 8580 }, { "epoch": 15.562910904556789, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0714, "step": 8581 }, { "epoch": 15.564724552255724, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0676, "step": 8582 }, { "epoch": 15.566538199954659, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1275, "step": 8583 }, { "epoch": 15.568351847653593, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0805, "step": 8584 }, { "epoch": 15.570165495352528, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.073, "step": 8585 }, { "epoch": 15.571979143051463, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0746, "step": 8586 }, { "epoch": 15.573792790750396, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0716, "step": 8587 }, { "epoch": 15.57560643844933, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0739, "step": 8588 }, { "epoch": 15.577420086148265, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0665, "step": 8589 }, { "epoch": 15.5792337338472, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0697, "step": 8590 }, { "epoch": 15.581047381546135, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0649, "step": 8591 }, { "epoch": 15.58286102924507, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0773, "step": 8592 }, { "epoch": 15.584674676944005, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0701, "step": 8593 }, { "epoch": 15.586488324642938, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0807, "step": 8594 }, { "epoch": 15.588301972341872, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0728, "step": 8595 }, { "epoch": 15.590115620040807, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0773, "step": 8596 }, { "epoch": 15.591929267739742, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0866, "step": 8597 }, { "epoch": 15.593742915438677, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0742, "step": 8598 }, { "epoch": 15.59555656313761, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0764, "step": 8599 }, { "epoch": 15.597370210836544, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0755, "step": 8600 }, { "epoch": 15.59918385853548, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0873, "step": 8601 }, { "epoch": 15.600997506234414, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0967, "step": 8602 }, { "epoch": 15.600997506234414, "eval_loss": 2.3877344131469727, "eval_runtime": 190.6979, "eval_samples_per_second": 5.244, "eval_steps_per_second": 5.244, "step": 8602 }, { "epoch": 15.600997506234414, "mmlu_eval_accuracy": 0.30773194534735887, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.1875, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.3333333333333333, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.5, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.4090909090909091, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.38461538461538464, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.3372093023255814, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.42857142857142855, "mmlu_eval_accuracy_professional_accounting": 0.3225806451612903, "mmlu_eval_accuracy_professional_law": 0.2823529411764706, "mmlu_eval_accuracy_professional_medicine": 0.25806451612903225, "mmlu_eval_accuracy_professional_psychology": 0.3188405797101449, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.2727272727272727, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 3.2886812212341052, "step": 8602 }, { "epoch": 15.602811153933349, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0861, "step": 8603 }, { "epoch": 15.604624801632283, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0873, "step": 8604 }, { "epoch": 15.606438449331218, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0828, "step": 8605 }, { "epoch": 15.608252097030151, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1064, "step": 8606 }, { "epoch": 15.610065744729086, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1026, "step": 8607 }, { "epoch": 15.61187939242802, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.09, "step": 8608 }, { "epoch": 15.613693040126956, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0947, "step": 8609 }, { "epoch": 15.61550668782589, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.1142, "step": 8610 }, { "epoch": 15.617320335524825, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.1092, "step": 8611 }, { "epoch": 15.619133983223758, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1004, "step": 8612 }, { "epoch": 15.620947630922693, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1172, "step": 8613 }, { "epoch": 15.622761278621628, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1181, "step": 8614 }, { "epoch": 15.624574926320562, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1268, "step": 8615 }, { "epoch": 15.626388574019497, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1341, "step": 8616 }, { "epoch": 15.628202221718432, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.1326, "step": 8617 }, { "epoch": 15.630015869417365, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.1558, "step": 8618 }, { "epoch": 15.6318295171163, "grad_norm": 1.015625, "learning_rate": 0.0002, "loss": 0.1959, "step": 8619 }, { "epoch": 15.633643164815235, "grad_norm": 1.234375, "learning_rate": 0.0002, "loss": 0.2504, "step": 8620 }, { "epoch": 15.63545681251417, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.2198, "step": 8621 }, { "epoch": 15.637270460213104, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0659, "step": 8622 }, { "epoch": 15.639084107912039, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0788, "step": 8623 }, { "epoch": 15.640897755610972, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0769, "step": 8624 }, { "epoch": 15.642711403309907, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0613, "step": 8625 }, { "epoch": 15.644525051008841, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0612, "step": 8626 }, { "epoch": 15.646338698707776, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0711, "step": 8627 }, { "epoch": 15.648152346406711, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.067, "step": 8628 }, { "epoch": 15.649965994105646, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0662, "step": 8629 }, { "epoch": 15.651779641804579, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.067, "step": 8630 }, { "epoch": 15.653593289503513, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0684, "step": 8631 }, { "epoch": 15.655406937202448, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0728, "step": 8632 }, { "epoch": 15.657220584901383, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0888, "step": 8633 }, { "epoch": 15.659034232600318, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0829, "step": 8634 }, { "epoch": 15.660847880299253, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0672, "step": 8635 }, { "epoch": 15.662661527998186, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0689, "step": 8636 }, { "epoch": 15.66447517569712, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0787, "step": 8637 }, { "epoch": 15.666288823396055, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0693, "step": 8638 }, { "epoch": 15.66810247109499, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8639 }, { "epoch": 15.669916118793925, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0784, "step": 8640 }, { "epoch": 15.67172976649286, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0817, "step": 8641 }, { "epoch": 15.673543414191792, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0686, "step": 8642 }, { "epoch": 15.675357061890727, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.0924, "step": 8643 }, { "epoch": 15.677170709589662, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0748, "step": 8644 }, { "epoch": 15.678984357288597, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0717, "step": 8645 }, { "epoch": 15.680798004987532, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0893, "step": 8646 }, { "epoch": 15.682611652686466, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0748, "step": 8647 }, { "epoch": 15.6844253003854, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0834, "step": 8648 }, { "epoch": 15.686238948084334, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0896, "step": 8649 }, { "epoch": 15.688052595783269, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.093, "step": 8650 }, { "epoch": 15.689866243482204, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0881, "step": 8651 }, { "epoch": 15.691679891181138, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0791, "step": 8652 }, { "epoch": 15.693493538880073, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0896, "step": 8653 }, { "epoch": 15.695307186579008, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0893, "step": 8654 }, { "epoch": 15.697120834277941, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0926, "step": 8655 }, { "epoch": 15.698934481976876, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.094, "step": 8656 }, { "epoch": 15.70074812967581, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0872, "step": 8657 }, { "epoch": 15.702561777374745, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0967, "step": 8658 }, { "epoch": 15.70437542507368, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.104, "step": 8659 }, { "epoch": 15.706189072772613, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.1029, "step": 8660 }, { "epoch": 15.708002720471548, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.098, "step": 8661 }, { "epoch": 15.709816368170483, "grad_norm": 1.09375, "learning_rate": 0.0002, "loss": 0.1369, "step": 8662 }, { "epoch": 15.711630015869417, "grad_norm": 1.0, "learning_rate": 0.0002, "loss": 0.1295, "step": 8663 }, { "epoch": 15.713443663568352, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1089, "step": 8664 }, { "epoch": 15.715257311267287, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1299, "step": 8665 }, { "epoch": 15.717070958966222, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1359, "step": 8666 }, { "epoch": 15.718884606665155, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.149, "step": 8667 }, { "epoch": 15.72069825436409, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1459, "step": 8668 }, { "epoch": 15.722511902063024, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1556, "step": 8669 }, { "epoch": 15.724325549761959, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.181, "step": 8670 }, { "epoch": 15.726139197460894, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.236, "step": 8671 }, { "epoch": 15.727952845159828, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0772, "step": 8672 }, { "epoch": 15.729766492858761, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0768, "step": 8673 }, { "epoch": 15.731580140557696, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0702, "step": 8674 }, { "epoch": 15.733393788256631, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0763, "step": 8675 }, { "epoch": 15.735207435955566, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8676 }, { "epoch": 15.7370210836545, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0678, "step": 8677 }, { "epoch": 15.738834731353435, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0671, "step": 8678 }, { "epoch": 15.740648379052368, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0703, "step": 8679 }, { "epoch": 15.742462026751303, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0756, "step": 8680 }, { "epoch": 15.744275674450238, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0835, "step": 8681 }, { "epoch": 15.746089322149173, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.087, "step": 8682 }, { "epoch": 15.747902969848107, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0883, "step": 8683 }, { "epoch": 15.749716617547042, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0809, "step": 8684 }, { "epoch": 15.751530265245975, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0715, "step": 8685 }, { "epoch": 15.75334391294491, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0805, "step": 8686 }, { "epoch": 15.755157560643845, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0695, "step": 8687 }, { "epoch": 15.75697120834278, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0861, "step": 8688 }, { "epoch": 15.758784856041714, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0722, "step": 8689 }, { "epoch": 15.760598503740649, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0693, "step": 8690 }, { "epoch": 15.762412151439582, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0693, "step": 8691 }, { "epoch": 15.764225799138517, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0745, "step": 8692 }, { "epoch": 15.766039446837452, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.08, "step": 8693 }, { "epoch": 15.767853094536386, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0809, "step": 8694 }, { "epoch": 15.769666742235321, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0836, "step": 8695 }, { "epoch": 15.771480389934256, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0793, "step": 8696 }, { "epoch": 15.773294037633189, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0805, "step": 8697 }, { "epoch": 15.775107685332124, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0889, "step": 8698 }, { "epoch": 15.776921333031058, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.08, "step": 8699 }, { "epoch": 15.778734980729993, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0872, "step": 8700 }, { "epoch": 15.780548628428928, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0796, "step": 8701 }, { "epoch": 15.782362276127863, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0869, "step": 8702 }, { "epoch": 15.784175923826798, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0848, "step": 8703 }, { "epoch": 15.78598957152573, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0878, "step": 8704 }, { "epoch": 15.787803219224665, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0849, "step": 8705 }, { "epoch": 15.7896168669236, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0973, "step": 8706 }, { "epoch": 15.791430514622535, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.088, "step": 8707 }, { "epoch": 15.79324416232147, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0884, "step": 8708 }, { "epoch": 15.795057810020403, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0939, "step": 8709 }, { "epoch": 15.796871457719337, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0993, "step": 8710 }, { "epoch": 15.798685105418272, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1188, "step": 8711 }, { "epoch": 15.800498753117207, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1056, "step": 8712 }, { "epoch": 15.802312400816142, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.1377, "step": 8713 }, { "epoch": 15.804126048515077, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.1131, "step": 8714 }, { "epoch": 15.805939696214011, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.119, "step": 8715 }, { "epoch": 15.807753343912944, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.1194, "step": 8716 }, { "epoch": 15.809566991611879, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1297, "step": 8717 }, { "epoch": 15.811380639310814, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1576, "step": 8718 }, { "epoch": 15.813194287009749, "grad_norm": 1.0078125, "learning_rate": 0.0002, "loss": 0.1714, "step": 8719 }, { "epoch": 15.815007934708683, "grad_norm": 1.421875, "learning_rate": 0.0002, "loss": 0.2212, "step": 8720 }, { "epoch": 15.816821582407618, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.2327, "step": 8721 }, { "epoch": 15.818635230106551, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0788, "step": 8722 }, { "epoch": 15.820448877805486, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0713, "step": 8723 }, { "epoch": 15.82226252550442, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0799, "step": 8724 }, { "epoch": 15.824076173203355, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0755, "step": 8725 }, { "epoch": 15.82588982090229, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.074, "step": 8726 }, { "epoch": 15.827703468601225, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0795, "step": 8727 }, { "epoch": 15.829517116300158, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0758, "step": 8728 }, { "epoch": 15.831330763999093, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0717, "step": 8729 }, { "epoch": 15.833144411698028, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0615, "step": 8730 }, { "epoch": 15.834958059396962, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0786, "step": 8731 }, { "epoch": 15.836771707095897, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0663, "step": 8732 }, { "epoch": 15.838585354794832, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0768, "step": 8733 }, { "epoch": 15.840399002493765, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0744, "step": 8734 }, { "epoch": 15.8422126501927, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0703, "step": 8735 }, { "epoch": 15.844026297891634, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0775, "step": 8736 }, { "epoch": 15.84583994559057, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0716, "step": 8737 }, { "epoch": 15.847653593289504, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0775, "step": 8738 }, { "epoch": 15.849467240988439, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0757, "step": 8739 }, { "epoch": 15.851280888687372, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0744, "step": 8740 }, { "epoch": 15.853094536386307, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0698, "step": 8741 }, { "epoch": 15.854908184085241, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0793, "step": 8742 }, { "epoch": 15.856721831784176, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0775, "step": 8743 }, { "epoch": 15.85853547948311, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0916, "step": 8744 }, { "epoch": 15.860349127182046, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.079, "step": 8745 }, { "epoch": 15.862162774880979, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0739, "step": 8746 }, { "epoch": 15.863976422579913, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0792, "step": 8747 }, { "epoch": 15.865790070278848, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0741, "step": 8748 }, { "epoch": 15.867603717977783, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0862, "step": 8749 }, { "epoch": 15.869417365676718, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0873, "step": 8750 }, { "epoch": 15.871231013375652, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0853, "step": 8751 }, { "epoch": 15.873044661074585, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0851, "step": 8752 }, { "epoch": 15.87485830877352, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.0938, "step": 8753 }, { "epoch": 15.876671956472455, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1002, "step": 8754 }, { "epoch": 15.87848560417139, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.0906, "step": 8755 }, { "epoch": 15.880299251870325, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0876, "step": 8756 }, { "epoch": 15.88211289956926, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1056, "step": 8757 }, { "epoch": 15.883926547268192, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1126, "step": 8758 }, { "epoch": 15.885740194967127, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1129, "step": 8759 }, { "epoch": 15.887553842666062, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.1052, "step": 8760 }, { "epoch": 15.889367490364997, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0993, "step": 8761 }, { "epoch": 15.891181138063931, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.114, "step": 8762 }, { "epoch": 15.892994785762866, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1187, "step": 8763 }, { "epoch": 15.894808433461801, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1291, "step": 8764 }, { "epoch": 15.896622081160734, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1276, "step": 8765 }, { "epoch": 15.898435728859669, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.1328, "step": 8766 }, { "epoch": 15.900249376558603, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1464, "step": 8767 }, { "epoch": 15.902063024257538, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1675, "step": 8768 }, { "epoch": 15.903876671956473, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1463, "step": 8769 }, { "epoch": 15.905690319655406, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.2034, "step": 8770 }, { "epoch": 15.90750396735434, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.2223, "step": 8771 }, { "epoch": 15.909317615053276, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0708, "step": 8772 }, { "epoch": 15.91113126275221, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0753, "step": 8773 }, { "epoch": 15.912944910451145, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0722, "step": 8774 }, { "epoch": 15.91475855815008, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0687, "step": 8775 }, { "epoch": 15.916572205849015, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0672, "step": 8776 }, { "epoch": 15.918385853547948, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0699, "step": 8777 }, { "epoch": 15.920199501246882, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0882, "step": 8778 }, { "epoch": 15.922013148945817, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0643, "step": 8779 }, { "epoch": 15.923826796644752, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0753, "step": 8780 }, { "epoch": 15.925640444343687, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0764, "step": 8781 }, { "epoch": 15.927454092042622, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0699, "step": 8782 }, { "epoch": 15.929267739741555, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0747, "step": 8783 }, { "epoch": 15.93108138744049, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0811, "step": 8784 }, { "epoch": 15.932895035139424, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0932, "step": 8785 }, { "epoch": 15.934708682838359, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0802, "step": 8786 }, { "epoch": 15.936522330537294, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0734, "step": 8787 }, { "epoch": 15.938335978236228, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0816, "step": 8788 }, { "epoch": 15.940149625935161, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0763, "step": 8789 }, { "epoch": 15.940149625935161, "eval_loss": 2.398373603820801, "eval_runtime": 187.1575, "eval_samples_per_second": 5.343, "eval_steps_per_second": 5.343, "step": 8789 }, { "epoch": 15.940149625935161, "mmlu_eval_accuracy": 0.2988522334136679, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.25, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.0, "mmlu_eval_accuracy_electrical_engineering": 0.5, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.1111111111111111, "mmlu_eval_accuracy_high_school_european_history": 0.4444444444444444, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.14285714285714285, "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.3, "mmlu_eval_accuracy_high_school_statistics": 0.21739130434782608, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.30434782608695654, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.48, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.3, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.2823529411764706, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.2898550724637681, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 3.481798545820584, "step": 8789 }, { "epoch": 15.941963273634096, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0736, "step": 8790 }, { "epoch": 15.943776921333031, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0764, "step": 8791 }, { "epoch": 15.945590569031966, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0787, "step": 8792 }, { "epoch": 15.9474042167309, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0833, "step": 8793 }, { "epoch": 15.949217864429835, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0855, "step": 8794 }, { "epoch": 15.951031512128768, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0783, "step": 8795 }, { "epoch": 15.952845159827703, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.084, "step": 8796 }, { "epoch": 15.954658807526638, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.0919, "step": 8797 }, { "epoch": 15.956472455225573, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0825, "step": 8798 }, { "epoch": 15.958286102924507, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0921, "step": 8799 }, { "epoch": 15.960099750623442, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0808, "step": 8800 }, { "epoch": 15.961913398322375, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0747, "step": 8801 }, { "epoch": 15.96372704602131, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0846, "step": 8802 }, { "epoch": 15.965540693720245, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0848, "step": 8803 }, { "epoch": 15.96735434141918, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0934, "step": 8804 }, { "epoch": 15.969167989118114, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.0981, "step": 8805 }, { "epoch": 15.970981636817049, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0922, "step": 8806 }, { "epoch": 15.972795284515982, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0965, "step": 8807 }, { "epoch": 15.974608932214917, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0967, "step": 8808 }, { "epoch": 15.976422579913852, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1044, "step": 8809 }, { "epoch": 15.978236227612786, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.1189, "step": 8810 }, { "epoch": 15.980049875311721, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1086, "step": 8811 }, { "epoch": 15.981863523010656, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1142, "step": 8812 }, { "epoch": 15.98367717070959, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1207, "step": 8813 }, { "epoch": 15.985490818408524, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1274, "step": 8814 }, { "epoch": 15.987304466107458, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1312, "step": 8815 }, { "epoch": 15.989118113806393, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1589, "step": 8816 }, { "epoch": 15.990931761505328, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1352, "step": 8817 }, { "epoch": 15.992745409204263, "grad_norm": 1.1953125, "learning_rate": 0.0002, "loss": 0.1712, "step": 8818 }, { "epoch": 15.994559056903196, "grad_norm": 1.2421875, "learning_rate": 0.0002, "loss": 0.2207, "step": 8819 }, { "epoch": 15.99637270460213, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.2261, "step": 8820 }, { "epoch": 15.998186352301065, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.1893, "step": 8821 }, { "epoch": 16.0, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1315, "step": 8822 }, { "epoch": 16.001813647698935, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0622, "step": 8823 }, { "epoch": 16.00362729539787, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0525, "step": 8824 }, { "epoch": 16.005440943096804, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.0521, "step": 8825 }, { "epoch": 16.00725459079574, "grad_norm": 0.326171875, "learning_rate": 0.0002, "loss": 0.0526, "step": 8826 }, { "epoch": 16.009068238494674, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0613, "step": 8827 }, { "epoch": 16.01088188619361, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.0524, "step": 8828 }, { "epoch": 16.01269553389254, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0562, "step": 8829 }, { "epoch": 16.014509181591475, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0518, "step": 8830 }, { "epoch": 16.01632282929041, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0613, "step": 8831 }, { "epoch": 16.018136476989344, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0533, "step": 8832 }, { "epoch": 16.01995012468828, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0533, "step": 8833 }, { "epoch": 16.021763772387214, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0616, "step": 8834 }, { "epoch": 16.02357742008615, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.0574, "step": 8835 }, { "epoch": 16.025391067785083, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0616, "step": 8836 }, { "epoch": 16.027204715484018, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.0544, "step": 8837 }, { "epoch": 16.029018363182953, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0562, "step": 8838 }, { "epoch": 16.030832010881888, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0571, "step": 8839 }, { "epoch": 16.032645658580822, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0562, "step": 8840 }, { "epoch": 16.034459306279754, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0578, "step": 8841 }, { "epoch": 16.03627295397869, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.059, "step": 8842 }, { "epoch": 16.038086601677623, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0594, "step": 8843 }, { "epoch": 16.039900249376558, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0575, "step": 8844 }, { "epoch": 16.041713897075493, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0747, "step": 8845 }, { "epoch": 16.043527544774427, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0614, "step": 8846 }, { "epoch": 16.045341192473362, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0581, "step": 8847 }, { "epoch": 16.047154840172297, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.0627, "step": 8848 }, { "epoch": 16.04896848787123, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0638, "step": 8849 }, { "epoch": 16.050782135570167, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0682, "step": 8850 }, { "epoch": 16.0525957832691, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0613, "step": 8851 }, { "epoch": 16.054409430968036, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0666, "step": 8852 }, { "epoch": 16.056223078666967, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0619, "step": 8853 }, { "epoch": 16.058036726365902, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0603, "step": 8854 }, { "epoch": 16.059850374064837, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0665, "step": 8855 }, { "epoch": 16.06166402176377, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0695, "step": 8856 }, { "epoch": 16.063477669462706, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0759, "step": 8857 }, { "epoch": 16.06529131716164, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0709, "step": 8858 }, { "epoch": 16.067104964860576, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0953, "step": 8859 }, { "epoch": 16.06891861255951, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0744, "step": 8860 }, { "epoch": 16.070732260258445, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0789, "step": 8861 }, { "epoch": 16.07254590795738, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0897, "step": 8862 }, { "epoch": 16.074359555656315, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.087, "step": 8863 }, { "epoch": 16.07617320335525, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.095, "step": 8864 }, { "epoch": 16.07798685105418, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1266, "step": 8865 }, { "epoch": 16.079800498753116, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.1002, "step": 8866 }, { "epoch": 16.08161414645205, "grad_norm": 1.078125, "learning_rate": 0.0002, "loss": 0.1293, "step": 8867 }, { "epoch": 16.083427794150985, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1217, "step": 8868 }, { "epoch": 16.08524144184992, "grad_norm": 0.83203125, "learning_rate": 0.0002, "loss": 0.1276, "step": 8869 }, { "epoch": 16.087055089548855, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1435, "step": 8870 }, { "epoch": 16.08886873724779, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.1505, "step": 8871 }, { "epoch": 16.090682384946724, "grad_norm": 1.0546875, "learning_rate": 0.0002, "loss": 0.2569, "step": 8872 }, { "epoch": 16.09249603264566, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.06, "step": 8873 }, { "epoch": 16.094309680344594, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0565, "step": 8874 }, { "epoch": 16.09612332804353, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0627, "step": 8875 }, { "epoch": 16.097936975742464, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.0574, "step": 8876 }, { "epoch": 16.099750623441395, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0671, "step": 8877 }, { "epoch": 16.10156427114033, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.054, "step": 8878 }, { "epoch": 16.103377918839264, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0565, "step": 8879 }, { "epoch": 16.1051915665382, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0591, "step": 8880 }, { "epoch": 16.107005214237134, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.0535, "step": 8881 }, { "epoch": 16.10881886193607, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0616, "step": 8882 }, { "epoch": 16.110632509635003, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0701, "step": 8883 }, { "epoch": 16.112446157333938, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0628, "step": 8884 }, { "epoch": 16.114259805032873, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0884, "step": 8885 }, { "epoch": 16.116073452731808, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0581, "step": 8886 }, { "epoch": 16.117887100430742, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0616, "step": 8887 }, { "epoch": 16.119700748129677, "grad_norm": 0.388671875, "learning_rate": 0.0002, "loss": 0.0541, "step": 8888 }, { "epoch": 16.121514395828612, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0682, "step": 8889 }, { "epoch": 16.123328043527543, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0645, "step": 8890 }, { "epoch": 16.125141691226478, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.06, "step": 8891 }, { "epoch": 16.126955338925413, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0601, "step": 8892 }, { "epoch": 16.128768986624348, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0633, "step": 8893 }, { "epoch": 16.130582634323282, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0646, "step": 8894 }, { "epoch": 16.132396282022217, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0749, "step": 8895 }, { "epoch": 16.134209929721152, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.067, "step": 8896 }, { "epoch": 16.136023577420087, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0709, "step": 8897 }, { "epoch": 16.13783722511902, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0608, "step": 8898 }, { "epoch": 16.139650872817956, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0635, "step": 8899 }, { "epoch": 16.14146452051689, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0707, "step": 8900 }, { "epoch": 16.143278168215826, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0645, "step": 8901 }, { "epoch": 16.145091815914757, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0739, "step": 8902 }, { "epoch": 16.14690546361369, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0768, "step": 8903 }, { "epoch": 16.148719111312626, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0618, "step": 8904 }, { "epoch": 16.15053275901156, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0739, "step": 8905 }, { "epoch": 16.152346406710496, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.079, "step": 8906 }, { "epoch": 16.15416005440943, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0746, "step": 8907 }, { "epoch": 16.155973702108366, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0757, "step": 8908 }, { "epoch": 16.1577873498073, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0904, "step": 8909 }, { "epoch": 16.159600997506235, "grad_norm": 0.94140625, "learning_rate": 0.0002, "loss": 0.0946, "step": 8910 }, { "epoch": 16.16141464520517, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0968, "step": 8911 }, { "epoch": 16.163228292904105, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.0995, "step": 8912 }, { "epoch": 16.16504194060304, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.0944, "step": 8913 }, { "epoch": 16.16685558830197, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0992, "step": 8914 }, { "epoch": 16.168669236000905, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1245, "step": 8915 }, { "epoch": 16.17048288369984, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1037, "step": 8916 }, { "epoch": 16.172296531398775, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1274, "step": 8917 }, { "epoch": 16.17411017909771, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.1153, "step": 8918 }, { "epoch": 16.175923826796645, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1428, "step": 8919 }, { "epoch": 16.17773747449558, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.1497, "step": 8920 }, { "epoch": 16.179551122194514, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1957, "step": 8921 }, { "epoch": 16.18136476989345, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.2421, "step": 8922 }, { "epoch": 16.183178417592384, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0665, "step": 8923 }, { "epoch": 16.18499206529132, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0631, "step": 8924 }, { "epoch": 16.186805712990253, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8925 }, { "epoch": 16.188619360689184, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0596, "step": 8926 }, { "epoch": 16.19043300838812, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.0572, "step": 8927 }, { "epoch": 16.192246656087054, "grad_norm": 0.365234375, "learning_rate": 0.0002, "loss": 0.056, "step": 8928 }, { "epoch": 16.19406030378599, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0611, "step": 8929 }, { "epoch": 16.195873951484923, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0581, "step": 8930 }, { "epoch": 16.19768759918386, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0575, "step": 8931 }, { "epoch": 16.199501246882793, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0611, "step": 8932 }, { "epoch": 16.201314894581728, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0591, "step": 8933 }, { "epoch": 16.203128542280663, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0627, "step": 8934 }, { "epoch": 16.204942189979597, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0704, "step": 8935 }, { "epoch": 16.206755837678532, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0699, "step": 8936 }, { "epoch": 16.208569485377467, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.0587, "step": 8937 }, { "epoch": 16.2103831330764, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0645, "step": 8938 }, { "epoch": 16.212196780775333, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.063, "step": 8939 }, { "epoch": 16.214010428474268, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0696, "step": 8940 }, { "epoch": 16.215824076173202, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0722, "step": 8941 }, { "epoch": 16.217637723872137, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0615, "step": 8942 }, { "epoch": 16.219451371571072, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.068, "step": 8943 }, { "epoch": 16.221265019270007, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0623, "step": 8944 }, { "epoch": 16.22307866696894, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0698, "step": 8945 }, { "epoch": 16.224892314667876, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.076, "step": 8946 }, { "epoch": 16.22670596236681, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0775, "step": 8947 }, { "epoch": 16.228519610065746, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0663, "step": 8948 }, { "epoch": 16.23033325776468, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0708, "step": 8949 }, { "epoch": 16.232146905463615, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0691, "step": 8950 }, { "epoch": 16.233960553162547, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.0906, "step": 8951 }, { "epoch": 16.23577420086148, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0744, "step": 8952 }, { "epoch": 16.237587848560416, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0708, "step": 8953 }, { "epoch": 16.23940149625935, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0754, "step": 8954 }, { "epoch": 16.241215143958286, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0832, "step": 8955 }, { "epoch": 16.24302879165722, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0783, "step": 8956 }, { "epoch": 16.244842439356155, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.084, "step": 8957 }, { "epoch": 16.24665608705509, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0881, "step": 8958 }, { "epoch": 16.248469734754025, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.0964, "step": 8959 }, { "epoch": 16.25028338245296, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0921, "step": 8960 }, { "epoch": 16.252097030151894, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0854, "step": 8961 }, { "epoch": 16.25391067785083, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0962, "step": 8962 }, { "epoch": 16.25572432554976, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.097, "step": 8963 }, { "epoch": 16.257537973248695, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1121, "step": 8964 }, { "epoch": 16.25935162094763, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.1004, "step": 8965 }, { "epoch": 16.261165268646565, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.1159, "step": 8966 }, { "epoch": 16.2629789163455, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1349, "step": 8967 }, { "epoch": 16.264792564044434, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1374, "step": 8968 }, { "epoch": 16.26660621174337, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.1336, "step": 8969 }, { "epoch": 16.268419859442304, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1415, "step": 8970 }, { "epoch": 16.27023350714124, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.2005, "step": 8971 }, { "epoch": 16.272047154840173, "grad_norm": 1.421875, "learning_rate": 0.0002, "loss": 0.2565, "step": 8972 }, { "epoch": 16.273860802539108, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0708, "step": 8973 }, { "epoch": 16.275674450238043, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0714, "step": 8974 }, { "epoch": 16.277488097936974, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0626, "step": 8975 }, { "epoch": 16.27930174563591, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.063, "step": 8976 }, { "epoch": 16.27930174563591, "eval_loss": 2.377936601638794, "eval_runtime": 184.5441, "eval_samples_per_second": 5.419, "eval_steps_per_second": 5.419, "step": 8976 }, { "epoch": 16.27930174563591, "mmlu_eval_accuracy": 0.29601878174768703, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.23076923076923078, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.40625, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.23255813953488372, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.36363636363636365, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.34782608695652173, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.39473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.22, "mmlu_eval_accuracy_nutrition": 0.21212121212121213, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.30434782608695654, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.3333333333333333, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2222222222222222, "mmlu_eval_accuracy_world_religions": 0.21052631578947367, "mmlu_loss": 3.3220934004350706, "step": 8976 }, { "epoch": 16.281115393334844, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0584, "step": 8977 }, { "epoch": 16.28292904103378, "grad_norm": 0.41796875, "learning_rate": 0.0002, "loss": 0.062, "step": 8978 }, { "epoch": 16.284742688732713, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0614, "step": 8979 }, { "epoch": 16.286556336431648, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0607, "step": 8980 }, { "epoch": 16.288369984130583, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0609, "step": 8981 }, { "epoch": 16.290183631829517, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0722, "step": 8982 }, { "epoch": 16.291997279528452, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0633, "step": 8983 }, { "epoch": 16.293810927227387, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0756, "step": 8984 }, { "epoch": 16.29562457492632, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0588, "step": 8985 }, { "epoch": 16.297438222625257, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0729, "step": 8986 }, { "epoch": 16.29925187032419, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0674, "step": 8987 }, { "epoch": 16.301065518023123, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0609, "step": 8988 }, { "epoch": 16.302879165722057, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0684, "step": 8989 }, { "epoch": 16.304692813420992, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0751, "step": 8990 }, { "epoch": 16.306506461119927, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0729, "step": 8991 }, { "epoch": 16.30832010881886, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0603, "step": 8992 }, { "epoch": 16.310133756517796, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0647, "step": 8993 }, { "epoch": 16.31194740421673, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0694, "step": 8994 }, { "epoch": 16.313761051915666, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0675, "step": 8995 }, { "epoch": 16.3155746996146, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0676, "step": 8996 }, { "epoch": 16.317388347313535, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0645, "step": 8997 }, { "epoch": 16.31920199501247, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.068, "step": 8998 }, { "epoch": 16.321015642711405, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0692, "step": 8999 }, { "epoch": 16.322829290410336, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0717, "step": 9000 }, { "epoch": 16.32464293810927, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0787, "step": 9001 }, { "epoch": 16.326456585808206, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0791, "step": 9002 }, { "epoch": 16.32827023350714, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.0826, "step": 9003 }, { "epoch": 16.330083881206075, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0721, "step": 9004 }, { "epoch": 16.33189752890501, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.097, "step": 9005 }, { "epoch": 16.333711176603945, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.0883, "step": 9006 }, { "epoch": 16.33552482430288, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0885, "step": 9007 }, { "epoch": 16.337338472001814, "grad_norm": 0.69921875, "learning_rate": 0.0002, "loss": 0.096, "step": 9008 }, { "epoch": 16.33915211970075, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0819, "step": 9009 }, { "epoch": 16.340965767399684, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0874, "step": 9010 }, { "epoch": 16.34277941509862, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.084, "step": 9011 }, { "epoch": 16.34459306279755, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0974, "step": 9012 }, { "epoch": 16.346406710496485, "grad_norm": 0.9140625, "learning_rate": 0.0002, "loss": 0.1163, "step": 9013 }, { "epoch": 16.34822035819542, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0909, "step": 9014 }, { "epoch": 16.350034005894354, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1169, "step": 9015 }, { "epoch": 16.35184765359329, "grad_norm": 0.88671875, "learning_rate": 0.0002, "loss": 0.1337, "step": 9016 }, { "epoch": 16.353661301292224, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1286, "step": 9017 }, { "epoch": 16.35547494899116, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.1043, "step": 9018 }, { "epoch": 16.357288596690093, "grad_norm": 0.7890625, "learning_rate": 0.0002, "loss": 0.1509, "step": 9019 }, { "epoch": 16.359102244389028, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.1446, "step": 9020 }, { "epoch": 16.360915892087963, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.1585, "step": 9021 }, { "epoch": 16.362729539786898, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.2189, "step": 9022 }, { "epoch": 16.364543187485832, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0678, "step": 9023 }, { "epoch": 16.366356835184764, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0613, "step": 9024 }, { "epoch": 16.3681704828837, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.073, "step": 9025 }, { "epoch": 16.369984130582633, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0626, "step": 9026 }, { "epoch": 16.371797778281568, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0589, "step": 9027 }, { "epoch": 16.373611425980503, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.062, "step": 9028 }, { "epoch": 16.375425073679438, "grad_norm": 0.40625, "learning_rate": 0.0002, "loss": 0.055, "step": 9029 }, { "epoch": 16.377238721378372, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0706, "step": 9030 }, { "epoch": 16.379052369077307, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0604, "step": 9031 }, { "epoch": 16.380866016776242, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0611, "step": 9032 }, { "epoch": 16.382679664475177, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0721, "step": 9033 }, { "epoch": 16.38449331217411, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0719, "step": 9034 }, { "epoch": 16.386306959873046, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0697, "step": 9035 }, { "epoch": 16.388120607571977, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0791, "step": 9036 }, { "epoch": 16.389934255270912, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0752, "step": 9037 }, { "epoch": 16.391747902969847, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.068, "step": 9038 }, { "epoch": 16.39356155066878, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0725, "step": 9039 }, { "epoch": 16.395375198367717, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0726, "step": 9040 }, { "epoch": 16.39718884606665, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0758, "step": 9041 }, { "epoch": 16.399002493765586, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0669, "step": 9042 }, { "epoch": 16.40081614146452, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0707, "step": 9043 }, { "epoch": 16.402629789163456, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0754, "step": 9044 }, { "epoch": 16.40444343686239, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0756, "step": 9045 }, { "epoch": 16.406257084561325, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0655, "step": 9046 }, { "epoch": 16.40807073226026, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0722, "step": 9047 }, { "epoch": 16.409884379959195, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0773, "step": 9048 }, { "epoch": 16.411698027658126, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0782, "step": 9049 }, { "epoch": 16.41351167535706, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0896, "step": 9050 }, { "epoch": 16.415325323055995, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0794, "step": 9051 }, { "epoch": 16.41713897075493, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0832, "step": 9052 }, { "epoch": 16.418952618453865, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.094, "step": 9053 }, { "epoch": 16.4207662661528, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0864, "step": 9054 }, { "epoch": 16.422579913851735, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0813, "step": 9055 }, { "epoch": 16.42439356155067, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0864, "step": 9056 }, { "epoch": 16.426207209249604, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0797, "step": 9057 }, { "epoch": 16.42802085694854, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1039, "step": 9058 }, { "epoch": 16.429834504647474, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0873, "step": 9059 }, { "epoch": 16.43164815234641, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1003, "step": 9060 }, { "epoch": 16.43346180004534, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1055, "step": 9061 }, { "epoch": 16.435275447744274, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1029, "step": 9062 }, { "epoch": 16.43708909544321, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1028, "step": 9063 }, { "epoch": 16.438902743142144, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1074, "step": 9064 }, { "epoch": 16.44071639084108, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.1104, "step": 9065 }, { "epoch": 16.442530038540013, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1176, "step": 9066 }, { "epoch": 16.44434368623895, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1307, "step": 9067 }, { "epoch": 16.446157333937883, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1391, "step": 9068 }, { "epoch": 16.447970981636818, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1411, "step": 9069 }, { "epoch": 16.449784629335753, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1524, "step": 9070 }, { "epoch": 16.451598277034687, "grad_norm": 1.3984375, "learning_rate": 0.0002, "loss": 0.2027, "step": 9071 }, { "epoch": 16.453411924733622, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.2634, "step": 9072 }, { "epoch": 16.455225572432553, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0691, "step": 9073 }, { "epoch": 16.457039220131488, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0715, "step": 9074 }, { "epoch": 16.458852867830423, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0635, "step": 9075 }, { "epoch": 16.460666515529358, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.067, "step": 9076 }, { "epoch": 16.462480163228292, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0624, "step": 9077 }, { "epoch": 16.464293810927227, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0674, "step": 9078 }, { "epoch": 16.466107458626162, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.0598, "step": 9079 }, { "epoch": 16.467921106325097, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0642, "step": 9080 }, { "epoch": 16.46973475402403, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.0653, "step": 9081 }, { "epoch": 16.471548401722966, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0651, "step": 9082 }, { "epoch": 16.4733620494219, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.071, "step": 9083 }, { "epoch": 16.475175697120836, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0765, "step": 9084 }, { "epoch": 16.476989344819767, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0706, "step": 9085 }, { "epoch": 16.478802992518702, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0801, "step": 9086 }, { "epoch": 16.480616640217637, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0603, "step": 9087 }, { "epoch": 16.48243028791657, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.063, "step": 9088 }, { "epoch": 16.484243935615506, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.066, "step": 9089 }, { "epoch": 16.48605758331444, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0669, "step": 9090 }, { "epoch": 16.487871231013376, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0733, "step": 9091 }, { "epoch": 16.48968487871231, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0651, "step": 9092 }, { "epoch": 16.491498526411245, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0794, "step": 9093 }, { "epoch": 16.49331217411018, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0768, "step": 9094 }, { "epoch": 16.495125821809115, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0717, "step": 9095 }, { "epoch": 16.49693946950805, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0726, "step": 9096 }, { "epoch": 16.49875311720698, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0738, "step": 9097 }, { "epoch": 16.500566764905916, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0736, "step": 9098 }, { "epoch": 16.50238041260485, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0813, "step": 9099 }, { "epoch": 16.504194060303785, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0751, "step": 9100 }, { "epoch": 16.50600770800272, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0693, "step": 9101 }, { "epoch": 16.507821355701655, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0775, "step": 9102 }, { "epoch": 16.50963500340059, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0733, "step": 9103 }, { "epoch": 16.511448651099524, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0747, "step": 9104 }, { "epoch": 16.51326229879846, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0808, "step": 9105 }, { "epoch": 16.515075946497394, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0882, "step": 9106 }, { "epoch": 16.51688959419633, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0768, "step": 9107 }, { "epoch": 16.518703241895263, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.0928, "step": 9108 }, { "epoch": 16.520516889594198, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0871, "step": 9109 }, { "epoch": 16.52233053729313, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0873, "step": 9110 }, { "epoch": 16.524144184992064, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0917, "step": 9111 }, { "epoch": 16.525957832691, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.0932, "step": 9112 }, { "epoch": 16.527771480389934, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0976, "step": 9113 }, { "epoch": 16.52958512808887, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1215, "step": 9114 }, { "epoch": 16.531398775787803, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1189, "step": 9115 }, { "epoch": 16.533212423486738, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1165, "step": 9116 }, { "epoch": 16.535026071185673, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1311, "step": 9117 }, { "epoch": 16.536839718884607, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1453, "step": 9118 }, { "epoch": 16.538653366583542, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.15, "step": 9119 }, { "epoch": 16.540467014282477, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1556, "step": 9120 }, { "epoch": 16.542280661981412, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1955, "step": 9121 }, { "epoch": 16.544094309680343, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.2873, "step": 9122 }, { "epoch": 16.545907957379278, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0748, "step": 9123 }, { "epoch": 16.547721605078213, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0683, "step": 9124 }, { "epoch": 16.549535252777147, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0644, "step": 9125 }, { "epoch": 16.551348900476082, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0659, "step": 9126 }, { "epoch": 16.553162548175017, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0605, "step": 9127 }, { "epoch": 16.55497619587395, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0634, "step": 9128 }, { "epoch": 16.556789843572886, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0629, "step": 9129 }, { "epoch": 16.55860349127182, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0726, "step": 9130 }, { "epoch": 16.560417138970756, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0605, "step": 9131 }, { "epoch": 16.56223078666969, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0702, "step": 9132 }, { "epoch": 16.564044434368626, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0803, "step": 9133 }, { "epoch": 16.565858082067557, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0702, "step": 9134 }, { "epoch": 16.56767172976649, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0779, "step": 9135 }, { "epoch": 16.569485377465426, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0622, "step": 9136 }, { "epoch": 16.57129902516436, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0642, "step": 9137 }, { "epoch": 16.573112672863296, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0768, "step": 9138 }, { "epoch": 16.57492632056223, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0662, "step": 9139 }, { "epoch": 16.576739968261165, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0689, "step": 9140 }, { "epoch": 16.5785536159601, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0767, "step": 9141 }, { "epoch": 16.580367263659035, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0649, "step": 9142 }, { "epoch": 16.58218091135797, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0688, "step": 9143 }, { "epoch": 16.583994559056904, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0745, "step": 9144 }, { "epoch": 16.58580820675584, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0787, "step": 9145 }, { "epoch": 16.58762185445477, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0773, "step": 9146 }, { "epoch": 16.589435502153705, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.064, "step": 9147 }, { "epoch": 16.59124914985264, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0806, "step": 9148 }, { "epoch": 16.593062797551575, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.076, "step": 9149 }, { "epoch": 16.59487644525051, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.0885, "step": 9150 }, { "epoch": 16.596690092949444, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.1183, "step": 9151 }, { "epoch": 16.59850374064838, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0787, "step": 9152 }, { "epoch": 16.600317388347314, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0878, "step": 9153 }, { "epoch": 16.60213103604625, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.083, "step": 9154 }, { "epoch": 16.603944683745183, "grad_norm": 0.78125, "learning_rate": 0.0002, "loss": 0.0943, "step": 9155 }, { "epoch": 16.605758331444118, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0826, "step": 9156 }, { "epoch": 16.607571979143053, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0953, "step": 9157 }, { "epoch": 16.609385626841984, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0799, "step": 9158 }, { "epoch": 16.61119927454092, "grad_norm": 0.6953125, "learning_rate": 0.0002, "loss": 0.093, "step": 9159 }, { "epoch": 16.613012922239854, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0927, "step": 9160 }, { "epoch": 16.61482656993879, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0957, "step": 9161 }, { "epoch": 16.616640217637723, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0938, "step": 9162 }, { "epoch": 16.618453865336658, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0998, "step": 9163 }, { "epoch": 16.618453865336658, "eval_loss": 2.371652364730835, "eval_runtime": 184.7197, "eval_samples_per_second": 5.414, "eval_steps_per_second": 5.414, "step": 9163 }, { "epoch": 16.618453865336658, "mmlu_eval_accuracy": 0.29977813870855935, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.20689655172413793, "mmlu_eval_accuracy_college_biology": 0.3125, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.09090909090909091, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.2, "mmlu_eval_accuracy_high_school_biology": 0.46875, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3888888888888889, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.0, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.31666666666666665, "mmlu_eval_accuracy_high_school_statistics": 0.17391304347826086, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.34615384615384615, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.16666666666666666, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, "mmlu_eval_accuracy_machine_learning": 0.18181818181818182, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.24242424242424243, "mmlu_eval_accuracy_philosophy": 0.3235294117647059, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.27058823529411763, "mmlu_eval_accuracy_professional_medicine": 0.3548387096774194, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.4090909090909091, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.2777777777777778, "mmlu_eval_accuracy_world_religions": 0.2631578947368421, "mmlu_loss": 2.946527149710758, "step": 9163 }, { "epoch": 16.620267513035593, "grad_norm": 0.9609375, "learning_rate": 0.0002, "loss": 0.137, "step": 9164 }, { "epoch": 16.622081160734528, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.1038, "step": 9165 }, { "epoch": 16.623894808433462, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1235, "step": 9166 }, { "epoch": 16.625708456132397, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1372, "step": 9167 }, { "epoch": 16.627522103831332, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.1307, "step": 9168 }, { "epoch": 16.629335751530267, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1599, "step": 9169 }, { "epoch": 16.6311493992292, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1442, "step": 9170 }, { "epoch": 16.632963046928133, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1606, "step": 9171 }, { "epoch": 16.634776694627067, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.2471, "step": 9172 }, { "epoch": 16.636590342326002, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0669, "step": 9173 }, { "epoch": 16.638403990024937, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0708, "step": 9174 }, { "epoch": 16.64021763772387, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0665, "step": 9175 }, { "epoch": 16.642031285422807, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0695, "step": 9176 }, { "epoch": 16.64384493312174, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.064, "step": 9177 }, { "epoch": 16.645658580820676, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0601, "step": 9178 }, { "epoch": 16.64747222851961, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0683, "step": 9179 }, { "epoch": 16.649285876218546, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0676, "step": 9180 }, { "epoch": 16.65109952391748, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0638, "step": 9181 }, { "epoch": 16.652913171616415, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0667, "step": 9182 }, { "epoch": 16.654726819315346, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.07, "step": 9183 }, { "epoch": 16.65654046701428, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0694, "step": 9184 }, { "epoch": 16.658354114713216, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0727, "step": 9185 }, { "epoch": 16.66016776241215, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0748, "step": 9186 }, { "epoch": 16.661981410111085, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0664, "step": 9187 }, { "epoch": 16.66379505781002, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0761, "step": 9188 }, { "epoch": 16.665608705508955, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0703, "step": 9189 }, { "epoch": 16.66742235320789, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0681, "step": 9190 }, { "epoch": 16.669236000906825, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.073, "step": 9191 }, { "epoch": 16.67104964860576, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0791, "step": 9192 }, { "epoch": 16.672863296304694, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0707, "step": 9193 }, { "epoch": 16.67467694400363, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0793, "step": 9194 }, { "epoch": 16.67649059170256, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0749, "step": 9195 }, { "epoch": 16.678304239401495, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0781, "step": 9196 }, { "epoch": 16.68011788710043, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0819, "step": 9197 }, { "epoch": 16.681931534799364, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0862, "step": 9198 }, { "epoch": 16.6837451824983, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.078, "step": 9199 }, { "epoch": 16.685558830197234, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0804, "step": 9200 }, { "epoch": 16.68737247789617, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0804, "step": 9201 }, { "epoch": 16.689186125595104, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0923, "step": 9202 }, { "epoch": 16.69099977329404, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0775, "step": 9203 }, { "epoch": 16.692813420992973, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0916, "step": 9204 }, { "epoch": 16.694627068691908, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.093, "step": 9205 }, { "epoch": 16.696440716390843, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.0926, "step": 9206 }, { "epoch": 16.698254364089777, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0877, "step": 9207 }, { "epoch": 16.70006801178871, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0812, "step": 9208 }, { "epoch": 16.701881659487643, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0852, "step": 9209 }, { "epoch": 16.703695307186578, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0933, "step": 9210 }, { "epoch": 16.705508954885513, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0938, "step": 9211 }, { "epoch": 16.707322602584448, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1014, "step": 9212 }, { "epoch": 16.709136250283382, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1415, "step": 9213 }, { "epoch": 16.710949897982317, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.1108, "step": 9214 }, { "epoch": 16.712763545681252, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.103, "step": 9215 }, { "epoch": 16.714577193380187, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1245, "step": 9216 }, { "epoch": 16.71639084107912, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1301, "step": 9217 }, { "epoch": 16.718204488778056, "grad_norm": 0.90234375, "learning_rate": 0.0002, "loss": 0.1383, "step": 9218 }, { "epoch": 16.720018136476988, "grad_norm": 0.9453125, "learning_rate": 0.0002, "loss": 0.1509, "step": 9219 }, { "epoch": 16.721831784175922, "grad_norm": 0.87109375, "learning_rate": 0.0002, "loss": 0.1654, "step": 9220 }, { "epoch": 16.723645431874857, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1835, "step": 9221 }, { "epoch": 16.725459079573792, "grad_norm": 1.1796875, "learning_rate": 0.0002, "loss": 0.2389, "step": 9222 }, { "epoch": 16.727272727272727, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.076, "step": 9223 }, { "epoch": 16.72908637497166, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0679, "step": 9224 }, { "epoch": 16.730900022670596, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0694, "step": 9225 }, { "epoch": 16.73271367036953, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.069, "step": 9226 }, { "epoch": 16.734527318068466, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0761, "step": 9227 }, { "epoch": 16.7363409657674, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0684, "step": 9228 }, { "epoch": 16.738154613466335, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0741, "step": 9229 }, { "epoch": 16.73996826116527, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0778, "step": 9230 }, { "epoch": 16.741781908864205, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0683, "step": 9231 }, { "epoch": 16.743595556563136, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0755, "step": 9232 }, { "epoch": 16.74540920426207, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0821, "step": 9233 }, { "epoch": 16.747222851961006, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0726, "step": 9234 }, { "epoch": 16.74903649965994, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.067, "step": 9235 }, { "epoch": 16.750850147358875, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0783, "step": 9236 }, { "epoch": 16.75266379505781, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0672, "step": 9237 }, { "epoch": 16.754477442756745, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0699, "step": 9238 }, { "epoch": 16.75629109045568, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0689, "step": 9239 }, { "epoch": 16.758104738154614, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0726, "step": 9240 }, { "epoch": 16.75991838585355, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0701, "step": 9241 }, { "epoch": 16.761732033552484, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0763, "step": 9242 }, { "epoch": 16.76354568125142, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0798, "step": 9243 }, { "epoch": 16.76535932895035, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0739, "step": 9244 }, { "epoch": 16.767172976649285, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0831, "step": 9245 }, { "epoch": 16.76898662434822, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0801, "step": 9246 }, { "epoch": 16.770800272047154, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0778, "step": 9247 }, { "epoch": 16.77261391974609, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0738, "step": 9248 }, { "epoch": 16.774427567445024, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.081, "step": 9249 }, { "epoch": 16.77624121514396, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0839, "step": 9250 }, { "epoch": 16.778054862842893, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0817, "step": 9251 }, { "epoch": 16.779868510541828, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0712, "step": 9252 }, { "epoch": 16.781682158240763, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0723, "step": 9253 }, { "epoch": 16.783495805939697, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0767, "step": 9254 }, { "epoch": 16.785309453638632, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0783, "step": 9255 }, { "epoch": 16.787123101337563, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0943, "step": 9256 }, { "epoch": 16.7889367490365, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0823, "step": 9257 }, { "epoch": 16.790750396735433, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0811, "step": 9258 }, { "epoch": 16.792564044434368, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0926, "step": 9259 }, { "epoch": 16.794377692133303, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.106, "step": 9260 }, { "epoch": 16.796191339832237, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1016, "step": 9261 }, { "epoch": 16.798004987531172, "grad_norm": 0.73828125, "learning_rate": 0.0002, "loss": 0.1, "step": 9262 }, { "epoch": 16.799818635230107, "grad_norm": 0.78515625, "learning_rate": 0.0002, "loss": 0.1073, "step": 9263 }, { "epoch": 16.80163228292904, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1221, "step": 9264 }, { "epoch": 16.803445930627976, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1109, "step": 9265 }, { "epoch": 16.80525957832691, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0993, "step": 9266 }, { "epoch": 16.807073226025846, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1221, "step": 9267 }, { "epoch": 16.80888687372478, "grad_norm": 0.91015625, "learning_rate": 0.0002, "loss": 0.1285, "step": 9268 }, { "epoch": 16.810700521423712, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.1424, "step": 9269 }, { "epoch": 16.812514169122647, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.169, "step": 9270 }, { "epoch": 16.81432781682158, "grad_norm": 0.91796875, "learning_rate": 0.0002, "loss": 0.1888, "step": 9271 }, { "epoch": 16.816141464520516, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.2361, "step": 9272 }, { "epoch": 16.81795511221945, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0856, "step": 9273 }, { "epoch": 16.819768759918386, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0665, "step": 9274 }, { "epoch": 16.82158240761732, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0715, "step": 9275 }, { "epoch": 16.823396055316255, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0689, "step": 9276 }, { "epoch": 16.82520970301519, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0703, "step": 9277 }, { "epoch": 16.827023350714125, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.074, "step": 9278 }, { "epoch": 16.82883699841306, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.068, "step": 9279 }, { "epoch": 16.830650646111994, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0711, "step": 9280 }, { "epoch": 16.832464293810926, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0752, "step": 9281 }, { "epoch": 16.83427794150986, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0629, "step": 9282 }, { "epoch": 16.836091589208795, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.0664, "step": 9283 }, { "epoch": 16.83790523690773, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0887, "step": 9284 }, { "epoch": 16.839718884606665, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0788, "step": 9285 }, { "epoch": 16.8415325323056, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0812, "step": 9286 }, { "epoch": 16.843346180004534, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0755, "step": 9287 }, { "epoch": 16.84515982770347, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0628, "step": 9288 }, { "epoch": 16.846973475402404, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0783, "step": 9289 }, { "epoch": 16.84878712310134, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0638, "step": 9290 }, { "epoch": 16.850600770800273, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0699, "step": 9291 }, { "epoch": 16.852414418499208, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0733, "step": 9292 }, { "epoch": 16.85422806619814, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0783, "step": 9293 }, { "epoch": 16.856041713897074, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0731, "step": 9294 }, { "epoch": 16.85785536159601, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.082, "step": 9295 }, { "epoch": 16.859669009294944, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0703, "step": 9296 }, { "epoch": 16.86148265699388, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0814, "step": 9297 }, { "epoch": 16.863296304692813, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0778, "step": 9298 }, { "epoch": 16.865109952391748, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.0814, "step": 9299 }, { "epoch": 16.866923600090683, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0849, "step": 9300 }, { "epoch": 16.868737247789618, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0969, "step": 9301 }, { "epoch": 16.870550895488552, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.086, "step": 9302 }, { "epoch": 16.872364543187487, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0835, "step": 9303 }, { "epoch": 16.874178190886422, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1136, "step": 9304 }, { "epoch": 16.875991838585353, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.0884, "step": 9305 }, { "epoch": 16.877805486284288, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.0951, "step": 9306 }, { "epoch": 16.879619133983223, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.092, "step": 9307 }, { "epoch": 16.881432781682157, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.0946, "step": 9308 }, { "epoch": 16.883246429381092, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1011, "step": 9309 }, { "epoch": 16.885060077080027, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1074, "step": 9310 }, { "epoch": 16.88687372477896, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.1048, "step": 9311 }, { "epoch": 16.888687372477897, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1137, "step": 9312 }, { "epoch": 16.89050102017683, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1174, "step": 9313 }, { "epoch": 16.892314667875766, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1214, "step": 9314 }, { "epoch": 16.8941283155747, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1284, "step": 9315 }, { "epoch": 16.895941963273636, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1203, "step": 9316 }, { "epoch": 16.897755610972567, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1159, "step": 9317 }, { "epoch": 16.8995692586715, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1355, "step": 9318 }, { "epoch": 16.901382906370436, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1661, "step": 9319 }, { "epoch": 16.90319655406937, "grad_norm": 1.546875, "learning_rate": 0.0002, "loss": 0.2063, "step": 9320 }, { "epoch": 16.905010201768306, "grad_norm": 0.89453125, "learning_rate": 0.0002, "loss": 0.2034, "step": 9321 }, { "epoch": 16.90682384946724, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.2694, "step": 9322 }, { "epoch": 16.908637497166175, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0715, "step": 9323 }, { "epoch": 16.91045114486511, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0726, "step": 9324 }, { "epoch": 16.912264792564045, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.0692, "step": 9325 }, { "epoch": 16.91407844026298, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0784, "step": 9326 }, { "epoch": 16.915892087961915, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0743, "step": 9327 }, { "epoch": 16.91770573566085, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0721, "step": 9328 }, { "epoch": 16.919519383359784, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0811, "step": 9329 }, { "epoch": 16.921333031058715, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.075, "step": 9330 }, { "epoch": 16.92314667875765, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0764, "step": 9331 }, { "epoch": 16.924960326456585, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0746, "step": 9332 }, { "epoch": 16.92677397415552, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0717, "step": 9333 }, { "epoch": 16.928587621854454, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0758, "step": 9334 }, { "epoch": 16.93040126955339, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0738, "step": 9335 }, { "epoch": 16.932214917252324, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0788, "step": 9336 }, { "epoch": 16.93402856495126, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0699, "step": 9337 }, { "epoch": 16.935842212650194, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0773, "step": 9338 }, { "epoch": 16.93765586034913, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0747, "step": 9339 }, { "epoch": 16.939469508048063, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0856, "step": 9340 }, { "epoch": 16.941283155746998, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0732, "step": 9341 }, { "epoch": 16.94309680344593, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0734, "step": 9342 }, { "epoch": 16.944910451144864, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0761, "step": 9343 }, { "epoch": 16.9467240988438, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0748, "step": 9344 }, { "epoch": 16.948537746542733, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0809, "step": 9345 }, { "epoch": 16.950351394241668, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0874, "step": 9346 }, { "epoch": 16.952165041940603, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0794, "step": 9347 }, { "epoch": 16.953978689639538, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0769, "step": 9348 }, { "epoch": 16.955792337338472, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1083, "step": 9349 }, { "epoch": 16.957605985037407, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0938, "step": 9350 }, { "epoch": 16.957605985037407, "eval_loss": 2.396846294403076, "eval_runtime": 186.2824, "eval_samples_per_second": 5.368, "eval_steps_per_second": 5.368, "step": 9350 }, { "epoch": 16.957605985037407, "mmlu_eval_accuracy": 0.30818208291185356, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.2413793103448276, "mmlu_eval_accuracy_college_biology": 0.4375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.18181818181818182, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.2558139534883721, "mmlu_eval_accuracy_high_school_mathematics": 0.10344827586206896, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.3333333333333333, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.3076923076923077, "mmlu_eval_accuracy_human_aging": 0.391304347826087, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.5384615384615384, "mmlu_eval_accuracy_jurisprudence": 0.36363636363636365, "mmlu_eval_accuracy_logical_fallacies": 0.2777777777777778, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.5454545454545454, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.30303030303030304, "mmlu_eval_accuracy_philosophy": 0.4411764705882353, "mmlu_eval_accuracy_prehistory": 0.45714285714285713, "mmlu_eval_accuracy_professional_accounting": 0.2903225806451613, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.34782608695652173, "mmlu_eval_accuracy_public_relations": 0.25, "mmlu_eval_accuracy_security_studies": 0.25925925925925924, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.09090909090909091, "mmlu_eval_accuracy_virology": 0.16666666666666666, "mmlu_eval_accuracy_world_religions": 0.3157894736842105, "mmlu_loss": 3.196023402203928, "step": 9350 }, { "epoch": 16.959419632736342, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0817, "step": 9351 }, { "epoch": 16.961233280435277, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0892, "step": 9352 }, { "epoch": 16.96304692813421, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.0988, "step": 9353 }, { "epoch": 16.964860575833143, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0922, "step": 9354 }, { "epoch": 16.966674223532078, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0869, "step": 9355 }, { "epoch": 16.968487871231012, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0836, "step": 9356 }, { "epoch": 16.970301518929947, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.094, "step": 9357 }, { "epoch": 16.972115166628882, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1014, "step": 9358 }, { "epoch": 16.973928814327817, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1001, "step": 9359 }, { "epoch": 16.97574246202675, "grad_norm": 0.84375, "learning_rate": 0.0002, "loss": 0.1092, "step": 9360 }, { "epoch": 16.977556109725686, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.101, "step": 9361 }, { "epoch": 16.97936975742462, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.1036, "step": 9362 }, { "epoch": 16.981183405123556, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.1058, "step": 9363 }, { "epoch": 16.98299705282249, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1237, "step": 9364 }, { "epoch": 16.984810700521425, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1101, "step": 9365 }, { "epoch": 16.986624348220356, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1213, "step": 9366 }, { "epoch": 16.98843799591929, "grad_norm": 1.0390625, "learning_rate": 0.0002, "loss": 0.1451, "step": 9367 }, { "epoch": 16.990251643618226, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1283, "step": 9368 }, { "epoch": 16.99206529131716, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.1648, "step": 9369 }, { "epoch": 16.993878939016096, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.1615, "step": 9370 }, { "epoch": 16.99569258671503, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.2077, "step": 9371 }, { "epoch": 16.997506234413965, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.241, "step": 9372 }, { "epoch": 16.9993198821129, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.0964, "step": 9373 }, { "epoch": 17.001133529811835, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.1222, "step": 9374 }, { "epoch": 17.00294717751077, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.0516, "step": 9375 }, { "epoch": 17.004760825209704, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.0554, "step": 9376 }, { "epoch": 17.00657447290864, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.051, "step": 9377 }, { "epoch": 17.00838812060757, "grad_norm": 0.36328125, "learning_rate": 0.0002, "loss": 0.0483, "step": 9378 }, { "epoch": 17.010201768306505, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0561, "step": 9379 }, { "epoch": 17.01201541600544, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0556, "step": 9380 }, { "epoch": 17.013829063704375, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 0.0519, "step": 9381 }, { "epoch": 17.01564271140331, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.0558, "step": 9382 }, { "epoch": 17.017456359102244, "grad_norm": 0.375, "learning_rate": 0.0002, "loss": 0.0512, "step": 9383 }, { "epoch": 17.01927000680118, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0652, "step": 9384 }, { "epoch": 17.021083654500114, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0536, "step": 9385 }, { "epoch": 17.02289730219905, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0691, "step": 9386 }, { "epoch": 17.024710949897983, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0696, "step": 9387 }, { "epoch": 17.026524597596918, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0652, "step": 9388 }, { "epoch": 17.028338245295853, "grad_norm": 0.38671875, "learning_rate": 0.0002, "loss": 0.0601, "step": 9389 }, { "epoch": 17.030151892994787, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0591, "step": 9390 }, { "epoch": 17.03196554069372, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0522, "step": 9391 }, { "epoch": 17.033779188392653, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0544, "step": 9392 }, { "epoch": 17.03559283609159, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.066, "step": 9393 }, { "epoch": 17.037406483790523, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.056, "step": 9394 }, { "epoch": 17.039220131489458, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.059, "step": 9395 }, { "epoch": 17.041033779188393, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.067, "step": 9396 }, { "epoch": 17.042847426887327, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0666, "step": 9397 }, { "epoch": 17.044661074586262, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0622, "step": 9398 }, { "epoch": 17.046474722285197, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.067, "step": 9399 }, { "epoch": 17.04828836998413, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0718, "step": 9400 }, { "epoch": 17.050102017683066, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.082, "step": 9401 }, { "epoch": 17.051915665382, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0674, "step": 9402 }, { "epoch": 17.053729313080932, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0694, "step": 9403 }, { "epoch": 17.055542960779867, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0717, "step": 9404 }, { "epoch": 17.057356608478802, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0641, "step": 9405 }, { "epoch": 17.059170256177737, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.063, "step": 9406 }, { "epoch": 17.06098390387667, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0687, "step": 9407 }, { "epoch": 17.062797551575606, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0736, "step": 9408 }, { "epoch": 17.06461119927454, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0869, "step": 9409 }, { "epoch": 17.066424846973476, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.081, "step": 9410 }, { "epoch": 17.06823849467241, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0837, "step": 9411 }, { "epoch": 17.070052142371345, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0899, "step": 9412 }, { "epoch": 17.07186579007028, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0849, "step": 9413 }, { "epoch": 17.073679437769215, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.0891, "step": 9414 }, { "epoch": 17.075493085468146, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0936, "step": 9415 }, { "epoch": 17.07730673316708, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1007, "step": 9416 }, { "epoch": 17.079120380866016, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0948, "step": 9417 }, { "epoch": 17.08093402856495, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1228, "step": 9418 }, { "epoch": 17.082747676263885, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1048, "step": 9419 }, { "epoch": 17.08456132396282, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1753, "step": 9420 }, { "epoch": 17.086374971661755, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1512, "step": 9421 }, { "epoch": 17.08818861936069, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1565, "step": 9422 }, { "epoch": 17.090002267059624, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1909, "step": 9423 }, { "epoch": 17.09181591475856, "grad_norm": 1.2265625, "learning_rate": 0.0002, "loss": 0.1422, "step": 9424 }, { "epoch": 17.093629562457494, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0627, "step": 9425 }, { "epoch": 17.09544321015643, "grad_norm": 0.3984375, "learning_rate": 0.0002, "loss": 0.0548, "step": 9426 }, { "epoch": 17.09725685785536, "grad_norm": 0.37890625, "learning_rate": 0.0002, "loss": 0.0486, "step": 9427 }, { "epoch": 17.099070505554295, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0539, "step": 9428 }, { "epoch": 17.10088415325323, "grad_norm": 0.3828125, "learning_rate": 0.0002, "loss": 0.0599, "step": 9429 }, { "epoch": 17.102697800952164, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.0557, "step": 9430 }, { "epoch": 17.1045114486511, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0598, "step": 9431 }, { "epoch": 17.106325096350034, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0578, "step": 9432 }, { "epoch": 17.10813874404897, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0621, "step": 9433 }, { "epoch": 17.109952391747903, "grad_norm": 0.373046875, "learning_rate": 0.0002, "loss": 0.0546, "step": 9434 }, { "epoch": 17.111766039446838, "grad_norm": 0.34765625, "learning_rate": 0.0002, "loss": 0.0556, "step": 9435 }, { "epoch": 17.113579687145773, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0689, "step": 9436 }, { "epoch": 17.115393334844708, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0559, "step": 9437 }, { "epoch": 17.117206982543642, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0679, "step": 9438 }, { "epoch": 17.119020630242577, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0581, "step": 9439 }, { "epoch": 17.12083427794151, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0614, "step": 9440 }, { "epoch": 17.122647925640443, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0627, "step": 9441 }, { "epoch": 17.124461573339378, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0568, "step": 9442 }, { "epoch": 17.126275221038313, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0633, "step": 9443 }, { "epoch": 17.128088868737247, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0617, "step": 9444 }, { "epoch": 17.129902516436182, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.062, "step": 9445 }, { "epoch": 17.131716164135117, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0625, "step": 9446 }, { "epoch": 17.13352981183405, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0691, "step": 9447 }, { "epoch": 17.135343459532987, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0743, "step": 9448 }, { "epoch": 17.13715710723192, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0645, "step": 9449 }, { "epoch": 17.138970754930856, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0609, "step": 9450 }, { "epoch": 17.14078440262979, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0669, "step": 9451 }, { "epoch": 17.142598050328722, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0695, "step": 9452 }, { "epoch": 17.144411698027657, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0664, "step": 9453 }, { "epoch": 17.14622534572659, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0776, "step": 9454 }, { "epoch": 17.148038993425526, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0759, "step": 9455 }, { "epoch": 17.14985264112446, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0837, "step": 9456 }, { "epoch": 17.151666288823396, "grad_norm": 0.93359375, "learning_rate": 0.0002, "loss": 0.102, "step": 9457 }, { "epoch": 17.15347993652233, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0825, "step": 9458 }, { "epoch": 17.155293584221265, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.0961, "step": 9459 }, { "epoch": 17.1571072319202, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0761, "step": 9460 }, { "epoch": 17.158920879619135, "grad_norm": 0.85546875, "learning_rate": 0.0002, "loss": 0.0907, "step": 9461 }, { "epoch": 17.16073452731807, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.086, "step": 9462 }, { "epoch": 17.162548175017005, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.104, "step": 9463 }, { "epoch": 17.164361822715936, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.103, "step": 9464 }, { "epoch": 17.16617547041487, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0974, "step": 9465 }, { "epoch": 17.167989118113805, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1189, "step": 9466 }, { "epoch": 17.16980276581274, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.1236, "step": 9467 }, { "epoch": 17.171616413511675, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1235, "step": 9468 }, { "epoch": 17.17343006121061, "grad_norm": 0.9296875, "learning_rate": 0.0002, "loss": 0.1448, "step": 9469 }, { "epoch": 17.175243708909544, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.1298, "step": 9470 }, { "epoch": 17.17705735660848, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1405, "step": 9471 }, { "epoch": 17.178871004307414, "grad_norm": 0.435546875, "learning_rate": 0.0002, "loss": 0.147, "step": 9472 }, { "epoch": 17.18068465200635, "grad_norm": 0.98828125, "learning_rate": 0.0002, "loss": 0.1988, "step": 9473 }, { "epoch": 17.182498299705284, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1788, "step": 9474 }, { "epoch": 17.18431194740422, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0741, "step": 9475 }, { "epoch": 17.18612559510315, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0609, "step": 9476 }, { "epoch": 17.187939242802084, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.0713, "step": 9477 }, { "epoch": 17.18975289050102, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0598, "step": 9478 }, { "epoch": 17.191566538199954, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0553, "step": 9479 }, { "epoch": 17.19338018589889, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.065, "step": 9480 }, { "epoch": 17.195193833597823, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0585, "step": 9481 }, { "epoch": 17.197007481296758, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0638, "step": 9482 }, { "epoch": 17.198821128995693, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.064, "step": 9483 }, { "epoch": 17.200634776694628, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0562, "step": 9484 }, { "epoch": 17.202448424393562, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.063, "step": 9485 }, { "epoch": 17.204262072092497, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0612, "step": 9486 }, { "epoch": 17.206075719791432, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0625, "step": 9487 }, { "epoch": 17.207889367490363, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0641, "step": 9488 }, { "epoch": 17.209703015189298, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0604, "step": 9489 }, { "epoch": 17.211516662888233, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0633, "step": 9490 }, { "epoch": 17.213330310587168, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0632, "step": 9491 }, { "epoch": 17.215143958286102, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0682, "step": 9492 }, { "epoch": 17.216957605985037, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0764, "step": 9493 }, { "epoch": 17.218771253683972, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0632, "step": 9494 }, { "epoch": 17.220584901382907, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0739, "step": 9495 }, { "epoch": 17.22239854908184, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0698, "step": 9496 }, { "epoch": 17.224212196780776, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0653, "step": 9497 }, { "epoch": 17.22602584447971, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0671, "step": 9498 }, { "epoch": 17.227839492178646, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0734, "step": 9499 }, { "epoch": 17.22965313987758, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0663, "step": 9500 }, { "epoch": 17.23146678757651, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0752, "step": 9501 }, { "epoch": 17.233280435275447, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0712, "step": 9502 }, { "epoch": 17.23509408297438, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0849, "step": 9503 }, { "epoch": 17.236907730673316, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0725, "step": 9504 }, { "epoch": 17.23872137837225, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0824, "step": 9505 }, { "epoch": 17.240535026071186, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0802, "step": 9506 }, { "epoch": 17.24234867377012, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0769, "step": 9507 }, { "epoch": 17.244162321469055, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0778, "step": 9508 }, { "epoch": 17.24597596916799, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0749, "step": 9509 }, { "epoch": 17.247789616866925, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.099, "step": 9510 }, { "epoch": 17.24960326456586, "grad_norm": 0.765625, "learning_rate": 0.0002, "loss": 0.0944, "step": 9511 }, { "epoch": 17.251416912264794, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.097, "step": 9512 }, { "epoch": 17.253230559963725, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0833, "step": 9513 }, { "epoch": 17.25504420766266, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1021, "step": 9514 }, { "epoch": 17.256857855361595, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0893, "step": 9515 }, { "epoch": 17.25867150306053, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1431, "step": 9516 }, { "epoch": 17.260485150759465, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1075, "step": 9517 }, { "epoch": 17.2622987984584, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.1658, "step": 9518 }, { "epoch": 17.264112446157334, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.119, "step": 9519 }, { "epoch": 17.26592609385627, "grad_norm": 0.8828125, "learning_rate": 0.0002, "loss": 0.1632, "step": 9520 }, { "epoch": 17.267739741555204, "grad_norm": 0.98046875, "learning_rate": 0.0002, "loss": 0.1552, "step": 9521 }, { "epoch": 17.26955338925414, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.182, "step": 9522 }, { "epoch": 17.271367036953073, "grad_norm": 0.83984375, "learning_rate": 0.0002, "loss": 0.1764, "step": 9523 }, { "epoch": 17.273180684652008, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1234, "step": 9524 }, { "epoch": 17.27499433235094, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.064, "step": 9525 }, { "epoch": 17.276807980049874, "grad_norm": 0.48828125, "learning_rate": 0.0002, "loss": 0.0667, "step": 9526 }, { "epoch": 17.27862162774881, "grad_norm": 0.380859375, "learning_rate": 0.0002, "loss": 0.0581, "step": 9527 }, { "epoch": 17.280435275447743, "grad_norm": 0.361328125, "learning_rate": 0.0002, "loss": 0.0527, "step": 9528 }, { "epoch": 17.28224892314668, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 0.058, "step": 9529 }, { "epoch": 17.284062570845613, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0702, "step": 9530 }, { "epoch": 17.285876218544548, "grad_norm": 0.390625, "learning_rate": 0.0002, "loss": 0.0555, "step": 9531 }, { "epoch": 17.287689866243483, "grad_norm": 0.376953125, "learning_rate": 0.0002, "loss": 0.0587, "step": 9532 }, { "epoch": 17.289503513942417, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0633, "step": 9533 }, { "epoch": 17.291317161641352, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0696, "step": 9534 }, { "epoch": 17.293130809340287, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0655, "step": 9535 }, { "epoch": 17.29494445703922, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0645, "step": 9536 }, { "epoch": 17.296758104738153, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.063, "step": 9537 }, { "epoch": 17.296758104738153, "eval_loss": 2.413344144821167, "eval_runtime": 186.1591, "eval_samples_per_second": 5.372, "eval_steps_per_second": 5.372, "step": 9537 }, { "epoch": 17.296758104738153, "mmlu_eval_accuracy": 0.3206296715258152, "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365, "mmlu_eval_accuracy_anatomy": 0.42857142857142855, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.22727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077, "mmlu_eval_accuracy_econometrics": 0.16666666666666666, "mmlu_eval_accuracy_electrical_engineering": 0.375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.5, "mmlu_eval_accuracy_high_school_chemistry": 0.2727272727272727, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, "mmlu_eval_accuracy_high_school_microeconomics": 0.23076923076923078, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, "mmlu_eval_accuracy_high_school_statistics": 0.30434782608695654, "mmlu_eval_accuracy_high_school_us_history": 0.22727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.2692307692307692, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.25, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.18181818181818182, "mmlu_eval_accuracy_logical_fallacies": 0.3888888888888889, "mmlu_eval_accuracy_machine_learning": 0.36363636363636365, "mmlu_eval_accuracy_management": 0.36363636363636365, "mmlu_eval_accuracy_marketing": 0.52, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.38372093023255816, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.31, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.4117647058823529, "mmlu_eval_accuracy_prehistory": 0.4, "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194, "mmlu_eval_accuracy_professional_law": 0.23529411764705882, "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613, "mmlu_eval_accuracy_professional_psychology": 0.36231884057971014, "mmlu_eval_accuracy_public_relations": 0.3333333333333333, "mmlu_eval_accuracy_security_studies": 0.2962962962962963, "mmlu_eval_accuracy_sociology": 0.36363636363636365, "mmlu_eval_accuracy_us_foreign_policy": 0.18181818181818182, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.3684210526315789, "mmlu_loss": 2.649376418127646, "step": 9537 }, { "epoch": 17.298571752437088, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0564, "step": 9538 }, { "epoch": 17.300385400136022, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0649, "step": 9539 }, { "epoch": 17.302199047834957, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0617, "step": 9540 }, { "epoch": 17.304012695533892, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0718, "step": 9541 }, { "epoch": 17.305826343232827, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0702, "step": 9542 }, { "epoch": 17.30763999093176, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0649, "step": 9543 }, { "epoch": 17.309453638630696, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.0531, "step": 9544 }, { "epoch": 17.31126728632963, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.064, "step": 9545 }, { "epoch": 17.313080934028566, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0636, "step": 9546 }, { "epoch": 17.3148945817275, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0642, "step": 9547 }, { "epoch": 17.316708229426435, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0647, "step": 9548 }, { "epoch": 17.31852187712537, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0704, "step": 9549 }, { "epoch": 17.3203355248243, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0675, "step": 9550 }, { "epoch": 17.322149172523236, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0813, "step": 9551 }, { "epoch": 17.32396282022217, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0771, "step": 9552 }, { "epoch": 17.325776467921106, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0699, "step": 9553 }, { "epoch": 17.32759011562004, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0752, "step": 9554 }, { "epoch": 17.329403763318975, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.0788, "step": 9555 }, { "epoch": 17.33121741101791, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.0986, "step": 9556 }, { "epoch": 17.333031058716845, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.0807, "step": 9557 }, { "epoch": 17.33484470641578, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.089, "step": 9558 }, { "epoch": 17.336658354114714, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0869, "step": 9559 }, { "epoch": 17.33847200181365, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0918, "step": 9560 }, { "epoch": 17.340285649512584, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0909, "step": 9561 }, { "epoch": 17.342099297211515, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.0901, "step": 9562 }, { "epoch": 17.34391294491045, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1255, "step": 9563 }, { "epoch": 17.345726592609385, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.1036, "step": 9564 }, { "epoch": 17.34754024030832, "grad_norm": 0.77734375, "learning_rate": 0.0002, "loss": 0.1073, "step": 9565 }, { "epoch": 17.349353888007254, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1045, "step": 9566 }, { "epoch": 17.35116753570619, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.1022, "step": 9567 }, { "epoch": 17.352981183405124, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1214, "step": 9568 }, { "epoch": 17.35479483110406, "grad_norm": 0.92578125, "learning_rate": 0.0002, "loss": 0.1378, "step": 9569 }, { "epoch": 17.356608478802993, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1475, "step": 9570 }, { "epoch": 17.358422126501928, "grad_norm": 0.703125, "learning_rate": 0.0002, "loss": 0.14, "step": 9571 }, { "epoch": 17.360235774200863, "grad_norm": 0.9921875, "learning_rate": 0.0002, "loss": 0.1913, "step": 9572 }, { "epoch": 17.362049421899798, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.1979, "step": 9573 }, { "epoch": 17.36386306959873, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1701, "step": 9574 }, { "epoch": 17.365676717297664, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0608, "step": 9575 }, { "epoch": 17.3674903649966, "grad_norm": 0.447265625, "learning_rate": 0.0002, "loss": 0.0724, "step": 9576 }, { "epoch": 17.369304012695533, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.062, "step": 9577 }, { "epoch": 17.371117660394468, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0675, "step": 9578 }, { "epoch": 17.372931308093403, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0729, "step": 9579 }, { "epoch": 17.374744955792337, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0676, "step": 9580 }, { "epoch": 17.376558603491272, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0682, "step": 9581 }, { "epoch": 17.378372251190207, "grad_norm": 0.384765625, "learning_rate": 0.0002, "loss": 0.0544, "step": 9582 }, { "epoch": 17.380185898889142, "grad_norm": 0.423828125, "learning_rate": 0.0002, "loss": 0.0637, "step": 9583 }, { "epoch": 17.381999546588077, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0547, "step": 9584 }, { "epoch": 17.38381319428701, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0675, "step": 9585 }, { "epoch": 17.385626841985943, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0596, "step": 9586 }, { "epoch": 17.387440489684877, "grad_norm": 0.44140625, "learning_rate": 0.0002, "loss": 0.0621, "step": 9587 }, { "epoch": 17.389254137383812, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0607, "step": 9588 }, { "epoch": 17.391067785082747, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0671, "step": 9589 }, { "epoch": 17.39288143278168, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0663, "step": 9590 }, { "epoch": 17.394695080480616, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0621, "step": 9591 }, { "epoch": 17.39650872817955, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0711, "step": 9592 }, { "epoch": 17.398322375878486, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0767, "step": 9593 }, { "epoch": 17.40013602357742, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0743, "step": 9594 }, { "epoch": 17.401949671276356, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0672, "step": 9595 }, { "epoch": 17.40376331897529, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.083, "step": 9596 }, { "epoch": 17.405576966674225, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.074, "step": 9597 }, { "epoch": 17.407390614373156, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0723, "step": 9598 }, { "epoch": 17.40920426207209, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0735, "step": 9599 }, { "epoch": 17.411017909771026, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.0748, "step": 9600 }, { "epoch": 17.41283155746996, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0734, "step": 9601 }, { "epoch": 17.414645205168895, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0766, "step": 9602 }, { "epoch": 17.41645885286783, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0823, "step": 9603 }, { "epoch": 17.418272500566765, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0711, "step": 9604 }, { "epoch": 17.4200861482657, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0786, "step": 9605 }, { "epoch": 17.421899795964634, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.0837, "step": 9606 }, { "epoch": 17.42371344366357, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.093, "step": 9607 }, { "epoch": 17.425527091362504, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.0978, "step": 9608 }, { "epoch": 17.42734073906144, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0752, "step": 9609 }, { "epoch": 17.429154386760374, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0876, "step": 9610 }, { "epoch": 17.430968034459305, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0897, "step": 9611 }, { "epoch": 17.43278168215824, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0782, "step": 9612 }, { "epoch": 17.434595329857174, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.093, "step": 9613 }, { "epoch": 17.43640897755611, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0869, "step": 9614 }, { "epoch": 17.438222625255044, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.1008, "step": 9615 }, { "epoch": 17.44003627295398, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1061, "step": 9616 }, { "epoch": 17.441849920652913, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1299, "step": 9617 }, { "epoch": 17.443663568351848, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.1095, "step": 9618 }, { "epoch": 17.445477216050783, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1195, "step": 9619 }, { "epoch": 17.447290863749718, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1244, "step": 9620 }, { "epoch": 17.449104511448652, "grad_norm": 1.21875, "learning_rate": 0.0002, "loss": 0.1793, "step": 9621 }, { "epoch": 17.450918159147587, "grad_norm": 1.1484375, "learning_rate": 0.0002, "loss": 0.1612, "step": 9622 }, { "epoch": 17.45273180684652, "grad_norm": 0.79296875, "learning_rate": 0.0002, "loss": 0.176, "step": 9623 }, { "epoch": 17.454545454545453, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.1667, "step": 9624 }, { "epoch": 17.456359102244388, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0677, "step": 9625 }, { "epoch": 17.458172749943323, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.0558, "step": 9626 }, { "epoch": 17.459986397642258, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0674, "step": 9627 }, { "epoch": 17.461800045341192, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0659, "step": 9628 }, { "epoch": 17.463613693040127, "grad_norm": 0.474609375, "learning_rate": 0.0002, "loss": 0.0636, "step": 9629 }, { "epoch": 17.465427340739062, "grad_norm": 0.4921875, "learning_rate": 0.0002, "loss": 0.0589, "step": 9630 }, { "epoch": 17.467240988437997, "grad_norm": 0.484375, "learning_rate": 0.0002, "loss": 0.065, "step": 9631 }, { "epoch": 17.46905463613693, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.0609, "step": 9632 }, { "epoch": 17.470868283835866, "grad_norm": 0.462890625, "learning_rate": 0.0002, "loss": 0.068, "step": 9633 }, { "epoch": 17.4726819315348, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.061, "step": 9634 }, { "epoch": 17.474495579233732, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.071, "step": 9635 }, { "epoch": 17.476309226932667, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0771, "step": 9636 }, { "epoch": 17.4781228746316, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.1105, "step": 9637 }, { "epoch": 17.479936522330537, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0739, "step": 9638 }, { "epoch": 17.48175017002947, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0629, "step": 9639 }, { "epoch": 17.483563817728406, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.07, "step": 9640 }, { "epoch": 17.48537746542734, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0719, "step": 9641 }, { "epoch": 17.487191113126276, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0755, "step": 9642 }, { "epoch": 17.48900476082521, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0674, "step": 9643 }, { "epoch": 17.490818408524145, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0747, "step": 9644 }, { "epoch": 17.49263205622308, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0677, "step": 9645 }, { "epoch": 17.494445703922015, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0679, "step": 9646 }, { "epoch": 17.496259351620946, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0685, "step": 9647 }, { "epoch": 17.49807299931988, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.071, "step": 9648 }, { "epoch": 17.499886647018815, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0748, "step": 9649 }, { "epoch": 17.50170029471775, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0832, "step": 9650 }, { "epoch": 17.503513942416685, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0906, "step": 9651 }, { "epoch": 17.50532759011562, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0727, "step": 9652 }, { "epoch": 17.507141237814555, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.1049, "step": 9653 }, { "epoch": 17.50895488551349, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0828, "step": 9654 }, { "epoch": 17.510768533212424, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0689, "step": 9655 }, { "epoch": 17.51258218091136, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0839, "step": 9656 }, { "epoch": 17.514395828610294, "grad_norm": 0.7578125, "learning_rate": 0.0002, "loss": 0.0882, "step": 9657 }, { "epoch": 17.51620947630923, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0867, "step": 9658 }, { "epoch": 17.51802312400816, "grad_norm": 0.64453125, "learning_rate": 0.0002, "loss": 0.089, "step": 9659 }, { "epoch": 17.519836771707094, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0991, "step": 9660 }, { "epoch": 17.52165041940603, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0898, "step": 9661 }, { "epoch": 17.523464067104964, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1011, "step": 9662 }, { "epoch": 17.5252777148039, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0952, "step": 9663 }, { "epoch": 17.527091362502834, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.1213, "step": 9664 }, { "epoch": 17.52890501020177, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0922, "step": 9665 }, { "epoch": 17.530718657900703, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.0995, "step": 9666 }, { "epoch": 17.532532305599638, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1076, "step": 9667 }, { "epoch": 17.534345953298573, "grad_norm": 1.2109375, "learning_rate": 0.0002, "loss": 0.13, "step": 9668 }, { "epoch": 17.536159600997507, "grad_norm": 0.8125, "learning_rate": 0.0002, "loss": 0.1391, "step": 9669 }, { "epoch": 17.537973248696442, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1166, "step": 9670 }, { "epoch": 17.539786896395377, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.1743, "step": 9671 }, { "epoch": 17.541600544094308, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.153, "step": 9672 }, { "epoch": 17.543414191793243, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.2034, "step": 9673 }, { "epoch": 17.545227839492178, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.1431, "step": 9674 }, { "epoch": 17.547041487191112, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0641, "step": 9675 }, { "epoch": 17.548855134890047, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0711, "step": 9676 }, { "epoch": 17.550668782588982, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0633, "step": 9677 }, { "epoch": 17.552482430287917, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.063, "step": 9678 }, { "epoch": 17.55429607798685, "grad_norm": 0.498046875, "learning_rate": 0.0002, "loss": 0.0679, "step": 9679 }, { "epoch": 17.556109725685786, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0592, "step": 9680 }, { "epoch": 17.55792337338472, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0715, "step": 9681 }, { "epoch": 17.559737021083656, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.068, "step": 9682 }, { "epoch": 17.56155066878259, "grad_norm": 0.46875, "learning_rate": 0.0002, "loss": 0.0629, "step": 9683 }, { "epoch": 17.563364316481522, "grad_norm": 0.44921875, "learning_rate": 0.0002, "loss": 0.0678, "step": 9684 }, { "epoch": 17.565177964180457, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0722, "step": 9685 }, { "epoch": 17.56699161187939, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.07, "step": 9686 }, { "epoch": 17.568805259578326, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0698, "step": 9687 }, { "epoch": 17.57061890727726, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0649, "step": 9688 }, { "epoch": 17.572432554976196, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0647, "step": 9689 }, { "epoch": 17.57424620267513, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0686, "step": 9690 }, { "epoch": 17.576059850374065, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0693, "step": 9691 }, { "epoch": 17.577873498073, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0719, "step": 9692 }, { "epoch": 17.579687145771935, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0742, "step": 9693 }, { "epoch": 17.58150079347087, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0687, "step": 9694 }, { "epoch": 17.583314441169804, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.074, "step": 9695 }, { "epoch": 17.585128088868736, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0824, "step": 9696 }, { "epoch": 17.58694173656767, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0793, "step": 9697 }, { "epoch": 17.588755384266605, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0715, "step": 9698 }, { "epoch": 17.59056903196554, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0708, "step": 9699 }, { "epoch": 17.592382679664475, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.0724, "step": 9700 }, { "epoch": 17.59419632736341, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0766, "step": 9701 }, { "epoch": 17.596009975062344, "grad_norm": 0.70703125, "learning_rate": 0.0002, "loss": 0.0805, "step": 9702 }, { "epoch": 17.59782362276128, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.0799, "step": 9703 }, { "epoch": 17.599637270460214, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0769, "step": 9704 }, { "epoch": 17.60145091815915, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.073, "step": 9705 }, { "epoch": 17.603264565858083, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0749, "step": 9706 }, { "epoch": 17.605078213557018, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0913, "step": 9707 }, { "epoch": 17.606891861255953, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0814, "step": 9708 }, { "epoch": 17.608705508954884, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.0813, "step": 9709 }, { "epoch": 17.61051915665382, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0823, "step": 9710 }, { "epoch": 17.612332804352754, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0807, "step": 9711 }, { "epoch": 17.61414645205169, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0966, "step": 9712 }, { "epoch": 17.615960099750623, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0979, "step": 9713 }, { "epoch": 17.617773747449558, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.1123, "step": 9714 }, { "epoch": 17.619587395148493, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0987, "step": 9715 }, { "epoch": 17.621401042847427, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1071, "step": 9716 }, { "epoch": 17.623214690546362, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.1088, "step": 9717 }, { "epoch": 17.625028338245297, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.1199, "step": 9718 }, { "epoch": 17.626841985944232, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.1194, "step": 9719 }, { "epoch": 17.628655633643163, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1217, "step": 9720 }, { "epoch": 17.630469281342098, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1395, "step": 9721 }, { "epoch": 17.632282929041033, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1794, "step": 9722 }, { "epoch": 17.634096576739967, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.2037, "step": 9723 }, { "epoch": 17.635910224438902, "grad_norm": 1.421875, "learning_rate": 0.0002, "loss": 0.2187, "step": 9724 }, { "epoch": 17.635910224438902, "eval_loss": 2.3854990005493164, "eval_runtime": 186.2128, "eval_samples_per_second": 5.37, "eval_steps_per_second": 5.37, "step": 9724 }, { "epoch": 17.635910224438902, "mmlu_eval_accuracy": 0.31827931887120436, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.3125, "mmlu_eval_accuracy_business_ethics": 0.45454545454545453, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.25, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.36363636363636365, "mmlu_eval_accuracy_computer_security": 0.45454545454545453, "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692, "mmlu_eval_accuracy_econometrics": 0.08333333333333333, "mmlu_eval_accuracy_electrical_engineering": 0.3125, "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683, "mmlu_eval_accuracy_formal_logic": 0.21428571428571427, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.4375, "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.2777777777777778, "mmlu_eval_accuracy_high_school_geography": 0.3181818181818182, "mmlu_eval_accuracy_high_school_government_and_politics": 0.23809523809523808, "mmlu_eval_accuracy_high_school_macroeconomics": 0.27906976744186046, "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, "mmlu_eval_accuracy_high_school_microeconomics": 0.3076923076923077, "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529, "mmlu_eval_accuracy_high_school_psychology": 0.36666666666666664, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.2727272727272727, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.4782608695652174, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.3333333333333333, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.18181818181818182, "mmlu_eval_accuracy_marketing": 0.44, "mmlu_eval_accuracy_medical_genetics": 0.36363636363636365, "mmlu_eval_accuracy_miscellaneous": 0.37209302325581395, "mmlu_eval_accuracy_moral_disputes": 0.3684210526315789, "mmlu_eval_accuracy_moral_scenarios": 0.27, "mmlu_eval_accuracy_nutrition": 0.3939393939393939, "mmlu_eval_accuracy_philosophy": 0.23529411764705882, "mmlu_eval_accuracy_prehistory": 0.4857142857142857, "mmlu_eval_accuracy_professional_accounting": 0.1935483870967742, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, "mmlu_eval_accuracy_professional_psychology": 0.42028985507246375, "mmlu_eval_accuracy_public_relations": 0.5, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.4444444444444444, "mmlu_eval_accuracy_world_religions": 0.47368421052631576, "mmlu_loss": 2.0945806729733203, "step": 9724 }, { "epoch": 17.637723872137837, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.058, "step": 9725 }, { "epoch": 17.63953751983677, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0644, "step": 9726 }, { "epoch": 17.641351167535706, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0678, "step": 9727 }, { "epoch": 17.64316481523464, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0677, "step": 9728 }, { "epoch": 17.644978462933576, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0677, "step": 9729 }, { "epoch": 17.64679211063251, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0598, "step": 9730 }, { "epoch": 17.648605758331446, "grad_norm": 0.55859375, "learning_rate": 0.0002, "loss": 0.0742, "step": 9731 }, { "epoch": 17.65041940603038, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0722, "step": 9732 }, { "epoch": 17.65223305372931, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0739, "step": 9733 }, { "epoch": 17.654046701428246, "grad_norm": 0.431640625, "learning_rate": 0.0002, "loss": 0.067, "step": 9734 }, { "epoch": 17.65586034912718, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0636, "step": 9735 }, { "epoch": 17.657673996826116, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0779, "step": 9736 }, { "epoch": 17.65948764452505, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0751, "step": 9737 }, { "epoch": 17.661301292223985, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0688, "step": 9738 }, { "epoch": 17.66311493992292, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0763, "step": 9739 }, { "epoch": 17.664928587621855, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0698, "step": 9740 }, { "epoch": 17.66674223532079, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0656, "step": 9741 }, { "epoch": 17.668555883019724, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0722, "step": 9742 }, { "epoch": 17.67036953071866, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.08, "step": 9743 }, { "epoch": 17.672183178417594, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0707, "step": 9744 }, { "epoch": 17.673996826116525, "grad_norm": 0.49609375, "learning_rate": 0.0002, "loss": 0.0641, "step": 9745 }, { "epoch": 17.67581047381546, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0744, "step": 9746 }, { "epoch": 17.677624121514395, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0661, "step": 9747 }, { "epoch": 17.67943776921333, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.0766, "step": 9748 }, { "epoch": 17.681251416912264, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0777, "step": 9749 }, { "epoch": 17.6830650646112, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0761, "step": 9750 }, { "epoch": 17.684878712310134, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0844, "step": 9751 }, { "epoch": 17.68669236000907, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.0731, "step": 9752 }, { "epoch": 17.688506007708003, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0767, "step": 9753 }, { "epoch": 17.690319655406938, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0826, "step": 9754 }, { "epoch": 17.692133303105873, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0815, "step": 9755 }, { "epoch": 17.693946950804808, "grad_norm": 0.66015625, "learning_rate": 0.0002, "loss": 0.0912, "step": 9756 }, { "epoch": 17.69576059850374, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0817, "step": 9757 }, { "epoch": 17.697574246202674, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.082, "step": 9758 }, { "epoch": 17.69938789390161, "grad_norm": 0.734375, "learning_rate": 0.0002, "loss": 0.0944, "step": 9759 }, { "epoch": 17.701201541600543, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0839, "step": 9760 }, { "epoch": 17.703015189299478, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1038, "step": 9761 }, { "epoch": 17.704828836998413, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.0959, "step": 9762 }, { "epoch": 17.706642484697348, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1142, "step": 9763 }, { "epoch": 17.708456132396282, "grad_norm": 0.875, "learning_rate": 0.0002, "loss": 0.1067, "step": 9764 }, { "epoch": 17.710269780095217, "grad_norm": 0.60546875, "learning_rate": 0.0002, "loss": 0.1064, "step": 9765 }, { "epoch": 17.712083427794152, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.1012, "step": 9766 }, { "epoch": 17.713897075493087, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.1105, "step": 9767 }, { "epoch": 17.71571072319202, "grad_norm": 0.99609375, "learning_rate": 0.0002, "loss": 0.1245, "step": 9768 }, { "epoch": 17.717524370890956, "grad_norm": 0.796875, "learning_rate": 0.0002, "loss": 0.1311, "step": 9769 }, { "epoch": 17.719338018589887, "grad_norm": 0.86328125, "learning_rate": 0.0002, "loss": 0.1439, "step": 9770 }, { "epoch": 17.721151666288822, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.1437, "step": 9771 }, { "epoch": 17.722965313987757, "grad_norm": 1.1328125, "learning_rate": 0.0002, "loss": 0.1687, "step": 9772 }, { "epoch": 17.72477896168669, "grad_norm": 0.8203125, "learning_rate": 0.0002, "loss": 0.2293, "step": 9773 }, { "epoch": 17.726592609385627, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.1473, "step": 9774 }, { "epoch": 17.72840625708456, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0714, "step": 9775 }, { "epoch": 17.730219904783496, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0669, "step": 9776 }, { "epoch": 17.73203355248243, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0587, "step": 9777 }, { "epoch": 17.733847200181366, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0747, "step": 9778 }, { "epoch": 17.7356608478803, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0656, "step": 9779 }, { "epoch": 17.737474495579235, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0693, "step": 9780 }, { "epoch": 17.739288143278166, "grad_norm": 0.45703125, "learning_rate": 0.0002, "loss": 0.0642, "step": 9781 }, { "epoch": 17.7411017909771, "grad_norm": 0.490234375, "learning_rate": 0.0002, "loss": 0.0629, "step": 9782 }, { "epoch": 17.742915438676036, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0665, "step": 9783 }, { "epoch": 17.74472908637497, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0573, "step": 9784 }, { "epoch": 17.746542734073905, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0659, "step": 9785 }, { "epoch": 17.74835638177284, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0719, "step": 9786 }, { "epoch": 17.750170029471775, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0794, "step": 9787 }, { "epoch": 17.75198367717071, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0749, "step": 9788 }, { "epoch": 17.753797324869645, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0761, "step": 9789 }, { "epoch": 17.75561097256858, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.0791, "step": 9790 }, { "epoch": 17.757424620267514, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0693, "step": 9791 }, { "epoch": 17.75923826796645, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0733, "step": 9792 }, { "epoch": 17.761051915665384, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0701, "step": 9793 }, { "epoch": 17.762865563364315, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0713, "step": 9794 }, { "epoch": 17.76467921106325, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0673, "step": 9795 }, { "epoch": 17.766492858762184, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0777, "step": 9796 }, { "epoch": 17.76830650646112, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0778, "step": 9797 }, { "epoch": 17.770120154160054, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0826, "step": 9798 }, { "epoch": 17.77193380185899, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0754, "step": 9799 }, { "epoch": 17.773747449557924, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.0963, "step": 9800 }, { "epoch": 17.77556109725686, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0767, "step": 9801 }, { "epoch": 17.777374744955793, "grad_norm": 0.65234375, "learning_rate": 0.0002, "loss": 0.0802, "step": 9802 }, { "epoch": 17.779188392654728, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0762, "step": 9803 }, { "epoch": 17.781002040353663, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0755, "step": 9804 }, { "epoch": 17.782815688052597, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0741, "step": 9805 }, { "epoch": 17.78462933575153, "grad_norm": 0.494140625, "learning_rate": 0.0002, "loss": 0.0808, "step": 9806 }, { "epoch": 17.786442983450463, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0837, "step": 9807 }, { "epoch": 17.788256631149398, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0814, "step": 9808 }, { "epoch": 17.790070278848333, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0849, "step": 9809 }, { "epoch": 17.791883926547268, "grad_norm": 0.7265625, "learning_rate": 0.0002, "loss": 0.0912, "step": 9810 }, { "epoch": 17.793697574246202, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0909, "step": 9811 }, { "epoch": 17.795511221945137, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.1001, "step": 9812 }, { "epoch": 17.797324869644072, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0967, "step": 9813 }, { "epoch": 17.799138517343007, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.1012, "step": 9814 }, { "epoch": 17.80095216504194, "grad_norm": 0.6875, "learning_rate": 0.0002, "loss": 0.1044, "step": 9815 }, { "epoch": 17.802765812740876, "grad_norm": 0.80078125, "learning_rate": 0.0002, "loss": 0.114, "step": 9816 }, { "epoch": 17.80457946043981, "grad_norm": 0.74609375, "learning_rate": 0.0002, "loss": 0.1346, "step": 9817 }, { "epoch": 17.806393108138742, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.1477, "step": 9818 }, { "epoch": 17.808206755837677, "grad_norm": 0.75390625, "learning_rate": 0.0002, "loss": 0.1348, "step": 9819 }, { "epoch": 17.810020403536612, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.125, "step": 9820 }, { "epoch": 17.811834051235547, "grad_norm": 1.03125, "learning_rate": 0.0002, "loss": 0.1472, "step": 9821 }, { "epoch": 17.81364769893448, "grad_norm": 0.84765625, "learning_rate": 0.0002, "loss": 0.2073, "step": 9822 }, { "epoch": 17.815461346633416, "grad_norm": 1.375, "learning_rate": 0.0002, "loss": 0.2594, "step": 9823 }, { "epoch": 17.81727499433235, "grad_norm": 0.80859375, "learning_rate": 0.0002, "loss": 0.1684, "step": 9824 }, { "epoch": 17.819088642031286, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0819, "step": 9825 }, { "epoch": 17.82090228973022, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0734, "step": 9826 }, { "epoch": 17.822715937429155, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.0761, "step": 9827 }, { "epoch": 17.82452958512809, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.073, "step": 9828 }, { "epoch": 17.826343232827025, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0708, "step": 9829 }, { "epoch": 17.82815688052596, "grad_norm": 0.439453125, "learning_rate": 0.0002, "loss": 0.06, "step": 9830 }, { "epoch": 17.82997052822489, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0675, "step": 9831 }, { "epoch": 17.831784175923826, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0702, "step": 9832 }, { "epoch": 17.83359782362276, "grad_norm": 0.59765625, "learning_rate": 0.0002, "loss": 0.0758, "step": 9833 }, { "epoch": 17.835411471321695, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0713, "step": 9834 }, { "epoch": 17.83722511902063, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0722, "step": 9835 }, { "epoch": 17.839038766719565, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.071, "step": 9836 }, { "epoch": 17.8408524144185, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0716, "step": 9837 }, { "epoch": 17.842666062117434, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0769, "step": 9838 }, { "epoch": 17.84447970981637, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.073, "step": 9839 }, { "epoch": 17.846293357515304, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0648, "step": 9840 }, { "epoch": 17.84810700521424, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.072, "step": 9841 }, { "epoch": 17.849920652913173, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0747, "step": 9842 }, { "epoch": 17.851734300612105, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0705, "step": 9843 }, { "epoch": 17.85354794831104, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0746, "step": 9844 }, { "epoch": 17.855361596009974, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0716, "step": 9845 }, { "epoch": 17.85717524370891, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0846, "step": 9846 }, { "epoch": 17.858988891407844, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0769, "step": 9847 }, { "epoch": 17.86080253910678, "grad_norm": 0.58203125, "learning_rate": 0.0002, "loss": 0.0786, "step": 9848 }, { "epoch": 17.862616186805713, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0777, "step": 9849 }, { "epoch": 17.864429834504648, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.0879, "step": 9850 }, { "epoch": 17.866243482203583, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0733, "step": 9851 }, { "epoch": 17.868057129902517, "grad_norm": 0.53125, "learning_rate": 0.0002, "loss": 0.0755, "step": 9852 }, { "epoch": 17.869870777601452, "grad_norm": 0.62109375, "learning_rate": 0.0002, "loss": 0.0794, "step": 9853 }, { "epoch": 17.871684425300387, "grad_norm": 0.6796875, "learning_rate": 0.0002, "loss": 0.0877, "step": 9854 }, { "epoch": 17.87349807299932, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0829, "step": 9855 }, { "epoch": 17.875311720698253, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.0808, "step": 9856 }, { "epoch": 17.877125368397188, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0848, "step": 9857 }, { "epoch": 17.878939016096123, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.084, "step": 9858 }, { "epoch": 17.880752663795057, "grad_norm": 0.828125, "learning_rate": 0.0002, "loss": 0.122, "step": 9859 }, { "epoch": 17.882566311493992, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0848, "step": 9860 }, { "epoch": 17.884379959192927, "grad_norm": 0.5859375, "learning_rate": 0.0002, "loss": 0.0873, "step": 9861 }, { "epoch": 17.88619360689186, "grad_norm": 0.67578125, "learning_rate": 0.0002, "loss": 0.0958, "step": 9862 }, { "epoch": 17.888007254590796, "grad_norm": 0.76171875, "learning_rate": 0.0002, "loss": 0.1004, "step": 9863 }, { "epoch": 17.88982090228973, "grad_norm": 0.76953125, "learning_rate": 0.0002, "loss": 0.1139, "step": 9864 }, { "epoch": 17.891634549988666, "grad_norm": 0.95703125, "learning_rate": 0.0002, "loss": 0.1097, "step": 9865 }, { "epoch": 17.8934481976876, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.1105, "step": 9866 }, { "epoch": 17.895261845386532, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.1179, "step": 9867 }, { "epoch": 17.897075493085467, "grad_norm": 0.7734375, "learning_rate": 0.0002, "loss": 0.1288, "step": 9868 }, { "epoch": 17.8988891407844, "grad_norm": 0.921875, "learning_rate": 0.0002, "loss": 0.1355, "step": 9869 }, { "epoch": 17.900702788483336, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.1393, "step": 9870 }, { "epoch": 17.90251643618227, "grad_norm": 0.69140625, "learning_rate": 0.0002, "loss": 0.1647, "step": 9871 }, { "epoch": 17.904330083881206, "grad_norm": 0.58984375, "learning_rate": 0.0002, "loss": 0.2127, "step": 9872 }, { "epoch": 17.90614373158014, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.2261, "step": 9873 }, { "epoch": 17.907957379279075, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.1824, "step": 9874 }, { "epoch": 17.90977102697801, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0826, "step": 9875 }, { "epoch": 17.911584674676945, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0691, "step": 9876 }, { "epoch": 17.91339832237588, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.076, "step": 9877 }, { "epoch": 17.915211970074814, "grad_norm": 0.578125, "learning_rate": 0.0002, "loss": 0.0692, "step": 9878 }, { "epoch": 17.917025617773746, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0718, "step": 9879 }, { "epoch": 17.91883926547268, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0759, "step": 9880 }, { "epoch": 17.920652913171615, "grad_norm": 0.4765625, "learning_rate": 0.0002, "loss": 0.0633, "step": 9881 }, { "epoch": 17.92246656087055, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0719, "step": 9882 }, { "epoch": 17.924280208569485, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0747, "step": 9883 }, { "epoch": 17.92609385626842, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0777, "step": 9884 }, { "epoch": 17.927907503967354, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0772, "step": 9885 }, { "epoch": 17.92972115166629, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0724, "step": 9886 }, { "epoch": 17.931534799365224, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0793, "step": 9887 }, { "epoch": 17.93334844706416, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0678, "step": 9888 }, { "epoch": 17.935162094763093, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0788, "step": 9889 }, { "epoch": 17.936975742462028, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0863, "step": 9890 }, { "epoch": 17.938789390160963, "grad_norm": 0.53515625, "learning_rate": 0.0002, "loss": 0.072, "step": 9891 }, { "epoch": 17.940603037859894, "grad_norm": 0.5390625, "learning_rate": 0.0002, "loss": 0.0772, "step": 9892 }, { "epoch": 17.94241668555883, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.0842, "step": 9893 }, { "epoch": 17.944230333257764, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0727, "step": 9894 }, { "epoch": 17.9460439809567, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.0826, "step": 9895 }, { "epoch": 17.947857628655633, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.0747, "step": 9896 }, { "epoch": 17.949671276354568, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0891, "step": 9897 }, { "epoch": 17.951484924053503, "grad_norm": 0.6640625, "learning_rate": 0.0002, "loss": 0.0787, "step": 9898 }, { "epoch": 17.953298571752438, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0723, "step": 9899 }, { "epoch": 17.955112219451372, "grad_norm": 0.68359375, "learning_rate": 0.0002, "loss": 0.086, "step": 9900 }, { "epoch": 17.956925867150307, "grad_norm": 0.6328125, "learning_rate": 0.0002, "loss": 0.0821, "step": 9901 }, { "epoch": 17.958739514849242, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0781, "step": 9902 }, { "epoch": 17.960553162548177, "grad_norm": 0.66796875, "learning_rate": 0.0002, "loss": 0.0861, "step": 9903 }, { "epoch": 17.962366810247108, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0846, "step": 9904 }, { "epoch": 17.964180457946043, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.0922, "step": 9905 }, { "epoch": 17.965994105644977, "grad_norm": 0.81640625, "learning_rate": 0.0002, "loss": 0.0968, "step": 9906 }, { "epoch": 17.967807753343912, "grad_norm": 0.56640625, "learning_rate": 0.0002, "loss": 0.0817, "step": 9907 }, { "epoch": 17.969621401042847, "grad_norm": 0.640625, "learning_rate": 0.0002, "loss": 0.0892, "step": 9908 }, { "epoch": 17.97143504874178, "grad_norm": 0.859375, "learning_rate": 0.0002, "loss": 0.1099, "step": 9909 }, { "epoch": 17.973248696440717, "grad_norm": 0.62890625, "learning_rate": 0.0002, "loss": 0.0957, "step": 9910 }, { "epoch": 17.97506234413965, "grad_norm": 0.72265625, "learning_rate": 0.0002, "loss": 0.1012, "step": 9911 }, { "epoch": 17.97506234413965, "eval_loss": 2.405622959136963, "eval_runtime": 185.9934, "eval_samples_per_second": 5.377, "eval_steps_per_second": 5.377, "step": 9911 }, { "epoch": 17.97506234413965, "mmlu_eval_accuracy": 0.315661593571309, "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727, "mmlu_eval_accuracy_anatomy": 0.5, "mmlu_eval_accuracy_astronomy": 0.375, "mmlu_eval_accuracy_business_ethics": 0.36363636363636365, "mmlu_eval_accuracy_clinical_knowledge": 0.27586206896551724, "mmlu_eval_accuracy_college_biology": 0.375, "mmlu_eval_accuracy_college_chemistry": 0.125, "mmlu_eval_accuracy_college_computer_science": 0.18181818181818182, "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182, "mmlu_eval_accuracy_college_medicine": 0.2727272727272727, "mmlu_eval_accuracy_college_physics": 0.45454545454545453, "mmlu_eval_accuracy_computer_security": 0.36363636363636365, "mmlu_eval_accuracy_conceptual_physics": 0.19230769230769232, "mmlu_eval_accuracy_econometrics": 0.25, "mmlu_eval_accuracy_electrical_engineering": 0.4375, "mmlu_eval_accuracy_elementary_mathematics": 0.2682926829268293, "mmlu_eval_accuracy_formal_logic": 0.14285714285714285, "mmlu_eval_accuracy_global_facts": 0.3, "mmlu_eval_accuracy_high_school_biology": 0.53125, "mmlu_eval_accuracy_high_school_chemistry": 0.13636363636363635, "mmlu_eval_accuracy_high_school_computer_science": 0.2222222222222222, "mmlu_eval_accuracy_high_school_european_history": 0.3333333333333333, "mmlu_eval_accuracy_high_school_geography": 0.36363636363636365, "mmlu_eval_accuracy_high_school_government_and_politics": 0.09523809523809523, "mmlu_eval_accuracy_high_school_macroeconomics": 0.20930232558139536, "mmlu_eval_accuracy_high_school_mathematics": 0.13793103448275862, "mmlu_eval_accuracy_high_school_microeconomics": 0.34615384615384615, "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354, "mmlu_eval_accuracy_high_school_psychology": 0.38333333333333336, "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913, "mmlu_eval_accuracy_high_school_us_history": 0.3181818181818182, "mmlu_eval_accuracy_high_school_world_history": 0.23076923076923078, "mmlu_eval_accuracy_human_aging": 0.43478260869565216, "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333, "mmlu_eval_accuracy_international_law": 0.46153846153846156, "mmlu_eval_accuracy_jurisprudence": 0.2727272727272727, "mmlu_eval_accuracy_logical_fallacies": 0.2222222222222222, "mmlu_eval_accuracy_machine_learning": 0.2727272727272727, "mmlu_eval_accuracy_management": 0.2727272727272727, "mmlu_eval_accuracy_marketing": 0.6, "mmlu_eval_accuracy_medical_genetics": 0.45454545454545453, "mmlu_eval_accuracy_miscellaneous": 0.36046511627906974, "mmlu_eval_accuracy_moral_disputes": 0.4473684210526316, "mmlu_eval_accuracy_moral_scenarios": 0.24, "mmlu_eval_accuracy_nutrition": 0.30303030303030304, "mmlu_eval_accuracy_philosophy": 0.35294117647058826, "mmlu_eval_accuracy_prehistory": 0.5142857142857142, "mmlu_eval_accuracy_professional_accounting": 0.1935483870967742, "mmlu_eval_accuracy_professional_law": 0.25882352941176473, "mmlu_eval_accuracy_professional_medicine": 0.22580645161290322, "mmlu_eval_accuracy_professional_psychology": 0.4057971014492754, "mmlu_eval_accuracy_public_relations": 0.4166666666666667, "mmlu_eval_accuracy_security_studies": 0.37037037037037035, "mmlu_eval_accuracy_sociology": 0.3181818181818182, "mmlu_eval_accuracy_us_foreign_policy": 0.36363636363636365, "mmlu_eval_accuracy_virology": 0.3333333333333333, "mmlu_eval_accuracy_world_religions": 0.3684210526315789, "mmlu_loss": 2.479912135606494, "step": 9911 }, { "epoch": 17.976875991838586, "grad_norm": 0.953125, "learning_rate": 0.0002, "loss": 0.1197, "step": 9912 }, { "epoch": 17.97868963953752, "grad_norm": 0.8515625, "learning_rate": 0.0002, "loss": 0.1183, "step": 9913 }, { "epoch": 17.980503287236456, "grad_norm": 0.90625, "learning_rate": 0.0002, "loss": 0.1153, "step": 9914 }, { "epoch": 17.98231693493539, "grad_norm": 0.59375, "learning_rate": 0.0002, "loss": 0.1092, "step": 9915 }, { "epoch": 17.98413058263432, "grad_norm": 0.75, "learning_rate": 0.0002, "loss": 0.1261, "step": 9916 }, { "epoch": 17.985944230333256, "grad_norm": 0.71875, "learning_rate": 0.0002, "loss": 0.1335, "step": 9917 }, { "epoch": 17.98775787803219, "grad_norm": 0.73046875, "learning_rate": 0.0002, "loss": 0.1243, "step": 9918 }, { "epoch": 17.989571525731126, "grad_norm": 0.9375, "learning_rate": 0.0002, "loss": 0.1381, "step": 9919 }, { "epoch": 17.99138517343006, "grad_norm": 0.7421875, "learning_rate": 0.0002, "loss": 0.1656, "step": 9920 }, { "epoch": 17.993198821128995, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.149, "step": 9921 }, { "epoch": 17.99501246882793, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.1631, "step": 9922 }, { "epoch": 17.996826116526865, "grad_norm": 0.94921875, "learning_rate": 0.0002, "loss": 0.2022, "step": 9923 }, { "epoch": 17.9986397642258, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1501, "step": 9924 }, { "epoch": 18.000453411924735, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0844, "step": 9925 }, { "epoch": 18.00226705962367, "grad_norm": 0.42578125, "learning_rate": 0.0002, "loss": 0.0581, "step": 9926 }, { "epoch": 18.004080707322604, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.0547, "step": 9927 }, { "epoch": 18.005894355021535, "grad_norm": 0.40234375, "learning_rate": 0.0002, "loss": 0.0533, "step": 9928 }, { "epoch": 18.00770800272047, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0544, "step": 9929 }, { "epoch": 18.009521650419405, "grad_norm": 0.408203125, "learning_rate": 0.0002, "loss": 0.0559, "step": 9930 }, { "epoch": 18.01133529811834, "grad_norm": 0.4375, "learning_rate": 0.0002, "loss": 0.0541, "step": 9931 }, { "epoch": 18.013148945817274, "grad_norm": 0.328125, "learning_rate": 0.0002, "loss": 0.0486, "step": 9932 }, { "epoch": 18.01496259351621, "grad_norm": 0.39453125, "learning_rate": 0.0002, "loss": 0.0485, "step": 9933 }, { "epoch": 18.016776241215144, "grad_norm": 0.421875, "learning_rate": 0.0002, "loss": 0.0538, "step": 9934 }, { "epoch": 18.01858988891408, "grad_norm": 0.416015625, "learning_rate": 0.0002, "loss": 0.0542, "step": 9935 }, { "epoch": 18.020403536613014, "grad_norm": 0.443359375, "learning_rate": 0.0002, "loss": 0.0611, "step": 9936 }, { "epoch": 18.02221718431195, "grad_norm": 0.4296875, "learning_rate": 0.0002, "loss": 0.0504, "step": 9937 }, { "epoch": 18.024030832010883, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.0493, "step": 9938 }, { "epoch": 18.025844479709818, "grad_norm": 0.41015625, "learning_rate": 0.0002, "loss": 0.0519, "step": 9939 }, { "epoch": 18.027658127408753, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0569, "step": 9940 }, { "epoch": 18.029471775107684, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0525, "step": 9941 }, { "epoch": 18.03128542280662, "grad_norm": 0.50390625, "learning_rate": 0.0002, "loss": 0.0561, "step": 9942 }, { "epoch": 18.033099070505553, "grad_norm": 0.427734375, "learning_rate": 0.0002, "loss": 0.0578, "step": 9943 }, { "epoch": 18.034912718204488, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0603, "step": 9944 }, { "epoch": 18.036726365903423, "grad_norm": 0.51953125, "learning_rate": 0.0002, "loss": 0.0582, "step": 9945 }, { "epoch": 18.038540013602358, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.061, "step": 9946 }, { "epoch": 18.040353661301292, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0648, "step": 9947 }, { "epoch": 18.042167309000227, "grad_norm": 0.51171875, "learning_rate": 0.0002, "loss": 0.0664, "step": 9948 }, { "epoch": 18.043980956699162, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.072, "step": 9949 }, { "epoch": 18.045794604398097, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.059, "step": 9950 }, { "epoch": 18.04760825209703, "grad_norm": 0.482421875, "learning_rate": 0.0002, "loss": 0.0685, "step": 9951 }, { "epoch": 18.049421899795966, "grad_norm": 0.609375, "learning_rate": 0.0002, "loss": 0.0747, "step": 9952 }, { "epoch": 18.051235547494898, "grad_norm": 0.52734375, "learning_rate": 0.0002, "loss": 0.065, "step": 9953 }, { "epoch": 18.053049195193832, "grad_norm": 0.5546875, "learning_rate": 0.0002, "loss": 0.0975, "step": 9954 }, { "epoch": 18.054862842892767, "grad_norm": 0.515625, "learning_rate": 0.0002, "loss": 0.0705, "step": 9955 }, { "epoch": 18.056676490591702, "grad_norm": 0.4453125, "learning_rate": 0.0002, "loss": 0.0617, "step": 9956 }, { "epoch": 18.058490138290637, "grad_norm": 0.63671875, "learning_rate": 0.0002, "loss": 0.082, "step": 9957 }, { "epoch": 18.06030378598957, "grad_norm": 0.55078125, "learning_rate": 0.0002, "loss": 0.0741, "step": 9958 }, { "epoch": 18.062117433688506, "grad_norm": 0.5234375, "learning_rate": 0.0002, "loss": 0.0736, "step": 9959 }, { "epoch": 18.06393108138744, "grad_norm": 0.7109375, "learning_rate": 0.0002, "loss": 0.1325, "step": 9960 }, { "epoch": 18.065744729086376, "grad_norm": 0.5625, "learning_rate": 0.0002, "loss": 0.0826, "step": 9961 }, { "epoch": 18.06755837678531, "grad_norm": 0.71484375, "learning_rate": 0.0002, "loss": 0.0855, "step": 9962 }, { "epoch": 18.069372024484245, "grad_norm": 0.671875, "learning_rate": 0.0002, "loss": 0.0923, "step": 9963 }, { "epoch": 18.07118567218318, "grad_norm": 0.6484375, "learning_rate": 0.0002, "loss": 0.0911, "step": 9964 }, { "epoch": 18.07299931988211, "grad_norm": 0.57421875, "learning_rate": 0.0002, "loss": 0.084, "step": 9965 }, { "epoch": 18.074812967581046, "grad_norm": 0.5703125, "learning_rate": 0.0002, "loss": 0.0866, "step": 9966 }, { "epoch": 18.07662661527998, "grad_norm": 0.65625, "learning_rate": 0.0002, "loss": 0.0876, "step": 9967 }, { "epoch": 18.078440262978916, "grad_norm": 0.400390625, "learning_rate": 0.0002, "loss": 0.0928, "step": 9968 }, { "epoch": 18.08025391067785, "grad_norm": 0.8671875, "learning_rate": 0.0002, "loss": 0.1285, "step": 9969 }, { "epoch": 18.082067558376785, "grad_norm": 0.82421875, "learning_rate": 0.0002, "loss": 0.1183, "step": 9970 }, { "epoch": 18.08388120607572, "grad_norm": 0.8046875, "learning_rate": 0.0002, "loss": 0.153, "step": 9971 }, { "epoch": 18.085694853774655, "grad_norm": 0.6015625, "learning_rate": 0.0002, "loss": 0.129, "step": 9972 }, { "epoch": 18.08750850147359, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.1379, "step": 9973 }, { "epoch": 18.089322149172524, "grad_norm": 0.8984375, "learning_rate": 0.0002, "loss": 0.1959, "step": 9974 }, { "epoch": 18.09113579687146, "grad_norm": 1.0234375, "learning_rate": 0.0002, "loss": 0.2728, "step": 9975 }, { "epoch": 18.092949444570394, "grad_norm": 0.396484375, "learning_rate": 0.0002, "loss": 0.0576, "step": 9976 }, { "epoch": 18.094763092269325, "grad_norm": 0.47265625, "learning_rate": 0.0002, "loss": 0.0553, "step": 9977 }, { "epoch": 18.09657673996826, "grad_norm": 0.478515625, "learning_rate": 0.0002, "loss": 0.0641, "step": 9978 }, { "epoch": 18.098390387667195, "grad_norm": 0.458984375, "learning_rate": 0.0002, "loss": 0.0558, "step": 9979 }, { "epoch": 18.10020403536613, "grad_norm": 0.369140625, "learning_rate": 0.0002, "loss": 0.0495, "step": 9980 }, { "epoch": 18.102017683065064, "grad_norm": 0.46484375, "learning_rate": 0.0002, "loss": 0.0704, "step": 9981 }, { "epoch": 18.103831330764, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.0578, "step": 9982 }, { "epoch": 18.105644978462934, "grad_norm": 0.4140625, "learning_rate": 0.0002, "loss": 0.0548, "step": 9983 }, { "epoch": 18.10745862616187, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.054, "step": 9984 }, { "epoch": 18.109272273860803, "grad_norm": 0.43359375, "learning_rate": 0.0002, "loss": 0.0588, "step": 9985 }, { "epoch": 18.111085921559738, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.0578, "step": 9986 }, { "epoch": 18.112899569258673, "grad_norm": 0.455078125, "learning_rate": 0.0002, "loss": 0.0588, "step": 9987 }, { "epoch": 18.114713216957608, "grad_norm": 0.5078125, "learning_rate": 0.0002, "loss": 0.0635, "step": 9988 }, { "epoch": 18.11652686465654, "grad_norm": 0.470703125, "learning_rate": 0.0002, "loss": 0.0587, "step": 9989 }, { "epoch": 18.118340512355473, "grad_norm": 0.404296875, "learning_rate": 0.0002, "loss": 0.055, "step": 9990 }, { "epoch": 18.12015416005441, "grad_norm": 0.546875, "learning_rate": 0.0002, "loss": 0.0664, "step": 9991 }, { "epoch": 18.121967807753343, "grad_norm": 0.5, "learning_rate": 0.0002, "loss": 0.0584, "step": 9992 }, { "epoch": 18.123781455452278, "grad_norm": 0.4609375, "learning_rate": 0.0002, "loss": 0.0611, "step": 9993 }, { "epoch": 18.125595103151213, "grad_norm": 0.466796875, "learning_rate": 0.0002, "loss": 0.0604, "step": 9994 }, { "epoch": 18.127408750850147, "grad_norm": 0.451171875, "learning_rate": 0.0002, "loss": 0.0612, "step": 9995 }, { "epoch": 18.129222398549082, "grad_norm": 0.486328125, "learning_rate": 0.0002, "loss": 0.065, "step": 9996 }, { "epoch": 18.131036046248017, "grad_norm": 0.48046875, "learning_rate": 0.0002, "loss": 0.0628, "step": 9997 }, { "epoch": 18.13284969394695, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 0.0745, "step": 9998 }, { "epoch": 18.134663341645886, "grad_norm": 0.54296875, "learning_rate": 0.0002, "loss": 0.0655, "step": 9999 }, { "epoch": 18.13647698934482, "grad_norm": 0.453125, "learning_rate": 0.0002, "loss": 0.0689, "step": 10000 }, { "epoch": 18.13647698934482, "step": 10000, "total_flos": 2.2710638499315057e+18, "train_loss": 0.31174591144770386, "train_runtime": 464430.3082, "train_samples_per_second": 0.345, "train_steps_per_second": 0.022 } ], "logging_steps": 1, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 19, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2710638499315057e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }