{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945945945945946,
"eval_steps": 500,
"global_step": 115,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008648648648648649,
"grad_norm": 2.249741315841675,
"learning_rate": 0.0001,
"loss": 1.8319,
"step": 1
},
{
"epoch": 0.017297297297297298,
"grad_norm": 2.1813502311706543,
"learning_rate": 0.0002,
"loss": 1.4027,
"step": 2
},
{
"epoch": 0.025945945945945945,
"grad_norm": 0.8601759672164917,
"learning_rate": 0.00019823008849557524,
"loss": 1.1102,
"step": 3
},
{
"epoch": 0.034594594594594595,
"grad_norm": 1.7297605276107788,
"learning_rate": 0.00019646017699115044,
"loss": 1.3774,
"step": 4
},
{
"epoch": 0.043243243243243246,
"grad_norm": 1.0936262607574463,
"learning_rate": 0.00019469026548672567,
"loss": 0.895,
"step": 5
},
{
"epoch": 0.05189189189189189,
"grad_norm": 0.6946480870246887,
"learning_rate": 0.00019292035398230087,
"loss": 0.7451,
"step": 6
},
{
"epoch": 0.06054054054054054,
"grad_norm": 0.45863592624664307,
"learning_rate": 0.00019115044247787613,
"loss": 0.876,
"step": 7
},
{
"epoch": 0.06918918918918919,
"grad_norm": 0.5447478890419006,
"learning_rate": 0.00018938053097345133,
"loss": 0.7719,
"step": 8
},
{
"epoch": 0.07783783783783783,
"grad_norm": 0.45514124631881714,
"learning_rate": 0.00018761061946902656,
"loss": 0.5759,
"step": 9
},
{
"epoch": 0.08648648648648649,
"grad_norm": 0.4590395987033844,
"learning_rate": 0.0001858407079646018,
"loss": 0.5838,
"step": 10
},
{
"epoch": 0.09513513513513513,
"grad_norm": 0.5425634384155273,
"learning_rate": 0.000184070796460177,
"loss": 0.6641,
"step": 11
},
{
"epoch": 0.10378378378378378,
"grad_norm": 1.0379027128219604,
"learning_rate": 0.00018230088495575222,
"loss": 0.9623,
"step": 12
},
{
"epoch": 0.11243243243243244,
"grad_norm": 0.5286022424697876,
"learning_rate": 0.00018053097345132742,
"loss": 0.4761,
"step": 13
},
{
"epoch": 0.12108108108108108,
"grad_norm": 0.6451830267906189,
"learning_rate": 0.00017876106194690265,
"loss": 0.547,
"step": 14
},
{
"epoch": 0.12972972972972974,
"grad_norm": 0.6369953751564026,
"learning_rate": 0.0001769911504424779,
"loss": 0.5872,
"step": 15
},
{
"epoch": 0.13837837837837838,
"grad_norm": 0.4720052182674408,
"learning_rate": 0.0001752212389380531,
"loss": 0.3248,
"step": 16
},
{
"epoch": 0.14702702702702702,
"grad_norm": 0.5918360352516174,
"learning_rate": 0.00017345132743362834,
"loss": 0.6277,
"step": 17
},
{
"epoch": 0.15567567567567567,
"grad_norm": 0.5242601037025452,
"learning_rate": 0.00017168141592920354,
"loss": 0.5645,
"step": 18
},
{
"epoch": 0.1643243243243243,
"grad_norm": 0.474292129278183,
"learning_rate": 0.00016991150442477877,
"loss": 0.2115,
"step": 19
},
{
"epoch": 0.17297297297297298,
"grad_norm": 0.6523647904396057,
"learning_rate": 0.000168141592920354,
"loss": 0.5803,
"step": 20
},
{
"epoch": 0.18162162162162163,
"grad_norm": 0.521297812461853,
"learning_rate": 0.0001663716814159292,
"loss": 0.4483,
"step": 21
},
{
"epoch": 0.19027027027027027,
"grad_norm": 0.5689568519592285,
"learning_rate": 0.00016460176991150443,
"loss": 0.6231,
"step": 22
},
{
"epoch": 0.1989189189189189,
"grad_norm": 0.4570567011833191,
"learning_rate": 0.00016283185840707966,
"loss": 0.2368,
"step": 23
},
{
"epoch": 0.20756756756756756,
"grad_norm": 0.414307564496994,
"learning_rate": 0.0001610619469026549,
"loss": 0.4674,
"step": 24
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.5027227997779846,
"learning_rate": 0.0001592920353982301,
"loss": 0.3558,
"step": 25
},
{
"epoch": 0.22486486486486487,
"grad_norm": 0.4441507160663605,
"learning_rate": 0.00015752212389380532,
"loss": 0.437,
"step": 26
},
{
"epoch": 0.23351351351351352,
"grad_norm": 0.4098701477050781,
"learning_rate": 0.00015575221238938055,
"loss": 0.3553,
"step": 27
},
{
"epoch": 0.24216216216216216,
"grad_norm": 0.3602244257926941,
"learning_rate": 0.00015398230088495575,
"loss": 0.3689,
"step": 28
},
{
"epoch": 0.2508108108108108,
"grad_norm": 0.4340718984603882,
"learning_rate": 0.00015221238938053098,
"loss": 0.318,
"step": 29
},
{
"epoch": 0.2594594594594595,
"grad_norm": 0.44470590353012085,
"learning_rate": 0.00015044247787610618,
"loss": 0.4992,
"step": 30
},
{
"epoch": 0.2681081081081081,
"grad_norm": 0.43699413537979126,
"learning_rate": 0.00014867256637168144,
"loss": 0.3362,
"step": 31
},
{
"epoch": 0.27675675675675676,
"grad_norm": 0.4950752258300781,
"learning_rate": 0.00014690265486725664,
"loss": 0.4464,
"step": 32
},
{
"epoch": 0.28540540540540543,
"grad_norm": 0.4312315881252289,
"learning_rate": 0.00014513274336283187,
"loss": 0.4786,
"step": 33
},
{
"epoch": 0.29405405405405405,
"grad_norm": 0.45234543085098267,
"learning_rate": 0.0001433628318584071,
"loss": 0.5572,
"step": 34
},
{
"epoch": 0.3027027027027027,
"grad_norm": 0.4373219311237335,
"learning_rate": 0.0001415929203539823,
"loss": 0.3873,
"step": 35
},
{
"epoch": 0.31135135135135134,
"grad_norm": 0.35862988233566284,
"learning_rate": 0.00013982300884955753,
"loss": 0.2902,
"step": 36
},
{
"epoch": 0.32,
"grad_norm": 0.41014787554740906,
"learning_rate": 0.00013805309734513276,
"loss": 0.3806,
"step": 37
},
{
"epoch": 0.3286486486486486,
"grad_norm": 0.4181463420391083,
"learning_rate": 0.00013628318584070796,
"loss": 0.3036,
"step": 38
},
{
"epoch": 0.3372972972972973,
"grad_norm": 0.3663095235824585,
"learning_rate": 0.00013451327433628321,
"loss": 0.1979,
"step": 39
},
{
"epoch": 0.34594594594594597,
"grad_norm": 0.46295005083084106,
"learning_rate": 0.00013274336283185842,
"loss": 0.4204,
"step": 40
},
{
"epoch": 0.3545945945945946,
"grad_norm": 0.39596325159072876,
"learning_rate": 0.00013097345132743365,
"loss": 0.3512,
"step": 41
},
{
"epoch": 0.36324324324324325,
"grad_norm": 0.7628335952758789,
"learning_rate": 0.00012920353982300885,
"loss": 0.4965,
"step": 42
},
{
"epoch": 0.37189189189189187,
"grad_norm": 0.5216770172119141,
"learning_rate": 0.00012743362831858408,
"loss": 0.4658,
"step": 43
},
{
"epoch": 0.38054054054054054,
"grad_norm": 0.38578447699546814,
"learning_rate": 0.0001256637168141593,
"loss": 0.2661,
"step": 44
},
{
"epoch": 0.3891891891891892,
"grad_norm": 0.2811882197856903,
"learning_rate": 0.0001238938053097345,
"loss": 0.1545,
"step": 45
},
{
"epoch": 0.3978378378378378,
"grad_norm": 0.3812131881713867,
"learning_rate": 0.00012212389380530974,
"loss": 0.3295,
"step": 46
},
{
"epoch": 0.4064864864864865,
"grad_norm": 0.3791070878505707,
"learning_rate": 0.00012035398230088497,
"loss": 0.2472,
"step": 47
},
{
"epoch": 0.4151351351351351,
"grad_norm": 0.38515138626098633,
"learning_rate": 0.0001185840707964602,
"loss": 0.4042,
"step": 48
},
{
"epoch": 0.4237837837837838,
"grad_norm": 0.5093116164207458,
"learning_rate": 0.00011681415929203541,
"loss": 0.8376,
"step": 49
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.2971178889274597,
"learning_rate": 0.00011504424778761063,
"loss": 0.4082,
"step": 50
},
{
"epoch": 0.4410810810810811,
"grad_norm": 0.30018818378448486,
"learning_rate": 0.00011327433628318584,
"loss": 0.129,
"step": 51
},
{
"epoch": 0.44972972972972974,
"grad_norm": 0.4631483256816864,
"learning_rate": 0.00011150442477876106,
"loss": 0.3752,
"step": 52
},
{
"epoch": 0.45837837837837836,
"grad_norm": 0.3890452980995178,
"learning_rate": 0.00010973451327433629,
"loss": 0.4054,
"step": 53
},
{
"epoch": 0.46702702702702703,
"grad_norm": 0.3566686511039734,
"learning_rate": 0.0001079646017699115,
"loss": 0.2452,
"step": 54
},
{
"epoch": 0.4756756756756757,
"grad_norm": 0.4903372526168823,
"learning_rate": 0.00010619469026548674,
"loss": 0.4505,
"step": 55
},
{
"epoch": 0.4843243243243243,
"grad_norm": 0.3836239278316498,
"learning_rate": 0.00010442477876106196,
"loss": 0.3952,
"step": 56
},
{
"epoch": 0.492972972972973,
"grad_norm": 0.42047417163848877,
"learning_rate": 0.00010265486725663717,
"loss": 0.5074,
"step": 57
},
{
"epoch": 0.5016216216216216,
"grad_norm": 0.24409635365009308,
"learning_rate": 0.00010088495575221239,
"loss": 0.1389,
"step": 58
},
{
"epoch": 0.5102702702702703,
"grad_norm": 0.3819220960140228,
"learning_rate": 9.911504424778762e-05,
"loss": 0.3945,
"step": 59
},
{
"epoch": 0.518918918918919,
"grad_norm": 0.31148406863212585,
"learning_rate": 9.734513274336283e-05,
"loss": 0.5203,
"step": 60
},
{
"epoch": 0.5275675675675676,
"grad_norm": 0.3157011866569519,
"learning_rate": 9.557522123893806e-05,
"loss": 0.262,
"step": 61
},
{
"epoch": 0.5362162162162162,
"grad_norm": 0.40180379152297974,
"learning_rate": 9.380530973451328e-05,
"loss": 0.2404,
"step": 62
},
{
"epoch": 0.5448648648648649,
"grad_norm": 0.4064180552959442,
"learning_rate": 9.20353982300885e-05,
"loss": 0.6118,
"step": 63
},
{
"epoch": 0.5535135135135135,
"grad_norm": 0.3912467956542969,
"learning_rate": 9.026548672566371e-05,
"loss": 0.271,
"step": 64
},
{
"epoch": 0.5621621621621622,
"grad_norm": 0.31059980392456055,
"learning_rate": 8.849557522123895e-05,
"loss": 0.2373,
"step": 65
},
{
"epoch": 0.5708108108108109,
"grad_norm": 0.30928152799606323,
"learning_rate": 8.672566371681417e-05,
"loss": 0.4169,
"step": 66
},
{
"epoch": 0.5794594594594594,
"grad_norm": 0.40631791949272156,
"learning_rate": 8.495575221238938e-05,
"loss": 0.4175,
"step": 67
},
{
"epoch": 0.5881081081081081,
"grad_norm": 0.40440961718559265,
"learning_rate": 8.31858407079646e-05,
"loss": 0.3269,
"step": 68
},
{
"epoch": 0.5967567567567568,
"grad_norm": 0.4534294009208679,
"learning_rate": 8.141592920353983e-05,
"loss": 0.2242,
"step": 69
},
{
"epoch": 0.6054054054054054,
"grad_norm": 0.41317978501319885,
"learning_rate": 7.964601769911504e-05,
"loss": 0.2633,
"step": 70
},
{
"epoch": 0.614054054054054,
"grad_norm": 0.272535115480423,
"learning_rate": 7.787610619469027e-05,
"loss": 0.1455,
"step": 71
},
{
"epoch": 0.6227027027027027,
"grad_norm": 0.4280416667461395,
"learning_rate": 7.610619469026549e-05,
"loss": 0.5289,
"step": 72
},
{
"epoch": 0.6313513513513513,
"grad_norm": 0.4870530664920807,
"learning_rate": 7.433628318584072e-05,
"loss": 0.5633,
"step": 73
},
{
"epoch": 0.64,
"grad_norm": 0.38074707984924316,
"learning_rate": 7.256637168141593e-05,
"loss": 0.4738,
"step": 74
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.32775411009788513,
"learning_rate": 7.079646017699115e-05,
"loss": 0.2764,
"step": 75
},
{
"epoch": 0.6572972972972972,
"grad_norm": 0.3663316071033478,
"learning_rate": 6.902654867256638e-05,
"loss": 0.4794,
"step": 76
},
{
"epoch": 0.6659459459459459,
"grad_norm": 0.36854031682014465,
"learning_rate": 6.725663716814161e-05,
"loss": 0.1809,
"step": 77
},
{
"epoch": 0.6745945945945946,
"grad_norm": 0.37296342849731445,
"learning_rate": 6.548672566371682e-05,
"loss": 0.4067,
"step": 78
},
{
"epoch": 0.6832432432432433,
"grad_norm": 0.4202044606208801,
"learning_rate": 6.371681415929204e-05,
"loss": 0.2752,
"step": 79
},
{
"epoch": 0.6918918918918919,
"grad_norm": 0.29250282049179077,
"learning_rate": 6.194690265486725e-05,
"loss": 0.1461,
"step": 80
},
{
"epoch": 0.7005405405405405,
"grad_norm": 0.37763354182243347,
"learning_rate": 6.017699115044248e-05,
"loss": 0.2817,
"step": 81
},
{
"epoch": 0.7091891891891892,
"grad_norm": 0.30031171441078186,
"learning_rate": 5.8407079646017705e-05,
"loss": 0.1572,
"step": 82
},
{
"epoch": 0.7178378378378378,
"grad_norm": 0.4519175887107849,
"learning_rate": 5.663716814159292e-05,
"loss": 0.3046,
"step": 83
},
{
"epoch": 0.7264864864864865,
"grad_norm": 0.3103352189064026,
"learning_rate": 5.486725663716814e-05,
"loss": 0.1347,
"step": 84
},
{
"epoch": 0.7351351351351352,
"grad_norm": 0.7960600852966309,
"learning_rate": 5.309734513274337e-05,
"loss": 0.3168,
"step": 85
},
{
"epoch": 0.7437837837837837,
"grad_norm": 0.3281419277191162,
"learning_rate": 5.132743362831859e-05,
"loss": 0.2045,
"step": 86
},
{
"epoch": 0.7524324324324324,
"grad_norm": 0.35785752534866333,
"learning_rate": 4.955752212389381e-05,
"loss": 0.4077,
"step": 87
},
{
"epoch": 0.7610810810810811,
"grad_norm": 0.37461650371551514,
"learning_rate": 4.778761061946903e-05,
"loss": 0.3227,
"step": 88
},
{
"epoch": 0.7697297297297298,
"grad_norm": 0.3365744352340698,
"learning_rate": 4.601769911504425e-05,
"loss": 0.2306,
"step": 89
},
{
"epoch": 0.7783783783783784,
"grad_norm": 0.29543980956077576,
"learning_rate": 4.4247787610619477e-05,
"loss": 0.3661,
"step": 90
},
{
"epoch": 0.787027027027027,
"grad_norm": 0.3135324716567993,
"learning_rate": 4.247787610619469e-05,
"loss": 0.2503,
"step": 91
},
{
"epoch": 0.7956756756756757,
"grad_norm": 0.23556429147720337,
"learning_rate": 4.0707964601769914e-05,
"loss": 0.1044,
"step": 92
},
{
"epoch": 0.8043243243243243,
"grad_norm": 0.2718769907951355,
"learning_rate": 3.893805309734514e-05,
"loss": 0.1471,
"step": 93
},
{
"epoch": 0.812972972972973,
"grad_norm": 0.25528448820114136,
"learning_rate": 3.716814159292036e-05,
"loss": 0.1126,
"step": 94
},
{
"epoch": 0.8216216216216217,
"grad_norm": 0.514164388179779,
"learning_rate": 3.5398230088495574e-05,
"loss": 0.3423,
"step": 95
},
{
"epoch": 0.8302702702702702,
"grad_norm": 0.33162716031074524,
"learning_rate": 3.3628318584070804e-05,
"loss": 0.3637,
"step": 96
},
{
"epoch": 0.8389189189189189,
"grad_norm": 0.25161704421043396,
"learning_rate": 3.185840707964602e-05,
"loss": 0.1284,
"step": 97
},
{
"epoch": 0.8475675675675676,
"grad_norm": 0.32825589179992676,
"learning_rate": 3.008849557522124e-05,
"loss": 0.2171,
"step": 98
},
{
"epoch": 0.8562162162162162,
"grad_norm": 0.23435255885124207,
"learning_rate": 2.831858407079646e-05,
"loss": 0.16,
"step": 99
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.2661581337451935,
"learning_rate": 2.6548672566371686e-05,
"loss": 0.2421,
"step": 100
},
{
"epoch": 0.8735135135135135,
"grad_norm": 0.2724602222442627,
"learning_rate": 2.4778761061946905e-05,
"loss": 0.1246,
"step": 101
},
{
"epoch": 0.8821621621621621,
"grad_norm": 0.47894561290740967,
"learning_rate": 2.3008849557522124e-05,
"loss": 0.4472,
"step": 102
},
{
"epoch": 0.8908108108108108,
"grad_norm": 0.3064163327217102,
"learning_rate": 2.1238938053097346e-05,
"loss": 0.2987,
"step": 103
},
{
"epoch": 0.8994594594594595,
"grad_norm": 0.4226900637149811,
"learning_rate": 1.946902654867257e-05,
"loss": 0.4185,
"step": 104
},
{
"epoch": 0.9081081081081082,
"grad_norm": 0.34745219349861145,
"learning_rate": 1.7699115044247787e-05,
"loss": 0.2572,
"step": 105
},
{
"epoch": 0.9167567567567567,
"grad_norm": 0.35236531496047974,
"learning_rate": 1.592920353982301e-05,
"loss": 0.3427,
"step": 106
},
{
"epoch": 0.9254054054054054,
"grad_norm": 0.37095391750335693,
"learning_rate": 1.415929203539823e-05,
"loss": 0.4018,
"step": 107
},
{
"epoch": 0.9340540540540541,
"grad_norm": 0.3331229090690613,
"learning_rate": 1.2389380530973452e-05,
"loss": 0.2038,
"step": 108
},
{
"epoch": 0.9427027027027027,
"grad_norm": 0.2652183175086975,
"learning_rate": 1.0619469026548673e-05,
"loss": 0.1072,
"step": 109
},
{
"epoch": 0.9513513513513514,
"grad_norm": 0.29123690724372864,
"learning_rate": 8.849557522123894e-06,
"loss": 0.1406,
"step": 110
},
{
"epoch": 0.96,
"grad_norm": 0.3317340612411499,
"learning_rate": 7.079646017699115e-06,
"loss": 0.2202,
"step": 111
},
{
"epoch": 0.9686486486486486,
"grad_norm": 0.47986647486686707,
"learning_rate": 5.3097345132743365e-06,
"loss": 0.3464,
"step": 112
},
{
"epoch": 0.9772972972972973,
"grad_norm": 0.2612822949886322,
"learning_rate": 3.5398230088495575e-06,
"loss": 0.1271,
"step": 113
},
{
"epoch": 0.985945945945946,
"grad_norm": 0.26845863461494446,
"learning_rate": 1.7699115044247788e-06,
"loss": 0.1044,
"step": 114
},
{
"epoch": 0.9945945945945946,
"grad_norm": 0.2526237368583679,
"learning_rate": 0.0,
"loss": 0.1158,
"step": 115
},
{
"epoch": 0.9945945945945946,
"step": 115,
"total_flos": 1.3431114641260646e+17,
"train_loss": 0.4029887131374815,
"train_runtime": 1125.7865,
"train_samples_per_second": 0.822,
"train_steps_per_second": 0.102
}
],
"logging_steps": 1,
"max_steps": 115,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3431114641260646e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}