{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9411764705882355,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029411764705882353,
      "grad_norm": 160.71739196777344,
      "learning_rate": 2e-05,
      "loss": 2.163,
      "step": 1
    },
    {
      "epoch": 0.058823529411764705,
      "grad_norm": 167.84927368164062,
      "learning_rate": 2e-05,
      "loss": 4.2743,
      "step": 2
    },
    {
      "epoch": 0.08823529411764706,
      "grad_norm": 134.49913024902344,
      "learning_rate": 2e-05,
      "loss": 3.5496,
      "step": 3
    },
    {
      "epoch": 0.11764705882352941,
      "grad_norm": 327.9437255859375,
      "learning_rate": 2e-05,
      "loss": 4.2971,
      "step": 4
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 286.48297119140625,
      "learning_rate": 2e-05,
      "loss": 4.238,
      "step": 5
    },
    {
      "epoch": 0.17647058823529413,
      "grad_norm": 160.5793914794922,
      "learning_rate": 2e-05,
      "loss": 4.8738,
      "step": 6
    },
    {
      "epoch": 0.20588235294117646,
      "grad_norm": 173.0782012939453,
      "learning_rate": 2e-05,
      "loss": 4.2928,
      "step": 7
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 452.2684020996094,
      "learning_rate": 2e-05,
      "loss": 6.4371,
      "step": 8
    },
    {
      "epoch": 0.2647058823529412,
      "grad_norm": 101.81562042236328,
      "learning_rate": 2e-05,
      "loss": 2.8329,
      "step": 9
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 157.65127563476562,
      "learning_rate": 2e-05,
      "loss": 3.053,
      "step": 10
    },
    {
      "epoch": 0.3235294117647059,
      "grad_norm": 143.43881225585938,
      "learning_rate": 2e-05,
      "loss": 2.6041,
      "step": 11
    },
    {
      "epoch": 0.35294117647058826,
      "grad_norm": 119.99986267089844,
      "learning_rate": 2e-05,
      "loss": 2.7732,
      "step": 12
    },
    {
      "epoch": 0.38235294117647056,
      "grad_norm": 340.0208435058594,
      "learning_rate": 2e-05,
      "loss": 4.7953,
      "step": 13
    },
    {
      "epoch": 0.4117647058823529,
      "grad_norm": 139.62266540527344,
      "learning_rate": 2e-05,
      "loss": 3.9037,
      "step": 14
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 133.6896209716797,
      "learning_rate": 2e-05,
      "loss": 3.2289,
      "step": 15
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 402.2488708496094,
      "learning_rate": 2e-05,
      "loss": 5.3283,
      "step": 16
    },
    {
      "epoch": 0.5,
      "grad_norm": 154.9514617919922,
      "learning_rate": 2e-05,
      "loss": 2.5564,
      "step": 17
    },
    {
      "epoch": 0.5294117647058824,
      "grad_norm": 408.76507568359375,
      "learning_rate": 2e-05,
      "loss": 2.8475,
      "step": 18
    },
    {
      "epoch": 0.5588235294117647,
      "grad_norm": 849.7493286132812,
      "learning_rate": 2e-05,
      "loss": 3.2031,
      "step": 19
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 583.0057373046875,
      "learning_rate": 2e-05,
      "loss": 3.9562,
      "step": 20
    },
    {
      "epoch": 0.6176470588235294,
      "grad_norm": 261.2338562011719,
      "learning_rate": 2e-05,
      "loss": 3.2209,
      "step": 21
    },
    {
      "epoch": 0.6470588235294118,
      "grad_norm": 409.6594543457031,
      "learning_rate": 2e-05,
      "loss": 4.913,
      "step": 22
    },
    {
      "epoch": 0.6764705882352942,
      "grad_norm": 224.56918334960938,
      "learning_rate": 2e-05,
      "loss": 4.1768,
      "step": 23
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 384.32147216796875,
      "learning_rate": 2e-05,
      "loss": 4.7449,
      "step": 24
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 57.91896057128906,
      "learning_rate": 2e-05,
      "loss": 1.3582,
      "step": 25
    },
    {
      "epoch": 0.7647058823529411,
      "grad_norm": 58.27327346801758,
      "learning_rate": 2e-05,
      "loss": 2.1744,
      "step": 26
    },
    {
      "epoch": 0.7941176470588235,
      "grad_norm": 96.63125610351562,
      "learning_rate": 2e-05,
      "loss": 2.6751,
      "step": 27
    },
    {
      "epoch": 0.8235294117647058,
      "grad_norm": 124.51710510253906,
      "learning_rate": 2e-05,
      "loss": 2.8836,
      "step": 28
    },
    {
      "epoch": 0.8529411764705882,
      "grad_norm": 408.7486877441406,
      "learning_rate": 2e-05,
      "loss": 4.733,
      "step": 29
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 167.9065399169922,
      "learning_rate": 2e-05,
      "loss": 3.5512,
      "step": 30
    },
    {
      "epoch": 0.9117647058823529,
      "grad_norm": 203.8374786376953,
      "learning_rate": 2e-05,
      "loss": 3.7913,
      "step": 31
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 313.9844665527344,
      "learning_rate": 2e-05,
      "loss": 4.509,
      "step": 32
    },
    {
      "epoch": 0.9705882352941176,
      "grad_norm": 144.14212036132812,
      "learning_rate": 2e-05,
      "loss": 2.1653,
      "step": 33
    },
    {
      "epoch": 1.0,
      "grad_norm": 95.7232894897461,
      "learning_rate": 2e-05,
      "loss": 3.2242,
      "step": 34
    },
    {
      "epoch": 1.0294117647058822,
      "grad_norm": 43.582393646240234,
      "learning_rate": 2e-05,
      "loss": 1.3664,
      "step": 35
    },
    {
      "epoch": 1.0588235294117647,
      "grad_norm": 388.4141540527344,
      "learning_rate": 2e-05,
      "loss": 1.9917,
      "step": 36
    },
    {
      "epoch": 1.088235294117647,
      "grad_norm": 56.53749465942383,
      "learning_rate": 2e-05,
      "loss": 2.393,
      "step": 37
    },
    {
      "epoch": 1.1176470588235294,
      "grad_norm": 174.79148864746094,
      "learning_rate": 2e-05,
      "loss": 1.9908,
      "step": 38
    },
    {
      "epoch": 1.1470588235294117,
      "grad_norm": 1044.60498046875,
      "learning_rate": 2e-05,
      "loss": 2.7904,
      "step": 39
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 1415.9786376953125,
      "learning_rate": 2e-05,
      "loss": 2.5518,
      "step": 40
    },
    {
      "epoch": 1.2058823529411764,
      "grad_norm": 307.51263427734375,
      "learning_rate": 2e-05,
      "loss": 3.9784,
      "step": 41
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 230.46514892578125,
      "learning_rate": 2e-05,
      "loss": 3.7486,
      "step": 42
    },
    {
      "epoch": 1.2647058823529411,
      "grad_norm": 134.46823120117188,
      "learning_rate": 2e-05,
      "loss": 2.2822,
      "step": 43
    },
    {
      "epoch": 1.2941176470588236,
      "grad_norm": 69.17707061767578,
      "learning_rate": 2e-05,
      "loss": 1.7713,
      "step": 44
    },
    {
      "epoch": 1.3235294117647058,
      "grad_norm": 66.95726776123047,
      "learning_rate": 2e-05,
      "loss": 2.6892,
      "step": 45
    },
    {
      "epoch": 1.3529411764705883,
      "grad_norm": 174.15359497070312,
      "learning_rate": 2e-05,
      "loss": 3.2474,
      "step": 46
    },
    {
      "epoch": 1.3823529411764706,
      "grad_norm": 159.84739685058594,
      "learning_rate": 2e-05,
      "loss": 2.2839,
      "step": 47
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 61.734619140625,
      "learning_rate": 2e-05,
      "loss": 2.0586,
      "step": 48
    },
    {
      "epoch": 1.4411764705882353,
      "grad_norm": 258.8429260253906,
      "learning_rate": 2e-05,
      "loss": 3.829,
      "step": 49
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 256.6245422363281,
      "learning_rate": 2e-05,
      "loss": 3.2496,
      "step": 50
    },
    {
      "epoch": 1.5,
      "grad_norm": 38.860015869140625,
      "learning_rate": 2e-05,
      "loss": 0.9418,
      "step": 51
    },
    {
      "epoch": 1.5294117647058822,
      "grad_norm": 169.32525634765625,
      "learning_rate": 2e-05,
      "loss": 1.8798,
      "step": 52
    },
    {
      "epoch": 1.5588235294117647,
      "grad_norm": 547.1870727539062,
      "learning_rate": 2e-05,
      "loss": 2.0999,
      "step": 53
    },
    {
      "epoch": 1.5882352941176472,
      "grad_norm": 69.51364135742188,
      "learning_rate": 2e-05,
      "loss": 1.9031,
      "step": 54
    },
    {
      "epoch": 1.6176470588235294,
      "grad_norm": 47.159114837646484,
      "learning_rate": 2e-05,
      "loss": 2.5062,
      "step": 55
    },
    {
      "epoch": 1.6470588235294117,
      "grad_norm": 215.8884735107422,
      "learning_rate": 2e-05,
      "loss": 2.8866,
      "step": 56
    },
    {
      "epoch": 1.6764705882352942,
      "grad_norm": 612.4532470703125,
      "learning_rate": 2e-05,
      "loss": 2.0209,
      "step": 57
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 375.0256652832031,
      "learning_rate": 2e-05,
      "loss": 3.1054,
      "step": 58
    },
    {
      "epoch": 1.7352941176470589,
      "grad_norm": 63.03890609741211,
      "learning_rate": 2e-05,
      "loss": 1.5422,
      "step": 59
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 1484.6824951171875,
      "learning_rate": 2e-05,
      "loss": 2.5796,
      "step": 60
    },
    {
      "epoch": 1.7941176470588234,
      "grad_norm": 209.3314208984375,
      "learning_rate": 2e-05,
      "loss": 2.5442,
      "step": 61
    },
    {
      "epoch": 1.8235294117647058,
      "grad_norm": 187.7886505126953,
      "learning_rate": 2e-05,
      "loss": 3.0212,
      "step": 62
    },
    {
      "epoch": 1.8529411764705883,
      "grad_norm": 510.379150390625,
      "learning_rate": 2e-05,
      "loss": 3.3788,
      "step": 63
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 224.35646057128906,
      "learning_rate": 2e-05,
      "loss": 4.1101,
      "step": 64
    },
    {
      "epoch": 1.9117647058823528,
      "grad_norm": 244.10589599609375,
      "learning_rate": 2e-05,
      "loss": 4.0655,
      "step": 65
    },
    {
      "epoch": 1.9411764705882353,
      "grad_norm": 254.54693603515625,
      "learning_rate": 2e-05,
      "loss": 2.653,
      "step": 66
    },
    {
      "epoch": 1.9705882352941178,
      "grad_norm": 57.74852752685547,
      "learning_rate": 2e-05,
      "loss": 1.993,
      "step": 67
    },
    {
      "epoch": 2.0,
      "grad_norm": 81.31387329101562,
      "learning_rate": 2e-05,
      "loss": 2.6492,
      "step": 68
    },
    {
      "epoch": 2.0294117647058822,
      "grad_norm": 40.57068634033203,
      "learning_rate": 2e-05,
      "loss": 1.0186,
      "step": 69
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 67.01395416259766,
      "learning_rate": 2e-05,
      "loss": 2.239,
      "step": 70
    },
    {
      "epoch": 2.088235294117647,
      "grad_norm": 110.17364501953125,
      "learning_rate": 2e-05,
      "loss": 2.4758,
      "step": 71
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 68.4447021484375,
      "learning_rate": 2e-05,
      "loss": 1.5485,
      "step": 72
    },
    {
      "epoch": 2.1470588235294117,
      "grad_norm": 267.31463623046875,
      "learning_rate": 2e-05,
      "loss": 3.1875,
      "step": 73
    },
    {
      "epoch": 2.176470588235294,
      "grad_norm": 86.10749053955078,
      "learning_rate": 2e-05,
      "loss": 2.3052,
      "step": 74
    },
    {
      "epoch": 2.2058823529411766,
      "grad_norm": 292.28314208984375,
      "learning_rate": 2e-05,
      "loss": 2.8977,
      "step": 75
    },
    {
      "epoch": 2.235294117647059,
      "grad_norm": 782.0560913085938,
      "learning_rate": 2e-05,
      "loss": 2.7391,
      "step": 76
    },
    {
      "epoch": 2.264705882352941,
      "grad_norm": 132.75177001953125,
      "learning_rate": 2e-05,
      "loss": 1.6825,
      "step": 77
    },
    {
      "epoch": 2.2941176470588234,
      "grad_norm": 231.06690979003906,
      "learning_rate": 2e-05,
      "loss": 1.819,
      "step": 78
    },
    {
      "epoch": 2.323529411764706,
      "grad_norm": 586.24658203125,
      "learning_rate": 2e-05,
      "loss": 1.9857,
      "step": 79
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 77.22154235839844,
      "learning_rate": 2e-05,
      "loss": 1.8883,
      "step": 80
    },
    {
      "epoch": 2.3823529411764706,
      "grad_norm": 214.43612670898438,
      "learning_rate": 2e-05,
      "loss": 3.4815,
      "step": 81
    },
    {
      "epoch": 2.411764705882353,
      "grad_norm": 145.10537719726562,
      "learning_rate": 2e-05,
      "loss": 3.3451,
      "step": 82
    },
    {
      "epoch": 2.4411764705882355,
      "grad_norm": 85.24402618408203,
      "learning_rate": 2e-05,
      "loss": 2.1772,
      "step": 83
    },
    {
      "epoch": 2.4705882352941178,
      "grad_norm": 402.4455261230469,
      "learning_rate": 2e-05,
      "loss": 3.2879,
      "step": 84
    },
    {
      "epoch": 2.5,
      "grad_norm": 77.11824798583984,
      "learning_rate": 2e-05,
      "loss": 1.1511,
      "step": 85
    },
    {
      "epoch": 2.5294117647058822,
      "grad_norm": 106.68152618408203,
      "learning_rate": 2e-05,
      "loss": 1.4542,
      "step": 86
    },
    {
      "epoch": 2.5588235294117645,
      "grad_norm": 84.24452209472656,
      "learning_rate": 2e-05,
      "loss": 1.523,
      "step": 87
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 125.50318145751953,
      "learning_rate": 2e-05,
      "loss": 2.8231,
      "step": 88
    },
    {
      "epoch": 2.6176470588235294,
      "grad_norm": 41.62876510620117,
      "learning_rate": 2e-05,
      "loss": 1.5234,
      "step": 89
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 204.7890167236328,
      "learning_rate": 2e-05,
      "loss": 1.3231,
      "step": 90
    },
    {
      "epoch": 2.6764705882352944,
      "grad_norm": 336.05963134765625,
      "learning_rate": 2e-05,
      "loss": 1.5473,
      "step": 91
    },
    {
      "epoch": 2.7058823529411766,
      "grad_norm": 167.6499786376953,
      "learning_rate": 2e-05,
      "loss": 2.2598,
      "step": 92
    },
    {
      "epoch": 2.735294117647059,
      "grad_norm": 39.165950775146484,
      "learning_rate": 2e-05,
      "loss": 0.5955,
      "step": 93
    },
    {
      "epoch": 2.764705882352941,
      "grad_norm": 103.95963287353516,
      "learning_rate": 2e-05,
      "loss": 1.3233,
      "step": 94
    },
    {
      "epoch": 2.7941176470588234,
      "grad_norm": 141.0578155517578,
      "learning_rate": 2e-05,
      "loss": 2.0557,
      "step": 95
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 79.57919311523438,
      "learning_rate": 2e-05,
      "loss": 2.6418,
      "step": 96
    },
    {
      "epoch": 2.8529411764705883,
      "grad_norm": 167.03134155273438,
      "learning_rate": 2e-05,
      "loss": 2.3345,
      "step": 97
    },
    {
      "epoch": 2.8823529411764706,
      "grad_norm": 1838.4185791015625,
      "learning_rate": 2e-05,
      "loss": 3.5249,
      "step": 98
    },
    {
      "epoch": 2.911764705882353,
      "grad_norm": 724.2003173828125,
      "learning_rate": 2e-05,
      "loss": 1.6495,
      "step": 99
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 703.0057373046875,
      "learning_rate": 2e-05,
      "loss": 2.7896,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 331537700093952.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}