{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9411764705882355,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029411764705882353,
"grad_norm": 160.71739196777344,
"learning_rate": 2e-05,
"loss": 2.163,
"step": 1
},
{
"epoch": 0.058823529411764705,
"grad_norm": 167.84927368164062,
"learning_rate": 2e-05,
"loss": 4.2743,
"step": 2
},
{
"epoch": 0.08823529411764706,
"grad_norm": 134.49913024902344,
"learning_rate": 2e-05,
"loss": 3.5496,
"step": 3
},
{
"epoch": 0.11764705882352941,
"grad_norm": 327.9437255859375,
"learning_rate": 2e-05,
"loss": 4.2971,
"step": 4
},
{
"epoch": 0.14705882352941177,
"grad_norm": 286.48297119140625,
"learning_rate": 2e-05,
"loss": 4.238,
"step": 5
},
{
"epoch": 0.17647058823529413,
"grad_norm": 160.5793914794922,
"learning_rate": 2e-05,
"loss": 4.8738,
"step": 6
},
{
"epoch": 0.20588235294117646,
"grad_norm": 173.0782012939453,
"learning_rate": 2e-05,
"loss": 4.2928,
"step": 7
},
{
"epoch": 0.23529411764705882,
"grad_norm": 452.2684020996094,
"learning_rate": 2e-05,
"loss": 6.4371,
"step": 8
},
{
"epoch": 0.2647058823529412,
"grad_norm": 101.81562042236328,
"learning_rate": 2e-05,
"loss": 2.8329,
"step": 9
},
{
"epoch": 0.29411764705882354,
"grad_norm": 157.65127563476562,
"learning_rate": 2e-05,
"loss": 3.053,
"step": 10
},
{
"epoch": 0.3235294117647059,
"grad_norm": 143.43881225585938,
"learning_rate": 2e-05,
"loss": 2.6041,
"step": 11
},
{
"epoch": 0.35294117647058826,
"grad_norm": 119.99986267089844,
"learning_rate": 2e-05,
"loss": 2.7732,
"step": 12
},
{
"epoch": 0.38235294117647056,
"grad_norm": 340.0208435058594,
"learning_rate": 2e-05,
"loss": 4.7953,
"step": 13
},
{
"epoch": 0.4117647058823529,
"grad_norm": 139.62266540527344,
"learning_rate": 2e-05,
"loss": 3.9037,
"step": 14
},
{
"epoch": 0.4411764705882353,
"grad_norm": 133.6896209716797,
"learning_rate": 2e-05,
"loss": 3.2289,
"step": 15
},
{
"epoch": 0.47058823529411764,
"grad_norm": 402.2488708496094,
"learning_rate": 2e-05,
"loss": 5.3283,
"step": 16
},
{
"epoch": 0.5,
"grad_norm": 154.9514617919922,
"learning_rate": 2e-05,
"loss": 2.5564,
"step": 17
},
{
"epoch": 0.5294117647058824,
"grad_norm": 408.76507568359375,
"learning_rate": 2e-05,
"loss": 2.8475,
"step": 18
},
{
"epoch": 0.5588235294117647,
"grad_norm": 849.7493286132812,
"learning_rate": 2e-05,
"loss": 3.2031,
"step": 19
},
{
"epoch": 0.5882352941176471,
"grad_norm": 583.0057373046875,
"learning_rate": 2e-05,
"loss": 3.9562,
"step": 20
},
{
"epoch": 0.6176470588235294,
"grad_norm": 261.2338562011719,
"learning_rate": 2e-05,
"loss": 3.2209,
"step": 21
},
{
"epoch": 0.6470588235294118,
"grad_norm": 409.6594543457031,
"learning_rate": 2e-05,
"loss": 4.913,
"step": 22
},
{
"epoch": 0.6764705882352942,
"grad_norm": 224.56918334960938,
"learning_rate": 2e-05,
"loss": 4.1768,
"step": 23
},
{
"epoch": 0.7058823529411765,
"grad_norm": 384.32147216796875,
"learning_rate": 2e-05,
"loss": 4.7449,
"step": 24
},
{
"epoch": 0.7352941176470589,
"grad_norm": 57.91896057128906,
"learning_rate": 2e-05,
"loss": 1.3582,
"step": 25
},
{
"epoch": 0.7647058823529411,
"grad_norm": 58.27327346801758,
"learning_rate": 2e-05,
"loss": 2.1744,
"step": 26
},
{
"epoch": 0.7941176470588235,
"grad_norm": 96.63125610351562,
"learning_rate": 2e-05,
"loss": 2.6751,
"step": 27
},
{
"epoch": 0.8235294117647058,
"grad_norm": 124.51710510253906,
"learning_rate": 2e-05,
"loss": 2.8836,
"step": 28
},
{
"epoch": 0.8529411764705882,
"grad_norm": 408.7486877441406,
"learning_rate": 2e-05,
"loss": 4.733,
"step": 29
},
{
"epoch": 0.8823529411764706,
"grad_norm": 167.9065399169922,
"learning_rate": 2e-05,
"loss": 3.5512,
"step": 30
},
{
"epoch": 0.9117647058823529,
"grad_norm": 203.8374786376953,
"learning_rate": 2e-05,
"loss": 3.7913,
"step": 31
},
{
"epoch": 0.9411764705882353,
"grad_norm": 313.9844665527344,
"learning_rate": 2e-05,
"loss": 4.509,
"step": 32
},
{
"epoch": 0.9705882352941176,
"grad_norm": 144.14212036132812,
"learning_rate": 2e-05,
"loss": 2.1653,
"step": 33
},
{
"epoch": 1.0,
"grad_norm": 95.7232894897461,
"learning_rate": 2e-05,
"loss": 3.2242,
"step": 34
},
{
"epoch": 1.0294117647058822,
"grad_norm": 43.582393646240234,
"learning_rate": 2e-05,
"loss": 1.3664,
"step": 35
},
{
"epoch": 1.0588235294117647,
"grad_norm": 388.4141540527344,
"learning_rate": 2e-05,
"loss": 1.9917,
"step": 36
},
{
"epoch": 1.088235294117647,
"grad_norm": 56.53749465942383,
"learning_rate": 2e-05,
"loss": 2.393,
"step": 37
},
{
"epoch": 1.1176470588235294,
"grad_norm": 174.79148864746094,
"learning_rate": 2e-05,
"loss": 1.9908,
"step": 38
},
{
"epoch": 1.1470588235294117,
"grad_norm": 1044.60498046875,
"learning_rate": 2e-05,
"loss": 2.7904,
"step": 39
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1415.9786376953125,
"learning_rate": 2e-05,
"loss": 2.5518,
"step": 40
},
{
"epoch": 1.2058823529411764,
"grad_norm": 307.51263427734375,
"learning_rate": 2e-05,
"loss": 3.9784,
"step": 41
},
{
"epoch": 1.2352941176470589,
"grad_norm": 230.46514892578125,
"learning_rate": 2e-05,
"loss": 3.7486,
"step": 42
},
{
"epoch": 1.2647058823529411,
"grad_norm": 134.46823120117188,
"learning_rate": 2e-05,
"loss": 2.2822,
"step": 43
},
{
"epoch": 1.2941176470588236,
"grad_norm": 69.17707061767578,
"learning_rate": 2e-05,
"loss": 1.7713,
"step": 44
},
{
"epoch": 1.3235294117647058,
"grad_norm": 66.95726776123047,
"learning_rate": 2e-05,
"loss": 2.6892,
"step": 45
},
{
"epoch": 1.3529411764705883,
"grad_norm": 174.15359497070312,
"learning_rate": 2e-05,
"loss": 3.2474,
"step": 46
},
{
"epoch": 1.3823529411764706,
"grad_norm": 159.84739685058594,
"learning_rate": 2e-05,
"loss": 2.2839,
"step": 47
},
{
"epoch": 1.4117647058823528,
"grad_norm": 61.734619140625,
"learning_rate": 2e-05,
"loss": 2.0586,
"step": 48
},
{
"epoch": 1.4411764705882353,
"grad_norm": 258.8429260253906,
"learning_rate": 2e-05,
"loss": 3.829,
"step": 49
},
{
"epoch": 1.4705882352941178,
"grad_norm": 256.6245422363281,
"learning_rate": 2e-05,
"loss": 3.2496,
"step": 50
},
{
"epoch": 1.5,
"grad_norm": 38.860015869140625,
"learning_rate": 2e-05,
"loss": 0.9418,
"step": 51
},
{
"epoch": 1.5294117647058822,
"grad_norm": 169.32525634765625,
"learning_rate": 2e-05,
"loss": 1.8798,
"step": 52
},
{
"epoch": 1.5588235294117647,
"grad_norm": 547.1870727539062,
"learning_rate": 2e-05,
"loss": 2.0999,
"step": 53
},
{
"epoch": 1.5882352941176472,
"grad_norm": 69.51364135742188,
"learning_rate": 2e-05,
"loss": 1.9031,
"step": 54
},
{
"epoch": 1.6176470588235294,
"grad_norm": 47.159114837646484,
"learning_rate": 2e-05,
"loss": 2.5062,
"step": 55
},
{
"epoch": 1.6470588235294117,
"grad_norm": 215.8884735107422,
"learning_rate": 2e-05,
"loss": 2.8866,
"step": 56
},
{
"epoch": 1.6764705882352942,
"grad_norm": 612.4532470703125,
"learning_rate": 2e-05,
"loss": 2.0209,
"step": 57
},
{
"epoch": 1.7058823529411766,
"grad_norm": 375.0256652832031,
"learning_rate": 2e-05,
"loss": 3.1054,
"step": 58
},
{
"epoch": 1.7352941176470589,
"grad_norm": 63.03890609741211,
"learning_rate": 2e-05,
"loss": 1.5422,
"step": 59
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1484.6824951171875,
"learning_rate": 2e-05,
"loss": 2.5796,
"step": 60
},
{
"epoch": 1.7941176470588234,
"grad_norm": 209.3314208984375,
"learning_rate": 2e-05,
"loss": 2.5442,
"step": 61
},
{
"epoch": 1.8235294117647058,
"grad_norm": 187.7886505126953,
"learning_rate": 2e-05,
"loss": 3.0212,
"step": 62
},
{
"epoch": 1.8529411764705883,
"grad_norm": 510.379150390625,
"learning_rate": 2e-05,
"loss": 3.3788,
"step": 63
},
{
"epoch": 1.8823529411764706,
"grad_norm": 224.35646057128906,
"learning_rate": 2e-05,
"loss": 4.1101,
"step": 64
},
{
"epoch": 1.9117647058823528,
"grad_norm": 244.10589599609375,
"learning_rate": 2e-05,
"loss": 4.0655,
"step": 65
},
{
"epoch": 1.9411764705882353,
"grad_norm": 254.54693603515625,
"learning_rate": 2e-05,
"loss": 2.653,
"step": 66
},
{
"epoch": 1.9705882352941178,
"grad_norm": 57.74852752685547,
"learning_rate": 2e-05,
"loss": 1.993,
"step": 67
},
{
"epoch": 2.0,
"grad_norm": 81.31387329101562,
"learning_rate": 2e-05,
"loss": 2.6492,
"step": 68
},
{
"epoch": 2.0294117647058822,
"grad_norm": 40.57068634033203,
"learning_rate": 2e-05,
"loss": 1.0186,
"step": 69
},
{
"epoch": 2.0588235294117645,
"grad_norm": 67.01395416259766,
"learning_rate": 2e-05,
"loss": 2.239,
"step": 70
},
{
"epoch": 2.088235294117647,
"grad_norm": 110.17364501953125,
"learning_rate": 2e-05,
"loss": 2.4758,
"step": 71
},
{
"epoch": 2.1176470588235294,
"grad_norm": 68.4447021484375,
"learning_rate": 2e-05,
"loss": 1.5485,
"step": 72
},
{
"epoch": 2.1470588235294117,
"grad_norm": 267.31463623046875,
"learning_rate": 2e-05,
"loss": 3.1875,
"step": 73
},
{
"epoch": 2.176470588235294,
"grad_norm": 86.10749053955078,
"learning_rate": 2e-05,
"loss": 2.3052,
"step": 74
},
{
"epoch": 2.2058823529411766,
"grad_norm": 292.28314208984375,
"learning_rate": 2e-05,
"loss": 2.8977,
"step": 75
},
{
"epoch": 2.235294117647059,
"grad_norm": 782.0560913085938,
"learning_rate": 2e-05,
"loss": 2.7391,
"step": 76
},
{
"epoch": 2.264705882352941,
"grad_norm": 132.75177001953125,
"learning_rate": 2e-05,
"loss": 1.6825,
"step": 77
},
{
"epoch": 2.2941176470588234,
"grad_norm": 231.06690979003906,
"learning_rate": 2e-05,
"loss": 1.819,
"step": 78
},
{
"epoch": 2.323529411764706,
"grad_norm": 586.24658203125,
"learning_rate": 2e-05,
"loss": 1.9857,
"step": 79
},
{
"epoch": 2.3529411764705883,
"grad_norm": 77.22154235839844,
"learning_rate": 2e-05,
"loss": 1.8883,
"step": 80
},
{
"epoch": 2.3823529411764706,
"grad_norm": 214.43612670898438,
"learning_rate": 2e-05,
"loss": 3.4815,
"step": 81
},
{
"epoch": 2.411764705882353,
"grad_norm": 145.10537719726562,
"learning_rate": 2e-05,
"loss": 3.3451,
"step": 82
},
{
"epoch": 2.4411764705882355,
"grad_norm": 85.24402618408203,
"learning_rate": 2e-05,
"loss": 2.1772,
"step": 83
},
{
"epoch": 2.4705882352941178,
"grad_norm": 402.4455261230469,
"learning_rate": 2e-05,
"loss": 3.2879,
"step": 84
},
{
"epoch": 2.5,
"grad_norm": 77.11824798583984,
"learning_rate": 2e-05,
"loss": 1.1511,
"step": 85
},
{
"epoch": 2.5294117647058822,
"grad_norm": 106.68152618408203,
"learning_rate": 2e-05,
"loss": 1.4542,
"step": 86
},
{
"epoch": 2.5588235294117645,
"grad_norm": 84.24452209472656,
"learning_rate": 2e-05,
"loss": 1.523,
"step": 87
},
{
"epoch": 2.588235294117647,
"grad_norm": 125.50318145751953,
"learning_rate": 2e-05,
"loss": 2.8231,
"step": 88
},
{
"epoch": 2.6176470588235294,
"grad_norm": 41.62876510620117,
"learning_rate": 2e-05,
"loss": 1.5234,
"step": 89
},
{
"epoch": 2.6470588235294117,
"grad_norm": 204.7890167236328,
"learning_rate": 2e-05,
"loss": 1.3231,
"step": 90
},
{
"epoch": 2.6764705882352944,
"grad_norm": 336.05963134765625,
"learning_rate": 2e-05,
"loss": 1.5473,
"step": 91
},
{
"epoch": 2.7058823529411766,
"grad_norm": 167.6499786376953,
"learning_rate": 2e-05,
"loss": 2.2598,
"step": 92
},
{
"epoch": 2.735294117647059,
"grad_norm": 39.165950775146484,
"learning_rate": 2e-05,
"loss": 0.5955,
"step": 93
},
{
"epoch": 2.764705882352941,
"grad_norm": 103.95963287353516,
"learning_rate": 2e-05,
"loss": 1.3233,
"step": 94
},
{
"epoch": 2.7941176470588234,
"grad_norm": 141.0578155517578,
"learning_rate": 2e-05,
"loss": 2.0557,
"step": 95
},
{
"epoch": 2.8235294117647056,
"grad_norm": 79.57919311523438,
"learning_rate": 2e-05,
"loss": 2.6418,
"step": 96
},
{
"epoch": 2.8529411764705883,
"grad_norm": 167.03134155273438,
"learning_rate": 2e-05,
"loss": 2.3345,
"step": 97
},
{
"epoch": 2.8823529411764706,
"grad_norm": 1838.4185791015625,
"learning_rate": 2e-05,
"loss": 3.5249,
"step": 98
},
{
"epoch": 2.911764705882353,
"grad_norm": 724.2003173828125,
"learning_rate": 2e-05,
"loss": 1.6495,
"step": 99
},
{
"epoch": 2.9411764705882355,
"grad_norm": 703.0057373046875,
"learning_rate": 2e-05,
"loss": 2.7896,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 331537700093952.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}