{
"best_metric": 1.42880117893219,
"best_model_checkpoint": "outputs/checkpoint-828",
"epoch": 17.878542510121456,
"eval_steps": 500,
"global_step": 828,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4318488529014845,
"grad_norm": 0.9647712111473083,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.3464,
"step": 20
},
{
"epoch": 0.863697705802969,
"grad_norm": 0.7620949149131775,
"learning_rate": 4.000000000000001e-06,
"loss": 2.3457,
"step": 40
},
{
"epoch": 0.9932523616734144,
"eval_loss": 2.172492504119873,
"eval_runtime": 12.5552,
"eval_samples_per_second": 29.629,
"eval_steps_per_second": 3.743,
"step": 46
},
{
"epoch": 1.2955465587044535,
"grad_norm": 0.7905983924865723,
"learning_rate": 6e-06,
"loss": 2.3256,
"step": 60
},
{
"epoch": 1.7273954116059378,
"grad_norm": 0.8448758721351624,
"learning_rate": 8.000000000000001e-06,
"loss": 2.1999,
"step": 80
},
{
"epoch": 1.9865047233468287,
"eval_loss": 1.9573273658752441,
"eval_runtime": 12.2982,
"eval_samples_per_second": 30.248,
"eval_steps_per_second": 3.822,
"step": 92
},
{
"epoch": 2.1592442645074224,
"grad_norm": 0.665162980556488,
"learning_rate": 1e-05,
"loss": 2.1343,
"step": 100
},
{
"epoch": 2.591093117408907,
"grad_norm": 0.5304886698722839,
"learning_rate": 9.981389099710132e-06,
"loss": 2.0325,
"step": 120
},
{
"epoch": 2.979757085020243,
"eval_loss": 1.8101614713668823,
"eval_runtime": 12.0422,
"eval_samples_per_second": 30.891,
"eval_steps_per_second": 3.903,
"step": 138
},
{
"epoch": 3.0229419703103915,
"grad_norm": 0.6249691247940063,
"learning_rate": 9.925694945084369e-06,
"loss": 2.038,
"step": 140
},
{
"epoch": 3.454790823211876,
"grad_norm": 0.549341082572937,
"learning_rate": 9.833332143466099e-06,
"loss": 1.9539,
"step": 160
},
{
"epoch": 3.8866396761133606,
"grad_norm": 0.5345794558525085,
"learning_rate": 9.704988276811883e-06,
"loss": 1.9295,
"step": 180
},
{
"epoch": 3.9946018893387314,
"eval_loss": 1.7296996116638184,
"eval_runtime": 12.0445,
"eval_samples_per_second": 30.885,
"eval_steps_per_second": 3.902,
"step": 185
},
{
"epoch": 4.318488529014845,
"grad_norm": 0.6139589548110962,
"learning_rate": 9.54161878308377e-06,
"loss": 1.8949,
"step": 200
},
{
"epoch": 4.75033738191633,
"grad_norm": 0.6864662766456604,
"learning_rate": 9.344439843625034e-06,
"loss": 1.8976,
"step": 220
},
{
"epoch": 4.987854251012146,
"eval_loss": 1.6704739332199097,
"eval_runtime": 12.0592,
"eval_samples_per_second": 30.848,
"eval_steps_per_second": 3.897,
"step": 231
},
{
"epoch": 5.182186234817814,
"grad_norm": 0.5886803865432739,
"learning_rate": 9.114919329468283e-06,
"loss": 1.8686,
"step": 240
},
{
"epoch": 5.614035087719298,
"grad_norm": 0.6739407181739807,
"learning_rate": 8.854765873974898e-06,
"loss": 1.8265,
"step": 260
},
{
"epoch": 5.98110661268556,
"eval_loss": 1.6191655397415161,
"eval_runtime": 12.0582,
"eval_samples_per_second": 30.85,
"eval_steps_per_second": 3.898,
"step": 277
},
{
"epoch": 6.045883940620783,
"grad_norm": 0.7047673463821411,
"learning_rate": 8.565916153152982e-06,
"loss": 1.8201,
"step": 280
},
{
"epoch": 6.477732793522267,
"grad_norm": 0.6909105777740479,
"learning_rate": 8.250520468343722e-06,
"loss": 1.7656,
"step": 300
},
{
"epoch": 6.909581646423752,
"grad_norm": 0.7460944056510925,
"learning_rate": 7.910926738603855e-06,
"loss": 1.8038,
"step": 320
},
{
"epoch": 6.995951417004049,
"eval_loss": 1.5762709379196167,
"eval_runtime": 12.0686,
"eval_samples_per_second": 30.824,
"eval_steps_per_second": 3.894,
"step": 324
},
{
"epoch": 7.341430499325236,
"grad_norm": 0.8885225653648376,
"learning_rate": 7.5496630219506805e-06,
"loss": 1.7428,
"step": 340
},
{
"epoch": 7.77327935222672,
"grad_norm": 0.771353542804718,
"learning_rate": 7.169418695587791e-06,
"loss": 1.7608,
"step": 360
},
{
"epoch": 7.989203778677463,
"eval_loss": 1.5418975353240967,
"eval_runtime": 12.0558,
"eval_samples_per_second": 30.857,
"eval_steps_per_second": 3.899,
"step": 370
},
{
"epoch": 8.205128205128204,
"grad_norm": 0.7849822640419006,
"learning_rate": 6.773024435212678e-06,
"loss": 1.7396,
"step": 380
},
{
"epoch": 8.63697705802969,
"grad_norm": 0.997053325176239,
"learning_rate": 6.363431142447469e-06,
"loss": 1.7123,
"step": 400
},
{
"epoch": 8.982456140350877,
"eval_loss": 1.5107336044311523,
"eval_runtime": 12.0487,
"eval_samples_per_second": 30.875,
"eval_steps_per_second": 3.901,
"step": 416
},
{
"epoch": 9.068825910931174,
"grad_norm": 0.8057714104652405,
"learning_rate": 5.943687977264584e-06,
"loss": 1.7143,
"step": 420
},
{
"epoch": 9.50067476383266,
"grad_norm": 1.0519981384277344,
"learning_rate": 5.51691965894185e-06,
"loss": 1.6754,
"step": 440
},
{
"epoch": 9.932523616734143,
"grad_norm": 0.8790378570556641,
"learning_rate": 5.0863032045269435e-06,
"loss": 1.7078,
"step": 460
},
{
"epoch": 9.997300944669366,
"eval_loss": 1.4861911535263062,
"eval_runtime": 12.0616,
"eval_samples_per_second": 30.842,
"eval_steps_per_second": 3.897,
"step": 463
},
{
"epoch": 10.364372469635628,
"grad_norm": 0.9965023994445801,
"learning_rate": 4.6550442779783755e-06,
"loss": 1.6729,
"step": 480
},
{
"epoch": 10.796221322537113,
"grad_norm": 0.9378894567489624,
"learning_rate": 4.226353326048594e-06,
"loss": 1.6375,
"step": 500
},
{
"epoch": 10.99055330634278,
"eval_loss": 1.4666177034378052,
"eval_runtime": 12.0581,
"eval_samples_per_second": 30.851,
"eval_steps_per_second": 3.898,
"step": 509
},
{
"epoch": 11.228070175438596,
"grad_norm": 1.088391900062561,
"learning_rate": 3.803421678562213e-06,
"loss": 1.6817,
"step": 520
},
{
"epoch": 11.65991902834008,
"grad_norm": 1.2648347616195679,
"learning_rate": 3.389397791007548e-06,
"loss": 1.6618,
"step": 540
},
{
"epoch": 11.983805668016194,
"eval_loss": 1.4524654150009155,
"eval_runtime": 12.0523,
"eval_samples_per_second": 30.866,
"eval_steps_per_second": 3.9,
"step": 555
},
{
"epoch": 12.091767881241566,
"grad_norm": 1.1444419622421265,
"learning_rate": 2.9873638063001633e-06,
"loss": 1.637,
"step": 560
},
{
"epoch": 12.523616734143049,
"grad_norm": 0.8730684518814087,
"learning_rate": 2.6003126102010696e-06,
"loss": 1.6219,
"step": 580
},
{
"epoch": 12.955465587044534,
"grad_norm": 1.3365355730056763,
"learning_rate": 2.2311255511973347e-06,
"loss": 1.629,
"step": 600
},
{
"epoch": 12.998650472334683,
"eval_loss": 1.4416956901550293,
"eval_runtime": 12.0642,
"eval_samples_per_second": 30.835,
"eval_steps_per_second": 3.896,
"step": 602
},
{
"epoch": 13.387314439946019,
"grad_norm": 1.1396286487579346,
"learning_rate": 1.8825509907063328e-06,
"loss": 1.5987,
"step": 620
},
{
"epoch": 13.819163292847504,
"grad_norm": 1.1069096326828003,
"learning_rate": 1.557183843283614e-06,
"loss": 1.6634,
"step": 640
},
{
"epoch": 13.991902834008098,
"eval_loss": 1.4354872703552246,
"eval_runtime": 12.0604,
"eval_samples_per_second": 30.845,
"eval_steps_per_second": 3.897,
"step": 648
},
{
"epoch": 14.251012145748987,
"grad_norm": 0.929169237613678,
"learning_rate": 1.257446259144494e-06,
"loss": 1.6245,
"step": 660
},
{
"epoch": 14.682860998650472,
"grad_norm": 1.1899679899215698,
"learning_rate": 9.85569592805588e-07,
"loss": 1.6097,
"step": 680
},
{
"epoch": 14.98515519568151,
"eval_loss": 1.4315454959869385,
"eval_runtime": 12.0693,
"eval_samples_per_second": 30.822,
"eval_steps_per_second": 3.894,
"step": 694
},
{
"epoch": 15.114709851551957,
"grad_norm": 1.0818545818328857,
"learning_rate": 7.435777920782444e-07,
"loss": 1.6349,
"step": 700
},
{
"epoch": 15.54655870445344,
"grad_norm": 1.077989935874939,
"learning_rate": 5.332723310721855e-07,
"loss": 1.6035,
"step": 720
},
{
"epoch": 15.978407557354926,
"grad_norm": 1.1727399826049805,
"learning_rate": 3.5621879937348836e-07,
"loss": 1.6489,
"step": 740
},
{
"epoch": 16.0,
"eval_loss": 1.4295289516448975,
"eval_runtime": 12.0333,
"eval_samples_per_second": 30.914,
"eval_steps_per_second": 3.906,
"step": 741
},
{
"epoch": 16.41025641025641,
"grad_norm": 1.0742014646530151,
"learning_rate": 2.137352472319215e-07,
"loss": 1.5994,
"step": 760
},
{
"epoch": 16.842105263157894,
"grad_norm": 1.0748684406280518,
"learning_rate": 1.0688237352022346e-07,
"loss": 1.6251,
"step": 780
},
{
"epoch": 16.993252361673413,
"eval_loss": 1.4288655519485474,
"eval_runtime": 11.987,
"eval_samples_per_second": 31.034,
"eval_steps_per_second": 3.921,
"step": 787
},
{
"epoch": 17.27395411605938,
"grad_norm": 1.1126863956451416,
"learning_rate": 3.645562950973014e-08,
"loss": 1.6199,
"step": 800
},
{
"epoch": 17.705802968960864,
"grad_norm": 1.1560289859771729,
"learning_rate": 2.9792972446479605e-09,
"loss": 1.5742,
"step": 820
},
{
"epoch": 17.878542510121456,
"eval_loss": 1.42880117893219,
"eval_runtime": 11.9734,
"eval_samples_per_second": 31.069,
"eval_steps_per_second": 3.925,
"step": 828
}
],
"logging_steps": 20,
"max_steps": 828,
"num_input_tokens_seen": 0,
"num_train_epochs": 18,
"save_steps": 500,
"total_flos": 7.361856849884774e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}