jfranklin-foundry's picture
Upload folder using huggingface_hub
2f4f33a verified
{
"best_metric": 1.0477440357208252,
"best_model_checkpoint": "outputs/checkpoint-555",
"epoch": 17.98650472334683,
"eval_steps": 500,
"global_step": 833,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4318488529014845,
"grad_norm": 0.8887946605682373,
"learning_rate": 6e-06,
"loss": 2.3403,
"step": 20
},
{
"epoch": 0.863697705802969,
"grad_norm": 0.6425972580909729,
"learning_rate": 1.2e-05,
"loss": 2.296,
"step": 40
},
{
"epoch": 0.9932523616734144,
"eval_loss": 2.043026924133301,
"eval_runtime": 11.9177,
"eval_samples_per_second": 31.214,
"eval_steps_per_second": 3.944,
"step": 46
},
{
"epoch": 1.2955465587044535,
"grad_norm": 0.5528222918510437,
"learning_rate": 1.8e-05,
"loss": 2.1948,
"step": 60
},
{
"epoch": 1.7273954116059378,
"grad_norm": 0.5118728876113892,
"learning_rate": 2.4e-05,
"loss": 2.0099,
"step": 80
},
{
"epoch": 1.9865047233468287,
"eval_loss": 1.7544503211975098,
"eval_runtime": 11.9277,
"eval_samples_per_second": 31.188,
"eval_steps_per_second": 3.94,
"step": 92
},
{
"epoch": 2.1592442645074224,
"grad_norm": 0.6005122661590576,
"learning_rate": 3e-05,
"loss": 1.9526,
"step": 100
},
{
"epoch": 2.591093117408907,
"grad_norm": 0.580225944519043,
"learning_rate": 2.9973151946516027e-05,
"loss": 1.8471,
"step": 120
},
{
"epoch": 2.979757085020243,
"eval_loss": 1.5811642408370972,
"eval_runtime": 11.9279,
"eval_samples_per_second": 31.187,
"eval_steps_per_second": 3.94,
"step": 138
},
{
"epoch": 3.0229419703103915,
"grad_norm": 0.9076627492904663,
"learning_rate": 2.989270389512756e-05,
"loss": 1.8369,
"step": 140
},
{
"epoch": 3.454790823211876,
"grad_norm": 0.9181211590766907,
"learning_rate": 2.9758943828979444e-05,
"loss": 1.7294,
"step": 160
},
{
"epoch": 3.8866396761133606,
"grad_norm": 0.953004002571106,
"learning_rate": 2.957235057439301e-05,
"loss": 1.6939,
"step": 180
},
{
"epoch": 3.9946018893387314,
"eval_loss": 1.4385555982589722,
"eval_runtime": 11.9317,
"eval_samples_per_second": 31.177,
"eval_steps_per_second": 3.939,
"step": 185
},
{
"epoch": 4.318488529014845,
"grad_norm": 1.2283871173858643,
"learning_rate": 2.9333592086792113e-05,
"loss": 1.6026,
"step": 200
},
{
"epoch": 4.75033738191633,
"grad_norm": 1.482079267501831,
"learning_rate": 2.904352305959606e-05,
"loss": 1.5949,
"step": 220
},
{
"epoch": 4.987854251012146,
"eval_loss": 1.3210583925247192,
"eval_runtime": 11.9364,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 3.938,
"step": 231
},
{
"epoch": 5.182186234817814,
"grad_norm": 1.305906057357788,
"learning_rate": 2.8703181864639013e-05,
"loss": 1.5484,
"step": 240
},
{
"epoch": 5.614035087719298,
"grad_norm": 1.6025702953338623,
"learning_rate": 2.8313786835068314e-05,
"loss": 1.4699,
"step": 260
},
{
"epoch": 5.98110661268556,
"eval_loss": 1.2309151887893677,
"eval_runtime": 11.9213,
"eval_samples_per_second": 31.205,
"eval_steps_per_second": 3.943,
"step": 277
},
{
"epoch": 6.045883940620783,
"grad_norm": 1.6253570318222046,
"learning_rate": 2.7876731904027994e-05,
"loss": 1.4256,
"step": 280
},
{
"epoch": 6.477732793522267,
"grad_norm": 1.6080800294876099,
"learning_rate": 2.7393581614739924e-05,
"loss": 1.3346,
"step": 300
},
{
"epoch": 6.909581646423752,
"grad_norm": 1.8431881666183472,
"learning_rate": 2.6866065519845124e-05,
"loss": 1.3798,
"step": 320
},
{
"epoch": 6.995951417004049,
"eval_loss": 1.1634759902954102,
"eval_runtime": 11.927,
"eval_samples_per_second": 31.19,
"eval_steps_per_second": 3.941,
"step": 324
},
{
"epoch": 7.341430499325236,
"grad_norm": 2.2968621253967285,
"learning_rate": 2.6296071990054167e-05,
"loss": 1.2827,
"step": 340
},
{
"epoch": 7.77327935222672,
"grad_norm": 2.0731184482574463,
"learning_rate": 2.5685641454270172e-05,
"loss": 1.2972,
"step": 360
},
{
"epoch": 7.989203778677463,
"eval_loss": 1.1176676750183105,
"eval_runtime": 11.9385,
"eval_samples_per_second": 31.16,
"eval_steps_per_second": 3.937,
"step": 370
},
{
"epoch": 8.205128205128204,
"grad_norm": 2.328958749771118,
"learning_rate": 2.5036959095382875e-05,
"loss": 1.2416,
"step": 380
},
{
"epoch": 8.63697705802969,
"grad_norm": 2.6954452991485596,
"learning_rate": 2.4352347027881003e-05,
"loss": 1.2072,
"step": 400
},
{
"epoch": 8.982456140350877,
"eval_loss": 1.084679365158081,
"eval_runtime": 11.9276,
"eval_samples_per_second": 31.188,
"eval_steps_per_second": 3.94,
"step": 416
},
{
"epoch": 9.068825910931174,
"grad_norm": 2.3439438343048096,
"learning_rate": 2.3634255985285104e-05,
"loss": 1.2027,
"step": 420
},
{
"epoch": 9.50067476383266,
"grad_norm": 2.3041927814483643,
"learning_rate": 2.288525654715757e-05,
"loss": 1.135,
"step": 440
},
{
"epoch": 9.932523616734143,
"grad_norm": 2.1704723834991455,
"learning_rate": 2.210802993709498e-05,
"loss": 1.1612,
"step": 460
},
{
"epoch": 9.997300944669366,
"eval_loss": 1.0629429817199707,
"eval_runtime": 11.909,
"eval_samples_per_second": 31.237,
"eval_steps_per_second": 3.947,
"step": 463
},
{
"epoch": 10.364372469635628,
"grad_norm": 2.9438016414642334,
"learning_rate": 2.1305358424643484e-05,
"loss": 1.0958,
"step": 480
},
{
"epoch": 10.796221322537113,
"grad_norm": 2.8956775665283203,
"learning_rate": 2.0480115365495928e-05,
"loss": 1.0673,
"step": 500
},
{
"epoch": 10.99055330634278,
"eval_loss": 1.0595099925994873,
"eval_runtime": 11.9097,
"eval_samples_per_second": 31.235,
"eval_steps_per_second": 3.946,
"step": 509
},
{
"epoch": 11.228070175438596,
"grad_norm": 2.5965044498443604,
"learning_rate": 1.963525491562421e-05,
"loss": 1.1122,
"step": 520
},
{
"epoch": 11.65991902834008,
"grad_norm": 3.310291290283203,
"learning_rate": 1.877380145616763e-05,
"loss": 1.0611,
"step": 540
},
{
"epoch": 11.983805668016194,
"eval_loss": 1.0477440357208252,
"eval_runtime": 11.9101,
"eval_samples_per_second": 31.234,
"eval_steps_per_second": 3.946,
"step": 555
},
{
"epoch": 12.091767881241566,
"grad_norm": 2.768397569656372,
"learning_rate": 1.78988387669333e-05,
"loss": 1.0279,
"step": 560
},
{
"epoch": 12.523616734143049,
"grad_norm": 2.6344711780548096,
"learning_rate": 1.7013498987264832e-05,
"loss": 1.0072,
"step": 580
},
{
"epoch": 12.955465587044534,
"grad_norm": 3.8299720287323,
"learning_rate": 1.6120951403796367e-05,
"loss": 0.9972,
"step": 600
},
{
"epoch": 12.998650472334683,
"eval_loss": 1.049790859222412,
"eval_runtime": 11.9084,
"eval_samples_per_second": 31.239,
"eval_steps_per_second": 3.947,
"step": 602
},
{
"epoch": 13.387314439946019,
"grad_norm": 2.735280990600586,
"learning_rate": 1.5224391105228956e-05,
"loss": 0.9579,
"step": 620
},
{
"epoch": 13.819163292847504,
"grad_norm": 2.9928438663482666,
"learning_rate": 1.4327027544742281e-05,
"loss": 1.0177,
"step": 640
},
{
"epoch": 13.991902834008098,
"eval_loss": 1.0596399307250977,
"eval_runtime": 11.9225,
"eval_samples_per_second": 31.201,
"eval_steps_per_second": 3.942,
"step": 648
},
{
"epoch": 14.251012145748987,
"grad_norm": 2.9771831035614014,
"learning_rate": 1.3432073050985201e-05,
"loss": 0.9815,
"step": 660
},
{
"epoch": 14.682860998650472,
"grad_norm": 2.8720908164978027,
"learning_rate": 1.2542731328772936e-05,
"loss": 0.9378,
"step": 680
},
{
"epoch": 14.98515519568151,
"eval_loss": 1.0685012340545654,
"eval_runtime": 11.9562,
"eval_samples_per_second": 31.114,
"eval_steps_per_second": 3.931,
"step": 694
},
{
"epoch": 15.114709851551957,
"grad_norm": 3.394012212753296,
"learning_rate": 1.1662185990655285e-05,
"loss": 0.9611,
"step": 700
},
{
"epoch": 15.54655870445344,
"grad_norm": 3.343366861343384,
"learning_rate": 1.079358916040996e-05,
"loss": 0.9022,
"step": 720
},
{
"epoch": 15.978407557354926,
"grad_norm": 3.1342532634735107,
"learning_rate": 9.940050189257552e-06,
"loss": 0.9787,
"step": 740
},
{
"epoch": 16.0,
"eval_loss": 1.0815356969833374,
"eval_runtime": 11.9572,
"eval_samples_per_second": 31.111,
"eval_steps_per_second": 3.931,
"step": 741
},
{
"epoch": 16.41025641025641,
"grad_norm": 2.9826908111572266,
"learning_rate": 9.104624525191147e-06,
"loss": 0.8726,
"step": 760
},
{
"epoch": 16.842105263157894,
"grad_norm": 3.0531911849975586,
"learning_rate": 8.290302775265509e-06,
"loss": 0.9296,
"step": 780
},
{
"epoch": 16.993252361673413,
"eval_loss": 1.0938704013824463,
"eval_runtime": 12.0533,
"eval_samples_per_second": 30.863,
"eval_steps_per_second": 3.899,
"step": 787
},
{
"epoch": 17.27395411605938,
"grad_norm": 3.4158248901367188,
"learning_rate": 7.500000000000004e-06,
"loss": 0.9234,
"step": 800
},
{
"epoch": 17.705802968960864,
"grad_norm": 3.574179172515869,
"learning_rate": 6.736545278218464e-06,
"loss": 0.8333,
"step": 820
},
{
"epoch": 17.98650472334683,
"eval_loss": 1.0902316570281982,
"eval_runtime": 12.0644,
"eval_samples_per_second": 30.835,
"eval_steps_per_second": 3.896,
"step": 833
}
],
"logging_steps": 20,
"max_steps": 1150,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"total_flos": 7.4125817360597e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}