jfranklin-foundry's picture
Upload folder using huggingface_hub
2f4f33a verified
{
"best_metric": 1.0532242059707642,
"best_model_checkpoint": "outputs/checkpoint-648",
"epoch": 19.865047233468285,
"eval_steps": 500,
"global_step": 920,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4318488529014845,
"grad_norm": 0.8586685061454773,
"learning_rate": 6e-06,
"loss": 2.3406,
"step": 20
},
{
"epoch": 0.863697705802969,
"grad_norm": 0.6262326240539551,
"learning_rate": 1.2e-05,
"loss": 2.2977,
"step": 40
},
{
"epoch": 0.9932523616734144,
"eval_loss": 2.046743154525757,
"eval_runtime": 11.8178,
"eval_samples_per_second": 31.478,
"eval_steps_per_second": 3.977,
"step": 46
},
{
"epoch": 1.2955465587044535,
"grad_norm": 0.5441118478775024,
"learning_rate": 1.8e-05,
"loss": 2.1983,
"step": 60
},
{
"epoch": 1.7273954116059378,
"grad_norm": 0.5027536153793335,
"learning_rate": 2.4e-05,
"loss": 2.0126,
"step": 80
},
{
"epoch": 1.9865047233468287,
"eval_loss": 1.756650686264038,
"eval_runtime": 11.8327,
"eval_samples_per_second": 31.438,
"eval_steps_per_second": 3.972,
"step": 92
},
{
"epoch": 2.1592442645074224,
"grad_norm": 0.5985815525054932,
"learning_rate": 3e-05,
"loss": 1.9539,
"step": 100
},
{
"epoch": 2.591093117408907,
"grad_norm": 0.5876953601837158,
"learning_rate": 2.9955987017756105e-05,
"loss": 1.8487,
"step": 120
},
{
"epoch": 2.979757085020243,
"eval_loss": 1.5829453468322754,
"eval_runtime": 11.8342,
"eval_samples_per_second": 31.434,
"eval_steps_per_second": 3.972,
"step": 138
},
{
"epoch": 3.0229419703103915,
"grad_norm": 0.9397473931312561,
"learning_rate": 2.982420635670523e-05,
"loss": 1.838,
"step": 140
},
{
"epoch": 3.454790823211876,
"grad_norm": 0.9532083868980408,
"learning_rate": 2.9605431358166687e-05,
"loss": 1.7307,
"step": 160
},
{
"epoch": 3.8866396761133606,
"grad_norm": 0.9994444847106934,
"learning_rate": 2.9300945880823957e-05,
"loss": 1.6956,
"step": 180
},
{
"epoch": 3.9946018893387314,
"eval_loss": 1.4417527914047241,
"eval_runtime": 11.8388,
"eval_samples_per_second": 31.422,
"eval_steps_per_second": 3.97,
"step": 185
},
{
"epoch": 4.318488529014845,
"grad_norm": 1.2850545644760132,
"learning_rate": 2.8912536766531424e-05,
"loss": 1.6058,
"step": 200
},
{
"epoch": 4.75033738191633,
"grad_norm": 1.5529141426086426,
"learning_rate": 2.8442483354415835e-05,
"loss": 1.5981,
"step": 220
},
{
"epoch": 4.987854251012146,
"eval_loss": 1.3265655040740967,
"eval_runtime": 11.8379,
"eval_samples_per_second": 31.424,
"eval_steps_per_second": 3.97,
"step": 231
},
{
"epoch": 5.182186234817814,
"grad_norm": 1.335929036140442,
"learning_rate": 2.789354410480802e-05,
"loss": 1.5531,
"step": 240
},
{
"epoch": 5.614035087719298,
"grad_norm": 1.6453096866607666,
"learning_rate": 2.7268940411500768e-05,
"loss": 1.4762,
"step": 260
},
{
"epoch": 5.98110661268556,
"eval_loss": 1.2389419078826904,
"eval_runtime": 11.8303,
"eval_samples_per_second": 31.445,
"eval_steps_per_second": 3.973,
"step": 277
},
{
"epoch": 6.045883940620783,
"grad_norm": 1.6546735763549805,
"learning_rate": 2.6572337697329145e-05,
"loss": 1.4335,
"step": 280
},
{
"epoch": 6.477732793522267,
"grad_norm": 1.6672505140304565,
"learning_rate": 2.5807823904011803e-05,
"loss": 1.3456,
"step": 300
},
{
"epoch": 6.909581646423752,
"grad_norm": 1.872035264968872,
"learning_rate": 2.497988550248348e-05,
"loss": 1.3907,
"step": 320
},
{
"epoch": 6.995951417004049,
"eval_loss": 1.1727797985076904,
"eval_runtime": 11.8334,
"eval_samples_per_second": 31.436,
"eval_steps_per_second": 3.972,
"step": 324
},
{
"epoch": 7.341430499325236,
"grad_norm": 2.2928590774536133,
"learning_rate": 2.4093381164499572e-05,
"loss": 1.2972,
"step": 340
},
{
"epoch": 7.77327935222672,
"grad_norm": 2.046597957611084,
"learning_rate": 2.315351325001832e-05,
"loss": 1.3121,
"step": 360
},
{
"epoch": 7.989203778677463,
"eval_loss": 1.1317445039749146,
"eval_runtime": 11.8354,
"eval_samples_per_second": 31.431,
"eval_steps_per_second": 3.971,
"step": 370
},
{
"epoch": 8.205128205128204,
"grad_norm": 2.3657939434051514,
"learning_rate": 2.2165797277683945e-05,
"loss": 1.2587,
"step": 380
},
{
"epoch": 8.63697705802969,
"grad_norm": 2.6628217697143555,
"learning_rate": 2.11360295575701e-05,
"loss": 1.2259,
"step": 400
},
{
"epoch": 8.982456140350877,
"eval_loss": 1.089838981628418,
"eval_runtime": 11.8331,
"eval_samples_per_second": 31.437,
"eval_steps_per_second": 3.972,
"step": 416
},
{
"epoch": 9.068825910931174,
"grad_norm": 2.126490354537964,
"learning_rate": 2.007025317612754e-05,
"loss": 1.2231,
"step": 420
},
{
"epoch": 9.50067476383266,
"grad_norm": 2.33013653755188,
"learning_rate": 1.897472253294993e-05,
"loss": 1.1601,
"step": 440
},
{
"epoch": 9.932523616734143,
"grad_norm": 2.1320955753326416,
"learning_rate": 1.7855866637470027e-05,
"loss": 1.1858,
"step": 460
},
{
"epoch": 9.997300944669366,
"eval_loss": 1.0753756761550903,
"eval_runtime": 11.8356,
"eval_samples_per_second": 31.431,
"eval_steps_per_second": 3.971,
"step": 463
},
{
"epoch": 10.364372469635628,
"grad_norm": 2.7270896434783936,
"learning_rate": 1.6720251380976008e-05,
"loss": 1.1264,
"step": 480
},
{
"epoch": 10.796221322537113,
"grad_norm": 2.817499876022339,
"learning_rate": 1.557454100535053e-05,
"loss": 1.0953,
"step": 500
},
{
"epoch": 10.99055330634278,
"eval_loss": 1.0628381967544556,
"eval_runtime": 11.8346,
"eval_samples_per_second": 31.433,
"eval_steps_per_second": 3.971,
"step": 509
},
{
"epoch": 11.228070175438596,
"grad_norm": 2.4983890056610107,
"learning_rate": 1.442545899464947e-05,
"loss": 1.1448,
"step": 520
},
{
"epoch": 11.65991902834008,
"grad_norm": 3.187267303466797,
"learning_rate": 1.3279748619023994e-05,
"loss": 1.0969,
"step": 540
},
{
"epoch": 11.983805668016194,
"eval_loss": 1.0652039051055908,
"eval_runtime": 11.8314,
"eval_samples_per_second": 31.442,
"eval_steps_per_second": 3.972,
"step": 555
},
{
"epoch": 12.091767881241566,
"grad_norm": 2.7236690521240234,
"learning_rate": 1.2144133362529972e-05,
"loss": 1.0659,
"step": 560
},
{
"epoch": 12.523616734143049,
"grad_norm": 2.291569232940674,
"learning_rate": 1.1025277467050077e-05,
"loss": 1.0509,
"step": 580
},
{
"epoch": 12.955465587044534,
"grad_norm": 3.7598021030426025,
"learning_rate": 9.929746823872461e-06,
"loss": 1.0398,
"step": 600
},
{
"epoch": 12.998650472334683,
"eval_loss": 1.0660185813903809,
"eval_runtime": 11.8359,
"eval_samples_per_second": 31.43,
"eval_steps_per_second": 3.971,
"step": 602
},
{
"epoch": 13.387314439946019,
"grad_norm": 2.636166572570801,
"learning_rate": 8.863970442429903e-06,
"loss": 1.0089,
"step": 620
},
{
"epoch": 13.819163292847504,
"grad_norm": 2.8337323665618896,
"learning_rate": 7.834202722316054e-06,
"loss": 1.0692,
"step": 640
},
{
"epoch": 13.991902834008098,
"eval_loss": 1.0532242059707642,
"eval_runtime": 11.8398,
"eval_samples_per_second": 31.419,
"eval_steps_per_second": 3.97,
"step": 648
},
{
"epoch": 14.251012145748987,
"grad_norm": 3.053009033203125,
"learning_rate": 6.846486749981685e-06,
"loss": 1.0366,
"step": 660
},
{
"epoch": 14.682860998650472,
"grad_norm": 2.598278522491455,
"learning_rate": 5.906618835500434e-06,
"loss": 0.9994,
"step": 680
},
{
"epoch": 14.98515519568151,
"eval_loss": 1.0662660598754883,
"eval_runtime": 11.8811,
"eval_samples_per_second": 31.31,
"eval_steps_per_second": 3.956,
"step": 694
},
{
"epoch": 15.114709851551957,
"grad_norm": 3.115391492843628,
"learning_rate": 5.0201144975165215e-06,
"loss": 1.0248,
"step": 700
},
{
"epoch": 15.54655870445344,
"grad_norm": 2.9552927017211914,
"learning_rate": 4.192176095988196e-06,
"loss": 0.9757,
"step": 720
},
{
"epoch": 15.978407557354926,
"grad_norm": 2.836552143096924,
"learning_rate": 3.4276623026708552e-06,
"loss": 1.0462,
"step": 740
},
{
"epoch": 16.0,
"eval_loss": 1.0600887537002563,
"eval_runtime": 11.8762,
"eval_samples_per_second": 31.323,
"eval_steps_per_second": 3.957,
"step": 741
},
{
"epoch": 16.41025641025641,
"grad_norm": 2.7339868545532227,
"learning_rate": 2.7310595884992356e-06,
"loss": 0.956,
"step": 760
},
{
"epoch": 16.842105263157894,
"grad_norm": 2.860797882080078,
"learning_rate": 2.1064558951919854e-06,
"loss": 1.0121,
"step": 780
},
{
"epoch": 16.993252361673413,
"eval_loss": 1.067551851272583,
"eval_runtime": 11.9433,
"eval_samples_per_second": 31.147,
"eval_steps_per_second": 3.935,
"step": 787
},
{
"epoch": 17.27395411605938,
"grad_norm": 3.2689273357391357,
"learning_rate": 1.5575166455841678e-06,
"loss": 1.0109,
"step": 800
},
{
"epoch": 17.705802968960864,
"grad_norm": 2.9255502223968506,
"learning_rate": 1.0874632334685808e-06,
"loss": 0.926,
"step": 820
},
{
"epoch": 17.98650472334683,
"eval_loss": 1.0671662092208862,
"eval_runtime": 11.9354,
"eval_samples_per_second": 31.168,
"eval_steps_per_second": 3.938,
"step": 833
},
{
"epoch": 18.13765182186235,
"grad_norm": 2.799520492553711,
"learning_rate": 6.990541191760419e-07,
"loss": 1.015,
"step": 840
},
{
"epoch": 18.569500674763834,
"grad_norm": 3.2285776138305664,
"learning_rate": 3.945686418333155e-07,
"loss": 0.9512,
"step": 860
},
{
"epoch": 18.979757085020243,
"eval_loss": 1.0708719491958618,
"eval_runtime": 11.9397,
"eval_samples_per_second": 31.157,
"eval_steps_per_second": 3.936,
"step": 879
},
{
"epoch": 19.00134952766532,
"grad_norm": 2.990569591522217,
"learning_rate": 1.7579364329477376e-07,
"loss": 0.9571,
"step": 880
},
{
"epoch": 19.4331983805668,
"grad_norm": 3.1629343032836914,
"learning_rate": 4.401298224389338e-08,
"loss": 0.9524,
"step": 900
},
{
"epoch": 19.865047233468285,
"grad_norm": 3.00166392326355,
"learning_rate": 0.0,
"loss": 0.976,
"step": 920
},
{
"epoch": 19.865047233468285,
"eval_loss": 1.0710477828979492,
"eval_runtime": 11.9108,
"eval_samples_per_second": 31.232,
"eval_steps_per_second": 3.946,
"step": 920
}
],
"logging_steps": 20,
"max_steps": 920,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 8.174619616582042e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}