{
  "best_metric": 0.0,
  "best_model_checkpoint": "hiera_model/checkpoint-283",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 849,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0353356890459364,
      "grad_norm": 4.513326644897461,
      "learning_rate": 5.882352941176471e-06,
      "loss": 7.6619,
      "step": 10
    },
    {
      "epoch": 0.0706713780918728,
      "grad_norm": 5.347131729125977,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 7.6612,
      "step": 20
    },
    {
      "epoch": 0.10600706713780919,
      "grad_norm": 7.650573253631592,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 7.6148,
      "step": 30
    },
    {
      "epoch": 0.1413427561837456,
      "grad_norm": 6.892947196960449,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 7.532,
      "step": 40
    },
    {
      "epoch": 0.17667844522968199,
      "grad_norm": 7.388574600219727,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 7.4152,
      "step": 50
    },
    {
      "epoch": 0.21201413427561838,
      "grad_norm": 8.62385368347168,
      "learning_rate": 3.529411764705883e-05,
      "loss": 7.4114,
      "step": 60
    },
    {
      "epoch": 0.24734982332155478,
      "grad_norm": 10.373017311096191,
      "learning_rate": 4.11764705882353e-05,
      "loss": 7.3113,
      "step": 70
    },
    {
      "epoch": 0.2826855123674912,
      "grad_norm": 6.3279500007629395,
      "learning_rate": 4.705882352941177e-05,
      "loss": 7.2527,
      "step": 80
    },
    {
      "epoch": 0.31802120141342755,
      "grad_norm": 5.869943618774414,
      "learning_rate": 4.967277486910995e-05,
      "loss": 7.2645,
      "step": 90
    },
    {
      "epoch": 0.35335689045936397,
      "grad_norm": 5.0281453132629395,
      "learning_rate": 4.9018324607329844e-05,
      "loss": 7.1714,
      "step": 100
    },
    {
      "epoch": 0.38869257950530034,
      "grad_norm": 7.027573585510254,
      "learning_rate": 4.836387434554974e-05,
      "loss": 7.1634,
      "step": 110
    },
    {
      "epoch": 0.42402826855123676,
      "grad_norm": 4.754793167114258,
      "learning_rate": 4.770942408376964e-05,
      "loss": 7.1112,
      "step": 120
    },
    {
      "epoch": 0.45936395759717313,
      "grad_norm": 6.395844459533691,
      "learning_rate": 4.7054973821989526e-05,
      "loss": 7.0364,
      "step": 130
    },
    {
      "epoch": 0.49469964664310956,
      "grad_norm": 5.85114049911499,
      "learning_rate": 4.6400523560209424e-05,
      "loss": 7.0286,
      "step": 140
    },
    {
      "epoch": 0.5300353356890459,
      "grad_norm": 5.452080249786377,
      "learning_rate": 4.574607329842932e-05,
      "loss": 7.1043,
      "step": 150
    },
    {
      "epoch": 0.5653710247349824,
      "grad_norm": 5.287403583526611,
      "learning_rate": 4.5091623036649215e-05,
      "loss": 6.9891,
      "step": 160
    },
    {
      "epoch": 0.6007067137809188,
      "grad_norm": 5.20114803314209,
      "learning_rate": 4.4437172774869113e-05,
      "loss": 6.9994,
      "step": 170
    },
    {
      "epoch": 0.6360424028268551,
      "grad_norm": 5.050561428070068,
      "learning_rate": 4.3782722513089005e-05,
      "loss": 6.9812,
      "step": 180
    },
    {
      "epoch": 0.6713780918727915,
      "grad_norm": 4.857853412628174,
      "learning_rate": 4.3128272251308904e-05,
      "loss": 7.1296,
      "step": 190
    },
    {
      "epoch": 0.7067137809187279,
      "grad_norm": 4.78601598739624,
      "learning_rate": 4.24738219895288e-05,
      "loss": 7.0627,
      "step": 200
    },
    {
      "epoch": 0.7420494699646644,
      "grad_norm": 7.134556293487549,
      "learning_rate": 4.181937172774869e-05,
      "loss": 7.0056,
      "step": 210
    },
    {
      "epoch": 0.7773851590106007,
      "grad_norm": 5.701265811920166,
      "learning_rate": 4.1164921465968586e-05,
      "loss": 6.9878,
      "step": 220
    },
    {
      "epoch": 0.8127208480565371,
      "grad_norm": 5.249512672424316,
      "learning_rate": 4.0510471204188485e-05,
      "loss": 6.9481,
      "step": 230
    },
    {
      "epoch": 0.8480565371024735,
      "grad_norm": 5.31380558013916,
      "learning_rate": 3.985602094240838e-05,
      "loss": 6.9556,
      "step": 240
    },
    {
      "epoch": 0.8833922261484098,
      "grad_norm": 5.351413726806641,
      "learning_rate": 3.9201570680628275e-05,
      "loss": 6.9064,
      "step": 250
    },
    {
      "epoch": 0.9187279151943463,
      "grad_norm": 5.594610214233398,
      "learning_rate": 3.8547120418848174e-05,
      "loss": 6.934,
      "step": 260
    },
    {
      "epoch": 0.9540636042402827,
      "grad_norm": 8.065481185913086,
      "learning_rate": 3.7892670157068066e-05,
      "loss": 6.8646,
      "step": 270
    },
    {
      "epoch": 0.9893992932862191,
      "grad_norm": 6.58479642868042,
      "learning_rate": 3.7238219895287964e-05,
      "loss": 6.9557,
      "step": 280
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.0,
      "eval_loss": 8.122916221618652,
      "eval_runtime": 9.1372,
      "eval_samples_per_second": 10.944,
      "eval_steps_per_second": 0.219,
      "step": 283
    },
    {
      "epoch": 1.0247349823321554,
      "grad_norm": 9.340655326843262,
      "learning_rate": 3.6583769633507856e-05,
      "loss": 6.7977,
      "step": 290
    },
    {
      "epoch": 1.0600706713780919,
      "grad_norm": 5.579546928405762,
      "learning_rate": 3.592931937172775e-05,
      "loss": 6.6527,
      "step": 300
    },
    {
      "epoch": 1.0954063604240283,
      "grad_norm": 6.204797267913818,
      "learning_rate": 3.5274869109947647e-05,
      "loss": 6.7087,
      "step": 310
    },
    {
      "epoch": 1.1307420494699647,
      "grad_norm": 5.36489725112915,
      "learning_rate": 3.462041884816754e-05,
      "loss": 6.7543,
      "step": 320
    },
    {
      "epoch": 1.1660777385159011,
      "grad_norm": 5.784945964813232,
      "learning_rate": 3.396596858638744e-05,
      "loss": 6.8001,
      "step": 330
    },
    {
      "epoch": 1.2014134275618376,
      "grad_norm": 8.815863609313965,
      "learning_rate": 3.3311518324607336e-05,
      "loss": 6.7099,
      "step": 340
    },
    {
      "epoch": 1.2367491166077738,
      "grad_norm": 8.466463088989258,
      "learning_rate": 3.265706806282723e-05,
      "loss": 6.6982,
      "step": 350
    },
    {
      "epoch": 1.2720848056537102,
      "grad_norm": 6.15283727645874,
      "learning_rate": 3.2002617801047126e-05,
      "loss": 6.6768,
      "step": 360
    },
    {
      "epoch": 1.3074204946996466,
      "grad_norm": 7.654407501220703,
      "learning_rate": 3.134816753926702e-05,
      "loss": 6.734,
      "step": 370
    },
    {
      "epoch": 1.342756183745583,
      "grad_norm": 7.1313347816467285,
      "learning_rate": 3.069371727748691e-05,
      "loss": 6.7348,
      "step": 380
    },
    {
      "epoch": 1.3780918727915195,
      "grad_norm": 6.387876987457275,
      "learning_rate": 3.003926701570681e-05,
      "loss": 6.6646,
      "step": 390
    },
    {
      "epoch": 1.4134275618374559,
      "grad_norm": 6.7032294273376465,
      "learning_rate": 2.9384816753926704e-05,
      "loss": 6.664,
      "step": 400
    },
    {
      "epoch": 1.4487632508833923,
      "grad_norm": 8.315750122070312,
      "learning_rate": 2.87303664921466e-05,
      "loss": 6.6876,
      "step": 410
    },
    {
      "epoch": 1.4840989399293285,
      "grad_norm": 7.1473259925842285,
      "learning_rate": 2.8075916230366494e-05,
      "loss": 6.7499,
      "step": 420
    },
    {
      "epoch": 1.5194346289752652,
      "grad_norm": 8.586325645446777,
      "learning_rate": 2.742146596858639e-05,
      "loss": 6.625,
      "step": 430
    },
    {
      "epoch": 1.5547703180212014,
      "grad_norm": 6.852565765380859,
      "learning_rate": 2.6767015706806288e-05,
      "loss": 6.6484,
      "step": 440
    },
    {
      "epoch": 1.5901060070671378,
      "grad_norm": 7.314846992492676,
      "learning_rate": 2.6112565445026176e-05,
      "loss": 6.6476,
      "step": 450
    },
    {
      "epoch": 1.6254416961130742,
      "grad_norm": 8.678882598876953,
      "learning_rate": 2.545811518324607e-05,
      "loss": 6.6164,
      "step": 460
    },
    {
      "epoch": 1.6607773851590106,
      "grad_norm": 7.417328357696533,
      "learning_rate": 2.480366492146597e-05,
      "loss": 6.5872,
      "step": 470
    },
    {
      "epoch": 1.696113074204947,
      "grad_norm": 9.037349700927734,
      "learning_rate": 2.4149214659685865e-05,
      "loss": 6.6096,
      "step": 480
    },
    {
      "epoch": 1.7314487632508833,
      "grad_norm": 9.220462799072266,
      "learning_rate": 2.349476439790576e-05,
      "loss": 6.598,
      "step": 490
    },
    {
      "epoch": 1.76678445229682,
      "grad_norm": 10.437491416931152,
      "learning_rate": 2.2840314136125656e-05,
      "loss": 6.5736,
      "step": 500
    },
    {
      "epoch": 1.802120141342756,
      "grad_norm": 7.730395317077637,
      "learning_rate": 2.218586387434555e-05,
      "loss": 6.5369,
      "step": 510
    },
    {
      "epoch": 1.8374558303886925,
      "grad_norm": 9.4498872756958,
      "learning_rate": 2.1531413612565446e-05,
      "loss": 6.5505,
      "step": 520
    },
    {
      "epoch": 1.872791519434629,
      "grad_norm": 9.64456558227539,
      "learning_rate": 2.087696335078534e-05,
      "loss": 6.5647,
      "step": 530
    },
    {
      "epoch": 1.9081272084805654,
      "grad_norm": 7.969595909118652,
      "learning_rate": 2.0222513089005237e-05,
      "loss": 6.4623,
      "step": 540
    },
    {
      "epoch": 1.9434628975265018,
      "grad_norm": 13.205784797668457,
      "learning_rate": 1.9568062827225132e-05,
      "loss": 6.5204,
      "step": 550
    },
    {
      "epoch": 1.978798586572438,
      "grad_norm": 8.421072006225586,
      "learning_rate": 1.8913612565445027e-05,
      "loss": 6.6176,
      "step": 560
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.0,
      "eval_loss": 7.943063735961914,
      "eval_runtime": 8.8235,
      "eval_samples_per_second": 11.333,
      "eval_steps_per_second": 0.227,
      "step": 566
    },
    {
      "epoch": 2.0141342756183747,
      "grad_norm": 10.754253387451172,
      "learning_rate": 1.8259162303664922e-05,
      "loss": 6.3718,
      "step": 570
    },
    {
      "epoch": 2.049469964664311,
      "grad_norm": 9.648688316345215,
      "learning_rate": 1.7604712041884818e-05,
      "loss": 6.2833,
      "step": 580
    },
    {
      "epoch": 2.0848056537102475,
      "grad_norm": 8.878774642944336,
      "learning_rate": 1.6950261780104713e-05,
      "loss": 6.3031,
      "step": 590
    },
    {
      "epoch": 2.1201413427561837,
      "grad_norm": 8.666033744812012,
      "learning_rate": 1.6295811518324608e-05,
      "loss": 6.3308,
      "step": 600
    },
    {
      "epoch": 2.1554770318021204,
      "grad_norm": 9.023731231689453,
      "learning_rate": 1.5641361256544503e-05,
      "loss": 6.2989,
      "step": 610
    },
    {
      "epoch": 2.1908127208480566,
      "grad_norm": 10.014788627624512,
      "learning_rate": 1.49869109947644e-05,
      "loss": 6.2167,
      "step": 620
    },
    {
      "epoch": 2.2261484098939928,
      "grad_norm": 9.36070442199707,
      "learning_rate": 1.4332460732984294e-05,
      "loss": 6.3361,
      "step": 630
    },
    {
      "epoch": 2.2614840989399294,
      "grad_norm": 11.200228691101074,
      "learning_rate": 1.3678010471204189e-05,
      "loss": 6.2062,
      "step": 640
    },
    {
      "epoch": 2.2968197879858656,
      "grad_norm": 11.959548950195312,
      "learning_rate": 1.3023560209424086e-05,
      "loss": 6.2845,
      "step": 650
    },
    {
      "epoch": 2.3321554770318023,
      "grad_norm": 11.0889892578125,
      "learning_rate": 1.236910994764398e-05,
      "loss": 6.3267,
      "step": 660
    },
    {
      "epoch": 2.3674911660777385,
      "grad_norm": 11.451295852661133,
      "learning_rate": 1.1714659685863876e-05,
      "loss": 6.3199,
      "step": 670
    },
    {
      "epoch": 2.402826855123675,
      "grad_norm": 10.989713668823242,
      "learning_rate": 1.106020942408377e-05,
      "loss": 6.3512,
      "step": 680
    },
    {
      "epoch": 2.4381625441696113,
      "grad_norm": 12.176304817199707,
      "learning_rate": 1.0405759162303667e-05,
      "loss": 6.3017,
      "step": 690
    },
    {
      "epoch": 2.4734982332155475,
      "grad_norm": 8.967185020446777,
      "learning_rate": 9.75130890052356e-06,
      "loss": 6.3324,
      "step": 700
    },
    {
      "epoch": 2.508833922261484,
      "grad_norm": 11.730249404907227,
      "learning_rate": 9.096858638743457e-06,
      "loss": 6.2276,
      "step": 710
    },
    {
      "epoch": 2.5441696113074204,
      "grad_norm": 9.879383087158203,
      "learning_rate": 8.44240837696335e-06,
      "loss": 6.2391,
      "step": 720
    },
    {
      "epoch": 2.579505300353357,
      "grad_norm": 11.204628944396973,
      "learning_rate": 7.787958115183248e-06,
      "loss": 6.2064,
      "step": 730
    },
    {
      "epoch": 2.614840989399293,
      "grad_norm": 11.713162422180176,
      "learning_rate": 7.133507853403142e-06,
      "loss": 6.1786,
      "step": 740
    },
    {
      "epoch": 2.65017667844523,
      "grad_norm": 11.1579008102417,
      "learning_rate": 6.479057591623037e-06,
      "loss": 6.2621,
      "step": 750
    },
    {
      "epoch": 2.685512367491166,
      "grad_norm": 10.596528053283691,
      "learning_rate": 5.824607329842932e-06,
      "loss": 6.2023,
      "step": 760
    },
    {
      "epoch": 2.7208480565371023,
      "grad_norm": 11.238260269165039,
      "learning_rate": 5.170157068062828e-06,
      "loss": 6.267,
      "step": 770
    },
    {
      "epoch": 2.756183745583039,
      "grad_norm": 10.648133277893066,
      "learning_rate": 4.515706806282723e-06,
      "loss": 6.2735,
      "step": 780
    },
    {
      "epoch": 2.791519434628975,
      "grad_norm": 10.547654151916504,
      "learning_rate": 3.861256544502618e-06,
      "loss": 6.2288,
      "step": 790
    },
    {
      "epoch": 2.8268551236749118,
      "grad_norm": 11.513481140136719,
      "learning_rate": 3.2068062827225132e-06,
      "loss": 6.2293,
      "step": 800
    },
    {
      "epoch": 2.862190812720848,
      "grad_norm": 12.321627616882324,
      "learning_rate": 2.5523560209424085e-06,
      "loss": 6.2343,
      "step": 810
    },
    {
      "epoch": 2.8975265017667846,
      "grad_norm": 10.748133659362793,
      "learning_rate": 1.8979057591623037e-06,
      "loss": 6.1784,
      "step": 820
    },
    {
      "epoch": 2.932862190812721,
      "grad_norm": 11.156218528747559,
      "learning_rate": 1.243455497382199e-06,
      "loss": 6.2708,
      "step": 830
    },
    {
      "epoch": 2.968197879858657,
      "grad_norm": 10.066128730773926,
      "learning_rate": 5.890052356020942e-07,
      "loss": 6.2068,
      "step": 840
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.0,
      "eval_loss": 7.837530136108398,
      "eval_runtime": 8.6213,
      "eval_samples_per_second": 11.599,
      "eval_steps_per_second": 0.232,
      "step": 849
    }
  ],
  "logging_steps": 10,
  "max_steps": 849,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 1.3953276781480673e+18,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}