|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3352, |
|
"perplexity": 28.083999462050915, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_accuracy": 0.23046520883543017, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 8.0774, |
|
"eval_samples_per_second": 45.064, |
|
"eval_steps_per_second": 5.695, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3091, |
|
"perplexity": 27.360489946627773, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_accuracy": 0.23046520883543017, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 7.9509, |
|
"eval_samples_per_second": 45.781, |
|
"eval_steps_per_second": 5.785, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3347, |
|
"perplexity": 28.06996097223482, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_accuracy": 0.23046520883543017, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 7.9602, |
|
"eval_samples_per_second": 45.728, |
|
"eval_steps_per_second": 5.779, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1445, |
|
"perplexity": 23.20806853752346, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_accuracy": 0.23053154089371394, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 7.9267, |
|
"eval_samples_per_second": 45.921, |
|
"eval_steps_per_second": 5.803, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 1e-06, |
|
"loss": 2.8918, |
|
"perplexity": 18.025726725492028, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_accuracy": 0.23053154089371394, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 7.9947, |
|
"eval_samples_per_second": 45.53, |
|
"eval_steps_per_second": 5.754, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2068, |
|
"perplexity": 24.699919882371823, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.23053154089371394, |
|
"eval_loss": 3.173828125, |
|
"eval_perplexity": 23.898797049169467, |
|
"eval_runtime": 7.9696, |
|
"eval_samples_per_second": 45.674, |
|
"eval_steps_per_second": 5.772, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1e-06, |
|
"loss": 3.6245, |
|
"perplexity": 37.50596548782992, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_accuracy": 0.23047073650695382, |
|
"eval_loss": 3.171875, |
|
"eval_perplexity": 23.852165264858517, |
|
"eval_runtime": 7.9787, |
|
"eval_samples_per_second": 45.622, |
|
"eval_steps_per_second": 5.765, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2256, |
|
"perplexity": 25.168670828860865, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_accuracy": 0.230514957879143, |
|
"eval_loss": 3.171875, |
|
"eval_perplexity": 23.852165264858517, |
|
"eval_runtime": 7.9349, |
|
"eval_samples_per_second": 45.873, |
|
"eval_steps_per_second": 5.797, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 1e-06, |
|
"loss": 2.9991, |
|
"perplexity": 20.067468072159407, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_accuracy": 0.23050943020761935, |
|
"eval_loss": 3.169921875, |
|
"eval_perplexity": 23.80562446936611, |
|
"eval_runtime": 7.9581, |
|
"eval_samples_per_second": 45.74, |
|
"eval_steps_per_second": 5.78, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3257, |
|
"perplexity": 27.818464754063648, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.23057023459437948, |
|
"eval_loss": 3.16796875, |
|
"eval_perplexity": 23.75917448515314, |
|
"eval_runtime": 8.0497, |
|
"eval_samples_per_second": 45.219, |
|
"eval_steps_per_second": 5.715, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1199, |
|
"perplexity": 22.6441151184392, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_accuracy": 0.23057023459437948, |
|
"eval_loss": 3.166015625, |
|
"eval_perplexity": 23.71281513502692, |
|
"eval_runtime": 7.9865, |
|
"eval_samples_per_second": 45.577, |
|
"eval_steps_per_second": 5.76, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3735, |
|
"perplexity": 29.180480216449844, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_accuracy": 0.23062551130961595, |
|
"eval_loss": 3.166015625, |
|
"eval_perplexity": 23.71281513502692, |
|
"eval_runtime": 7.9024, |
|
"eval_samples_per_second": 46.062, |
|
"eval_steps_per_second": 5.821, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0051, |
|
"perplexity": 20.188234818531463, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_accuracy": 0.2306531496672342, |
|
"eval_loss": 3.1640625, |
|
"eval_perplexity": 23.666546242140512, |
|
"eval_runtime": 7.9745, |
|
"eval_samples_per_second": 45.646, |
|
"eval_steps_per_second": 5.768, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2695, |
|
"perplexity": 26.298186961963587, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_accuracy": 0.23081345214141996, |
|
"eval_loss": 3.162109375, |
|
"eval_perplexity": 23.620367629992042, |
|
"eval_runtime": 8.0031, |
|
"eval_samples_per_second": 45.483, |
|
"eval_steps_per_second": 5.748, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2004, |
|
"perplexity": 24.54234517205232, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_accuracy": 0.23086872885665644, |
|
"eval_loss": 3.16015625, |
|
"eval_perplexity": 23.574279122424027, |
|
"eval_runtime": 7.96, |
|
"eval_samples_per_second": 45.729, |
|
"eval_steps_per_second": 5.779, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2075, |
|
"perplexity": 24.71721587918212, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_accuracy": 0.2308300351559909, |
|
"eval_loss": 3.158203125, |
|
"eval_perplexity": 23.52828054362271, |
|
"eval_runtime": 7.9635, |
|
"eval_samples_per_second": 45.708, |
|
"eval_steps_per_second": 5.776, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 1e-06, |
|
"loss": 3.321, |
|
"perplexity": 27.688024743861764, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_accuracy": 0.2308300351559909, |
|
"eval_loss": 3.15625, |
|
"eval_perplexity": 23.482371718117374, |
|
"eval_runtime": 7.9658, |
|
"eval_samples_per_second": 45.695, |
|
"eval_steps_per_second": 5.775, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1e-06, |
|
"loss": 3.4026, |
|
"perplexity": 30.04210807401033, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_accuracy": 0.23086872885665644, |
|
"eval_loss": 3.154296875, |
|
"eval_perplexity": 23.4365524707797, |
|
"eval_runtime": 7.934, |
|
"eval_samples_per_second": 45.878, |
|
"eval_steps_per_second": 5.798, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0383, |
|
"perplexity": 20.869734512558935, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_accuracy": 0.23087425652818008, |
|
"eval_loss": 3.15234375, |
|
"eval_perplexity": 23.390822626823073, |
|
"eval_runtime": 7.6988, |
|
"eval_samples_per_second": 47.28, |
|
"eval_steps_per_second": 5.975, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1e-06, |
|
"loss": 3.166, |
|
"perplexity": 23.71244462518505, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_accuracy": 0.23091847790036926, |
|
"eval_loss": 3.150390625, |
|
"eval_perplexity": 23.345182011801924, |
|
"eval_runtime": 7.9469, |
|
"eval_samples_per_second": 45.804, |
|
"eval_steps_per_second": 5.788, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1e-06, |
|
"loss": 3.144, |
|
"perplexity": 23.196467403779828, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_accuracy": 0.2309626992725584, |
|
"eval_loss": 3.1484375, |
|
"eval_perplexity": 23.299630451611073, |
|
"eval_runtime": 7.957, |
|
"eval_samples_per_second": 45.746, |
|
"eval_steps_per_second": 5.781, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1624, |
|
"perplexity": 23.627233296953413, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_accuracy": 0.23100139297322395, |
|
"eval_loss": 3.1484375, |
|
"eval_perplexity": 23.299630451611073, |
|
"eval_runtime": 8.0068, |
|
"eval_samples_per_second": 45.461, |
|
"eval_steps_per_second": 5.745, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0332, |
|
"perplexity": 20.763569816631378, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_accuracy": 0.23103455900236583, |
|
"eval_loss": 3.146484375, |
|
"eval_perplexity": 23.25416777248505, |
|
"eval_runtime": 7.9578, |
|
"eval_samples_per_second": 45.741, |
|
"eval_steps_per_second": 5.78, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3745, |
|
"perplexity": 29.209675291771028, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.23106772503150771, |
|
"eval_loss": 3.14453125, |
|
"eval_perplexity": 23.20879380099744, |
|
"eval_runtime": 7.9682, |
|
"eval_samples_per_second": 45.682, |
|
"eval_steps_per_second": 5.773, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0823, |
|
"perplexity": 21.808504316830465, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_accuracy": 0.23117827846198066, |
|
"eval_loss": 3.142578125, |
|
"eval_perplexity": 23.16350836406023, |
|
"eval_runtime": 7.916, |
|
"eval_samples_per_second": 45.983, |
|
"eval_steps_per_second": 5.811, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 1e-06, |
|
"loss": 3.6021, |
|
"perplexity": 36.67517149163571, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_accuracy": 0.2312280275056935, |
|
"eval_loss": 3.140625, |
|
"eval_perplexity": 23.118311288923124, |
|
"eval_runtime": 7.9551, |
|
"eval_samples_per_second": 45.757, |
|
"eval_steps_per_second": 5.782, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1125, |
|
"perplexity": 22.477167135936607, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_accuracy": 0.23127777654940632, |
|
"eval_loss": 3.138671875, |
|
"eval_perplexity": 23.073202403172917, |
|
"eval_runtime": 7.9108, |
|
"eval_samples_per_second": 46.013, |
|
"eval_steps_per_second": 5.815, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1406, |
|
"perplexity": 23.117733338365316, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_accuracy": 0.23136621929378468, |
|
"eval_loss": 3.138671875, |
|
"eval_perplexity": 23.073202403172917, |
|
"eval_runtime": 7.9649, |
|
"eval_samples_per_second": 45.7, |
|
"eval_steps_per_second": 5.775, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1736, |
|
"perplexity": 23.893345757904175, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.2314049129944502, |
|
"eval_loss": 3.13671875, |
|
"eval_perplexity": 23.028181534732802, |
|
"eval_runtime": 7.9854, |
|
"eval_samples_per_second": 45.583, |
|
"eval_steps_per_second": 5.76, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1104, |
|
"perplexity": 22.43001461242937, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_accuracy": 0.23154863245406504, |
|
"eval_loss": 3.134765625, |
|
"eval_perplexity": 22.98324851186175, |
|
"eval_runtime": 7.9536, |
|
"eval_samples_per_second": 45.765, |
|
"eval_steps_per_second": 5.784, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1301, |
|
"perplexity": 22.876267054768768, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_accuracy": 0.23159285382625422, |
|
"eval_loss": 3.1328125, |
|
"eval_perplexity": 22.938403163153815, |
|
"eval_runtime": 7.904, |
|
"eval_samples_per_second": 46.053, |
|
"eval_steps_per_second": 5.82, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3376, |
|
"perplexity": 28.151482007422672, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_accuracy": 0.23154863245406504, |
|
"eval_loss": 3.130859375, |
|
"eval_perplexity": 22.893645317537526, |
|
"eval_runtime": 7.6495, |
|
"eval_samples_per_second": 47.585, |
|
"eval_steps_per_second": 6.013, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 1e-06, |
|
"loss": 3.218, |
|
"perplexity": 24.978113963861347, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_accuracy": 0.23159285382625422, |
|
"eval_loss": 3.130859375, |
|
"eval_perplexity": 22.893645317537526, |
|
"eval_runtime": 7.9297, |
|
"eval_samples_per_second": 45.904, |
|
"eval_steps_per_second": 5.801, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0786, |
|
"perplexity": 21.727961946129383, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_accuracy": 0.2316481305414907, |
|
"eval_loss": 3.12890625, |
|
"eval_perplexity": 22.84897480427519, |
|
"eval_runtime": 7.9813, |
|
"eval_samples_per_second": 45.607, |
|
"eval_steps_per_second": 5.763, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0125, |
|
"perplexity": 20.33818187604361, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_accuracy": 0.23167024122758528, |
|
"eval_loss": 3.126953125, |
|
"eval_perplexity": 22.80439145296227, |
|
"eval_runtime": 7.9533, |
|
"eval_samples_per_second": 45.767, |
|
"eval_steps_per_second": 5.784, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2634, |
|
"perplexity": 26.138256305914563, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_accuracy": 0.2317199902712981, |
|
"eval_loss": 3.126953125, |
|
"eval_perplexity": 22.80439145296227, |
|
"eval_runtime": 6.9427, |
|
"eval_samples_per_second": 52.429, |
|
"eval_steps_per_second": 6.626, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 1e-06, |
|
"loss": 2.9888, |
|
"perplexity": 19.861833984540883, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_accuracy": 0.23177526698653458, |
|
"eval_loss": 3.125, |
|
"eval_perplexity": 22.75989509352673, |
|
"eval_runtime": 7.9347, |
|
"eval_samples_per_second": 45.875, |
|
"eval_steps_per_second": 5.797, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1624, |
|
"perplexity": 23.627233296953413, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_accuracy": 0.23179737767262917, |
|
"eval_loss": 3.123046875, |
|
"eval_perplexity": 22.715485556228362, |
|
"eval_runtime": 7.9127, |
|
"eval_samples_per_second": 46.002, |
|
"eval_steps_per_second": 5.813, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 1e-06, |
|
"loss": 2.9807, |
|
"perplexity": 19.7016029410545, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_accuracy": 0.2318581820593893, |
|
"eval_loss": 3.12109375, |
|
"eval_perplexity": 22.67116267165818, |
|
"eval_runtime": 7.9324, |
|
"eval_samples_per_second": 45.888, |
|
"eval_steps_per_second": 5.799, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 1e-06, |
|
"loss": 3.446, |
|
"perplexity": 31.374642406982968, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.2319300417891967, |
|
"eval_loss": 3.12109375, |
|
"eval_perplexity": 22.67116267165818, |
|
"eval_runtime": 7.9334, |
|
"eval_samples_per_second": 45.882, |
|
"eval_steps_per_second": 5.798, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1338, |
|
"perplexity": 22.96106602422343, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_accuracy": 0.23195768014681495, |
|
"eval_loss": 3.119140625, |
|
"eval_perplexity": 22.626926270737744, |
|
"eval_runtime": 7.926, |
|
"eval_samples_per_second": 45.925, |
|
"eval_steps_per_second": 5.804, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1841, |
|
"perplexity": 24.145547631095972, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_accuracy": 0.23199084617595683, |
|
"eval_loss": 3.119140625, |
|
"eval_perplexity": 22.626926270737744, |
|
"eval_runtime": 7.9503, |
|
"eval_samples_per_second": 45.785, |
|
"eval_steps_per_second": 5.786, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1079, |
|
"perplexity": 22.374009611318957, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_accuracy": 0.232035067548146, |
|
"eval_loss": 3.1171875, |
|
"eval_perplexity": 22.582776184718522, |
|
"eval_runtime": 7.7126, |
|
"eval_samples_per_second": 47.195, |
|
"eval_steps_per_second": 5.964, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0918, |
|
"perplexity": 22.016672340357456, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_accuracy": 0.23214562097861896, |
|
"eval_loss": 3.115234375, |
|
"eval_perplexity": 22.538712245181248, |
|
"eval_runtime": 7.9205, |
|
"eval_samples_per_second": 45.956, |
|
"eval_steps_per_second": 5.808, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0302, |
|
"perplexity": 20.701372449879624, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_accuracy": 0.23220089769385543, |
|
"eval_loss": 3.115234375, |
|
"eval_perplexity": 22.538712245181248, |
|
"eval_runtime": 7.9218, |
|
"eval_samples_per_second": 45.949, |
|
"eval_steps_per_second": 5.807, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1123, |
|
"perplexity": 22.472672152022792, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_accuracy": 0.23228381276671015, |
|
"eval_loss": 3.11328125, |
|
"eval_perplexity": 22.494734284035275, |
|
"eval_runtime": 7.9089, |
|
"eval_samples_per_second": 46.024, |
|
"eval_steps_per_second": 5.816, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 1e-06, |
|
"loss": 2.9985, |
|
"perplexity": 20.055431202738045, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_accuracy": 0.23235567249651756, |
|
"eval_loss": 3.111328125, |
|
"eval_perplexity": 22.450842133517945, |
|
"eval_runtime": 7.9743, |
|
"eval_samples_per_second": 45.646, |
|
"eval_steps_per_second": 5.769, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3816, |
|
"perplexity": 29.417801961716197, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_accuracy": 0.23235567249651756, |
|
"eval_loss": 3.111328125, |
|
"eval_perplexity": 22.450842133517945, |
|
"eval_runtime": 7.9744, |
|
"eval_samples_per_second": 45.646, |
|
"eval_steps_per_second": 5.768, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0813, |
|
"perplexity": 21.786706713131952, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_accuracy": 0.23242753222632498, |
|
"eval_loss": 3.109375, |
|
"eval_perplexity": 22.40703562619394, |
|
"eval_runtime": 8.0068, |
|
"eval_samples_per_second": 45.461, |
|
"eval_steps_per_second": 5.745, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2024, |
|
"perplexity": 24.591478979826256, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_accuracy": 0.2324662259269905, |
|
"eval_loss": 3.109375, |
|
"eval_perplexity": 22.40703562619394, |
|
"eval_runtime": 7.9621, |
|
"eval_samples_per_second": 45.717, |
|
"eval_steps_per_second": 5.777, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0178, |
|
"perplexity": 20.446260395068368, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_accuracy": 0.23248280894156145, |
|
"eval_loss": 3.107421875, |
|
"eval_perplexity": 22.36331459495464, |
|
"eval_runtime": 7.9553, |
|
"eval_samples_per_second": 45.756, |
|
"eval_steps_per_second": 5.782, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1646, |
|
"perplexity": 23.67927043006483, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_accuracy": 0.23263758374422358, |
|
"eval_loss": 3.107421875, |
|
"eval_perplexity": 22.36331459495464, |
|
"eval_runtime": 7.9632, |
|
"eval_samples_per_second": 45.71, |
|
"eval_steps_per_second": 5.777, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0046, |
|
"perplexity": 20.17814322423101, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_accuracy": 0.23268180511641276, |
|
"eval_loss": 3.10546875, |
|
"eval_perplexity": 22.319678873017494, |
|
"eval_runtime": 7.9716, |
|
"eval_samples_per_second": 45.662, |
|
"eval_steps_per_second": 5.77, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0266, |
|
"perplexity": 20.626981493124443, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_accuracy": 0.23273708183164923, |
|
"eval_loss": 3.10546875, |
|
"eval_perplexity": 22.319678873017494, |
|
"eval_runtime": 7.9222, |
|
"eval_samples_per_second": 45.947, |
|
"eval_steps_per_second": 5.806, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3857, |
|
"perplexity": 29.538662544648755, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_accuracy": 0.23274813717469653, |
|
"eval_loss": 3.103515625, |
|
"eval_perplexity": 22.27612829392538, |
|
"eval_runtime": 8.0121, |
|
"eval_samples_per_second": 45.431, |
|
"eval_steps_per_second": 5.741, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 1e-06, |
|
"loss": 3.064, |
|
"perplexity": 21.413038238853925, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_accuracy": 0.23277577553231477, |
|
"eval_loss": 3.103515625, |
|
"eval_perplexity": 22.27612829392538, |
|
"eval_runtime": 7.9323, |
|
"eval_samples_per_second": 45.888, |
|
"eval_steps_per_second": 5.799, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 1e-06, |
|
"loss": 3.176, |
|
"perplexity": 23.950758655642247, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_accuracy": 0.2328144692329803, |
|
"eval_loss": 3.1015625, |
|
"eval_perplexity": 22.232662691545976, |
|
"eval_runtime": 7.9184, |
|
"eval_samples_per_second": 45.969, |
|
"eval_steps_per_second": 5.809, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1851, |
|
"perplexity": 24.169705255526146, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_accuracy": 0.23286974594821677, |
|
"eval_loss": 3.1015625, |
|
"eval_perplexity": 22.232662691545976, |
|
"eval_runtime": 7.9669, |
|
"eval_samples_per_second": 45.689, |
|
"eval_steps_per_second": 5.774, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0811, |
|
"perplexity": 21.782349807494416, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.232897384305835, |
|
"eval_loss": 3.099609375, |
|
"eval_perplexity": 22.189281900071105, |
|
"eval_runtime": 7.9339, |
|
"eval_samples_per_second": 45.879, |
|
"eval_steps_per_second": 5.798, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0205, |
|
"perplexity": 20.501539891873456, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_accuracy": 0.23296924403564243, |
|
"eval_loss": 3.099609375, |
|
"eval_perplexity": 22.189281900071105, |
|
"eval_runtime": 7.9929, |
|
"eval_samples_per_second": 45.54, |
|
"eval_steps_per_second": 5.755, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1e-06, |
|
"loss": 3.26, |
|
"perplexity": 26.049537142518336, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_accuracy": 0.23296371636411878, |
|
"eval_loss": 3.09765625, |
|
"eval_perplexity": 22.145985754016134, |
|
"eval_runtime": 7.9124, |
|
"eval_samples_per_second": 46.004, |
|
"eval_steps_per_second": 5.814, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 1e-06, |
|
"loss": 3.2922, |
|
"perplexity": 26.90198296333493, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_accuracy": 0.23305768678002078, |
|
"eval_loss": 3.09765625, |
|
"eval_perplexity": 22.145985754016134, |
|
"eval_runtime": 7.9262, |
|
"eval_samples_per_second": 45.923, |
|
"eval_steps_per_second": 5.804, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1e-06, |
|
"loss": 3.5349, |
|
"perplexity": 34.29158538422678, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_accuracy": 0.2331295465098282, |
|
"eval_loss": 3.095703125, |
|
"eval_perplexity": 22.10277408821932, |
|
"eval_runtime": 7.9843, |
|
"eval_samples_per_second": 45.589, |
|
"eval_steps_per_second": 5.761, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3525, |
|
"perplexity": 28.57407962319162, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_accuracy": 0.23312401883830455, |
|
"eval_loss": 3.095703125, |
|
"eval_perplexity": 22.10277408821932, |
|
"eval_runtime": 7.9231, |
|
"eval_samples_per_second": 45.942, |
|
"eval_steps_per_second": 5.806, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 1e-06, |
|
"loss": 3.135, |
|
"perplexity": 22.988635842034803, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_accuracy": 0.23312401883830455, |
|
"eval_loss": 3.09375, |
|
"eval_perplexity": 22.059646737841184, |
|
"eval_runtime": 7.9384, |
|
"eval_samples_per_second": 45.853, |
|
"eval_steps_per_second": 5.795, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1707, |
|
"perplexity": 23.824155429673073, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_accuracy": 0.23318482322506467, |
|
"eval_loss": 3.09375, |
|
"eval_perplexity": 22.059646737841184, |
|
"eval_runtime": 7.9764, |
|
"eval_samples_per_second": 45.635, |
|
"eval_steps_per_second": 5.767, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0127, |
|
"perplexity": 20.342249919209575, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_accuracy": 0.23324009994030115, |
|
"eval_loss": 3.091796875, |
|
"eval_perplexity": 22.016603538363892, |
|
"eval_runtime": 7.9476, |
|
"eval_samples_per_second": 45.8, |
|
"eval_steps_per_second": 5.788, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0952, |
|
"perplexity": 22.091656427027353, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_accuracy": 0.23322904459725385, |
|
"eval_loss": 3.091796875, |
|
"eval_perplexity": 22.016603538363892, |
|
"eval_runtime": 8.0177, |
|
"eval_samples_per_second": 45.4, |
|
"eval_steps_per_second": 5.737, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1023, |
|
"perplexity": 22.24906532800973, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.23338934707143963, |
|
"eval_loss": 3.08984375, |
|
"eval_perplexity": 21.973644325590612, |
|
"eval_runtime": 7.7593, |
|
"eval_samples_per_second": 46.911, |
|
"eval_steps_per_second": 5.928, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 1e-06, |
|
"loss": 3.3821, |
|
"perplexity": 29.43251454053524, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_accuracy": 0.23339487474296328, |
|
"eval_loss": 3.08984375, |
|
"eval_perplexity": 21.973644325590612, |
|
"eval_runtime": 7.9307, |
|
"eval_samples_per_second": 45.898, |
|
"eval_steps_per_second": 5.8, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1118, |
|
"perplexity": 22.461438624562685, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_accuracy": 0.23339487474296328, |
|
"eval_loss": 3.087890625, |
|
"eval_perplexity": 21.930768935644906, |
|
"eval_runtime": 8.0017, |
|
"eval_samples_per_second": 45.49, |
|
"eval_steps_per_second": 5.749, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1143, |
|
"perplexity": 22.5176624716497, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.2334667344727707, |
|
"eval_loss": 3.087890625, |
|
"eval_perplexity": 21.930768935644906, |
|
"eval_runtime": 7.9822, |
|
"eval_samples_per_second": 45.602, |
|
"eval_steps_per_second": 5.763, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1118, |
|
"perplexity": 22.461438624562685, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_accuracy": 0.23352201118800717, |
|
"eval_loss": 3.087890625, |
|
"eval_perplexity": 21.930768935644906, |
|
"eval_runtime": 6.928, |
|
"eval_samples_per_second": 52.54, |
|
"eval_steps_per_second": 6.64, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 1e-06, |
|
"loss": 3.0596, |
|
"perplexity": 21.319027845139193, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_accuracy": 0.2335828155747673, |
|
"eval_loss": 3.0859375, |
|
"eval_perplexity": 21.887977204970085, |
|
"eval_runtime": 7.9803, |
|
"eval_samples_per_second": 45.612, |
|
"eval_steps_per_second": 5.764, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1e-06, |
|
"loss": 3.1033, |
|
"perplexity": 22.271325521579506, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.23359387091781458, |
|
"eval_loss": 3.0859375, |
|
"eval_perplexity": 21.887977204970085, |
|
"eval_runtime": 7.9699, |
|
"eval_samples_per_second": 45.672, |
|
"eval_steps_per_second": 5.772, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 75, |
|
"total_flos": 352569360384.0, |
|
"train_loss": 3.1779069010416667, |
|
"train_runtime": 632.8224, |
|
"train_samples_per_second": 0.947, |
|
"train_steps_per_second": 0.119 |
|
} |
|
], |
|
"max_steps": 75, |
|
"num_train_epochs": 1, |
|
"total_flos": 352569360384.0, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|