|
{ |
|
"best_metric": 1.0345184803009033, |
|
"best_model_checkpoint": "data/Llama-31-8B_task-2_180-samples_config-1_full/checkpoint-119", |
|
"epoch": 14.0, |
|
"eval_steps": 500, |
|
"global_step": 238, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 0.5214345455169678, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 1.5724, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.5508020520210266, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 1.6207, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.5937786102294922, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.6106, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.5013459324836731, |
|
"learning_rate": 7.058823529411765e-06, |
|
"loss": 1.6182, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.4731301963329315, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 1.5894, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.4294460713863373, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 1.5504, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.4912262260913849, |
|
"learning_rate": 1.411764705882353e-05, |
|
"loss": 1.5279, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.5266876220703125, |
|
"learning_rate": 1.647058823529412e-05, |
|
"loss": 1.5391, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.391021728515625, |
|
"learning_rate": 1.8823529411764708e-05, |
|
"loss": 1.504, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.5035704374313354, |
|
"eval_runtime": 37.0347, |
|
"eval_samples_per_second": 0.972, |
|
"eval_steps_per_second": 0.972, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 1.0588235294117647, |
|
"grad_norm": 0.3921213746070862, |
|
"learning_rate": 2.1176470588235296e-05, |
|
"loss": 1.5014, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.38555383682250977, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 1.4873, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.2941176470588236, |
|
"grad_norm": 0.2951497733592987, |
|
"learning_rate": 2.5882352941176475e-05, |
|
"loss": 1.4459, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 0.31671950221061707, |
|
"learning_rate": 2.823529411764706e-05, |
|
"loss": 1.4021, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.5294117647058822, |
|
"grad_norm": 0.2338249385356903, |
|
"learning_rate": 3.058823529411765e-05, |
|
"loss": 1.3717, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 0.2480524629354477, |
|
"learning_rate": 3.294117647058824e-05, |
|
"loss": 1.364, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.29380786418914795, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 1.3607, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 0.26109302043914795, |
|
"learning_rate": 3.7647058823529415e-05, |
|
"loss": 1.3166, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2589414417743683, |
|
"learning_rate": 4e-05, |
|
"loss": 1.3344, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.3100626468658447, |
|
"eval_runtime": 37.0282, |
|
"eval_samples_per_second": 0.972, |
|
"eval_steps_per_second": 0.972, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.2652131915092468, |
|
"learning_rate": 4.235294117647059e-05, |
|
"loss": 1.2648, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 0.2305995672941208, |
|
"learning_rate": 4.470588235294118e-05, |
|
"loss": 1.2315, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.22336632013320923, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 1.208, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 0.25589871406555176, |
|
"learning_rate": 4.9411764705882355e-05, |
|
"loss": 1.1869, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 0.3676718771457672, |
|
"learning_rate": 5.176470588235295e-05, |
|
"loss": 1.1343, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 0.308319628238678, |
|
"learning_rate": 5.411764705882353e-05, |
|
"loss": 1.1467, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 0.2406923770904541, |
|
"learning_rate": 5.647058823529412e-05, |
|
"loss": 1.1131, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.3291592299938202, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 1.0895, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.1146036386489868, |
|
"eval_runtime": 36.9993, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 3.0588235294117645, |
|
"grad_norm": 0.16729487478733063, |
|
"learning_rate": 6.11764705882353e-05, |
|
"loss": 1.0574, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.176470588235294, |
|
"grad_norm": 0.15566912293434143, |
|
"learning_rate": 6.352941176470588e-05, |
|
"loss": 1.0398, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.2941176470588234, |
|
"grad_norm": 0.15833047032356262, |
|
"learning_rate": 6.588235294117648e-05, |
|
"loss": 1.0902, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.411764705882353, |
|
"grad_norm": 0.16289210319519043, |
|
"learning_rate": 6.823529411764707e-05, |
|
"loss": 1.0512, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.16850221157073975, |
|
"learning_rate": 7.058823529411765e-05, |
|
"loss": 0.9916, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.6470588235294117, |
|
"grad_norm": 0.15705695748329163, |
|
"learning_rate": 7.294117647058823e-05, |
|
"loss": 1.0991, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 3.764705882352941, |
|
"grad_norm": 0.192249596118927, |
|
"learning_rate": 7.529411764705883e-05, |
|
"loss": 1.022, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 3.8823529411764706, |
|
"grad_norm": 0.18115530908107758, |
|
"learning_rate": 7.764705882352942e-05, |
|
"loss": 0.9873, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.17344814538955688, |
|
"learning_rate": 8e-05, |
|
"loss": 0.9755, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.0741065740585327, |
|
"eval_runtime": 36.9986, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 0.16507934033870697, |
|
"learning_rate": 8.23529411764706e-05, |
|
"loss": 1.0732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 0.17189058661460876, |
|
"learning_rate": 8.470588235294118e-05, |
|
"loss": 0.9364, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 4.352941176470588, |
|
"grad_norm": 0.2030569314956665, |
|
"learning_rate": 8.705882352941177e-05, |
|
"loss": 1.0275, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 0.1766645312309265, |
|
"learning_rate": 8.941176470588236e-05, |
|
"loss": 0.9621, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 4.588235294117647, |
|
"grad_norm": 0.19495844841003418, |
|
"learning_rate": 9.176470588235295e-05, |
|
"loss": 0.9856, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.17263540625572205, |
|
"learning_rate": 9.411764705882353e-05, |
|
"loss": 1.012, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.823529411764706, |
|
"grad_norm": 0.19147881865501404, |
|
"learning_rate": 9.647058823529412e-05, |
|
"loss": 0.9723, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"grad_norm": 0.19374307990074158, |
|
"learning_rate": 9.882352941176471e-05, |
|
"loss": 0.9637, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.052433967590332, |
|
"eval_runtime": 37.0057, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 5.0588235294117645, |
|
"grad_norm": 0.2304454743862152, |
|
"learning_rate": 9.99995783847866e-05, |
|
"loss": 0.9125, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.176470588235294, |
|
"grad_norm": 0.2263410985469818, |
|
"learning_rate": 9.999620550574153e-05, |
|
"loss": 0.942, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 5.294117647058823, |
|
"grad_norm": 0.2422245293855667, |
|
"learning_rate": 9.998945997517956e-05, |
|
"loss": 0.9778, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 5.411764705882353, |
|
"grad_norm": 0.20600196719169617, |
|
"learning_rate": 9.997934224814173e-05, |
|
"loss": 0.9842, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 5.529411764705882, |
|
"grad_norm": 0.23495109379291534, |
|
"learning_rate": 9.996585300715116e-05, |
|
"loss": 0.9389, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 5.647058823529412, |
|
"grad_norm": 0.27190443873405457, |
|
"learning_rate": 9.994899316216708e-05, |
|
"loss": 0.9416, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 5.764705882352941, |
|
"grad_norm": 0.24846382439136505, |
|
"learning_rate": 9.992876385052345e-05, |
|
"loss": 0.9654, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.23965619504451752, |
|
"learning_rate": 9.990516643685222e-05, |
|
"loss": 0.9653, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.23243875801563263, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.9215, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.0348577499389648, |
|
"eval_runtime": 36.9988, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 6.117647058823529, |
|
"grad_norm": 0.2187218964099884, |
|
"learning_rate": 9.984787389787688e-05, |
|
"loss": 0.886, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 6.235294117647059, |
|
"grad_norm": 0.24537856876850128, |
|
"learning_rate": 9.981418263742148e-05, |
|
"loss": 0.8928, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 6.352941176470588, |
|
"grad_norm": 0.25448307394981384, |
|
"learning_rate": 9.977713100437509e-05, |
|
"loss": 0.8498, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 6.470588235294118, |
|
"grad_norm": 0.2609221339225769, |
|
"learning_rate": 9.973672149817232e-05, |
|
"loss": 0.926, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 6.588235294117647, |
|
"grad_norm": 0.27997028827667236, |
|
"learning_rate": 9.96929568447637e-05, |
|
"loss": 0.8778, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 6.705882352941177, |
|
"grad_norm": 0.29801490902900696, |
|
"learning_rate": 9.964583999643174e-05, |
|
"loss": 0.9633, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 6.823529411764706, |
|
"grad_norm": 0.25596335530281067, |
|
"learning_rate": 9.95953741315919e-05, |
|
"loss": 0.9494, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 6.9411764705882355, |
|
"grad_norm": 0.2853046655654907, |
|
"learning_rate": 9.954156265457801e-05, |
|
"loss": 0.8984, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 1.0345184803009033, |
|
"eval_runtime": 36.9963, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.31706833839416504, |
|
"learning_rate": 9.948440919541278e-05, |
|
"loss": 0.8992, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 7.176470588235294, |
|
"grad_norm": 0.2826850712299347, |
|
"learning_rate": 9.942391760956277e-05, |
|
"loss": 0.8611, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 7.294117647058823, |
|
"grad_norm": 0.33343079686164856, |
|
"learning_rate": 9.936009197767845e-05, |
|
"loss": 0.8488, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 7.411764705882353, |
|
"grad_norm": 0.33269575238227844, |
|
"learning_rate": 9.929293660531888e-05, |
|
"loss": 0.809, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 7.529411764705882, |
|
"grad_norm": 0.3902694582939148, |
|
"learning_rate": 9.922245602266118e-05, |
|
"loss": 0.8352, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 7.647058823529412, |
|
"grad_norm": 0.34491705894470215, |
|
"learning_rate": 9.91486549841951e-05, |
|
"loss": 0.8561, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 7.764705882352941, |
|
"grad_norm": 0.3838919401168823, |
|
"learning_rate": 9.90715384684021e-05, |
|
"loss": 0.7896, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 7.882352941176471, |
|
"grad_norm": 0.3754192590713501, |
|
"learning_rate": 9.899111167741966e-05, |
|
"loss": 0.9033, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.3832869529724121, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.7983, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.0459100008010864, |
|
"eval_runtime": 37.0077, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 8.117647058823529, |
|
"grad_norm": 0.4289301633834839, |
|
"learning_rate": 9.882034919459555e-05, |
|
"loss": 0.7896, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.5225833058357239, |
|
"learning_rate": 9.873002502207503e-05, |
|
"loss": 0.7458, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 8.352941176470589, |
|
"grad_norm": 0.47407665848731995, |
|
"learning_rate": 9.863641361223024e-05, |
|
"loss": 0.8066, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 8.470588235294118, |
|
"grad_norm": 0.46604663133621216, |
|
"learning_rate": 9.853952127991372e-05, |
|
"loss": 0.7907, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 8.588235294117647, |
|
"grad_norm": 0.4920913577079773, |
|
"learning_rate": 9.843935456130295e-05, |
|
"loss": 0.8002, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 8.705882352941176, |
|
"grad_norm": 0.4699203670024872, |
|
"learning_rate": 9.833592021345937e-05, |
|
"loss": 0.7876, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 8.823529411764707, |
|
"grad_norm": 0.4708077907562256, |
|
"learning_rate": 9.822922521387276e-05, |
|
"loss": 0.7274, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"grad_norm": 0.6590410470962524, |
|
"learning_rate": 9.811927675999036e-05, |
|
"loss": 0.711, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 1.0750151872634888, |
|
"eval_runtime": 37.003, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 9.058823529411764, |
|
"grad_norm": 0.6144815683364868, |
|
"learning_rate": 9.800608226873142e-05, |
|
"loss": 0.7504, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 9.176470588235293, |
|
"grad_norm": 0.5911457538604736, |
|
"learning_rate": 9.788964937598689e-05, |
|
"loss": 0.6873, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 9.294117647058824, |
|
"grad_norm": 0.6449252963066101, |
|
"learning_rate": 9.776998593610428e-05, |
|
"loss": 0.7058, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.5975269079208374, |
|
"learning_rate": 9.764710002135784e-05, |
|
"loss": 0.7004, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 9.529411764705882, |
|
"grad_norm": 0.5621377825737, |
|
"learning_rate": 9.752099992140399e-05, |
|
"loss": 0.672, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 9.647058823529411, |
|
"grad_norm": 0.6631340980529785, |
|
"learning_rate": 9.739169414272217e-05, |
|
"loss": 0.6917, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 9.764705882352942, |
|
"grad_norm": 0.6114043593406677, |
|
"learning_rate": 9.725919140804099e-05, |
|
"loss": 0.7236, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 9.882352941176471, |
|
"grad_norm": 0.6280692219734192, |
|
"learning_rate": 9.71235006557497e-05, |
|
"loss": 0.602, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.6364344358444214, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.6725, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.134390950202942, |
|
"eval_runtime": 37.0, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.117647058823529, |
|
"grad_norm": 0.6386916041374207, |
|
"learning_rate": 9.684259192656553e-05, |
|
"loss": 0.561, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 10.235294117647058, |
|
"grad_norm": 0.9274471402168274, |
|
"learning_rate": 9.669739289925577e-05, |
|
"loss": 0.5975, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 10.352941176470589, |
|
"grad_norm": 0.758702278137207, |
|
"learning_rate": 9.654904375222385e-05, |
|
"loss": 0.621, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 10.470588235294118, |
|
"grad_norm": 0.949181854724884, |
|
"learning_rate": 9.639755449282875e-05, |
|
"loss": 0.5464, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 0.7223683595657349, |
|
"learning_rate": 9.62429353402556e-05, |
|
"loss": 0.6089, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 10.705882352941176, |
|
"grad_norm": 0.6299746632575989, |
|
"learning_rate": 9.608519672482636e-05, |
|
"loss": 0.5711, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 10.823529411764707, |
|
"grad_norm": 0.7116110920906067, |
|
"learning_rate": 9.592434928729616e-05, |
|
"loss": 0.5963, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 10.941176470588236, |
|
"grad_norm": 0.6831782460212708, |
|
"learning_rate": 9.576040387813552e-05, |
|
"loss": 0.629, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 1.1629810333251953, |
|
"eval_runtime": 37.0044, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 11.058823529411764, |
|
"grad_norm": 0.6914322972297668, |
|
"learning_rate": 9.559337155679842e-05, |
|
"loss": 0.6061, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 11.176470588235293, |
|
"grad_norm": 0.8476060628890991, |
|
"learning_rate": 9.542326359097619e-05, |
|
"loss": 0.513, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 11.294117647058824, |
|
"grad_norm": 0.9703707098960876, |
|
"learning_rate": 9.525009145583745e-05, |
|
"loss": 0.5803, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 11.411764705882353, |
|
"grad_norm": 0.8944510221481323, |
|
"learning_rate": 9.507386683325404e-05, |
|
"loss": 0.3622, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 11.529411764705882, |
|
"grad_norm": 1.0056551694869995, |
|
"learning_rate": 9.489460161101291e-05, |
|
"loss": 0.5537, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 11.647058823529411, |
|
"grad_norm": 0.9109512567520142, |
|
"learning_rate": 9.471230788201429e-05, |
|
"loss": 0.519, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 0.82012540102005, |
|
"learning_rate": 9.452699794345581e-05, |
|
"loss": 0.5082, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 11.882352941176471, |
|
"grad_norm": 0.7492274641990662, |
|
"learning_rate": 9.43386842960031e-05, |
|
"loss": 0.5116, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.8394200801849365, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.4573, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.2680054903030396, |
|
"eval_runtime": 37.0068, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 12.117647058823529, |
|
"grad_norm": 0.8289351463317871, |
|
"learning_rate": 9.395309688934351e-05, |
|
"loss": 0.4454, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 12.235294117647058, |
|
"grad_norm": 0.97170090675354, |
|
"learning_rate": 9.375584914114963e-05, |
|
"loss": 0.3926, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 12.352941176470589, |
|
"grad_norm": 0.8402796983718872, |
|
"learning_rate": 9.355564970433288e-05, |
|
"loss": 0.4302, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 12.470588235294118, |
|
"grad_norm": 0.8999544978141785, |
|
"learning_rate": 9.335251208397684e-05, |
|
"loss": 0.3877, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 12.588235294117647, |
|
"grad_norm": 1.080120325088501, |
|
"learning_rate": 9.314644998336949e-05, |
|
"loss": 0.4688, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 12.705882352941176, |
|
"grad_norm": 0.8625339269638062, |
|
"learning_rate": 9.293747730307889e-05, |
|
"loss": 0.3901, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 12.823529411764707, |
|
"grad_norm": 1.0037257671356201, |
|
"learning_rate": 9.272560814001539e-05, |
|
"loss": 0.4283, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"grad_norm": 0.8916875720024109, |
|
"learning_rate": 9.251085678648072e-05, |
|
"loss": 0.4754, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 1.2756708860397339, |
|
"eval_runtime": 37.001, |
|
"eval_samples_per_second": 0.973, |
|
"eval_steps_per_second": 0.973, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 13.058823529411764, |
|
"grad_norm": 0.9820285439491272, |
|
"learning_rate": 9.229323772920381e-05, |
|
"loss": 0.4135, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 13.176470588235293, |
|
"grad_norm": 1.5194408893585205, |
|
"learning_rate": 9.207276564836366e-05, |
|
"loss": 0.3795, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 13.294117647058824, |
|
"grad_norm": 0.8674440383911133, |
|
"learning_rate": 9.184945541659889e-05, |
|
"loss": 0.3577, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 13.411764705882353, |
|
"grad_norm": 1.037719488143921, |
|
"learning_rate": 9.162332209800455e-05, |
|
"loss": 0.292, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 13.529411764705882, |
|
"grad_norm": 1.3491851091384888, |
|
"learning_rate": 9.139438094711589e-05, |
|
"loss": 0.3822, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 13.647058823529411, |
|
"grad_norm": 1.114529013633728, |
|
"learning_rate": 9.116264740787936e-05, |
|
"loss": 0.3377, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 13.764705882352942, |
|
"grad_norm": 0.971333920955658, |
|
"learning_rate": 9.092813711261074e-05, |
|
"loss": 0.4037, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 13.882352941176471, |
|
"grad_norm": 1.0582764148712158, |
|
"learning_rate": 9.069086588094067e-05, |
|
"loss": 0.3383, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.9523320198059082, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.4236, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 1.337109923362732, |
|
"eval_runtime": 37.033, |
|
"eval_samples_per_second": 0.972, |
|
"eval_steps_per_second": 0.972, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"step": 238, |
|
"total_flos": 1.8224671283478528e+17, |
|
"train_loss": 0.8704949083949337, |
|
"train_runtime": 6511.5654, |
|
"train_samples_per_second": 1.044, |
|
"train_steps_per_second": 0.131 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 850, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8224671283478528e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|