|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.98793242156074, |
|
"eval_steps": 500, |
|
"global_step": 775, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006436041834271922, |
|
"grad_norm": 857.0773315429688, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 12.2736, |
|
"num_input_tokens_seen": 6576, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012872083668543845, |
|
"grad_norm": 899.0701293945312, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 12.5518, |
|
"num_input_tokens_seen": 13312, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.019308125502815767, |
|
"grad_norm": 833.7578125, |
|
"learning_rate": 1.5e-06, |
|
"loss": 11.8551, |
|
"num_input_tokens_seen": 19952, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02574416733708769, |
|
"grad_norm": 712.9902954101562, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 10.9045, |
|
"num_input_tokens_seen": 26640, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032180209171359615, |
|
"grad_norm": 620.4878540039062, |
|
"learning_rate": 2.5e-06, |
|
"loss": 8.9845, |
|
"num_input_tokens_seen": 33360, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.038616251005631534, |
|
"grad_norm": 376.7406921386719, |
|
"learning_rate": 3e-06, |
|
"loss": 6.35, |
|
"num_input_tokens_seen": 39984, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04505229283990346, |
|
"grad_norm": 356.7503967285156, |
|
"learning_rate": 3.5e-06, |
|
"loss": 5.4864, |
|
"num_input_tokens_seen": 46496, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05148833467417538, |
|
"grad_norm": 383.4678955078125, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.8013, |
|
"num_input_tokens_seen": 53008, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.057924376508447305, |
|
"grad_norm": 285.64483642578125, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.9851, |
|
"num_input_tokens_seen": 59856, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06436041834271923, |
|
"grad_norm": 222.22850036621094, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5088, |
|
"num_input_tokens_seen": 66352, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07079646017699115, |
|
"grad_norm": 338.3565368652344, |
|
"learning_rate": 4.99997891923933e-06, |
|
"loss": 2.022, |
|
"num_input_tokens_seen": 73184, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07723250201126307, |
|
"grad_norm": 97.87013244628906, |
|
"learning_rate": 4.999915677312839e-06, |
|
"loss": 0.5842, |
|
"num_input_tokens_seen": 80064, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.083668543845535, |
|
"grad_norm": 137.05706787109375, |
|
"learning_rate": 4.999810275287077e-06, |
|
"loss": 0.8538, |
|
"num_input_tokens_seen": 86688, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09010458567980692, |
|
"grad_norm": 88.00048828125, |
|
"learning_rate": 4.9996627149396075e-06, |
|
"loss": 0.5843, |
|
"num_input_tokens_seen": 93168, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09654062751407884, |
|
"grad_norm": 61.654090881347656, |
|
"learning_rate": 4.999472998758979e-06, |
|
"loss": 0.4456, |
|
"num_input_tokens_seen": 99696, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10297666934835076, |
|
"grad_norm": 37.3619499206543, |
|
"learning_rate": 4.99924112994468e-06, |
|
"loss": 0.3542, |
|
"num_input_tokens_seen": 106016, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10941271118262269, |
|
"grad_norm": 41.219093322753906, |
|
"learning_rate": 4.998967112407087e-06, |
|
"loss": 0.3416, |
|
"num_input_tokens_seen": 112672, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11584875301689461, |
|
"grad_norm": 22.267297744750977, |
|
"learning_rate": 4.9986509507673986e-06, |
|
"loss": 0.2803, |
|
"num_input_tokens_seen": 119312, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12228479485116653, |
|
"grad_norm": 22.40268898010254, |
|
"learning_rate": 4.998292650357558e-06, |
|
"loss": 0.2603, |
|
"num_input_tokens_seen": 126016, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12872083668543846, |
|
"grad_norm": 23.54829978942871, |
|
"learning_rate": 4.99789221722016e-06, |
|
"loss": 0.2456, |
|
"num_input_tokens_seen": 132704, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13515687851971037, |
|
"grad_norm": 24.01500701904297, |
|
"learning_rate": 4.997449658108354e-06, |
|
"loss": 0.2269, |
|
"num_input_tokens_seen": 139472, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1415929203539823, |
|
"grad_norm": 26.877809524536133, |
|
"learning_rate": 4.996964980485725e-06, |
|
"loss": 0.2896, |
|
"num_input_tokens_seen": 145968, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14802896218825423, |
|
"grad_norm": 11.89682388305664, |
|
"learning_rate": 4.996438192526173e-06, |
|
"loss": 0.1415, |
|
"num_input_tokens_seen": 152144, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15446500402252614, |
|
"grad_norm": 12.671065330505371, |
|
"learning_rate": 4.995869303113768e-06, |
|
"loss": 0.2228, |
|
"num_input_tokens_seen": 158432, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16090104585679807, |
|
"grad_norm": 12.23610782623291, |
|
"learning_rate": 4.995258321842611e-06, |
|
"loss": 0.1537, |
|
"num_input_tokens_seen": 164672, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16733708769107, |
|
"grad_norm": 21.917552947998047, |
|
"learning_rate": 4.994605259016658e-06, |
|
"loss": 0.2146, |
|
"num_input_tokens_seen": 170896, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1737731295253419, |
|
"grad_norm": 6.669096946716309, |
|
"learning_rate": 4.993910125649561e-06, |
|
"loss": 0.1688, |
|
"num_input_tokens_seen": 177904, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18020917135961384, |
|
"grad_norm": 10.865259170532227, |
|
"learning_rate": 4.99317293346447e-06, |
|
"loss": 0.1592, |
|
"num_input_tokens_seen": 184640, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18664521319388577, |
|
"grad_norm": 22.876623153686523, |
|
"learning_rate": 4.992393694893844e-06, |
|
"loss": 0.2333, |
|
"num_input_tokens_seen": 191008, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.19308125502815768, |
|
"grad_norm": 12.427145004272461, |
|
"learning_rate": 4.991572423079236e-06, |
|
"loss": 0.1812, |
|
"num_input_tokens_seen": 197568, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1995172968624296, |
|
"grad_norm": 15.451845169067383, |
|
"learning_rate": 4.990709131871074e-06, |
|
"loss": 0.1925, |
|
"num_input_tokens_seen": 204272, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.20595333869670152, |
|
"grad_norm": 18.907636642456055, |
|
"learning_rate": 4.989803835828426e-06, |
|
"loss": 0.1864, |
|
"num_input_tokens_seen": 210944, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21238938053097345, |
|
"grad_norm": 15.60383129119873, |
|
"learning_rate": 4.988856550218755e-06, |
|
"loss": 0.1848, |
|
"num_input_tokens_seen": 217584, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.21882542236524538, |
|
"grad_norm": 7.129302024841309, |
|
"learning_rate": 4.987867291017662e-06, |
|
"loss": 0.1466, |
|
"num_input_tokens_seen": 224064, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2252614641995173, |
|
"grad_norm": 15.756115913391113, |
|
"learning_rate": 4.986836074908616e-06, |
|
"loss": 0.1481, |
|
"num_input_tokens_seen": 230880, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23169750603378922, |
|
"grad_norm": 8.845354080200195, |
|
"learning_rate": 4.985762919282674e-06, |
|
"loss": 0.149, |
|
"num_input_tokens_seen": 237312, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23813354786806115, |
|
"grad_norm": 15.7093505859375, |
|
"learning_rate": 4.984647842238185e-06, |
|
"loss": 0.1506, |
|
"num_input_tokens_seen": 243648, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.24456958970233306, |
|
"grad_norm": 11.331380844116211, |
|
"learning_rate": 4.983490862580486e-06, |
|
"loss": 0.1709, |
|
"num_input_tokens_seen": 250096, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.251005631536605, |
|
"grad_norm": 6.254825115203857, |
|
"learning_rate": 4.982291999821587e-06, |
|
"loss": 0.0898, |
|
"num_input_tokens_seen": 256432, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2574416733708769, |
|
"grad_norm": 7.792216777801514, |
|
"learning_rate": 4.98105127417984e-06, |
|
"loss": 0.1856, |
|
"num_input_tokens_seen": 263088, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.26387771520514886, |
|
"grad_norm": 4.186593055725098, |
|
"learning_rate": 4.979768706579595e-06, |
|
"loss": 0.0947, |
|
"num_input_tokens_seen": 269904, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.27031375703942073, |
|
"grad_norm": 7.599153518676758, |
|
"learning_rate": 4.978444318650855e-06, |
|
"loss": 0.1367, |
|
"num_input_tokens_seen": 276592, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.27674979887369267, |
|
"grad_norm": 10.765763282775879, |
|
"learning_rate": 4.977078132728901e-06, |
|
"loss": 0.1645, |
|
"num_input_tokens_seen": 283424, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2831858407079646, |
|
"grad_norm": 6.705766677856445, |
|
"learning_rate": 4.975670171853926e-06, |
|
"loss": 0.1179, |
|
"num_input_tokens_seen": 290176, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.28962188254223653, |
|
"grad_norm": 6.055794715881348, |
|
"learning_rate": 4.9742204597706386e-06, |
|
"loss": 0.133, |
|
"num_input_tokens_seen": 296752, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.29605792437650846, |
|
"grad_norm": 7.4584760665893555, |
|
"learning_rate": 4.972729020927866e-06, |
|
"loss": 0.083, |
|
"num_input_tokens_seen": 303392, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3024939662107804, |
|
"grad_norm": 10.979104995727539, |
|
"learning_rate": 4.9711958804781385e-06, |
|
"loss": 0.1748, |
|
"num_input_tokens_seen": 310304, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3089300080450523, |
|
"grad_norm": 13.912871360778809, |
|
"learning_rate": 4.969621064277271e-06, |
|
"loss": 0.1854, |
|
"num_input_tokens_seen": 317440, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3153660498793242, |
|
"grad_norm": 6.554210186004639, |
|
"learning_rate": 4.968004598883923e-06, |
|
"loss": 0.1232, |
|
"num_input_tokens_seen": 324304, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32180209171359614, |
|
"grad_norm": 6.358190536499023, |
|
"learning_rate": 4.966346511559149e-06, |
|
"loss": 0.1172, |
|
"num_input_tokens_seen": 330832, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32823813354786807, |
|
"grad_norm": 6.195626258850098, |
|
"learning_rate": 4.964646830265944e-06, |
|
"loss": 0.1404, |
|
"num_input_tokens_seen": 337952, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.33467417538214, |
|
"grad_norm": 12.585171699523926, |
|
"learning_rate": 4.962905583668766e-06, |
|
"loss": 0.137, |
|
"num_input_tokens_seen": 344384, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3411102172164119, |
|
"grad_norm": 3.7672178745269775, |
|
"learning_rate": 4.961122801133059e-06, |
|
"loss": 0.1191, |
|
"num_input_tokens_seen": 351184, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3475462590506838, |
|
"grad_norm": 17.48076629638672, |
|
"learning_rate": 4.9592985127247525e-06, |
|
"loss": 0.1624, |
|
"num_input_tokens_seen": 357696, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.35398230088495575, |
|
"grad_norm": 7.758498668670654, |
|
"learning_rate": 4.957432749209755e-06, |
|
"loss": 0.1256, |
|
"num_input_tokens_seen": 364368, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3604183427192277, |
|
"grad_norm": 10.048332214355469, |
|
"learning_rate": 4.955525542053438e-06, |
|
"loss": 0.1274, |
|
"num_input_tokens_seen": 370896, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3668543845534996, |
|
"grad_norm": 17.495296478271484, |
|
"learning_rate": 4.953576923420105e-06, |
|
"loss": 0.174, |
|
"num_input_tokens_seen": 377168, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.37329042638777155, |
|
"grad_norm": 7.546329021453857, |
|
"learning_rate": 4.9515869261724444e-06, |
|
"loss": 0.0805, |
|
"num_input_tokens_seen": 383728, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3797264682220434, |
|
"grad_norm": 5.6687188148498535, |
|
"learning_rate": 4.949555583870983e-06, |
|
"loss": 0.1181, |
|
"num_input_tokens_seen": 390448, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.38616251005631536, |
|
"grad_norm": 9.777739524841309, |
|
"learning_rate": 4.9474829307735115e-06, |
|
"loss": 0.1613, |
|
"num_input_tokens_seen": 396960, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3925985518905873, |
|
"grad_norm": 10.064454078674316, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.0885, |
|
"num_input_tokens_seen": 403680, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3990345937248592, |
|
"grad_norm": 6.493910312652588, |
|
"learning_rate": 4.943213832704575e-06, |
|
"loss": 0.1526, |
|
"num_input_tokens_seen": 410096, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.40547063555913115, |
|
"grad_norm": 7.949091911315918, |
|
"learning_rate": 4.941017459729778e-06, |
|
"loss": 0.1114, |
|
"num_input_tokens_seen": 416672, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.41190667739340303, |
|
"grad_norm": 8.829463958740234, |
|
"learning_rate": 4.938779919951092e-06, |
|
"loss": 0.1139, |
|
"num_input_tokens_seen": 423136, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.41834271922767496, |
|
"grad_norm": 3.495246410369873, |
|
"learning_rate": 4.936501251103751e-06, |
|
"loss": 0.0878, |
|
"num_input_tokens_seen": 429888, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4247787610619469, |
|
"grad_norm": 8.937992095947266, |
|
"learning_rate": 4.934181491616613e-06, |
|
"loss": 0.1047, |
|
"num_input_tokens_seen": 436720, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.43121480289621883, |
|
"grad_norm": 12.225470542907715, |
|
"learning_rate": 4.9318206806115125e-06, |
|
"loss": 0.1323, |
|
"num_input_tokens_seen": 443648, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.43765084473049076, |
|
"grad_norm": 8.192527770996094, |
|
"learning_rate": 4.929418857902603e-06, |
|
"loss": 0.095, |
|
"num_input_tokens_seen": 450464, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4440868865647627, |
|
"grad_norm": 9.30573844909668, |
|
"learning_rate": 4.926976063995687e-06, |
|
"loss": 0.2024, |
|
"num_input_tokens_seen": 457296, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4505229283990346, |
|
"grad_norm": 16.341676712036133, |
|
"learning_rate": 4.9244923400875245e-06, |
|
"loss": 0.1614, |
|
"num_input_tokens_seen": 463984, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4569589702333065, |
|
"grad_norm": 19.184734344482422, |
|
"learning_rate": 4.921967728065147e-06, |
|
"loss": 0.2073, |
|
"num_input_tokens_seen": 470432, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.46339501206757844, |
|
"grad_norm": 9.802066802978516, |
|
"learning_rate": 4.91940227050515e-06, |
|
"loss": 0.108, |
|
"num_input_tokens_seen": 476736, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.46983105390185037, |
|
"grad_norm": 8.744816780090332, |
|
"learning_rate": 4.916796010672969e-06, |
|
"loss": 0.1455, |
|
"num_input_tokens_seen": 482896, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4762670957361223, |
|
"grad_norm": 11.598526954650879, |
|
"learning_rate": 4.914148992522157e-06, |
|
"loss": 0.131, |
|
"num_input_tokens_seen": 489504, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4827031375703942, |
|
"grad_norm": 11.42316722869873, |
|
"learning_rate": 4.911461260693639e-06, |
|
"loss": 0.1416, |
|
"num_input_tokens_seen": 496160, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4891391794046661, |
|
"grad_norm": 3.1568145751953125, |
|
"learning_rate": 4.908732860514958e-06, |
|
"loss": 0.1045, |
|
"num_input_tokens_seen": 502528, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.49557522123893805, |
|
"grad_norm": 9.457361221313477, |
|
"learning_rate": 4.905963837999518e-06, |
|
"loss": 0.1466, |
|
"num_input_tokens_seen": 509104, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.50201126307321, |
|
"grad_norm": 9.220935821533203, |
|
"learning_rate": 4.903154239845798e-06, |
|
"loss": 0.1502, |
|
"num_input_tokens_seen": 515760, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5084473049074819, |
|
"grad_norm": 5.706662654876709, |
|
"learning_rate": 4.900304113436571e-06, |
|
"loss": 0.1235, |
|
"num_input_tokens_seen": 522336, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5148833467417538, |
|
"grad_norm": 8.74252700805664, |
|
"learning_rate": 4.897413506838103e-06, |
|
"loss": 0.1022, |
|
"num_input_tokens_seen": 528960, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5213193885760258, |
|
"grad_norm": 4.498232841491699, |
|
"learning_rate": 4.894482468799344e-06, |
|
"loss": 0.0922, |
|
"num_input_tokens_seen": 535920, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5277554304102977, |
|
"grad_norm": 3.6902291774749756, |
|
"learning_rate": 4.891511048751102e-06, |
|
"loss": 0.0699, |
|
"num_input_tokens_seen": 542496, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5341914722445696, |
|
"grad_norm": 5.754522323608398, |
|
"learning_rate": 4.888499296805214e-06, |
|
"loss": 0.1057, |
|
"num_input_tokens_seen": 548752, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5406275140788415, |
|
"grad_norm": 4.513391017913818, |
|
"learning_rate": 4.8854472637536966e-06, |
|
"loss": 0.0793, |
|
"num_input_tokens_seen": 555696, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5470635559131134, |
|
"grad_norm": 4.931502342224121, |
|
"learning_rate": 4.882355001067892e-06, |
|
"loss": 0.14, |
|
"num_input_tokens_seen": 562192, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5534995977473853, |
|
"grad_norm": 6.896547794342041, |
|
"learning_rate": 4.8792225608976e-06, |
|
"loss": 0.1538, |
|
"num_input_tokens_seen": 568672, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5599356395816573, |
|
"grad_norm": 3.4364850521087646, |
|
"learning_rate": 4.8760499960702005e-06, |
|
"loss": 0.1135, |
|
"num_input_tokens_seen": 575440, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5663716814159292, |
|
"grad_norm": 6.179934501647949, |
|
"learning_rate": 4.8728373600897535e-06, |
|
"loss": 0.1253, |
|
"num_input_tokens_seen": 581808, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5728077232502011, |
|
"grad_norm": 14.744488716125488, |
|
"learning_rate": 4.869584707136109e-06, |
|
"loss": 0.1408, |
|
"num_input_tokens_seen": 588576, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5792437650844731, |
|
"grad_norm": 8.414978981018066, |
|
"learning_rate": 4.8662920920639866e-06, |
|
"loss": 0.0916, |
|
"num_input_tokens_seen": 595328, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.585679806918745, |
|
"grad_norm": 3.016206979751587, |
|
"learning_rate": 4.86295957040205e-06, |
|
"loss": 0.1016, |
|
"num_input_tokens_seen": 601808, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5921158487530169, |
|
"grad_norm": 3.084475040435791, |
|
"learning_rate": 4.8595871983519705e-06, |
|
"loss": 0.0936, |
|
"num_input_tokens_seen": 608400, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5985518905872889, |
|
"grad_norm": 5.78838586807251, |
|
"learning_rate": 4.856175032787485e-06, |
|
"loss": 0.1557, |
|
"num_input_tokens_seen": 615296, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6049879324215608, |
|
"grad_norm": 4.525265216827393, |
|
"learning_rate": 4.852723131253429e-06, |
|
"loss": 0.0879, |
|
"num_input_tokens_seen": 621888, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6114239742558326, |
|
"grad_norm": 8.129231452941895, |
|
"learning_rate": 4.849231551964771e-06, |
|
"loss": 0.1399, |
|
"num_input_tokens_seen": 628768, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6178600160901045, |
|
"grad_norm": 7.902085781097412, |
|
"learning_rate": 4.845700353805629e-06, |
|
"loss": 0.1724, |
|
"num_input_tokens_seen": 635056, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6242960579243765, |
|
"grad_norm": 7.928036689758301, |
|
"learning_rate": 4.842129596328277e-06, |
|
"loss": 0.1018, |
|
"num_input_tokens_seen": 641872, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6307320997586484, |
|
"grad_norm": 5.5206756591796875, |
|
"learning_rate": 4.838519339752143e-06, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 648752, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6371681415929203, |
|
"grad_norm": 5.910008907318115, |
|
"learning_rate": 4.834869644962789e-06, |
|
"loss": 0.1094, |
|
"num_input_tokens_seen": 655424, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6436041834271923, |
|
"grad_norm": 17.12012481689453, |
|
"learning_rate": 4.83118057351089e-06, |
|
"loss": 0.1915, |
|
"num_input_tokens_seen": 662224, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6500402252614642, |
|
"grad_norm": 13.876479148864746, |
|
"learning_rate": 4.827452187611192e-06, |
|
"loss": 0.1518, |
|
"num_input_tokens_seen": 668576, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6564762670957361, |
|
"grad_norm": 9.082406044006348, |
|
"learning_rate": 4.823684550141464e-06, |
|
"loss": 0.141, |
|
"num_input_tokens_seen": 675232, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6629123089300081, |
|
"grad_norm": 7.364652633666992, |
|
"learning_rate": 4.819877724641437e-06, |
|
"loss": 0.1564, |
|
"num_input_tokens_seen": 681856, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.66934835076428, |
|
"grad_norm": 16.54301643371582, |
|
"learning_rate": 4.8160317753117326e-06, |
|
"loss": 0.1267, |
|
"num_input_tokens_seen": 688416, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6757843925985519, |
|
"grad_norm": 25.702648162841797, |
|
"learning_rate": 4.81214676701278e-06, |
|
"loss": 0.228, |
|
"num_input_tokens_seen": 695248, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6822204344328238, |
|
"grad_norm": 17.066158294677734, |
|
"learning_rate": 4.808222765263724e-06, |
|
"loss": 0.1532, |
|
"num_input_tokens_seen": 701952, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6886564762670957, |
|
"grad_norm": 11.833669662475586, |
|
"learning_rate": 4.8042598362413175e-06, |
|
"loss": 0.1482, |
|
"num_input_tokens_seen": 708368, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6950925181013676, |
|
"grad_norm": 2.9714369773864746, |
|
"learning_rate": 4.800258046778809e-06, |
|
"loss": 0.1074, |
|
"num_input_tokens_seen": 714768, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7015285599356396, |
|
"grad_norm": 9.384042739868164, |
|
"learning_rate": 4.796217464364808e-06, |
|
"loss": 0.1163, |
|
"num_input_tokens_seen": 721600, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7079646017699115, |
|
"grad_norm": 12.922999382019043, |
|
"learning_rate": 4.792138157142158e-06, |
|
"loss": 0.1871, |
|
"num_input_tokens_seen": 728448, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7144006436041834, |
|
"grad_norm": 5.947402000427246, |
|
"learning_rate": 4.788020193906776e-06, |
|
"loss": 0.092, |
|
"num_input_tokens_seen": 734720, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7208366854384554, |
|
"grad_norm": 6.692570686340332, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.0959, |
|
"num_input_tokens_seen": 741216, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 4.1358771324157715, |
|
"learning_rate": 4.779668577839921e-06, |
|
"loss": 0.0853, |
|
"num_input_tokens_seen": 747664, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7337087691069992, |
|
"grad_norm": 4.695752143859863, |
|
"learning_rate": 4.775435065855183e-06, |
|
"loss": 0.1581, |
|
"num_input_tokens_seen": 754480, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7401448109412712, |
|
"grad_norm": 3.8357784748077393, |
|
"learning_rate": 4.771163179548809e-06, |
|
"loss": 0.0907, |
|
"num_input_tokens_seen": 761344, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7465808527755431, |
|
"grad_norm": 4.455271244049072, |
|
"learning_rate": 4.766852990964492e-06, |
|
"loss": 0.0826, |
|
"num_input_tokens_seen": 768160, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7530168946098149, |
|
"grad_norm": 3.682065963745117, |
|
"learning_rate": 4.762504572791873e-06, |
|
"loss": 0.0979, |
|
"num_input_tokens_seen": 774496, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7594529364440868, |
|
"grad_norm": 6.100201606750488, |
|
"learning_rate": 4.7581179983653224e-06, |
|
"loss": 0.1617, |
|
"num_input_tokens_seen": 781232, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7658889782783588, |
|
"grad_norm": 7.6822991371154785, |
|
"learning_rate": 4.753693341662702e-06, |
|
"loss": 0.1306, |
|
"num_input_tokens_seen": 788064, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7723250201126307, |
|
"grad_norm": 4.73075008392334, |
|
"learning_rate": 4.749230677304114e-06, |
|
"loss": 0.0955, |
|
"num_input_tokens_seen": 794656, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7787610619469026, |
|
"grad_norm": 4.435886859893799, |
|
"learning_rate": 4.7447300805506455e-06, |
|
"loss": 0.0683, |
|
"num_input_tokens_seen": 801184, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7851971037811746, |
|
"grad_norm": 3.517606735229492, |
|
"learning_rate": 4.7401916273031e-06, |
|
"loss": 0.1117, |
|
"num_input_tokens_seen": 808000, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7916331456154465, |
|
"grad_norm": 3.445953130722046, |
|
"learning_rate": 4.7356153941007145e-06, |
|
"loss": 0.1115, |
|
"num_input_tokens_seen": 814608, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7980691874497184, |
|
"grad_norm": 2.4660255908966064, |
|
"learning_rate": 4.73100145811987e-06, |
|
"loss": 0.0945, |
|
"num_input_tokens_seen": 821072, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8045052292839904, |
|
"grad_norm": 6.673710346221924, |
|
"learning_rate": 4.726349897172791e-06, |
|
"loss": 0.1125, |
|
"num_input_tokens_seen": 827840, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8109412711182623, |
|
"grad_norm": 7.910948753356934, |
|
"learning_rate": 4.721660789706232e-06, |
|
"loss": 0.15, |
|
"num_input_tokens_seen": 834880, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8173773129525342, |
|
"grad_norm": 4.5563154220581055, |
|
"learning_rate": 4.716934214800155e-06, |
|
"loss": 0.1015, |
|
"num_input_tokens_seen": 841360, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8238133547868061, |
|
"grad_norm": 5.7200422286987305, |
|
"learning_rate": 4.712170252166395e-06, |
|
"loss": 0.1271, |
|
"num_input_tokens_seen": 847888, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.830249396621078, |
|
"grad_norm": 4.4525465965271, |
|
"learning_rate": 4.707368982147318e-06, |
|
"loss": 0.0762, |
|
"num_input_tokens_seen": 854896, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8366854384553499, |
|
"grad_norm": 4.427840232849121, |
|
"learning_rate": 4.702530485714462e-06, |
|
"loss": 0.1196, |
|
"num_input_tokens_seen": 861600, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8431214802896219, |
|
"grad_norm": 3.674197196960449, |
|
"learning_rate": 4.697654844467175e-06, |
|
"loss": 0.0866, |
|
"num_input_tokens_seen": 868272, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8495575221238938, |
|
"grad_norm": 7.5055413246154785, |
|
"learning_rate": 4.69274214063124e-06, |
|
"loss": 0.0718, |
|
"num_input_tokens_seen": 875232, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8559935639581657, |
|
"grad_norm": 7.523169040679932, |
|
"learning_rate": 4.687792457057482e-06, |
|
"loss": 0.0808, |
|
"num_input_tokens_seen": 882112, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8624296057924377, |
|
"grad_norm": 10.57685375213623, |
|
"learning_rate": 4.682805877220378e-06, |
|
"loss": 0.1069, |
|
"num_input_tokens_seen": 888848, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8688656476267096, |
|
"grad_norm": 6.235794544219971, |
|
"learning_rate": 4.677782485216644e-06, |
|
"loss": 0.0804, |
|
"num_input_tokens_seen": 895136, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8753016894609815, |
|
"grad_norm": 5.526005268096924, |
|
"learning_rate": 4.672722365763821e-06, |
|
"loss": 0.068, |
|
"num_input_tokens_seen": 901552, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8817377312952535, |
|
"grad_norm": 6.142871856689453, |
|
"learning_rate": 4.667625604198842e-06, |
|
"loss": 0.1193, |
|
"num_input_tokens_seen": 908272, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8881737731295254, |
|
"grad_norm": 17.300273895263672, |
|
"learning_rate": 4.662492286476595e-06, |
|
"loss": 0.1535, |
|
"num_input_tokens_seen": 914928, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8946098149637972, |
|
"grad_norm": 13.767914772033691, |
|
"learning_rate": 4.657322499168475e-06, |
|
"loss": 0.1303, |
|
"num_input_tokens_seen": 921296, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9010458567980691, |
|
"grad_norm": 5.356888294219971, |
|
"learning_rate": 4.65211632946092e-06, |
|
"loss": 0.0879, |
|
"num_input_tokens_seen": 927728, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9074818986323411, |
|
"grad_norm": 10.261467933654785, |
|
"learning_rate": 4.646873865153945e-06, |
|
"loss": 0.0986, |
|
"num_input_tokens_seen": 934240, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.913917940466613, |
|
"grad_norm": 14.075957298278809, |
|
"learning_rate": 4.641595194659657e-06, |
|
"loss": 0.1219, |
|
"num_input_tokens_seen": 940832, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9203539823008849, |
|
"grad_norm": 11.964951515197754, |
|
"learning_rate": 4.63628040700077e-06, |
|
"loss": 0.1303, |
|
"num_input_tokens_seen": 947856, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9267900241351569, |
|
"grad_norm": 6.297915935516357, |
|
"learning_rate": 4.630929591809095e-06, |
|
"loss": 0.081, |
|
"num_input_tokens_seen": 954160, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9332260659694288, |
|
"grad_norm": 4.006863594055176, |
|
"learning_rate": 4.625542839324036e-06, |
|
"loss": 0.0979, |
|
"num_input_tokens_seen": 960848, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9396621078037007, |
|
"grad_norm": 9.041242599487305, |
|
"learning_rate": 4.620120240391065e-06, |
|
"loss": 0.1446, |
|
"num_input_tokens_seen": 967440, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9460981496379727, |
|
"grad_norm": 14.858406066894531, |
|
"learning_rate": 4.614661886460191e-06, |
|
"loss": 0.1267, |
|
"num_input_tokens_seen": 973808, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9525341914722446, |
|
"grad_norm": 12.371238708496094, |
|
"learning_rate": 4.609167869584416e-06, |
|
"loss": 0.1095, |
|
"num_input_tokens_seen": 980352, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9589702333065165, |
|
"grad_norm": 6.89439582824707, |
|
"learning_rate": 4.6036382824181836e-06, |
|
"loss": 0.1252, |
|
"num_input_tokens_seen": 987088, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9654062751407884, |
|
"grad_norm": 3.6482529640197754, |
|
"learning_rate": 4.598073218215817e-06, |
|
"loss": 0.0645, |
|
"num_input_tokens_seen": 993648, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9718423169750603, |
|
"grad_norm": 6.078918933868408, |
|
"learning_rate": 4.592472770829945e-06, |
|
"loss": 0.0974, |
|
"num_input_tokens_seen": 1000272, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9782783588093322, |
|
"grad_norm": 10.974119186401367, |
|
"learning_rate": 4.586837034709921e-06, |
|
"loss": 0.0833, |
|
"num_input_tokens_seen": 1006912, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9847144006436042, |
|
"grad_norm": 8.552461624145508, |
|
"learning_rate": 4.581166104900228e-06, |
|
"loss": 0.0787, |
|
"num_input_tokens_seen": 1013328, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9911504424778761, |
|
"grad_norm": 8.927652359008789, |
|
"learning_rate": 4.575460077038877e-06, |
|
"loss": 0.0814, |
|
"num_input_tokens_seen": 1020128, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.997586484312148, |
|
"grad_norm": 2.613471269607544, |
|
"learning_rate": 4.569719047355795e-06, |
|
"loss": 0.0278, |
|
"num_input_tokens_seen": 1026848, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.00402252614642, |
|
"grad_norm": 4.19236946105957, |
|
"learning_rate": 4.5639431126712e-06, |
|
"loss": 0.093, |
|
"num_input_tokens_seen": 1033728, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.010458567980692, |
|
"grad_norm": 6.943019866943359, |
|
"learning_rate": 4.5581323703939685e-06, |
|
"loss": 0.073, |
|
"num_input_tokens_seen": 1040720, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0168946098149638, |
|
"grad_norm": 6.545025825500488, |
|
"learning_rate": 4.552286918519996e-06, |
|
"loss": 0.0625, |
|
"num_input_tokens_seen": 1047168, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0233306516492358, |
|
"grad_norm": 7.890603065490723, |
|
"learning_rate": 4.5464068556305375e-06, |
|
"loss": 0.0461, |
|
"num_input_tokens_seen": 1053760, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0297666934835077, |
|
"grad_norm": 5.44887638092041, |
|
"learning_rate": 4.540492280890555e-06, |
|
"loss": 0.0318, |
|
"num_input_tokens_seen": 1060176, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0362027353177796, |
|
"grad_norm": 1.036007285118103, |
|
"learning_rate": 4.534543294047033e-06, |
|
"loss": 0.0068, |
|
"num_input_tokens_seen": 1066768, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0426387771520516, |
|
"grad_norm": 5.863292694091797, |
|
"learning_rate": 4.528559995427309e-06, |
|
"loss": 0.0462, |
|
"num_input_tokens_seen": 1073376, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0490748189863235, |
|
"grad_norm": 8.744257926940918, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.0487, |
|
"num_input_tokens_seen": 1079952, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0555108608205954, |
|
"grad_norm": 6.485115051269531, |
|
"learning_rate": 4.516490867060156e-06, |
|
"loss": 0.0664, |
|
"num_input_tokens_seen": 1086848, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0619469026548674, |
|
"grad_norm": 3.8945565223693848, |
|
"learning_rate": 4.5104052408538545e-06, |
|
"loss": 0.0347, |
|
"num_input_tokens_seen": 1093328, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.068382944489139, |
|
"grad_norm": 3.5805532932281494, |
|
"learning_rate": 4.504285709950167e-06, |
|
"loss": 0.0202, |
|
"num_input_tokens_seen": 1099840, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.074818986323411, |
|
"grad_norm": 6.033172607421875, |
|
"learning_rate": 4.498132377552587e-06, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 1106528, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.081255028157683, |
|
"grad_norm": 8.104386329650879, |
|
"learning_rate": 4.491945347434656e-06, |
|
"loss": 0.0848, |
|
"num_input_tokens_seen": 1113424, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0876910699919549, |
|
"grad_norm": 6.130471229553223, |
|
"learning_rate": 4.485724723938215e-06, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 1120064, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0941271118262268, |
|
"grad_norm": 8.871036529541016, |
|
"learning_rate": 4.479470611971646e-06, |
|
"loss": 0.1004, |
|
"num_input_tokens_seen": 1126960, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1005631536604987, |
|
"grad_norm": 7.802618026733398, |
|
"learning_rate": 4.473183117008096e-06, |
|
"loss": 0.0842, |
|
"num_input_tokens_seen": 1133664, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1069991954947707, |
|
"grad_norm": 2.848886489868164, |
|
"learning_rate": 4.4668623450837085e-06, |
|
"loss": 0.0452, |
|
"num_input_tokens_seen": 1140048, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1134352373290426, |
|
"grad_norm": 2.9371185302734375, |
|
"learning_rate": 4.460508402795827e-06, |
|
"loss": 0.0225, |
|
"num_input_tokens_seen": 1146448, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1198712791633145, |
|
"grad_norm": 2.1428751945495605, |
|
"learning_rate": 4.4541213973012005e-06, |
|
"loss": 0.0058, |
|
"num_input_tokens_seen": 1152960, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.1263073209975865, |
|
"grad_norm": 6.480560302734375, |
|
"learning_rate": 4.447701436314176e-06, |
|
"loss": 0.0565, |
|
"num_input_tokens_seen": 1159632, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1327433628318584, |
|
"grad_norm": 8.678375244140625, |
|
"learning_rate": 4.441248628104884e-06, |
|
"loss": 0.0591, |
|
"num_input_tokens_seen": 1166640, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1391794046661303, |
|
"grad_norm": 8.184906005859375, |
|
"learning_rate": 4.434763081497407e-06, |
|
"loss": 0.0488, |
|
"num_input_tokens_seen": 1173408, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.1456154465004023, |
|
"grad_norm": 3.727961540222168, |
|
"learning_rate": 4.428244905867952e-06, |
|
"loss": 0.0318, |
|
"num_input_tokens_seen": 1179776, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1520514883346742, |
|
"grad_norm": 7.119325160980225, |
|
"learning_rate": 4.421694211142998e-06, |
|
"loss": 0.064, |
|
"num_input_tokens_seen": 1186720, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1584875301689461, |
|
"grad_norm": 3.094886541366577, |
|
"learning_rate": 4.415111107797445e-06, |
|
"loss": 0.0465, |
|
"num_input_tokens_seen": 1193408, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.164923572003218, |
|
"grad_norm": 5.577038288116455, |
|
"learning_rate": 4.408495706852758e-06, |
|
"loss": 0.0344, |
|
"num_input_tokens_seen": 1200096, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.17135961383749, |
|
"grad_norm": 7.607036590576172, |
|
"learning_rate": 4.401848119875081e-06, |
|
"loss": 0.0747, |
|
"num_input_tokens_seen": 1206848, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.177795655671762, |
|
"grad_norm": 5.953075885772705, |
|
"learning_rate": 4.395168458973368e-06, |
|
"loss": 0.073, |
|
"num_input_tokens_seen": 1213632, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1842316975060339, |
|
"grad_norm": 7.784894943237305, |
|
"learning_rate": 4.388456836797484e-06, |
|
"loss": 0.0652, |
|
"num_input_tokens_seen": 1220336, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1906677393403058, |
|
"grad_norm": 6.535793781280518, |
|
"learning_rate": 4.381713366536312e-06, |
|
"loss": 0.0881, |
|
"num_input_tokens_seen": 1226736, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1971037811745777, |
|
"grad_norm": 4.9065093994140625, |
|
"learning_rate": 4.374938161915835e-06, |
|
"loss": 0.0676, |
|
"num_input_tokens_seen": 1233536, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2035398230088497, |
|
"grad_norm": 5.7732648849487305, |
|
"learning_rate": 4.368131337197228e-06, |
|
"loss": 0.0481, |
|
"num_input_tokens_seen": 1240032, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2099758648431216, |
|
"grad_norm": 5.656060218811035, |
|
"learning_rate": 4.361293007174926e-06, |
|
"loss": 0.0477, |
|
"num_input_tokens_seen": 1247008, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2164119066773935, |
|
"grad_norm": 3.2787587642669678, |
|
"learning_rate": 4.354423287174686e-06, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 1254032, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2228479485116655, |
|
"grad_norm": 9.526626586914062, |
|
"learning_rate": 4.3475222930516484e-06, |
|
"loss": 0.133, |
|
"num_input_tokens_seen": 1261104, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2292839903459372, |
|
"grad_norm": 4.272536277770996, |
|
"learning_rate": 4.340590141188377e-06, |
|
"loss": 0.0672, |
|
"num_input_tokens_seen": 1267680, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.235720032180209, |
|
"grad_norm": 4.5576701164245605, |
|
"learning_rate": 4.333626948492898e-06, |
|
"loss": 0.0352, |
|
"num_input_tokens_seen": 1274112, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.242156074014481, |
|
"grad_norm": 2.7765443325042725, |
|
"learning_rate": 4.326632832396733e-06, |
|
"loss": 0.0361, |
|
"num_input_tokens_seen": 1280528, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.248592115848753, |
|
"grad_norm": 2.681631565093994, |
|
"learning_rate": 4.319607910852911e-06, |
|
"loss": 0.0432, |
|
"num_input_tokens_seen": 1287232, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.255028157683025, |
|
"grad_norm": 7.467050075531006, |
|
"learning_rate": 4.3125523023339825e-06, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 1293792, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2614641995172968, |
|
"grad_norm": 2.6330530643463135, |
|
"learning_rate": 4.305466125830023e-06, |
|
"loss": 0.0473, |
|
"num_input_tokens_seen": 1300624, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2679002413515688, |
|
"grad_norm": 5.228641510009766, |
|
"learning_rate": 4.2983495008466285e-06, |
|
"loss": 0.0394, |
|
"num_input_tokens_seen": 1307520, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2743362831858407, |
|
"grad_norm": 5.0004191398620605, |
|
"learning_rate": 4.29120254740289e-06, |
|
"loss": 0.0645, |
|
"num_input_tokens_seen": 1313824, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2807723250201126, |
|
"grad_norm": 5.468844890594482, |
|
"learning_rate": 4.284025386029381e-06, |
|
"loss": 0.0663, |
|
"num_input_tokens_seen": 1319952, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2872083668543846, |
|
"grad_norm": 6.145412921905518, |
|
"learning_rate": 4.276818137766118e-06, |
|
"loss": 0.1067, |
|
"num_input_tokens_seen": 1326352, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2936444086886565, |
|
"grad_norm": 5.632473945617676, |
|
"learning_rate": 4.269580924160523e-06, |
|
"loss": 0.0603, |
|
"num_input_tokens_seen": 1332912, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.3000804505229284, |
|
"grad_norm": 1.326751947402954, |
|
"learning_rate": 4.262313867265369e-06, |
|
"loss": 0.0055, |
|
"num_input_tokens_seen": 1339872, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.3065164923572004, |
|
"grad_norm": 6.162146091461182, |
|
"learning_rate": 4.255017089636725e-06, |
|
"loss": 0.0547, |
|
"num_input_tokens_seen": 1346240, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3129525341914723, |
|
"grad_norm": 2.500483989715576, |
|
"learning_rate": 4.24769071433189e-06, |
|
"loss": 0.0684, |
|
"num_input_tokens_seen": 1353104, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3193885760257442, |
|
"grad_norm": 5.962297439575195, |
|
"learning_rate": 4.240334864907317e-06, |
|
"loss": 0.0484, |
|
"num_input_tokens_seen": 1359664, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3258246178600162, |
|
"grad_norm": 4.183216571807861, |
|
"learning_rate": 4.232949665416526e-06, |
|
"loss": 0.0225, |
|
"num_input_tokens_seen": 1366112, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.332260659694288, |
|
"grad_norm": 2.874197244644165, |
|
"learning_rate": 4.225535240408014e-06, |
|
"loss": 0.0278, |
|
"num_input_tokens_seen": 1372912, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.33869670152856, |
|
"grad_norm": 5.831293106079102, |
|
"learning_rate": 4.218091714923157e-06, |
|
"loss": 0.034, |
|
"num_input_tokens_seen": 1379200, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3451327433628317, |
|
"grad_norm": 4.147435665130615, |
|
"learning_rate": 4.210619214494099e-06, |
|
"loss": 0.0453, |
|
"num_input_tokens_seen": 1385520, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3515687851971037, |
|
"grad_norm": 6.03895378112793, |
|
"learning_rate": 4.203117865141635e-06, |
|
"loss": 0.0564, |
|
"num_input_tokens_seen": 1391968, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3580048270313756, |
|
"grad_norm": 4.196593284606934, |
|
"learning_rate": 4.195587793373085e-06, |
|
"loss": 0.0318, |
|
"num_input_tokens_seen": 1398576, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3644408688656475, |
|
"grad_norm": 6.364063739776611, |
|
"learning_rate": 4.188029126180161e-06, |
|
"loss": 0.0575, |
|
"num_input_tokens_seen": 1405280, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3708769106999195, |
|
"grad_norm": 5.420915603637695, |
|
"learning_rate": 4.180441991036827e-06, |
|
"loss": 0.0448, |
|
"num_input_tokens_seen": 1411968, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3773129525341914, |
|
"grad_norm": 5.313647747039795, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.0493, |
|
"num_input_tokens_seen": 1418576, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3837489943684633, |
|
"grad_norm": 3.2573652267456055, |
|
"learning_rate": 4.165182829193126e-06, |
|
"loss": 0.0478, |
|
"num_input_tokens_seen": 1425360, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3901850362027353, |
|
"grad_norm": 4.227644443511963, |
|
"learning_rate": 4.15751105983255e-06, |
|
"loss": 0.0887, |
|
"num_input_tokens_seen": 1432144, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.3966210780370072, |
|
"grad_norm": 6.514432907104492, |
|
"learning_rate": 4.149811337196808e-06, |
|
"loss": 0.0634, |
|
"num_input_tokens_seen": 1439200, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4030571198712791, |
|
"grad_norm": 3.3998050689697266, |
|
"learning_rate": 4.142083791138703e-06, |
|
"loss": 0.0349, |
|
"num_input_tokens_seen": 1445728, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.409493161705551, |
|
"grad_norm": 5.725708961486816, |
|
"learning_rate": 4.134328551980279e-06, |
|
"loss": 0.0459, |
|
"num_input_tokens_seen": 1452384, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.415929203539823, |
|
"grad_norm": 3.3524420261383057, |
|
"learning_rate": 4.126545750510605e-06, |
|
"loss": 0.0304, |
|
"num_input_tokens_seen": 1459136, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.422365245374095, |
|
"grad_norm": 9.169583320617676, |
|
"learning_rate": 4.118735517983584e-06, |
|
"loss": 0.0658, |
|
"num_input_tokens_seen": 1465632, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.4288012872083669, |
|
"grad_norm": 5.310299873352051, |
|
"learning_rate": 4.110897986115729e-06, |
|
"loss": 0.087, |
|
"num_input_tokens_seen": 1472592, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4352373290426388, |
|
"grad_norm": 4.850796222686768, |
|
"learning_rate": 4.1030332870839466e-06, |
|
"loss": 0.0952, |
|
"num_input_tokens_seen": 1479168, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4416733708769107, |
|
"grad_norm": 5.20851993560791, |
|
"learning_rate": 4.0951415535233065e-06, |
|
"loss": 0.0358, |
|
"num_input_tokens_seen": 1485664, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4481094127111827, |
|
"grad_norm": 4.104648113250732, |
|
"learning_rate": 4.087222918524807e-06, |
|
"loss": 0.0527, |
|
"num_input_tokens_seen": 1492368, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 2.5263378620147705, |
|
"learning_rate": 4.079277515633127e-06, |
|
"loss": 0.0452, |
|
"num_input_tokens_seen": 1498752, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4609814963797265, |
|
"grad_norm": 2.5317678451538086, |
|
"learning_rate": 4.0713054788443776e-06, |
|
"loss": 0.0313, |
|
"num_input_tokens_seen": 1505296, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4674175382139985, |
|
"grad_norm": 5.61666202545166, |
|
"learning_rate": 4.063306942603835e-06, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 1511584, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4738535800482704, |
|
"grad_norm": 3.4415996074676514, |
|
"learning_rate": 4.0552820418036855e-06, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 1517776, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4802896218825423, |
|
"grad_norm": 2.44814395904541, |
|
"learning_rate": 4.0472309117807365e-06, |
|
"loss": 0.0183, |
|
"num_input_tokens_seen": 1524416, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4867256637168142, |
|
"grad_norm": 3.827312469482422, |
|
"learning_rate": 4.039153688314146e-06, |
|
"loss": 0.0662, |
|
"num_input_tokens_seen": 1530864, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.4931617055510862, |
|
"grad_norm": 2.9351532459259033, |
|
"learning_rate": 4.031050507623125e-06, |
|
"loss": 0.0258, |
|
"num_input_tokens_seen": 1537216, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.4995977473853581, |
|
"grad_norm": 6.040038585662842, |
|
"learning_rate": 4.022921506364644e-06, |
|
"loss": 0.0584, |
|
"num_input_tokens_seen": 1543824, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.50603378921963, |
|
"grad_norm": 2.7363831996917725, |
|
"learning_rate": 4.014766821631128e-06, |
|
"loss": 0.0916, |
|
"num_input_tokens_seen": 1550432, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.512469831053902, |
|
"grad_norm": 4.466485977172852, |
|
"learning_rate": 4.006586590948141e-06, |
|
"loss": 0.0397, |
|
"num_input_tokens_seen": 1556912, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.518905872888174, |
|
"grad_norm": 4.36499547958374, |
|
"learning_rate": 3.998380952272073e-06, |
|
"loss": 0.0405, |
|
"num_input_tokens_seen": 1563456, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5253419147224458, |
|
"grad_norm": 3.1068978309631348, |
|
"learning_rate": 3.990150043987806e-06, |
|
"loss": 0.0645, |
|
"num_input_tokens_seen": 1570240, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5317779565567178, |
|
"grad_norm": 4.554339408874512, |
|
"learning_rate": 3.981894004906388e-06, |
|
"loss": 0.0389, |
|
"num_input_tokens_seen": 1576896, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5382139983909895, |
|
"grad_norm": 2.1207427978515625, |
|
"learning_rate": 3.973612974262685e-06, |
|
"loss": 0.0341, |
|
"num_input_tokens_seen": 1583440, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5446500402252614, |
|
"grad_norm": 4.71979284286499, |
|
"learning_rate": 3.965307091713037e-06, |
|
"loss": 0.0625, |
|
"num_input_tokens_seen": 1589968, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5510860820595334, |
|
"grad_norm": 3.9797351360321045, |
|
"learning_rate": 3.956976497332903e-06, |
|
"loss": 0.0651, |
|
"num_input_tokens_seen": 1596416, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5575221238938053, |
|
"grad_norm": 4.844697952270508, |
|
"learning_rate": 3.948621331614495e-06, |
|
"loss": 0.0391, |
|
"num_input_tokens_seen": 1602944, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.5639581657280772, |
|
"grad_norm": 4.572307109832764, |
|
"learning_rate": 3.9402417354644115e-06, |
|
"loss": 0.0486, |
|
"num_input_tokens_seen": 1609632, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5703942075623492, |
|
"grad_norm": 7.0537309646606445, |
|
"learning_rate": 3.9318378502012636e-06, |
|
"loss": 0.1192, |
|
"num_input_tokens_seen": 1616096, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.576830249396621, |
|
"grad_norm": 4.42478609085083, |
|
"learning_rate": 3.923409817553284e-06, |
|
"loss": 0.0679, |
|
"num_input_tokens_seen": 1622848, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.583266291230893, |
|
"grad_norm": 5.157562255859375, |
|
"learning_rate": 3.914957779655946e-06, |
|
"loss": 0.0493, |
|
"num_input_tokens_seen": 1629600, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.589702333065165, |
|
"grad_norm": 2.8394153118133545, |
|
"learning_rate": 3.906481879049559e-06, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 1636192, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.5961383748994369, |
|
"grad_norm": 2.4742684364318848, |
|
"learning_rate": 3.897982258676867e-06, |
|
"loss": 0.0391, |
|
"num_input_tokens_seen": 1642832, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6025744167337088, |
|
"grad_norm": 4.165124893188477, |
|
"learning_rate": 3.8894590618806435e-06, |
|
"loss": 0.0501, |
|
"num_input_tokens_seen": 1649904, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6090104585679805, |
|
"grad_norm": 2.7913286685943604, |
|
"learning_rate": 3.880912432401265e-06, |
|
"loss": 0.0397, |
|
"num_input_tokens_seen": 1656704, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6154465004022525, |
|
"grad_norm": 4.8400397300720215, |
|
"learning_rate": 3.872342514374291e-06, |
|
"loss": 0.0846, |
|
"num_input_tokens_seen": 1663680, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6218825422365244, |
|
"grad_norm": 3.111396074295044, |
|
"learning_rate": 3.863749452328035e-06, |
|
"loss": 0.0443, |
|
"num_input_tokens_seen": 1670160, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6283185840707963, |
|
"grad_norm": 3.1794304847717285, |
|
"learning_rate": 3.855133391181124e-06, |
|
"loss": 0.045, |
|
"num_input_tokens_seen": 1676832, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6347546259050683, |
|
"grad_norm": 1.6655223369598389, |
|
"learning_rate": 3.846494476240057e-06, |
|
"loss": 0.0172, |
|
"num_input_tokens_seen": 1683664, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6411906677393402, |
|
"grad_norm": 4.251989841461182, |
|
"learning_rate": 3.837832853196751e-06, |
|
"loss": 0.0949, |
|
"num_input_tokens_seen": 1690208, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6476267095736121, |
|
"grad_norm": 7.070593357086182, |
|
"learning_rate": 3.8291486681260904e-06, |
|
"loss": 0.0277, |
|
"num_input_tokens_seen": 1697296, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.654062751407884, |
|
"grad_norm": 2.8217155933380127, |
|
"learning_rate": 3.820442067483455e-06, |
|
"loss": 0.0247, |
|
"num_input_tokens_seen": 1703504, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.660498793242156, |
|
"grad_norm": 5.125271320343018, |
|
"learning_rate": 3.811713198102258e-06, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 1710016, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.666934835076428, |
|
"grad_norm": 5.227617263793945, |
|
"learning_rate": 3.802962207191463e-06, |
|
"loss": 0.0342, |
|
"num_input_tokens_seen": 1716960, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.6733708769106999, |
|
"grad_norm": 3.3697738647460938, |
|
"learning_rate": 3.794189242333107e-06, |
|
"loss": 0.0617, |
|
"num_input_tokens_seen": 1723504, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6798069187449718, |
|
"grad_norm": 2.9104015827178955, |
|
"learning_rate": 3.785394451479806e-06, |
|
"loss": 0.0675, |
|
"num_input_tokens_seen": 1730160, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6862429605792437, |
|
"grad_norm": 4.513949394226074, |
|
"learning_rate": 3.7765779829522674e-06, |
|
"loss": 0.1055, |
|
"num_input_tokens_seen": 1736752, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.6926790024135157, |
|
"grad_norm": 3.0852975845336914, |
|
"learning_rate": 3.7677399854367815e-06, |
|
"loss": 0.0355, |
|
"num_input_tokens_seen": 1743328, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.6991150442477876, |
|
"grad_norm": 3.222297191619873, |
|
"learning_rate": 3.7588806079827147e-06, |
|
"loss": 0.0622, |
|
"num_input_tokens_seen": 1749776, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.7055510860820595, |
|
"grad_norm": 2.017244338989258, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0297, |
|
"num_input_tokens_seen": 1756512, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7119871279163315, |
|
"grad_norm": 2.465116262435913, |
|
"learning_rate": 3.7410983112566166e-06, |
|
"loss": 0.0312, |
|
"num_input_tokens_seen": 1762928, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7184231697506034, |
|
"grad_norm": 2.8471832275390625, |
|
"learning_rate": 3.7321756918760587e-06, |
|
"loss": 0.0811, |
|
"num_input_tokens_seen": 1769392, |
      "step": 267
    },
    {
      "epoch": 1.7248592115848753,
      "grad_norm": 3.4750540256500244,
      "learning_rate": 3.7232322923348093e-06,
      "loss": 0.067,
      "num_input_tokens_seen": 1776032,
      "step": 268
    },
    {
      "epoch": 1.7312952534191473,
      "grad_norm": 2.845557928085327,
      "learning_rate": 3.7142682634598016e-06,
      "loss": 0.0553,
      "num_input_tokens_seen": 1782512,
      "step": 269
    },
    {
      "epoch": 1.7377312952534192,
      "grad_norm": 2.0945403575897217,
      "learning_rate": 3.7052837564258728e-06,
      "loss": 0.021,
      "num_input_tokens_seen": 1789280,
      "step": 270
    },
    {
      "epoch": 1.7441673370876911,
      "grad_norm": 2.614729642868042,
      "learning_rate": 3.6962789227532165e-06,
      "loss": 0.0589,
      "num_input_tokens_seen": 1795696,
      "step": 271
    },
    {
      "epoch": 1.750603378921963,
      "grad_norm": 3.331339120864868,
      "learning_rate": 3.6872539143048287e-06,
      "loss": 0.0521,
      "num_input_tokens_seen": 1802448,
      "step": 272
    },
    {
      "epoch": 1.757039420756235,
      "grad_norm": 2.845620632171631,
      "learning_rate": 3.6782088832839436e-06,
      "loss": 0.0402,
      "num_input_tokens_seen": 1809264,
      "step": 273
    },
    {
      "epoch": 1.763475462590507,
      "grad_norm": 3.3971211910247803,
      "learning_rate": 3.6691439822314672e-06,
      "loss": 0.0363,
      "num_input_tokens_seen": 1815808,
      "step": 274
    },
    {
      "epoch": 1.7699115044247788,
      "grad_norm": 5.249027729034424,
      "learning_rate": 3.660059364023409e-06,
      "loss": 0.0523,
      "num_input_tokens_seen": 1822352,
      "step": 275
    },
    {
      "epoch": 1.7763475462590508,
      "grad_norm": 3.6546497344970703,
      "learning_rate": 3.650955181868298e-06,
      "loss": 0.0255,
      "num_input_tokens_seen": 1829056,
      "step": 276
    },
    {
      "epoch": 1.7827835880933227,
      "grad_norm": 7.767543792724609,
      "learning_rate": 3.641831589304602e-06,
      "loss": 0.1031,
      "num_input_tokens_seen": 1835696,
      "step": 277
    },
    {
      "epoch": 1.7892196299275946,
      "grad_norm": 1.5550068616867065,
      "learning_rate": 3.6326887401981386e-06,
      "loss": 0.0452,
      "num_input_tokens_seen": 1842288,
      "step": 278
    },
    {
      "epoch": 1.7956556717618666,
      "grad_norm": 4.8318986892700195,
      "learning_rate": 3.6235267887394774e-06,
      "loss": 0.0537,
      "num_input_tokens_seen": 1848960,
      "step": 279
    },
    {
      "epoch": 1.8020917135961385,
      "grad_norm": 4.691814422607422,
      "learning_rate": 3.6143458894413463e-06,
      "loss": 0.0572,
      "num_input_tokens_seen": 1855648,
      "step": 280
    },
    {
      "epoch": 1.8085277554304104,
      "grad_norm": 2.6937472820281982,
      "learning_rate": 3.6051461971360146e-06,
      "loss": 0.0298,
      "num_input_tokens_seen": 1862160,
      "step": 281
    },
    {
      "epoch": 1.8149637972646824,
      "grad_norm": 4.052839279174805,
      "learning_rate": 3.595927866972694e-06,
      "loss": 0.037,
      "num_input_tokens_seen": 1868896,
      "step": 282
    },
    {
      "epoch": 1.8213998390989543,
      "grad_norm": 5.030338287353516,
      "learning_rate": 3.586691054414913e-06,
      "loss": 0.0783,
      "num_input_tokens_seen": 1875248,
      "step": 283
    },
    {
      "epoch": 1.827835880933226,
      "grad_norm": 1.9826079607009888,
      "learning_rate": 3.577435915237899e-06,
      "loss": 0.0436,
      "num_input_tokens_seen": 1881728,
      "step": 284
    },
    {
      "epoch": 1.834271922767498,
      "grad_norm": 1.8905837535858154,
      "learning_rate": 3.5681626055259526e-06,
      "loss": 0.0258,
      "num_input_tokens_seen": 1888384,
      "step": 285
    },
    {
      "epoch": 1.8407079646017699,
      "grad_norm": 1.9678194522857666,
      "learning_rate": 3.558871281669811e-06,
      "loss": 0.0235,
      "num_input_tokens_seen": 1894864,
      "step": 286
    },
    {
      "epoch": 1.8471440064360418,
      "grad_norm": 4.199605464935303,
      "learning_rate": 3.549562100364014e-06,
      "loss": 0.0541,
      "num_input_tokens_seen": 1901680,
      "step": 287
    },
    {
      "epoch": 1.8535800482703138,
      "grad_norm": 4.100510120391846,
      "learning_rate": 3.5402352186042602e-06,
      "loss": 0.0767,
      "num_input_tokens_seen": 1908304,
      "step": 288
    },
    {
      "epoch": 1.8600160901045857,
      "grad_norm": 6.471580982208252,
      "learning_rate": 3.530890793684759e-06,
      "loss": 0.0558,
      "num_input_tokens_seen": 1914736,
      "step": 289
    },
    {
      "epoch": 1.8664521319388576,
      "grad_norm": 6.2181525230407715,
      "learning_rate": 3.521528983195579e-06,
      "loss": 0.0483,
      "num_input_tokens_seen": 1921088,
      "step": 290
    },
    {
      "epoch": 1.8728881737731295,
      "grad_norm": 3.5814297199249268,
      "learning_rate": 3.512149945019989e-06,
      "loss": 0.0389,
      "num_input_tokens_seen": 1927408,
      "step": 291
    },
    {
      "epoch": 1.8793242156074015,
      "grad_norm": 3.193094491958618,
      "learning_rate": 3.502753837331797e-06,
      "loss": 0.034,
      "num_input_tokens_seen": 1934160,
      "step": 292
    },
    {
      "epoch": 1.8857602574416734,
      "grad_norm": 3.2676048278808594,
      "learning_rate": 3.4933408185926805e-06,
      "loss": 0.0921,
      "num_input_tokens_seen": 1940912,
      "step": 293
    },
    {
      "epoch": 1.8921962992759453,
      "grad_norm": 4.060972690582275,
      "learning_rate": 3.4839110475495153e-06,
      "loss": 0.0661,
      "num_input_tokens_seen": 1947488,
      "step": 294
    },
    {
      "epoch": 1.898632341110217,
      "grad_norm": 4.40585470199585,
      "learning_rate": 3.4744646832316985e-06,
      "loss": 0.0301,
      "num_input_tokens_seen": 1954000,
      "step": 295
    },
    {
      "epoch": 1.905068382944489,
      "grad_norm": 4.472731113433838,
      "learning_rate": 3.465001884948468e-06,
      "loss": 0.0878,
      "num_input_tokens_seen": 1960400,
      "step": 296
    },
    {
      "epoch": 1.911504424778761,
      "grad_norm": 3.2221555709838867,
      "learning_rate": 3.45552281228621e-06,
      "loss": 0.1126,
      "num_input_tokens_seen": 1967728,
      "step": 297
    },
    {
      "epoch": 1.9179404666130329,
      "grad_norm": 3.6210269927978516,
      "learning_rate": 3.446027625105776e-06,
      "loss": 0.0679,
      "num_input_tokens_seen": 1974096,
      "step": 298
    },
    {
      "epoch": 1.9243765084473048,
      "grad_norm": 2.038454055786133,
      "learning_rate": 3.436516483539781e-06,
      "loss": 0.031,
      "num_input_tokens_seen": 1980672,
      "step": 299
    },
    {
      "epoch": 1.9308125502815767,
      "grad_norm": 2.2427828311920166,
      "learning_rate": 3.4269895479899023e-06,
      "loss": 0.0687,
      "num_input_tokens_seen": 1987104,
      "step": 300
    },
    {
      "epoch": 1.9372485921158487,
      "grad_norm": 6.37827730178833,
      "learning_rate": 3.4174469791241805e-06,
      "loss": 0.0497,
      "num_input_tokens_seen": 1994064,
      "step": 301
    },
    {
      "epoch": 1.9436846339501206,
      "grad_norm": 9.542262077331543,
      "learning_rate": 3.4078889378743036e-06,
      "loss": 0.0829,
      "num_input_tokens_seen": 2001056,
      "step": 302
    },
    {
      "epoch": 1.9501206757843925,
      "grad_norm": 6.237174034118652,
      "learning_rate": 3.3983155854328942e-06,
      "loss": 0.0578,
      "num_input_tokens_seen": 2007712,
      "step": 303
    },
    {
      "epoch": 1.9565567176186645,
      "grad_norm": 2.3653266429901123,
      "learning_rate": 3.388727083250795e-06,
      "loss": 0.0398,
      "num_input_tokens_seen": 2014368,
      "step": 304
    },
    {
      "epoch": 1.9629927594529364,
      "grad_norm": 3.9448723793029785,
      "learning_rate": 3.379123593034342e-06,
      "loss": 0.0754,
      "num_input_tokens_seen": 2020592,
      "step": 305
    },
    {
      "epoch": 1.9694288012872083,
      "grad_norm": 2.1158804893493652,
      "learning_rate": 3.369505276742638e-06,
      "loss": 0.0395,
      "num_input_tokens_seen": 2026864,
      "step": 306
    },
    {
      "epoch": 1.9758648431214803,
      "grad_norm": 5.131661891937256,
      "learning_rate": 3.359872296584821e-06,
      "loss": 0.0575,
      "num_input_tokens_seen": 2033440,
      "step": 307
    },
    {
      "epoch": 1.9823008849557522,
      "grad_norm": 5.0783867835998535,
      "learning_rate": 3.350224815017331e-06,
      "loss": 0.0472,
      "num_input_tokens_seen": 2039712,
      "step": 308
    },
    {
      "epoch": 1.9887369267900241,
      "grad_norm": 6.688424587249756,
      "learning_rate": 3.3405629947411687e-06,
      "loss": 0.0498,
      "num_input_tokens_seen": 2046576,
      "step": 309
    },
    {
      "epoch": 1.995172968624296,
      "grad_norm": 5.24268102645874,
      "learning_rate": 3.3308869986991493e-06,
      "loss": 0.0447,
      "num_input_tokens_seen": 2053248,
      "step": 310
    },
    {
      "epoch": 2.001609010458568,
      "grad_norm": 1.7300570011138916,
      "learning_rate": 3.32119699007316e-06,
      "loss": 0.0155,
      "num_input_tokens_seen": 2059840,
      "step": 311
    },
    {
      "epoch": 2.00804505229284,
      "grad_norm": 2.5391845703125,
      "learning_rate": 3.311493132281402e-06,
      "loss": 0.0183,
      "num_input_tokens_seen": 2066384,
      "step": 312
    },
    {
      "epoch": 2.014481094127112,
      "grad_norm": 1.9404152631759644,
      "learning_rate": 3.3017755889756382e-06,
      "loss": 0.0102,
      "num_input_tokens_seen": 2073088,
      "step": 313
    },
    {
      "epoch": 2.020917135961384,
      "grad_norm": 1.1497960090637207,
      "learning_rate": 3.292044524038433e-06,
      "loss": 0.0119,
      "num_input_tokens_seen": 2079600,
      "step": 314
    },
    {
      "epoch": 2.0273531777956557,
      "grad_norm": 0.6188907027244568,
      "learning_rate": 3.2823001015803863e-06,
      "loss": 0.0037,
      "num_input_tokens_seen": 2086080,
      "step": 315
    },
    {
      "epoch": 2.0337892196299276,
      "grad_norm": 2.5652434825897217,
      "learning_rate": 3.272542485937369e-06,
      "loss": 0.0048,
      "num_input_tokens_seen": 2092768,
      "step": 316
    },
    {
      "epoch": 2.0402252614641996,
      "grad_norm": 1.3636257648468018,
      "learning_rate": 3.2627718416677484e-06,
      "loss": 0.004,
      "num_input_tokens_seen": 2099296,
      "step": 317
    },
    {
      "epoch": 2.0466613032984715,
      "grad_norm": 3.7406702041625977,
      "learning_rate": 3.2529883335496163e-06,
      "loss": 0.0472,
      "num_input_tokens_seen": 2106176,
      "step": 318
    },
    {
      "epoch": 2.0530973451327434,
      "grad_norm": 0.2876489460468292,
      "learning_rate": 3.243192126578007e-06,
      "loss": 0.0008,
      "num_input_tokens_seen": 2112560,
      "step": 319
    },
    {
      "epoch": 2.0595333869670154,
      "grad_norm": 3.388899087905884,
      "learning_rate": 3.2333833859621155e-06,
      "loss": 0.0332,
      "num_input_tokens_seen": 2119296,
      "step": 320
    },
    {
      "epoch": 2.0659694288012873,
      "grad_norm": 2.6212401390075684,
      "learning_rate": 3.223562277122513e-06,
      "loss": 0.0434,
      "num_input_tokens_seen": 2125632,
      "step": 321
    },
    {
      "epoch": 2.0724054706355592,
      "grad_norm": 3.6854021549224854,
      "learning_rate": 3.213728965688356e-06,
      "loss": 0.0105,
      "num_input_tokens_seen": 2132096,
      "step": 322
    },
    {
      "epoch": 2.078841512469831,
      "grad_norm": 3.9269893169403076,
      "learning_rate": 3.2038836174945907e-06,
      "loss": 0.0188,
      "num_input_tokens_seen": 2138336,
      "step": 323
    },
    {
      "epoch": 2.085277554304103,
      "grad_norm": 2.3363194465637207,
      "learning_rate": 3.194026398579162e-06,
      "loss": 0.0382,
      "num_input_tokens_seen": 2144672,
      "step": 324
    },
    {
      "epoch": 2.091713596138375,
      "grad_norm": 0.16176919639110565,
      "learning_rate": 3.184157475180208e-06,
      "loss": 0.0002,
      "num_input_tokens_seen": 2151216,
      "step": 325
    },
    {
      "epoch": 2.098149637972647,
      "grad_norm": 7.4007368087768555,
      "learning_rate": 3.1742770137332567e-06,
      "loss": 0.0473,
      "num_input_tokens_seen": 2158000,
      "step": 326
    },
    {
      "epoch": 2.104585679806919,
      "grad_norm": 0.3990660607814789,
      "learning_rate": 3.164385180868425e-06,
      "loss": 0.0008,
      "num_input_tokens_seen": 2164448,
      "step": 327
    },
    {
      "epoch": 2.111021721641191,
      "grad_norm": 5.447741508483887,
      "learning_rate": 3.1544821434076013e-06,
      "loss": 0.0123,
      "num_input_tokens_seen": 2171120,
      "step": 328
    },
    {
      "epoch": 2.1174577634754628,
      "grad_norm": 4.229776382446289,
      "learning_rate": 3.144568068361634e-06,
      "loss": 0.03,
      "num_input_tokens_seen": 2177648,
      "step": 329
    },
    {
      "epoch": 2.1238938053097347,
      "grad_norm": 5.920961380004883,
      "learning_rate": 3.1346431229275197e-06,
      "loss": 0.0207,
      "num_input_tokens_seen": 2183856,
      "step": 330
    },
    {
      "epoch": 2.1303298471440066,
      "grad_norm": 11.779773712158203,
      "learning_rate": 3.124707474485577e-06,
      "loss": 0.0172,
      "num_input_tokens_seen": 2190608,
      "step": 331
    },
    {
      "epoch": 2.136765888978278,
      "grad_norm": 8.82557201385498,
      "learning_rate": 3.1147612905966286e-06,
      "loss": 0.0115,
      "num_input_tokens_seen": 2197232,
      "step": 332
    },
    {
      "epoch": 2.14320193081255,
      "grad_norm": 1.1176470518112183,
      "learning_rate": 3.1048047389991693e-06,
      "loss": 0.0217,
      "num_input_tokens_seen": 2203456,
      "step": 333
    },
    {
      "epoch": 2.149637972646822,
      "grad_norm": 0.8806192278862,
      "learning_rate": 3.094837987606547e-06,
      "loss": 0.0035,
      "num_input_tokens_seen": 2209856,
      "step": 334
    },
    {
      "epoch": 2.156074014481094,
      "grad_norm": 6.793837547302246,
      "learning_rate": 3.084861204504122e-06,
      "loss": 0.0426,
      "num_input_tokens_seen": 2216400,
      "step": 335
    },
    {
      "epoch": 2.162510056315366,
      "grad_norm": 5.5410075187683105,
      "learning_rate": 3.0748745579464347e-06,
      "loss": 0.0382,
      "num_input_tokens_seen": 2222864,
      "step": 336
    },
    {
      "epoch": 2.168946098149638,
      "grad_norm": 1.8118884563446045,
      "learning_rate": 3.0648782163543696e-06,
      "loss": 0.0082,
      "num_input_tokens_seen": 2229760,
      "step": 337
    },
    {
      "epoch": 2.1753821399839097,
      "grad_norm": 2.607206106185913,
      "learning_rate": 3.0548723483123157e-06,
      "loss": 0.0338,
      "num_input_tokens_seen": 2236368,
      "step": 338
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 1.080344557762146,
      "learning_rate": 3.0448571225653195e-06,
      "loss": 0.0141,
      "num_input_tokens_seen": 2242816,
      "step": 339
    },
    {
      "epoch": 2.1882542236524536,
      "grad_norm": 2.380739212036133,
      "learning_rate": 3.0348327080162438e-06,
      "loss": 0.0287,
      "num_input_tokens_seen": 2249488,
      "step": 340
    },
    {
      "epoch": 2.1946902654867255,
      "grad_norm": 1.0098868608474731,
      "learning_rate": 3.0247992737229147e-06,
      "loss": 0.0027,
      "num_input_tokens_seen": 2255968,
      "step": 341
    },
    {
      "epoch": 2.2011263073209975,
      "grad_norm": 0.7940512895584106,
      "learning_rate": 3.014756988895275e-06,
      "loss": 0.0026,
      "num_input_tokens_seen": 2262544,
      "step": 342
    },
    {
      "epoch": 2.2075623491552694,
      "grad_norm": 2.9759926795959473,
      "learning_rate": 3.0047060228925256e-06,
      "loss": 0.039,
      "num_input_tokens_seen": 2269312,
      "step": 343
    },
    {
      "epoch": 2.2139983909895413,
      "grad_norm": 4.84032678604126,
      "learning_rate": 2.994646545220275e-06,
      "loss": 0.0154,
      "num_input_tokens_seen": 2275968,
      "step": 344
    },
    {
      "epoch": 2.2204344328238133,
      "grad_norm": 2.9671568870544434,
      "learning_rate": 2.9845787255276753e-06,
      "loss": 0.0231,
      "num_input_tokens_seen": 2282976,
      "step": 345
    },
    {
      "epoch": 2.226870474658085,
      "grad_norm": 5.410647392272949,
      "learning_rate": 2.9745027336045652e-06,
      "loss": 0.04,
      "num_input_tokens_seen": 2289696,
      "step": 346
    },
    {
      "epoch": 2.233306516492357,
      "grad_norm": 5.828602313995361,
      "learning_rate": 2.964418739378603e-06,
      "loss": 0.0282,
      "num_input_tokens_seen": 2296272,
      "step": 347
    },
    {
      "epoch": 2.239742558326629,
      "grad_norm": 1.9481452703475952,
      "learning_rate": 2.954326912912404e-06,
      "loss": 0.0143,
      "num_input_tokens_seen": 2303120,
      "step": 348
    },
    {
      "epoch": 2.246178600160901,
      "grad_norm": 3.2762415409088135,
      "learning_rate": 2.9442274244006725e-06,
      "loss": 0.0194,
      "num_input_tokens_seen": 2309728,
      "step": 349
    },
    {
      "epoch": 2.252614641995173,
      "grad_norm": 2.3237709999084473,
      "learning_rate": 2.9341204441673267e-06,
      "loss": 0.0051,
      "num_input_tokens_seen": 2316144,
      "step": 350
    },
    {
      "epoch": 2.259050683829445,
      "grad_norm": 1.7801238298416138,
      "learning_rate": 2.924006142662632e-06,
      "loss": 0.0162,
      "num_input_tokens_seen": 2322768,
      "step": 351
    },
    {
      "epoch": 2.265486725663717,
      "grad_norm": 4.876129150390625,
      "learning_rate": 2.913884690460325e-06,
      "loss": 0.0313,
      "num_input_tokens_seen": 2329312,
      "step": 352
    },
    {
      "epoch": 2.2719227674979887,
      "grad_norm": 0.9637519717216492,
      "learning_rate": 2.903756258254734e-06,
      "loss": 0.0041,
      "num_input_tokens_seen": 2335824,
      "step": 353
    },
    {
      "epoch": 2.2783588093322606,
      "grad_norm": 2.7481493949890137,
      "learning_rate": 2.8936210168579043e-06,
      "loss": 0.0321,
      "num_input_tokens_seen": 2342272,
      "step": 354
    },
    {
      "epoch": 2.2847948511665326,
      "grad_norm": 1.682763934135437,
      "learning_rate": 2.883479137196714e-06,
      "loss": 0.0064,
      "num_input_tokens_seen": 2349056,
      "step": 355
    },
    {
      "epoch": 2.2912308930008045,
      "grad_norm": 5.632142066955566,
      "learning_rate": 2.8733307903099926e-06,
      "loss": 0.0237,
      "num_input_tokens_seen": 2355552,
      "step": 356
    },
    {
      "epoch": 2.2976669348350764,
      "grad_norm": 2.460470199584961,
      "learning_rate": 2.8631761473456377e-06,
      "loss": 0.0152,
      "num_input_tokens_seen": 2361808,
      "step": 357
    },
    {
      "epoch": 2.3041029766693484,
      "grad_norm": 0.9998040199279785,
      "learning_rate": 2.853015379557729e-06,
      "loss": 0.0038,
      "num_input_tokens_seen": 2368288,
      "step": 358
    },
    {
      "epoch": 2.3105390185036203,
      "grad_norm": 3.164407968521118,
      "learning_rate": 2.842848658303637e-06,
      "loss": 0.0168,
      "num_input_tokens_seen": 2374960,
      "step": 359
    },
    {
      "epoch": 2.3169750603378922,
      "grad_norm": 2.3879611492156982,
      "learning_rate": 2.832676155041135e-06,
      "loss": 0.0049,
      "num_input_tokens_seen": 2381776,
      "step": 360
    },
    {
      "epoch": 2.323411102172164,
      "grad_norm": 1.3164470195770264,
      "learning_rate": 2.822498041325509e-06,
      "loss": 0.0114,
      "num_input_tokens_seen": 2388112,
      "step": 361
    },
    {
      "epoch": 2.329847144006436,
      "grad_norm": 2.3726656436920166,
      "learning_rate": 2.8123144888066623e-06,
      "loss": 0.022,
      "num_input_tokens_seen": 2394736,
      "step": 362
    },
    {
      "epoch": 2.336283185840708,
      "grad_norm": 1.7789826393127441,
      "learning_rate": 2.802125669226222e-06,
      "loss": 0.0154,
      "num_input_tokens_seen": 2401248,
      "step": 363
    },
    {
      "epoch": 2.34271922767498,
      "grad_norm": 3.68959641456604,
      "learning_rate": 2.7919317544146405e-06,
      "loss": 0.0204,
      "num_input_tokens_seen": 2407872,
      "step": 364
    },
    {
      "epoch": 2.349155269509252,
      "grad_norm": 2.4927353858947754,
      "learning_rate": 2.7817329162883033e-06,
      "loss": 0.0334,
      "num_input_tokens_seen": 2414432,
      "step": 365
    },
    {
      "epoch": 2.355591311343524,
      "grad_norm": 4.594964504241943,
      "learning_rate": 2.7715293268466204e-06,
      "loss": 0.0132,
      "num_input_tokens_seen": 2420848,
      "step": 366
    },
    {
      "epoch": 2.3620273531777958,
      "grad_norm": 4.325422286987305,
      "learning_rate": 2.761321158169134e-06,
      "loss": 0.0291,
      "num_input_tokens_seen": 2427728,
      "step": 367
    },
    {
      "epoch": 2.3684633950120677,
      "grad_norm": 2.46122407913208,
      "learning_rate": 2.7511085824126133e-06,
      "loss": 0.0089,
      "num_input_tokens_seen": 2434880,
      "step": 368
    },
    {
      "epoch": 2.3748994368463396,
      "grad_norm": 2.729311227798462,
      "learning_rate": 2.74089177180815e-06,
      "loss": 0.0306,
      "num_input_tokens_seen": 2441168,
      "step": 369
    },
    {
      "epoch": 2.3813354786806116,
      "grad_norm": 5.095163345336914,
      "learning_rate": 2.730670898658255e-06,
      "loss": 0.0297,
      "num_input_tokens_seen": 2447920,
      "step": 370
    },
    {
      "epoch": 2.3877715205148835,
      "grad_norm": 1.902287483215332,
      "learning_rate": 2.7204461353339546e-06,
      "loss": 0.0247,
      "num_input_tokens_seen": 2454704,
      "step": 371
    },
    {
      "epoch": 2.3942075623491554,
      "grad_norm": 3.267244577407837,
      "learning_rate": 2.7102176542718783e-06,
      "loss": 0.0234,
      "num_input_tokens_seen": 2461216,
      "step": 372
    },
    {
      "epoch": 2.4006436041834274,
      "grad_norm": 4.101126670837402,
      "learning_rate": 2.699985627971354e-06,
      "loss": 0.0192,
      "num_input_tokens_seen": 2468032,
      "step": 373
    },
    {
      "epoch": 2.4070796460176993,
      "grad_norm": 4.104948997497559,
      "learning_rate": 2.689750228991503e-06,
      "loss": 0.0324,
      "num_input_tokens_seen": 2474544,
      "step": 374
    },
    {
      "epoch": 2.4135156878519712,
      "grad_norm": 2.1446776390075684,
      "learning_rate": 2.679511629948319e-06,
      "loss": 0.0332,
      "num_input_tokens_seen": 2481312,
      "step": 375
    },
    {
      "epoch": 2.419951729686243,
      "grad_norm": 0.7457873225212097,
      "learning_rate": 2.669270003511769e-06,
      "loss": 0.0043,
      "num_input_tokens_seen": 2487888,
      "step": 376
    },
    {
      "epoch": 2.426387771520515,
      "grad_norm": 2.1420276165008545,
      "learning_rate": 2.6590255224028725e-06,
      "loss": 0.0197,
      "num_input_tokens_seen": 2494784,
      "step": 377
    },
    {
      "epoch": 2.432823813354787,
      "grad_norm": 3.0415239334106445,
      "learning_rate": 2.648778359390794e-06,
      "loss": 0.0366,
      "num_input_tokens_seen": 2501712,
      "step": 378
    },
    {
      "epoch": 2.439259855189059,
      "grad_norm": 3.6502788066864014,
      "learning_rate": 2.638528687289925e-06,
      "loss": 0.0173,
      "num_input_tokens_seen": 2508592,
      "step": 379
    },
    {
      "epoch": 2.445695897023331,
      "grad_norm": 2.2913506031036377,
      "learning_rate": 2.6282766789569742e-06,
      "loss": 0.0102,
      "num_input_tokens_seen": 2515216,
      "step": 380
    },
    {
      "epoch": 2.4521319388576024,
      "grad_norm": 3.3507297039031982,
      "learning_rate": 2.618022507288049e-06,
      "loss": 0.0361,
      "num_input_tokens_seen": 2522064,
      "step": 381
    },
    {
      "epoch": 2.4585679806918743,
      "grad_norm": 2.98098087310791,
      "learning_rate": 2.6077663452157398e-06,
      "loss": 0.0292,
      "num_input_tokens_seen": 2528608,
      "step": 382
    },
    {
      "epoch": 2.4650040225261463,
      "grad_norm": 1.4962135553359985,
      "learning_rate": 2.5975083657062043e-06,
      "loss": 0.0095,
      "num_input_tokens_seen": 2535328,
      "step": 383
    },
    {
      "epoch": 2.471440064360418,
      "grad_norm": 2.0819742679595947,
      "learning_rate": 2.587248741756253e-06,
      "loss": 0.015,
      "num_input_tokens_seen": 2542224,
      "step": 384
    },
    {
      "epoch": 2.47787610619469,
      "grad_norm": 1.8906433582305908,
      "learning_rate": 2.576987646390426e-06,
      "loss": 0.0276,
      "num_input_tokens_seen": 2548976,
      "step": 385
    },
    {
      "epoch": 2.484312148028962,
      "grad_norm": 2.451510190963745,
      "learning_rate": 2.566725252658081e-06,
      "loss": 0.0284,
      "num_input_tokens_seen": 2555568,
      "step": 386
    },
    {
      "epoch": 2.490748189863234,
      "grad_norm": 3.7337939739227295,
      "learning_rate": 2.5564617336304703e-06,
      "loss": 0.0366,
      "num_input_tokens_seen": 2562128,
      "step": 387
    },
    {
      "epoch": 2.497184231697506,
      "grad_norm": 1.6401593685150146,
      "learning_rate": 2.546197262397825e-06,
      "loss": 0.0322,
      "num_input_tokens_seen": 2568640,
      "step": 388
    },
    {
      "epoch": 2.503620273531778,
      "grad_norm": 0.9136457443237305,
      "learning_rate": 2.535932012066434e-06,
      "loss": 0.0057,
      "num_input_tokens_seen": 2575024,
      "step": 389
    },
    {
      "epoch": 2.51005631536605,
      "grad_norm": 1.119612455368042,
      "learning_rate": 2.525666155755725e-06,
      "loss": 0.0054,
      "num_input_tokens_seen": 2581520,
      "step": 390
    },
    {
      "epoch": 2.5164923572003217,
      "grad_norm": 2.4770889282226562,
      "learning_rate": 2.515399866595347e-06,
      "loss": 0.0199,
      "num_input_tokens_seen": 2588528,
      "step": 391
    },
    {
      "epoch": 2.5229283990345936,
      "grad_norm": 0.35335639119148254,
      "learning_rate": 2.5051333177222476e-06,
      "loss": 0.0045,
      "num_input_tokens_seen": 2594992,
      "step": 392
    },
    {
      "epoch": 2.5293644408688656,
      "grad_norm": 2.8933093547821045,
      "learning_rate": 2.4948666822777536e-06,
      "loss": 0.0283,
      "num_input_tokens_seen": 2601568,
      "step": 393
    },
    {
      "epoch": 2.5358004827031375,
      "grad_norm": 1.7032990455627441,
      "learning_rate": 2.4846001334046537e-06,
      "loss": 0.0248,
      "num_input_tokens_seen": 2608160,
      "step": 394
    },
    {
      "epoch": 2.5422365245374094,
      "grad_norm": 1.9688091278076172,
      "learning_rate": 2.474333844244276e-06,
      "loss": 0.0132,
      "num_input_tokens_seen": 2614656,
      "step": 395
    },
    {
      "epoch": 2.5486725663716814,
      "grad_norm": 3.135990619659424,
      "learning_rate": 2.464067987933567e-06,
      "loss": 0.04,
      "num_input_tokens_seen": 2621600,
      "step": 396
    },
    {
      "epoch": 2.5551086082059533,
      "grad_norm": 0.7140212059020996,
      "learning_rate": 2.453802737602176e-06,
      "loss": 0.0029,
      "num_input_tokens_seen": 2627984,
      "step": 397
    },
    {
      "epoch": 2.5615446500402252,
      "grad_norm": 3.9643640518188477,
      "learning_rate": 2.4435382663695305e-06,
      "loss": 0.0254,
      "num_input_tokens_seen": 2634720,
      "step": 398
    },
    {
      "epoch": 2.567980691874497,
      "grad_norm": 2.284302234649658,
      "learning_rate": 2.4332747473419193e-06,
      "loss": 0.0108,
      "num_input_tokens_seen": 2641456,
      "step": 399
    },
    {
      "epoch": 2.574416733708769,
      "grad_norm": 2.6400082111358643,
      "learning_rate": 2.4230123536095746e-06,
      "loss": 0.0269,
      "num_input_tokens_seen": 2647760,
      "step": 400
    },
    {
      "epoch": 2.580852775543041,
      "grad_norm": 3.1969995498657227,
      "learning_rate": 2.4127512582437486e-06,
      "loss": 0.0111,
      "num_input_tokens_seen": 2654608,
      "step": 401
    },
    {
      "epoch": 2.587288817377313,
      "grad_norm": 3.651118516921997,
      "learning_rate": 2.4024916342937966e-06,
      "loss": 0.0222,
      "num_input_tokens_seen": 2661072,
      "step": 402
    },
    {
      "epoch": 2.593724859211585,
      "grad_norm": 2.1281003952026367,
      "learning_rate": 2.392233654784262e-06,
      "loss": 0.0101,
      "num_input_tokens_seen": 2667712,
      "step": 403
    },
    {
      "epoch": 2.600160901045857,
      "grad_norm": 2.6782784461975098,
      "learning_rate": 2.3819774927119523e-06,
      "loss": 0.0138,
      "num_input_tokens_seen": 2674496,
      "step": 404
    },
    {
      "epoch": 2.6065969428801288,
      "grad_norm": 2.2902138233184814,
      "learning_rate": 2.3717233210430258e-06,
      "loss": 0.0281,
      "num_input_tokens_seen": 2680816,
      "step": 405
    },
    {
      "epoch": 2.6130329847144007,
      "grad_norm": 1.9150536060333252,
      "learning_rate": 2.3614713127100752e-06,
      "loss": 0.0042,
      "num_input_tokens_seen": 2687632,
      "step": 406
    },
    {
      "epoch": 2.6194690265486726,
      "grad_norm": 0.3568836748600006,
      "learning_rate": 2.3512216406092066e-06,
      "loss": 0.0015,
      "num_input_tokens_seen": 2694464,
      "step": 407
    },
    {
      "epoch": 2.6259050683829446,
      "grad_norm": 2.3506011962890625,
      "learning_rate": 2.340974477597128e-06,
      "loss": 0.0279,
      "num_input_tokens_seen": 2701344,
      "step": 408
    },
    {
      "epoch": 2.6323411102172165,
      "grad_norm": 2.780200481414795,
      "learning_rate": 2.3307299964882314e-06,
      "loss": 0.0399,
      "num_input_tokens_seen": 2707536,
      "step": 409
    },
    {
      "epoch": 2.6387771520514884,
      "grad_norm": 1.1793303489685059,
      "learning_rate": 2.3204883700516813e-06,
      "loss": 0.0074,
      "num_input_tokens_seen": 2714544,
      "step": 410
    },
    {
      "epoch": 2.6452131938857604,
      "grad_norm": 1.7807022333145142,
      "learning_rate": 2.310249771008498e-06,
      "loss": 0.0078,
      "num_input_tokens_seen": 2721056,
      "step": 411
    },
    {
      "epoch": 2.6516492357200323,
      "grad_norm": 12.764676094055176,
      "learning_rate": 2.3000143720286463e-06,
      "loss": 0.0406,
      "num_input_tokens_seen": 2727664,
      "step": 412
    },
    {
      "epoch": 2.6580852775543042,
      "grad_norm": 0.44338610768318176,
      "learning_rate": 2.2897823457281225e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 2733600,
      "step": 413
    },
    {
      "epoch": 2.664521319388576,
      "grad_norm": 3.5756232738494873,
      "learning_rate": 2.2795538646660462e-06,
      "loss": 0.006,
      "num_input_tokens_seen": 2740400,
      "step": 414
    },
    {
      "epoch": 2.670957361222848,
      "grad_norm": 1.4301191568374634,
      "learning_rate": 2.269329101341745e-06,
      "loss": 0.0236,
      "num_input_tokens_seen": 2747248,
      "step": 415
    },
    {
      "epoch": 2.67739340305712,
      "grad_norm": 2.0859804153442383,
      "learning_rate": 2.2591082281918507e-06,
      "loss": 0.0136,
      "num_input_tokens_seen": 2753776,
      "step": 416
    },
    {
      "epoch": 2.6838294448913915,
      "grad_norm": 2.9704370498657227,
      "learning_rate": 2.2488914175873876e-06,
      "loss": 0.015,
      "num_input_tokens_seen": 2760720,
      "step": 417
    },
    {
      "epoch": 2.6902654867256635,
      "grad_norm": 3.1178269386291504,
      "learning_rate": 2.238678841830867e-06,
      "loss": 0.0483,
      "num_input_tokens_seen": 2767136,
      "step": 418
    },
    {
      "epoch": 2.6967015285599354,
      "grad_norm": 0.6049777269363403,
      "learning_rate": 2.2284706731533805e-06,
      "loss": 0.0014,
      "num_input_tokens_seen": 2773680,
      "step": 419
    },
    {
      "epoch": 2.7031375703942073,
      "grad_norm": 3.5615270137786865,
      "learning_rate": 2.2182670837116975e-06,
      "loss": 0.0279,
      "num_input_tokens_seen": 2780160,
      "step": 420
    },
    {
      "epoch": 2.7095736122284793,
      "grad_norm": 3.4241111278533936,
      "learning_rate": 2.20806824558536e-06,
      "loss": 0.0705,
      "num_input_tokens_seen": 2786912,
      "step": 421
    },
    {
      "epoch": 2.716009654062751,
      "grad_norm": 1.0644826889038086,
      "learning_rate": 2.197874330773779e-06,
      "loss": 0.005,
      "num_input_tokens_seen": 2793888,
      "step": 422
    },
    {
      "epoch": 2.722445695897023,
      "grad_norm": 5.071107387542725,
      "learning_rate": 2.1876855111933385e-06,
      "loss": 0.0453,
      "num_input_tokens_seen": 2800320,
      "step": 423
    },
    {
      "epoch": 2.728881737731295,
      "grad_norm": 1.9479647874832153,
      "learning_rate": 2.1775019586744924e-06,
      "loss": 0.0095,
      "num_input_tokens_seen": 2807088,
      "step": 424
    },
    {
      "epoch": 2.735317779565567,
      "grad_norm": 2.730952262878418,
      "learning_rate": 2.167323844958867e-06,
      "loss": 0.0095,
      "num_input_tokens_seen": 2813312,
      "step": 425
    },
    {
      "epoch": 2.741753821399839,
      "grad_norm": 2.1456387042999268,
      "learning_rate": 2.1571513416963647e-06,
      "loss": 0.0138,
      "num_input_tokens_seen": 2819936,
      "step": 426
    },
    {
      "epoch": 2.748189863234111,
      "grad_norm": 2.14911150932312,
      "learning_rate": 2.1469846204422724e-06,
      "loss": 0.0272,
      "num_input_tokens_seen": 2826224,
      "step": 427
    },
    {
      "epoch": 2.754625905068383,
      "grad_norm": 0.5957837700843811,
      "learning_rate": 2.136823852654363e-06,
      "loss": 0.0026,
      "num_input_tokens_seen": 2832960,
      "step": 428
    },
    {
      "epoch": 2.7610619469026547,
      "grad_norm": 0.3253982961177826,
      "learning_rate": 2.126669209690008e-06,
      "loss": 0.0016,
      "num_input_tokens_seen": 2839888,
      "step": 429
    },
    {
      "epoch": 2.7674979887369267,
      "grad_norm": 3.472017765045166,
      "learning_rate": 2.1165208628032863e-06,
      "loss": 0.0633,
      "num_input_tokens_seen": 2846688,
      "step": 430
    },
    {
      "epoch": 2.7739340305711986,
      "grad_norm": 2.029026985168457,
      "learning_rate": 2.1063789831420957e-06,
      "loss": 0.0191,
      "num_input_tokens_seen": 2853184,
      "step": 431
    },
    {
      "epoch": 2.7803700724054705,
      "grad_norm": 2.316349506378174,
      "learning_rate": 2.096243741745266e-06,
      "loss": 0.0075,
      "num_input_tokens_seen": 2859632,
      "step": 432
    },
    {
      "epoch": 2.7868061142397424,
      "grad_norm": 3.786245346069336,
      "learning_rate": 2.086115309539675e-06,
      "loss": 0.0371,
      "num_input_tokens_seen": 2865920,
      "step": 433
    },
    {
      "epoch": 2.7932421560740144,
      "grad_norm": 1.864402413368225,
      "learning_rate": 2.0759938573373683e-06,
      "loss": 0.0275,
      "num_input_tokens_seen": 2872464,
      "step": 434
    },
    {
      "epoch": 2.7996781979082863,
      "grad_norm": 8.142292022705078,
      "learning_rate": 2.0658795558326745e-06,
      "loss": 0.0441,
      "num_input_tokens_seen": 2879168,
      "step": 435
    },
    {
      "epoch": 2.8061142397425582,
      "grad_norm": 1.3945283889770508,
      "learning_rate": 2.0557725755993283e-06,
      "loss": 0.0074,
      "num_input_tokens_seen": 2885520,
      "step": 436
    },
    {
      "epoch": 2.81255028157683,
      "grad_norm": 1.951145887374878,
      "learning_rate": 2.0456730870875964e-06,
      "loss": 0.0412,
      "num_input_tokens_seen": 2892368,
      "step": 437
    },
    {
      "epoch": 2.818986323411102,
      "grad_norm": 1.666693925857544,
      "learning_rate": 2.035581260621398e-06,
      "loss": 0.007,
      "num_input_tokens_seen": 2898640,
      "step": 438
    },
    {
      "epoch": 2.825422365245374,
      "grad_norm": 0.8178473114967346,
      "learning_rate": 2.0254972663954356e-06,
      "loss": 0.0195,
      "num_input_tokens_seen": 2905312,
      "step": 439
    },
    {
      "epoch": 2.831858407079646,
      "grad_norm": 2.1499900817871094,
      "learning_rate": 2.015421274472325e-06,
      "loss": 0.0104,
      "num_input_tokens_seen": 2911872,
      "step": 440
    },
    {
      "epoch": 2.838294448913918,
      "grad_norm": 3.162245273590088,
      "learning_rate": 2.005353454779726e-06,
      "loss": 0.0196,
      "num_input_tokens_seen": 2918496,
      "step": 441
    },
    {
      "epoch": 2.84473049074819,
      "grad_norm": 1.1920592784881592,
      "learning_rate": 1.995293977107475e-06,
      "loss": 0.0131,
      "num_input_tokens_seen": 2924944,
      "step": 442
    },
    {
      "epoch": 2.8511665325824618,
      "grad_norm": 1.091436743736267,
      "learning_rate": 1.9852430111047254e-06,
      "loss": 0.0072,
      "num_input_tokens_seen": 2931440,
      "step": 443
    },
    {
      "epoch": 2.8576025744167337,
      "grad_norm": 2.0469212532043457,
      "learning_rate": 1.9752007262770857e-06,
      "loss": 0.0058,
      "num_input_tokens_seen": 2938304,
      "step": 444
    },
    {
      "epoch": 2.8640386162510056,
      "grad_norm": 1.6995850801467896,
      "learning_rate": 1.965167291983757e-06,
      "loss": 0.0242,
      "num_input_tokens_seen": 2945168,
      "step": 445
    },
    {
      "epoch": 2.8704746580852776,
      "grad_norm": 5.4955735206604,
      "learning_rate": 1.955142877434681e-06,
      "loss": 0.0323,
      "num_input_tokens_seen": 2951952,
      "step": 446
    },
    {
      "epoch": 2.8769106999195495,
      "grad_norm": 1.5203238725662231,
      "learning_rate": 1.9451276516876856e-06,
      "loss": 0.0186,
      "num_input_tokens_seen": 2958432,
      "step": 447
    },
    {
      "epoch": 2.8833467417538214,
      "grad_norm": 1.398633599281311,
      "learning_rate": 1.9351217836456316e-06,
      "loss": 0.0071,
      "num_input_tokens_seen": 2965328,
      "step": 448
    },
    {
      "epoch": 2.8897827835880934,
      "grad_norm": 1.4775344133377075,
      "learning_rate": 1.9251254420535665e-06,
      "loss": 0.0177,
      "num_input_tokens_seen": 2971872,
      "step": 449
    },
    {
      "epoch": 2.8962188254223653,
      "grad_norm": 3.7046666145324707,
      "learning_rate": 1.9151387954958792e-06,
      "loss": 0.044,
      "num_input_tokens_seen": 2978784,
      "step": 450
    },
    {
      "epoch": 2.9026548672566372,
      "grad_norm": 1.9969475269317627,
      "learning_rate": 1.9051620123934538e-06,
      "loss": 0.0119,
      "num_input_tokens_seen": 2985760,
      "step": 451
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 1.3861935138702393,
      "learning_rate": 1.895195261000831e-06,
      "loss": 0.0121,
      "num_input_tokens_seen": 2992352,
      "step": 452
    },
    {
      "epoch": 2.915526950925181,
      "grad_norm": 2.0632236003875732,
      "learning_rate": 1.885238709403372e-06,
      "loss": 0.0319,
      "num_input_tokens_seen": 2998800,
      "step": 453
    },
    {
      "epoch": 2.921962992759453,
      "grad_norm": 0.31324344873428345,
      "learning_rate": 1.8752925255144228e-06,
      "loss": 0.0024,
      "num_input_tokens_seen": 3005392,
      "step": 454
    },
    {
      "epoch": 2.928399034593725,
      "grad_norm": 1.0096696615219116,
      "learning_rate": 1.8653568770724805e-06,
      "loss": 0.0102,
      "num_input_tokens_seen": 3012016,
      "step": 455
    },
    {
      "epoch": 2.934835076427997,
      "grad_norm": 4.725823879241943,
      "learning_rate": 1.8554319316383657e-06,
      "loss": 0.0419,
      "num_input_tokens_seen": 3018768,
      "step": 456
    },
    {
      "epoch": 2.941271118262269,
      "grad_norm": 1.6467297077178955,
      "learning_rate": 1.8455178565923993e-06,
      "loss": 0.0109,
      "num_input_tokens_seen": 3025328,
      "step": 457
    },
    {
      "epoch": 2.9477071600965408,
      "grad_norm": 1.3065979480743408,
      "learning_rate": 1.8356148191315753e-06,
      "loss": 0.0092,
      "num_input_tokens_seen": 3032080,
      "step": 458
    },
    {
      "epoch": 2.9541432019308127,
      "grad_norm": 2.6485443115234375,
      "learning_rate": 1.8257229862667437e-06,
      "loss": 0.0449,
      "num_input_tokens_seen": 3038880,
      "step": 459
    },
    {
      "epoch": 2.9605792437650846,
      "grad_norm": 0.9736925363540649,
      "learning_rate": 1.8158425248197931e-06,
      "loss": 0.014,
      "num_input_tokens_seen": 3045552,
      "step": 460
    },
    {
      "epoch": 2.9670152855993566,
      "grad_norm": 0.423833429813385,
      "learning_rate": 1.8059736014208388e-06,
      "loss": 0.0035,
      "num_input_tokens_seen": 3052288,
      "step": 461
    },
    {
      "epoch": 2.9734513274336285,
      "grad_norm": 3.7729272842407227,
      "learning_rate": 1.7961163825054101e-06,
      "loss": 0.016,
      "num_input_tokens_seen": 3058768,
      "step": 462
    },
    {
      "epoch": 2.9798873692679004,
      "grad_norm": 2.9312222003936768,
      "learning_rate": 1.7862710343116451e-06,
      "loss": 0.0151,
      "num_input_tokens_seen": 3065584,
      "step": 463
    },
    {
      "epoch": 2.9863234111021724,
      "grad_norm": 0.6318484544754028,
      "learning_rate": 1.7764377228774877e-06,
      "loss": 0.0039,
      "num_input_tokens_seen": 3072368,
      "step": 464
    },
    {
      "epoch": 2.9927594529364443,
      "grad_norm": 5.504857063293457,
      "learning_rate": 1.7666166140378853e-06,
      "loss": 0.0361,
      "num_input_tokens_seen": 3078864,
      "step": 465
    },
    {
      "epoch": 2.9991954947707162,
      "grad_norm": 2.98315167427063,
      "learning_rate": 1.7568078734219934e-06,
      "loss": 0.0609,
      "num_input_tokens_seen": 3085664,
      "step": 466
    },
    {
      "epoch": 3.0056315366049877,
      "grad_norm": 0.24189546704292297,
      "learning_rate": 1.747011666450384e-06,
      "loss": 0.0027,
      "num_input_tokens_seen": 3091568,
      "step": 467
    },
    {
      "epoch": 3.0120675784392597,
      "grad_norm": 3.122098922729492,
      "learning_rate": 1.737228158332252e-06,
      "loss": 0.0097,
      "num_input_tokens_seen": 3098544,
      "step": 468
    },
    {
      "epoch": 3.0185036202735316,
      "grad_norm": 2.117048740386963,
      "learning_rate": 1.7274575140626318e-06,
      "loss": 0.0091,
      "num_input_tokens_seen": 3105120,
      "step": 469
    },
    {
      "epoch": 3.0249396621078035,
      "grad_norm": 0.3818783760070801,
      "learning_rate": 1.7176998984196148e-06,
      "loss": 0.0026,
      "num_input_tokens_seen": 3111552,
      "step": 470
    },
    {
      "epoch": 3.0313757039420755,
      "grad_norm": 3.4925177097320557,
      "learning_rate": 1.7079554759615685e-06,
      "loss": 0.0311,
      "num_input_tokens_seen": 3118192,
      "step": 471
    },
    {
      "epoch": 3.0378117457763474,
      "grad_norm": 0.1732572317123413,
      "learning_rate": 1.6982244110243626e-06,
      "loss": 0.0014,
      "num_input_tokens_seen": 3124640,
      "step": 472
    },
    {
      "epoch": 3.0442477876106193,
      "grad_norm": 1.305844783782959,
      "learning_rate": 1.6885068677185989e-06,
      "loss": 0.0185,
      "num_input_tokens_seen": 3130992,
      "step": 473
    },
    {
      "epoch": 3.0506838294448912,
      "grad_norm": 0.9071294665336609,
      "learning_rate": 1.678803009926841e-06,
      "loss": 0.0075,
      "num_input_tokens_seen": 3137696,
      "step": 474
    },
    {
      "epoch": 3.057119871279163,
      "grad_norm": 0.9389513731002808,
      "learning_rate": 1.6691130013008514e-06,
      "loss": 0.0069,
      "num_input_tokens_seen": 3144560,
      "step": 475
    },
    {
      "epoch": 3.063555913113435,
      "grad_norm": 0.15343110263347626,
      "learning_rate": 1.6594370052588328e-06,
      "loss": 0.0009,
      "num_input_tokens_seen": 3151072,
      "step": 476
    },
    {
      "epoch": 3.069991954947707,
      "grad_norm": 0.5078912973403931,
      "learning_rate": 1.6497751849826692e-06,
      "loss": 0.0015,
      "num_input_tokens_seen": 3158016,
      "step": 477
    },
    {
      "epoch": 3.076427996781979,
      "grad_norm": 0.14821191132068634,
      "learning_rate": 1.6401277034151798e-06,
      "loss": 0.0007,
      "num_input_tokens_seen": 3164560,
      "step": 478
    },
    {
      "epoch": 3.082864038616251,
      "grad_norm": 0.3397853672504425,
      "learning_rate": 1.630494723257363e-06,
      "loss": 0.0012,
      "num_input_tokens_seen": 3171088,
      "step": 479
    },
    {
      "epoch": 3.089300080450523,
      "grad_norm": 0.25013279914855957,
      "learning_rate": 1.620876406965658e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 3177952,
      "step": 480
    },
    {
      "epoch": 3.0957361222847948,
      "grad_norm": 0.04799158126115799,
      "learning_rate": 1.611272916749205e-06,
      "loss": 0.0003,
      "num_input_tokens_seen": 3184592,
      "step": 481
    },
    {
      "epoch": 3.1021721641190667,
      "grad_norm": 2.0195066928863525,
      "learning_rate": 1.6016844145671062e-06,
      "loss": 0.0044,
      "num_input_tokens_seen": 3190896,
      "step": 482
    },
    {
      "epoch": 3.1086082059533386,
      "grad_norm": 0.6244819164276123,
      "learning_rate": 1.5921110621256972e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 3197376,
      "step": 483
    },
    {
      "epoch": 3.1150442477876106,
      "grad_norm": 2.540050506591797,
      "learning_rate": 1.58255302087582e-06,
      "loss": 0.0059,
      "num_input_tokens_seen": 3203776,
      "step": 484
    },
    {
      "epoch": 3.1214802896218825,
      "grad_norm": 0.7487736344337463,
      "learning_rate": 1.5730104520100984e-06,
      "loss": 0.0036,
      "num_input_tokens_seen": 3210464,
      "step": 485
    },
    {
      "epoch": 3.1279163314561544,
      "grad_norm": 0.052535440772771835,
      "learning_rate": 1.56348351646022e-06,
      "loss": 0.0002,
      "num_input_tokens_seen": 3217056,
      "step": 486
    },
    {
      "epoch": 3.1343523732904264,
      "grad_norm": 2.5393643379211426,
      "learning_rate": 1.5539723748942246e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 3223840,
      "step": 487
    },
    {
      "epoch": 3.1407884151246983,
      "grad_norm": 0.28790536522865295,
      "learning_rate": 1.544477187713791e-06,
      "loss": 0.0009,
      "num_input_tokens_seen": 3230592,
      "step": 488
    },
    {
      "epoch": 3.1472244569589702,
      "grad_norm": 2.5697410106658936,
      "learning_rate": 1.534998115051533e-06,
      "loss": 0.0318,
      "num_input_tokens_seen": 3237216,
      "step": 489
    },
    {
      "epoch": 3.153660498793242,
      "grad_norm": 1.5203006267547607,
      "learning_rate": 1.5255353167683017e-06,
      "loss": 0.0216,
      "num_input_tokens_seen": 3243920,
      "step": 490
    },
    {
      "epoch": 3.160096540627514,
      "grad_norm": 0.1484091877937317,
      "learning_rate": 1.5160889524504857e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3250656,
      "step": 491
    },
    {
      "epoch": 3.166532582461786,
      "grad_norm": 3.3526744842529297,
      "learning_rate": 1.50665918140732e-06,
      "loss": 0.0286,
      "num_input_tokens_seen": 3257312,
      "step": 492
    },
    {
      "epoch": 3.172968624296058,
      "grad_norm": 1.3879235982894897,
      "learning_rate": 1.4972461626682033e-06,
      "loss": 0.0254,
      "num_input_tokens_seen": 3264112,
      "step": 493
    },
    {
      "epoch": 3.17940466613033,
      "grad_norm": 1.1939952373504639,
      "learning_rate": 1.4878500549800115e-06,
      "loss": 0.0039,
      "num_input_tokens_seen": 3270528,
      "step": 494
    },
    {
      "epoch": 3.185840707964602,
      "grad_norm": 0.20248474180698395,
      "learning_rate": 1.4784710168044215e-06,
      "loss": 0.0005,
      "num_input_tokens_seen": 3277008,
      "step": 495
    },
    {
      "epoch": 3.1922767497988738,
      "grad_norm": 1.903956413269043,
      "learning_rate": 1.4691092063152417e-06,
      "loss": 0.0196,
      "num_input_tokens_seen": 3283376,
      "step": 496
    },
    {
      "epoch": 3.1987127916331457,
      "grad_norm": 0.3746008276939392,
      "learning_rate": 1.459764781395741e-06,
      "loss": 0.0015,
      "num_input_tokens_seen": 3289664,
      "step": 497
    },
    {
      "epoch": 3.2051488334674176,
      "grad_norm": 4.635190486907959,
      "learning_rate": 1.4504378996359867e-06,
      "loss": 0.0088,
      "num_input_tokens_seen": 3296576,
      "step": 498
    },
    {
      "epoch": 3.2115848753016896,
      "grad_norm": 1.4451507329940796,
      "learning_rate": 1.4411287183301902e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 3303120,
      "step": 499
    },
    {
      "epoch": 3.2180209171359615,
      "grad_norm": 1.252470850944519,
      "learning_rate": 1.4318373944740485e-06,
      "loss": 0.0071,
      "num_input_tokens_seen": 3310384,
      "step": 500
    },
    {
      "epoch": 3.2244569589702334,
      "grad_norm": 0.6509237289428711,
      "learning_rate": 1.4225640847621006e-06,
      "loss": 0.0006,
      "num_input_tokens_seen": 3316768,
      "step": 501
    },
    {
      "epoch": 3.2308930008045054,
      "grad_norm": 0.2248382717370987,
      "learning_rate": 1.4133089455850878e-06,
      "loss": 0.0011,
      "num_input_tokens_seen": 3323488,
      "step": 502
    },
    {
      "epoch": 3.2373290426387773,
      "grad_norm": 1.0306220054626465,
      "learning_rate": 1.4040721330273063e-06,
      "loss": 0.0057,
      "num_input_tokens_seen": 3330000,
      "step": 503
    },
    {
      "epoch": 3.2437650844730492,
      "grad_norm": 0.1734343320131302,
      "learning_rate": 1.3948538028639851e-06,
      "loss": 0.0006,
      "num_input_tokens_seen": 3336592,
      "step": 504
    },
    {
      "epoch": 3.250201126307321,
      "grad_norm": 0.9872696399688721,
      "learning_rate": 1.3856541105586545e-06,
      "loss": 0.0066,
      "num_input_tokens_seen": 3343136,
      "step": 505
    },
    {
      "epoch": 3.256637168141593,
      "grad_norm": 0.8048367500305176,
      "learning_rate": 1.3764732112605223e-06,
      "loss": 0.0079,
      "num_input_tokens_seen": 3349680,
      "step": 506
    },
    {
      "epoch": 3.263073209975865,
      "grad_norm": 1.8275296688079834,
      "learning_rate": 1.367311259801863e-06,
      "loss": 0.0215,
      "num_input_tokens_seen": 3356304,
      "step": 507
    },
    {
      "epoch": 3.2695092518101365,
      "grad_norm": 1.90727698802948,
      "learning_rate": 1.3581684106953987e-06,
      "loss": 0.0031,
      "num_input_tokens_seen": 3363008,
      "step": 508
    },
    {
      "epoch": 3.2759452936444085,
      "grad_norm": 2.614037275314331,
      "learning_rate": 1.3490448181317025e-06,
      "loss": 0.0024,
      "num_input_tokens_seen": 3369728,
      "step": 509
    },
    {
      "epoch": 3.2823813354786804,
      "grad_norm": 1.9239071607589722,
      "learning_rate": 1.3399406359765921e-06,
      "loss": 0.0094,
      "num_input_tokens_seen": 3375968,
      "step": 510
    },
    {
      "epoch": 3.2888173773129523,
      "grad_norm": 1.1601731777191162,
      "learning_rate": 1.3308560177685334e-06,
      "loss": 0.0054,
      "num_input_tokens_seen": 3383024,
      "step": 511
    },
    {
      "epoch": 3.2952534191472242,
      "grad_norm": 0.31424281001091003,
      "learning_rate": 1.3217911167160575e-06,
      "loss": 0.0008,
      "num_input_tokens_seen": 3389488,
      "step": 512
    },
    {
      "epoch": 3.301689460981496,
      "grad_norm": 2.633910655975342,
      "learning_rate": 1.3127460856951724e-06,
      "loss": 0.0053,
      "num_input_tokens_seen": 3395712,
      "step": 513
    },
    {
      "epoch": 3.308125502815768,
      "grad_norm": 0.9618326425552368,
      "learning_rate": 1.303721077246784e-06,
      "loss": 0.006,
      "num_input_tokens_seen": 3402384,
      "step": 514
    },
    {
      "epoch": 3.31456154465004,
      "grad_norm": 0.22136647999286652,
      "learning_rate": 1.2947162435741278e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3409136,
      "step": 515
    },
    {
      "epoch": 3.320997586484312,
      "grad_norm": 1.880077838897705,
      "learning_rate": 1.2857317365401997e-06,
      "loss": 0.0135,
      "num_input_tokens_seen": 3415776,
      "step": 516
    },
    {
      "epoch": 3.327433628318584,
      "grad_norm": 2.234178304672241,
      "learning_rate": 1.2767677076651913e-06,
      "loss": 0.0083,
      "num_input_tokens_seen": 3422496,
      "step": 517
    },
    {
      "epoch": 3.333869670152856,
      "grad_norm": 0.21132518351078033,
      "learning_rate": 1.2678243081239421e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3429312,
      "step": 518
    },
    {
      "epoch": 3.340305711987128,
      "grad_norm": 1.0334022045135498,
      "learning_rate": 1.2589016887433846e-06,
      "loss": 0.0038,
      "num_input_tokens_seen": 3435840,
      "step": 519
    },
    {
      "epoch": 3.3467417538213997,
      "grad_norm": 2.751037359237671,
      "learning_rate": 1.2500000000000007e-06,
      "loss": 0.0269,
      "num_input_tokens_seen": 3442176,
      "step": 520
    },
    {
      "epoch": 3.3531777956556716,
      "grad_norm": 0.7970973253250122,
      "learning_rate": 1.2411193920172866e-06,
      "loss": 0.0062,
      "num_input_tokens_seen": 3448784,
      "step": 521
    },
    {
      "epoch": 3.3596138374899436,
      "grad_norm": 0.09952107071876526,
      "learning_rate": 1.2322600145632204e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3455184,
      "step": 522
    },
    {
      "epoch": 3.3660498793242155,
      "grad_norm": 0.6218022108078003,
      "learning_rate": 1.2234220170477332e-06,
      "loss": 0.0011,
      "num_input_tokens_seen": 3461792,
      "step": 523
    },
    {
      "epoch": 3.3724859211584874,
      "grad_norm": 1.4417766332626343,
      "learning_rate": 1.2146055485201943e-06,
      "loss": 0.0026,
      "num_input_tokens_seen": 3468624,
      "step": 524
    },
    {
      "epoch": 3.3789219629927594,
      "grad_norm": 2.819247245788574,
      "learning_rate": 1.205810757666894e-06,
      "loss": 0.0183,
      "num_input_tokens_seen": 3474976,
      "step": 525
    },
    {
      "epoch": 3.3853580048270313,
      "grad_norm": 1.7066518068313599,
      "learning_rate": 1.1970377928085372e-06,
      "loss": 0.0079,
      "num_input_tokens_seen": 3481360,
      "step": 526
    },
    {
      "epoch": 3.3917940466613032,
      "grad_norm": 2.671914577484131,
      "learning_rate": 1.188286801897743e-06,
      "loss": 0.0123,
      "num_input_tokens_seen": 3487904,
      "step": 527
    },
    {
      "epoch": 3.398230088495575,
      "grad_norm": 0.37451621890068054,
      "learning_rate": 1.1795579325165448e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 3494368,
      "step": 528
    },
    {
      "epoch": 3.404666130329847,
      "grad_norm": 0.3565497398376465,
      "learning_rate": 1.1708513318739096e-06,
      "loss": 0.0014,
      "num_input_tokens_seen": 3500704,
      "step": 529
    },
    {
      "epoch": 3.411102172164119,
      "grad_norm": 0.22408631443977356,
      "learning_rate": 1.1621671468032495e-06,
      "loss": 0.0009,
      "num_input_tokens_seen": 3507216,
      "step": 530
    },
    {
      "epoch": 3.417538213998391,
      "grad_norm": 1.3339484930038452,
      "learning_rate": 1.153505523759944e-06,
      "loss": 0.0078,
      "num_input_tokens_seen": 3513664,
      "step": 531
    },
    {
      "epoch": 3.423974255832663,
      "grad_norm": 0.6494855880737305,
      "learning_rate": 1.1448666088188766e-06,
      "loss": 0.0027,
      "num_input_tokens_seen": 3520096,
      "step": 532
    },
    {
      "epoch": 3.430410297666935,
      "grad_norm": 0.10890411585569382,
      "learning_rate": 1.1362505476719662e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3526560,
      "step": 533
    },
    {
      "epoch": 3.4368463395012068,
      "grad_norm": 13.174049377441406,
      "learning_rate": 1.1276574856257097e-06,
      "loss": 0.0064,
      "num_input_tokens_seen": 3533536,
      "step": 534
    },
    {
      "epoch": 3.4432823813354787,
      "grad_norm": 2.0001068115234375,
      "learning_rate": 1.1190875675987355e-06,
      "loss": 0.007,
      "num_input_tokens_seen": 3540288,
      "step": 535
    },
    {
      "epoch": 3.4497184231697506,
      "grad_norm": 0.15650025010108948,
      "learning_rate": 1.1105409381193572e-06,
      "loss": 0.0005,
      "num_input_tokens_seen": 3546720,
      "step": 536
    },
    {
      "epoch": 3.4561544650040226,
      "grad_norm": 0.13460475206375122,
      "learning_rate": 1.1020177413231334e-06,
      "loss": 0.0004,
      "num_input_tokens_seen": 3553280,
      "step": 537
    },
    {
      "epoch": 3.4625905068382945,
      "grad_norm": 2.165956735610962,
      "learning_rate": 1.0935181209504422e-06,
      "loss": 0.0294,
      "num_input_tokens_seen": 3559776,
      "step": 538
    },
    {
      "epoch": 3.4690265486725664,
      "grad_norm": 0.6856318712234497,
      "learning_rate": 1.0850422203440555e-06,
      "loss": 0.0036,
      "num_input_tokens_seen": 3566848,
      "step": 539
    },
    {
      "epoch": 3.4754625905068384,
      "grad_norm": 1.306766152381897,
      "learning_rate": 1.0765901824467167e-06,
      "loss": 0.0051,
      "num_input_tokens_seen": 3573280,
      "step": 540
    },
    {
      "epoch": 3.4818986323411103,
      "grad_norm": 0.3889179825782776,
      "learning_rate": 1.068162149798737e-06,
      "loss": 0.0012,
      "num_input_tokens_seen": 3579712,
      "step": 541
    },
    {
      "epoch": 3.4883346741753822,
      "grad_norm": 1.5245965719223022,
      "learning_rate": 1.0597582645355891e-06,
      "loss": 0.0231,
      "num_input_tokens_seen": 3586480,
      "step": 542
    },
    {
      "epoch": 3.494770716009654,
      "grad_norm": 0.6708037257194519,
      "learning_rate": 1.0513786683855062e-06,
      "loss": 0.0041,
      "num_input_tokens_seen": 3593136,
      "step": 543
    },
    {
      "epoch": 3.501206757843926,
      "grad_norm": 2.0138630867004395,
      "learning_rate": 1.0430235026670979e-06,
      "loss": 0.0124,
      "num_input_tokens_seen": 3599968,
      "step": 544
    },
    {
      "epoch": 3.507642799678198,
      "grad_norm": 7.274059295654297,
      "learning_rate": 1.034692908286964e-06,
      "loss": 0.0171,
      "num_input_tokens_seen": 3606592,
      "step": 545
    },
    {
      "epoch": 3.51407884151247,
      "grad_norm": 5.609940052032471,
      "learning_rate": 1.0263870257373162e-06,
      "loss": 0.008,
      "num_input_tokens_seen": 3613072,
      "step": 546
    },
    {
      "epoch": 3.520514883346742,
      "grad_norm": 1.4191588163375854,
      "learning_rate": 1.0181059950936131e-06,
      "loss": 0.0035,
      "num_input_tokens_seen": 3619696,
      "step": 547
    },
    {
      "epoch": 3.526950925181014,
      "grad_norm": 0.1580982804298401,
      "learning_rate": 1.0098499560121943e-06,
      "loss": 0.0006,
      "num_input_tokens_seen": 3626240,
      "step": 548
    },
    {
      "epoch": 3.5333869670152858,
      "grad_norm": 0.637765109539032,
      "learning_rate": 1.0016190477279274e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 3632704,
      "step": 549
    },
    {
      "epoch": 3.5398230088495577,
      "grad_norm": 0.07971790432929993,
      "learning_rate": 9.934134090518593e-07,
      "loss": 0.0003,
      "num_input_tokens_seen": 3639360,
      "step": 550
    },
    {
      "epoch": 3.5462590506838296,
      "grad_norm": 0.15312433242797852,
      "learning_rate": 9.852331783688722e-07,
      "loss": 0.0004,
      "num_input_tokens_seen": 3646112,
      "step": 551
    },
    {
      "epoch": 3.5526950925181016,
      "grad_norm": 0.5353730916976929,
      "learning_rate": 9.770784936353555e-07,
      "loss": 0.0016,
      "num_input_tokens_seen": 3652704,
      "step": 552
    },
    {
      "epoch": 3.5591311343523735,
      "grad_norm": 0.3197666108608246,
      "learning_rate": 9.689494923768756e-07,
      "loss": 0.0012,
      "num_input_tokens_seen": 3659696,
      "step": 553
    },
    {
      "epoch": 3.5655671761866454,
      "grad_norm": 1.4529962539672852,
      "learning_rate": 9.608463116858544e-07,
      "loss": 0.0057,
      "num_input_tokens_seen": 3666288,
      "step": 554
    },
    {
      "epoch": 3.5720032180209174,
      "grad_norm": 2.7501587867736816,
      "learning_rate": 9.527690882192636e-07,
      "loss": 0.0168,
      "num_input_tokens_seen": 3673104,
      "step": 555
    },
    {
      "epoch": 3.5784392598551893,
      "grad_norm": 0.21036742627620697,
      "learning_rate": 9.447179581963156e-07,
      "loss": 0.0012,
      "num_input_tokens_seen": 3679872,
      "step": 556
    },
    {
      "epoch": 3.5848753016894612,
      "grad_norm": 0.03335335850715637,
      "learning_rate": 9.366930573961649e-07,
      "loss": 0.0002,
      "num_input_tokens_seen": 3686288,
      "step": 557
    },
    {
      "epoch": 3.591311343523733,
      "grad_norm": 1.3189131021499634,
      "learning_rate": 9.286945211556231e-07,
      "loss": 0.0025,
      "num_input_tokens_seen": 3692976,
      "step": 558
    },
    {
      "epoch": 3.597747385358005,
      "grad_norm": 1.1787400245666504,
      "learning_rate": 9.207224843668733e-07,
      "loss": 0.0194,
      "num_input_tokens_seen": 3699312,
      "step": 559
    },
    {
      "epoch": 3.604183427192277,
      "grad_norm": 0.9992094039916992,
      "learning_rate": 9.127770814751933e-07,
      "loss": 0.0055,
      "num_input_tokens_seen": 3705888,
      "step": 560
    },
    {
      "epoch": 3.6106194690265485,
      "grad_norm": 2.264843702316284,
      "learning_rate": 9.048584464766938e-07,
      "loss": 0.0215,
      "num_input_tokens_seen": 3712688,
      "step": 561
    },
    {
      "epoch": 3.6170555108608204,
      "grad_norm": 0.06527237594127655,
      "learning_rate": 8.969667129160547e-07,
      "loss": 0.0003,
      "num_input_tokens_seen": 3719168,
      "step": 562
    },
    {
      "epoch": 3.6234915526950924,
      "grad_norm": 3.79392409324646,
      "learning_rate": 8.891020138842718e-07,
      "loss": 0.0242,
      "num_input_tokens_seen": 3726048,
      "step": 563
    },
    {
      "epoch": 3.6299275945293643,
      "grad_norm": 0.9232211112976074,
      "learning_rate": 8.81264482016416e-07,
      "loss": 0.0206,
      "num_input_tokens_seen": 3732672,
      "step": 564
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.5276843309402466,
      "learning_rate": 8.734542494893955e-07,
      "loss": 0.0024,
      "num_input_tokens_seen": 3739456,
      "step": 565
    },
    {
      "epoch": 3.642799678197908,
      "grad_norm": 1.1676807403564453,
      "learning_rate": 8.65671448019722e-07,
      "loss": 0.0087,
      "num_input_tokens_seen": 3746160,
      "step": 566
    },
    {
      "epoch": 3.64923572003218,
      "grad_norm": 1.3703765869140625,
      "learning_rate": 8.579162088612974e-07,
      "loss": 0.0089,
      "num_input_tokens_seen": 3752560,
      "step": 567
    },
    {
      "epoch": 3.655671761866452,
      "grad_norm": 0.06538532674312592,
      "learning_rate": 8.501886628031941e-07,
      "loss": 0.0003,
      "num_input_tokens_seen": 3759600,
      "step": 568
    },
    {
      "epoch": 3.662107803700724,
      "grad_norm": 0.0386020764708519,
      "learning_rate": 8.424889401674505e-07,
      "loss": 0.0002,
      "num_input_tokens_seen": 3766096,
      "step": 569
    },
    {
      "epoch": 3.668543845534996,
      "grad_norm": 0.20554865896701813,
      "learning_rate": 8.348171708068748e-07,
      "loss": 0.0009,
      "num_input_tokens_seen": 3772944,
      "step": 570
    },
    {
      "epoch": 3.674979887369268,
      "grad_norm": 0.9973205327987671,
      "learning_rate": 8.271734841028553e-07,
      "loss": 0.0154,
      "num_input_tokens_seen": 3779664,
      "step": 571
    },
    {
      "epoch": 3.6814159292035398,
      "grad_norm": 0.30160781741142273,
      "learning_rate": 8.195580089631733e-07,
      "loss": 0.0012,
      "num_input_tokens_seen": 3786080,
      "step": 572
    },
    {
      "epoch": 3.6878519710378117,
      "grad_norm": 0.49049124121665955,
      "learning_rate": 8.119708738198395e-07,
      "loss": 0.0008,
      "num_input_tokens_seen": 3792768,
      "step": 573
    },
    {
      "epoch": 3.6942880128720836,
      "grad_norm": 1.6590077877044678,
      "learning_rate": 8.04412206626915e-07,
      "loss": 0.0081,
      "num_input_tokens_seen": 3799472,
      "step": 574
    },
    {
      "epoch": 3.7007240547063556,
      "grad_norm": 1.814943552017212,
      "learning_rate": 7.968821348583644e-07,
      "loss": 0.008,
      "num_input_tokens_seen": 3805984,
      "step": 575
    },
    {
      "epoch": 3.7071600965406275,
      "grad_norm": 1.6639471054077148,
      "learning_rate": 7.89380785505901e-07,
      "loss": 0.0073,
      "num_input_tokens_seen": 3813088,
      "step": 576
    },
    {
      "epoch": 3.7135961383748994,
      "grad_norm": 0.946050763130188,
      "learning_rate": 7.819082850768433e-07,
      "loss": 0.0062,
      "num_input_tokens_seen": 3820208,
      "step": 577
    },
    {
      "epoch": 3.7200321802091714,
      "grad_norm": 0.2189425230026245,
      "learning_rate": 7.744647595919869e-07,
      "loss": 0.0015,
      "num_input_tokens_seen": 3826800,
      "step": 578
    },
    {
      "epoch": 3.7264682220434433,
      "grad_norm": 2.796231985092163,
      "learning_rate": 7.670503345834757e-07,
      "loss": 0.0268,
      "num_input_tokens_seen": 3833344,
      "step": 579
    },
    {
      "epoch": 3.7329042638777152,
      "grad_norm": 0.13711552321910858,
      "learning_rate": 7.596651350926837e-07,
      "loss": 0.0007,
      "num_input_tokens_seen": 3839920,
      "step": 580
    },
    {
      "epoch": 3.739340305711987,
      "grad_norm": 0.3616367280483246,
      "learning_rate": 7.523092856681099e-07,
      "loss": 0.0016,
      "num_input_tokens_seen": 3846432,
      "step": 581
    },
    {
      "epoch": 3.745776347546259,
      "grad_norm": 2.3357245922088623,
      "learning_rate": 7.44982910363276e-07,
      "loss": 0.0631,
      "num_input_tokens_seen": 3853216,
      "step": 582
    },
    {
      "epoch": 3.752212389380531,
      "grad_norm": 1.7805283069610596,
      "learning_rate": 7.376861327346325e-07,
      "loss": 0.0128,
      "num_input_tokens_seen": 3859664,
      "step": 583
    },
    {
      "epoch": 3.758648431214803,
      "grad_norm": 0.5933414101600647,
      "learning_rate": 7.304190758394775e-07,
      "loss": 0.0034,
      "num_input_tokens_seen": 3866208,
      "step": 584
    },
    {
      "epoch": 3.765084473049075,
      "grad_norm": 3.1310431957244873,
      "learning_rate": 7.231818622338824e-07,
      "loss": 0.0633,
      "num_input_tokens_seen": 3872736,
      "step": 585
    },
    {
      "epoch": 3.771520514883347,
      "grad_norm": 0.2022082656621933,
      "learning_rate": 7.159746139706194e-07,
      "loss": 0.0007,
      "num_input_tokens_seen": 3879264,
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.7779565567176188, |
|
"grad_norm": 6.451120376586914, |
|
"learning_rate": 7.087974525971103e-07, |
|
"loss": 0.0211, |
|
"num_input_tokens_seen": 3885744, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.7843925985518907, |
|
"grad_norm": 0.8931072354316711, |
|
"learning_rate": 7.016504991533727e-07, |
|
"loss": 0.009, |
|
"num_input_tokens_seen": 3892304, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.7908286403861626, |
|
"grad_norm": 1.4347479343414307, |
|
"learning_rate": 6.94533874169977e-07, |
|
"loss": 0.0152, |
|
"num_input_tokens_seen": 3898768, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.7972646822204346, |
|
"grad_norm": 0.5323463678359985, |
|
"learning_rate": 6.874476976660185e-07, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 3904976, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.8037007240547065, |
|
"grad_norm": 1.120011806488037, |
|
"learning_rate": 6.803920891470905e-07, |
|
"loss": 0.014, |
|
"num_input_tokens_seen": 3911360, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.8101367658889784, |
|
"grad_norm": 0.6292040348052979, |
|
"learning_rate": 6.733671676032674e-07, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 3918224, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.8165728077232504, |
|
"grad_norm": 3.3647360801696777, |
|
"learning_rate": 6.663730515071019e-07, |
|
"loss": 0.0161, |
|
"num_input_tokens_seen": 3924960, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.823008849557522, |
|
"grad_norm": 1.8465656042099, |
|
"learning_rate": 6.594098588116243e-07, |
|
"loss": 0.0234, |
|
"num_input_tokens_seen": 3931712, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.829444891391794, |
|
"grad_norm": 1.0739251375198364, |
|
"learning_rate": 6.524777069483526e-07, |
|
"loss": 0.0186, |
|
"num_input_tokens_seen": 3938304, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.8358809332260657, |
|
"grad_norm": 3.146777629852295, |
|
"learning_rate": 6.455767128253148e-07, |
|
"loss": 0.0199, |
|
"num_input_tokens_seen": 3945200, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.8423169750603376, |
|
"grad_norm": 1.1694271564483643, |
|
"learning_rate": 6.38706992825075e-07, |
|
"loss": 0.0052, |
|
"num_input_tokens_seen": 3951808, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.8487530168946096, |
|
"grad_norm": 1.1547743082046509, |
|
"learning_rate": 6.318686628027723e-07, |
|
"loss": 0.0165, |
|
"num_input_tokens_seen": 3958480, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.8551890587288815, |
|
"grad_norm": 1.1595410108566284, |
|
"learning_rate": 6.250618380841661e-07, |
|
"loss": 0.01, |
|
"num_input_tokens_seen": 3965072, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.8616251005631534, |
|
"grad_norm": 0.8452915549278259, |
|
"learning_rate": 6.182866334636889e-07, |
|
"loss": 0.0047, |
|
"num_input_tokens_seen": 3971808, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.8680611423974254, |
|
"grad_norm": 2.201892375946045, |
|
"learning_rate": 6.115431632025154e-07, |
|
"loss": 0.0039, |
|
"num_input_tokens_seen": 3978480, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.8744971842316973, |
|
"grad_norm": 0.24013373255729675, |
|
"learning_rate": 6.048315410266326e-07, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 3985216, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.8809332260659692, |
|
"grad_norm": 0.442757248878479, |
|
"learning_rate": 5.981518801249192e-07, |
|
"loss": 0.0042, |
|
"num_input_tokens_seen": 3991792, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.887369267900241, |
|
"grad_norm": 2.5312795639038086, |
|
"learning_rate": 5.915042931472426e-07, |
|
"loss": 0.0076, |
|
"num_input_tokens_seen": 3998224, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.893805309734513, |
|
"grad_norm": 0.3599741756916046, |
|
"learning_rate": 5.848888922025553e-07, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 4004960, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.900241351568785, |
|
"grad_norm": 0.33045250177383423, |
|
"learning_rate": 5.783057888570034e-07, |
|
"loss": 0.0014, |
|
"num_input_tokens_seen": 4011984, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.906677393403057, |
|
"grad_norm": 0.540598452091217, |
|
"learning_rate": 5.717550941320482e-07, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 4018912, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.913113435237329, |
|
"grad_norm": 0.4901201128959656, |
|
"learning_rate": 5.65236918502593e-07, |
|
"loss": 0.0024, |
|
"num_input_tokens_seen": 4025504, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.919549477071601, |
|
"grad_norm": 0.23451536893844604, |
|
"learning_rate": 5.587513718951165e-07, |
|
"loss": 0.0013, |
|
"num_input_tokens_seen": 4031776, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.9259855189058728, |
|
"grad_norm": 0.9038437604904175, |
|
"learning_rate": 5.522985636858238e-07, |
|
"loss": 0.0064, |
|
"num_input_tokens_seen": 4038208, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.9324215607401447, |
|
"grad_norm": 1.4877148866653442, |
|
"learning_rate": 5.458786026988005e-07, |
|
"loss": 0.0084, |
|
"num_input_tokens_seen": 4044928, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.9388576025744166, |
|
"grad_norm": 0.12848466634750366, |
|
"learning_rate": 5.394915972041739e-07, |
|
"loss": 0.0009, |
|
"num_input_tokens_seen": 4051552, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.9452936444086886, |
|
"grad_norm": 0.22914128005504608, |
|
"learning_rate": 5.33137654916292e-07, |
|
"loss": 0.001, |
|
"num_input_tokens_seen": 4058304, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.9517296862429605, |
|
"grad_norm": 0.7593125700950623, |
|
"learning_rate": 5.268168829919046e-07, |
|
"loss": 0.0064, |
|
"num_input_tokens_seen": 4064720, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.9581657280772324, |
|
"grad_norm": 0.6085631251335144, |
|
"learning_rate": 5.205293880283552e-07, |
|
"loss": 0.0033, |
|
"num_input_tokens_seen": 4071216, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.9646017699115044, |
|
"grad_norm": 0.9351167678833008, |
|
"learning_rate": 5.14275276061785e-07, |
|
"loss": 0.0065, |
|
"num_input_tokens_seen": 4077904, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.9710378117457763, |
|
"grad_norm": 2.1718461513519287, |
|
"learning_rate": 5.080546525653448e-07, |
|
"loss": 0.0272, |
|
"num_input_tokens_seen": 4084656, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.9774738535800482, |
|
"grad_norm": 0.39000532031059265, |
|
"learning_rate": 5.018676224474139e-07, |
|
"loss": 0.0015, |
|
"num_input_tokens_seen": 4091584, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.98390989541432, |
|
"grad_norm": 0.5723803639411926, |
|
"learning_rate": 4.957142900498335e-07, |
|
"loss": 0.0013, |
|
"num_input_tokens_seen": 4098768, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.990345937248592, |
|
"grad_norm": 1.5266039371490479, |
|
"learning_rate": 4.895947591461456e-07, |
|
"loss": 0.0148, |
|
"num_input_tokens_seen": 4105312, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.996781979082864, |
|
"grad_norm": 0.7928001880645752, |
|
"learning_rate": 4.835091329398436e-07, |
|
"loss": 0.0063, |
|
"num_input_tokens_seen": 4112000, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 4.003218020917136, |
|
"grad_norm": 0.09017051756381989, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4118624, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 4.009654062751408, |
|
"grad_norm": 0.2493676394224167, |
|
"learning_rate": 4.714400045726919e-07, |
|
"loss": 0.001, |
|
"num_input_tokens_seen": 4125408, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.01609010458568, |
|
"grad_norm": 0.03381378576159477, |
|
"learning_rate": 4.6545670595296686e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4131936, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.022526146419952, |
|
"grad_norm": 2.845327854156494, |
|
"learning_rate": 4.5950771910944603e-07, |
|
"loss": 0.0193, |
|
"num_input_tokens_seen": 4138352, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.028962188254224, |
|
"grad_norm": 0.6973279714584351, |
|
"learning_rate": 4.5359314436946275e-07, |
|
"loss": 0.0049, |
|
"num_input_tokens_seen": 4144672, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.035398230088496, |
|
"grad_norm": 0.3552819788455963, |
|
"learning_rate": 4.4771308148000487e-07, |
|
"loss": 0.0015, |
|
"num_input_tokens_seen": 4151296, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.041834271922768, |
|
"grad_norm": 0.2976234555244446, |
|
"learning_rate": 4.418676296060323e-07, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 4157696, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.0482703137570395, |
|
"grad_norm": 0.6438854932785034, |
|
"learning_rate": 4.3605688732880097e-07, |
|
"loss": 0.0034, |
|
"num_input_tokens_seen": 4164352, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.054706355591311, |
|
"grad_norm": 0.055070556700229645, |
|
"learning_rate": 4.302809526442053e-07, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4170992, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.061142397425583, |
|
"grad_norm": 0.5393857359886169, |
|
"learning_rate": 4.2453992296112384e-07, |
|
"loss": 0.0031, |
|
"num_input_tokens_seen": 4177888, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.067578439259855, |
|
"grad_norm": 0.10041255503892899, |
|
"learning_rate": 4.188338950997728e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4184800, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.074014481094127, |
|
"grad_norm": 0.4824787378311157, |
|
"learning_rate": 4.1316296529007955e-07, |
|
"loss": 0.0027, |
|
"num_input_tokens_seen": 4191136, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.080450522928399, |
|
"grad_norm": 0.8842573761940002, |
|
"learning_rate": 4.075272291700558e-07, |
|
"loss": 0.0047, |
|
"num_input_tokens_seen": 4197984, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.086886564762671, |
|
"grad_norm": 0.0672411248087883, |
|
"learning_rate": 4.019267817841835e-07, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4204688, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.093322606596943, |
|
"grad_norm": 1.144921898841858, |
|
"learning_rate": 3.9636171758181657e-07, |
|
"loss": 0.0204, |
|
"num_input_tokens_seen": 4211360, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.099758648431215, |
|
"grad_norm": 1.0628600120544434, |
|
"learning_rate": 3.908321304155846e-07, |
|
"loss": 0.0043, |
|
"num_input_tokens_seen": 4218000, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.106194690265487, |
|
"grad_norm": 0.03438463807106018, |
|
"learning_rate": 3.853381135398093e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4224544, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.112630732099759, |
|
"grad_norm": 0.22854630649089813, |
|
"learning_rate": 3.798797596089351e-07, |
|
"loss": 0.0009, |
|
"num_input_tokens_seen": 4230992, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.119066773934031, |
|
"grad_norm": 0.12790539860725403, |
|
"learning_rate": 3.7445716067596506e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4237808, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.125502815768303, |
|
"grad_norm": 0.040783047676086426, |
|
"learning_rate": 3.6907040819090604e-07, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4244032, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.131938857602575, |
|
"grad_norm": 0.29912275075912476, |
|
"learning_rate": 3.63719592999231e-07, |
|
"loss": 0.0015, |
|
"num_input_tokens_seen": 4250640, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.1383748994368466, |
|
"grad_norm": 0.1869562268257141, |
|
"learning_rate": 3.5840480534034355e-07, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 4257440, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.1448109412711185, |
|
"grad_norm": 0.6986035108566284, |
|
"learning_rate": 3.5312613484605546e-07, |
|
"loss": 0.0041, |
|
"num_input_tokens_seen": 4263936, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.15124698310539, |
|
"grad_norm": 1.672957181930542, |
|
"learning_rate": 3.4788367053908087e-07, |
|
"loss": 0.0087, |
|
"num_input_tokens_seen": 4270144, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.157683024939662, |
|
"grad_norm": 0.5007069110870361, |
|
"learning_rate": 3.4267750083152587e-07, |
|
"loss": 0.0025, |
|
"num_input_tokens_seen": 4276944, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.164119066773934, |
|
"grad_norm": 0.2200661450624466, |
|
"learning_rate": 3.375077135234051e-07, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 4283488, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.170555108608206, |
|
"grad_norm": 1.018943190574646, |
|
"learning_rate": 3.323743958011588e-07, |
|
"loss": 0.0111, |
|
"num_input_tokens_seen": 4290000, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.176991150442478, |
|
"grad_norm": 0.5488151907920837, |
|
"learning_rate": 3.2727763423617915e-07, |
|
"loss": 0.0026, |
|
"num_input_tokens_seen": 4296544, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.18342719227675, |
|
"grad_norm": 0.12664268910884857, |
|
"learning_rate": 3.222175147833556e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4303056, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.189863234111022, |
|
"grad_norm": 0.04309312626719475, |
|
"learning_rate": 3.171941227796227e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4309664, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.196299275945294, |
|
"grad_norm": 2.9486300945281982, |
|
"learning_rate": 3.122075429425184e-07, |
|
"loss": 0.0118, |
|
"num_input_tokens_seen": 4316112, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.202735317779566, |
|
"grad_norm": 0.03176088631153107, |
|
"learning_rate": 3.072578593687606e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4322800, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.209171359613838, |
|
"grad_norm": 4.464654445648193, |
|
"learning_rate": 3.0234515553282523e-07, |
|
"loss": 0.0151, |
|
"num_input_tokens_seen": 4329408, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.21560740144811, |
|
"grad_norm": 0.017552955076098442, |
|
"learning_rate": 2.9746951428553884e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4335648, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.222043443282382, |
|
"grad_norm": 0.385110467672348, |
|
"learning_rate": 2.9263101785268253e-07, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 4342256, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.228479485116654, |
|
"grad_norm": 0.3891147971153259, |
|
"learning_rate": 2.8782974783360534e-07, |
|
"loss": 0.0009, |
|
"num_input_tokens_seen": 4349280, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.2349155269509255, |
|
"grad_norm": 0.687170147895813, |
|
"learning_rate": 2.8306578519984526e-07, |
|
"loss": 0.0051, |
|
"num_input_tokens_seen": 4356128, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.2413515687851975, |
|
"grad_norm": 0.16641825437545776, |
|
"learning_rate": 2.783392102937682e-07, |
|
"loss": 0.0008, |
|
"num_input_tokens_seen": 4362672, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.247787610619469, |
|
"grad_norm": 0.02807171456515789, |
|
"learning_rate": 2.7365010282720954e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4369440, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.254223652453741, |
|
"grad_norm": 1.0298210382461548, |
|
"learning_rate": 2.6899854188013054e-07, |
|
"loss": 0.0043, |
|
"num_input_tokens_seen": 4375760, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.260659694288013, |
|
"grad_norm": 0.15670017898082733, |
|
"learning_rate": 2.643846058992866e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4382768, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.267095736122285, |
|
"grad_norm": 1.2815680503845215, |
|
"learning_rate": 2.5980837269690056e-07, |
|
"loss": 0.0092, |
|
"num_input_tokens_seen": 4389424, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.273531777956556, |
|
"grad_norm": 0.23917140066623688, |
|
"learning_rate": 2.552699194493549e-07, |
|
"loss": 0.0006, |
|
"num_input_tokens_seen": 4395904, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.279967819790828, |
|
"grad_norm": 0.8005861043930054, |
|
"learning_rate": 2.507693226958871e-07, |
|
"loss": 0.0049, |
|
"num_input_tokens_seen": 4402144, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.2864038616251, |
|
"grad_norm": 0.8631348609924316, |
|
"learning_rate": 2.463066583372989e-07, |
|
"loss": 0.0058, |
|
"num_input_tokens_seen": 4408672, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.292839903459372, |
|
"grad_norm": 0.017498647794127464, |
|
"learning_rate": 2.418820016346779e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4415040, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.299275945293644, |
|
"grad_norm": 0.8143237829208374, |
|
"learning_rate": 2.3749542720812757e-07, |
|
"loss": 0.0063, |
|
"num_input_tokens_seen": 4421696, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.305711987127916, |
|
"grad_norm": 0.5140169262886047, |
|
"learning_rate": 2.331470090355084e-07, |
|
"loss": 0.0116, |
|
"num_input_tokens_seen": 4428096, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.312148028962188, |
|
"grad_norm": 0.6097451448440552, |
|
"learning_rate": 2.2883682045119066e-07, |
|
"loss": 0.003, |
|
"num_input_tokens_seen": 4434752, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.31858407079646, |
|
"grad_norm": 0.030739160254597664, |
|
"learning_rate": 2.2456493414481778e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4441584, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.325020112630732, |
|
"grad_norm": 0.017091860994696617, |
|
"learning_rate": 2.2033142216007913e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4448464, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.331456154465004, |
|
"grad_norm": 0.18128401041030884, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 0.0008, |
|
"num_input_tokens_seen": 4454976, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.337892196299276, |
|
"grad_norm": 0.0346699096262455, |
|
"learning_rate": 2.1197980609322406e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4461440, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.3443282381335475, |
|
"grad_norm": 0.018729638308286667, |
|
"learning_rate": 2.07861842857843e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4468080, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.3507642799678194, |
|
"grad_norm": 1.5200186967849731, |
|
"learning_rate": 2.0378253563519247e-07, |
|
"loss": 0.0105, |
|
"num_input_tokens_seen": 4474944, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.357200321802091, |
|
"grad_norm": 0.03133641555905342, |
|
"learning_rate": 1.997419532211925e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4481456, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.012541470117866993, |
|
"learning_rate": 1.9574016375868282e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4487472, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.370072405470635, |
|
"grad_norm": 1.6271870136260986, |
|
"learning_rate": 1.9177723473627647e-07, |
|
"loss": 0.0076, |
|
"num_input_tokens_seen": 4494320, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.376508447304907, |
|
"grad_norm": 0.03906352072954178, |
|
"learning_rate": 1.8785323298722098e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4501152, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.382944489139179, |
|
"grad_norm": 0.024355776607990265, |
|
"learning_rate": 1.839682246882682e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4507376, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.389380530973451, |
|
"grad_norm": 0.4699815511703491, |
|
"learning_rate": 1.801222753585638e-07, |
|
"loss": 0.002, |
|
"num_input_tokens_seen": 4513904, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.395816572807723, |
|
"grad_norm": 1.0409318208694458, |
|
"learning_rate": 1.7631544985853623e-07, |
|
"loss": 0.0088, |
|
"num_input_tokens_seen": 4520608, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.402252614641995, |
|
"grad_norm": 0.42572081089019775, |
|
"learning_rate": 1.725478123888083e-07, |
|
"loss": 0.0023, |
|
"num_input_tokens_seen": 4527184, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.408688656476267, |
|
"grad_norm": 0.04756924882531166, |
|
"learning_rate": 1.6881942648911077e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4533696, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.415124698310539, |
|
"grad_norm": 0.21432961523532867, |
|
"learning_rate": 1.6513035503721213e-07, |
|
"loss": 0.0017, |
|
"num_input_tokens_seen": 4540624, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.421560740144811, |
|
"grad_norm": 0.035157278180122375, |
|
"learning_rate": 1.614806602478583e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4547056, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.427996781979083, |
|
"grad_norm": 1.0397815704345703, |
|
"learning_rate": 1.5787040367172379e-07, |
|
"loss": 0.0073, |
|
"num_input_tokens_seen": 4553712, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.434432823813355, |
|
"grad_norm": 0.8960546851158142, |
|
"learning_rate": 1.542996461943716e-07, |
|
"loss": 0.0013, |
|
"num_input_tokens_seen": 4560080, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.4408688656476265, |
|
"grad_norm": 0.1009814515709877, |
|
"learning_rate": 1.507684480352292e-07, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4566496, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.447304907481898, |
|
"grad_norm": 1.6075918674468994, |
|
"learning_rate": 1.4727686874657143e-07, |
|
"loss": 0.0149, |
|
"num_input_tokens_seen": 4573152, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.45374094931617, |
|
"grad_norm": 0.5051795840263367, |
|
"learning_rate": 1.4382496721251526e-07, |
|
"loss": 0.0026, |
|
"num_input_tokens_seen": 4580432, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.460176991150442, |
|
"grad_norm": 0.01903243362903595, |
|
"learning_rate": 1.4041280164802967e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4587024, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.466613032984714, |
|
"grad_norm": 0.04848824068903923, |
|
"learning_rate": 1.3704042959795132e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4593296, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.473049074818986, |
|
"grad_norm": 1.155561923980713, |
|
"learning_rate": 1.3370790793601373e-07, |
|
"loss": 0.006, |
|
"num_input_tokens_seen": 4600000, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.479485116653258, |
|
"grad_norm": 0.06723422557115555, |
|
"learning_rate": 1.3041529286389078e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4606560, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.48592115848753, |
|
"grad_norm": 0.10645350813865662, |
|
"learning_rate": 1.2716263991024712e-07, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4613424, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.492357200321802, |
|
"grad_norm": 0.09232950955629349, |
|
"learning_rate": 1.2395000392980057e-07, |
|
"loss": 0.0007, |
|
"num_input_tokens_seen": 4620064, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.498793242156074, |
|
"grad_norm": 0.04445146396756172, |
|
"learning_rate": 1.2077743910239998e-07, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4626608, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.505229283990346, |
|
"grad_norm": 0.03232429176568985, |
|
"learning_rate": 1.1764499893210879e-07, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4633280, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.511665325824618, |
|
"grad_norm": 0.7786117792129517, |
|
"learning_rate": 1.145527362463042e-07, |
|
"loss": 0.0033, |
|
"num_input_tokens_seen": 4639920, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.51810136765889, |
|
"grad_norm": 0.15632830560207367, |
|
"learning_rate": 1.1150070319478679e-07, |
|
"loss": 0.0009, |
|
"num_input_tokens_seen": 4646736, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.524537409493162, |
|
"grad_norm": 2.964639186859131, |
|
"learning_rate": 1.0848895124889819e-07, |
|
"loss": 0.0216, |
|
"num_input_tokens_seen": 4653328, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.530973451327434, |
|
"grad_norm": 0.8460017442703247, |
|
"learning_rate": 1.0551753120065621e-07, |
|
"loss": 0.0035, |
|
"num_input_tokens_seen": 4660112, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.5374094931617055, |
|
"grad_norm": 1.1353970766067505, |
|
"learning_rate": 1.0258649316189722e-07, |
|
"loss": 0.0073, |
|
"num_input_tokens_seen": 4666560, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.543845534995977, |
|
"grad_norm": 0.08409194648265839, |
|
"learning_rate": 9.969588656342982e-08, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4673152, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.550281576830249, |
|
"grad_norm": 0.1840662956237793, |
|
"learning_rate": 9.684576015420277e-08, |
|
"loss": 0.0007, |
|
"num_input_tokens_seen": 4679360, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.556717618664521, |
|
"grad_norm": 0.049431972205638885, |
|
"learning_rate": 9.403616200048288e-08, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4685904, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.563153660498793, |
|
"grad_norm": 1.309277892112732, |
|
"learning_rate": 9.12671394850423e-08, |
|
"loss": 0.0168, |
|
"num_input_tokens_seen": 4692320, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.569589702333065, |
|
"grad_norm": 0.3398638367652893, |
|
"learning_rate": 8.85387393063622e-08, |
|
"loss": 0.0015, |
|
"num_input_tokens_seen": 4698928, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.576025744167337, |
|
"grad_norm": 0.016252102330327034, |
|
"learning_rate": 8.585100747784376e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4706000, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.582461786001609, |
|
"grad_norm": 0.6447362303733826, |
|
"learning_rate": 8.320398932703145e-08, |
|
"loss": 0.0036, |
|
"num_input_tokens_seen": 4712352, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.588897827835881, |
|
"grad_norm": 0.5575013756752014, |
|
"learning_rate": 8.059772949485068e-08, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 4718848, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.595333869670153, |
|
"grad_norm": 1.0512995719909668, |
|
"learning_rate": 7.803227193485336e-08, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 4725728, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.601769911504425, |
|
"grad_norm": 0.012490477412939072, |
|
"learning_rate": 7.550765991247655e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4732352, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.608205953338697, |
|
"grad_norm": 0.025810543447732925, |
|
"learning_rate": 7.30239360043139e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4738992, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 4.614641995172969, |
|
"grad_norm": 0.5250550508499146, |
|
"learning_rate": 7.058114209739675e-08, |
|
"loss": 0.007, |
|
"num_input_tokens_seen": 4745872, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 4.621078037007241, |
|
"grad_norm": 0.05289880558848381, |
|
"learning_rate": 6.817931938848805e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4752544, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 4.627514078841513, |
|
"grad_norm": 0.0455067902803421, |
|
"learning_rate": 6.581850838338816e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4759360, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 4.6339501206757845, |
|
"grad_norm": 0.1569470316171646, |
|
"learning_rate": 6.349874889624963e-08, |
|
"loss": 0.0008, |
|
"num_input_tokens_seen": 4766016, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.640386162510056, |
|
"grad_norm": 0.7371820211410522, |
|
"learning_rate": 6.12200800489085e-08, |
|
"loss": 0.0043, |
|
"num_input_tokens_seen": 4772624, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 4.646822204344328, |
|
"grad_norm": 0.09805099666118622, |
|
"learning_rate": 5.898254027022293e-08, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4778960, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 4.6532582461786, |
|
"grad_norm": 0.6390008926391602, |
|
"learning_rate": 5.678616729542535e-08, |
|
"loss": 0.0024, |
|
"num_input_tokens_seen": 4785600, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 4.659694288012872, |
|
"grad_norm": 0.4965854585170746, |
|
"learning_rate": 5.463099816548578e-08, |
|
"loss": 0.0024, |
|
"num_input_tokens_seen": 4792208, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 4.666130329847144, |
|
"grad_norm": 0.012886490672826767, |
|
"learning_rate": 5.2517069226488694e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4798816, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.672566371681416, |
|
"grad_norm": 0.04072566702961922, |
|
"learning_rate": 5.044441612901768e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4805408, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 4.679002413515688, |
|
"grad_norm": 0.07961362600326538, |
|
"learning_rate": 4.841307382755567e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4812480, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 4.68543845534996, |
|
"grad_norm": 0.006482974626123905, |
|
"learning_rate": 4.6423076579895646e-08, |
|
"loss": 0.0, |
|
"num_input_tokens_seen": 4819360, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 4.691874497184232, |
|
"grad_norm": 0.06561946123838425, |
|
"learning_rate": 4.4474457946562245e-08, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4825904, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 4.698310539018504, |
|
"grad_norm": 0.10200546681880951, |
|
"learning_rate": 4.256725079024554e-08, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 4832544, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.704746580852776, |
|
"grad_norm": 0.2905844449996948, |
|
"learning_rate": 4.070148727524814e-08, |
|
"loss": 0.001, |
|
"num_input_tokens_seen": 4838960, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 4.711182622687048, |
|
"grad_norm": 0.013804874382913113, |
|
"learning_rate": 3.887719886694091e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4845584, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 4.71761866452132, |
|
"grad_norm": 0.8108282685279846, |
|
"learning_rate": 3.709441633123367e-08, |
|
"loss": 0.0031, |
|
"num_input_tokens_seen": 4852416, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 4.7240547063555915, |
|
"grad_norm": 0.11679325252771378, |
|
"learning_rate": 3.535316973405672e-08, |
|
"loss": 0.0006, |
|
"num_input_tokens_seen": 4858864, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 4.7304907481898635, |
|
"grad_norm": 0.013299252837896347, |
|
"learning_rate": 3.3653488440851255e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4865552, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.736926790024135, |
|
"grad_norm": 0.8139033913612366, |
|
"learning_rate": 3.1995401116077516e-08, |
|
"loss": 0.006, |
|
"num_input_tokens_seen": 4871984, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 4.743362831858407, |
|
"grad_norm": 0.023770008236169815, |
|
"learning_rate": 3.037893572272937e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4878688, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 4.749798873692679, |
|
"grad_norm": 0.1465252786874771, |
|
"learning_rate": 2.8804119521862183e-08, |
|
"loss": 0.001, |
|
"num_input_tokens_seen": 4885504, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 4.756234915526951, |
|
"grad_norm": 0.7871643304824829, |
|
"learning_rate": 2.7270979072135106e-08, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 4892272, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 4.762670957361223, |
|
"grad_norm": 0.6069658994674683, |
|
"learning_rate": 2.5779540229361744e-08, |
|
"loss": 0.0047, |
|
"num_input_tokens_seen": 4898848, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.769106999195495, |
|
"grad_norm": 0.07687046378850937, |
|
"learning_rate": 2.4329828146074096e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4905392, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 4.775543041029767, |
|
"grad_norm": 0.0760512426495552, |
|
"learning_rate": 2.2921867271099296e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4912672, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 4.781979082864039, |
|
"grad_norm": 0.14020369946956635, |
|
"learning_rate": 2.155568134914604e-08, |
|
"loss": 0.0006, |
|
"num_input_tokens_seen": 4919472, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 4.788415124698311, |
|
"grad_norm": 0.020505385473370552, |
|
"learning_rate": 2.0231293420405194e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4926448, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 4.794851166532583, |
|
"grad_norm": 0.6544126868247986, |
|
"learning_rate": 1.8948725820160663e-08, |
|
"loss": 0.0038, |
|
"num_input_tokens_seen": 4933152, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.801287208366855, |
|
"grad_norm": 1.0113900899887085, |
|
"learning_rate": 1.770800017841301e-08, |
|
"loss": 0.0088, |
|
"num_input_tokens_seen": 4939568, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 4.807723250201127, |
|
"grad_norm": 0.05956251546740532, |
|
"learning_rate": 1.650913741951421e-08, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 4946224, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 4.814159292035399, |
|
"grad_norm": 0.21396887302398682, |
|
"learning_rate": 1.5352157761815978e-08, |
|
"loss": 0.002, |
|
"num_input_tokens_seen": 4952880, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 4.8205953338696705, |
|
"grad_norm": 0.3376445472240448, |
|
"learning_rate": 1.4237080717326712e-08, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 4960288, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 4.8270313757039425, |
|
"grad_norm": 0.47905248403549194, |
|
"learning_rate": 1.3163925091384532e-08, |
|
"loss": 0.0038, |
|
"num_input_tokens_seen": 4966656, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.833467417538214, |
|
"grad_norm": 0.08333203196525574, |
|
"learning_rate": 1.2132708982338925e-08, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 4973184, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 4.839903459372486, |
|
"grad_norm": 0.45923787355422974, |
|
"learning_rate": 1.1143449781245985e-08, |
|
"loss": 0.0023, |
|
"num_input_tokens_seen": 4980080, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 4.846339501206758, |
|
"grad_norm": 0.038865748792886734, |
|
"learning_rate": 1.0196164171574762e-08, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 4987104, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 4.85277554304103, |
|
"grad_norm": 0.2948670983314514, |
|
"learning_rate": 9.290868128926378e-09, |
|
"loss": 0.0044, |
|
"num_input_tokens_seen": 4993728, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 4.859211584875302, |
|
"grad_norm": 0.5785093903541565, |
|
"learning_rate": 8.427576920763957e-09, |
|
"loss": 0.004, |
|
"num_input_tokens_seen": 5000368, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.865647626709574, |
|
"grad_norm": 0.29199209809303284, |
|
"learning_rate": 7.606305106155898e-09, |
|
"loss": 0.0042, |
|
"num_input_tokens_seen": 5006768, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 4.872083668543846, |
|
"grad_norm": 0.25938865542411804, |
|
"learning_rate": 6.827066535529947e-09, |
|
"loss": 0.0004, |
|
"num_input_tokens_seen": 5013264, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 4.878519710378118, |
|
"grad_norm": 0.1507510095834732, |
|
"learning_rate": 6.089874350439507e-09, |
|
"loss": 0.0005, |
|
"num_input_tokens_seen": 5019744, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 4.88495575221239, |
|
"grad_norm": 0.009750754572451115, |
|
"learning_rate": 5.394740983341862e-09, |
|
"loss": 0.0, |
|
"num_input_tokens_seen": 5026080, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 4.891391794046662, |
|
"grad_norm": 0.049791790544986725, |
|
"learning_rate": 4.74167815738974e-09, |
|
"loss": 0.0001, |
|
"num_input_tokens_seen": 5032544, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.897827835880933, |
|
"grad_norm": 0.971774160861969, |
|
"learning_rate": 4.130696886231744e-09, |
|
"loss": 0.0089, |
|
"num_input_tokens_seen": 5039248, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 4.904263877715205, |
|
"grad_norm": 0.9287542700767517, |
|
"learning_rate": 3.561807473827783e-09, |
|
"loss": 0.0118, |
|
"num_input_tokens_seen": 5045792, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 4.910699919549477, |
|
"grad_norm": 0.06942977011203766, |
|
"learning_rate": 3.035019514275317e-09, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 5052320, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 4.917135961383749, |
|
"grad_norm": 0.28999796509742737, |
|
"learning_rate": 2.5503418916464352e-09, |
|
"loss": 0.0013, |
|
"num_input_tokens_seen": 5059200, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 4.923572003218021, |
|
"grad_norm": 0.8473367691040039, |
|
"learning_rate": 2.1077827798404728e-09, |
|
"loss": 0.0058, |
|
"num_input_tokens_seen": 5065824, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.9300080450522925, |
|
"grad_norm": 0.14826533198356628, |
|
"learning_rate": 1.707349642442735e-09, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 5072080, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 4.936444086886564, |
|
"grad_norm": 0.9073830246925354, |
|
"learning_rate": 1.349049232601818e-09, |
|
"loss": 0.0059, |
|
"num_input_tokens_seen": 5078336, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 4.942880128720836, |
|
"grad_norm": 1.048936367034912, |
|
"learning_rate": 1.0328875929138671e-09, |
|
"loss": 0.0068, |
|
"num_input_tokens_seen": 5085152, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 4.949316170555108, |
|
"grad_norm": 0.06418836861848831, |
|
"learning_rate": 7.588700553209926e-10, |
|
"loss": 0.0003, |
|
"num_input_tokens_seen": 5092080, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 4.95575221238938, |
|
"grad_norm": 2.129972457885742, |
|
"learning_rate": 5.270012410216185e-10, |
|
"loss": 0.0251, |
|
"num_input_tokens_seen": 5098960, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.962188254223652, |
|
"grad_norm": 0.3871181309223175, |
|
"learning_rate": 3.3728506039276686e-10, |
|
"loss": 0.0054, |
|
"num_input_tokens_seen": 5105648, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 4.968624296057924, |
|
"grad_norm": 0.185493603348732, |
|
"learning_rate": 1.8972471292344474e-10, |
|
"loss": 0.0007, |
|
"num_input_tokens_seen": 5112096, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 4.975060337892196, |
|
"grad_norm": 0.05420377105474472, |
|
"learning_rate": 8.432268716135338e-11, |
|
"loss": 0.0002, |
|
"num_input_tokens_seen": 5118960, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 4.981496379726468, |
|
"grad_norm": 0.5486555695533752, |
|
"learning_rate": 2.108076067014464e-11, |
|
"loss": 0.0032, |
|
"num_input_tokens_seen": 5125872, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 4.98793242156074, |
|
"grad_norm": 1.0398619174957275, |
|
"learning_rate": 0.0, |
|
"loss": 0.0107, |
|
"num_input_tokens_seen": 5132288, |
|
"step": 775 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 775, |
|
"num_input_tokens_seen": 5132288, |
|
"num_train_epochs": 5, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3110461174474342e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |