bigscience-bot
commited on
Commit
•
7600865
1
Parent(s):
de855a0
new data
Browse files- logs/main_log.txt +41 -0
logs/main_log.txt
CHANGED
@@ -66319,3 +66319,44 @@ time (ms)
|
|
66319 |
[2021-09-25 09:09:21] PULSE: tr8-104B is scheduled to start in 17:12:55 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66320 |
[2021-09-25 09:09:21] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66321 |
[2021-09-25 09:09:21] PULSE: tr8-104B is running for 4:42:20 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66319 |
[2021-09-25 09:09:21] PULSE: tr8-104B is scheduled to start in 17:12:55 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66320 |
[2021-09-25 09:09:21] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66321 |
[2021-09-25 09:09:21] PULSE: tr8-104B is running for 4:42:20 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66322 |
+
iteration 7150/ 159576 | consumed samples: 275312 | elapsed time per iteration (ms): 18318.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.195872E+00 | loss scale: 2048.0 | grad norm: 14010.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66323 |
+
time (ms)
|
66324 |
+
iteration 7160/ 159576 | consumed samples: 276272 | elapsed time per iteration (ms): 18337.2 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.136990E+00 | loss scale: 2048.0 | grad norm: 23189.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66325 |
+
time (ms)
|
66326 |
+
iteration 7170/ 159576 | consumed samples: 277232 | elapsed time per iteration (ms): 18344.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.222323E+00 | loss scale: 2048.0 | grad norm: 22610.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66327 |
+
time (ms)
|
66328 |
+
iteration 7180/ 159576 | consumed samples: 278192 | elapsed time per iteration (ms): 18312.6 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.156533E+00 | loss scale: 2048.0 | grad norm: 12376.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66329 |
+
time (ms)
|
66330 |
+
iteration 7190/ 159576 | consumed samples: 279152 | elapsed time per iteration (ms): 18417.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.084262E+00 | loss scale: 2048.0 | grad norm: 38647.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66331 |
+
time (ms)
|
66332 |
+
iteration 7200/ 159576 | consumed samples: 280112 | elapsed time per iteration (ms): 18396.8 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.110893E+00 | loss scale: 2048.0 | grad norm: 21520.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66333 |
+
time (ms)
|
66334 |
+
iteration 7210/ 159576 | consumed samples: 281072 | elapsed time per iteration (ms): 18408.8 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.294872E+00 | loss scale: 2048.0 | grad norm: 77171.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66335 |
+
time (ms)
|
66336 |
+
iteration 7220/ 159576 | consumed samples: 282032 | elapsed time per iteration (ms): 18333.4 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.155109E+00 | loss scale: 2048.0 | grad norm: 16921.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66337 |
+
time (ms)
|
66338 |
+
iteration 7230/ 159576 | consumed samples: 282992 | elapsed time per iteration (ms): 18398.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.042103E+00 | loss scale: 2048.0 | grad norm: 13510.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66339 |
+
time (ms)
|
66340 |
+
iteration 7240/ 159576 | consumed samples: 284032 | elapsed time per iteration (ms): 19100.0 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 6.964984E+00 | loss scale: 2048.0 | grad norm: 11355.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66341 |
+
time (ms)
|
66342 |
+
iteration 7250/ 159576 | consumed samples: 285152 | elapsed time per iteration (ms): 19781.1 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.051522E+00 | loss scale: 2048.0 | grad norm: 14836.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66343 |
+
time (ms)
|
66344 |
+
iteration 7260/ 159576 | consumed samples: 286272 | elapsed time per iteration (ms): 19836.2 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.050404E+00 | loss scale: 2048.0 | grad norm: 32092.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66345 |
+
time (ms)
|
66346 |
+
iteration 7270/ 159576 | consumed samples: 287392 | elapsed time per iteration (ms): 19719.8 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.034865E+00 | loss scale: 2048.0 | grad norm: 25809.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66347 |
+
time (ms)
|
66348 |
+
iteration 7280/ 159576 | consumed samples: 288512 | elapsed time per iteration (ms): 19632.8 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.038512E+00 | loss scale: 2048.0 | grad norm: 19816.017 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66349 |
+
time (ms)
|
66350 |
+
iteration 7290/ 159576 | consumed samples: 289632 | elapsed time per iteration (ms): 19704.6 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.051814E+00 | loss scale: 2048.0 | grad norm: 13138.906 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66351 |
+
time (ms)
|
66352 |
+
iteration 7300/ 159576 | consumed samples: 290752 | elapsed time per iteration (ms): 19431.1 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 6.962708E+00 | loss scale: 2048.0 | grad norm: 15505.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66353 |
+
time (ms)
|
66354 |
+
iteration 7310/ 159576 | consumed samples: 291872 | elapsed time per iteration (ms): 19625.1 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.068867E+00 | loss scale: 2048.0 | grad norm: 26542.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66355 |
+
time (ms)
|
66356 |
+
iteration 7320/ 159576 | consumed samples: 292992 | elapsed time per iteration (ms): 19705.6 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.131171E+00 | loss scale: 2048.0 | grad norm: 59185.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66357 |
+
time (ms)
|
66358 |
+
iteration 7330/ 159576 | consumed samples: 294112 | elapsed time per iteration (ms): 19592.0 | learning rate: 6.000E-05 | global batch size: 112 | lm loss: 7.030576E+00 | loss scale: 2048.0 | grad norm: 32033.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66359 |
+
time (ms)
|
66360 |
+
[2021-09-25 10:09:39] PULSE: tr8-104B is scheduled to start in 17:07:05 (at 2021-09-26T03:16:45) (1188168 on 'gpu_p13' partition)
|
66361 |
+
[2021-09-25 10:09:39] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66362 |
+
[2021-09-25 10:09:39] PULSE: tr8-104B is running for 5:42:38 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|