bigscience-bot
commited on
Commit
•
cd1c71a
1
Parent(s):
bc7da15
new data
Browse files- logs/main_log.txt +35 -0
logs/main_log.txt
CHANGED
@@ -66560,3 +66560,38 @@ time (ms)
|
|
66560 |
[2021-09-25 15:10:12] PULSE: tr8-104B is scheduled to start in 20:25:18 (at 2021-09-26T11:35:31) (1188168 on 'gpu_p13' partition)
|
66561 |
[2021-09-25 15:10:12] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66562 |
[2021-09-25 15:10:12] PULSE: tr8-104B is running for 10:43:11 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66560 |
[2021-09-25 15:10:12] PULSE: tr8-104B is scheduled to start in 20:25:18 (at 2021-09-26T11:35:31) (1188168 on 'gpu_p13' partition)
|
66561 |
[2021-09-25 15:10:12] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66562 |
[2021-09-25 15:10:12] PULSE: tr8-104B is running for 10:43:11 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66563 |
+
iteration 8220/ 159576 | consumed samples: 405904 | elapsed time per iteration (ms): 21992.9 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.927856E+00 | loss scale: 4096.0 | grad norm: 191859.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66564 |
+
time (ms)
|
66565 |
+
iteration 8230/ 159576 | consumed samples: 407344 | elapsed time per iteration (ms): 21845.4 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.915263E+00 | loss scale: 4096.0 | grad norm: 136325.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66566 |
+
time (ms)
|
66567 |
+
iteration 8240/ 159576 | consumed samples: 408784 | elapsed time per iteration (ms): 21179.2 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.864025E+00 | loss scale: 2048.0 | grad norm: 118355.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66568 |
+
time (ms)
|
66569 |
+
iteration 8250/ 159576 | consumed samples: 410224 | elapsed time per iteration (ms): 21688.2 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.873029E+00 | loss scale: 2048.0 | grad norm: 72612.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66570 |
+
time (ms)
|
66571 |
+
iteration 8260/ 159576 | consumed samples: 411664 | elapsed time per iteration (ms): 21621.0 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.963725E+00 | loss scale: 2048.0 | grad norm: 77677.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66572 |
+
time (ms)
|
66573 |
+
iteration 8270/ 159576 | consumed samples: 413104 | elapsed time per iteration (ms): 21832.0 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.939199E+00 | loss scale: 2048.0 | grad norm: 80021.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66574 |
+
time (ms)
|
66575 |
+
iteration 8280/ 159576 | consumed samples: 414544 | elapsed time per iteration (ms): 21967.3 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.919482E+00 | loss scale: 2048.0 | grad norm: 58905.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66576 |
+
time (ms)
|
66577 |
+
iteration 8290/ 159576 | consumed samples: 415984 | elapsed time per iteration (ms): 21671.6 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 6.919662E+00 | loss scale: 2048.0 | grad norm: 52571.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66578 |
+
time (ms)
|
66579 |
+
iteration 8300/ 159576 | consumed samples: 417424 | elapsed time per iteration (ms): 21755.6 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.024297E+00 | loss scale: 2048.0 | grad norm: 77079.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66580 |
+
time (ms)
|
66581 |
+
iteration 8310/ 159576 | consumed samples: 418864 | elapsed time per iteration (ms): 21909.8 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.234490E+00 | loss scale: 2048.0 | grad norm: 102216.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66582 |
+
time (ms)
|
66583 |
+
iteration 8320/ 159576 | consumed samples: 420304 | elapsed time per iteration (ms): 21566.6 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.228243E+00 | loss scale: 2048.0 | grad norm: 88135.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66584 |
+
time (ms)
|
66585 |
+
iteration 8330/ 159576 | consumed samples: 421744 | elapsed time per iteration (ms): 22069.0 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.068048E+00 | loss scale: 2048.0 | grad norm: 65341.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66586 |
+
time (ms)
|
66587 |
+
iteration 8340/ 159576 | consumed samples: 423184 | elapsed time per iteration (ms): 21682.1 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.049673E+00 | loss scale: 2048.0 | grad norm: 45586.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66588 |
+
time (ms)
|
66589 |
+
iteration 8350/ 159576 | consumed samples: 424624 | elapsed time per iteration (ms): 21918.1 | learning rate: 6.000E-05 | global batch size: 144 | lm loss: 7.033588E+00 | loss scale: 2048.0 | grad norm: 60230.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66590 |
+
time (ms)
|
66591 |
+
iteration 8360/ 159576 | consumed samples: 426160 | elapsed time per iteration (ms): 22474.7 | learning rate: 6.000E-05 | global batch size: 160 | lm loss: 7.032515E+00 | loss scale: 2048.0 | grad norm: 55714.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66592 |
+
time (ms)
|
66593 |
+
iteration 8370/ 159576 | consumed samples: 427760 | elapsed time per iteration (ms): 22723.0 | learning rate: 6.000E-05 | global batch size: 160 | lm loss: 7.051062E+00 | loss scale: 2048.0 | grad norm: 68784.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66594 |
+
time (ms)
|
66595 |
+
[2021-09-25 16:10:22] PULSE: tr8-104B is scheduled to start in 19:16:12 (at 2021-09-26T11:26:35) (1188168 on 'gpu_p13' partition)
|
66596 |
+
[2021-09-25 16:10:22] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66597 |
+
[2021-09-25 16:10:22] PULSE: tr8-104B is running for 11:43:21 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|