bigscience-bot
commited on
Commit
•
2485070
1
Parent(s):
c72a597
new data
Browse files- logs/main_log.txt +50 -0
logs/main_log.txt
CHANGED
@@ -66269,3 +66269,53 @@ time (ms)
|
|
66269 |
[2021-09-25 08:09:15] PULSE: tr8-104B is scheduled to start in 18:13:01 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66270 |
[2021-09-25 08:09:15] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66271 |
[2021-09-25 08:09:15] PULSE: tr8-104B is running for 3:42:14 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66269 |
[2021-09-25 08:09:15] PULSE: tr8-104B is scheduled to start in 18:13:01 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66270 |
[2021-09-25 08:09:15] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66271 |
[2021-09-25 08:09:15] PULSE: tr8-104B is running for 3:42:14 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66272 |
+
iteration 6960/ 159576 | consumed samples: 257072 | elapsed time per iteration (ms): 18679.3 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.350162E+00 | loss scale: 4096.0 | grad norm: 146048.789 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66273 |
+
time (ms)
|
66274 |
+
iteration 6970/ 159576 | consumed samples: 258032 | elapsed time per iteration (ms): 17405.9 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.358824E+00 | loss scale: 2048.0 | grad norm: 83822.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66275 |
+
time (ms)
|
66276 |
+
iteration 6980/ 159576 | consumed samples: 258992 | elapsed time per iteration (ms): 18714.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.327154E+00 | loss scale: 2048.0 | grad norm: 55012.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66277 |
+
time (ms)
|
66278 |
+
iteration 6990/ 159576 | consumed samples: 259952 | elapsed time per iteration (ms): 18649.4 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.344659E+00 | loss scale: 2048.0 | grad norm: 62132.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66279 |
+
time (ms)
|
66280 |
+
iteration 7000/ 159576 | consumed samples: 260912 | elapsed time per iteration (ms): 18706.1 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.444662E+00 | loss scale: 2048.0 | grad norm: 98258.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66281 |
+
time (ms)
|
66282 |
+
------------------------------------------------------------------------------------------------
|
66283 |
+
validation loss at iteration 7000 | lm loss value: 7.174200E+00 | lm loss PPL: 1.305315E+03 |
|
66284 |
+
------------------------------------------------------------------------------------------------
|
66285 |
+
iteration 7010/ 159576 | consumed samples: 261872 | elapsed time per iteration (ms): 19904.0 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 1.142026E+01 | loss scale: 2048.0 | grad norm: 219645.978 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66286 |
+
time (ms)
|
66287 |
+
iteration 7020/ 159576 | consumed samples: 262832 | elapsed time per iteration (ms): 18580.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 1.367010E+01 | loss scale: 2048.0 | grad norm: 223286.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66288 |
+
time (ms)
|
66289 |
+
[2021-09-25 08:32:28] PULSE: tr8-104B is scheduled to start in 17:49:48 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66290 |
+
[2021-09-25 08:32:28] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66291 |
+
[2021-09-25 08:32:28] PULSE: tr8-104B is running for 4:05:27 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66292 |
+
iteration 7030/ 159576 | consumed samples: 263792 | elapsed time per iteration (ms): 18402.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 1.182180E+01 | loss scale: 2048.0 | grad norm: 19931.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66293 |
+
time (ms)
|
66294 |
+
iteration 7040/ 159576 | consumed samples: 264752 | elapsed time per iteration (ms): 18461.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 9.981701E+00 | loss scale: 2048.0 | grad norm: 205737.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66295 |
+
time (ms)
|
66296 |
+
iteration 7050/ 159576 | consumed samples: 265712 | elapsed time per iteration (ms): 18431.2 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 9.425107E+00 | loss scale: 2048.0 | grad norm: 195793.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66297 |
+
time (ms)
|
66298 |
+
iteration 7060/ 159576 | consumed samples: 266672 | elapsed time per iteration (ms): 18498.9 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 8.606621E+00 | loss scale: 2048.0 | grad norm: 50379.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66299 |
+
time (ms)
|
66300 |
+
iteration 7070/ 159576 | consumed samples: 267632 | elapsed time per iteration (ms): 18340.3 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 8.027315E+00 | loss scale: 2048.0 | grad norm: 37173.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66301 |
+
time (ms)
|
66302 |
+
iteration 7080/ 159576 | consumed samples: 268592 | elapsed time per iteration (ms): 18563.4 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.726066E+00 | loss scale: 2048.0 | grad norm: 22946.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66303 |
+
time (ms)
|
66304 |
+
iteration 7090/ 159576 | consumed samples: 269552 | elapsed time per iteration (ms): 18408.0 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.553810E+00 | loss scale: 2048.0 | grad norm: 16048.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66305 |
+
time (ms)
|
66306 |
+
iteration 7100/ 159576 | consumed samples: 270512 | elapsed time per iteration (ms): 18353.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.394469E+00 | loss scale: 2048.0 | grad norm: 10766.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66307 |
+
time (ms)
|
66308 |
+
[2021-09-25 08:57:55] PULSE: tr8-104B is scheduled to start in 17:24:21 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66309 |
+
[2021-09-25 08:57:55] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66310 |
+
[2021-09-25 08:57:55] PULSE: tr8-104B is running for 4:30:54 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66311 |
+
iteration 7110/ 159576 | consumed samples: 271472 | elapsed time per iteration (ms): 18511.6 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.327065E+00 | loss scale: 2048.0 | grad norm: 25940.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66312 |
+
time (ms)
|
66313 |
+
iteration 7120/ 159576 | consumed samples: 272432 | elapsed time per iteration (ms): 18333.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.337917E+00 | loss scale: 2048.0 | grad norm: 18319.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66314 |
+
time (ms)
|
66315 |
+
iteration 7130/ 159576 | consumed samples: 273392 | elapsed time per iteration (ms): 18249.8 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.273988E+00 | loss scale: 2048.0 | grad norm: 14331.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66316 |
+
time (ms)
|
66317 |
+
iteration 7140/ 159576 | consumed samples: 274352 | elapsed time per iteration (ms): 18274.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 7.204887E+00 | loss scale: 2048.0 | grad norm: 21767.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66318 |
+
time (ms)
|
66319 |
+
[2021-09-25 09:09:21] PULSE: tr8-104B is scheduled to start in 17:12:55 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66320 |
+
[2021-09-25 09:09:21] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66321 |
+
[2021-09-25 09:09:21] PULSE: tr8-104B is running for 4:42:20 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|