bigscience-bot
commited on
Commit
•
870d7e7
1
Parent(s):
8d62e2a
new data
Browse files- logs/main_log.txt +43 -0
logs/main_log.txt
CHANGED
@@ -66226,3 +66226,46 @@ time (ms)
|
|
66226 |
[2021-09-25 07:08:59] PULSE: tr8-104B is scheduled to start in 19:13:17 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66227 |
[2021-09-25 07:08:59] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66228 |
[2021-09-25 07:08:59] PULSE: tr8-104B is running for 2:41:58 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66226 |
[2021-09-25 07:08:59] PULSE: tr8-104B is scheduled to start in 19:13:17 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66227 |
[2021-09-25 07:08:59] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66228 |
[2021-09-25 07:08:59] PULSE: tr8-104B is running for 2:41:58 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|
66229 |
+
iteration 6760/ 159576 | consumed samples: 237872 | elapsed time per iteration (ms): 18776.3 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.303013E+00 | loss scale: 2048.0 | grad norm: 69740.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66230 |
+
time (ms)
|
66231 |
+
iteration 6770/ 159576 | consumed samples: 238832 | elapsed time per iteration (ms): 18675.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.319376E+00 | loss scale: 2048.0 | grad norm: 83900.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66232 |
+
time (ms)
|
66233 |
+
iteration 6780/ 159576 | consumed samples: 239792 | elapsed time per iteration (ms): 18605.9 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.336406E+00 | loss scale: 2048.0 | grad norm: 62443.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66234 |
+
time (ms)
|
66235 |
+
iteration 6790/ 159576 | consumed samples: 240752 | elapsed time per iteration (ms): 18746.1 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.333478E+00 | loss scale: 2048.0 | grad norm: 73606.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66236 |
+
time (ms)
|
66237 |
+
iteration 6800/ 159576 | consumed samples: 241712 | elapsed time per iteration (ms): 18688.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.336754E+00 | loss scale: 2048.0 | grad norm: 96323.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66238 |
+
time (ms)
|
66239 |
+
iteration 6810/ 159576 | consumed samples: 242672 | elapsed time per iteration (ms): 18568.8 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.315503E+00 | loss scale: 2048.0 | grad norm: 65008.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66240 |
+
time (ms)
|
66241 |
+
iteration 6820/ 159576 | consumed samples: 243632 | elapsed time per iteration (ms): 18731.9 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.301308E+00 | loss scale: 2048.0 | grad norm: 70887.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66242 |
+
time (ms)
|
66243 |
+
iteration 6830/ 159576 | consumed samples: 244592 | elapsed time per iteration (ms): 18612.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.331754E+00 | loss scale: 2048.0 | grad norm: 78393.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66244 |
+
time (ms)
|
66245 |
+
iteration 6840/ 159576 | consumed samples: 245552 | elapsed time per iteration (ms): 18584.4 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.318947E+00 | loss scale: 4096.0 | grad norm: 175812.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66246 |
+
time (ms)
|
66247 |
+
iteration 6850/ 159576 | consumed samples: 246512 | elapsed time per iteration (ms): 18855.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.349559E+00 | loss scale: 4096.0 | grad norm: 150858.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66248 |
+
time (ms)
|
66249 |
+
iteration 6860/ 159576 | consumed samples: 247472 | elapsed time per iteration (ms): 18778.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.341676E+00 | loss scale: 4096.0 | grad norm: 374400.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66250 |
+
time (ms)
|
66251 |
+
iteration 6870/ 159576 | consumed samples: 248432 | elapsed time per iteration (ms): 18648.3 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.313033E+00 | loss scale: 4096.0 | grad norm: 153615.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66252 |
+
time (ms)
|
66253 |
+
iteration 6880/ 159576 | consumed samples: 249392 | elapsed time per iteration (ms): 18783.0 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.332200E+00 | loss scale: 4096.0 | grad norm: 135045.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66254 |
+
time (ms)
|
66255 |
+
iteration 6890/ 159576 | consumed samples: 250352 | elapsed time per iteration (ms): 18757.2 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.370442E+00 | loss scale: 4096.0 | grad norm: 140003.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66256 |
+
time (ms)
|
66257 |
+
iteration 6900/ 159576 | consumed samples: 251312 | elapsed time per iteration (ms): 18547.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.426891E+00 | loss scale: 4096.0 | grad norm: 166603.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66258 |
+
time (ms)
|
66259 |
+
iteration 6910/ 159576 | consumed samples: 252272 | elapsed time per iteration (ms): 18775.5 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.383529E+00 | loss scale: 4096.0 | grad norm: 161102.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66260 |
+
time (ms)
|
66261 |
+
iteration 6920/ 159576 | consumed samples: 253232 | elapsed time per iteration (ms): 18674.9 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.362777E+00 | loss scale: 4096.0 | grad norm: 135239.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66262 |
+
time (ms)
|
66263 |
+
iteration 6930/ 159576 | consumed samples: 254192 | elapsed time per iteration (ms): 18723.1 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.348313E+00 | loss scale: 4096.0 | grad norm: 180298.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66264 |
+
time (ms)
|
66265 |
+
iteration 6940/ 159576 | consumed samples: 255152 | elapsed time per iteration (ms): 18629.7 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.304693E+00 | loss scale: 4096.0 | grad norm: 155481.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66266 |
+
time (ms)
|
66267 |
+
iteration 6950/ 159576 | consumed samples: 256112 | elapsed time per iteration (ms): 18736.2 | learning rate: 6.000E-05 | global batch size: 96 | lm loss: 6.335081E+00 | loss scale: 4096.0 | grad norm: 170157.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
66268 |
+
time (ms)
|
66269 |
+
[2021-09-25 08:09:15] PULSE: tr8-104B is scheduled to start in 18:13:01 (at 2021-09-26T02:22:17) (1188168 on 'gpu_p13' partition)
|
66270 |
+
[2021-09-25 08:09:15] PULSE: tr8-104B is waiting for the previous Job Array job to finish before scheduling a new one (1185639_[2-10%1] on 'gpu_p13' partition)
|
66271 |
+
[2021-09-25 08:09:15] PULSE: tr8-104B is running for 3:42:14 since 2021-09-25T04:27:01 (1185639_1 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r7i2n[4-5],r7i3n2,r7i6n[2-4],r7i7n[7-8],r8i0n[0,2-3,5-8],r8i1n[0,2-4],r8i2n8,r8i3n[0-2],r8i5n[3-4],r8i7n[3-8],r9i0n[0-5],r9i1n[0-3],r9i2n[3-6,8],r9i3n[0-1,7-8],r9i4n[0-3],r9i5n[3-8],r9i6n0)
|