bigscience-bot
commited on
Commit
•
3b96397
1
Parent(s):
580879d
new data
Browse files- logs/main_log.txt +47 -0
logs/main_log.txt
CHANGED
@@ -99904,3 +99904,50 @@ time (ms)
|
|
99904 |
[2021-09-28 06:32:50] PULSE: tr8-104B is scheduled to start in 17:29:26 (at 2021-09-29T00:02:17) (1277218 on 'gpu_p13' partition)
|
99905 |
[2021-09-28 06:32:50] PULSE: tr8-104B is waiting for the previous job to finish before scheduling a new one using the dependency mechanism (1277295_[1-10%1] on 'gpu_p13' partition)
|
99906 |
[2021-09-28 06:32:50] PULSE: tr8-104B is running for 12:49:24 since 2021-09-27T17:43:26 (1271196 on 'gpu_p13' partition (r7i7n[6-8],r8i0n[0-8],r8i1n[0-4],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n0,r9i4n8,r9i5n[0-8],r9i6n[0-8],r9i7n[3-6])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99904 |
[2021-09-28 06:32:50] PULSE: tr8-104B is scheduled to start in 17:29:26 (at 2021-09-29T00:02:17) (1277218 on 'gpu_p13' partition)
|
99905 |
[2021-09-28 06:32:50] PULSE: tr8-104B is waiting for the previous job to finish before scheduling a new one using the dependency mechanism (1277295_[1-10%1] on 'gpu_p13' partition)
|
99906 |
[2021-09-28 06:32:50] PULSE: tr8-104B is running for 12:49:24 since 2021-09-27T17:43:26 (1271196 on 'gpu_p13' partition (r7i7n[6-8],r8i0n[0-8],r8i1n[0-4],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n0,r9i4n8,r9i5n[0-8],r9i6n[0-8],r9i7n[3-6])
|
99907 |
+
iteration 8560/ 159576 | consumed samples: 458160 | elapsed time per iteration (ms): 15526.6 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99908 |
+
time (ms)
|
99909 |
+
iteration 8570/ 159576 | consumed samples: 459760 | elapsed time per iteration (ms): 15343.9 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99910 |
+
time (ms)
|
99911 |
+
iteration 8580/ 159576 | consumed samples: 461360 | elapsed time per iteration (ms): 15516.0 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99912 |
+
time (ms)
|
99913 |
+
iteration 8590/ 159576 | consumed samples: 462960 | elapsed time per iteration (ms): 15788.5 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99914 |
+
time (ms)
|
99915 |
+
iteration 8600/ 159576 | consumed samples: 464560 | elapsed time per iteration (ms): 15421.5 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99916 |
+
time (ms)
|
99917 |
+
iteration 8610/ 159576 | consumed samples: 466160 | elapsed time per iteration (ms): 15365.4 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99918 |
+
time (ms)
|
99919 |
+
iteration 8620/ 159576 | consumed samples: 467760 | elapsed time per iteration (ms): 15460.6 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99920 |
+
time (ms)
|
99921 |
+
iteration 8630/ 159576 | consumed samples: 469360 | elapsed time per iteration (ms): 15794.2 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99922 |
+
time (ms)
|
99923 |
+
iteration 8640/ 159576 | consumed samples: 470960 | elapsed time per iteration (ms): 15928.5 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99924 |
+
time (ms)
|
99925 |
+
iteration 8650/ 159576 | consumed samples: 472560 | elapsed time per iteration (ms): 15514.8 | learning rate: 6.000E-05 | global batch size: 160 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99926 |
+
time (ms)
|
99927 |
+
iteration 8660/ 159576 | consumed samples: 474320 | elapsed time per iteration (ms): 16639.1 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99928 |
+
time (ms)
|
99929 |
+
iteration 8670/ 159576 | consumed samples: 476080 | elapsed time per iteration (ms): 16569.6 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99930 |
+
time (ms)
|
99931 |
+
iteration 8680/ 159576 | consumed samples: 477840 | elapsed time per iteration (ms): 16695.6 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99932 |
+
time (ms)
|
99933 |
+
iteration 8690/ 159576 | consumed samples: 479600 | elapsed time per iteration (ms): 16700.3 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99934 |
+
time (ms)
|
99935 |
+
iteration 8700/ 159576 | consumed samples: 481360 | elapsed time per iteration (ms): 16569.3 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99936 |
+
time (ms)
|
99937 |
+
iteration 8710/ 159576 | consumed samples: 483120 | elapsed time per iteration (ms): 16526.6 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99938 |
+
time (ms)
|
99939 |
+
iteration 8720/ 159576 | consumed samples: 484880 | elapsed time per iteration (ms): 16370.8 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99940 |
+
time (ms)
|
99941 |
+
iteration 8730/ 159576 | consumed samples: 486640 | elapsed time per iteration (ms): 16678.1 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99942 |
+
time (ms)
|
99943 |
+
iteration 8740/ 159576 | consumed samples: 488400 | elapsed time per iteration (ms): 16715.4 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99944 |
+
time (ms)
|
99945 |
+
iteration 8750/ 159576 | consumed samples: 490160 | elapsed time per iteration (ms): 16605.2 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99946 |
+
time (ms)
|
99947 |
+
iteration 8760/ 159576 | consumed samples: 491920 | elapsed time per iteration (ms): 16522.8 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99948 |
+
time (ms)
|
99949 |
+
iteration 8770/ 159576 | consumed samples: 493680 | elapsed time per iteration (ms): 16607.3 | learning rate: 6.000E-05 | global batch size: 176 | loss scale: 1.0 | grad norm: 5533.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
99950 |
+
time (ms)
|
99951 |
+
[2021-09-28 07:32:48] PULSE: tr8-104B is scheduled to start in 17:38:05 (at 2021-09-29T01:10:54) (1277218 on 'gpu_p13' partition)
|
99952 |
+
[2021-09-28 07:32:48] PULSE: tr8-104B is waiting for the previous job to finish before scheduling a new one using the dependency mechanism (1277295_[1-10%1] on 'gpu_p13' partition)
|
99953 |
+
[2021-09-28 07:32:48] PULSE: tr8-104B is running for 13:49:22 since 2021-09-27T17:43:26 (1271196 on 'gpu_p13' partition (r7i7n[6-8],r8i0n[0-8],r8i1n[0-4],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n0,r9i4n8,r9i5n[0-8],r9i6n[0-8],r9i7n[3-6])
|