bigscience-bot
commited on
Commit
•
8b1c59e
1
Parent(s):
2aa7e7a
new data
Browse files- logs/main_log.txt +56 -0
logs/main_log.txt
CHANGED
@@ -86922,3 +86922,59 @@ time (ms)
|
|
86922 |
iteration 9840/ 159576 | consumed samples: 719216 | elapsed time per iteration (ms): 14188.9 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86923 |
time (ms)
|
86924 |
[2021-09-27 04:14:28] PULSE: tr8-104B is running for 20:12 since 2021-09-27T03:54:16 (1188168 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r6i7n[7-8],r7i0n[0-5],r7i1n[7-8],r7i2n[0-1,5,8],r7i3n2,r7i5n7,r7i6n[1-4,8],r7i7n[0-4,6-8],r8i0n[0-8],r8i1n[0-4],r8i2n8,r8i3n[0-3,8],r8i4n[0-1],r8i6n[2-3,5-6],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n[0,3-8],r9i3n[0-2,6-8],r9i4n[0-6,8],r9i5n[0-8],r9i6n[0-8],r9i7n[1-8])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86922 |
iteration 9840/ 159576 | consumed samples: 719216 | elapsed time per iteration (ms): 14188.9 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86923 |
time (ms)
|
86924 |
[2021-09-27 04:14:28] PULSE: tr8-104B is running for 20:12 since 2021-09-27T03:54:16 (1188168 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r6i7n[7-8],r7i0n[0-5],r7i1n[7-8],r7i2n[0-1,5,8],r7i3n2,r7i5n7,r7i6n[1-4,8],r7i7n[0-4,6-8],r8i0n[0-8],r8i1n[0-4],r8i2n8,r8i3n[0-3,8],r8i4n[0-1],r8i6n[2-3,5-6],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n[0,3-8],r9i3n[0-2,6-8],r9i4n[0-6,8],r9i5n[0-8],r9i6n[0-8],r9i7n[1-8])
|
86925 |
+
iteration 9850/ 159576 | consumed samples: 721776 | elapsed time per iteration (ms): 14071.1 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86926 |
+
time (ms)
|
86927 |
+
iteration 9860/ 159576 | consumed samples: 724336 | elapsed time per iteration (ms): 14125.1 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86928 |
+
time (ms)
|
86929 |
+
iteration 9870/ 159576 | consumed samples: 726896 | elapsed time per iteration (ms): 14170.2 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86930 |
+
time (ms)
|
86931 |
+
iteration 9880/ 159576 | consumed samples: 729456 | elapsed time per iteration (ms): 14139.5 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86932 |
+
time (ms)
|
86933 |
+
iteration 9890/ 159576 | consumed samples: 732016 | elapsed time per iteration (ms): 14156.0 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86934 |
+
time (ms)
|
86935 |
+
iteration 9900/ 159576 | consumed samples: 734576 | elapsed time per iteration (ms): 14057.9 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86936 |
+
time (ms)
|
86937 |
+
iteration 9910/ 159576 | consumed samples: 737136 | elapsed time per iteration (ms): 14129.8 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86938 |
+
time (ms)
|
86939 |
+
iteration 9920/ 159576 | consumed samples: 739696 | elapsed time per iteration (ms): 14157.7 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86940 |
+
time (ms)
|
86941 |
+
iteration 9930/ 159576 | consumed samples: 742256 | elapsed time per iteration (ms): 14024.1 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86942 |
+
time (ms)
|
86943 |
+
iteration 9940/ 159576 | consumed samples: 744816 | elapsed time per iteration (ms): 13971.4 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86944 |
+
time (ms)
|
86945 |
+
iteration 9950/ 159576 | consumed samples: 747376 | elapsed time per iteration (ms): 14101.5 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86946 |
+
time (ms)
|
86947 |
+
iteration 9960/ 159576 | consumed samples: 749936 | elapsed time per iteration (ms): 14210.0 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86948 |
+
time (ms)
|
86949 |
+
iteration 9970/ 159576 | consumed samples: 752496 | elapsed time per iteration (ms): 14219.6 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86950 |
+
time (ms)
|
86951 |
+
iteration 9980/ 159576 | consumed samples: 755056 | elapsed time per iteration (ms): 14117.6 | learning rate: 6.000E-05 | global batch size: 256 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86952 |
+
time (ms)
|
86953 |
+
iteration 9990/ 159576 | consumed samples: 757712 | elapsed time per iteration (ms): 14400.0 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86954 |
+
time (ms)
|
86955 |
+
[2021-09-27 04:51:19,357] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=1052, lr=[5.999919375575235e-05, 5.999919375575235e-05], mom=[(0.9, 0.999), (0.9, 0.999)]
|
86956 |
+
steps: 10000 loss: nan iter time (s): 0.007 samples/sec: 37472.688
|
86957 |
+
iteration 10000/ 159576 | consumed samples: 760432 | elapsed time per iteration (ms): 14648.0 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86958 |
+
time (ms)
|
86959 |
+
-------------------------------------------------------------------------------------------------
|
86960 |
+
validation loss at iteration 10000 | lm loss value: 7.270623E+00 | lm loss PPL: 1.437445E+03 |
|
86961 |
+
-------------------------------------------------------------------------------------------------
|
86962 |
+
iteration 10010/ 159576 | consumed samples: 763152 | elapsed time per iteration (ms): 16469.3 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86963 |
+
time (ms)
|
86964 |
+
iteration 10020/ 159576 | consumed samples: 765872 | elapsed time per iteration (ms): 14573.2 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86965 |
+
time (ms)
|
86966 |
+
iteration 10030/ 159576 | consumed samples: 768592 | elapsed time per iteration (ms): 14611.8 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86967 |
+
time (ms)
|
86968 |
+
iteration 10040/ 159576 | consumed samples: 771312 | elapsed time per iteration (ms): 14782.8 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86969 |
+
time (ms)
|
86970 |
+
iteration 10050/ 159576 | consumed samples: 774032 | elapsed time per iteration (ms): 14722.8 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86971 |
+
time (ms)
|
86972 |
+
iteration 10060/ 159576 | consumed samples: 776752 | elapsed time per iteration (ms): 14595.9 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86973 |
+
time (ms)
|
86974 |
+
iteration 10070/ 159576 | consumed samples: 779472 | elapsed time per iteration (ms): 14712.5 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86975 |
+
time (ms)
|
86976 |
+
iteration 10080/ 159576 | consumed samples: 782192 | elapsed time per iteration (ms): 14640.3 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86977 |
+
time (ms)
|
86978 |
+
iteration 10090/ 159576 | consumed samples: 784912 | elapsed time per iteration (ms): 15060.9 | learning rate: 6.000E-05 | global batch size: 272 | loss scale: 1.0 | grad norm: 0.000 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 |
|
86979 |
+
time (ms)
|
86980 |
+
[2021-09-27 05:14:32] PULSE: tr8-104B is running for 1:20:16 since 2021-09-27T03:54:16 (1188168 on 'gpu_p13' partition (r6i5n[7-8],r6i6n0,r6i7n[7-8],r7i0n[0-5],r7i1n[7-8],r7i2n[0-1,5,8],r7i3n2,r7i5n7,r7i6n[1-4,8],r7i7n[0-4,6-8],r8i0n[0-8],r8i1n[0-4],r8i2n8,r8i3n[0-3,8],r8i4n[0-1],r8i6n[2-3,5-6],r8i7n[3-8],r9i0n[0-6,8],r9i1n[0-8],r9i2n[0,3-8],r9i3n[0-2,6-8],r9i4n[0-6,8],r9i5n[0-8],r9i6n[0-8],r9i7n[1-8])
|