rs545837 commited on
Commit
e4f5bd3
1 Parent(s): bd5e9d3

Upload folder using huggingface_hub

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0f1a637ff5efd3a742cf0adbf6c58e6934d9a4a6215f862074742126fcf08c2
3
  size 213625344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b2c4db3a2d556ee001eb8fdf128d644b789b40b2f7cf64684b6fe78989053b
3
  size 213625344
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf290f7d31c224d60ac5c8ed732a0ecc556301b4c61d776c8545f9109d532312
3
  size 427334458
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:129003e0eae631ea59961c42baa93d6ece566b523e5b57d228356535ea34946d
3
  size 427334458
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435089f27ea78cb1b5ffb371da67f47c7a3dea92ab07479122e62b4e8dbeed97
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593cfa780b4b09ba583a139eb81c2eae72992c19fc5f8f38c81bd37ea47dbe04
3
  size 16433
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35c3205fe632396691980ab13bb747592db3b39a8f9bc42c6b4bce2ebc4e86d2
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9560f8c460a856a55828494146d2d52ecf0d95a3dec5919d8f29a972450cec34
3
  size 16433
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdd18dda129c8617269378af8c4207e690d6ed4efdbda0ce1aa5947221052d4a
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e7b7bce88125710e1c78de933cf62b48d6ec5a97b36fc43a09d7f70aebd0307
3
  size 16433
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d14c531768fd1b817e6ad83f2878c07af9aa939a5372ffc3020b84164720063
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cb51f987fa17a879db4d368b0564564ae49379d5c5ce803d79d24b4b5a43c13
3
  size 16433
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6343b5ecd40e08336f425dc8913f4b43aad1a7465797446cfa1892e3cb3133f3
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ed1a64882ec192e7bdbd0b9dda7c3dd977bc8ef889d26ddca3e2380d9debae
3
  size 16433
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0492bd561ad6444b11560f52c5a570d34e2dcc461489aae4bd2703fa55d13f47
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dd0f015bb5215ff40f1555f5c47c89a7bd89b00e7ef4568ca045dc1c2b5514a
3
  size 16433
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a181670929d07bdff88356f8cc3404c6ef87b1e788fc5b1dc6f473ac9b2bc12
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5bc7ba9f4a9e405864d41e902bc7509a5b6fa554a6cf09f24491e00dac06fb
3
  size 16433
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23b0154dd6c58d1151b932c0d3de52258209b2d646a7e6e34de818d4dfa12a13
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77e384968f192bdc497600d0108b82695a12247413143c0c9bd4e09fbb718212
3
  size 16433
rng_state_8.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0c53426dcd44d43fe238422e433fb361c04df5585664ed42f08da1365fc112f
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc46c9518ec829f507eb5833115c977024d13a12bc4e0ecff2238d818e6eb6dc
3
  size 16433
rng_state_9.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:268b7b1ca55cb9d2d96542e3789b74164ec07561d7774458ecef281e2a3ad163
3
  size 16433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56cb5d3bf2f9602568c76013ebf1c626061418df7f27c779c448921d362d5232
3
  size 16433
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fc0c0b23c1fbf47eda082a36b0831feacf86e3aa80c09efc720587a7e961c90
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a9c85c7b2c29125f99f000e54e900b05be0859260af7e4a0abf634beb2c469
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5998188640186312,
5
  "eval_steps": 2318,
6
- "global_step": 13908,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2856,1427 +2856,6 @@
2856
  "eval_samples_per_second": 610.045,
2857
  "eval_steps_per_second": 7.626,
2858
  "step": 9272
2859
- },
2860
- {
2861
- "epoch": 0.4007417949713201,
2862
- "grad_norm": 0.408203125,
2863
- "learning_rate": 0.0005998532205145915,
2864
- "loss": 1.0191,
2865
- "step": 9292
2866
- },
2867
- {
2868
- "epoch": 0.4017337301073878,
2869
- "grad_norm": 0.46484375,
2870
- "learning_rate": 0.0005988603004662408,
2871
- "loss": 1.0074,
2872
- "step": 9315
2873
- },
2874
- {
2875
- "epoch": 0.4027256652434554,
2876
- "grad_norm": 0.353515625,
2877
- "learning_rate": 0.0005978673804178898,
2878
- "loss": 1.0052,
2879
- "step": 9338
2880
- },
2881
- {
2882
- "epoch": 0.40371760037952303,
2883
- "grad_norm": 0.361328125,
2884
- "learning_rate": 0.000596874460369539,
2885
- "loss": 1.0248,
2886
- "step": 9361
2887
- },
2888
- {
2889
- "epoch": 0.40470953551559063,
2890
- "grad_norm": 0.376953125,
2891
- "learning_rate": 0.000595881540321188,
2892
- "loss": 1.0328,
2893
- "step": 9384
2894
- },
2895
- {
2896
- "epoch": 0.40570147065165824,
2897
- "grad_norm": 0.4453125,
2898
- "learning_rate": 0.0005948886202728372,
2899
- "loss": 1.0254,
2900
- "step": 9407
2901
- },
2902
- {
2903
- "epoch": 0.4066934057877259,
2904
- "grad_norm": 0.302734375,
2905
- "learning_rate": 0.0005938957002244862,
2906
- "loss": 1.0285,
2907
- "step": 9430
2908
- },
2909
- {
2910
- "epoch": 0.4076853409237935,
2911
- "grad_norm": 0.439453125,
2912
- "learning_rate": 0.0005929027801761354,
2913
- "loss": 1.0077,
2914
- "step": 9453
2915
- },
2916
- {
2917
- "epoch": 0.40867727605986115,
2918
- "grad_norm": 0.3515625,
2919
- "learning_rate": 0.0005919098601277845,
2920
- "loss": 1.0181,
2921
- "step": 9476
2922
- },
2923
- {
2924
- "epoch": 0.40966921119592875,
2925
- "grad_norm": 0.33203125,
2926
- "learning_rate": 0.0005909169400794336,
2927
- "loss": 1.0427,
2928
- "step": 9499
2929
- },
2930
- {
2931
- "epoch": 0.41066114633199635,
2932
- "grad_norm": 0.380859375,
2933
- "learning_rate": 0.0005899240200310827,
2934
- "loss": 1.0133,
2935
- "step": 9522
2936
- },
2937
- {
2938
- "epoch": 0.411653081468064,
2939
- "grad_norm": 0.40234375,
2940
- "learning_rate": 0.0005889310999827318,
2941
- "loss": 1.0187,
2942
- "step": 9545
2943
- },
2944
- {
2945
- "epoch": 0.4126450166041316,
2946
- "grad_norm": 0.44921875,
2947
- "learning_rate": 0.000587938179934381,
2948
- "loss": 1.0091,
2949
- "step": 9568
2950
- },
2951
- {
2952
- "epoch": 0.41363695174019927,
2953
- "grad_norm": 0.37109375,
2954
- "learning_rate": 0.0005869452598860301,
2955
- "loss": 1.002,
2956
- "step": 9591
2957
- },
2958
- {
2959
- "epoch": 0.41462888687626687,
2960
- "grad_norm": 0.47265625,
2961
- "learning_rate": 0.0005859523398376792,
2962
- "loss": 1.0321,
2963
- "step": 9614
2964
- },
2965
- {
2966
- "epoch": 0.4156208220123345,
2967
- "grad_norm": 0.4453125,
2968
- "learning_rate": 0.0005849594197893283,
2969
- "loss": 1.0244,
2970
- "step": 9637
2971
- },
2972
- {
2973
- "epoch": 0.4166127571484021,
2974
- "grad_norm": 0.4375,
2975
- "learning_rate": 0.0005839664997409773,
2976
- "loss": 1.0345,
2977
- "step": 9660
2978
- },
2979
- {
2980
- "epoch": 0.4176046922844697,
2981
- "grad_norm": 0.357421875,
2982
- "learning_rate": 0.0005829735796926266,
2983
- "loss": 1.0157,
2984
- "step": 9683
2985
- },
2986
- {
2987
- "epoch": 0.4185966274205374,
2988
- "grad_norm": 0.3515625,
2989
- "learning_rate": 0.0005819806596442756,
2990
- "loss": 0.9774,
2991
- "step": 9706
2992
- },
2993
- {
2994
- "epoch": 0.419588562556605,
2995
- "grad_norm": 0.408203125,
2996
- "learning_rate": 0.0005809877395959248,
2997
- "loss": 1.0011,
2998
- "step": 9729
2999
- },
3000
- {
3001
- "epoch": 0.42058049769267264,
3002
- "grad_norm": 0.41015625,
3003
- "learning_rate": 0.0005799948195475738,
3004
- "loss": 1.0148,
3005
- "step": 9752
3006
- },
3007
- {
3008
- "epoch": 0.42157243282874024,
3009
- "grad_norm": 0.53515625,
3010
- "learning_rate": 0.000579001899499223,
3011
- "loss": 1.0058,
3012
- "step": 9775
3013
- },
3014
- {
3015
- "epoch": 0.42256436796480784,
3016
- "grad_norm": 0.37890625,
3017
- "learning_rate": 0.000578008979450872,
3018
- "loss": 1.0241,
3019
- "step": 9798
3020
- },
3021
- {
3022
- "epoch": 0.4235563031008755,
3023
- "grad_norm": 0.38671875,
3024
- "learning_rate": 0.0005770160594025212,
3025
- "loss": 1.0067,
3026
- "step": 9821
3027
- },
3028
- {
3029
- "epoch": 0.4245482382369431,
3030
- "grad_norm": 0.3359375,
3031
- "learning_rate": 0.0005760231393541703,
3032
- "loss": 1.0105,
3033
- "step": 9844
3034
- },
3035
- {
3036
- "epoch": 0.42554017337301075,
3037
- "grad_norm": 0.40625,
3038
- "learning_rate": 0.0005750302193058193,
3039
- "loss": 1.0226,
3040
- "step": 9867
3041
- },
3042
- {
3043
- "epoch": 0.42653210850907836,
3044
- "grad_norm": 0.392578125,
3045
- "learning_rate": 0.0005740372992574685,
3046
- "loss": 1.0243,
3047
- "step": 9890
3048
- },
3049
- {
3050
- "epoch": 0.427524043645146,
3051
- "grad_norm": 0.419921875,
3052
- "learning_rate": 0.0005730443792091175,
3053
- "loss": 1.0128,
3054
- "step": 9913
3055
- },
3056
- {
3057
- "epoch": 0.4285159787812136,
3058
- "grad_norm": 0.373046875,
3059
- "learning_rate": 0.0005720514591607668,
3060
- "loss": 0.9993,
3061
- "step": 9936
3062
- },
3063
- {
3064
- "epoch": 0.4295079139172812,
3065
- "grad_norm": 0.365234375,
3066
- "learning_rate": 0.0005710585391124158,
3067
- "loss": 1.012,
3068
- "step": 9959
3069
- },
3070
- {
3071
- "epoch": 0.43049984905334887,
3072
- "grad_norm": 0.302734375,
3073
- "learning_rate": 0.000570065619064065,
3074
- "loss": 1.0178,
3075
- "step": 9982
3076
- },
3077
- {
3078
- "epoch": 0.43149178418941647,
3079
- "grad_norm": 0.3671875,
3080
- "learning_rate": 0.000569072699015714,
3081
- "loss": 1.0192,
3082
- "step": 10005
3083
- },
3084
- {
3085
- "epoch": 0.43248371932548413,
3086
- "grad_norm": 0.345703125,
3087
- "learning_rate": 0.0005680797789673631,
3088
- "loss": 0.9992,
3089
- "step": 10028
3090
- },
3091
- {
3092
- "epoch": 0.43347565446155173,
3093
- "grad_norm": 0.4140625,
3094
- "learning_rate": 0.0005670868589190123,
3095
- "loss": 1.0033,
3096
- "step": 10051
3097
- },
3098
- {
3099
- "epoch": 0.43446758959761933,
3100
- "grad_norm": 0.359375,
3101
- "learning_rate": 0.0005660939388706614,
3102
- "loss": 1.0153,
3103
- "step": 10074
3104
- },
3105
- {
3106
- "epoch": 0.435459524733687,
3107
- "grad_norm": 0.36328125,
3108
- "learning_rate": 0.0005651010188223105,
3109
- "loss": 0.997,
3110
- "step": 10097
3111
- },
3112
- {
3113
- "epoch": 0.4364514598697546,
3114
- "grad_norm": 0.3671875,
3115
- "learning_rate": 0.0005641080987739596,
3116
- "loss": 1.0147,
3117
- "step": 10120
3118
- },
3119
- {
3120
- "epoch": 0.43744339500582224,
3121
- "grad_norm": 0.3515625,
3122
- "learning_rate": 0.0005631151787256087,
3123
- "loss": 1.0046,
3124
- "step": 10143
3125
- },
3126
- {
3127
- "epoch": 0.43843533014188985,
3128
- "grad_norm": 0.37890625,
3129
- "learning_rate": 0.0005621222586772579,
3130
- "loss": 1.0008,
3131
- "step": 10166
3132
- },
3133
- {
3134
- "epoch": 0.4394272652779575,
3135
- "grad_norm": 0.41796875,
3136
- "learning_rate": 0.0005611293386289069,
3137
- "loss": 1.0315,
3138
- "step": 10189
3139
- },
3140
- {
3141
- "epoch": 0.4404192004140251,
3142
- "grad_norm": 0.365234375,
3143
- "learning_rate": 0.0005601364185805561,
3144
- "loss": 1.0164,
3145
- "step": 10212
3146
- },
3147
- {
3148
- "epoch": 0.4414111355500927,
3149
- "grad_norm": 0.361328125,
3150
- "learning_rate": 0.0005591434985322051,
3151
- "loss": 1.0044,
3152
- "step": 10235
3153
- },
3154
- {
3155
- "epoch": 0.44240307068616036,
3156
- "grad_norm": 0.439453125,
3157
- "learning_rate": 0.0005581505784838543,
3158
- "loss": 1.0236,
3159
- "step": 10258
3160
- },
3161
- {
3162
- "epoch": 0.44339500582222796,
3163
- "grad_norm": 0.341796875,
3164
- "learning_rate": 0.0005571576584355034,
3165
- "loss": 1.0156,
3166
- "step": 10281
3167
- },
3168
- {
3169
- "epoch": 0.4443869409582956,
3170
- "grad_norm": 0.373046875,
3171
- "learning_rate": 0.0005561647383871526,
3172
- "loss": 0.9916,
3173
- "step": 10304
3174
- },
3175
- {
3176
- "epoch": 0.4453788760943632,
3177
- "grad_norm": 0.33203125,
3178
- "learning_rate": 0.0005551718183388016,
3179
- "loss": 0.9961,
3180
- "step": 10327
3181
- },
3182
- {
3183
- "epoch": 0.4463708112304308,
3184
- "grad_norm": 0.392578125,
3185
- "learning_rate": 0.0005541788982904507,
3186
- "loss": 1.0021,
3187
- "step": 10350
3188
- },
3189
- {
3190
- "epoch": 0.4473627463664985,
3191
- "grad_norm": 0.375,
3192
- "learning_rate": 0.0005531859782420998,
3193
- "loss": 1.0219,
3194
- "step": 10373
3195
- },
3196
- {
3197
- "epoch": 0.4483546815025661,
3198
- "grad_norm": 0.4140625,
3199
- "learning_rate": 0.000552193058193749,
3200
- "loss": 0.9982,
3201
- "step": 10396
3202
- },
3203
- {
3204
- "epoch": 0.44934661663863373,
3205
- "grad_norm": 0.392578125,
3206
- "learning_rate": 0.0005512001381453981,
3207
- "loss": 0.994,
3208
- "step": 10419
3209
- },
3210
- {
3211
- "epoch": 0.45033855177470133,
3212
- "grad_norm": 0.34765625,
3213
- "learning_rate": 0.0005502072180970471,
3214
- "loss": 0.9899,
3215
- "step": 10442
3216
- },
3217
- {
3218
- "epoch": 0.451330486910769,
3219
- "grad_norm": 0.3828125,
3220
- "learning_rate": 0.0005492142980486963,
3221
- "loss": 1.0096,
3222
- "step": 10465
3223
- },
3224
- {
3225
- "epoch": 0.4523224220468366,
3226
- "grad_norm": 0.3984375,
3227
- "learning_rate": 0.0005482213780003453,
3228
- "loss": 0.9882,
3229
- "step": 10488
3230
- },
3231
- {
3232
- "epoch": 0.4533143571829042,
3233
- "grad_norm": 0.390625,
3234
- "learning_rate": 0.0005472284579519945,
3235
- "loss": 0.999,
3236
- "step": 10511
3237
- },
3238
- {
3239
- "epoch": 0.45430629231897185,
3240
- "grad_norm": 0.3984375,
3241
- "learning_rate": 0.0005462355379036436,
3242
- "loss": 1.0087,
3243
- "step": 10534
3244
- },
3245
- {
3246
- "epoch": 0.45529822745503945,
3247
- "grad_norm": 0.388671875,
3248
- "learning_rate": 0.0005452426178552927,
3249
- "loss": 0.9985,
3250
- "step": 10557
3251
- },
3252
- {
3253
- "epoch": 0.4562901625911071,
3254
- "grad_norm": 0.455078125,
3255
- "learning_rate": 0.0005442496978069418,
3256
- "loss": 1.0104,
3257
- "step": 10580
3258
- },
3259
- {
3260
- "epoch": 0.4572820977271747,
3261
- "grad_norm": 0.61328125,
3262
- "learning_rate": 0.0005432567777585909,
3263
- "loss": 1.0056,
3264
- "step": 10603
3265
- },
3266
- {
3267
- "epoch": 0.4582740328632423,
3268
- "grad_norm": 0.3359375,
3269
- "learning_rate": 0.00054226385771024,
3270
- "loss": 1.0115,
3271
- "step": 10626
3272
- },
3273
- {
3274
- "epoch": 0.45926596799930997,
3275
- "grad_norm": 0.3515625,
3276
- "learning_rate": 0.0005412709376618892,
3277
- "loss": 1.0143,
3278
- "step": 10649
3279
- },
3280
- {
3281
- "epoch": 0.46025790313537757,
3282
- "grad_norm": 0.388671875,
3283
- "learning_rate": 0.0005402780176135383,
3284
- "loss": 0.9916,
3285
- "step": 10672
3286
- },
3287
- {
3288
- "epoch": 0.4612498382714452,
3289
- "grad_norm": 0.396484375,
3290
- "learning_rate": 0.0005392850975651874,
3291
- "loss": 0.9967,
3292
- "step": 10695
3293
- },
3294
- {
3295
- "epoch": 0.4622417734075128,
3296
- "grad_norm": 0.41796875,
3297
- "learning_rate": 0.0005382921775168364,
3298
- "loss": 1.0009,
3299
- "step": 10718
3300
- },
3301
- {
3302
- "epoch": 0.4632337085435805,
3303
- "grad_norm": 0.34765625,
3304
- "learning_rate": 0.0005372992574684856,
3305
- "loss": 0.9919,
3306
- "step": 10741
3307
- },
3308
- {
3309
- "epoch": 0.4642256436796481,
3310
- "grad_norm": 0.3515625,
3311
- "learning_rate": 0.0005363063374201347,
3312
- "loss": 1.0128,
3313
- "step": 10764
3314
- },
3315
- {
3316
- "epoch": 0.4652175788157157,
3317
- "grad_norm": 0.400390625,
3318
- "learning_rate": 0.0005353134173717839,
3319
- "loss": 0.9982,
3320
- "step": 10787
3321
- },
3322
- {
3323
- "epoch": 0.46620951395178334,
3324
- "grad_norm": 0.3515625,
3325
- "learning_rate": 0.0005343204973234329,
3326
- "loss": 0.9998,
3327
- "step": 10810
3328
- },
3329
- {
3330
- "epoch": 0.46720144908785094,
3331
- "grad_norm": 0.5390625,
3332
- "learning_rate": 0.0005333275772750821,
3333
- "loss": 1.0177,
3334
- "step": 10833
3335
- },
3336
- {
3337
- "epoch": 0.4681933842239186,
3338
- "grad_norm": 0.37890625,
3339
- "learning_rate": 0.0005323346572267311,
3340
- "loss": 0.9899,
3341
- "step": 10856
3342
- },
3343
- {
3344
- "epoch": 0.4691853193599862,
3345
- "grad_norm": 0.38671875,
3346
- "learning_rate": 0.0005313417371783802,
3347
- "loss": 1.0052,
3348
- "step": 10879
3349
- },
3350
- {
3351
- "epoch": 0.4701772544960538,
3352
- "grad_norm": 0.36328125,
3353
- "learning_rate": 0.0005303488171300294,
3354
- "loss": 0.9741,
3355
- "step": 10902
3356
- },
3357
- {
3358
- "epoch": 0.47116918963212145,
3359
- "grad_norm": 0.4453125,
3360
- "learning_rate": 0.0005293558970816785,
3361
- "loss": 1.0021,
3362
- "step": 10925
3363
- },
3364
- {
3365
- "epoch": 0.47216112476818906,
3366
- "grad_norm": 0.322265625,
3367
- "learning_rate": 0.0005283629770333276,
3368
- "loss": 0.9896,
3369
- "step": 10948
3370
- },
3371
- {
3372
- "epoch": 0.4731530599042567,
3373
- "grad_norm": 0.36328125,
3374
- "learning_rate": 0.0005273700569849767,
3375
- "loss": 1.0046,
3376
- "step": 10971
3377
- },
3378
- {
3379
- "epoch": 0.4741449950403243,
3380
- "grad_norm": 0.345703125,
3381
- "learning_rate": 0.0005263771369366258,
3382
- "loss": 1.0004,
3383
- "step": 10994
3384
- },
3385
- {
3386
- "epoch": 0.47513693017639197,
3387
- "grad_norm": 0.357421875,
3388
- "learning_rate": 0.0005253842168882749,
3389
- "loss": 1.0031,
3390
- "step": 11017
3391
- },
3392
- {
3393
- "epoch": 0.47612886531245957,
3394
- "grad_norm": 0.359375,
3395
- "learning_rate": 0.0005243912968399241,
3396
- "loss": 1.007,
3397
- "step": 11040
3398
- },
3399
- {
3400
- "epoch": 0.47712080044852717,
3401
- "grad_norm": 0.38671875,
3402
- "learning_rate": 0.0005233983767915731,
3403
- "loss": 1.0046,
3404
- "step": 11063
3405
- },
3406
- {
3407
- "epoch": 0.47811273558459483,
3408
- "grad_norm": 0.341796875,
3409
- "learning_rate": 0.0005224054567432222,
3410
- "loss": 0.9956,
3411
- "step": 11086
3412
- },
3413
- {
3414
- "epoch": 0.47910467072066243,
3415
- "grad_norm": 0.3515625,
3416
- "learning_rate": 0.0005214125366948713,
3417
- "loss": 1.01,
3418
- "step": 11109
3419
- },
3420
- {
3421
- "epoch": 0.4800966058567301,
3422
- "grad_norm": 0.431640625,
3423
- "learning_rate": 0.0005204196166465205,
3424
- "loss": 1.0211,
3425
- "step": 11132
3426
- },
3427
- {
3428
- "epoch": 0.4810885409927977,
3429
- "grad_norm": 0.375,
3430
- "learning_rate": 0.0005194266965981696,
3431
- "loss": 1.0039,
3432
- "step": 11155
3433
- },
3434
- {
3435
- "epoch": 0.4820804761288653,
3436
- "grad_norm": 0.392578125,
3437
- "learning_rate": 0.0005184337765498187,
3438
- "loss": 0.9886,
3439
- "step": 11178
3440
- },
3441
- {
3442
- "epoch": 0.48307241126493294,
3443
- "grad_norm": 0.515625,
3444
- "learning_rate": 0.0005174408565014678,
3445
- "loss": 0.9973,
3446
- "step": 11201
3447
- },
3448
- {
3449
- "epoch": 0.48406434640100054,
3450
- "grad_norm": 0.396484375,
3451
- "learning_rate": 0.0005164479364531169,
3452
- "loss": 1.013,
3453
- "step": 11224
3454
- },
3455
- {
3456
- "epoch": 0.4850562815370682,
3457
- "grad_norm": 0.451171875,
3458
- "learning_rate": 0.000515455016404766,
3459
- "loss": 0.9876,
3460
- "step": 11247
3461
- },
3462
- {
3463
- "epoch": 0.4860482166731358,
3464
- "grad_norm": 0.375,
3465
- "learning_rate": 0.0005144620963564152,
3466
- "loss": 0.9984,
3467
- "step": 11270
3468
- },
3469
- {
3470
- "epoch": 0.48704015180920346,
3471
- "grad_norm": 0.416015625,
3472
- "learning_rate": 0.0005134691763080642,
3473
- "loss": 0.9875,
3474
- "step": 11293
3475
- },
3476
- {
3477
- "epoch": 0.48803208694527106,
3478
- "grad_norm": 0.369140625,
3479
- "learning_rate": 0.0005124762562597134,
3480
- "loss": 0.9954,
3481
- "step": 11316
3482
- },
3483
- {
3484
- "epoch": 0.48902402208133866,
3485
- "grad_norm": 0.337890625,
3486
- "learning_rate": 0.0005114833362113624,
3487
- "loss": 0.9825,
3488
- "step": 11339
3489
- },
3490
- {
3491
- "epoch": 0.4900159572174063,
3492
- "grad_norm": 0.37890625,
3493
- "learning_rate": 0.0005104904161630117,
3494
- "loss": 0.9983,
3495
- "step": 11362
3496
- },
3497
- {
3498
- "epoch": 0.4910078923534739,
3499
- "grad_norm": 0.328125,
3500
- "learning_rate": 0.0005094974961146607,
3501
- "loss": 0.9818,
3502
- "step": 11385
3503
- },
3504
- {
3505
- "epoch": 0.4919998274895416,
3506
- "grad_norm": 0.357421875,
3507
- "learning_rate": 0.0005085045760663098,
3508
- "loss": 0.9928,
3509
- "step": 11408
3510
- },
3511
- {
3512
- "epoch": 0.4929917626256092,
3513
- "grad_norm": 0.4921875,
3514
- "learning_rate": 0.0005075116560179589,
3515
- "loss": 0.9771,
3516
- "step": 11431
3517
- },
3518
- {
3519
- "epoch": 0.4939836977616768,
3520
- "grad_norm": 0.341796875,
3521
- "learning_rate": 0.000506518735969608,
3522
- "loss": 1.0059,
3523
- "step": 11454
3524
- },
3525
- {
3526
- "epoch": 0.49497563289774443,
3527
- "grad_norm": 0.400390625,
3528
- "learning_rate": 0.0005055258159212571,
3529
- "loss": 1.0058,
3530
- "step": 11477
3531
- },
3532
- {
3533
- "epoch": 0.49596756803381203,
3534
- "grad_norm": 0.328125,
3535
- "learning_rate": 0.0005045328958729063,
3536
- "loss": 0.9962,
3537
- "step": 11500
3538
- },
3539
- {
3540
- "epoch": 0.4969595031698797,
3541
- "grad_norm": 0.326171875,
3542
- "learning_rate": 0.0005035399758245554,
3543
- "loss": 0.9828,
3544
- "step": 11523
3545
- },
3546
- {
3547
- "epoch": 0.4979514383059473,
3548
- "grad_norm": 0.5078125,
3549
- "learning_rate": 0.0005025470557762045,
3550
- "loss": 0.9881,
3551
- "step": 11546
3552
- },
3553
- {
3554
- "epoch": 0.49894337344201495,
3555
- "grad_norm": 0.43359375,
3556
- "learning_rate": 0.0005015541357278536,
3557
- "loss": 0.9863,
3558
- "step": 11569
3559
- },
3560
- {
3561
- "epoch": 0.49984905334885926,
3562
- "eval_runtime": 163.9862,
3563
- "eval_samples_per_second": 609.807,
3564
- "eval_steps_per_second": 7.623,
3565
- "step": 11590
3566
- },
3567
- {
3568
- "epoch": 0.49993530857808255,
3569
- "grad_norm": 0.353515625,
3570
- "learning_rate": 0.0005005612156795026,
3571
- "loss": 0.9764,
3572
- "step": 11592
3573
- },
3574
- {
3575
- "epoch": 0.5009272437141502,
3576
- "grad_norm": 0.36328125,
3577
- "learning_rate": 0.0004995682956311518,
3578
- "loss": 0.9923,
3579
- "step": 11615
3580
- },
3581
- {
3582
- "epoch": 0.5019191788502178,
3583
- "grad_norm": 0.39453125,
3584
- "learning_rate": 0.0004985753755828009,
3585
- "loss": 0.9738,
3586
- "step": 11638
3587
- },
3588
- {
3589
- "epoch": 0.5029111139862854,
3590
- "grad_norm": 0.48828125,
3591
- "learning_rate": 0.00049758245553445,
3592
- "loss": 0.973,
3593
- "step": 11661
3594
- },
3595
- {
3596
- "epoch": 0.5039030491223531,
3597
- "grad_norm": 0.384765625,
3598
- "learning_rate": 0.0004965895354860991,
3599
- "loss": 0.9741,
3600
- "step": 11684
3601
- },
3602
- {
3603
- "epoch": 0.5048949842584207,
3604
- "grad_norm": 0.359375,
3605
- "learning_rate": 0.0004955966154377482,
3606
- "loss": 0.9842,
3607
- "step": 11707
3608
- },
3609
- {
3610
- "epoch": 0.5058869193944883,
3611
- "grad_norm": 0.443359375,
3612
- "learning_rate": 0.0004946036953893974,
3613
- "loss": 0.9927,
3614
- "step": 11730
3615
- },
3616
- {
3617
- "epoch": 0.5068788545305559,
3618
- "grad_norm": 0.44921875,
3619
- "learning_rate": 0.0004936107753410465,
3620
- "loss": 0.9921,
3621
- "step": 11753
3622
- },
3623
- {
3624
- "epoch": 0.5078707896666236,
3625
- "grad_norm": 0.40625,
3626
- "learning_rate": 0.0004926178552926956,
3627
- "loss": 0.9827,
3628
- "step": 11776
3629
- },
3630
- {
3631
- "epoch": 0.5088627248026911,
3632
- "grad_norm": 0.416015625,
3633
- "learning_rate": 0.0004916249352443447,
3634
- "loss": 0.9836,
3635
- "step": 11799
3636
- },
3637
- {
3638
- "epoch": 0.5098546599387588,
3639
- "grad_norm": 0.36328125,
3640
- "learning_rate": 0.0004906320151959938,
3641
- "loss": 0.9783,
3642
- "step": 11822
3643
- },
3644
- {
3645
- "epoch": 0.5108465950748264,
3646
- "grad_norm": 0.357421875,
3647
- "learning_rate": 0.0004896390951476428,
3648
- "loss": 1.0003,
3649
- "step": 11845
3650
- },
3651
- {
3652
- "epoch": 0.511838530210894,
3653
- "grad_norm": 0.39453125,
3654
- "learning_rate": 0.000488646175099292,
3655
- "loss": 0.995,
3656
- "step": 11868
3657
- },
3658
- {
3659
- "epoch": 0.5128304653469616,
3660
- "grad_norm": 0.376953125,
3661
- "learning_rate": 0.0004876532550509411,
3662
- "loss": 0.9952,
3663
- "step": 11891
3664
- },
3665
- {
3666
- "epoch": 0.5138224004830293,
3667
- "grad_norm": 0.3828125,
3668
- "learning_rate": 0.0004866603350025902,
3669
- "loss": 0.9912,
3670
- "step": 11914
3671
- },
3672
- {
3673
- "epoch": 0.514814335619097,
3674
- "grad_norm": 0.34375,
3675
- "learning_rate": 0.00048566741495423933,
3676
- "loss": 0.995,
3677
- "step": 11937
3678
- },
3679
- {
3680
- "epoch": 0.5158062707551645,
3681
- "grad_norm": 0.408203125,
3682
- "learning_rate": 0.00048467449490588845,
3683
- "loss": 0.9856,
3684
- "step": 11960
3685
- },
3686
- {
3687
- "epoch": 0.5167982058912322,
3688
- "grad_norm": 0.427734375,
3689
- "learning_rate": 0.00048368157485753757,
3690
- "loss": 0.9887,
3691
- "step": 11983
3692
- },
3693
- {
3694
- "epoch": 0.5177901410272998,
3695
- "grad_norm": 0.376953125,
3696
- "learning_rate": 0.0004826886548091867,
3697
- "loss": 0.9815,
3698
- "step": 12006
3699
- },
3700
- {
3701
- "epoch": 0.5187820761633674,
3702
- "grad_norm": 0.369140625,
3703
- "learning_rate": 0.00048169573476083575,
3704
- "loss": 0.9944,
3705
- "step": 12029
3706
- },
3707
- {
3708
- "epoch": 0.519774011299435,
3709
- "grad_norm": 0.427734375,
3710
- "learning_rate": 0.0004807028147124849,
3711
- "loss": 0.9714,
3712
- "step": 12052
3713
- },
3714
- {
3715
- "epoch": 0.5207659464355027,
3716
- "grad_norm": 0.326171875,
3717
- "learning_rate": 0.000479709894664134,
3718
- "loss": 0.9849,
3719
- "step": 12075
3720
- },
3721
- {
3722
- "epoch": 0.5217578815715703,
3723
- "grad_norm": 0.427734375,
3724
- "learning_rate": 0.0004787169746157831,
3725
- "loss": 0.9861,
3726
- "step": 12098
3727
- },
3728
- {
3729
- "epoch": 0.5227498167076379,
3730
- "grad_norm": 0.4765625,
3731
- "learning_rate": 0.00047772405456743223,
3732
- "loss": 1.0009,
3733
- "step": 12121
3734
- },
3735
- {
3736
- "epoch": 0.5237417518437055,
3737
- "grad_norm": 0.345703125,
3738
- "learning_rate": 0.00047673113451908135,
3739
- "loss": 0.9892,
3740
- "step": 12144
3741
- },
3742
- {
3743
- "epoch": 0.5247336869797732,
3744
- "grad_norm": 0.345703125,
3745
- "learning_rate": 0.00047573821447073047,
3746
- "loss": 0.9843,
3747
- "step": 12167
3748
- },
3749
- {
3750
- "epoch": 0.5257256221158407,
3751
- "grad_norm": 0.40234375,
3752
- "learning_rate": 0.0004747452944223796,
3753
- "loss": 0.9767,
3754
- "step": 12190
3755
- },
3756
- {
3757
- "epoch": 0.5267175572519084,
3758
- "grad_norm": 0.359375,
3759
- "learning_rate": 0.00047375237437402866,
3760
- "loss": 0.9599,
3761
- "step": 12213
3762
- },
3763
- {
3764
- "epoch": 0.527709492387976,
3765
- "grad_norm": 0.388671875,
3766
- "learning_rate": 0.0004727594543256778,
3767
- "loss": 0.9797,
3768
- "step": 12236
3769
- },
3770
- {
3771
- "epoch": 0.5287014275240437,
3772
- "grad_norm": 0.3359375,
3773
- "learning_rate": 0.0004717665342773269,
3774
- "loss": 0.9952,
3775
- "step": 12259
3776
- },
3777
- {
3778
- "epoch": 0.5296933626601112,
3779
- "grad_norm": 0.359375,
3780
- "learning_rate": 0.000470773614228976,
3781
- "loss": 0.9851,
3782
- "step": 12282
3783
- },
3784
- {
3785
- "epoch": 0.5306852977961789,
3786
- "grad_norm": 0.4140625,
3787
- "learning_rate": 0.00046978069418062514,
3788
- "loss": 0.9728,
3789
- "step": 12305
3790
- },
3791
- {
3792
- "epoch": 0.5316772329322466,
3793
- "grad_norm": 0.376953125,
3794
- "learning_rate": 0.00046878777413227426,
3795
- "loss": 0.9813,
3796
- "step": 12328
3797
- },
3798
- {
3799
- "epoch": 0.5326691680683141,
3800
- "grad_norm": 0.326171875,
3801
- "learning_rate": 0.0004677948540839234,
3802
- "loss": 0.9729,
3803
- "step": 12351
3804
- },
3805
- {
3806
- "epoch": 0.5336611032043818,
3807
- "grad_norm": 0.33203125,
3808
- "learning_rate": 0.0004668019340355725,
3809
- "loss": 0.969,
3810
- "step": 12374
3811
- },
3812
- {
3813
- "epoch": 0.5346530383404494,
3814
- "grad_norm": 0.43359375,
3815
- "learning_rate": 0.00046580901398722156,
3816
- "loss": 0.9786,
3817
- "step": 12397
3818
- },
3819
- {
3820
- "epoch": 0.535644973476517,
3821
- "grad_norm": 0.388671875,
3822
- "learning_rate": 0.00046481609393887063,
3823
- "loss": 0.9773,
3824
- "step": 12420
3825
- },
3826
- {
3827
- "epoch": 0.5366369086125846,
3828
- "grad_norm": 0.451171875,
3829
- "learning_rate": 0.00046382317389051975,
3830
- "loss": 0.9972,
3831
- "step": 12443
3832
- },
3833
- {
3834
- "epoch": 0.5376288437486523,
3835
- "grad_norm": 0.408203125,
3836
- "learning_rate": 0.00046283025384216887,
3837
- "loss": 0.9893,
3838
- "step": 12466
3839
- },
3840
- {
3841
- "epoch": 0.5386207788847199,
3842
- "grad_norm": 0.400390625,
3843
- "learning_rate": 0.000461837333793818,
3844
- "loss": 0.9747,
3845
- "step": 12489
3846
- },
3847
- {
3848
- "epoch": 0.5396127140207875,
3849
- "grad_norm": 0.4921875,
3850
- "learning_rate": 0.0004608444137454671,
3851
- "loss": 0.9795,
3852
- "step": 12512
3853
- },
3854
- {
3855
- "epoch": 0.5406046491568551,
3856
- "grad_norm": 0.37109375,
3857
- "learning_rate": 0.00045985149369711623,
3858
- "loss": 0.9608,
3859
- "step": 12535
3860
- },
3861
- {
3862
- "epoch": 0.5415965842929228,
3863
- "grad_norm": 0.3515625,
3864
- "learning_rate": 0.00045885857364876535,
3865
- "loss": 0.966,
3866
- "step": 12558
3867
- },
3868
- {
3869
- "epoch": 0.5425885194289903,
3870
- "grad_norm": 0.361328125,
3871
- "learning_rate": 0.0004578656536004144,
3872
- "loss": 0.9689,
3873
- "step": 12581
3874
- },
3875
- {
3876
- "epoch": 0.543580454565058,
3877
- "grad_norm": 0.404296875,
3878
- "learning_rate": 0.00045687273355206353,
3879
- "loss": 0.9655,
3880
- "step": 12604
3881
- },
3882
- {
3883
- "epoch": 0.5445723897011256,
3884
- "grad_norm": 0.37890625,
3885
- "learning_rate": 0.00045587981350371265,
3886
- "loss": 0.9693,
3887
- "step": 12627
3888
- },
3889
- {
3890
- "epoch": 0.5455643248371933,
3891
- "grad_norm": 0.41015625,
3892
- "learning_rate": 0.00045488689345536177,
3893
- "loss": 0.9986,
3894
- "step": 12650
3895
- },
3896
- {
3897
- "epoch": 0.5465562599732608,
3898
- "grad_norm": 0.345703125,
3899
- "learning_rate": 0.0004538939734070109,
3900
- "loss": 0.9715,
3901
- "step": 12673
3902
- },
3903
- {
3904
- "epoch": 0.5475481951093285,
3905
- "grad_norm": 0.37890625,
3906
- "learning_rate": 0.00045290105335866,
3907
- "loss": 0.9781,
3908
- "step": 12696
3909
- },
3910
- {
3911
- "epoch": 0.5485401302453962,
3912
- "grad_norm": 0.42578125,
3913
- "learning_rate": 0.00045190813331030913,
3914
- "loss": 1.0001,
3915
- "step": 12719
3916
- },
3917
- {
3918
- "epoch": 0.5495320653814637,
3919
- "grad_norm": 0.43359375,
3920
- "learning_rate": 0.0004509152132619582,
3921
- "loss": 0.9811,
3922
- "step": 12742
3923
- },
3924
- {
3925
- "epoch": 0.5505240005175314,
3926
- "grad_norm": 0.341796875,
3927
- "learning_rate": 0.0004499222932136073,
3928
- "loss": 0.9584,
3929
- "step": 12765
3930
- },
3931
- {
3932
- "epoch": 0.551515935653599,
3933
- "grad_norm": 0.419921875,
3934
- "learning_rate": 0.00044892937316525644,
3935
- "loss": 0.977,
3936
- "step": 12788
3937
- },
3938
- {
3939
- "epoch": 0.5525078707896667,
3940
- "grad_norm": 0.416015625,
3941
- "learning_rate": 0.00044793645311690556,
3942
- "loss": 0.9746,
3943
- "step": 12811
3944
- },
3945
- {
3946
- "epoch": 0.5534998059257342,
3947
- "grad_norm": 0.390625,
3948
- "learning_rate": 0.0004469435330685547,
3949
- "loss": 0.9811,
3950
- "step": 12834
3951
- },
3952
- {
3953
- "epoch": 0.5544917410618019,
3954
- "grad_norm": 0.35546875,
3955
- "learning_rate": 0.0004459506130202038,
3956
- "loss": 0.9523,
3957
- "step": 12857
3958
- },
3959
- {
3960
- "epoch": 0.5554836761978695,
3961
- "grad_norm": 0.37890625,
3962
- "learning_rate": 0.0004449576929718529,
3963
- "loss": 0.9641,
3964
- "step": 12880
3965
- },
3966
- {
3967
- "epoch": 0.5564756113339371,
3968
- "grad_norm": 0.36328125,
3969
- "learning_rate": 0.00044396477292350204,
3970
- "loss": 0.9845,
3971
- "step": 12903
3972
- },
3973
- {
3974
- "epoch": 0.5574675464700047,
3975
- "grad_norm": 0.365234375,
3976
- "learning_rate": 0.0004429718528751511,
3977
- "loss": 0.9788,
3978
- "step": 12926
3979
- },
3980
- {
3981
- "epoch": 0.5584594816060724,
3982
- "grad_norm": 0.390625,
3983
- "learning_rate": 0.0004419789328268002,
3984
- "loss": 0.9795,
3985
- "step": 12949
3986
- },
3987
- {
3988
- "epoch": 0.5594514167421399,
3989
- "grad_norm": 0.37109375,
3990
- "learning_rate": 0.00044098601277844934,
3991
- "loss": 0.9716,
3992
- "step": 12972
3993
- },
3994
- {
3995
- "epoch": 0.5604433518782076,
3996
- "grad_norm": 0.38671875,
3997
- "learning_rate": 0.00043999309273009846,
3998
- "loss": 0.9814,
3999
- "step": 12995
4000
- },
4001
- {
4002
- "epoch": 0.5614352870142753,
4003
- "grad_norm": 0.34765625,
4004
- "learning_rate": 0.00043900017268174753,
4005
- "loss": 0.9724,
4006
- "step": 13018
4007
- },
4008
- {
4009
- "epoch": 0.5624272221503429,
4010
- "grad_norm": 0.44921875,
4011
- "learning_rate": 0.00043800725263339665,
4012
- "loss": 0.9538,
4013
- "step": 13041
4014
- },
4015
- {
4016
- "epoch": 0.5634191572864105,
4017
- "grad_norm": 0.3828125,
4018
- "learning_rate": 0.00043701433258504577,
4019
- "loss": 0.9744,
4020
- "step": 13064
4021
- },
4022
- {
4023
- "epoch": 0.5644110924224781,
4024
- "grad_norm": 0.423828125,
4025
- "learning_rate": 0.0004360214125366949,
4026
- "loss": 0.9777,
4027
- "step": 13087
4028
- },
4029
- {
4030
- "epoch": 0.5654030275585458,
4031
- "grad_norm": 0.365234375,
4032
- "learning_rate": 0.00043502849248834395,
4033
- "loss": 0.9688,
4034
- "step": 13110
4035
- },
4036
- {
4037
- "epoch": 0.5663949626946133,
4038
- "grad_norm": 0.470703125,
4039
- "learning_rate": 0.00043403557243999307,
4040
- "loss": 0.988,
4041
- "step": 13133
4042
- },
4043
- {
4044
- "epoch": 0.567386897830681,
4045
- "grad_norm": 0.341796875,
4046
- "learning_rate": 0.0004330426523916422,
4047
- "loss": 0.9678,
4048
- "step": 13156
4049
- },
4050
- {
4051
- "epoch": 0.5683788329667486,
4052
- "grad_norm": 0.345703125,
4053
- "learning_rate": 0.0004320497323432913,
4054
- "loss": 0.9735,
4055
- "step": 13179
4056
- },
4057
- {
4058
- "epoch": 0.5693707681028163,
4059
- "grad_norm": 0.416015625,
4060
- "learning_rate": 0.00043105681229494043,
4061
- "loss": 0.9612,
4062
- "step": 13202
4063
- },
4064
- {
4065
- "epoch": 0.5703627032388838,
4066
- "grad_norm": 0.375,
4067
- "learning_rate": 0.00043006389224658955,
4068
- "loss": 0.9428,
4069
- "step": 13225
4070
- },
4071
- {
4072
- "epoch": 0.5713546383749515,
4073
- "grad_norm": 0.4296875,
4074
- "learning_rate": 0.00042907097219823867,
4075
- "loss": 0.9654,
4076
- "step": 13248
4077
- },
4078
- {
4079
- "epoch": 0.5723465735110191,
4080
- "grad_norm": 0.353515625,
4081
- "learning_rate": 0.0004280780521498878,
4082
- "loss": 0.9739,
4083
- "step": 13271
4084
- },
4085
- {
4086
- "epoch": 0.5733385086470867,
4087
- "grad_norm": 0.380859375,
4088
- "learning_rate": 0.00042708513210153686,
4089
- "loss": 0.9755,
4090
- "step": 13294
4091
- },
4092
- {
4093
- "epoch": 0.5743304437831543,
4094
- "grad_norm": 0.357421875,
4095
- "learning_rate": 0.000426092212053186,
4096
- "loss": 0.9784,
4097
- "step": 13317
4098
- },
4099
- {
4100
- "epoch": 0.575322378919222,
4101
- "grad_norm": 0.3125,
4102
- "learning_rate": 0.0004250992920048351,
4103
- "loss": 0.9625,
4104
- "step": 13340
4105
- },
4106
- {
4107
- "epoch": 0.5763143140552897,
4108
- "grad_norm": 0.345703125,
4109
- "learning_rate": 0.0004241063719564842,
4110
- "loss": 0.9521,
4111
- "step": 13363
4112
- },
4113
- {
4114
- "epoch": 0.5773062491913572,
4115
- "grad_norm": 0.333984375,
4116
- "learning_rate": 0.00042311345190813334,
4117
- "loss": 0.984,
4118
- "step": 13386
4119
- },
4120
- {
4121
- "epoch": 0.5782981843274249,
4122
- "grad_norm": 0.45703125,
4123
- "learning_rate": 0.00042212053185978246,
4124
- "loss": 0.9794,
4125
- "step": 13409
4126
- },
4127
- {
4128
- "epoch": 0.5792901194634925,
4129
- "grad_norm": 0.396484375,
4130
- "learning_rate": 0.0004211276118114316,
4131
- "loss": 0.9705,
4132
- "step": 13432
4133
- },
4134
- {
4135
- "epoch": 0.5802820545995601,
4136
- "grad_norm": 0.400390625,
4137
- "learning_rate": 0.00042013469176308064,
4138
- "loss": 0.97,
4139
- "step": 13455
4140
- },
4141
- {
4142
- "epoch": 0.5812739897356277,
4143
- "grad_norm": 0.37890625,
4144
- "learning_rate": 0.00041914177171472976,
4145
- "loss": 0.968,
4146
- "step": 13478
4147
- },
4148
- {
4149
- "epoch": 0.5822659248716954,
4150
- "grad_norm": 0.365234375,
4151
- "learning_rate": 0.0004181488516663789,
4152
- "loss": 0.9664,
4153
- "step": 13501
4154
- },
4155
- {
4156
- "epoch": 0.5832578600077629,
4157
- "grad_norm": 0.361328125,
4158
- "learning_rate": 0.000417155931618028,
4159
- "loss": 0.9722,
4160
- "step": 13524
4161
- },
4162
- {
4163
- "epoch": 0.5842497951438306,
4164
- "grad_norm": 0.369140625,
4165
- "learning_rate": 0.0004161630115696771,
4166
- "loss": 0.9695,
4167
- "step": 13547
4168
- },
4169
- {
4170
- "epoch": 0.5852417302798982,
4171
- "grad_norm": 0.337890625,
4172
- "learning_rate": 0.00041517009152132624,
4173
- "loss": 0.9628,
4174
- "step": 13570
4175
- },
4176
- {
4177
- "epoch": 0.5862336654159659,
4178
- "grad_norm": 0.330078125,
4179
- "learning_rate": 0.0004141771714729753,
4180
- "loss": 0.9515,
4181
- "step": 13593
4182
- },
4183
- {
4184
- "epoch": 0.5872256005520334,
4185
- "grad_norm": 0.359375,
4186
- "learning_rate": 0.0004131842514246244,
4187
- "loss": 0.965,
4188
- "step": 13616
4189
- },
4190
- {
4191
- "epoch": 0.5882175356881011,
4192
- "grad_norm": 0.392578125,
4193
- "learning_rate": 0.0004121913313762735,
4194
- "loss": 0.9598,
4195
- "step": 13639
4196
- },
4197
- {
4198
- "epoch": 0.5892094708241687,
4199
- "grad_norm": 0.41796875,
4200
- "learning_rate": 0.0004111984113279226,
4201
- "loss": 0.9575,
4202
- "step": 13662
4203
- },
4204
- {
4205
- "epoch": 0.5902014059602363,
4206
- "grad_norm": 0.5234375,
4207
- "learning_rate": 0.00041020549127957173,
4208
- "loss": 0.9933,
4209
- "step": 13685
4210
- },
4211
- {
4212
- "epoch": 0.591193341096304,
4213
- "grad_norm": 0.423828125,
4214
- "learning_rate": 0.00040921257123122085,
4215
- "loss": 0.9621,
4216
- "step": 13708
4217
- },
4218
- {
4219
- "epoch": 0.5921852762323716,
4220
- "grad_norm": 0.33203125,
4221
- "learning_rate": 0.00040821965118286997,
4222
- "loss": 0.964,
4223
- "step": 13731
4224
- },
4225
- {
4226
- "epoch": 0.5931772113684393,
4227
- "grad_norm": 0.423828125,
4228
- "learning_rate": 0.0004072267311345191,
4229
- "loss": 0.9854,
4230
- "step": 13754
4231
- },
4232
- {
4233
- "epoch": 0.5941691465045068,
4234
- "grad_norm": 0.3515625,
4235
- "learning_rate": 0.0004062338110861682,
4236
- "loss": 0.9883,
4237
- "step": 13777
4238
- },
4239
- {
4240
- "epoch": 0.5951610816405745,
4241
- "grad_norm": 0.408203125,
4242
- "learning_rate": 0.00040524089103781733,
4243
- "loss": 0.9853,
4244
- "step": 13800
4245
- },
4246
- {
4247
- "epoch": 0.5961530167766421,
4248
- "grad_norm": 0.408203125,
4249
- "learning_rate": 0.0004042479709894664,
4250
- "loss": 0.9557,
4251
- "step": 13823
4252
- },
4253
- {
4254
- "epoch": 0.5971449519127097,
4255
- "grad_norm": 0.42578125,
4256
- "learning_rate": 0.0004032550509411155,
4257
- "loss": 0.9587,
4258
- "step": 13846
4259
- },
4260
- {
4261
- "epoch": 0.5981368870487773,
4262
- "grad_norm": 0.44921875,
4263
- "learning_rate": 0.00040226213089276464,
4264
- "loss": 0.9771,
4265
- "step": 13869
4266
- },
4267
- {
4268
- "epoch": 0.599128822184845,
4269
- "grad_norm": 0.431640625,
4270
- "learning_rate": 0.00040126921084441376,
4271
- "loss": 0.9661,
4272
- "step": 13892
4273
- },
4274
- {
4275
- "epoch": 0.5998188640186312,
4276
- "eval_runtime": 163.7921,
4277
- "eval_samples_per_second": 610.53,
4278
- "eval_steps_per_second": 7.632,
4279
- "step": 13908
4280
  }
4281
  ],
4282
  "logging_steps": 23,
@@ -4296,7 +2875,7 @@
4296
  "attributes": {}
4297
  }
4298
  },
4299
- "total_flos": 1.0167159364234772e+18,
4300
  "train_batch_size": 8,
4301
  "trial_name": null,
4302
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3998792426790874,
5
  "eval_steps": 2318,
6
+ "global_step": 9272,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2856
  "eval_samples_per_second": 610.045,
2857
  "eval_steps_per_second": 7.626,
2858
  "step": 9272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2859
  }
2860
  ],
2861
  "logging_steps": 23,
 
2875
  "attributes": {}
2876
  }
2877
  },
2878
+ "total_flos": 6.778106242599485e+17,
2879
  "train_batch_size": 8,
2880
  "trial_name": null,
2881
  "trial_params": null