jasong03 commited on
Commit
d3078ce
·
verified ·
1 Parent(s): 6420aaa

Training in progress, step 3200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2d30406d7c467767499ef4fa93e0814e4b9c839e83cc877c2f8eaaf710781d1
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acde9c308eddee03ae7ba07078f126ecbfbf189649125ba5e28eb98b2eb7a498
3
  size 891644712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d95e37a4b2b1e8065e39772f0ad6f60340e21cd105dce5b06f199029f9a8d550
3
  size 1783444794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69a94b5c388ad02488cfa16d32d05e88a60512f1756f067232047f67b1bbc1d7
3
  size 1783444794
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d2f5ef411fc40d8d9b3f53029f0d2bde94e51e311c130b07e4428069fee892d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a9ec26c805fc0805503b452ed1d7a3e08af9f21c7d994d43e4705d7fe6b69c0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3543c156e7d71b14c19e0a0a6a897c5b126e8bc6938f4ff38dab3dadeb331bb4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ae1375ade70d0aa6318948d7a88aecd14c5ea3b408d7a30a7af5ef14aa83d44
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.7038269550748752,
5
  "eval_steps": 500,
6
- "global_step": 3072,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -10767,6 +10767,454 @@
10767
  "learning_rate": 1.128797253211723e-05,
10768
  "loss": 0.3036,
10769
  "step": 3072
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10770
  }
10771
  ],
10772
  "logging_steps": 2,
@@ -10786,7 +11234,7 @@
10786
  "attributes": {}
10787
  }
10788
  },
10789
- "total_flos": 7482417840783360.0,
10790
  "train_batch_size": 8,
10791
  "trial_name": null,
10792
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.7748197448696617,
5
  "eval_steps": 500,
6
+ "global_step": 3200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
10767
  "learning_rate": 1.128797253211723e-05,
10768
  "loss": 0.3036,
10769
  "step": 3072
10770
+ },
10771
+ {
10772
+ "epoch": 1.7049362174154188,
10773
+ "grad_norm": 0.24164661765098572,
10774
+ "learning_rate": 1.120518952483819e-05,
10775
+ "loss": 0.3209,
10776
+ "step": 3074
10777
+ },
10778
+ {
10779
+ "epoch": 1.7060454797559623,
10780
+ "grad_norm": 0.34098076820373535,
10781
+ "learning_rate": 1.1122693170195164e-05,
10782
+ "loss": 0.446,
10783
+ "step": 3076
10784
+ },
10785
+ {
10786
+ "epoch": 1.7071547420965059,
10787
+ "grad_norm": 0.3181568384170532,
10788
+ "learning_rate": 1.1040483734507789e-05,
10789
+ "loss": 0.3758,
10790
+ "step": 3078
10791
+ },
10792
+ {
10793
+ "epoch": 1.7082640044370494,
10794
+ "grad_norm": 0.2597646415233612,
10795
+ "learning_rate": 1.095856148316936e-05,
10796
+ "loss": 0.35,
10797
+ "step": 3080
10798
+ },
10799
+ {
10800
+ "epoch": 1.709373266777593,
10801
+ "grad_norm": 0.27917012572288513,
10802
+ "learning_rate": 1.087692668064616e-05,
10803
+ "loss": 0.3619,
10804
+ "step": 3082
10805
+ },
10806
+ {
10807
+ "epoch": 1.7104825291181365,
10808
+ "grad_norm": 0.2992468774318695,
10809
+ "learning_rate": 1.0795579590476445e-05,
10810
+ "loss": 0.37,
10811
+ "step": 3084
10812
+ },
10813
+ {
10814
+ "epoch": 1.71159179145868,
10815
+ "grad_norm": 0.3110543191432953,
10816
+ "learning_rate": 1.0714520475269652e-05,
10817
+ "loss": 0.3318,
10818
+ "step": 3086
10819
+ },
10820
+ {
10821
+ "epoch": 1.7127010537992235,
10822
+ "grad_norm": 0.393775075674057,
10823
+ "learning_rate": 1.0633749596705645e-05,
10824
+ "loss": 0.4044,
10825
+ "step": 3088
10826
+ },
10827
+ {
10828
+ "epoch": 1.713810316139767,
10829
+ "grad_norm": 0.32126861810684204,
10830
+ "learning_rate": 1.055326721553368e-05,
10831
+ "loss": 0.4077,
10832
+ "step": 3090
10833
+ },
10834
+ {
10835
+ "epoch": 1.7149195784803106,
10836
+ "grad_norm": 0.316629558801651,
10837
+ "learning_rate": 1.0473073591571758e-05,
10838
+ "loss": 0.3887,
10839
+ "step": 3092
10840
+ },
10841
+ {
10842
+ "epoch": 1.7160288408208542,
10843
+ "grad_norm": 0.24358634650707245,
10844
+ "learning_rate": 1.0393168983705626e-05,
10845
+ "loss": 0.3439,
10846
+ "step": 3094
10847
+ },
10848
+ {
10849
+ "epoch": 1.7171381031613977,
10850
+ "grad_norm": 0.3309425413608551,
10851
+ "learning_rate": 1.0313553649888074e-05,
10852
+ "loss": 0.3894,
10853
+ "step": 3096
10854
+ },
10855
+ {
10856
+ "epoch": 1.7182473655019412,
10857
+ "grad_norm": 0.3401065468788147,
10858
+ "learning_rate": 1.0234227847138011e-05,
10859
+ "loss": 0.376,
10860
+ "step": 3098
10861
+ },
10862
+ {
10863
+ "epoch": 1.7193566278424848,
10864
+ "grad_norm": 0.33251863718032837,
10865
+ "learning_rate": 1.0155191831539645e-05,
10866
+ "loss": 0.4203,
10867
+ "step": 3100
10868
+ },
10869
+ {
10870
+ "epoch": 1.7204658901830283,
10871
+ "grad_norm": 0.3005315363407135,
10872
+ "learning_rate": 1.0076445858241679e-05,
10873
+ "loss": 0.2993,
10874
+ "step": 3102
10875
+ },
10876
+ {
10877
+ "epoch": 1.7215751525235718,
10878
+ "grad_norm": 0.2971371114253998,
10879
+ "learning_rate": 9.997990181456528e-06,
10880
+ "loss": 0.3881,
10881
+ "step": 3104
10882
+ },
10883
+ {
10884
+ "epoch": 1.7226844148641154,
10885
+ "grad_norm": 0.2904921770095825,
10886
+ "learning_rate": 9.919825054459442e-06,
10887
+ "loss": 0.3812,
10888
+ "step": 3106
10889
+ },
10890
+ {
10891
+ "epoch": 1.723793677204659,
10892
+ "grad_norm": 0.3357609212398529,
10893
+ "learning_rate": 9.841950729587668e-06,
10894
+ "loss": 0.4121,
10895
+ "step": 3108
10896
+ },
10897
+ {
10898
+ "epoch": 1.7249029395452025,
10899
+ "grad_norm": 0.2711123526096344,
10900
+ "learning_rate": 9.764367458239677e-06,
10901
+ "loss": 0.3789,
10902
+ "step": 3110
10903
+ },
10904
+ {
10905
+ "epoch": 1.726012201885746,
10906
+ "grad_norm": 0.24408982694149017,
10907
+ "learning_rate": 9.687075490874376e-06,
10908
+ "loss": 0.3457,
10909
+ "step": 3112
10910
+ },
10911
+ {
10912
+ "epoch": 1.7271214642262895,
10913
+ "grad_norm": 0.25458860397338867,
10914
+ "learning_rate": 9.61007507701024e-06,
10915
+ "loss": 0.3098,
10916
+ "step": 3114
10917
+ },
10918
+ {
10919
+ "epoch": 1.728230726566833,
10920
+ "grad_norm": 0.2704317569732666,
10921
+ "learning_rate": 9.533366465224514e-06,
10922
+ "loss": 0.3471,
10923
+ "step": 3116
10924
+ },
10925
+ {
10926
+ "epoch": 1.7293399889073766,
10927
+ "grad_norm": 0.2258918136358261,
10928
+ "learning_rate": 9.456949903152478e-06,
10929
+ "loss": 0.4087,
10930
+ "step": 3118
10931
+ },
10932
+ {
10933
+ "epoch": 1.7304492512479202,
10934
+ "grad_norm": 0.20709431171417236,
10935
+ "learning_rate": 9.38082563748659e-06,
10936
+ "loss": 0.3383,
10937
+ "step": 3120
10938
+ },
10939
+ {
10940
+ "epoch": 1.7315585135884637,
10941
+ "grad_norm": 0.24197116494178772,
10942
+ "learning_rate": 9.30499391397568e-06,
10943
+ "loss": 0.3323,
10944
+ "step": 3122
10945
+ },
10946
+ {
10947
+ "epoch": 1.7326677759290072,
10948
+ "grad_norm": 0.30395829677581787,
10949
+ "learning_rate": 9.229454977424157e-06,
10950
+ "loss": 0.378,
10951
+ "step": 3124
10952
+ },
10953
+ {
10954
+ "epoch": 1.7337770382695508,
10955
+ "grad_norm": 0.2813956141471863,
10956
+ "learning_rate": 9.154209071691289e-06,
10957
+ "loss": 0.3326,
10958
+ "step": 3126
10959
+ },
10960
+ {
10961
+ "epoch": 1.7348863006100943,
10962
+ "grad_norm": 0.3281961679458618,
10963
+ "learning_rate": 9.079256439690354e-06,
10964
+ "loss": 0.3518,
10965
+ "step": 3128
10966
+ },
10967
+ {
10968
+ "epoch": 1.7359955629506378,
10969
+ "grad_norm": 0.3628225326538086,
10970
+ "learning_rate": 9.004597323387798e-06,
10971
+ "loss": 0.4188,
10972
+ "step": 3130
10973
+ },
10974
+ {
10975
+ "epoch": 1.7371048252911814,
10976
+ "grad_norm": 0.3164060711860657,
10977
+ "learning_rate": 8.930231963802637e-06,
10978
+ "loss": 0.3381,
10979
+ "step": 3132
10980
+ },
10981
+ {
10982
+ "epoch": 1.738214087631725,
10983
+ "grad_norm": 0.27229782938957214,
10984
+ "learning_rate": 8.856160601005459e-06,
10985
+ "loss": 0.3767,
10986
+ "step": 3134
10987
+ },
10988
+ {
10989
+ "epoch": 1.7393233499722685,
10990
+ "grad_norm": 0.34024956822395325,
10991
+ "learning_rate": 8.782383474117838e-06,
10992
+ "loss": 0.4573,
10993
+ "step": 3136
10994
+ },
10995
+ {
10996
+ "epoch": 1.740432612312812,
10997
+ "grad_norm": 0.32661277055740356,
10998
+ "learning_rate": 8.708900821311405e-06,
10999
+ "loss": 0.5145,
11000
+ "step": 3138
11001
+ },
11002
+ {
11003
+ "epoch": 1.7415418746533555,
11004
+ "grad_norm": 0.24198585748672485,
11005
+ "learning_rate": 8.635712879807222e-06,
11006
+ "loss": 0.2969,
11007
+ "step": 3140
11008
+ },
11009
+ {
11010
+ "epoch": 1.742651136993899,
11011
+ "grad_norm": 0.37718066573143005,
11012
+ "learning_rate": 8.562819885874884e-06,
11013
+ "loss": 0.5287,
11014
+ "step": 3142
11015
+ },
11016
+ {
11017
+ "epoch": 1.7437603993344426,
11018
+ "grad_norm": 0.3092913329601288,
11019
+ "learning_rate": 8.490222074831845e-06,
11020
+ "loss": 0.3534,
11021
+ "step": 3144
11022
+ },
11023
+ {
11024
+ "epoch": 1.7448696616749861,
11025
+ "grad_norm": 0.2609056830406189,
11026
+ "learning_rate": 8.417919681042652e-06,
11027
+ "loss": 0.3774,
11028
+ "step": 3146
11029
+ },
11030
+ {
11031
+ "epoch": 1.7459789240155297,
11032
+ "grad_norm": 0.3176262080669403,
11033
+ "learning_rate": 8.345912937918121e-06,
11034
+ "loss": 0.3448,
11035
+ "step": 3148
11036
+ },
11037
+ {
11038
+ "epoch": 1.7470881863560732,
11039
+ "grad_norm": 0.3105904757976532,
11040
+ "learning_rate": 8.274202077914705e-06,
11041
+ "loss": 0.3949,
11042
+ "step": 3150
11043
+ },
11044
+ {
11045
+ "epoch": 1.7481974486966168,
11046
+ "grad_norm": 0.3904447555541992,
11047
+ "learning_rate": 8.20278733253359e-06,
11048
+ "loss": 0.45,
11049
+ "step": 3152
11050
+ },
11051
+ {
11052
+ "epoch": 1.7493067110371603,
11053
+ "grad_norm": 0.27570340037345886,
11054
+ "learning_rate": 8.13166893232008e-06,
11055
+ "loss": 0.4282,
11056
+ "step": 3154
11057
+ },
11058
+ {
11059
+ "epoch": 1.7504159733777038,
11060
+ "grad_norm": 0.2809303104877472,
11061
+ "learning_rate": 8.060847106862779e-06,
11062
+ "loss": 0.3358,
11063
+ "step": 3156
11064
+ },
11065
+ {
11066
+ "epoch": 1.7515252357182474,
11067
+ "grad_norm": 0.43461307883262634,
11068
+ "learning_rate": 7.990322084792867e-06,
11069
+ "loss": 0.3352,
11070
+ "step": 3158
11071
+ },
11072
+ {
11073
+ "epoch": 1.752634498058791,
11074
+ "grad_norm": 0.3733227550983429,
11075
+ "learning_rate": 7.92009409378337e-06,
11076
+ "loss": 0.4386,
11077
+ "step": 3160
11078
+ },
11079
+ {
11080
+ "epoch": 1.7537437603993344,
11081
+ "grad_norm": 0.22569668292999268,
11082
+ "learning_rate": 7.850163360548424e-06,
11083
+ "loss": 0.2785,
11084
+ "step": 3162
11085
+ },
11086
+ {
11087
+ "epoch": 1.754853022739878,
11088
+ "grad_norm": 0.286538690328598,
11089
+ "learning_rate": 7.780530110842565e-06,
11090
+ "loss": 0.312,
11091
+ "step": 3164
11092
+ },
11093
+ {
11094
+ "epoch": 1.7559622850804215,
11095
+ "grad_norm": 0.2738610804080963,
11096
+ "learning_rate": 7.711194569459934e-06,
11097
+ "loss": 0.3244,
11098
+ "step": 3166
11099
+ },
11100
+ {
11101
+ "epoch": 1.757071547420965,
11102
+ "grad_norm": 0.30075690150260925,
11103
+ "learning_rate": 7.642156960233592e-06,
11104
+ "loss": 0.3691,
11105
+ "step": 3168
11106
+ },
11107
+ {
11108
+ "epoch": 1.7581808097615086,
11109
+ "grad_norm": 0.2853529453277588,
11110
+ "learning_rate": 7.573417506034852e-06,
11111
+ "loss": 0.3259,
11112
+ "step": 3170
11113
+ },
11114
+ {
11115
+ "epoch": 1.7592900721020521,
11116
+ "grad_norm": 0.23462392389774323,
11117
+ "learning_rate": 7.504976428772437e-06,
11118
+ "loss": 0.3671,
11119
+ "step": 3172
11120
+ },
11121
+ {
11122
+ "epoch": 1.7603993344425957,
11123
+ "grad_norm": 0.365106999874115,
11124
+ "learning_rate": 7.436833949391853e-06,
11125
+ "loss": 0.3698,
11126
+ "step": 3174
11127
+ },
11128
+ {
11129
+ "epoch": 1.7615085967831392,
11130
+ "grad_norm": 0.2944175899028778,
11131
+ "learning_rate": 7.368990287874711e-06,
11132
+ "loss": 0.3515,
11133
+ "step": 3176
11134
+ },
11135
+ {
11136
+ "epoch": 1.7626178591236827,
11137
+ "grad_norm": 0.2920864224433899,
11138
+ "learning_rate": 7.301445663237861e-06,
11139
+ "loss": 0.3424,
11140
+ "step": 3178
11141
+ },
11142
+ {
11143
+ "epoch": 1.7637271214642263,
11144
+ "grad_norm": 0.26654571294784546,
11145
+ "learning_rate": 7.234200293532889e-06,
11146
+ "loss": 0.3553,
11147
+ "step": 3180
11148
+ },
11149
+ {
11150
+ "epoch": 1.7648363838047698,
11151
+ "grad_norm": 0.2544094920158386,
11152
+ "learning_rate": 7.167254395845202e-06,
11153
+ "loss": 0.3715,
11154
+ "step": 3182
11155
+ },
11156
+ {
11157
+ "epoch": 1.7659456461453134,
11158
+ "grad_norm": 0.2914319932460785,
11159
+ "learning_rate": 7.1006081862935444e-06,
11160
+ "loss": 0.4023,
11161
+ "step": 3184
11162
+ },
11163
+ {
11164
+ "epoch": 1.767054908485857,
11165
+ "grad_norm": 0.3055804371833801,
11166
+ "learning_rate": 7.034261880029114e-06,
11167
+ "loss": 0.3967,
11168
+ "step": 3186
11169
+ },
11170
+ {
11171
+ "epoch": 1.7681641708264004,
11172
+ "grad_norm": 0.2863101661205292,
11173
+ "learning_rate": 6.968215691234936e-06,
11174
+ "loss": 0.3853,
11175
+ "step": 3188
11176
+ },
11177
+ {
11178
+ "epoch": 1.769273433166944,
11179
+ "grad_norm": 0.28304606676101685,
11180
+ "learning_rate": 6.902469833125236e-06,
11181
+ "loss": 0.3937,
11182
+ "step": 3190
11183
+ },
11184
+ {
11185
+ "epoch": 1.7703826955074875,
11186
+ "grad_norm": 0.2828314006328583,
11187
+ "learning_rate": 6.837024517944657e-06,
11188
+ "loss": 0.3907,
11189
+ "step": 3192
11190
+ },
11191
+ {
11192
+ "epoch": 1.771491957848031,
11193
+ "grad_norm": 0.2963877022266388,
11194
+ "learning_rate": 6.77187995696763e-06,
11195
+ "loss": 0.3885,
11196
+ "step": 3194
11197
+ },
11198
+ {
11199
+ "epoch": 1.7726012201885746,
11200
+ "grad_norm": 0.24497413635253906,
11201
+ "learning_rate": 6.707036360497632e-06,
11202
+ "loss": 0.4195,
11203
+ "step": 3196
11204
+ },
11205
+ {
11206
+ "epoch": 1.7737104825291181,
11207
+ "grad_norm": 0.25655171275138855,
11208
+ "learning_rate": 6.642493937866623e-06,
11209
+ "loss": 0.3315,
11210
+ "step": 3198
11211
+ },
11212
+ {
11213
+ "epoch": 1.7748197448696617,
11214
+ "grad_norm": 0.3175029456615448,
11215
+ "learning_rate": 6.578252897434223e-06,
11216
+ "loss": 0.464,
11217
+ "step": 3200
11218
  }
11219
  ],
11220
  "logging_steps": 2,
 
11234
  "attributes": {}
11235
  }
11236
  },
11237
+ "total_flos": 7794204280750080.0,
11238
  "train_batch_size": 8,
11239
  "trial_name": null,
11240
  "trial_params": null