jasong03 commited on
Commit
6420aaa
·
verified ·
1 Parent(s): a6244a5

Training in progress, step 3072, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d032f96718f6d142d8d7a904b55ba96f7808abcaba152861f5404a278299f59a
3
  size 891644712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2d30406d7c467767499ef4fa93e0814e4b9c839e83cc877c2f8eaaf710781d1
3
  size 891644712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a4029dba8c340d5d9ec8b63bb3fe9e4e8cde3b9c6c63750b18009a4afb761dd
3
  size 1783444794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d95e37a4b2b1e8065e39772f0ad6f60340e21cd105dce5b06f199029f9a8d550
3
  size 1783444794
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3522c8ca2140f6cef3b962bf99096e64fbc7d1c0bb35519541de70b733ef81e7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d2f5ef411fc40d8d9b3f53029f0d2bde94e51e311c130b07e4428069fee892d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2805ed60a71b90fa11cf913d76534a276f46b823d6cde07733a47e3c2571dca3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3543c156e7d71b14c19e0a0a6a897c5b126e8bc6938f4ff38dab3dadeb331bb4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.526344980587909,
5
  "eval_steps": 500,
6
- "global_step": 2752,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -9647,6 +9647,1126 @@
9647
  "learning_rate": 2.801472809339294e-05,
9648
  "loss": 0.3601,
9649
  "step": 2752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9650
  }
9651
  ],
9652
  "logging_steps": 2,
@@ -9666,7 +10786,7 @@
9666
  "attributes": {}
9667
  }
9668
  },
9669
- "total_flos": 6702951740866560.0,
9670
  "train_batch_size": 8,
9671
  "trial_name": null,
9672
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.7038269550748752,
5
  "eval_steps": 500,
6
+ "global_step": 3072,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
9647
  "learning_rate": 2.801472809339294e-05,
9648
  "loss": 0.3601,
9649
  "step": 2752
9650
+ },
9651
+ {
9652
+ "epoch": 1.5274542429284526,
9653
+ "grad_norm": 0.28103169798851013,
9654
+ "learning_rate": 2.7890128096392477e-05,
9655
+ "loss": 0.3864,
9656
+ "step": 2754
9657
+ },
9658
+ {
9659
+ "epoch": 1.528563505268996,
9660
+ "grad_norm": 0.3099244236946106,
9661
+ "learning_rate": 2.7765760888726855e-05,
9662
+ "loss": 0.3917,
9663
+ "step": 2756
9664
+ },
9665
+ {
9666
+ "epoch": 1.5296727676095396,
9667
+ "grad_norm": 0.23606978356838226,
9668
+ "learning_rate": 2.7641626871885596e-05,
9669
+ "loss": 0.3313,
9670
+ "step": 2758
9671
+ },
9672
+ {
9673
+ "epoch": 1.5307820299500832,
9674
+ "grad_norm": 0.36333397030830383,
9675
+ "learning_rate": 2.7517726446605406e-05,
9676
+ "loss": 0.3982,
9677
+ "step": 2760
9678
+ },
9679
+ {
9680
+ "epoch": 1.5318912922906267,
9681
+ "grad_norm": 0.27382388710975647,
9682
+ "learning_rate": 2.7394060012868995e-05,
9683
+ "loss": 0.2713,
9684
+ "step": 2762
9685
+ },
9686
+ {
9687
+ "epoch": 1.5330005546311702,
9688
+ "grad_norm": 0.2855754494667053,
9689
+ "learning_rate": 2.7270627969903608e-05,
9690
+ "loss": 0.4792,
9691
+ "step": 2764
9692
+ },
9693
+ {
9694
+ "epoch": 1.5341098169717138,
9695
+ "grad_norm": 0.35454511642456055,
9696
+ "learning_rate": 2.714743071617979e-05,
9697
+ "loss": 0.3661,
9698
+ "step": 2766
9699
+ },
9700
+ {
9701
+ "epoch": 1.5352190793122573,
9702
+ "grad_norm": 0.22000765800476074,
9703
+ "learning_rate": 2.7024468649410228e-05,
9704
+ "loss": 0.3621,
9705
+ "step": 2768
9706
+ },
9707
+ {
9708
+ "epoch": 1.5363283416528009,
9709
+ "grad_norm": 0.28072547912597656,
9710
+ "learning_rate": 2.6901742166548262e-05,
9711
+ "loss": 0.3846,
9712
+ "step": 2770
9713
+ },
9714
+ {
9715
+ "epoch": 1.5374376039933444,
9716
+ "grad_norm": 0.2560584545135498,
9717
+ "learning_rate": 2.6779251663786797e-05,
9718
+ "loss": 0.4105,
9719
+ "step": 2772
9720
+ },
9721
+ {
9722
+ "epoch": 1.538546866333888,
9723
+ "grad_norm": 0.33404773473739624,
9724
+ "learning_rate": 2.665699753655684e-05,
9725
+ "loss": 0.3561,
9726
+ "step": 2774
9727
+ },
9728
+ {
9729
+ "epoch": 1.5396561286744315,
9730
+ "grad_norm": 0.33240342140197754,
9731
+ "learning_rate": 2.6534980179526415e-05,
9732
+ "loss": 0.3972,
9733
+ "step": 2776
9734
+ },
9735
+ {
9736
+ "epoch": 1.540765391014975,
9737
+ "grad_norm": 0.26221776008605957,
9738
+ "learning_rate": 2.6413199986599112e-05,
9739
+ "loss": 0.3542,
9740
+ "step": 2778
9741
+ },
9742
+ {
9743
+ "epoch": 1.5418746533555185,
9744
+ "grad_norm": 0.2851394712924957,
9745
+ "learning_rate": 2.6291657350912923e-05,
9746
+ "loss": 0.3402,
9747
+ "step": 2780
9748
+ },
9749
+ {
9750
+ "epoch": 1.542983915696062,
9751
+ "grad_norm": 0.27777722477912903,
9752
+ "learning_rate": 2.6170352664838903e-05,
9753
+ "loss": 0.4094,
9754
+ "step": 2782
9755
+ },
9756
+ {
9757
+ "epoch": 1.5440931780366056,
9758
+ "grad_norm": 0.32692790031433105,
9759
+ "learning_rate": 2.6049286319980014e-05,
9760
+ "loss": 0.4145,
9761
+ "step": 2784
9762
+ },
9763
+ {
9764
+ "epoch": 1.5452024403771492,
9765
+ "grad_norm": 0.37069231271743774,
9766
+ "learning_rate": 2.5928458707169813e-05,
9767
+ "loss": 0.4012,
9768
+ "step": 2786
9769
+ },
9770
+ {
9771
+ "epoch": 1.5463117027176927,
9772
+ "grad_norm": 0.28681105375289917,
9773
+ "learning_rate": 2.5807870216471052e-05,
9774
+ "loss": 0.4338,
9775
+ "step": 2788
9776
+ },
9777
+ {
9778
+ "epoch": 1.5474209650582362,
9779
+ "grad_norm": 0.3061560094356537,
9780
+ "learning_rate": 2.5687521237174584e-05,
9781
+ "loss": 0.4096,
9782
+ "step": 2790
9783
+ },
9784
+ {
9785
+ "epoch": 1.5485302273987798,
9786
+ "grad_norm": 0.3024190664291382,
9787
+ "learning_rate": 2.5567412157798133e-05,
9788
+ "loss": 0.3737,
9789
+ "step": 2792
9790
+ },
9791
+ {
9792
+ "epoch": 1.5496394897393233,
9793
+ "grad_norm": 0.22082455456256866,
9794
+ "learning_rate": 2.544754336608486e-05,
9795
+ "loss": 0.3517,
9796
+ "step": 2794
9797
+ },
9798
+ {
9799
+ "epoch": 1.5507487520798668,
9800
+ "grad_norm": 0.23570817708969116,
9801
+ "learning_rate": 2.5327915249002245e-05,
9802
+ "loss": 0.358,
9803
+ "step": 2796
9804
+ },
9805
+ {
9806
+ "epoch": 1.5518580144204104,
9807
+ "grad_norm": 0.28938063979148865,
9808
+ "learning_rate": 2.5208528192740834e-05,
9809
+ "loss": 0.3861,
9810
+ "step": 2798
9811
+ },
9812
+ {
9813
+ "epoch": 1.552967276760954,
9814
+ "grad_norm": 0.22857078909873962,
9815
+ "learning_rate": 2.5089382582712994e-05,
9816
+ "loss": 0.3072,
9817
+ "step": 2800
9818
+ },
9819
+ {
9820
+ "epoch": 1.5540765391014975,
9821
+ "grad_norm": 0.28918081521987915,
9822
+ "learning_rate": 2.4970478803551565e-05,
9823
+ "loss": 0.3366,
9824
+ "step": 2802
9825
+ },
9826
+ {
9827
+ "epoch": 1.555185801442041,
9828
+ "grad_norm": 0.28605908155441284,
9829
+ "learning_rate": 2.4851817239108688e-05,
9830
+ "loss": 0.31,
9831
+ "step": 2804
9832
+ },
9833
+ {
9834
+ "epoch": 1.5562950637825845,
9835
+ "grad_norm": 0.26103734970092773,
9836
+ "learning_rate": 2.4733398272454687e-05,
9837
+ "loss": 0.3324,
9838
+ "step": 2806
9839
+ },
9840
+ {
9841
+ "epoch": 1.557404326123128,
9842
+ "grad_norm": 0.3307429850101471,
9843
+ "learning_rate": 2.4615222285876616e-05,
9844
+ "loss": 0.3568,
9845
+ "step": 2808
9846
+ },
9847
+ {
9848
+ "epoch": 1.5585135884636716,
9849
+ "grad_norm": 0.2729584574699402,
9850
+ "learning_rate": 2.449728966087712e-05,
9851
+ "loss": 0.3475,
9852
+ "step": 2810
9853
+ },
9854
+ {
9855
+ "epoch": 1.5596228508042151,
9856
+ "grad_norm": 0.27880561351776123,
9857
+ "learning_rate": 2.437960077817326e-05,
9858
+ "loss": 0.371,
9859
+ "step": 2812
9860
+ },
9861
+ {
9862
+ "epoch": 1.5607321131447587,
9863
+ "grad_norm": 0.29016315937042236,
9864
+ "learning_rate": 2.426215601769526e-05,
9865
+ "loss": 0.3247,
9866
+ "step": 2814
9867
+ },
9868
+ {
9869
+ "epoch": 1.5618413754853022,
9870
+ "grad_norm": 0.3246505558490753,
9871
+ "learning_rate": 2.4144955758585184e-05,
9872
+ "loss": 0.4428,
9873
+ "step": 2816
9874
+ },
9875
+ {
9876
+ "epoch": 1.5629506378258458,
9877
+ "grad_norm": 0.27316877245903015,
9878
+ "learning_rate": 2.402800037919578e-05,
9879
+ "loss": 0.3025,
9880
+ "step": 2818
9881
+ },
9882
+ {
9883
+ "epoch": 1.5640599001663893,
9884
+ "grad_norm": 0.2579948902130127,
9885
+ "learning_rate": 2.3911290257089348e-05,
9886
+ "loss": 0.3673,
9887
+ "step": 2820
9888
+ },
9889
+ {
9890
+ "epoch": 1.5651691625069328,
9891
+ "grad_norm": 0.3941158354282379,
9892
+ "learning_rate": 2.3794825769036334e-05,
9893
+ "loss": 0.4028,
9894
+ "step": 2822
9895
+ },
9896
+ {
9897
+ "epoch": 1.5662784248474764,
9898
+ "grad_norm": 0.2645871341228485,
9899
+ "learning_rate": 2.3678607291014242e-05,
9900
+ "loss": 0.3511,
9901
+ "step": 2824
9902
+ },
9903
+ {
9904
+ "epoch": 1.56738768718802,
9905
+ "grad_norm": 0.2745266854763031,
9906
+ "learning_rate": 2.356263519820647e-05,
9907
+ "loss": 0.3726,
9908
+ "step": 2826
9909
+ },
9910
+ {
9911
+ "epoch": 1.5684969495285634,
9912
+ "grad_norm": 0.4434897303581238,
9913
+ "learning_rate": 2.3446909865000886e-05,
9914
+ "loss": 0.5269,
9915
+ "step": 2828
9916
+ },
9917
+ {
9918
+ "epoch": 1.569606211869107,
9919
+ "grad_norm": 0.27076244354248047,
9920
+ "learning_rate": 2.333143166498889e-05,
9921
+ "loss": 0.3558,
9922
+ "step": 2830
9923
+ },
9924
+ {
9925
+ "epoch": 1.5707154742096505,
9926
+ "grad_norm": 0.3606158196926117,
9927
+ "learning_rate": 2.3216200970963954e-05,
9928
+ "loss": 0.4266,
9929
+ "step": 2832
9930
+ },
9931
+ {
9932
+ "epoch": 1.571824736550194,
9933
+ "grad_norm": 0.2903146743774414,
9934
+ "learning_rate": 2.3101218154920633e-05,
9935
+ "loss": 0.3087,
9936
+ "step": 2834
9937
+ },
9938
+ {
9939
+ "epoch": 1.5729339988907376,
9940
+ "grad_norm": 0.3455762565135956,
9941
+ "learning_rate": 2.298648358805322e-05,
9942
+ "loss": 0.389,
9943
+ "step": 2836
9944
+ },
9945
+ {
9946
+ "epoch": 1.5740432612312811,
9947
+ "grad_norm": 0.2828775942325592,
9948
+ "learning_rate": 2.2871997640754572e-05,
9949
+ "loss": 0.3795,
9950
+ "step": 2838
9951
+ },
9952
+ {
9953
+ "epoch": 1.5751525235718247,
9954
+ "grad_norm": 0.3083617389202118,
9955
+ "learning_rate": 2.275776068261495e-05,
9956
+ "loss": 0.3764,
9957
+ "step": 2840
9958
+ },
9959
+ {
9960
+ "epoch": 1.5762617859123682,
9961
+ "grad_norm": 0.32137981057167053,
9962
+ "learning_rate": 2.264377308242086e-05,
9963
+ "loss": 0.3609,
9964
+ "step": 2842
9965
+ },
9966
+ {
9967
+ "epoch": 1.5773710482529117,
9968
+ "grad_norm": 0.2916238605976105,
9969
+ "learning_rate": 2.2530035208153822e-05,
9970
+ "loss": 0.3584,
9971
+ "step": 2844
9972
+ },
9973
+ {
9974
+ "epoch": 1.5784803105934553,
9975
+ "grad_norm": 0.26702216267585754,
9976
+ "learning_rate": 2.241654742698909e-05,
9977
+ "loss": 0.3635,
9978
+ "step": 2846
9979
+ },
9980
+ {
9981
+ "epoch": 1.5795895729339988,
9982
+ "grad_norm": 0.3438095152378082,
9983
+ "learning_rate": 2.2303310105294582e-05,
9984
+ "loss": 0.372,
9985
+ "step": 2848
9986
+ },
9987
+ {
9988
+ "epoch": 1.5806988352745424,
9989
+ "grad_norm": 0.31688249111175537,
9990
+ "learning_rate": 2.219032360862976e-05,
9991
+ "loss": 0.3912,
9992
+ "step": 2850
9993
+ },
9994
+ {
9995
+ "epoch": 1.581808097615086,
9996
+ "grad_norm": 0.2813704013824463,
9997
+ "learning_rate": 2.2077588301744233e-05,
9998
+ "loss": 0.3545,
9999
+ "step": 2852
10000
+ },
10001
+ {
10002
+ "epoch": 1.5829173599556294,
10003
+ "grad_norm": 0.23737509548664093,
10004
+ "learning_rate": 2.1965104548576753e-05,
10005
+ "loss": 0.3507,
10006
+ "step": 2854
10007
+ },
10008
+ {
10009
+ "epoch": 1.584026622296173,
10010
+ "grad_norm": 0.32858365774154663,
10011
+ "learning_rate": 2.1852872712254002e-05,
10012
+ "loss": 0.3221,
10013
+ "step": 2856
10014
+ },
10015
+ {
10016
+ "epoch": 1.5851358846367165,
10017
+ "grad_norm": 0.2847982943058014,
10018
+ "learning_rate": 2.1740893155089447e-05,
10019
+ "loss": 0.3456,
10020
+ "step": 2858
10021
+ },
10022
+ {
10023
+ "epoch": 1.58624514697726,
10024
+ "grad_norm": 0.28835567831993103,
10025
+ "learning_rate": 2.1629166238582056e-05,
10026
+ "loss": 0.3682,
10027
+ "step": 2860
10028
+ },
10029
+ {
10030
+ "epoch": 1.5873544093178036,
10031
+ "grad_norm": 0.2693901062011719,
10032
+ "learning_rate": 2.1517692323415205e-05,
10033
+ "loss": 0.3503,
10034
+ "step": 2862
10035
+ },
10036
+ {
10037
+ "epoch": 1.5884636716583471,
10038
+ "grad_norm": 0.2496192455291748,
10039
+ "learning_rate": 2.1406471769455615e-05,
10040
+ "loss": 0.3414,
10041
+ "step": 2864
10042
+ },
10043
+ {
10044
+ "epoch": 1.5895729339988907,
10045
+ "grad_norm": 0.2739793658256531,
10046
+ "learning_rate": 2.129550493575201e-05,
10047
+ "loss": 0.4304,
10048
+ "step": 2866
10049
+ },
10050
+ {
10051
+ "epoch": 1.5906821963394342,
10052
+ "grad_norm": 0.2115955650806427,
10053
+ "learning_rate": 2.118479218053401e-05,
10054
+ "loss": 0.3131,
10055
+ "step": 2868
10056
+ },
10057
+ {
10058
+ "epoch": 1.5917914586799777,
10059
+ "grad_norm": 0.283636212348938,
10060
+ "learning_rate": 2.1074333861211103e-05,
10061
+ "loss": 0.4183,
10062
+ "step": 2870
10063
+ },
10064
+ {
10065
+ "epoch": 1.5929007210205213,
10066
+ "grad_norm": 0.2762402594089508,
10067
+ "learning_rate": 2.096413033437131e-05,
10068
+ "loss": 0.3805,
10069
+ "step": 2872
10070
+ },
10071
+ {
10072
+ "epoch": 1.5940099833610648,
10073
+ "grad_norm": 0.27344250679016113,
10074
+ "learning_rate": 2.0854181955780183e-05,
10075
+ "loss": 0.3537,
10076
+ "step": 2874
10077
+ },
10078
+ {
10079
+ "epoch": 1.5951192457016083,
10080
+ "grad_norm": 0.3143325448036194,
10081
+ "learning_rate": 2.0744489080379504e-05,
10082
+ "loss": 0.3461,
10083
+ "step": 2876
10084
+ },
10085
+ {
10086
+ "epoch": 1.5962285080421519,
10087
+ "grad_norm": 0.26111075282096863,
10088
+ "learning_rate": 2.063505206228632e-05,
10089
+ "loss": 0.3634,
10090
+ "step": 2878
10091
+ },
10092
+ {
10093
+ "epoch": 1.5973377703826954,
10094
+ "grad_norm": 0.32173627614974976,
10095
+ "learning_rate": 2.0525871254791627e-05,
10096
+ "loss": 0.3973,
10097
+ "step": 2880
10098
+ },
10099
+ {
10100
+ "epoch": 1.598447032723239,
10101
+ "grad_norm": 0.2806760370731354,
10102
+ "learning_rate": 2.0416947010359355e-05,
10103
+ "loss": 0.3786,
10104
+ "step": 2882
10105
+ },
10106
+ {
10107
+ "epoch": 1.5995562950637825,
10108
+ "grad_norm": 0.30123627185821533,
10109
+ "learning_rate": 2.030827968062513e-05,
10110
+ "loss": 0.427,
10111
+ "step": 2884
10112
+ },
10113
+ {
10114
+ "epoch": 1.600665557404326,
10115
+ "grad_norm": 0.322729229927063,
10116
+ "learning_rate": 2.019986961639524e-05,
10117
+ "loss": 0.353,
10118
+ "step": 2886
10119
+ },
10120
+ {
10121
+ "epoch": 1.6017748197448696,
10122
+ "grad_norm": 0.2584727108478546,
10123
+ "learning_rate": 2.0091717167645475e-05,
10124
+ "loss": 0.2905,
10125
+ "step": 2888
10126
+ },
10127
+ {
10128
+ "epoch": 1.602884082085413,
10129
+ "grad_norm": 0.2751784026622772,
10130
+ "learning_rate": 1.9983822683519915e-05,
10131
+ "loss": 0.3394,
10132
+ "step": 2890
10133
+ },
10134
+ {
10135
+ "epoch": 1.6039933444259566,
10136
+ "grad_norm": 0.29693764448165894,
10137
+ "learning_rate": 1.9876186512329853e-05,
10138
+ "loss": 0.4027,
10139
+ "step": 2892
10140
+ },
10141
+ {
10142
+ "epoch": 1.6051026067665002,
10143
+ "grad_norm": 0.2711539566516876,
10144
+ "learning_rate": 1.9768809001552768e-05,
10145
+ "loss": 0.349,
10146
+ "step": 2894
10147
+ },
10148
+ {
10149
+ "epoch": 1.6062118691070437,
10150
+ "grad_norm": 0.25827860832214355,
10151
+ "learning_rate": 1.9661690497831053e-05,
10152
+ "loss": 0.4183,
10153
+ "step": 2896
10154
+ },
10155
+ {
10156
+ "epoch": 1.6073211314475873,
10157
+ "grad_norm": 0.34938088059425354,
10158
+ "learning_rate": 1.9554831346970925e-05,
10159
+ "loss": 0.3684,
10160
+ "step": 2898
10161
+ },
10162
+ {
10163
+ "epoch": 1.6084303937881308,
10164
+ "grad_norm": 0.26432278752326965,
10165
+ "learning_rate": 1.9448231893941414e-05,
10166
+ "loss": 0.4979,
10167
+ "step": 2900
10168
+ },
10169
+ {
10170
+ "epoch": 1.6095396561286743,
10171
+ "grad_norm": 0.32702112197875977,
10172
+ "learning_rate": 1.9341892482873192e-05,
10173
+ "loss": 0.3844,
10174
+ "step": 2902
10175
+ },
10176
+ {
10177
+ "epoch": 1.6106489184692179,
10178
+ "grad_norm": 0.36097395420074463,
10179
+ "learning_rate": 1.923581345705736e-05,
10180
+ "loss": 0.3576,
10181
+ "step": 2904
10182
+ },
10183
+ {
10184
+ "epoch": 1.6117581808097614,
10185
+ "grad_norm": 0.3077182471752167,
10186
+ "learning_rate": 1.912999515894448e-05,
10187
+ "loss": 0.5143,
10188
+ "step": 2906
10189
+ },
10190
+ {
10191
+ "epoch": 1.612867443150305,
10192
+ "grad_norm": 0.2704939544200897,
10193
+ "learning_rate": 1.9024437930143435e-05,
10194
+ "loss": 0.3342,
10195
+ "step": 2908
10196
+ },
10197
+ {
10198
+ "epoch": 1.6139767054908485,
10199
+ "grad_norm": 0.22881537675857544,
10200
+ "learning_rate": 1.8919142111420284e-05,
10201
+ "loss": 0.3769,
10202
+ "step": 2910
10203
+ },
10204
+ {
10205
+ "epoch": 1.615085967831392,
10206
+ "grad_norm": 0.29385611414909363,
10207
+ "learning_rate": 1.8814108042697144e-05,
10208
+ "loss": 0.3847,
10209
+ "step": 2912
10210
+ },
10211
+ {
10212
+ "epoch": 1.6161952301719356,
10213
+ "grad_norm": 0.4236384630203247,
10214
+ "learning_rate": 1.870933606305122e-05,
10215
+ "loss": 0.4581,
10216
+ "step": 2914
10217
+ },
10218
+ {
10219
+ "epoch": 1.617304492512479,
10220
+ "grad_norm": 0.2979065477848053,
10221
+ "learning_rate": 1.8604826510713613e-05,
10222
+ "loss": 0.4182,
10223
+ "step": 2916
10224
+ },
10225
+ {
10226
+ "epoch": 1.6184137548530226,
10227
+ "grad_norm": 0.335405170917511,
10228
+ "learning_rate": 1.8500579723068177e-05,
10229
+ "loss": 0.3544,
10230
+ "step": 2918
10231
+ },
10232
+ {
10233
+ "epoch": 1.6195230171935662,
10234
+ "grad_norm": 0.2822960615158081,
10235
+ "learning_rate": 1.8396596036650514e-05,
10236
+ "loss": 0.336,
10237
+ "step": 2920
10238
+ },
10239
+ {
10240
+ "epoch": 1.6206322795341097,
10241
+ "grad_norm": 0.3513801395893097,
10242
+ "learning_rate": 1.8292875787146946e-05,
10243
+ "loss": 0.4,
10244
+ "step": 2922
10245
+ },
10246
+ {
10247
+ "epoch": 1.6217415418746532,
10248
+ "grad_norm": 0.2501135766506195,
10249
+ "learning_rate": 1.8189419309393242e-05,
10250
+ "loss": 0.3641,
10251
+ "step": 2924
10252
+ },
10253
+ {
10254
+ "epoch": 1.6228508042151968,
10255
+ "grad_norm": 0.3006201684474945,
10256
+ "learning_rate": 1.8086226937373674e-05,
10257
+ "loss": 0.4112,
10258
+ "step": 2926
10259
+ },
10260
+ {
10261
+ "epoch": 1.6239600665557403,
10262
+ "grad_norm": 0.2748831808567047,
10263
+ "learning_rate": 1.798329900422e-05,
10264
+ "loss": 0.31,
10265
+ "step": 2928
10266
+ },
10267
+ {
10268
+ "epoch": 1.6250693288962839,
10269
+ "grad_norm": 0.3650710880756378,
10270
+ "learning_rate": 1.788063584221017e-05,
10271
+ "loss": 0.3872,
10272
+ "step": 2930
10273
+ },
10274
+ {
10275
+ "epoch": 1.6261785912368274,
10276
+ "grad_norm": 0.3932930827140808,
10277
+ "learning_rate": 1.7778237782767504e-05,
10278
+ "loss": 0.4484,
10279
+ "step": 2932
10280
+ },
10281
+ {
10282
+ "epoch": 1.627287853577371,
10283
+ "grad_norm": 0.25739145278930664,
10284
+ "learning_rate": 1.7676105156459398e-05,
10285
+ "loss": 0.3541,
10286
+ "step": 2934
10287
+ },
10288
+ {
10289
+ "epoch": 1.6283971159179145,
10290
+ "grad_norm": 0.22192710638046265,
10291
+ "learning_rate": 1.7574238292996458e-05,
10292
+ "loss": 0.3301,
10293
+ "step": 2936
10294
+ },
10295
+ {
10296
+ "epoch": 1.629506378258458,
10297
+ "grad_norm": 0.2925964593887329,
10298
+ "learning_rate": 1.7472637521231283e-05,
10299
+ "loss": 0.4855,
10300
+ "step": 2938
10301
+ },
10302
+ {
10303
+ "epoch": 1.6306156405990015,
10304
+ "grad_norm": 0.2878285050392151,
10305
+ "learning_rate": 1.737130316915744e-05,
10306
+ "loss": 0.4119,
10307
+ "step": 2940
10308
+ },
10309
+ {
10310
+ "epoch": 1.631724902939545,
10311
+ "grad_norm": 0.2855752110481262,
10312
+ "learning_rate": 1.7270235563908443e-05,
10313
+ "loss": 0.4221,
10314
+ "step": 2942
10315
+ },
10316
+ {
10317
+ "epoch": 1.6328341652800886,
10318
+ "grad_norm": 0.30537575483322144,
10319
+ "learning_rate": 1.716943503175671e-05,
10320
+ "loss": 0.4187,
10321
+ "step": 2944
10322
+ },
10323
+ {
10324
+ "epoch": 1.6339434276206322,
10325
+ "grad_norm": 0.32603779435157776,
10326
+ "learning_rate": 1.7068901898112478e-05,
10327
+ "loss": 0.4118,
10328
+ "step": 2946
10329
+ },
10330
+ {
10331
+ "epoch": 1.6350526899611757,
10332
+ "grad_norm": 0.21832433342933655,
10333
+ "learning_rate": 1.6968636487522705e-05,
10334
+ "loss": 0.3122,
10335
+ "step": 2948
10336
+ },
10337
+ {
10338
+ "epoch": 1.6361619523017192,
10339
+ "grad_norm": 0.30126479268074036,
10340
+ "learning_rate": 1.686863912367006e-05,
10341
+ "loss": 0.322,
10342
+ "step": 2950
10343
+ },
10344
+ {
10345
+ "epoch": 1.6372712146422628,
10346
+ "grad_norm": 0.27455347776412964,
10347
+ "learning_rate": 1.6768910129371986e-05,
10348
+ "loss": 0.3588,
10349
+ "step": 2952
10350
+ },
10351
+ {
10352
+ "epoch": 1.6383804769828063,
10353
+ "grad_norm": 0.26136961579322815,
10354
+ "learning_rate": 1.6669449826579464e-05,
10355
+ "loss": 0.3672,
10356
+ "step": 2954
10357
+ },
10358
+ {
10359
+ "epoch": 1.6394897393233498,
10360
+ "grad_norm": 0.24132628738880157,
10361
+ "learning_rate": 1.6570258536376083e-05,
10362
+ "loss": 0.3935,
10363
+ "step": 2956
10364
+ },
10365
+ {
10366
+ "epoch": 1.6405990016638934,
10367
+ "grad_norm": 0.38164663314819336,
10368
+ "learning_rate": 1.6471336578977016e-05,
10369
+ "loss": 0.4923,
10370
+ "step": 2958
10371
+ },
10372
+ {
10373
+ "epoch": 1.641708264004437,
10374
+ "grad_norm": 0.3024519979953766,
10375
+ "learning_rate": 1.637268427372799e-05,
10376
+ "loss": 0.4043,
10377
+ "step": 2960
10378
+ },
10379
+ {
10380
+ "epoch": 1.6428175263449805,
10381
+ "grad_norm": 0.29123973846435547,
10382
+ "learning_rate": 1.627430193910414e-05,
10383
+ "loss": 0.3372,
10384
+ "step": 2962
10385
+ },
10386
+ {
10387
+ "epoch": 1.643926788685524,
10388
+ "grad_norm": 0.2549437880516052,
10389
+ "learning_rate": 1.6176189892709127e-05,
10390
+ "loss": 0.2834,
10391
+ "step": 2964
10392
+ },
10393
+ {
10394
+ "epoch": 1.6450360510260675,
10395
+ "grad_norm": 0.3285108804702759,
10396
+ "learning_rate": 1.607834845127405e-05,
10397
+ "loss": 0.3657,
10398
+ "step": 2966
10399
+ },
10400
+ {
10401
+ "epoch": 1.646145313366611,
10402
+ "grad_norm": 0.24914546310901642,
10403
+ "learning_rate": 1.59807779306564e-05,
10404
+ "loss": 0.3498,
10405
+ "step": 2968
10406
+ },
10407
+ {
10408
+ "epoch": 1.6472545757071546,
10409
+ "grad_norm": 0.2854565978050232,
10410
+ "learning_rate": 1.5883478645839045e-05,
10411
+ "loss": 0.3597,
10412
+ "step": 2970
10413
+ },
10414
+ {
10415
+ "epoch": 1.6483638380476981,
10416
+ "grad_norm": 0.24184933304786682,
10417
+ "learning_rate": 1.578645091092933e-05,
10418
+ "loss": 0.3682,
10419
+ "step": 2972
10420
+ },
10421
+ {
10422
+ "epoch": 1.6494731003882417,
10423
+ "grad_norm": 0.30457058548927307,
10424
+ "learning_rate": 1.5689695039157848e-05,
10425
+ "loss": 0.3172,
10426
+ "step": 2974
10427
+ },
10428
+ {
10429
+ "epoch": 1.6505823627287852,
10430
+ "grad_norm": 0.23675574362277985,
10431
+ "learning_rate": 1.5593211342877645e-05,
10432
+ "loss": 0.27,
10433
+ "step": 2976
10434
+ },
10435
+ {
10436
+ "epoch": 1.6516916250693288,
10437
+ "grad_norm": 0.3284320533275604,
10438
+ "learning_rate": 1.5497000133563022e-05,
10439
+ "loss": 0.4104,
10440
+ "step": 2978
10441
+ },
10442
+ {
10443
+ "epoch": 1.6528008874098723,
10444
+ "grad_norm": 0.4357747435569763,
10445
+ "learning_rate": 1.540106172180873e-05,
10446
+ "loss": 0.4127,
10447
+ "step": 2980
10448
+ },
10449
+ {
10450
+ "epoch": 1.6539101497504158,
10451
+ "grad_norm": 0.3309295177459717,
10452
+ "learning_rate": 1.5305396417328756e-05,
10453
+ "loss": 0.4256,
10454
+ "step": 2982
10455
+ },
10456
+ {
10457
+ "epoch": 1.6550194120909594,
10458
+ "grad_norm": 0.27238425612449646,
10459
+ "learning_rate": 1.5210004528955468e-05,
10460
+ "loss": 0.359,
10461
+ "step": 2984
10462
+ },
10463
+ {
10464
+ "epoch": 1.656128674431503,
10465
+ "grad_norm": 0.30173739790916443,
10466
+ "learning_rate": 1.5114886364638614e-05,
10467
+ "loss": 0.4343,
10468
+ "step": 2986
10469
+ },
10470
+ {
10471
+ "epoch": 1.6572379367720464,
10472
+ "grad_norm": 0.29943424463272095,
10473
+ "learning_rate": 1.5020042231444197e-05,
10474
+ "loss": 0.344,
10475
+ "step": 2988
10476
+ },
10477
+ {
10478
+ "epoch": 1.65834719911259,
10479
+ "grad_norm": 0.34404435753822327,
10480
+ "learning_rate": 1.4925472435553701e-05,
10481
+ "loss": 0.3992,
10482
+ "step": 2990
10483
+ },
10484
+ {
10485
+ "epoch": 1.6594564614531335,
10486
+ "grad_norm": 0.3563268184661865,
10487
+ "learning_rate": 1.4831177282262842e-05,
10488
+ "loss": 0.4014,
10489
+ "step": 2992
10490
+ },
10491
+ {
10492
+ "epoch": 1.660565723793677,
10493
+ "grad_norm": 0.2387107014656067,
10494
+ "learning_rate": 1.4737157075980845e-05,
10495
+ "loss": 0.3141,
10496
+ "step": 2994
10497
+ },
10498
+ {
10499
+ "epoch": 1.6616749861342206,
10500
+ "grad_norm": 0.3576110899448395,
10501
+ "learning_rate": 1.4643412120229262e-05,
10502
+ "loss": 0.3765,
10503
+ "step": 2996
10504
+ },
10505
+ {
10506
+ "epoch": 1.6627842484747641,
10507
+ "grad_norm": 0.335438072681427,
10508
+ "learning_rate": 1.4549942717641052e-05,
10509
+ "loss": 0.3619,
10510
+ "step": 2998
10511
+ },
10512
+ {
10513
+ "epoch": 1.6638935108153077,
10514
+ "grad_norm": 0.2912905216217041,
10515
+ "learning_rate": 1.4456749169959648e-05,
10516
+ "loss": 0.3389,
10517
+ "step": 3000
10518
+ },
10519
+ {
10520
+ "epoch": 1.6650027731558512,
10521
+ "grad_norm": 0.2544459104537964,
10522
+ "learning_rate": 1.4363831778037961e-05,
10523
+ "loss": 0.2778,
10524
+ "step": 3002
10525
+ },
10526
+ {
10527
+ "epoch": 1.6661120354963947,
10528
+ "grad_norm": 0.36533600091934204,
10529
+ "learning_rate": 1.42711908418374e-05,
10530
+ "loss": 0.3798,
10531
+ "step": 3004
10532
+ },
10533
+ {
10534
+ "epoch": 1.6672212978369383,
10535
+ "grad_norm": 0.2710284888744354,
10536
+ "learning_rate": 1.4178826660426891e-05,
10537
+ "loss": 0.305,
10538
+ "step": 3006
10539
+ },
10540
+ {
10541
+ "epoch": 1.6683305601774818,
10542
+ "grad_norm": 0.21859732270240784,
10543
+ "learning_rate": 1.4086739531981885e-05,
10544
+ "loss": 0.4432,
10545
+ "step": 3008
10546
+ },
10547
+ {
10548
+ "epoch": 1.6694398225180254,
10549
+ "grad_norm": 0.22141209244728088,
10550
+ "learning_rate": 1.3994929753783515e-05,
10551
+ "loss": 0.3012,
10552
+ "step": 3010
10553
+ },
10554
+ {
10555
+ "epoch": 1.670549084858569,
10556
+ "grad_norm": 0.3187447786331177,
10557
+ "learning_rate": 1.3903397622217506e-05,
10558
+ "loss": 0.3794,
10559
+ "step": 3012
10560
+ },
10561
+ {
10562
+ "epoch": 1.6716583471991124,
10563
+ "grad_norm": 0.37787067890167236,
10564
+ "learning_rate": 1.381214343277324e-05,
10565
+ "loss": 0.3672,
10566
+ "step": 3014
10567
+ },
10568
+ {
10569
+ "epoch": 1.672767609539656,
10570
+ "grad_norm": 0.3910747766494751,
10571
+ "learning_rate": 1.3721167480042885e-05,
10572
+ "loss": 0.4577,
10573
+ "step": 3016
10574
+ },
10575
+ {
10576
+ "epoch": 1.6738768718801995,
10577
+ "grad_norm": 0.3260791003704071,
10578
+ "learning_rate": 1.3630470057720402e-05,
10579
+ "loss": 0.4624,
10580
+ "step": 3018
10581
+ },
10582
+ {
10583
+ "epoch": 1.674986134220743,
10584
+ "grad_norm": 0.3197901248931885,
10585
+ "learning_rate": 1.3540051458600523e-05,
10586
+ "loss": 0.3861,
10587
+ "step": 3020
10588
+ },
10589
+ {
10590
+ "epoch": 1.6760953965612866,
10591
+ "grad_norm": 0.3465018570423126,
10592
+ "learning_rate": 1.3449911974577877e-05,
10593
+ "loss": 0.5036,
10594
+ "step": 3022
10595
+ },
10596
+ {
10597
+ "epoch": 1.6772046589018301,
10598
+ "grad_norm": 0.32030799984931946,
10599
+ "learning_rate": 1.3360051896646086e-05,
10600
+ "loss": 0.3244,
10601
+ "step": 3024
10602
+ },
10603
+ {
10604
+ "epoch": 1.6783139212423737,
10605
+ "grad_norm": 0.2779349088668823,
10606
+ "learning_rate": 1.3270471514896743e-05,
10607
+ "loss": 0.3362,
10608
+ "step": 3026
10609
+ },
10610
+ {
10611
+ "epoch": 1.6794231835829172,
10612
+ "grad_norm": 0.312095046043396,
10613
+ "learning_rate": 1.3181171118518465e-05,
10614
+ "loss": 0.5161,
10615
+ "step": 3028
10616
+ },
10617
+ {
10618
+ "epoch": 1.6805324459234607,
10619
+ "grad_norm": 0.2570739984512329,
10620
+ "learning_rate": 1.3092150995796115e-05,
10621
+ "loss": 0.3741,
10622
+ "step": 3030
10623
+ },
10624
+ {
10625
+ "epoch": 1.6816417082640043,
10626
+ "grad_norm": 0.27533021569252014,
10627
+ "learning_rate": 1.3003411434109647e-05,
10628
+ "loss": 0.3173,
10629
+ "step": 3032
10630
+ },
10631
+ {
10632
+ "epoch": 1.6827509706045478,
10633
+ "grad_norm": 0.2738421559333801,
10634
+ "learning_rate": 1.2914952719933371e-05,
10635
+ "loss": 0.4167,
10636
+ "step": 3034
10637
+ },
10638
+ {
10639
+ "epoch": 1.6838602329450914,
10640
+ "grad_norm": 0.21936124563217163,
10641
+ "learning_rate": 1.282677513883489e-05,
10642
+ "loss": 0.3356,
10643
+ "step": 3036
10644
+ },
10645
+ {
10646
+ "epoch": 1.6849694952856349,
10647
+ "grad_norm": 0.2971390187740326,
10648
+ "learning_rate": 1.2738878975474288e-05,
10649
+ "loss": 0.3919,
10650
+ "step": 3038
10651
+ },
10652
+ {
10653
+ "epoch": 1.6860787576261784,
10654
+ "grad_norm": 0.3661748766899109,
10655
+ "learning_rate": 1.2651264513603134e-05,
10656
+ "loss": 0.3864,
10657
+ "step": 3040
10658
+ },
10659
+ {
10660
+ "epoch": 1.687188019966722,
10661
+ "grad_norm": 0.3551200330257416,
10662
+ "learning_rate": 1.2563932036063586e-05,
10663
+ "loss": 0.3555,
10664
+ "step": 3042
10665
+ },
10666
+ {
10667
+ "epoch": 1.6882972823072657,
10668
+ "grad_norm": 0.27041590213775635,
10669
+ "learning_rate": 1.2476881824787467e-05,
10670
+ "loss": 0.295,
10671
+ "step": 3044
10672
+ },
10673
+ {
10674
+ "epoch": 1.6894065446478093,
10675
+ "grad_norm": 0.2313155084848404,
10676
+ "learning_rate": 1.2390114160795419e-05,
10677
+ "loss": 0.3177,
10678
+ "step": 3046
10679
+ },
10680
+ {
10681
+ "epoch": 1.6905158069883528,
10682
+ "grad_norm": 0.3004077970981598,
10683
+ "learning_rate": 1.2303629324195943e-05,
10684
+ "loss": 0.3845,
10685
+ "step": 3048
10686
+ },
10687
+ {
10688
+ "epoch": 1.6916250693288963,
10689
+ "grad_norm": 0.29577240347862244,
10690
+ "learning_rate": 1.2217427594184461e-05,
10691
+ "loss": 0.3376,
10692
+ "step": 3050
10693
+ },
10694
+ {
10695
+ "epoch": 1.6927343316694399,
10696
+ "grad_norm": 0.363438218832016,
10697
+ "learning_rate": 1.213150924904245e-05,
10698
+ "loss": 0.465,
10699
+ "step": 3052
10700
+ },
10701
+ {
10702
+ "epoch": 1.6938435940099834,
10703
+ "grad_norm": 0.2636345624923706,
10704
+ "learning_rate": 1.2045874566136617e-05,
10705
+ "loss": 0.2845,
10706
+ "step": 3054
10707
+ },
10708
+ {
10709
+ "epoch": 1.694952856350527,
10710
+ "grad_norm": 0.3315665125846863,
10711
+ "learning_rate": 1.1960523821917868e-05,
10712
+ "loss": 0.4179,
10713
+ "step": 3056
10714
+ },
10715
+ {
10716
+ "epoch": 1.6960621186910705,
10717
+ "grad_norm": 0.27641746401786804,
10718
+ "learning_rate": 1.1875457291920477e-05,
10719
+ "loss": 0.3542,
10720
+ "step": 3058
10721
+ },
10722
+ {
10723
+ "epoch": 1.697171381031614,
10724
+ "grad_norm": 0.39690592885017395,
10725
+ "learning_rate": 1.1790675250761263e-05,
10726
+ "loss": 0.4511,
10727
+ "step": 3060
10728
+ },
10729
+ {
10730
+ "epoch": 1.6982806433721576,
10731
+ "grad_norm": 0.1926700472831726,
10732
+ "learning_rate": 1.1706177972138599e-05,
10733
+ "loss": 0.2946,
10734
+ "step": 3062
10735
+ },
10736
+ {
10737
+ "epoch": 1.699389905712701,
10738
+ "grad_norm": 0.27746477723121643,
10739
+ "learning_rate": 1.1621965728831564e-05,
10740
+ "loss": 0.3691,
10741
+ "step": 3064
10742
+ },
10743
+ {
10744
+ "epoch": 1.7004991680532446,
10745
+ "grad_norm": 0.2863025367259979,
10746
+ "learning_rate": 1.1538038792699068e-05,
10747
+ "loss": 0.3466,
10748
+ "step": 3066
10749
+ },
10750
+ {
10751
+ "epoch": 1.7016084303937882,
10752
+ "grad_norm": 0.31509512662887573,
10753
+ "learning_rate": 1.1454397434679021e-05,
10754
+ "loss": 0.3379,
10755
+ "step": 3068
10756
+ },
10757
+ {
10758
+ "epoch": 1.7027176927343317,
10759
+ "grad_norm": 0.3157186806201935,
10760
+ "learning_rate": 1.1371041924787362e-05,
10761
+ "loss": 0.3854,
10762
+ "step": 3070
10763
+ },
10764
+ {
10765
+ "epoch": 1.7038269550748752,
10766
+ "grad_norm": 0.32956090569496155,
10767
+ "learning_rate": 1.128797253211723e-05,
10768
+ "loss": 0.3036,
10769
+ "step": 3072
10770
  }
10771
  ],
10772
  "logging_steps": 2,
 
10786
  "attributes": {}
10787
  }
10788
  },
10789
+ "total_flos": 7482417840783360.0,
10790
  "train_batch_size": 8,
10791
  "trial_name": null,
10792
  "trial_params": null