Commit d02f2a0 (verified)
0x1202 committed
1 Parent(s): 59a68cb

Training in progress, step 150, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ad89e7ef44e3420a69273763f488d922bc681dd54b5c88ebc1350ab5ff2845dd
+ oid sha256:7647ae0627a7b725e322f0e4d5e33cd169586fc185828a0e8f595005b45fa3df
  size 2269195160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ad2e51b938e6282e3249c7a5e8e446cab9bdb4ee1ed45b9997d4d4fbada5f2dd
+ oid sha256:b9f4f0d2f5d6b2ead9512276ab4388b03851af6e42e5e40a1fdb4e34188d3049
  size 335922386
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc7c408c7066e13926b0a10035f94a1eb657992a0c39a1652dc373d4d643e9c2
+ oid sha256:c36b216c590f6eca2a00a39d4b3b2093147d8c280798056ecb0b21597d90051e
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c5d2a6c6aafc669cea03b9634666f204de949a3d45ce2f48a07e7e3eaf18c715
+ oid sha256:e70710c409284f74d525f8db5cfaccc22a8afd29416f19c595da9242ec92d936
  size 1064
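
The four checkpoint binaries above (adapter_model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt) are stored through Git LFS, so each diff only touches the three-line pointer file (spec version, SHA-256 object ID, byte size); the only field that changes when the step-150 checkpoint overwrites the step-100 one is the oid. A minimal sketch of reading such a pointer follows; the parse_lfs_pointer helper is hypothetical and not part of this repository.

# Minimal sketch: parse a Git LFS pointer of the form shown in the diffs above.
# The helper name is hypothetical; it is not part of this repository.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].split(":", 1)[1],  # strip the "sha256:" prefix
        "size": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:7647ae0627a7b725e322f0e4d5e33cd169586fc185828a0e8f595005b45fa3df
size 2269195160"""
print(parse_lfs_pointer(pointer))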
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 1.363052248954773,
- "best_model_checkpoint": "miner_id_24/checkpoint-100",
- "epoch": 0.19805656990777992,
+ "best_metric": 1.169845700263977,
+ "best_model_checkpoint": "miner_id_24/checkpoint-150",
+ "epoch": 0.29708485486166986,
  "eval_steps": 25,
- "global_step": 100,
+ "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -747,6 +747,372 @@
  "eval_samples_per_second": 6.29,
  "eval_steps_per_second": 6.29,
  "step": 100
+ },
+ {
+ "epoch": 0.2000371356068577,
+ "grad_norm": 1.788689136505127,
+ "learning_rate": 0.0001599135876488549,
+ "loss": 1.1028,
+ "step": 101
+ },
+ {
+ "epoch": 0.20201770130593552,
+ "grad_norm": 1.1090993881225586,
+ "learning_rate": 0.00015743756320098332,
+ "loss": 1.14,
+ "step": 102
+ },
+ {
+ "epoch": 0.2039982670050133,
+ "grad_norm": 0.710551917552948,
+ "learning_rate": 0.0001549595053975962,
+ "loss": 1.0624,
+ "step": 103
+ },
+ {
+ "epoch": 0.20597883270409112,
+ "grad_norm": 0.42503175139427185,
+ "learning_rate": 0.00015248009171495378,
+ "loss": 0.9895,
+ "step": 104
+ },
+ {
+ "epoch": 0.2079593984031689,
+ "grad_norm": 0.30752989649772644,
+ "learning_rate": 0.00015,
+ "loss": 0.9048,
+ "step": 105
+ },
+ {
+ "epoch": 0.20993996410224672,
+ "grad_norm": 0.28837981820106506,
+ "learning_rate": 0.00014751990828504622,
+ "loss": 0.9729,
+ "step": 106
+ },
+ {
+ "epoch": 0.2119205298013245,
+ "grad_norm": 0.28646722435951233,
+ "learning_rate": 0.00014504049460240375,
+ "loss": 0.8549,
+ "step": 107
+ },
+ {
+ "epoch": 0.21390109550040232,
+ "grad_norm": 0.2909419536590576,
+ "learning_rate": 0.00014256243679901663,
+ "loss": 0.8896,
+ "step": 108
+ },
+ {
+ "epoch": 0.2158816611994801,
+ "grad_norm": 0.3322446346282959,
+ "learning_rate": 0.00014008641235114508,
+ "loss": 0.7194,
+ "step": 109
+ },
+ {
+ "epoch": 0.2178622268985579,
+ "grad_norm": 0.2969757318496704,
+ "learning_rate": 0.00013761309817915014,
+ "loss": 0.79,
+ "step": 110
+ },
+ {
+ "epoch": 0.2198427925976357,
+ "grad_norm": 0.3453933596611023,
+ "learning_rate": 0.00013514317046243058,
+ "loss": 0.8365,
+ "step": 111
+ },
+ {
+ "epoch": 0.22182335829671349,
+ "grad_norm": 0.26184096932411194,
+ "learning_rate": 0.00013267730445456208,
+ "loss": 0.581,
+ "step": 112
+ },
+ {
+ "epoch": 0.2238039239957913,
+ "grad_norm": 0.33915719389915466,
+ "learning_rate": 0.00013021617429868963,
+ "loss": 0.7551,
+ "step": 113
+ },
+ {
+ "epoch": 0.22578448969486908,
+ "grad_norm": 0.3160857856273651,
+ "learning_rate": 0.00012776045284322368,
+ "loss": 0.7751,
+ "step": 114
+ },
+ {
+ "epoch": 0.2277650553939469,
+ "grad_norm": 0.32658034563064575,
+ "learning_rate": 0.00012531081145788987,
+ "loss": 0.8707,
+ "step": 115
+ },
+ {
+ "epoch": 0.22974562109302468,
+ "grad_norm": 0.3431919813156128,
+ "learning_rate": 0.00012286791985018355,
+ "loss": 0.7935,
+ "step": 116
+ },
+ {
+ "epoch": 0.2317261867921025,
+ "grad_norm": 0.3515626788139343,
+ "learning_rate": 0.00012043244588227796,
+ "loss": 0.8467,
+ "step": 117
+ },
+ {
+ "epoch": 0.23370675249118028,
+ "grad_norm": 0.387687623500824,
+ "learning_rate": 0.00011800505538843798,
+ "loss": 1.0536,
+ "step": 118
+ },
+ {
+ "epoch": 0.2356873181902581,
+ "grad_norm": 0.47919800877571106,
+ "learning_rate": 0.00011558641199298727,
+ "loss": 1.253,
+ "step": 119
+ },
+ {
+ "epoch": 0.23766788388933588,
+ "grad_norm": 0.460734099149704,
+ "learning_rate": 0.00011317717692888012,
+ "loss": 1.2217,
+ "step": 120
+ },
+ {
+ "epoch": 0.2396484495884137,
+ "grad_norm": 0.418082058429718,
+ "learning_rate": 0.00011077800885692702,
+ "loss": 1.2639,
+ "step": 121
+ },
+ {
+ "epoch": 0.24162901528749148,
+ "grad_norm": 0.43898528814315796,
+ "learning_rate": 0.00010838956368572334,
+ "loss": 1.2732,
+ "step": 122
+ },
+ {
+ "epoch": 0.2436095809865693,
+ "grad_norm": 0.557872474193573,
+ "learning_rate": 0.0001060124943923303,
+ "loss": 1.2141,
+ "step": 123
+ },
+ {
+ "epoch": 0.24559014668564708,
+ "grad_norm": 0.5319792628288269,
+ "learning_rate": 0.0001036474508437579,
+ "loss": 1.2522,
+ "step": 124
+ },
+ {
+ "epoch": 0.2475707123847249,
+ "grad_norm": 0.5045695304870605,
+ "learning_rate": 0.00010129507961929748,
+ "loss": 1.2974,
+ "step": 125
+ },
+ {
+ "epoch": 0.2475707123847249,
+ "eval_loss": 1.1529182195663452,
+ "eval_runtime": 7.944,
+ "eval_samples_per_second": 6.294,
+ "eval_steps_per_second": 6.294,
+ "step": 125
+ },
+ {
+ "epoch": 0.24955127808380267,
+ "grad_norm": 0.5417290925979614,
+ "learning_rate": 9.895602383375353e-05,
+ "loss": 1.212,
+ "step": 126
+ },
+ {
+ "epoch": 0.25153184378288046,
+ "grad_norm": 0.4937967360019684,
+ "learning_rate": 9.663092296162251e-05,
+ "loss": 1.2267,
+ "step": 127
+ },
+ {
+ "epoch": 0.2535124094819583,
+ "grad_norm": 0.49752992391586304,
+ "learning_rate": 9.432041266226686e-05,
+ "loss": 1.0415,
+ "step": 128
+ },
+ {
+ "epoch": 0.2554929751810361,
+ "grad_norm": 0.48785874247550964,
+ "learning_rate": 9.202512460613219e-05,
+ "loss": 1.1829,
+ "step": 129
+ },
+ {
+ "epoch": 0.25747354088011387,
+ "grad_norm": 0.4991934895515442,
+ "learning_rate": 8.97456863020546e-05,
+ "loss": 1.2282,
+ "step": 130
+ },
+ {
+ "epoch": 0.25945410657919166,
+ "grad_norm": 0.5931552648544312,
+ "learning_rate": 8.748272092570646e-05,
+ "loss": 1.2068,
+ "step": 131
+ },
+ {
+ "epoch": 0.2614346722782695,
+ "grad_norm": 0.5376507639884949,
+ "learning_rate": 8.523684714922608e-05,
+ "loss": 1.2745,
+ "step": 132
+ },
+ {
+ "epoch": 0.2634152379773473,
+ "grad_norm": 0.5740606188774109,
+ "learning_rate": 8.300867897207903e-05,
+ "loss": 0.9724,
+ "step": 133
+ },
+ {
+ "epoch": 0.26539580367642507,
+ "grad_norm": 0.552382230758667,
+ "learning_rate": 8.079882555319684e-05,
+ "loss": 1.2513,
+ "step": 134
+ },
+ {
+ "epoch": 0.26737636937550285,
+ "grad_norm": 0.4819226861000061,
+ "learning_rate": 7.860789104443896e-05,
+ "loss": 1.029,
+ "step": 135
+ },
+ {
+ "epoch": 0.2693569350745807,
+ "grad_norm": 0.5257924199104309,
+ "learning_rate": 7.643647442542382e-05,
+ "loss": 1.2768,
+ "step": 136
+ },
+ {
+ "epoch": 0.2713375007736585,
+ "grad_norm": 0.5025389194488525,
+ "learning_rate": 7.428516933977347e-05,
+ "loss": 1.1005,
+ "step": 137
+ },
+ {
+ "epoch": 0.27331806647273627,
+ "grad_norm": 0.5294015407562256,
+ "learning_rate": 7.215456393281776e-05,
+ "loss": 0.9751,
+ "step": 138
+ },
+ {
+ "epoch": 0.27529863217181405,
+ "grad_norm": 0.5134867429733276,
+ "learning_rate": 7.004524069080096e-05,
+ "loss": 0.8709,
+ "step": 139
+ },
+ {
+ "epoch": 0.2772791978708919,
+ "grad_norm": 0.6456707119941711,
+ "learning_rate": 6.795777628163599e-05,
+ "loss": 0.9567,
+ "step": 140
+ },
+ {
+ "epoch": 0.2792597635699697,
+ "grad_norm": 0.5765627026557922,
+ "learning_rate": 6.58927413972491e-05,
+ "loss": 0.6746,
+ "step": 141
+ },
+ {
+ "epoch": 0.28124032926904746,
+ "grad_norm": 0.6089555025100708,
+ "learning_rate": 6.385070059755846e-05,
+ "loss": 0.8622,
+ "step": 142
+ },
+ {
+ "epoch": 0.28322089496812525,
+ "grad_norm": 0.6544927954673767,
+ "learning_rate": 6.183221215612904e-05,
+ "loss": 0.7769,
+ "step": 143
+ },
+ {
+ "epoch": 0.2852014606672031,
+ "grad_norm": 0.5500593781471252,
+ "learning_rate": 5.983782790754623e-05,
+ "loss": 0.5781,
+ "step": 144
+ },
+ {
+ "epoch": 0.2871820263662809,
+ "grad_norm": 0.5492486357688904,
+ "learning_rate": 5.786809309654982e-05,
+ "loss": 0.4864,
+ "step": 145
+ },
+ {
+ "epoch": 0.28916259206535866,
+ "grad_norm": 0.7028548121452332,
+ "learning_rate": 5.592354622896944e-05,
+ "loss": 0.9125,
+ "step": 146
+ },
+ {
+ "epoch": 0.29114315776443644,
+ "grad_norm": 0.8846566081047058,
+ "learning_rate": 5.40047189245025e-05,
+ "loss": 1.2438,
+ "step": 147
+ },
+ {
+ "epoch": 0.2931237234635143,
+ "grad_norm": 0.9810320734977722,
+ "learning_rate": 5.211213577137469e-05,
+ "loss": 1.1933,
+ "step": 148
+ },
+ {
+ "epoch": 0.29510428916259207,
+ "grad_norm": 1.4606691598892212,
+ "learning_rate": 5.024631418292274e-05,
+ "loss": 0.839,
+ "step": 149
+ },
+ {
+ "epoch": 0.29708485486166986,
+ "grad_norm": 2.203315258026123,
+ "learning_rate": 4.840776425613886e-05,
+ "loss": 0.7995,
+ "step": 150
+ },
+ {
+ "epoch": 0.29708485486166986,
+ "eval_loss": 1.169845700263977,
+ "eval_runtime": 7.946,
+ "eval_samples_per_second": 6.292,
+ "eval_steps_per_second": 6.292,
+ "step": 150
  }
  ],
  "logging_steps": 1,
@@ -775,7 +1141,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.576511333203968e+17,
+ "total_flos": 2.364766999805952e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null