File size: 5,127 Bytes
50bb399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6531149665213268,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05539654694857354,
      "grad_norm": 3.9366531372070312,
      "learning_rate": 2.7700831024930747e-06,
      "loss": 12.1842,
      "step": 100
    },
    {
      "epoch": 0.11079309389714707,
      "grad_norm": 0.5843711495399475,
      "learning_rate": 5.540166204986149e-06,
      "loss": 5.2545,
      "step": 200
    },
    {
      "epoch": 0.1661896408457206,
      "grad_norm": 0.3186987638473511,
      "learning_rate": 8.310249307479224e-06,
      "loss": 1.5282,
      "step": 300
    },
    {
      "epoch": 0.22158618779429415,
      "grad_norm": 0.22694998979568481,
      "learning_rate": 1.1080332409972299e-05,
      "loss": 0.8633,
      "step": 400
    },
    {
      "epoch": 0.2769827347428677,
      "grad_norm": 0.4204826056957245,
      "learning_rate": 1.3850415512465375e-05,
      "loss": 0.6224,
      "step": 500
    },
    {
      "epoch": 0.3323792816914412,
      "grad_norm": 0.23341064155101776,
      "learning_rate": 1.6620498614958448e-05,
      "loss": 0.5006,
      "step": 600
    },
    {
      "epoch": 0.38777582864001475,
      "grad_norm": 0.29134589433670044,
      "learning_rate": 1.9390581717451524e-05,
      "loss": 0.4471,
      "step": 700
    },
    {
      "epoch": 0.4431723755885883,
      "grad_norm": 0.4506121277809143,
      "learning_rate": 2.2160664819944597e-05,
      "loss": 0.4142,
      "step": 800
    },
    {
      "epoch": 0.49856892253716184,
      "grad_norm": 0.46713459491729736,
      "learning_rate": 2.4930747922437674e-05,
      "loss": 0.3951,
      "step": 900
    },
    {
      "epoch": 0.5539654694857354,
      "grad_norm": 0.49562305212020874,
      "learning_rate": 2.770083102493075e-05,
      "loss": 0.3778,
      "step": 1000
    },
    {
      "epoch": 0.6093620164343089,
      "grad_norm": 0.4143618047237396,
      "learning_rate": 3.0470914127423823e-05,
      "loss": 0.374,
      "step": 1100
    },
    {
      "epoch": 0.6647585633828824,
      "grad_norm": 0.28604185581207275,
      "learning_rate": 3.3240997229916896e-05,
      "loss": 0.3631,
      "step": 1200
    },
    {
      "epoch": 0.720155110331456,
      "grad_norm": 0.3391754627227783,
      "learning_rate": 3.601108033240997e-05,
      "loss": 0.3613,
      "step": 1300
    },
    {
      "epoch": 0.7755516572800295,
      "grad_norm": 0.237514466047287,
      "learning_rate": 3.878116343490305e-05,
      "loss": 0.3551,
      "step": 1400
    },
    {
      "epoch": 0.830948204228603,
      "grad_norm": 0.2939525246620178,
      "learning_rate": 4.155124653739612e-05,
      "loss": 0.3504,
      "step": 1500
    },
    {
      "epoch": 0.8863447511771766,
      "grad_norm": 0.3228910267353058,
      "learning_rate": 4.4321329639889195e-05,
      "loss": 0.3489,
      "step": 1600
    },
    {
      "epoch": 0.9417412981257501,
      "grad_norm": 0.4955917000770569,
      "learning_rate": 4.709141274238227e-05,
      "loss": 0.346,
      "step": 1700
    },
    {
      "epoch": 0.9971378450743237,
      "grad_norm": 0.7099990248680115,
      "learning_rate": 4.986149584487535e-05,
      "loss": 0.3378,
      "step": 1800
    },
    {
      "epoch": 1.0525343920228973,
      "grad_norm": 0.39987462759017944,
      "learning_rate": 4.997864395968252e-05,
      "loss": 0.3337,
      "step": 1900
    },
    {
      "epoch": 1.1079309389714709,
      "grad_norm": 0.3917470872402191,
      "learning_rate": 4.991006183625085e-05,
      "loss": 0.3329,
      "step": 2000
    },
    {
      "epoch": 0.5486165718779146,
      "grad_norm": 0.07418235391378403,
      "learning_rate": 2.7436634439508755e-06,
      "loss": 0.3044,
      "step": 2100
    },
    {
      "epoch": 0.5747411705387676,
      "grad_norm": 0.09766950458288193,
      "learning_rate": 2.874314084139012e-06,
      "loss": 0.3074,
      "step": 2200
    },
    {
      "epoch": 0.6008657691996206,
      "grad_norm": 0.09052286297082901,
      "learning_rate": 3.0049647243271495e-06,
      "loss": 0.303,
      "step": 2300
    },
    {
      "epoch": 0.6269903678604738,
      "grad_norm": 0.08768751472234726,
      "learning_rate": 3.135615364515286e-06,
      "loss": 0.3006,
      "step": 2400
    },
    {
      "epoch": 0.6531149665213268,
      "grad_norm": 0.08635412901639938,
      "learning_rate": 3.2662660047034235e-06,
      "loss": 0.2994,
      "step": 2500
    }
  ],
  "logging_steps": 100,
  "max_steps": 191350,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3435564032e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}