Femboyuwu2000 commited on
Commit
dd4ada4
1 Parent(s): d462c54

Training in progress, step 360, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -21,6 +21,7 @@
21
  "revision": null,
22
  "target_modules": [
23
  "word_embeddings",
 
24
  "query_key_valuelm_head"
25
  ],
26
  "task_type": "CAUSAL_LM",
 
21
  "revision": null,
22
  "target_modules": [
23
  "word_embeddings",
24
+ "dense_h_to_4h",
25
  "query_key_valuelm_head"
26
  ],
27
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e02434f63bb76768d0764fb21761fc6bc5a8c5697635718d6ef1c79a814a08e1
3
- size 8077608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55fbe6b68371476a49a5dcf47cb5f9d87019cedff7dbfeba90f3bd7c5af98334
3
+ size 13982248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71b11e37189ca2b87525395bedccb828279727662e1a1b44bc3e761c6ac638b2
3
- size 4052500
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dc0adaf7cde965efef091b643e2b3d0711b0799235ea64091a8320ceeb4047b
3
+ size 7062522
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:384b1441690cc32db0a09ece90eb10816e434402fb6deac7559104dc40fcf585
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a768f08cb42c9ce59ce3607be711941a2de1bfe32b4b4e516fea093f13486afb
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0072,
5
  "eval_steps": 500,
6
  "global_step": 360,
7
  "is_hyper_param_search": false,
@@ -10,138 +10,138 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 79.12922668457031,
14
  "learning_rate": 1e-06,
15
- "loss": 4.6771,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.0,
20
- "grad_norm": 239.7991943359375,
21
  "learning_rate": 2e-06,
22
- "loss": 4.5023,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.0,
27
- "grad_norm": 77.5164794921875,
28
  "learning_rate": 3e-06,
29
- "loss": 4.8171,
30
  "step": 60
31
  },
32
  {
33
- "epoch": 0.0,
34
- "grad_norm": 207.87796020507812,
35
  "learning_rate": 4e-06,
36
- "loss": 4.6765,
37
  "step": 80
38
  },
39
  {
40
- "epoch": 0.0,
41
- "grad_norm": 147.12461853027344,
42
  "learning_rate": 4.9999999999999996e-06,
43
- "loss": 4.7237,
44
  "step": 100
45
  },
46
  {
47
- "epoch": 0.0,
48
- "grad_norm": 115.61023712158203,
49
  "learning_rate": 6e-06,
50
- "loss": 4.4426,
51
  "step": 120
52
  },
53
  {
54
- "epoch": 0.0,
55
- "grad_norm": 43.68627166748047,
56
  "learning_rate": 7e-06,
57
- "loss": 5.0142,
58
  "step": 140
59
  },
60
  {
61
- "epoch": 0.0,
62
- "grad_norm": 37.58155059814453,
63
  "learning_rate": 8e-06,
64
- "loss": 4.8572,
65
  "step": 160
66
  },
67
  {
68
- "epoch": 0.0,
69
- "grad_norm": 86.82076263427734,
70
  "learning_rate": 9e-06,
71
- "loss": 4.6848,
72
  "step": 180
73
  },
74
  {
75
- "epoch": 0.0,
76
- "grad_norm": 289.0603332519531,
77
  "learning_rate": 9.999999999999999e-06,
78
- "loss": 4.6474,
79
  "step": 200
80
  },
81
  {
82
- "epoch": 0.0,
83
- "grad_norm": 69.93185424804688,
84
  "learning_rate": 1.1e-05,
85
- "loss": 4.9724,
86
  "step": 220
87
  },
88
  {
89
- "epoch": 0.0,
90
- "grad_norm": 98.215087890625,
91
  "learning_rate": 1.2e-05,
92
- "loss": 4.4447,
93
  "step": 240
94
  },
95
  {
96
- "epoch": 0.01,
97
- "grad_norm": 92.3516845703125,
98
  "learning_rate": 1.3000000000000001e-05,
99
- "loss": 4.9076,
100
  "step": 260
101
  },
102
  {
103
- "epoch": 0.01,
104
- "grad_norm": 150.6816864013672,
105
  "learning_rate": 1.4e-05,
106
- "loss": 4.8402,
107
  "step": 280
108
  },
109
  {
110
- "epoch": 0.01,
111
- "grad_norm": 162.9401397705078,
112
  "learning_rate": 1.5e-05,
113
- "loss": 4.9279,
114
  "step": 300
115
  },
116
  {
117
- "epoch": 0.01,
118
- "grad_norm": 38.77900695800781,
119
  "learning_rate": 1.6e-05,
120
- "loss": 4.5021,
121
  "step": 320
122
  },
123
  {
124
- "epoch": 0.01,
125
- "grad_norm": 91.0234146118164,
126
  "learning_rate": 1.7e-05,
127
- "loss": 4.5546,
128
  "step": 340
129
  },
130
  {
131
- "epoch": 0.01,
132
- "grad_norm": 78.24981689453125,
133
  "learning_rate": 1.8e-05,
134
- "loss": 4.3795,
135
  "step": 360
136
  }
137
  ],
138
  "logging_steps": 20,
139
  "max_steps": 20000,
140
  "num_input_tokens_seen": 0,
141
- "num_train_epochs": 1,
142
  "save_steps": 20,
143
- "total_flos": 127576999796736.0,
144
- "train_batch_size": 2,
145
  "trial_name": null,
146
  "trial_params": null
147
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0288,
5
  "eval_steps": 500,
6
  "global_step": 360,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 62.10089111328125,
14
  "learning_rate": 1e-06,
15
+ "loss": 4.5777,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.0,
20
+ "grad_norm": 39.39016342163086,
21
  "learning_rate": 2e-06,
22
+ "loss": 4.4077,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.0,
27
+ "grad_norm": 54.24020767211914,
28
  "learning_rate": 3e-06,
29
+ "loss": 4.4807,
30
  "step": 60
31
  },
32
  {
33
+ "epoch": 0.01,
34
+ "grad_norm": 30.161609649658203,
35
  "learning_rate": 4e-06,
36
+ "loss": 4.5756,
37
  "step": 80
38
  },
39
  {
40
+ "epoch": 0.01,
41
+ "grad_norm": 40.131675720214844,
42
  "learning_rate": 4.9999999999999996e-06,
43
+ "loss": 4.4352,
44
  "step": 100
45
  },
46
  {
47
+ "epoch": 0.01,
48
+ "grad_norm": 52.3621940612793,
49
  "learning_rate": 6e-06,
50
+ "loss": 4.5096,
51
  "step": 120
52
  },
53
  {
54
+ "epoch": 0.01,
55
+ "grad_norm": 49.86561584472656,
56
  "learning_rate": 7e-06,
57
+ "loss": 4.493,
58
  "step": 140
59
  },
60
  {
61
+ "epoch": 0.01,
62
+ "grad_norm": 20.034923553466797,
63
  "learning_rate": 8e-06,
64
+ "loss": 4.4088,
65
  "step": 160
66
  },
67
  {
68
+ "epoch": 0.01,
69
+ "grad_norm": 50.790679931640625,
70
  "learning_rate": 9e-06,
71
+ "loss": 4.4901,
72
  "step": 180
73
  },
74
  {
75
+ "epoch": 0.02,
76
+ "grad_norm": 48.5693473815918,
77
  "learning_rate": 9.999999999999999e-06,
78
+ "loss": 4.3628,
79
  "step": 200
80
  },
81
  {
82
+ "epoch": 0.02,
83
+ "grad_norm": 37.95353698730469,
84
  "learning_rate": 1.1e-05,
85
+ "loss": 4.3298,
86
  "step": 220
87
  },
88
  {
89
+ "epoch": 0.02,
90
+ "grad_norm": 35.7153434753418,
91
  "learning_rate": 1.2e-05,
92
+ "loss": 4.2839,
93
  "step": 240
94
  },
95
  {
96
+ "epoch": 0.02,
97
+ "grad_norm": 91.47773742675781,
98
  "learning_rate": 1.3000000000000001e-05,
99
+ "loss": 4.1238,
100
  "step": 260
101
  },
102
  {
103
+ "epoch": 0.02,
104
+ "grad_norm": 23.16193389892578,
105
  "learning_rate": 1.4e-05,
106
+ "loss": 4.1245,
107
  "step": 280
108
  },
109
  {
110
+ "epoch": 0.02,
111
+ "grad_norm": 28.304485321044922,
112
  "learning_rate": 1.5e-05,
113
+ "loss": 4.2198,
114
  "step": 300
115
  },
116
  {
117
+ "epoch": 0.03,
118
+ "grad_norm": 34.03230285644531,
119
  "learning_rate": 1.6e-05,
120
+ "loss": 4.0958,
121
  "step": 320
122
  },
123
  {
124
+ "epoch": 0.03,
125
+ "grad_norm": 29.786975860595703,
126
  "learning_rate": 1.7e-05,
127
+ "loss": 4.024,
128
  "step": 340
129
  },
130
  {
131
+ "epoch": 0.03,
132
+ "grad_norm": 33.04754638671875,
133
  "learning_rate": 1.8e-05,
134
+ "loss": 4.0832,
135
  "step": 360
136
  }
137
  ],
138
  "logging_steps": 20,
139
  "max_steps": 20000,
140
  "num_input_tokens_seen": 0,
141
+ "num_train_epochs": 2,
142
  "save_steps": 20,
143
+ "total_flos": 865682421055488.0,
144
+ "train_batch_size": 8,
145
  "trial_name": null,
146
  "trial_params": null
147
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32e994f25267341e613f4d352977cc7a2847de358db5ece7a60fcf21be944170
3
  size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29d6911b5aeefa0beece74e38b3ce4711e31d40f8c9b6627972f1c5a74e68732
3
  size 4984