kahou1234 committed on
Commit
2bf6292
1 Parent(s): a3ee8c3

Upload 11 files

Browse files
README.md CHANGED
@@ -44,7 +44,7 @@ The following hyperparameters were used during training:
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
- - num_epochs: 50.0
48
  - mixed_precision_training: Native AMP
49
 
50
  ### Training results
 
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
+ - num_epochs: 65.0
48
  - mixed_precision_training: Native AMP
49
 
50
  ### Training results
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "up_proj",
 
24
  "o_proj",
25
- "gate_proj",
26
- "k_proj",
27
  "down_proj",
28
- "v_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "k_proj",
24
+ "gate_proj",
25
  "up_proj",
26
+ "q_proj",
27
  "o_proj",
 
 
28
  "down_proj",
29
+ "v_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f4b7393e56e2f0346f390598222852eb21d40dabdca7cd2a40f958dfb77a4dc
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebb031c13017ac3ec5e54ea625c9ed18656d90880c5a70962e8f756b23ffcc40
3
  size 83945296
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 49.6,
3
- "total_flos": 1.7939255986343117e+17,
4
- "train_loss": 0.172668604437382,
5
- "train_runtime": 11274.7291,
6
- "train_samples_per_second": 2.217,
7
- "train_steps_per_second": 0.275
8
  }
 
1
  {
2
+ "epoch": 64.48,
3
+ "total_flos": 2.3325606118844006e+17,
4
+ "train_loss": 0.1495482857003993,
5
+ "train_runtime": 6882.5617,
6
+ "train_samples_per_second": 4.722,
7
+ "train_steps_per_second": 0.586
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 49.6,
3
- "total_flos": 1.7939255986343117e+17,
4
- "train_loss": 0.172668604437382,
5
- "train_runtime": 11274.7291,
6
- "train_samples_per_second": 2.217,
7
- "train_steps_per_second": 0.275
8
  }
 
1
  {
2
+ "epoch": 64.48,
3
+ "total_flos": 2.3325606118844006e+17,
4
+ "train_loss": 0.1495482857003993,
5
+ "train_runtime": 6882.5617,
6
+ "train_samples_per_second": 4.722,
7
+ "train_steps_per_second": 0.586
8
  }
trainer_log.jsonl CHANGED
@@ -1,156 +1,202 @@
1
- {"current_steps": 20, "total_steps": 3100, "loss": 2.8022, "learning_rate": 3.064516129032258e-06, "epoch": 0.32, "percentage": 0.65, "elapsed_time": "0:01:15", "remaining_time": "3:13:27", "throughput": "0.00", "total_tokens": 0}
2
- {"current_steps": 40, "total_steps": 3100, "loss": 2.6463, "learning_rate": 6.290322580645161e-06, "epoch": 0.64, "percentage": 1.29, "elapsed_time": "0:02:28", "remaining_time": "3:09:21", "throughput": "0.00", "total_tokens": 0}
3
- {"current_steps": 60, "total_steps": 3100, "loss": 2.4026, "learning_rate": 9.516129032258064e-06, "epoch": 0.96, "percentage": 1.94, "elapsed_time": "0:03:42", "remaining_time": "3:07:38", "throughput": "0.00", "total_tokens": 0}
4
- {"current_steps": 80, "total_steps": 3100, "loss": 1.9639, "learning_rate": 1.2741935483870968e-05, "epoch": 1.28, "percentage": 2.58, "elapsed_time": "0:04:54", "remaining_time": "3:05:17", "throughput": "0.00", "total_tokens": 0}
5
- {"current_steps": 100, "total_steps": 3100, "loss": 1.7115, "learning_rate": 1.596774193548387e-05, "epoch": 1.6, "percentage": 3.23, "elapsed_time": "0:06:08", "remaining_time": "3:04:08", "throughput": "0.00", "total_tokens": 0}
6
- {"current_steps": 120, "total_steps": 3100, "loss": 1.7534, "learning_rate": 1.9193548387096774e-05, "epoch": 1.92, "percentage": 3.87, "elapsed_time": "0:07:23", "remaining_time": "3:03:30", "throughput": "0.00", "total_tokens": 0}
7
- {"current_steps": 140, "total_steps": 3100, "loss": 1.3422, "learning_rate": 2.2419354838709678e-05, "epoch": 2.24, "percentage": 4.52, "elapsed_time": "0:08:37", "remaining_time": "3:02:14", "throughput": "0.00", "total_tokens": 0}
8
- {"current_steps": 160, "total_steps": 3100, "loss": 1.1582, "learning_rate": 2.5645161290322582e-05, "epoch": 2.56, "percentage": 5.16, "elapsed_time": "0:09:48", "remaining_time": "3:00:17", "throughput": "0.00", "total_tokens": 0}
9
- {"current_steps": 180, "total_steps": 3100, "loss": 1.1084, "learning_rate": 2.8870967741935483e-05, "epoch": 2.88, "percentage": 5.81, "elapsed_time": "0:10:59", "remaining_time": "2:58:23", "throughput": "0.00", "total_tokens": 0}
10
- {"current_steps": 200, "total_steps": 3100, "loss": 0.844, "learning_rate": 3.2096774193548393e-05, "epoch": 3.2, "percentage": 6.45, "elapsed_time": "0:12:12", "remaining_time": "2:56:57", "throughput": "0.00", "total_tokens": 0}
11
- {"current_steps": 220, "total_steps": 3100, "loss": 0.6051, "learning_rate": 3.532258064516129e-05, "epoch": 3.52, "percentage": 7.1, "elapsed_time": "0:13:22", "remaining_time": "2:55:04", "throughput": "0.00", "total_tokens": 0}
12
- {"current_steps": 240, "total_steps": 3100, "loss": 0.7587, "learning_rate": 3.8548387096774195e-05, "epoch": 3.84, "percentage": 7.74, "elapsed_time": "0:14:34", "remaining_time": "2:53:47", "throughput": "0.00", "total_tokens": 0}
13
- {"current_steps": 260, "total_steps": 3100, "loss": 0.537, "learning_rate": 4.17741935483871e-05, "epoch": 4.16, "percentage": 8.39, "elapsed_time": "0:15:46", "remaining_time": "2:52:23", "throughput": "0.00", "total_tokens": 0}
14
- {"current_steps": 280, "total_steps": 3100, "loss": 0.4063, "learning_rate": 4.5e-05, "epoch": 4.48, "percentage": 9.03, "elapsed_time": "0:16:59", "remaining_time": "2:51:12", "throughput": "0.00", "total_tokens": 0}
15
- {"current_steps": 300, "total_steps": 3100, "loss": 0.4817, "learning_rate": 4.822580645161291e-05, "epoch": 4.8, "percentage": 9.68, "elapsed_time": "0:18:11", "remaining_time": "2:49:50", "throughput": "0.00", "total_tokens": 0}
16
- {"current_steps": 320, "total_steps": 3100, "loss": 0.3556, "learning_rate": 4.9998716243505096e-05, "epoch": 5.12, "percentage": 10.32, "elapsed_time": "0:19:23", "remaining_time": "2:48:28", "throughput": "0.00", "total_tokens": 0}
17
- {"current_steps": 340, "total_steps": 3100, "loss": 0.2663, "learning_rate": 4.9986672191133314e-05, "epoch": 5.44, "percentage": 10.97, "elapsed_time": "0:20:35", "remaining_time": "2:47:11", "throughput": "0.00", "total_tokens": 0}
18
- {"current_steps": 360, "total_steps": 3100, "loss": 0.3273, "learning_rate": 4.9961956248762694e-05, "epoch": 5.76, "percentage": 11.61, "elapsed_time": "0:21:46", "remaining_time": "2:45:41", "throughput": "0.00", "total_tokens": 0}
19
- {"current_steps": 380, "total_steps": 3100, "loss": 0.2174, "learning_rate": 4.992458095098368e-05, "epoch": 6.08, "percentage": 12.26, "elapsed_time": "0:22:57", "remaining_time": "2:44:20", "throughput": "0.00", "total_tokens": 0}
20
- {"current_steps": 400, "total_steps": 3100, "loss": 0.1886, "learning_rate": 4.9874565252527765e-05, "epoch": 6.4, "percentage": 12.9, "elapsed_time": "0:24:11", "remaining_time": "2:43:20", "throughput": "0.00", "total_tokens": 0}
21
- {"current_steps": 420, "total_steps": 3100, "loss": 0.2278, "learning_rate": 4.981193451865465e-05, "epoch": 6.72, "percentage": 13.55, "elapsed_time": "0:25:24", "remaining_time": "2:42:05", "throughput": "0.00", "total_tokens": 0}
22
- {"current_steps": 440, "total_steps": 3100, "loss": 0.168, "learning_rate": 4.9736720512288334e-05, "epoch": 7.04, "percentage": 14.19, "elapsed_time": "0:26:37", "remaining_time": "2:40:54", "throughput": "0.00", "total_tokens": 0}
23
- {"current_steps": 460, "total_steps": 3100, "loss": 0.1227, "learning_rate": 4.964896137790873e-05, "epoch": 7.36, "percentage": 14.84, "elapsed_time": "0:27:47", "remaining_time": "2:39:31", "throughput": "0.00", "total_tokens": 0}
24
- {"current_steps": 480, "total_steps": 3100, "loss": 0.1261, "learning_rate": 4.954870162220679e-05, "epoch": 7.68, "percentage": 15.48, "elapsed_time": "0:28:59", "remaining_time": "2:38:15", "throughput": "0.00", "total_tokens": 0}
25
- {"current_steps": 500, "total_steps": 3100, "loss": 0.1167, "learning_rate": 4.943599209151314e-05, "epoch": 8.0, "percentage": 16.13, "elapsed_time": "0:30:11", "remaining_time": "2:36:58", "throughput": "0.00", "total_tokens": 0}
26
- {"current_steps": 520, "total_steps": 3100, "loss": 0.1049, "learning_rate": 4.931088994601157e-05, "epoch": 8.32, "percentage": 16.77, "elapsed_time": "0:31:20", "remaining_time": "2:35:32", "throughput": "0.00", "total_tokens": 0}
27
- {"current_steps": 540, "total_steps": 3100, "loss": 0.1016, "learning_rate": 4.917345863075048e-05, "epoch": 8.64, "percentage": 17.42, "elapsed_time": "0:32:32", "remaining_time": "2:34:14", "throughput": "0.00", "total_tokens": 0}
28
- {"current_steps": 560, "total_steps": 3100, "loss": 0.1157, "learning_rate": 4.902376784346697e-05, "epoch": 8.96, "percentage": 18.06, "elapsed_time": "0:33:48", "remaining_time": "2:33:19", "throughput": "0.00", "total_tokens": 0}
29
- {"current_steps": 580, "total_steps": 3100, "loss": 0.073, "learning_rate": 4.886189349923992e-05, "epoch": 9.28, "percentage": 18.71, "elapsed_time": "0:35:01", "remaining_time": "2:32:11", "throughput": "0.00", "total_tokens": 0}
30
- {"current_steps": 600, "total_steps": 3100, "loss": 0.0908, "learning_rate": 4.868791769198995e-05, "epoch": 9.6, "percentage": 19.35, "elapsed_time": "0:36:13", "remaining_time": "2:30:56", "throughput": "0.00", "total_tokens": 0}
31
- {"current_steps": 620, "total_steps": 3100, "loss": 0.0557, "learning_rate": 4.8501928652845854e-05, "epoch": 9.92, "percentage": 20.0, "elapsed_time": "0:37:22", "remaining_time": "2:29:31", "throughput": "0.00", "total_tokens": 0}
32
- {"current_steps": 640, "total_steps": 3100, "loss": 0.0779, "learning_rate": 4.83040207053985e-05, "epoch": 10.24, "percentage": 20.65, "elapsed_time": "0:38:33", "remaining_time": "2:28:12", "throughput": "0.00", "total_tokens": 0}
33
- {"current_steps": 660, "total_steps": 3100, "loss": 0.048, "learning_rate": 4.809429421786502e-05, "epoch": 10.56, "percentage": 21.29, "elapsed_time": "0:39:43", "remaining_time": "2:26:52", "throughput": "0.00", "total_tokens": 0}
34
- {"current_steps": 680, "total_steps": 3100, "loss": 0.0747, "learning_rate": 4.787285555218748e-05, "epoch": 10.88, "percentage": 21.94, "elapsed_time": "0:40:54", "remaining_time": "2:25:35", "throughput": "0.00", "total_tokens": 0}
35
- {"current_steps": 700, "total_steps": 3100, "loss": 0.0629, "learning_rate": 4.763981701009184e-05, "epoch": 11.2, "percentage": 22.58, "elapsed_time": "0:42:06", "remaining_time": "2:24:20", "throughput": "0.00", "total_tokens": 0}
36
- {"current_steps": 720, "total_steps": 3100, "loss": 0.051, "learning_rate": 4.739529677613456e-05, "epoch": 11.52, "percentage": 23.23, "elapsed_time": "0:43:21", "remaining_time": "2:23:18", "throughput": "0.00", "total_tokens": 0}
37
- {"current_steps": 740, "total_steps": 3100, "loss": 0.0699, "learning_rate": 4.713941885776586e-05, "epoch": 11.84, "percentage": 23.87, "elapsed_time": "0:44:34", "remaining_time": "2:22:10", "throughput": "0.00", "total_tokens": 0}
38
- {"current_steps": 760, "total_steps": 3100, "loss": 0.0526, "learning_rate": 4.687231302243975e-05, "epoch": 12.16, "percentage": 24.52, "elapsed_time": "0:45:46", "remaining_time": "2:20:54", "throughput": "0.00", "total_tokens": 0}
39
- {"current_steps": 780, "total_steps": 3100, "loss": 0.0412, "learning_rate": 4.659411473180304e-05, "epoch": 12.48, "percentage": 25.16, "elapsed_time": "0:46:59", "remaining_time": "2:19:45", "throughput": "0.00", "total_tokens": 0}
40
- {"current_steps": 800, "total_steps": 3100, "loss": 0.0495, "learning_rate": 4.6304965072996495e-05, "epoch": 12.8, "percentage": 25.81, "elapsed_time": "0:48:10", "remaining_time": "2:18:29", "throughput": "0.00", "total_tokens": 0}
41
- {"current_steps": 820, "total_steps": 3100, "loss": 0.063, "learning_rate": 4.6005010687103076e-05, "epoch": 13.12, "percentage": 26.45, "elapsed_time": "0:49:22", "remaining_time": "2:17:17", "throughput": "0.00", "total_tokens": 0}
42
- {"current_steps": 840, "total_steps": 3100, "loss": 0.0425, "learning_rate": 4.569440369477951e-05, "epoch": 13.44, "percentage": 27.1, "elapsed_time": "0:50:34", "remaining_time": "2:16:05", "throughput": "0.00", "total_tokens": 0}
43
- {"current_steps": 860, "total_steps": 3100, "loss": 0.0451, "learning_rate": 4.5373301619108854e-05, "epoch": 13.76, "percentage": 27.74, "elapsed_time": "0:51:47", "remaining_time": "2:14:54", "throughput": "0.00", "total_tokens": 0}
44
- {"current_steps": 880, "total_steps": 3100, "loss": 0.0445, "learning_rate": 4.5041867305713384e-05, "epoch": 14.08, "percentage": 28.39, "elapsed_time": "0:53:00", "remaining_time": "2:13:44", "throughput": "0.00", "total_tokens": 0}
45
- {"current_steps": 900, "total_steps": 3100, "loss": 0.0214, "learning_rate": 4.4700268840168045e-05, "epoch": 14.4, "percentage": 29.03, "elapsed_time": "0:54:14", "remaining_time": "2:12:36", "throughput": "0.00", "total_tokens": 0}
46
- {"current_steps": 920, "total_steps": 3100, "loss": 0.0552, "learning_rate": 4.4348679462756556e-05, "epoch": 14.72, "percentage": 29.68, "elapsed_time": "0:55:29", "remaining_time": "2:11:28", "throughput": "0.00", "total_tokens": 0}
47
- {"current_steps": 940, "total_steps": 3100, "loss": 0.0524, "learning_rate": 4.398727748061324e-05, "epoch": 15.04, "percentage": 30.32, "elapsed_time": "0:56:42", "remaining_time": "2:10:19", "throughput": "0.00", "total_tokens": 0}
48
- {"current_steps": 960, "total_steps": 3100, "loss": 0.0318, "learning_rate": 4.361624617729536e-05, "epoch": 15.36, "percentage": 30.97, "elapsed_time": "0:57:55", "remaining_time": "2:09:07", "throughput": "0.00", "total_tokens": 0}
49
- {"current_steps": 980, "total_steps": 3100, "loss": 0.0347, "learning_rate": 4.323577371983155e-05, "epoch": 15.68, "percentage": 31.61, "elapsed_time": "0:59:10", "remaining_time": "2:08:01", "throughput": "0.00", "total_tokens": 0}
50
- {"current_steps": 1000, "total_steps": 3100, "loss": 0.0541, "learning_rate": 4.28460530632937e-05, "epoch": 16.0, "percentage": 32.26, "elapsed_time": "1:00:25", "remaining_time": "2:06:53", "throughput": "0.00", "total_tokens": 0}
51
- {"current_steps": 1020, "total_steps": 3100, "loss": 0.0327, "learning_rate": 4.2447281852940525e-05, "epoch": 16.32, "percentage": 32.9, "elapsed_time": "1:01:40", "remaining_time": "2:05:45", "throughput": "0.00", "total_tokens": 0}
52
- {"current_steps": 1040, "total_steps": 3100, "loss": 0.0251, "learning_rate": 4.203966232398261e-05, "epoch": 16.64, "percentage": 33.55, "elapsed_time": "1:02:52", "remaining_time": "2:04:33", "throughput": "0.00", "total_tokens": 0}
53
- {"current_steps": 1060, "total_steps": 3100, "loss": 0.0451, "learning_rate": 4.162340119901961e-05, "epoch": 16.96, "percentage": 34.19, "elapsed_time": "1:04:05", "remaining_time": "2:03:20", "throughput": "0.00", "total_tokens": 0}
54
- {"current_steps": 1080, "total_steps": 3100, "loss": 0.0272, "learning_rate": 4.1198709583201754e-05, "epoch": 17.28, "percentage": 34.84, "elapsed_time": "1:05:16", "remaining_time": "2:02:05", "throughput": "0.00", "total_tokens": 0}
55
- {"current_steps": 1100, "total_steps": 3100, "loss": 0.0517, "learning_rate": 4.0765802857168687e-05, "epoch": 17.6, "percentage": 35.48, "elapsed_time": "1:06:29", "remaining_time": "2:00:54", "throughput": "0.00", "total_tokens": 0}
56
- {"current_steps": 1120, "total_steps": 3100, "loss": 0.0286, "learning_rate": 4.0324900567820046e-05, "epoch": 17.92, "percentage": 36.13, "elapsed_time": "1:07:45", "remaining_time": "1:59:46", "throughput": "0.00", "total_tokens": 0}
57
- {"current_steps": 1140, "total_steps": 3100, "loss": 0.0258, "learning_rate": 3.987622631697316e-05, "epoch": 18.24, "percentage": 36.77, "elapsed_time": "1:08:59", "remaining_time": "1:58:37", "throughput": "0.00", "total_tokens": 0}
58
- {"current_steps": 1160, "total_steps": 3100, "loss": 0.0289, "learning_rate": 3.942000764796427e-05, "epoch": 18.56, "percentage": 37.42, "elapsed_time": "1:10:14", "remaining_time": "1:57:29", "throughput": "0.00", "total_tokens": 0}
59
- {"current_steps": 1180, "total_steps": 3100, "loss": 0.0457, "learning_rate": 3.895647593025088e-05, "epoch": 18.88, "percentage": 38.06, "elapsed_time": "1:11:29", "remaining_time": "1:56:19", "throughput": "0.00", "total_tokens": 0}
60
- {"current_steps": 1200, "total_steps": 3100, "loss": 0.0316, "learning_rate": 3.8485866242073584e-05, "epoch": 19.2, "percentage": 38.71, "elapsed_time": "1:12:42", "remaining_time": "1:55:07", "throughput": "0.00", "total_tokens": 0}
61
- {"current_steps": 1220, "total_steps": 3100, "loss": 0.0326, "learning_rate": 3.80084172512372e-05, "epoch": 19.52, "percentage": 39.35, "elapsed_time": "1:13:55", "remaining_time": "1:53:55", "throughput": "0.00", "total_tokens": 0}
62
- {"current_steps": 1240, "total_steps": 3100, "loss": 0.0238, "learning_rate": 3.7524371094071266e-05, "epoch": 19.84, "percentage": 40.0, "elapsed_time": "1:15:07", "remaining_time": "1:52:41", "throughput": "0.00", "total_tokens": 0}
63
- {"current_steps": 1260, "total_steps": 3100, "loss": 0.0286, "learning_rate": 3.703397325263162e-05, "epoch": 20.16, "percentage": 40.65, "elapsed_time": "1:16:19", "remaining_time": "1:51:27", "throughput": "0.00", "total_tokens": 0}
64
- {"current_steps": 1280, "total_steps": 3100, "loss": 0.0294, "learning_rate": 3.653747243020515e-05, "epoch": 20.48, "percentage": 41.29, "elapsed_time": "1:17:32", "remaining_time": "1:50:15", "throughput": "0.00", "total_tokens": 0}
65
- {"current_steps": 1300, "total_steps": 3100, "loss": 0.0364, "learning_rate": 3.603512042518093e-05, "epoch": 20.8, "percentage": 41.94, "elapsed_time": "1:18:45", "remaining_time": "1:49:02", "throughput": "0.00", "total_tokens": 0}
66
- {"current_steps": 1320, "total_steps": 3100, "loss": 0.0265, "learning_rate": 3.552717200335171e-05, "epoch": 21.12, "percentage": 42.58, "elapsed_time": "1:19:57", "remaining_time": "1:47:49", "throughput": "0.00", "total_tokens": 0}
67
- {"current_steps": 1340, "total_steps": 3100, "loss": 0.0319, "learning_rate": 3.501388476871039e-05, "epoch": 21.44, "percentage": 43.23, "elapsed_time": "1:21:08", "remaining_time": "1:46:34", "throughput": "0.00", "total_tokens": 0}
68
- {"current_steps": 1360, "total_steps": 3100, "loss": 0.0137, "learning_rate": 3.449551903280729e-05, "epoch": 21.76, "percentage": 43.87, "elapsed_time": "1:22:19", "remaining_time": "1:45:19", "throughput": "0.00", "total_tokens": 0}
69
- {"current_steps": 1380, "total_steps": 3100, "loss": 0.0416, "learning_rate": 3.397233768273415e-05, "epoch": 22.08, "percentage": 44.52, "elapsed_time": "1:23:30", "remaining_time": "1:44:05", "throughput": "0.00", "total_tokens": 0}
70
- {"current_steps": 1400, "total_steps": 3100, "loss": 0.0179, "learning_rate": 3.344460604780202e-05, "epoch": 22.4, "percentage": 45.16, "elapsed_time": "1:24:42", "remaining_time": "1:42:51", "throughput": "0.00", "total_tokens": 0}
71
- {"current_steps": 1420, "total_steps": 3100, "loss": 0.0276, "learning_rate": 3.291259176498052e-05, "epoch": 22.72, "percentage": 45.81, "elapsed_time": "1:25:52", "remaining_time": "1:41:35", "throughput": "0.00", "total_tokens": 0}
72
- {"current_steps": 1440, "total_steps": 3100, "loss": 0.0352, "learning_rate": 3.237656464316693e-05, "epoch": 23.04, "percentage": 46.45, "elapsed_time": "1:27:01", "remaining_time": "1:40:19", "throughput": "0.00", "total_tokens": 0}
73
- {"current_steps": 1460, "total_steps": 3100, "loss": 0.0212, "learning_rate": 3.183679652635357e-05, "epoch": 23.36, "percentage": 47.1, "elapsed_time": "1:28:11", "remaining_time": "1:39:04", "throughput": "0.00", "total_tokens": 0}
74
- {"current_steps": 1480, "total_steps": 3100, "loss": 0.0338, "learning_rate": 3.129356115576332e-05, "epoch": 23.68, "percentage": 47.74, "elapsed_time": "1:29:24", "remaining_time": "1:37:51", "throughput": "0.00", "total_tokens": 0}
75
- {"current_steps": 1500, "total_steps": 3100, "loss": 0.0295, "learning_rate": 3.074713403102284e-05, "epoch": 24.0, "percentage": 48.39, "elapsed_time": "1:30:35", "remaining_time": "1:36:38", "throughput": "0.00", "total_tokens": 0}
76
- {"current_steps": 1520, "total_steps": 3100, "loss": 0.0185, "learning_rate": 3.0197792270443982e-05, "epoch": 24.32, "percentage": 49.03, "elapsed_time": "1:31:46", "remaining_time": "1:35:24", "throughput": "0.00", "total_tokens": 0}
77
- {"current_steps": 1540, "total_steps": 3100, "loss": 0.0328, "learning_rate": 2.9645814470484452e-05, "epoch": 24.64, "percentage": 49.68, "elapsed_time": "1:32:59", "remaining_time": "1:34:11", "throughput": "0.00", "total_tokens": 0}
78
- {"current_steps": 1560, "total_steps": 3100, "loss": 0.025, "learning_rate": 2.9091480564458666e-05, "epoch": 24.96, "percentage": 50.32, "elapsed_time": "1:34:12", "remaining_time": "1:32:59", "throughput": "0.00", "total_tokens": 0}
79
- {"current_steps": 1580, "total_steps": 3100, "loss": 0.0294, "learning_rate": 2.8535071680570734e-05, "epoch": 25.28, "percentage": 50.97, "elapsed_time": "1:35:23", "remaining_time": "1:31:45", "throughput": "0.00", "total_tokens": 0}
80
- {"current_steps": 1600, "total_steps": 3100, "loss": 0.0282, "learning_rate": 2.7976869999341426e-05, "epoch": 25.6, "percentage": 51.61, "elapsed_time": "1:36:38", "remaining_time": "1:30:35", "throughput": "0.00", "total_tokens": 0}
81
- {"current_steps": 1620, "total_steps": 3100, "loss": 0.0294, "learning_rate": 2.741715861050143e-05, "epoch": 25.92, "percentage": 52.26, "elapsed_time": "1:37:51", "remaining_time": "1:29:23", "throughput": "0.00", "total_tokens": 0}
82
- {"current_steps": 1640, "total_steps": 3100, "loss": 0.0354, "learning_rate": 2.685622136942359e-05, "epoch": 26.24, "percentage": 52.9, "elapsed_time": "1:39:04", "remaining_time": "1:28:11", "throughput": "0.00", "total_tokens": 0}
83
- {"current_steps": 1660, "total_steps": 3100, "loss": 0.0162, "learning_rate": 2.629434275316673e-05, "epoch": 26.56, "percentage": 53.55, "elapsed_time": "1:40:19", "remaining_time": "1:27:01", "throughput": "0.00", "total_tokens": 0}
84
- {"current_steps": 1680, "total_steps": 3100, "loss": 0.0205, "learning_rate": 2.573180771620432e-05, "epoch": 26.88, "percentage": 54.19, "elapsed_time": "1:41:32", "remaining_time": "1:25:49", "throughput": "0.00", "total_tokens": 0}
85
- {"current_steps": 1700, "total_steps": 3100, "loss": 0.0129, "learning_rate": 2.516890154591095e-05, "epoch": 27.2, "percentage": 54.84, "elapsed_time": "1:42:47", "remaining_time": "1:24:39", "throughput": "0.00", "total_tokens": 0}
86
- {"current_steps": 1720, "total_steps": 3100, "loss": 0.0333, "learning_rate": 2.4605909717879964e-05, "epoch": 27.52, "percentage": 55.48, "elapsed_time": "1:44:01", "remaining_time": "1:23:27", "throughput": "0.00", "total_tokens": 0}
87
- {"current_steps": 1740, "total_steps": 3100, "loss": 0.0261, "learning_rate": 2.4043117751145694e-05, "epoch": 27.84, "percentage": 56.13, "elapsed_time": "1:45:14", "remaining_time": "1:22:15", "throughput": "0.00", "total_tokens": 0}
88
- {"current_steps": 1760, "total_steps": 3100, "loss": 0.0418, "learning_rate": 2.34808110633836e-05, "epoch": 28.16, "percentage": 56.77, "elapsed_time": "1:46:27", "remaining_time": "1:21:03", "throughput": "0.00", "total_tokens": 0}
89
- {"current_steps": 1780, "total_steps": 3100, "loss": 0.0111, "learning_rate": 2.291927482616191e-05, "epoch": 28.48, "percentage": 57.42, "elapsed_time": "1:47:39", "remaining_time": "1:19:50", "throughput": "0.00", "total_tokens": 0}
90
- {"current_steps": 1800, "total_steps": 3100, "loss": 0.0263, "learning_rate": 2.235879382031794e-05, "epoch": 28.8, "percentage": 58.06, "elapsed_time": "1:48:52", "remaining_time": "1:18:38", "throughput": "0.00", "total_tokens": 0}
91
- {"current_steps": 1820, "total_steps": 3100, "loss": 0.0299, "learning_rate": 2.179965229153265e-05, "epoch": 29.12, "percentage": 58.71, "elapsed_time": "1:50:05", "remaining_time": "1:17:25", "throughput": "0.00", "total_tokens": 0}
92
- {"current_steps": 1840, "total_steps": 3100, "loss": 0.0267, "learning_rate": 2.1242133806176667e-05, "epoch": 29.44, "percentage": 59.35, "elapsed_time": "1:51:16", "remaining_time": "1:16:12", "throughput": "0.00", "total_tokens": 0}
93
- {"current_steps": 1860, "total_steps": 3100, "loss": 0.0204, "learning_rate": 2.0686521107500638e-05, "epoch": 29.76, "percentage": 60.0, "elapsed_time": "1:52:29", "remaining_time": "1:14:59", "throughput": "0.00", "total_tokens": 0}
94
- {"current_steps": 1880, "total_steps": 3100, "loss": 0.0308, "learning_rate": 2.0133095972243233e-05, "epoch": 30.08, "percentage": 60.65, "elapsed_time": "1:53:42", "remaining_time": "1:13:47", "throughput": "0.00", "total_tokens": 0}
95
- {"current_steps": 1900, "total_steps": 3100, "loss": 0.024, "learning_rate": 1.9582139067729117e-05, "epoch": 30.4, "percentage": 61.29, "elapsed_time": "1:54:52", "remaining_time": "1:12:33", "throughput": "0.00", "total_tokens": 0}
96
- {"current_steps": 1920, "total_steps": 3100, "loss": 0.0113, "learning_rate": 1.90339298095297e-05, "epoch": 30.72, "percentage": 61.94, "elapsed_time": "1:56:03", "remaining_time": "1:11:19", "throughput": "0.00", "total_tokens": 0}
97
- {"current_steps": 1940, "total_steps": 3100, "loss": 0.0301, "learning_rate": 1.8488746219758674e-05, "epoch": 31.04, "percentage": 62.58, "elapsed_time": "1:57:14", "remaining_time": "1:10:06", "throughput": "0.00", "total_tokens": 0}
98
- {"current_steps": 1960, "total_steps": 3100, "loss": 0.0293, "learning_rate": 1.7946864786074165e-05, "epoch": 31.36, "percentage": 63.23, "elapsed_time": "1:58:26", "remaining_time": "1:08:53", "throughput": "0.00", "total_tokens": 0}
99
- {"current_steps": 1980, "total_steps": 3100, "loss": 0.0242, "learning_rate": 1.740856032145917e-05, "epoch": 31.68, "percentage": 63.87, "elapsed_time": "1:59:35", "remaining_time": "1:07:39", "throughput": "0.00", "total_tokens": 0}
100
- {"current_steps": 2000, "total_steps": 3100, "loss": 0.022, "learning_rate": 1.6874105824851267e-05, "epoch": 32.0, "percentage": 64.52, "elapsed_time": "2:00:48", "remaining_time": "1:06:26", "throughput": "0.00", "total_tokens": 0}
101
- {"current_steps": 2020, "total_steps": 3100, "loss": 0.0264, "learning_rate": 1.634377234269226e-05, "epoch": 32.32, "percentage": 65.16, "elapsed_time": "2:02:02", "remaining_time": "1:05:15", "throughput": "0.00", "total_tokens": 0}
102
- {"current_steps": 2040, "total_steps": 3100, "loss": 0.0155, "learning_rate": 1.5817828831468144e-05, "epoch": 32.64, "percentage": 65.81, "elapsed_time": "2:03:17", "remaining_time": "1:04:03", "throughput": "0.00", "total_tokens": 0}
103
- {"current_steps": 2060, "total_steps": 3100, "loss": 0.0208, "learning_rate": 1.5296542021308825e-05, "epoch": 32.96, "percentage": 66.45, "elapsed_time": "2:04:32", "remaining_time": "1:02:52", "throughput": "0.00", "total_tokens": 0}
104
- {"current_steps": 2080, "total_steps": 3100, "loss": 0.0264, "learning_rate": 1.478017628071706e-05, "epoch": 33.28, "percentage": 67.1, "elapsed_time": "2:05:48", "remaining_time": "1:01:41", "throughput": "0.00", "total_tokens": 0}
105
- {"current_steps": 2100, "total_steps": 3100, "loss": 0.0174, "learning_rate": 1.4268993482495055e-05, "epoch": 33.6, "percentage": 67.74, "elapsed_time": "2:07:02", "remaining_time": "1:00:29", "throughput": "0.00", "total_tokens": 0}
106
- {"current_steps": 2120, "total_steps": 3100, "loss": 0.0214, "learning_rate": 1.3763252870936649e-05, "epoch": 33.92, "percentage": 68.39, "elapsed_time": "2:08:20", "remaining_time": "0:59:19", "throughput": "0.00", "total_tokens": 0}
107
- {"current_steps": 2140, "total_steps": 3100, "loss": 0.0306, "learning_rate": 1.3263210930352737e-05, "epoch": 34.24, "percentage": 69.03, "elapsed_time": "2:09:32", "remaining_time": "0:58:06", "throughput": "0.00", "total_tokens": 0}
108
- {"current_steps": 2160, "total_steps": 3100, "loss": 0.0146, "learning_rate": 1.2769121254996159e-05, "epoch": 34.56, "percentage": 69.68, "elapsed_time": "2:10:44", "remaining_time": "0:56:54", "throughput": "0.00", "total_tokens": 0}
109
- {"current_steps": 2180, "total_steps": 3100, "loss": 0.0254, "learning_rate": 1.228123442045249e-05, "epoch": 34.88, "percentage": 70.32, "elapsed_time": "2:12:00", "remaining_time": "0:55:42", "throughput": "0.00", "total_tokens": 0}
110
- {"current_steps": 2200, "total_steps": 3100, "loss": 0.0176, "learning_rate": 1.1799797856561606e-05, "epoch": 35.2, "percentage": 70.97, "elapsed_time": "2:13:13", "remaining_time": "0:54:30", "throughput": "0.00", "total_tokens": 0}
111
- {"current_steps": 2220, "total_steps": 3100, "loss": 0.0205, "learning_rate": 1.1325055721934637e-05, "epoch": 35.52, "percentage": 71.61, "elapsed_time": "2:14:27", "remaining_time": "0:53:17", "throughput": "0.00", "total_tokens": 0}
112
- {"current_steps": 2240, "total_steps": 3100, "loss": 0.0153, "learning_rate": 1.0857248780129928e-05, "epoch": 35.84, "percentage": 72.26, "elapsed_time": "2:15:41", "remaining_time": "0:52:05", "throughput": "0.00", "total_tokens": 0}
113
- {"current_steps": 2260, "total_steps": 3100, "loss": 0.0251, "learning_rate": 1.0396614277550752e-05, "epoch": 36.16, "percentage": 72.9, "elapsed_time": "2:16:56", "remaining_time": "0:50:53", "throughput": "0.00", "total_tokens": 0}
114
- {"current_steps": 2280, "total_steps": 3100, "loss": 0.0224, "learning_rate": 9.943385823126775e-06, "epoch": 36.48, "percentage": 73.55, "elapsed_time": "2:18:09", "remaining_time": "0:49:41", "throughput": "0.00", "total_tokens": 0}
115
- {"current_steps": 2300, "total_steps": 3100, "loss": 0.0219, "learning_rate": 9.497793269840211e-06, "epoch": 36.8, "percentage": 74.19, "elapsed_time": "2:19:24", "remaining_time": "0:48:29", "throughput": "0.00", "total_tokens": 0}
116
- {"current_steps": 2320, "total_steps": 3100, "loss": 0.021, "learning_rate": 9.06006259815683e-06, "epoch": 37.12, "percentage": 74.84, "elapsed_time": "2:20:38", "remaining_time": "0:47:17", "throughput": "0.00", "total_tokens": 0}
117
- {"current_steps": 2340, "total_steps": 3100, "loss": 0.0236, "learning_rate": 8.630415801420835e-06, "epoch": 37.44, "percentage": 75.48, "elapsed_time": "2:21:52", "remaining_time": "0:46:04", "throughput": "0.00", "total_tokens": 0}
118
- {"current_steps": 2360, "total_steps": 3100, "loss": 0.0216, "learning_rate": 8.209070773271894e-06, "epoch": 37.76, "percentage": 76.13, "elapsed_time": "2:23:05", "remaining_time": "0:44:52", "throughput": "0.00", "total_tokens": 0}
119
- {"current_steps": 2380, "total_steps": 3100, "loss": 0.0268, "learning_rate": 7.79624119714121e-06, "epoch": 38.08, "percentage": 76.77, "elapsed_time": "2:24:15", "remaining_time": "0:43:38", "throughput": "0.00", "total_tokens": 0}
120
- {"current_steps": 2400, "total_steps": 3100, "loss": 0.0306, "learning_rate": 7.392136437882855e-06, "epoch": 38.4, "percentage": 77.42, "elapsed_time": "2:25:28", "remaining_time": "0:42:25", "throughput": "0.00", "total_tokens": 0}
121
- {"current_steps": 2420, "total_steps": 3100, "loss": 0.0224, "learning_rate": 6.996961435595223e-06, "epoch": 38.72, "percentage": 78.06, "elapsed_time": "2:26:44", "remaining_time": "0:41:13", "throughput": "0.00", "total_tokens": 0}
122
- {"current_steps": 2440, "total_steps": 3100, "loss": 0.0138, "learning_rate": 6.610916601686481e-06, "epoch": 39.04, "percentage": 78.71, "elapsed_time": "2:27:57", "remaining_time": "0:40:01", "throughput": "0.00", "total_tokens": 0}
123
- {"current_steps": 2460, "total_steps": 3100, "loss": 0.0234, "learning_rate": 6.234197717236742e-06, "epoch": 39.36, "percentage": 79.35, "elapsed_time": "2:29:10", "remaining_time": "0:38:48", "throughput": "0.00", "total_tokens": 0}
124
- {"current_steps": 2480, "total_steps": 3100, "loss": 0.0164, "learning_rate": 5.866995833708464e-06, "epoch": 39.68, "percentage": 80.0, "elapsed_time": "2:30:25", "remaining_time": "0:37:36", "throughput": "0.00", "total_tokens": 0}
125
- {"current_steps": 2500, "total_steps": 3100, "loss": 0.0169, "learning_rate": 5.509497176055492e-06, "epoch": 40.0, "percentage": 80.65, "elapsed_time": "2:31:36", "remaining_time": "0:36:23", "throughput": "0.00", "total_tokens": 0}
126
- {"current_steps": 2520, "total_steps": 3100, "loss": 0.0116, "learning_rate": 5.161883048279817e-06, "epoch": 40.32, "percentage": 81.29, "elapsed_time": "2:32:48", "remaining_time": "0:35:10", "throughput": "0.00", "total_tokens": 0}
127
- {"current_steps": 2540, "total_steps": 3100, "loss": 0.0219, "learning_rate": 4.824329741483949e-06, "epoch": 40.64, "percentage": 81.94, "elapsed_time": "2:33:57", "remaining_time": "0:33:56", "throughput": "0.00", "total_tokens": 0}
128
- {"current_steps": 2560, "total_steps": 3100, "loss": 0.0366, "learning_rate": 4.497008444465681e-06, "epoch": 40.96, "percentage": 82.58, "elapsed_time": "2:35:09", "remaining_time": "0:32:43", "throughput": "0.00", "total_tokens": 0}
129
- {"current_steps": 2580, "total_steps": 3100, "loss": 0.0228, "learning_rate": 4.180085156900274e-06, "epoch": 41.28, "percentage": 83.23, "elapsed_time": "2:36:21", "remaining_time": "0:31:30", "throughput": "0.00", "total_tokens": 0}
130
- {"current_steps": 2600, "total_steps": 3100, "loss": 0.0135, "learning_rate": 3.873720605154468e-06, "epoch": 41.6, "percentage": 83.87, "elapsed_time": "2:37:34", "remaining_time": "0:30:18", "throughput": "0.00", "total_tokens": 0}
131
- {"current_steps": 2620, "total_steps": 3100, "loss": 0.0267, "learning_rate": 3.578070160774724e-06, "epoch": 41.92, "percentage": 84.52, "elapsed_time": "2:38:46", "remaining_time": "0:29:05", "throughput": "0.00", "total_tokens": 0}
132
- {"current_steps": 2640, "total_steps": 3100, "loss": 0.0218, "learning_rate": 3.293283761691182e-06, "epoch": 42.24, "percentage": 85.16, "elapsed_time": "2:40:00", "remaining_time": "0:27:52", "throughput": "0.00", "total_tokens": 0}
133
- {"current_steps": 2660, "total_steps": 3100, "loss": 0.0315, "learning_rate": 3.0195058361772277e-06, "epoch": 42.56, "percentage": 85.81, "elapsed_time": "2:41:16", "remaining_time": "0:26:40", "throughput": "0.00", "total_tokens": 0}
134
- {"current_steps": 2680, "total_steps": 3100, "loss": 0.022, "learning_rate": 2.756875229603295e-06, "epoch": 42.88, "percentage": 86.45, "elapsed_time": "2:42:28", "remaining_time": "0:25:27", "throughput": "0.00", "total_tokens": 0}
135
- {"current_steps": 2700, "total_steps": 3100, "loss": 0.016, "learning_rate": 2.5055251340219855e-06, "epoch": 43.2, "percentage": 87.1, "elapsed_time": "2:43:40", "remaining_time": "0:24:14", "throughput": "0.00", "total_tokens": 0}
136
- {"current_steps": 2720, "total_steps": 3100, "loss": 0.0267, "learning_rate": 2.2655830206202655e-06, "epoch": 43.52, "percentage": 87.74, "elapsed_time": "2:44:52", "remaining_time": "0:23:01", "throughput": "0.00", "total_tokens": 0}
137
- {"current_steps": 2740, "total_steps": 3100, "loss": 0.0167, "learning_rate": 2.037170575072944e-06, "epoch": 43.84, "percentage": 88.39, "elapsed_time": "2:46:03", "remaining_time": "0:21:49", "throughput": "0.00", "total_tokens": 0}
138
- {"current_steps": 2760, "total_steps": 3100, "loss": 0.0492, "learning_rate": 1.8204036358303173e-06, "epoch": 44.16, "percentage": 89.03, "elapsed_time": "2:47:16", "remaining_time": "0:20:36", "throughput": "0.00", "total_tokens": 0}
139
- {"current_steps": 2780, "total_steps": 3100, "loss": 0.0254, "learning_rate": 1.615392135371116e-06, "epoch": 44.48, "percentage": 89.68, "elapsed_time": "2:48:27", "remaining_time": "0:19:23", "throughput": "0.00", "total_tokens": 0}
140
- {"current_steps": 2800, "total_steps": 3100, "loss": 0.0136, "learning_rate": 1.4222400444507318e-06, "epoch": 44.8, "percentage": 90.32, "elapsed_time": "2:49:38", "remaining_time": "0:18:10", "throughput": "0.00", "total_tokens": 0}
141
- {"current_steps": 2820, "total_steps": 3100, "loss": 0.0116, "learning_rate": 1.2410453193728493e-06, "epoch": 45.12, "percentage": 90.97, "elapsed_time": "2:50:51", "remaining_time": "0:16:57", "throughput": "0.00", "total_tokens": 0}
142
- {"current_steps": 2840, "total_steps": 3100, "loss": 0.0311, "learning_rate": 1.0718998523113004e-06, "epoch": 45.44, "percentage": 91.61, "elapsed_time": "2:52:02", "remaining_time": "0:15:45", "throughput": "0.00", "total_tokens": 0}
143
- {"current_steps": 2860, "total_steps": 3100, "loss": 0.0283, "learning_rate": 9.148894247073298e-07, "epoch": 45.76, "percentage": 92.26, "elapsed_time": "2:53:15", "remaining_time": "0:14:32", "throughput": "0.00", "total_tokens": 0}
144
- {"current_steps": 2880, "total_steps": 3100, "loss": 0.0186, "learning_rate": 7.700936637658779e-07, "epoch": 46.08, "percentage": 92.9, "elapsed_time": "2:54:29", "remaining_time": "0:13:19", "throughput": "0.00", "total_tokens": 0}
145
- {"current_steps": 2900, "total_steps": 3100, "loss": 0.0229, "learning_rate": 6.375860020729541e-07, "epoch": 46.4, "percentage": 93.55, "elapsed_time": "2:55:42", "remaining_time": "0:12:07", "throughput": "0.00", "total_tokens": 0}
146
- {"current_steps": 2920, "total_steps": 3100, "loss": 0.0268, "learning_rate": 5.174336403546226e-07, "epoch": 46.72, "percentage": 94.19, "elapsed_time": "2:56:56", "remaining_time": "0:10:54", "throughput": "0.00", "total_tokens": 0}
147
- {"current_steps": 2940, "total_steps": 3100, "loss": 0.0161, "learning_rate": 4.096975133963954e-07, "epoch": 47.04, "percentage": 94.84, "elapsed_time": "2:58:07", "remaining_time": "0:09:41", "throughput": "0.00", "total_tokens": 0}
148
- {"current_steps": 2960, "total_steps": 3100, "loss": 0.0113, "learning_rate": 3.144322591404292e-07, "epoch": 47.36, "percentage": 95.48, "elapsed_time": "2:59:19", "remaining_time": "0:08:28", "throughput": "0.00", "total_tokens": 0}
149
- {"current_steps": 2980, "total_steps": 3100, "loss": 0.0222, "learning_rate": 2.316861909760909e-07, "epoch": 47.68, "percentage": 96.13, "elapsed_time": "3:00:30", "remaining_time": "0:07:16", "throughput": "0.00", "total_tokens": 0}
150
- {"current_steps": 3000, "total_steps": 3100, "loss": 0.028, "learning_rate": 1.6150127323803222e-07, "epoch": 48.0, "percentage": 96.77, "elapsed_time": "3:01:48", "remaining_time": "0:06:03", "throughput": "0.00", "total_tokens": 0}
151
- {"current_steps": 3020, "total_steps": 3100, "loss": 0.0217, "learning_rate": 1.0391309992413833e-07, "epoch": 48.32, "percentage": 97.42, "elapsed_time": "3:03:02", "remaining_time": "0:04:50", "throughput": "0.00", "total_tokens": 0}
152
- {"current_steps": 3040, "total_steps": 3100, "loss": 0.0191, "learning_rate": 5.895087664417876e-08, "epoch": 48.64, "percentage": 98.06, "elapsed_time": "3:04:13", "remaining_time": "0:03:38", "throughput": "0.00", "total_tokens": 0}
153
- {"current_steps": 3060, "total_steps": 3100, "loss": 0.0162, "learning_rate": 2.6637405808302428e-08, "epoch": 48.96, "percentage": 98.71, "elapsed_time": "3:05:27", "remaining_time": "0:02:25", "throughput": "0.00", "total_tokens": 0}
154
- {"current_steps": 3080, "total_steps": 3100, "loss": 0.0146, "learning_rate": 6.989075062879824e-09, "epoch": 49.28, "percentage": 99.35, "elapsed_time": "3:06:41", "remaining_time": "0:01:12", "throughput": "0.00", "total_tokens": 0}
155
- {"current_steps": 3100, "total_steps": 3100, "loss": 0.0217, "learning_rate": 1.584897958428755e-11, "epoch": 49.6, "percentage": 100.0, "elapsed_time": "3:07:54", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
156
- {"current_steps": 3100, "total_steps": 3100, "epoch": 49.6, "percentage": 100.0, "elapsed_time": "3:07:54", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 20, "total_steps": 4030, "loss": 2.826, "learning_rate": 2.3573200992555833e-06, "epoch": 0.32, "percentage": 0.5, "elapsed_time": "0:00:34", "remaining_time": "1:55:11", "throughput": "0.00", "total_tokens": 0}
2
+ {"current_steps": 40, "total_steps": 4030, "loss": 2.72, "learning_rate": 4.838709677419355e-06, "epoch": 0.64, "percentage": 0.99, "elapsed_time": "0:01:06", "remaining_time": "1:51:00", "throughput": "0.00", "total_tokens": 0}
3
+ {"current_steps": 60, "total_steps": 4030, "loss": 2.4912, "learning_rate": 7.320099255583126e-06, "epoch": 0.96, "percentage": 1.49, "elapsed_time": "0:01:40", "remaining_time": "1:50:44", "throughput": "0.00", "total_tokens": 0}
4
+ {"current_steps": 80, "total_steps": 4030, "loss": 2.0561, "learning_rate": 9.801488833746898e-06, "epoch": 1.28, "percentage": 1.99, "elapsed_time": "0:02:15", "remaining_time": "1:51:10", "throughput": "0.00", "total_tokens": 0}
5
+ {"current_steps": 100, "total_steps": 4030, "loss": 1.7744, "learning_rate": 1.2282878411910669e-05, "epoch": 1.6, "percentage": 2.48, "elapsed_time": "0:02:51", "remaining_time": "1:52:01", "throughput": "0.00", "total_tokens": 0}
6
+ {"current_steps": 120, "total_steps": 4030, "loss": 1.8387, "learning_rate": 1.4764267990074444e-05, "epoch": 1.92, "percentage": 2.98, "elapsed_time": "0:03:25", "remaining_time": "1:51:28", "throughput": "0.00", "total_tokens": 0}
7
+ {"current_steps": 140, "total_steps": 4030, "loss": 1.4478, "learning_rate": 1.7245657568238215e-05, "epoch": 2.24, "percentage": 3.47, "elapsed_time": "0:03:59", "remaining_time": "1:50:52", "throughput": "0.00", "total_tokens": 0}
8
+ {"current_steps": 160, "total_steps": 4030, "loss": 1.285, "learning_rate": 1.9727047146401986e-05, "epoch": 2.56, "percentage": 3.97, "elapsed_time": "0:04:34", "remaining_time": "1:50:27", "throughput": "0.00", "total_tokens": 0}
9
+ {"current_steps": 180, "total_steps": 4030, "loss": 1.2235, "learning_rate": 2.2208436724565757e-05, "epoch": 2.88, "percentage": 4.47, "elapsed_time": "0:05:08", "remaining_time": "1:50:02", "throughput": "0.00", "total_tokens": 0}
10
+ {"current_steps": 200, "total_steps": 4030, "loss": 0.9575, "learning_rate": 2.468982630272953e-05, "epoch": 3.2, "percentage": 4.96, "elapsed_time": "0:05:40", "remaining_time": "1:48:48", "throughput": "0.00", "total_tokens": 0}
11
+ {"current_steps": 220, "total_steps": 4030, "loss": 0.7086, "learning_rate": 2.7171215880893302e-05, "epoch": 3.52, "percentage": 5.46, "elapsed_time": "0:06:15", "remaining_time": "1:48:24", "throughput": "0.00", "total_tokens": 0}
12
+ {"current_steps": 240, "total_steps": 4030, "loss": 0.8587, "learning_rate": 2.9652605459057077e-05, "epoch": 3.84, "percentage": 5.96, "elapsed_time": "0:06:50", "remaining_time": "1:48:03", "throughput": "0.00", "total_tokens": 0}
13
+ {"current_steps": 260, "total_steps": 4030, "loss": 0.5978, "learning_rate": 3.2133995037220844e-05, "epoch": 4.16, "percentage": 6.45, "elapsed_time": "0:07:25", "remaining_time": "1:47:33", "throughput": "0.00", "total_tokens": 0}
14
+ {"current_steps": 280, "total_steps": 4030, "loss": 0.4668, "learning_rate": 3.461538461538462e-05, "epoch": 4.48, "percentage": 6.95, "elapsed_time": "0:07:58", "remaining_time": "1:46:54", "throughput": "0.00", "total_tokens": 0}
15
+ {"current_steps": 300, "total_steps": 4030, "loss": 0.5667, "learning_rate": 3.7096774193548386e-05, "epoch": 4.8, "percentage": 7.44, "elapsed_time": "0:08:32", "remaining_time": "1:46:10", "throughput": "0.00", "total_tokens": 0}
16
+ {"current_steps": 320, "total_steps": 4030, "loss": 0.4373, "learning_rate": 3.957816377171216e-05, "epoch": 5.12, "percentage": 7.94, "elapsed_time": "0:09:06", "remaining_time": "1:45:31", "throughput": "0.00", "total_tokens": 0}
17
+ {"current_steps": 340, "total_steps": 4030, "loss": 0.3492, "learning_rate": 4.205955334987593e-05, "epoch": 5.44, "percentage": 8.44, "elapsed_time": "0:09:39", "remaining_time": "1:44:54", "throughput": "0.00", "total_tokens": 0}
18
+ {"current_steps": 360, "total_steps": 4030, "loss": 0.4018, "learning_rate": 4.45409429280397e-05, "epoch": 5.76, "percentage": 8.93, "elapsed_time": "0:10:14", "remaining_time": "1:44:19", "throughput": "0.00", "total_tokens": 0}
19
+ {"current_steps": 380, "total_steps": 4030, "loss": 0.279, "learning_rate": 4.702233250620348e-05, "epoch": 6.08, "percentage": 9.43, "elapsed_time": "0:10:48", "remaining_time": "1:43:45", "throughput": "0.00", "total_tokens": 0}
20
+ {"current_steps": 400, "total_steps": 4030, "loss": 0.2362, "learning_rate": 4.950372208436725e-05, "epoch": 6.4, "percentage": 9.93, "elapsed_time": "0:11:22", "remaining_time": "1:43:09", "throughput": "0.00", "total_tokens": 0}
21
+ {"current_steps": 420, "total_steps": 4030, "loss": 0.2665, "learning_rate": 4.99975992459978e-05, "epoch": 6.72, "percentage": 10.42, "elapsed_time": "0:11:55", "remaining_time": "1:42:27", "throughput": "0.00", "total_tokens": 0}
22
+ {"current_steps": 440, "total_steps": 4030, "loss": 0.2029, "learning_rate": 4.9987846973104825e-05, "epoch": 7.04, "percentage": 10.92, "elapsed_time": "0:12:29", "remaining_time": "1:41:58", "throughput": "0.00", "total_tokens": 0}
23
+ {"current_steps": 460, "total_steps": 4030, "loss": 0.1747, "learning_rate": 4.9970596058519116e-05, "epoch": 7.36, "percentage": 11.41, "elapsed_time": "0:13:05", "remaining_time": "1:41:37", "throughput": "0.00", "total_tokens": 0}
24
+ {"current_steps": 480, "total_steps": 4030, "loss": 0.1486, "learning_rate": 4.994585167909436e-05, "epoch": 7.68, "percentage": 11.91, "elapsed_time": "0:13:41", "remaining_time": "1:41:16", "throughput": "0.00", "total_tokens": 0}
25
+ {"current_steps": 500, "total_steps": 4030, "loss": 0.1866, "learning_rate": 4.9913621260409695e-05, "epoch": 8.0, "percentage": 12.41, "elapsed_time": "0:14:17", "remaining_time": "1:40:57", "throughput": "0.00", "total_tokens": 0}
26
+ {"current_steps": 520, "total_steps": 4030, "loss": 0.1476, "learning_rate": 4.987391447454136e-05, "epoch": 8.32, "percentage": 12.9, "elapsed_time": "0:14:54", "remaining_time": "1:40:37", "throughput": "0.00", "total_tokens": 0}
27
+ {"current_steps": 540, "total_steps": 4030, "loss": 0.1403, "learning_rate": 4.982674323716023e-05, "epoch": 8.64, "percentage": 13.4, "elapsed_time": "0:15:31", "remaining_time": "1:40:18", "throughput": "0.00", "total_tokens": 0}
28
+ {"current_steps": 560, "total_steps": 4030, "loss": 0.1018, "learning_rate": 4.977212170395598e-05, "epoch": 8.96, "percentage": 13.9, "elapsed_time": "0:16:08", "remaining_time": "1:40:00", "throughput": "0.00", "total_tokens": 0}
29
+ {"current_steps": 580, "total_steps": 4030, "loss": 0.0992, "learning_rate": 4.9710066266389074e-05, "epoch": 9.28, "percentage": 14.39, "elapsed_time": "0:16:45", "remaining_time": "1:39:39", "throughput": "0.00", "total_tokens": 0}
30
+ {"current_steps": 600, "total_steps": 4030, "loss": 0.1134, "learning_rate": 4.964059554677187e-05, "epoch": 9.6, "percentage": 14.89, "elapsed_time": "0:17:21", "remaining_time": "1:39:14", "throughput": "0.00", "total_tokens": 0}
31
+ {"current_steps": 620, "total_steps": 4030, "loss": 0.0781, "learning_rate": 4.956373039268022e-05, "epoch": 9.92, "percentage": 15.38, "elapsed_time": "0:17:57", "remaining_time": "1:38:45", "throughput": "0.00", "total_tokens": 0}
32
+ {"current_steps": 640, "total_steps": 4030, "loss": 0.0892, "learning_rate": 4.947949387069721e-05, "epoch": 10.24, "percentage": 15.88, "elapsed_time": "0:18:33", "remaining_time": "1:38:16", "throughput": "0.00", "total_tokens": 0}
33
+ {"current_steps": 660, "total_steps": 4030, "loss": 0.0499, "learning_rate": 4.938791125949119e-05, "epoch": 10.56, "percentage": 16.38, "elapsed_time": "0:19:06", "remaining_time": "1:37:35", "throughput": "0.00", "total_tokens": 0}
34
+ {"current_steps": 680, "total_steps": 4030, "loss": 0.0831, "learning_rate": 4.9289010042229765e-05, "epoch": 10.88, "percentage": 16.87, "elapsed_time": "0:19:40", "remaining_time": "1:36:56", "throughput": "0.00", "total_tokens": 0}
35
+ {"current_steps": 700, "total_steps": 4030, "loss": 0.0715, "learning_rate": 4.918281989833238e-05, "epoch": 11.2, "percentage": 17.37, "elapsed_time": "0:20:14", "remaining_time": "1:36:19", "throughput": "0.00", "total_tokens": 0}
36
+ {"current_steps": 720, "total_steps": 4030, "loss": 0.0718, "learning_rate": 4.9069372694563756e-05, "epoch": 11.52, "percentage": 17.87, "elapsed_time": "0:20:48", "remaining_time": "1:35:41", "throughput": "0.00", "total_tokens": 0}
37
+ {"current_steps": 740, "total_steps": 4030, "loss": 0.0849, "learning_rate": 4.8948702475470933e-05, "epoch": 11.84, "percentage": 18.36, "elapsed_time": "0:21:22", "remaining_time": "1:35:03", "throughput": "0.00", "total_tokens": 0}
38
+ {"current_steps": 760, "total_steps": 4030, "loss": 0.0683, "learning_rate": 4.882084545316684e-05, "epoch": 12.16, "percentage": 18.86, "elapsed_time": "0:21:56", "remaining_time": "1:34:25", "throughput": "0.00", "total_tokens": 0}
39
+ {"current_steps": 780, "total_steps": 4030, "loss": 0.0808, "learning_rate": 4.868583999646329e-05, "epoch": 12.48, "percentage": 19.35, "elapsed_time": "0:22:30", "remaining_time": "1:33:48", "throughput": "0.00", "total_tokens": 0}
40
+ {"current_steps": 800, "total_steps": 4030, "loss": 0.0607, "learning_rate": 4.8543726619356846e-05, "epoch": 12.8, "percentage": 19.85, "elapsed_time": "0:23:04", "remaining_time": "1:33:09", "throughput": "0.00", "total_tokens": 0}
41
+ {"current_steps": 820, "total_steps": 4030, "loss": 0.062, "learning_rate": 4.83945479688709e-05, "epoch": 13.12, "percentage": 20.35, "elapsed_time": "0:23:38", "remaining_time": "1:32:32", "throughput": "0.00", "total_tokens": 0}
42
+ {"current_steps": 840, "total_steps": 4030, "loss": 0.0461, "learning_rate": 4.8238348812257684e-05, "epoch": 13.44, "percentage": 20.84, "elapsed_time": "0:24:12", "remaining_time": "1:31:54", "throughput": "0.00", "total_tokens": 0}
43
+ {"current_steps": 860, "total_steps": 4030, "loss": 0.0482, "learning_rate": 4.808349953928184e-05, "epoch": 13.76, "percentage": 21.34, "elapsed_time": "0:24:46", "remaining_time": "1:31:17", "throughput": "0.00", "total_tokens": 0}
44
+ {"current_steps": 880, "total_steps": 4030, "loss": 0.0388, "learning_rate": 4.791374712344622e-05, "epoch": 14.08, "percentage": 21.84, "elapsed_time": "0:25:20", "remaining_time": "1:30:41", "throughput": "0.00", "total_tokens": 0}
45
+ {"current_steps": 900, "total_steps": 4030, "loss": 0.0251, "learning_rate": 4.7737118485753564e-05, "epoch": 14.4, "percentage": 22.33, "elapsed_time": "0:25:54", "remaining_time": "1:30:05", "throughput": "0.00", "total_tokens": 0}
46
+ {"current_steps": 920, "total_steps": 4030, "loss": 0.0515, "learning_rate": 4.75536666309653e-05, "epoch": 14.72, "percentage": 22.83, "elapsed_time": "0:26:27", "remaining_time": "1:29:27", "throughput": "0.00", "total_tokens": 0}
47
+ {"current_steps": 940, "total_steps": 4030, "loss": 0.0536, "learning_rate": 4.73634466114326e-05, "epoch": 15.04, "percentage": 23.33, "elapsed_time": "0:27:01", "remaining_time": "1:28:51", "throughput": "0.00", "total_tokens": 0}
48
+ {"current_steps": 960, "total_steps": 4030, "loss": 0.0392, "learning_rate": 4.7166515510575676e-05, "epoch": 15.36, "percentage": 23.82, "elapsed_time": "0:27:36", "remaining_time": "1:28:17", "throughput": "0.00", "total_tokens": 0}
49
+ {"current_steps": 980, "total_steps": 4030, "loss": 0.0369, "learning_rate": 4.696293242575356e-05, "epoch": 15.68, "percentage": 24.32, "elapsed_time": "0:28:10", "remaining_time": "1:27:41", "throughput": "0.00", "total_tokens": 0}
50
+ {"current_steps": 1000, "total_steps": 4030, "loss": 0.0651, "learning_rate": 4.675275845052942e-05, "epoch": 16.0, "percentage": 24.81, "elapsed_time": "0:28:45", "remaining_time": "1:27:07", "throughput": "0.00", "total_tokens": 0}
51
+ {"current_steps": 1020, "total_steps": 4030, "loss": 0.037, "learning_rate": 4.6536056656336947e-05, "epoch": 16.32, "percentage": 25.31, "elapsed_time": "0:29:20", "remaining_time": "1:26:34", "throughput": "0.00", "total_tokens": 0}
52
+ {"current_steps": 1040, "total_steps": 4030, "loss": 0.0272, "learning_rate": 4.631289207355313e-05, "epoch": 16.64, "percentage": 25.81, "elapsed_time": "0:29:53", "remaining_time": "1:25:57", "throughput": "0.00", "total_tokens": 0}
53
+ {"current_steps": 1060, "total_steps": 4030, "loss": 0.0507, "learning_rate": 4.6083331671983185e-05, "epoch": 16.96, "percentage": 26.3, "elapsed_time": "0:30:27", "remaining_time": "1:25:19", "throughput": "0.00", "total_tokens": 0}
54
+ {"current_steps": 1080, "total_steps": 4030, "loss": 0.0274, "learning_rate": 4.584744434076352e-05, "epoch": 17.28, "percentage": 26.8, "elapsed_time": "0:31:00", "remaining_time": "1:24:42", "throughput": "0.00", "total_tokens": 0}
55
+ {"current_steps": 1100, "total_steps": 4030, "loss": 0.0565, "learning_rate": 4.560530086768863e-05, "epoch": 17.6, "percentage": 27.3, "elapsed_time": "0:31:33", "remaining_time": "1:24:04", "throughput": "0.00", "total_tokens": 0}
56
+ {"current_steps": 1120, "total_steps": 4030, "loss": 0.0425, "learning_rate": 4.535697391796832e-05, "epoch": 17.92, "percentage": 27.79, "elapsed_time": "0:32:07", "remaining_time": "1:23:28", "throughput": "0.00", "total_tokens": 0}
57
+ {"current_steps": 1140, "total_steps": 4030, "loss": 0.0273, "learning_rate": 4.510253801242147e-05, "epoch": 18.24, "percentage": 28.29, "elapsed_time": "0:32:41", "remaining_time": "1:22:52", "throughput": "0.00", "total_tokens": 0}
58
+ {"current_steps": 1160, "total_steps": 4030, "loss": 0.0438, "learning_rate": 4.4842069505112984e-05, "epoch": 18.56, "percentage": 28.78, "elapsed_time": "0:33:15", "remaining_time": "1:22:17", "throughput": "0.00", "total_tokens": 0}
59
+ {"current_steps": 1180, "total_steps": 4030, "loss": 0.0544, "learning_rate": 4.457564656044056e-05, "epoch": 18.88, "percentage": 29.28, "elapsed_time": "0:33:49", "remaining_time": "1:21:40", "throughput": "0.00", "total_tokens": 0}
60
+ {"current_steps": 1200, "total_steps": 4030, "loss": 0.0283, "learning_rate": 4.430334912967824e-05, "epoch": 19.2, "percentage": 29.78, "elapsed_time": "0:34:23", "remaining_time": "1:21:05", "throughput": "0.00", "total_tokens": 0}
61
+ {"current_steps": 1220, "total_steps": 4030, "loss": 0.0393, "learning_rate": 4.402525892698367e-05, "epoch": 19.52, "percentage": 30.27, "elapsed_time": "0:34:57", "remaining_time": "1:20:30", "throughput": "0.00", "total_tokens": 0}
62
+ {"current_steps": 1240, "total_steps": 4030, "loss": 0.0249, "learning_rate": 4.374145940487641e-05, "epoch": 19.84, "percentage": 30.77, "elapsed_time": "0:35:30", "remaining_time": "1:19:53", "throughput": "0.00", "total_tokens": 0}
63
+ {"current_steps": 1260, "total_steps": 4030, "loss": 0.0293, "learning_rate": 4.345203572919454e-05, "epoch": 20.16, "percentage": 31.27, "elapsed_time": "0:36:04", "remaining_time": "1:19:18", "throughput": "0.00", "total_tokens": 0}
64
+ {"current_steps": 1280, "total_steps": 4030, "loss": 0.0287, "learning_rate": 4.315707475353706e-05, "epoch": 20.48, "percentage": 31.76, "elapsed_time": "0:36:38", "remaining_time": "1:18:43", "throughput": "0.00", "total_tokens": 0}
65
+ {"current_steps": 1300, "total_steps": 4030, "loss": 0.0521, "learning_rate": 4.285666499319992e-05, "epoch": 20.8, "percentage": 32.26, "elapsed_time": "0:37:12", "remaining_time": "1:18:07", "throughput": "0.00", "total_tokens": 0}
66
+ {"current_steps": 1320, "total_steps": 4030, "loss": 0.0285, "learning_rate": 4.25508965986133e-05, "epoch": 21.12, "percentage": 32.75, "elapsed_time": "0:37:46", "remaining_time": "1:17:32", "throughput": "0.00", "total_tokens": 0}
67
+ {"current_steps": 1340, "total_steps": 4030, "loss": 0.0346, "learning_rate": 4.2239861328288214e-05, "epoch": 21.44, "percentage": 33.25, "elapsed_time": "0:38:19", "remaining_time": "1:16:56", "throughput": "0.00", "total_tokens": 0}
68
+ {"current_steps": 1360, "total_steps": 4030, "loss": 0.022, "learning_rate": 4.1923652521280585e-05, "epoch": 21.76, "percentage": 33.75, "elapsed_time": "0:38:54", "remaining_time": "1:16:22", "throughput": "0.00", "total_tokens": 0}
69
+ {"current_steps": 1380, "total_steps": 4030, "loss": 0.0482, "learning_rate": 4.160236506918098e-05, "epoch": 22.08, "percentage": 34.24, "elapsed_time": "0:39:28", "remaining_time": "1:15:48", "throughput": "0.00", "total_tokens": 0}
70
+ {"current_steps": 1400, "total_steps": 4030, "loss": 0.019, "learning_rate": 4.127609538763842e-05, "epoch": 22.4, "percentage": 34.74, "elapsed_time": "0:40:02", "remaining_time": "1:15:12", "throughput": "0.00", "total_tokens": 0}
71
+ {"current_steps": 1420, "total_steps": 4030, "loss": 0.0312, "learning_rate": 4.094494138742685e-05, "epoch": 22.72, "percentage": 35.24, "elapsed_time": "0:40:36", "remaining_time": "1:14:37", "throughput": "0.00", "total_tokens": 0}
72
+ {"current_steps": 1440, "total_steps": 4030, "loss": 0.0377, "learning_rate": 4.0609002445063036e-05, "epoch": 23.04, "percentage": 35.73, "elapsed_time": "0:41:09", "remaining_time": "1:14:02", "throughput": "0.00", "total_tokens": 0}
73
+ {"current_steps": 1460, "total_steps": 4030, "loss": 0.0307, "learning_rate": 4.02683793729844e-05, "epoch": 23.36, "percentage": 36.23, "elapsed_time": "0:41:43", "remaining_time": "1:13:26", "throughput": "0.00", "total_tokens": 0}
74
+ {"current_steps": 1480, "total_steps": 4030, "loss": 0.0419, "learning_rate": 3.9923174389296085e-05, "epoch": 23.68, "percentage": 36.72, "elapsed_time": "0:42:16", "remaining_time": "1:12:51", "throughput": "0.00", "total_tokens": 0}
75
+ {"current_steps": 1500, "total_steps": 4030, "loss": 0.0223, "learning_rate": 3.957349108709623e-05, "epoch": 24.0, "percentage": 37.22, "elapsed_time": "0:42:50", "remaining_time": "1:12:16", "throughput": "0.00", "total_tokens": 0}
76
+ {"current_steps": 1520, "total_steps": 4030, "loss": 0.0209, "learning_rate": 3.921943440338849e-05, "epoch": 24.32, "percentage": 37.72, "elapsed_time": "0:43:24", "remaining_time": "1:11:40", "throughput": "0.00", "total_tokens": 0}
77
+ {"current_steps": 1540, "total_steps": 4030, "loss": 0.0491, "learning_rate": 3.886111058759132e-05, "epoch": 24.64, "percentage": 38.21, "elapsed_time": "0:43:58", "remaining_time": "1:11:06", "throughput": "0.00", "total_tokens": 0}
78
+ {"current_steps": 1560, "total_steps": 4030, "loss": 0.0298, "learning_rate": 3.849862716965352e-05, "epoch": 24.96, "percentage": 38.71, "elapsed_time": "0:44:32", "remaining_time": "1:10:31", "throughput": "0.00", "total_tokens": 0}
79
+ {"current_steps": 1580, "total_steps": 4030, "loss": 0.0319, "learning_rate": 3.813209292778527e-05, "epoch": 25.28, "percentage": 39.21, "elapsed_time": "0:45:06", "remaining_time": "1:09:56", "throughput": "0.00", "total_tokens": 0}
80
+ {"current_steps": 1600, "total_steps": 4030, "loss": 0.0302, "learning_rate": 3.776161785581481e-05, "epoch": 25.6, "percentage": 39.7, "elapsed_time": "0:45:40", "remaining_time": "1:09:22", "throughput": "0.00", "total_tokens": 0}
81
+ {"current_steps": 1620, "total_steps": 4030, "loss": 0.04, "learning_rate": 3.738731313018019e-05, "epoch": 25.92, "percentage": 40.2, "elapsed_time": "0:46:14", "remaining_time": "1:08:47", "throughput": "0.00", "total_tokens": 0}
82
+ {"current_steps": 1640, "total_steps": 4030, "loss": 0.0354, "learning_rate": 3.700929107656614e-05, "epoch": 26.24, "percentage": 40.69, "elapsed_time": "0:46:48", "remaining_time": "1:08:12", "throughput": "0.00", "total_tokens": 0}
83
+ {"current_steps": 1660, "total_steps": 4030, "loss": 0.0186, "learning_rate": 3.662766513619611e-05, "epoch": 26.56, "percentage": 41.19, "elapsed_time": "0:47:22", "remaining_time": "1:07:38", "throughput": "0.00", "total_tokens": 0}
84
+ {"current_steps": 1680, "total_steps": 4030, "loss": 0.022, "learning_rate": 3.62425498317895e-05, "epoch": 26.88, "percentage": 41.69, "elapsed_time": "0:47:57", "remaining_time": "1:07:04", "throughput": "0.00", "total_tokens": 0}
85
+ {"current_steps": 1700, "total_steps": 4030, "loss": 0.015, "learning_rate": 3.585406073319439e-05, "epoch": 27.2, "percentage": 42.18, "elapsed_time": "0:48:31", "remaining_time": "1:06:30", "throughput": "0.00", "total_tokens": 0}
86
+ {"current_steps": 1720, "total_steps": 4030, "loss": 0.0381, "learning_rate": 3.546231442270596e-05, "epoch": 27.52, "percentage": 42.68, "elapsed_time": "0:49:05", "remaining_time": "1:05:55", "throughput": "0.00", "total_tokens": 0}
87
+ {"current_steps": 1740, "total_steps": 4030, "loss": 0.0277, "learning_rate": 3.506742846008116e-05, "epoch": 27.84, "percentage": 43.18, "elapsed_time": "0:49:39", "remaining_time": "1:05:20", "throughput": "0.00", "total_tokens": 0}
88
+ {"current_steps": 1760, "total_steps": 4030, "loss": 0.0423, "learning_rate": 3.4669521347259996e-05, "epoch": 28.16, "percentage": 43.67, "elapsed_time": "0:50:12", "remaining_time": "1:04:45", "throughput": "0.00", "total_tokens": 0}
89
+ {"current_steps": 1780, "total_steps": 4030, "loss": 0.0115, "learning_rate": 3.426871249280414e-05, "epoch": 28.48, "percentage": 44.17, "elapsed_time": "0:50:46", "remaining_time": "1:04:10", "throughput": "0.00", "total_tokens": 0}
90
+ {"current_steps": 1800, "total_steps": 4030, "loss": 0.0275, "learning_rate": 3.386512217606339e-05, "epoch": 28.8, "percentage": 44.67, "elapsed_time": "0:51:19", "remaining_time": "1:03:35", "throughput": "0.00", "total_tokens": 0}
91
+ {"current_steps": 1820, "total_steps": 4030, "loss": 0.0309, "learning_rate": 3.345887151108087e-05, "epoch": 29.12, "percentage": 45.16, "elapsed_time": "0:51:53", "remaining_time": "1:03:00", "throughput": "0.00", "total_tokens": 0}
92
+ {"current_steps": 1840, "total_steps": 4030, "loss": 0.0294, "learning_rate": 3.305008241024774e-05, "epoch": 29.44, "percentage": 45.66, "elapsed_time": "0:52:26", "remaining_time": "1:02:25", "throughput": "0.00", "total_tokens": 0}
93
+ {"current_steps": 1860, "total_steps": 4030, "loss": 0.0213, "learning_rate": 3.2638877547718264e-05, "epoch": 29.76, "percentage": 46.15, "elapsed_time": "0:53:00", "remaining_time": "1:01:50", "throughput": "0.00", "total_tokens": 0}
94
+ {"current_steps": 1880, "total_steps": 4030, "loss": 0.0326, "learning_rate": 3.222538032259643e-05, "epoch": 30.08, "percentage": 46.65, "elapsed_time": "0:53:34", "remaining_time": "1:01:16", "throughput": "0.00", "total_tokens": 0}
95
+ {"current_steps": 1900, "total_steps": 4030, "loss": 0.0249, "learning_rate": 3.1809714821904834e-05, "epoch": 30.4, "percentage": 47.15, "elapsed_time": "0:54:08", "remaining_time": "1:00:41", "throughput": "0.00", "total_tokens": 0}
96
+ {"current_steps": 1920, "total_steps": 4030, "loss": 0.0115, "learning_rate": 3.1392005783347244e-05, "epoch": 30.72, "percentage": 47.64, "elapsed_time": "0:54:42", "remaining_time": "1:00:07", "throughput": "0.00", "total_tokens": 0}
97
+ {"current_steps": 1940, "total_steps": 4030, "loss": 0.0322, "learning_rate": 3.0972378557875884e-05, "epoch": 31.04, "percentage": 48.14, "elapsed_time": "0:55:16", "remaining_time": "0:59:32", "throughput": "0.00", "total_tokens": 0}
98
+ {"current_steps": 1960, "total_steps": 4030, "loss": 0.0316, "learning_rate": 3.055095907207465e-05, "epoch": 31.36, "percentage": 48.64, "elapsed_time": "0:55:50", "remaining_time": "0:58:58", "throughput": "0.00", "total_tokens": 0}
99
+ {"current_steps": 1980, "total_steps": 4030, "loss": 0.0248, "learning_rate": 3.0127873790369627e-05, "epoch": 31.68, "percentage": 49.13, "elapsed_time": "0:56:24", "remaining_time": "0:58:24", "throughput": "0.00", "total_tokens": 0}
100
+ {"current_steps": 2000, "total_steps": 4030, "loss": 0.0234, "learning_rate": 2.9703249677078156e-05, "epoch": 32.0, "percentage": 49.63, "elapsed_time": "0:56:58", "remaining_time": "0:57:49", "throughput": "0.00", "total_tokens": 0}
101
+ {"current_steps": 2020, "total_steps": 4030, "loss": 0.0277, "learning_rate": 2.9277214158307937e-05, "epoch": 32.32, "percentage": 50.12, "elapsed_time": "0:57:33", "remaining_time": "0:57:16", "throughput": "0.00", "total_tokens": 0}
102
+ {"current_steps": 2040, "total_steps": 4030, "loss": 0.0162, "learning_rate": 2.8849895083717537e-05, "epoch": 32.64, "percentage": 50.62, "elapsed_time": "0:58:07", "remaining_time": "0:56:41", "throughput": "0.00", "total_tokens": 0}
103
+ {"current_steps": 2060, "total_steps": 4030, "loss": 0.022, "learning_rate": 2.842142068814977e-05, "epoch": 32.96, "percentage": 51.12, "elapsed_time": "0:58:41", "remaining_time": "0:56:08", "throughput": "0.00", "total_tokens": 0}
104
+ {"current_steps": 2080, "total_steps": 4030, "loss": 0.0278, "learning_rate": 2.7991919553149497e-05, "epoch": 33.28, "percentage": 51.61, "elapsed_time": "0:59:15", "remaining_time": "0:55:33", "throughput": "0.00", "total_tokens": 0}
105
+ {"current_steps": 2100, "total_steps": 4030, "loss": 0.0189, "learning_rate": 2.756152056837743e-05, "epoch": 33.6, "percentage": 52.11, "elapsed_time": "0:59:49", "remaining_time": "0:54:59", "throughput": "0.00", "total_tokens": 0}
106
+ {"current_steps": 2120, "total_steps": 4030, "loss": 0.0228, "learning_rate": 2.7130352892931388e-05, "epoch": 33.92, "percentage": 52.61, "elapsed_time": "1:00:23", "remaining_time": "0:54:24", "throughput": "0.00", "total_tokens": 0}
107
+ {"current_steps": 2140, "total_steps": 4030, "loss": 0.0319, "learning_rate": 2.669854591658679e-05, "epoch": 34.24, "percentage": 53.1, "elapsed_time": "1:00:56", "remaining_time": "0:53:49", "throughput": "0.00", "total_tokens": 0}
108
+ {"current_steps": 2160, "total_steps": 4030, "loss": 0.0153, "learning_rate": 2.6266229220967818e-05, "epoch": 34.56, "percentage": 53.6, "elapsed_time": "1:01:29", "remaining_time": "0:53:14", "throughput": "0.00", "total_tokens": 0}
109
+ {"current_steps": 2180, "total_steps": 4030, "loss": 0.0267, "learning_rate": 2.5833532540661127e-05, "epoch": 34.88, "percentage": 54.09, "elapsed_time": "1:02:03", "remaining_time": "0:52:39", "throughput": "0.00", "total_tokens": 0}
110
+ {"current_steps": 2200, "total_steps": 4030, "loss": 0.0178, "learning_rate": 2.540058572428356e-05, "epoch": 35.2, "percentage": 54.59, "elapsed_time": "1:02:37", "remaining_time": "0:52:05", "throughput": "0.00", "total_tokens": 0}
111
+ {"current_steps": 2220, "total_steps": 4030, "loss": 0.0217, "learning_rate": 2.496751869551567e-05, "epoch": 35.52, "percentage": 55.09, "elapsed_time": "1:03:10", "remaining_time": "0:51:30", "throughput": "0.00", "total_tokens": 0}
112
+ {"current_steps": 2240, "total_steps": 4030, "loss": 0.017, "learning_rate": 2.453446141411273e-05, "epoch": 35.84, "percentage": 55.58, "elapsed_time": "1:03:44", "remaining_time": "0:50:56", "throughput": "0.00", "total_tokens": 0}
113
+ {"current_steps": 2260, "total_steps": 4030, "loss": 0.0257, "learning_rate": 2.4101543836904938e-05, "epoch": 36.16, "percentage": 56.08, "elapsed_time": "1:04:18", "remaining_time": "0:50:21", "throughput": "0.00", "total_tokens": 0}
114
+ {"current_steps": 2280, "total_steps": 4030, "loss": 0.0237, "learning_rate": 2.3668895878798424e-05, "epoch": 36.48, "percentage": 56.58, "elapsed_time": "1:04:52", "remaining_time": "0:49:47", "throughput": "0.00", "total_tokens": 0}
115
+ {"current_steps": 2300, "total_steps": 4030, "loss": 0.024, "learning_rate": 2.32366473737889e-05, "epoch": 36.8, "percentage": 57.07, "elapsed_time": "1:05:25", "remaining_time": "0:49:12", "throughput": "0.00", "total_tokens": 0}
116
+ {"current_steps": 2320, "total_steps": 4030, "loss": 0.0225, "learning_rate": 2.2804928035999594e-05, "epoch": 37.12, "percentage": 57.57, "elapsed_time": "1:06:00", "remaining_time": "0:48:39", "throughput": "0.00", "total_tokens": 0}
117
+ {"current_steps": 2340, "total_steps": 4030, "loss": 0.0239, "learning_rate": 2.23738674207551e-05, "epoch": 37.44, "percentage": 58.06, "elapsed_time": "1:06:35", "remaining_time": "0:48:05", "throughput": "0.00", "total_tokens": 0}
118
+ {"current_steps": 2360, "total_steps": 4030, "loss": 0.0235, "learning_rate": 2.1943594885702984e-05, "epoch": 37.76, "percentage": 58.56, "elapsed_time": "1:07:09", "remaining_time": "0:47:31", "throughput": "0.00", "total_tokens": 0}
119
+ {"current_steps": 2380, "total_steps": 4030, "loss": 0.0286, "learning_rate": 2.151423955199456e-05, "epoch": 38.08, "percentage": 59.06, "elapsed_time": "1:07:43", "remaining_time": "0:46:57", "throughput": "0.00", "total_tokens": 0}
120
+ {"current_steps": 2400, "total_steps": 4030, "loss": 0.0323, "learning_rate": 2.108593026553681e-05, "epoch": 38.4, "percentage": 59.55, "elapsed_time": "1:08:18", "remaining_time": "0:46:23", "throughput": "0.00", "total_tokens": 0}
121
+ {"current_steps": 2420, "total_steps": 4030, "loss": 0.0241, "learning_rate": 2.0658795558326743e-05, "epoch": 38.72, "percentage": 60.05, "elapsed_time": "1:08:52", "remaining_time": "0:45:48", "throughput": "0.00", "total_tokens": 0}
122
+ {"current_steps": 2440, "total_steps": 4030, "loss": 0.0158, "learning_rate": 2.0232963609880093e-05, "epoch": 39.04, "percentage": 60.55, "elapsed_time": "1:09:26", "remaining_time": "0:45:14", "throughput": "0.00", "total_tokens": 0}
123
+ {"current_steps": 2460, "total_steps": 4030, "loss": 0.0241, "learning_rate": 1.9808562208765667e-05, "epoch": 39.36, "percentage": 61.04, "elapsed_time": "1:10:00", "remaining_time": "0:44:40", "throughput": "0.00", "total_tokens": 0}
124
+ {"current_steps": 2480, "total_steps": 4030, "loss": 0.0174, "learning_rate": 1.938571871425715e-05, "epoch": 39.68, "percentage": 61.54, "elapsed_time": "1:10:34", "remaining_time": "0:44:06", "throughput": "0.00", "total_tokens": 0}
125
+ {"current_steps": 2500, "total_steps": 4030, "loss": 0.0183, "learning_rate": 1.896456001811357e-05, "epoch": 40.0, "percentage": 62.03, "elapsed_time": "1:11:08", "remaining_time": "0:43:32", "throughput": "0.00", "total_tokens": 0}
126
+ {"current_steps": 2520, "total_steps": 4030, "loss": 0.012, "learning_rate": 1.854521250650026e-05, "epoch": 40.32, "percentage": 62.53, "elapsed_time": "1:11:43", "remaining_time": "0:42:58", "throughput": "0.00", "total_tokens": 0}
127
+ {"current_steps": 2540, "total_steps": 4030, "loss": 0.0225, "learning_rate": 1.8127802022061334e-05, "epoch": 40.64, "percentage": 63.03, "elapsed_time": "1:12:16", "remaining_time": "0:42:24", "throughput": "0.00", "total_tokens": 0}
128
+ {"current_steps": 2560, "total_steps": 4030, "loss": 0.0391, "learning_rate": 1.7712453826155457e-05, "epoch": 40.96, "percentage": 63.52, "elapsed_time": "1:12:50", "remaining_time": "0:41:49", "throughput": "0.00", "total_tokens": 0}
129
+ {"current_steps": 2580, "total_steps": 4030, "loss": 0.0229, "learning_rate": 1.72992925612659e-05, "epoch": 41.28, "percentage": 64.02, "elapsed_time": "1:13:24", "remaining_time": "0:41:15", "throughput": "0.00", "total_tokens": 0}
130
+ {"current_steps": 2600, "total_steps": 4030, "loss": 0.015, "learning_rate": 1.688844221359645e-05, "epoch": 41.6, "percentage": 64.52, "elapsed_time": "1:13:59", "remaining_time": "0:40:41", "throughput": "0.00", "total_tokens": 0}
131
+ {"current_steps": 2620, "total_steps": 4030, "loss": 0.0287, "learning_rate": 1.6480026075864163e-05, "epoch": 41.92, "percentage": 65.01, "elapsed_time": "1:14:33", "remaining_time": "0:40:07", "throughput": "0.00", "total_tokens": 0}
132
+ {"current_steps": 2640, "total_steps": 4030, "loss": 0.0229, "learning_rate": 1.6074166710300247e-05, "epoch": 42.24, "percentage": 65.51, "elapsed_time": "1:15:08", "remaining_time": "0:39:33", "throughput": "0.00", "total_tokens": 0}
133
+ {"current_steps": 2660, "total_steps": 4030, "loss": 0.0352, "learning_rate": 1.567098591187021e-05, "epoch": 42.56, "percentage": 66.0, "elapsed_time": "1:15:42", "remaining_time": "0:38:59", "throughput": "0.00", "total_tokens": 0}
134
+ {"current_steps": 2680, "total_steps": 4030, "loss": 0.0242, "learning_rate": 1.5270604671724188e-05, "epoch": 42.88, "percentage": 66.5, "elapsed_time": "1:16:16", "remaining_time": "0:38:25", "throughput": "0.00", "total_tokens": 0}
135
+ {"current_steps": 2700, "total_steps": 4030, "loss": 0.0165, "learning_rate": 1.4873143140888538e-05, "epoch": 43.2, "percentage": 67.0, "elapsed_time": "1:16:50", "remaining_time": "0:37:51", "throughput": "0.00", "total_tokens": 0}
136
+ {"current_steps": 2720, "total_steps": 4030, "loss": 0.0274, "learning_rate": 1.4478720594209532e-05, "epoch": 43.52, "percentage": 67.49, "elapsed_time": "1:17:24", "remaining_time": "0:37:16", "throughput": "0.00", "total_tokens": 0}
137
+ {"current_steps": 2740, "total_steps": 4030, "loss": 0.0185, "learning_rate": 1.4087455394559984e-05, "epoch": 43.84, "percentage": 67.99, "elapsed_time": "1:17:58", "remaining_time": "0:36:42", "throughput": "0.00", "total_tokens": 0}
138
+ {"current_steps": 2760, "total_steps": 4030, "loss": 0.0509, "learning_rate": 1.369946495731954e-05, "epoch": 44.16, "percentage": 68.49, "elapsed_time": "1:18:31", "remaining_time": "0:36:08", "throughput": "0.00", "total_tokens": 0}
139
+ {"current_steps": 2780, "total_steps": 4030, "loss": 0.027, "learning_rate": 1.3314865715139346e-05, "epoch": 44.48, "percentage": 68.98, "elapsed_time": "1:19:05", "remaining_time": "0:35:33", "throughput": "0.00", "total_tokens": 0}
140
+ {"current_steps": 2800, "total_steps": 4030, "loss": 0.0163, "learning_rate": 1.2933773083001517e-05, "epoch": 44.8, "percentage": 69.48, "elapsed_time": "1:19:39", "remaining_time": "0:34:59", "throughput": "0.00", "total_tokens": 0}
141
+ {"current_steps": 2820, "total_steps": 4030, "loss": 0.0125, "learning_rate": 1.255630142358421e-05, "epoch": 45.12, "percentage": 69.98, "elapsed_time": "1:20:14", "remaining_time": "0:34:25", "throughput": "0.00", "total_tokens": 0}
142
+ {"current_steps": 2840, "total_steps": 4030, "loss": 0.0327, "learning_rate": 1.2182564012942193e-05, "epoch": 45.44, "percentage": 70.47, "elapsed_time": "1:20:49", "remaining_time": "0:33:51", "throughput": "0.00", "total_tokens": 0}
143
+ {"current_steps": 2860, "total_steps": 4030, "loss": 0.0302, "learning_rate": 1.1812673006513789e-05, "epoch": 45.76, "percentage": 70.97, "elapsed_time": "1:21:23", "remaining_time": "0:33:17", "throughput": "0.00", "total_tokens": 0}
144
+ {"current_steps": 2880, "total_steps": 4030, "loss": 0.0209, "learning_rate": 1.14467394054639e-05, "epoch": 46.08, "percentage": 71.46, "elapsed_time": "1:21:57", "remaining_time": "0:32:43", "throughput": "0.00", "total_tokens": 0}
145
+ {"current_steps": 2900, "total_steps": 4030, "loss": 0.025, "learning_rate": 1.108487302337353e-05, "epoch": 46.4, "percentage": 71.96, "elapsed_time": "1:22:32", "remaining_time": "0:32:09", "throughput": "0.00", "total_tokens": 0}
146
+ {"current_steps": 2920, "total_steps": 4030, "loss": 0.0284, "learning_rate": 1.0727182453285647e-05, "epoch": 46.72, "percentage": 72.46, "elapsed_time": "1:23:06", "remaining_time": "0:31:35", "throughput": "0.00", "total_tokens": 0}
147
+ {"current_steps": 2940, "total_steps": 4030, "loss": 0.0174, "learning_rate": 1.0373775035117305e-05, "epoch": 47.04, "percentage": 72.95, "elapsed_time": "1:23:40", "remaining_time": "0:31:01", "throughput": "0.00", "total_tokens": 0}
148
+ {"current_steps": 2960, "total_steps": 4030, "loss": 0.0115, "learning_rate": 1.002475682344792e-05, "epoch": 47.36, "percentage": 73.45, "elapsed_time": "1:24:15", "remaining_time": "0:30:27", "throughput": "0.00", "total_tokens": 0}
149
+ {"current_steps": 2980, "total_steps": 4030, "loss": 0.0238, "learning_rate": 9.680232555693067e-06, "epoch": 47.68, "percentage": 73.95, "elapsed_time": "1:24:49", "remaining_time": "0:29:53", "throughput": "0.00", "total_tokens": 0}
150
+ {"current_steps": 3000, "total_steps": 4030, "loss": 0.0294, "learning_rate": 9.340305620673778e-06, "epoch": 48.0, "percentage": 74.44, "elapsed_time": "1:25:23", "remaining_time": "0:29:19", "throughput": "0.00", "total_tokens": 0}
151
+ {"current_steps": 3020, "total_steps": 4030, "loss": 0.0226, "learning_rate": 9.005078027590375e-06, "epoch": 48.32, "percentage": 74.94, "elapsed_time": "1:25:58", "remaining_time": "0:28:45", "throughput": "0.00", "total_tokens": 0}
152
+ {"current_steps": 3040, "total_steps": 4030, "loss": 0.0196, "learning_rate": 8.67465037541038e-06, "epoch": 48.64, "percentage": 75.43, "elapsed_time": "1:26:32", "remaining_time": "0:28:11", "throughput": "0.00", "total_tokens": 0}
153
+ {"current_steps": 3060, "total_steps": 4030, "loss": 0.0175, "learning_rate": 8.34912182267959e-06, "epoch": 48.96, "percentage": 75.93, "elapsed_time": "1:27:07", "remaining_time": "0:27:36", "throughput": "0.00", "total_tokens": 0}
154
+ {"current_steps": 3080, "total_steps": 4030, "loss": 0.015, "learning_rate": 8.028590057765523e-06, "epoch": 49.28, "percentage": 76.43, "elapsed_time": "1:27:41", "remaining_time": "0:27:02", "throughput": "0.00", "total_tokens": 0}
155
+ {"current_steps": 3100, "total_steps": 4030, "loss": 0.0221, "learning_rate": 7.713151269541844e-06, "epoch": 49.6, "percentage": 76.92, "elapsed_time": "1:28:15", "remaining_time": "0:26:28", "throughput": "0.00", "total_tokens": 0}
156
+ {"current_steps": 3120, "total_steps": 4030, "loss": 0.0161, "learning_rate": 7.402900118522979e-06, "epoch": 49.92, "percentage": 77.42, "elapsed_time": "1:28:48", "remaining_time": "0:25:54", "throughput": "0.00", "total_tokens": 0}
157
+ {"current_steps": 3140, "total_steps": 4030, "loss": 0.0237, "learning_rate": 7.097929708457282e-06, "epoch": 50.24, "percentage": 77.92, "elapsed_time": "1:29:23", "remaining_time": "0:25:20", "throughput": "0.00", "total_tokens": 0}
158
+ {"current_steps": 3160, "total_steps": 4030, "loss": 0.0172, "learning_rate": 6.7983315583873695e-06, "epoch": 50.56, "percentage": 78.41, "elapsed_time": "1:29:56", "remaining_time": "0:24:45", "throughput": "0.00", "total_tokens": 0}
159
+ {"current_steps": 3180, "total_steps": 4030, "loss": 0.0198, "learning_rate": 6.504195575186009e-06, "epoch": 50.88, "percentage": 78.91, "elapsed_time": "1:30:30", "remaining_time": "0:24:11", "throughput": "0.00", "total_tokens": 0}
160
+ {"current_steps": 3200, "total_steps": 4030, "loss": 0.0227, "learning_rate": 6.215610026575916e-06, "epoch": 51.2, "percentage": 79.4, "elapsed_time": "1:31:04", "remaining_time": "0:23:37", "throughput": "0.00", "total_tokens": 0}
161
+ {"current_steps": 3220, "total_steps": 4030, "loss": 0.0156, "learning_rate": 5.93266151464123e-06, "epoch": 51.52, "percentage": 79.9, "elapsed_time": "1:31:37", "remaining_time": "0:23:03", "throughput": "0.00", "total_tokens": 0}
162
+ {"current_steps": 3240, "total_steps": 4030, "loss": 0.0268, "learning_rate": 5.655434949839061e-06, "epoch": 51.84, "percentage": 80.4, "elapsed_time": "1:32:12", "remaining_time": "0:22:28", "throughput": "0.00", "total_tokens": 0}
163
+ {"current_steps": 3260, "total_steps": 4030, "loss": 0.0209, "learning_rate": 5.384013525518541e-06, "epoch": 52.16, "percentage": 80.89, "elapsed_time": "1:32:46", "remaining_time": "0:21:54", "throughput": "0.00", "total_tokens": 0}
164
+ {"current_steps": 3280, "total_steps": 4030, "loss": 0.0202, "learning_rate": 5.118478692955194e-06, "epoch": 52.48, "percentage": 81.39, "elapsed_time": "1:33:19", "remaining_time": "0:21:20", "throughput": "0.00", "total_tokens": 0}
165
+ {"current_steps": 3300, "total_steps": 4030, "loss": 0.0192, "learning_rate": 4.858910136908123e-06, "epoch": 52.8, "percentage": 81.89, "elapsed_time": "1:33:53", "remaining_time": "0:20:46", "throughput": "0.00", "total_tokens": 0}
166
+ {"current_steps": 3320, "total_steps": 4030, "loss": 0.0205, "learning_rate": 4.605385751707248e-06, "epoch": 53.12, "percentage": 82.38, "elapsed_time": "1:34:27", "remaining_time": "0:20:12", "throughput": "0.00", "total_tokens": 0}
167
+ {"current_steps": 3340, "total_steps": 4030, "loss": 0.0129, "learning_rate": 4.357981617877932e-06, "epoch": 53.44, "percentage": 82.88, "elapsed_time": "1:35:01", "remaining_time": "0:19:37", "throughput": "0.00", "total_tokens": 0}
168
+ {"current_steps": 3360, "total_steps": 4030, "loss": 0.0258, "learning_rate": 4.116771979309797e-06, "epoch": 53.76, "percentage": 83.37, "elapsed_time": "1:35:35", "remaining_time": "0:19:03", "throughput": "0.00", "total_tokens": 0}
169
+ {"current_steps": 3380, "total_steps": 4030, "loss": 0.0306, "learning_rate": 3.881829220976807e-06, "epoch": 54.08, "percentage": 83.87, "elapsed_time": "1:36:09", "remaining_time": "0:18:29", "throughput": "0.00", "total_tokens": 0}
170
+ {"current_steps": 3400, "total_steps": 4030, "loss": 0.0198, "learning_rate": 3.653223847215126e-06, "epoch": 54.4, "percentage": 84.37, "elapsed_time": "1:36:43", "remaining_time": "0:17:55", "throughput": "0.00", "total_tokens": 0}
171
+ {"current_steps": 3420, "total_steps": 4030, "loss": 0.0257, "learning_rate": 3.4310244605653797e-06, "epoch": 54.72, "percentage": 84.86, "elapsed_time": "1:37:17", "remaining_time": "0:17:21", "throughput": "0.00", "total_tokens": 0}
172
+ {"current_steps": 3440, "total_steps": 4030, "loss": 0.0125, "learning_rate": 3.215297741185572e-06, "epoch": 55.04, "percentage": 85.36, "elapsed_time": "1:37:52", "remaining_time": "0:16:47", "throughput": "0.00", "total_tokens": 0}
173
+ {"current_steps": 3460, "total_steps": 4030, "loss": 0.0124, "learning_rate": 3.0061084268410006e-06, "epoch": 55.36, "percentage": 85.86, "elapsed_time": "1:38:27", "remaining_time": "0:16:13", "throughput": "0.00", "total_tokens": 0}
174
+ {"current_steps": 3480, "total_steps": 4030, "loss": 0.023, "learning_rate": 2.8035192934769362e-06, "epoch": 55.68, "percentage": 86.35, "elapsed_time": "1:39:01", "remaining_time": "0:15:39", "throughput": "0.00", "total_tokens": 0}
175
+ {"current_steps": 3500, "total_steps": 4030, "loss": 0.0194, "learning_rate": 2.607591136380122e-06, "epoch": 56.0, "percentage": 86.85, "elapsed_time": "1:39:36", "remaining_time": "0:15:04", "throughput": "0.00", "total_tokens": 0}
176
+ {"current_steps": 3520, "total_steps": 4030, "loss": 0.0162, "learning_rate": 2.4183827519346308e-06, "epoch": 56.32, "percentage": 87.34, "elapsed_time": "1:40:10", "remaining_time": "0:14:30", "throughput": "0.00", "total_tokens": 0}
177
+ {"current_steps": 3540, "total_steps": 4030, "loss": 0.0337, "learning_rate": 2.235950919977545e-06, "epoch": 56.64, "percentage": 87.84, "elapsed_time": "1:40:44", "remaining_time": "0:13:56", "throughput": "0.00", "total_tokens": 0}
178
+ {"current_steps": 3560, "total_steps": 4030, "loss": 0.0139, "learning_rate": 2.0603503867598182e-06, "epoch": 56.96, "percentage": 88.34, "elapsed_time": "1:41:18", "remaining_time": "0:13:22", "throughput": "0.00", "total_tokens": 0}
179
+ {"current_steps": 3580, "total_steps": 4030, "loss": 0.0193, "learning_rate": 1.8916338485173823e-06, "epoch": 57.28, "percentage": 88.83, "elapsed_time": "1:41:52", "remaining_time": "0:12:48", "throughput": "0.00", "total_tokens": 0}
180
+ {"current_steps": 3600, "total_steps": 4030, "loss": 0.0203, "learning_rate": 1.7298519356574727e-06, "epoch": 57.6, "percentage": 89.33, "elapsed_time": "1:42:28", "remaining_time": "0:12:14", "throughput": "0.00", "total_tokens": 0}
181
+ {"current_steps": 3620, "total_steps": 4030, "loss": 0.0212, "learning_rate": 1.5750531975648324e-06, "epoch": 57.92, "percentage": 89.83, "elapsed_time": "1:43:02", "remaining_time": "0:11:40", "throughput": "0.00", "total_tokens": 0}
182
+ {"current_steps": 3640, "total_steps": 4030, "loss": 0.0173, "learning_rate": 1.4272840880324934e-06, "epoch": 58.24, "percentage": 90.32, "elapsed_time": "1:43:36", "remaining_time": "0:11:06", "throughput": "0.00", "total_tokens": 0}
183
+ {"current_steps": 3660, "total_steps": 4030, "loss": 0.0139, "learning_rate": 1.286588951321363e-06, "epoch": 58.56, "percentage": 90.82, "elapsed_time": "1:44:10", "remaining_time": "0:10:31", "throughput": "0.00", "total_tokens": 0}
184
+ {"current_steps": 3680, "total_steps": 4030, "loss": 0.0268, "learning_rate": 1.1530100088528867e-06, "epoch": 58.88, "percentage": 91.32, "elapsed_time": "1:44:45", "remaining_time": "0:09:57", "throughput": "0.00", "total_tokens": 0}
185
+ {"current_steps": 3700, "total_steps": 4030, "loss": 0.0191, "learning_rate": 1.0265873465387516e-06, "epoch": 59.2, "percentage": 91.81, "elapsed_time": "1:45:18", "remaining_time": "0:09:23", "throughput": "0.00", "total_tokens": 0}
186
+ {"current_steps": 3720, "total_steps": 4030, "loss": 0.0168, "learning_rate": 9.073589027514789e-07, "epoch": 59.52, "percentage": 92.31, "elapsed_time": "1:45:53", "remaining_time": "0:08:49", "throughput": "0.00", "total_tokens": 0}
187
+ {"current_steps": 3740, "total_steps": 4030, "loss": 0.0246, "learning_rate": 7.953604569393841e-07, "epoch": 59.84, "percentage": 92.8, "elapsed_time": "1:46:27", "remaining_time": "0:08:15", "throughput": "0.00", "total_tokens": 0}
188
+ {"current_steps": 3760, "total_steps": 4030, "loss": 0.019, "learning_rate": 6.906256188895038e-07, "epoch": 60.16, "percentage": 93.3, "elapsed_time": "1:47:00", "remaining_time": "0:07:41", "throughput": "0.00", "total_tokens": 0}
189
+ {"current_steps": 3780, "total_steps": 4030, "loss": 0.0168, "learning_rate": 5.931858186415756e-07, "epoch": 60.48, "percentage": 93.8, "elapsed_time": "1:47:34", "remaining_time": "0:07:06", "throughput": "0.00", "total_tokens": 0}
190
+ {"current_steps": 3800, "total_steps": 4030, "loss": 0.0197, "learning_rate": 5.03070297056149e-07, "epoch": 60.8, "percentage": 94.29, "elapsed_time": "1:48:08", "remaining_time": "0:06:32", "throughput": "0.00", "total_tokens": 0}
191
+ {"current_steps": 3820, "total_steps": 4030, "loss": 0.0207, "learning_rate": 4.203060970396383e-07, "epoch": 61.12, "percentage": 94.79, "elapsed_time": "1:48:42", "remaining_time": "0:05:58", "throughput": "0.00", "total_tokens": 0}
192
+ {"current_steps": 3840, "total_steps": 4030, "loss": 0.0224, "learning_rate": 3.4491805542899157e-07, "epoch": 61.44, "percentage": 95.29, "elapsed_time": "1:49:16", "remaining_time": "0:05:24", "throughput": "0.00", "total_tokens": 0}
193
+ {"current_steps": 3860, "total_steps": 4030, "loss": 0.0151, "learning_rate": 2.769287955383532e-07, "epoch": 61.76, "percentage": 95.78, "elapsed_time": "1:49:51", "remaining_time": "0:04:50", "throughput": "0.00", "total_tokens": 0}
194
+ {"current_steps": 3880, "total_steps": 4030, "loss": 0.0284, "learning_rate": 2.1635872037001626e-07, "epoch": 62.08, "percentage": 96.28, "elapsed_time": "1:50:25", "remaining_time": "0:04:16", "throughput": "0.00", "total_tokens": 0}
195
+ {"current_steps": 3900, "total_steps": 4030, "loss": 0.0217, "learning_rate": 1.6322600649162356e-07, "epoch": 62.4, "percentage": 96.77, "elapsed_time": "1:50:59", "remaining_time": "0:03:41", "throughput": "0.00", "total_tokens": 0}
196
+ {"current_steps": 3920, "total_steps": 4030, "loss": 0.0103, "learning_rate": 1.1754659858156659e-07, "epoch": 62.72, "percentage": 97.27, "elapsed_time": "1:51:34", "remaining_time": "0:03:07", "throughput": "0.00", "total_tokens": 0}
197
+ {"current_steps": 3940, "total_steps": 4030, "loss": 0.0333, "learning_rate": 7.933420464410201e-08, "epoch": 63.04, "percentage": 97.77, "elapsed_time": "1:52:08", "remaining_time": "0:02:33", "throughput": "0.00", "total_tokens": 0}
198
+ {"current_steps": 3960, "total_steps": 4030, "loss": 0.0231, "learning_rate": 4.860029189569237e-08, "epoch": 63.36, "percentage": 98.26, "elapsed_time": "1:52:42", "remaining_time": "0:01:59", "throughput": "0.00", "total_tokens": 0}
199
+ {"current_steps": 3980, "total_steps": 4030, "loss": 0.0226, "learning_rate": 2.535408332381417e-08, "epoch": 63.68, "percentage": 98.76, "elapsed_time": "1:53:16", "remaining_time": "0:01:25", "throughput": "0.00", "total_tokens": 0}
200
+ {"current_steps": 4000, "total_steps": 4030, "loss": 0.0218, "learning_rate": 9.60255491919415e-09, "epoch": 64.0, "percentage": 99.26, "elapsed_time": "1:53:50", "remaining_time": "0:00:51", "throughput": "0.00", "total_tokens": 0}
201
+ {"current_steps": 4020, "total_steps": 4030, "loss": 0.0219, "learning_rate": 1.3504335823810722e-09, "epoch": 64.32, "percentage": 99.75, "elapsed_time": "1:54:25", "remaining_time": "0:00:17", "throughput": "0.00", "total_tokens": 0}
202
+ {"current_steps": 4030, "total_steps": 4030, "epoch": 64.48, "percentage": 100.0, "elapsed_time": "1:54:42", "remaining_time": "0:00:00", "throughput": "0.00", "total_tokens": 0}
trainer_state.json CHANGED
@@ -1,1112 +1,1434 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 49.6,
5
  "eval_steps": 500,
6
- "global_step": 3100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.32,
13
- "grad_norm": 2.214611053466797,
14
- "learning_rate": 3.064516129032258e-06,
15
- "loss": 2.8022,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.64,
20
- "grad_norm": 0.9293187260627747,
21
- "learning_rate": 6.290322580645161e-06,
22
- "loss": 2.6463,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.96,
27
- "grad_norm": 2.630194902420044,
28
- "learning_rate": 9.516129032258064e-06,
29
- "loss": 2.4026,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 1.28,
34
- "grad_norm": 1.7517138719558716,
35
- "learning_rate": 1.2741935483870968e-05,
36
- "loss": 1.9639,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 1.6,
41
- "grad_norm": 1.6100302934646606,
42
- "learning_rate": 1.596774193548387e-05,
43
- "loss": 1.7115,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 1.92,
48
- "grad_norm": 3.468630790710449,
49
- "learning_rate": 1.9193548387096774e-05,
50
- "loss": 1.7534,
51
  "step": 120
52
  },
53
  {
54
  "epoch": 2.24,
55
- "grad_norm": 2.6692144870758057,
56
- "learning_rate": 2.2419354838709678e-05,
57
- "loss": 1.3422,
58
  "step": 140
59
  },
60
  {
61
  "epoch": 2.56,
62
- "grad_norm": 1.5186293125152588,
63
- "learning_rate": 2.5645161290322582e-05,
64
- "loss": 1.1582,
65
  "step": 160
66
  },
67
  {
68
  "epoch": 2.88,
69
- "grad_norm": 4.727887153625488,
70
- "learning_rate": 2.8870967741935483e-05,
71
- "loss": 1.1084,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 3.2,
76
- "grad_norm": 2.800290107727051,
77
- "learning_rate": 3.2096774193548393e-05,
78
- "loss": 0.844,
79
  "step": 200
80
  },
81
  {
82
  "epoch": 3.52,
83
- "grad_norm": 1.1538563966751099,
84
- "learning_rate": 3.532258064516129e-05,
85
- "loss": 0.6051,
86
  "step": 220
87
  },
88
  {
89
  "epoch": 3.84,
90
- "grad_norm": 3.282315492630005,
91
- "learning_rate": 3.8548387096774195e-05,
92
- "loss": 0.7587,
93
  "step": 240
94
  },
95
  {
96
  "epoch": 4.16,
97
- "grad_norm": 0.9393389821052551,
98
- "learning_rate": 4.17741935483871e-05,
99
- "loss": 0.537,
100
  "step": 260
101
  },
102
  {
103
  "epoch": 4.48,
104
- "grad_norm": 1.5951660871505737,
105
- "learning_rate": 4.5e-05,
106
- "loss": 0.4063,
107
  "step": 280
108
  },
109
  {
110
  "epoch": 4.8,
111
- "grad_norm": 2.4143407344818115,
112
- "learning_rate": 4.822580645161291e-05,
113
- "loss": 0.4817,
114
  "step": 300
115
  },
116
  {
117
  "epoch": 5.12,
118
- "grad_norm": 1.1503572463989258,
119
- "learning_rate": 4.9998716243505096e-05,
120
- "loss": 0.3556,
121
  "step": 320
122
  },
123
  {
124
  "epoch": 5.44,
125
- "grad_norm": 1.5514352321624756,
126
- "learning_rate": 4.9986672191133314e-05,
127
- "loss": 0.2663,
128
  "step": 340
129
  },
130
  {
131
  "epoch": 5.76,
132
- "grad_norm": 2.0045723915100098,
133
- "learning_rate": 4.9961956248762694e-05,
134
- "loss": 0.3273,
135
  "step": 360
136
  },
137
  {
138
  "epoch": 6.08,
139
- "grad_norm": 0.716200053691864,
140
- "learning_rate": 4.992458095098368e-05,
141
- "loss": 0.2174,
142
  "step": 380
143
  },
144
  {
145
  "epoch": 6.4,
146
- "grad_norm": 3.2870705127716064,
147
- "learning_rate": 4.9874565252527765e-05,
148
- "loss": 0.1886,
149
  "step": 400
150
  },
151
  {
152
  "epoch": 6.72,
153
- "grad_norm": 2.3181071281433105,
154
- "learning_rate": 4.981193451865465e-05,
155
- "loss": 0.2278,
156
  "step": 420
157
  },
158
  {
159
  "epoch": 7.04,
160
- "grad_norm": 1.6296441555023193,
161
- "learning_rate": 4.9736720512288334e-05,
162
- "loss": 0.168,
163
  "step": 440
164
  },
165
  {
166
  "epoch": 7.36,
167
- "grad_norm": 1.757379174232483,
168
- "learning_rate": 4.964896137790873e-05,
169
- "loss": 0.1227,
170
  "step": 460
171
  },
172
  {
173
  "epoch": 7.68,
174
- "grad_norm": 1.2622524499893188,
175
- "learning_rate": 4.954870162220679e-05,
176
- "loss": 0.1261,
177
  "step": 480
178
  },
179
  {
180
  "epoch": 8.0,
181
- "grad_norm": 1.1865277290344238,
182
- "learning_rate": 4.943599209151314e-05,
183
- "loss": 0.1167,
184
  "step": 500
185
  },
186
  {
187
  "epoch": 8.32,
188
- "grad_norm": 1.244333028793335,
189
- "learning_rate": 4.931088994601157e-05,
190
- "loss": 0.1049,
191
  "step": 520
192
  },
193
  {
194
  "epoch": 8.64,
195
- "grad_norm": 2.6851558685302734,
196
- "learning_rate": 4.917345863075048e-05,
197
- "loss": 0.1016,
198
  "step": 540
199
  },
200
  {
201
  "epoch": 8.96,
202
- "grad_norm": 1.0457267761230469,
203
- "learning_rate": 4.902376784346697e-05,
204
- "loss": 0.1157,
205
  "step": 560
206
  },
207
  {
208
  "epoch": 9.28,
209
- "grad_norm": 0.2963043749332428,
210
- "learning_rate": 4.886189349923992e-05,
211
- "loss": 0.073,
212
  "step": 580
213
  },
214
  {
215
  "epoch": 9.6,
216
- "grad_norm": 0.45336633920669556,
217
- "learning_rate": 4.868791769198995e-05,
218
- "loss": 0.0908,
219
  "step": 600
220
  },
221
  {
222
  "epoch": 9.92,
223
- "grad_norm": 1.0918829441070557,
224
- "learning_rate": 4.8501928652845854e-05,
225
- "loss": 0.0557,
226
  "step": 620
227
  },
228
  {
229
  "epoch": 10.24,
230
- "grad_norm": 2.7920358180999756,
231
- "learning_rate": 4.83040207053985e-05,
232
- "loss": 0.0779,
233
  "step": 640
234
  },
235
  {
236
  "epoch": 10.56,
237
- "grad_norm": 0.8492644429206848,
238
- "learning_rate": 4.809429421786502e-05,
239
- "loss": 0.048,
240
  "step": 660
241
  },
242
  {
243
  "epoch": 10.88,
244
- "grad_norm": 0.6128495335578918,
245
- "learning_rate": 4.787285555218748e-05,
246
- "loss": 0.0747,
247
  "step": 680
248
  },
249
  {
250
  "epoch": 11.2,
251
- "grad_norm": 0.5186921954154968,
252
- "learning_rate": 4.763981701009184e-05,
253
- "loss": 0.0629,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 11.52,
258
- "grad_norm": 1.6753857135772705,
259
- "learning_rate": 4.739529677613456e-05,
260
- "loss": 0.051,
261
  "step": 720
262
  },
263
  {
264
  "epoch": 11.84,
265
- "grad_norm": 0.18876530230045319,
266
- "learning_rate": 4.713941885776586e-05,
267
- "loss": 0.0699,
268
  "step": 740
269
  },
270
  {
271
  "epoch": 12.16,
272
- "grad_norm": 0.4428744912147522,
273
- "learning_rate": 4.687231302243975e-05,
274
- "loss": 0.0526,
275
  "step": 760
276
  },
277
  {
278
  "epoch": 12.48,
279
- "grad_norm": 0.4765178859233856,
280
- "learning_rate": 4.659411473180304e-05,
281
- "loss": 0.0412,
282
  "step": 780
283
  },
284
  {
285
  "epoch": 12.8,
286
- "grad_norm": 1.4559166431427002,
287
- "learning_rate": 4.6304965072996495e-05,
288
- "loss": 0.0495,
289
  "step": 800
290
  },
291
  {
292
  "epoch": 13.12,
293
- "grad_norm": 1.1842377185821533,
294
- "learning_rate": 4.6005010687103076e-05,
295
- "loss": 0.063,
296
  "step": 820
297
  },
298
  {
299
  "epoch": 13.44,
300
- "grad_norm": 0.5502442717552185,
301
- "learning_rate": 4.569440369477951e-05,
302
- "loss": 0.0425,
303
  "step": 840
304
  },
305
  {
306
  "epoch": 13.76,
307
- "grad_norm": 1.819698452949524,
308
- "learning_rate": 4.5373301619108854e-05,
309
- "loss": 0.0451,
310
  "step": 860
311
  },
312
  {
313
  "epoch": 14.08,
314
- "grad_norm": 0.045023053884506226,
315
- "learning_rate": 4.5041867305713384e-05,
316
- "loss": 0.0445,
317
  "step": 880
318
  },
319
  {
320
  "epoch": 14.4,
321
- "grad_norm": 0.10354474931955338,
322
- "learning_rate": 4.4700268840168045e-05,
323
- "loss": 0.0214,
324
  "step": 900
325
  },
326
  {
327
  "epoch": 14.72,
328
- "grad_norm": 0.23762211203575134,
329
- "learning_rate": 4.4348679462756556e-05,
330
- "loss": 0.0552,
331
  "step": 920
332
  },
333
  {
334
  "epoch": 15.04,
335
- "grad_norm": 0.845052182674408,
336
- "learning_rate": 4.398727748061324e-05,
337
- "loss": 0.0524,
338
  "step": 940
339
  },
340
  {
341
  "epoch": 15.36,
342
- "grad_norm": 0.29455024003982544,
343
- "learning_rate": 4.361624617729536e-05,
344
- "loss": 0.0318,
345
  "step": 960
346
  },
347
  {
348
  "epoch": 15.68,
349
- "grad_norm": 0.05131356045603752,
350
- "learning_rate": 4.323577371983155e-05,
351
- "loss": 0.0347,
352
  "step": 980
353
  },
354
  {
355
  "epoch": 16.0,
356
- "grad_norm": 0.2307986468076706,
357
- "learning_rate": 4.28460530632937e-05,
358
- "loss": 0.0541,
359
  "step": 1000
360
  },
361
  {
362
  "epoch": 16.32,
363
- "grad_norm": 0.9982470870018005,
364
- "learning_rate": 4.2447281852940525e-05,
365
- "loss": 0.0327,
366
  "step": 1020
367
  },
368
  {
369
  "epoch": 16.64,
370
- "grad_norm": 2.9247119426727295,
371
- "learning_rate": 4.203966232398261e-05,
372
- "loss": 0.0251,
373
  "step": 1040
374
  },
375
  {
376
  "epoch": 16.96,
377
- "grad_norm": 0.5199835896492004,
378
- "learning_rate": 4.162340119901961e-05,
379
- "loss": 0.0451,
380
  "step": 1060
381
  },
382
  {
383
  "epoch": 17.28,
384
- "grad_norm": 0.6802399754524231,
385
- "learning_rate": 4.1198709583201754e-05,
386
- "loss": 0.0272,
387
  "step": 1080
388
  },
389
  {
390
  "epoch": 17.6,
391
- "grad_norm": 0.9806874394416809,
392
- "learning_rate": 4.0765802857168687e-05,
393
- "loss": 0.0517,
394
  "step": 1100
395
  },
396
  {
397
  "epoch": 17.92,
398
- "grad_norm": 0.12717130780220032,
399
- "learning_rate": 4.0324900567820046e-05,
400
- "loss": 0.0286,
401
  "step": 1120
402
  },
403
  {
404
  "epoch": 18.24,
405
- "grad_norm": 0.4048568308353424,
406
- "learning_rate": 3.987622631697316e-05,
407
- "loss": 0.0258,
408
  "step": 1140
409
  },
410
  {
411
  "epoch": 18.56,
412
- "grad_norm": 1.2760275602340698,
413
- "learning_rate": 3.942000764796427e-05,
414
- "loss": 0.0289,
415
  "step": 1160
416
  },
417
  {
418
  "epoch": 18.88,
419
- "grad_norm": 0.30493712425231934,
420
- "learning_rate": 3.895647593025088e-05,
421
- "loss": 0.0457,
422
  "step": 1180
423
  },
424
  {
425
  "epoch": 19.2,
426
- "grad_norm": 0.6177706122398376,
427
- "learning_rate": 3.8485866242073584e-05,
428
- "loss": 0.0316,
429
  "step": 1200
430
  },
431
  {
432
  "epoch": 19.52,
433
- "grad_norm": 0.10721703618764877,
434
- "learning_rate": 3.80084172512372e-05,
435
- "loss": 0.0326,
436
  "step": 1220
437
  },
438
  {
439
  "epoch": 19.84,
440
- "grad_norm": 0.06823062896728516,
441
- "learning_rate": 3.7524371094071266e-05,
442
- "loss": 0.0238,
443
  "step": 1240
444
  },
445
  {
446
  "epoch": 20.16,
447
- "grad_norm": 0.9792996644973755,
448
- "learning_rate": 3.703397325263162e-05,
449
- "loss": 0.0286,
450
  "step": 1260
451
  },
452
  {
453
  "epoch": 20.48,
454
- "grad_norm": 0.20131415128707886,
455
- "learning_rate": 3.653747243020515e-05,
456
- "loss": 0.0294,
457
  "step": 1280
458
  },
459
  {
460
  "epoch": 20.8,
461
- "grad_norm": 0.530823826789856,
462
- "learning_rate": 3.603512042518093e-05,
463
- "loss": 0.0364,
464
  "step": 1300
465
  },
466
  {
467
  "epoch": 21.12,
468
- "grad_norm": 0.006112121045589447,
469
- "learning_rate": 3.552717200335171e-05,
470
- "loss": 0.0265,
471
  "step": 1320
472
  },
473
  {
474
  "epoch": 21.44,
475
- "grad_norm": 0.005943021737039089,
476
- "learning_rate": 3.501388476871039e-05,
477
- "loss": 0.0319,
478
  "step": 1340
479
  },
480
  {
481
  "epoch": 21.76,
482
- "grad_norm": 0.07666248083114624,
483
- "learning_rate": 3.449551903280729e-05,
484
- "loss": 0.0137,
485
  "step": 1360
486
  },
487
  {
488
  "epoch": 22.08,
489
- "grad_norm": 0.6763716340065002,
490
- "learning_rate": 3.397233768273415e-05,
491
- "loss": 0.0416,
492
  "step": 1380
493
  },
494
  {
495
  "epoch": 22.4,
496
- "grad_norm": 0.07458912581205368,
497
- "learning_rate": 3.344460604780202e-05,
498
- "loss": 0.0179,
499
  "step": 1400
500
  },
501
  {
502
  "epoch": 22.72,
503
- "grad_norm": 0.25514400005340576,
504
- "learning_rate": 3.291259176498052e-05,
505
- "loss": 0.0276,
506
  "step": 1420
507
  },
508
  {
509
  "epoch": 23.04,
510
- "grad_norm": 0.006252670660614967,
511
- "learning_rate": 3.237656464316693e-05,
512
- "loss": 0.0352,
513
  "step": 1440
514
  },
515
  {
516
  "epoch": 23.36,
517
- "grad_norm": 0.36337536573410034,
518
- "learning_rate": 3.183679652635357e-05,
519
- "loss": 0.0212,
520
  "step": 1460
521
  },
522
  {
523
  "epoch": 23.68,
524
- "grad_norm": 0.28176209330558777,
525
- "learning_rate": 3.129356115576332e-05,
526
- "loss": 0.0338,
527
  "step": 1480
528
  },
529
  {
530
  "epoch": 24.0,
531
- "grad_norm": 0.012418941594660282,
532
- "learning_rate": 3.074713403102284e-05,
533
- "loss": 0.0295,
534
  "step": 1500
535
  },
536
  {
537
  "epoch": 24.32,
538
- "grad_norm": 0.16376622021198273,
539
- "learning_rate": 3.0197792270443982e-05,
540
- "loss": 0.0185,
541
  "step": 1520
542
  },
543
  {
544
  "epoch": 24.64,
545
- "grad_norm": 0.45608577132225037,
546
- "learning_rate": 2.9645814470484452e-05,
547
- "loss": 0.0328,
548
  "step": 1540
549
  },
550
  {
551
  "epoch": 24.96,
552
- "grad_norm": 0.1006656065583229,
553
- "learning_rate": 2.9091480564458666e-05,
554
- "loss": 0.025,
555
  "step": 1560
556
  },
557
  {
558
  "epoch": 25.28,
559
- "grad_norm": 0.3901682496070862,
560
- "learning_rate": 2.8535071680570734e-05,
561
- "loss": 0.0294,
562
  "step": 1580
563
  },
564
  {
565
  "epoch": 25.6,
566
- "grad_norm": 0.12243347615003586,
567
- "learning_rate": 2.7976869999341426e-05,
568
- "loss": 0.0282,
569
  "step": 1600
570
  },
571
  {
572
  "epoch": 25.92,
573
- "grad_norm": 0.0033943182788789272,
574
- "learning_rate": 2.741715861050143e-05,
575
- "loss": 0.0294,
576
  "step": 1620
577
  },
578
  {
579
  "epoch": 26.24,
580
- "grad_norm": 0.0014275741996243596,
581
- "learning_rate": 2.685622136942359e-05,
582
  "loss": 0.0354,
583
  "step": 1640
584
  },
585
  {
586
  "epoch": 26.56,
587
- "grad_norm": 0.018641650676727295,
588
- "learning_rate": 2.629434275316673e-05,
589
- "loss": 0.0162,
590
  "step": 1660
591
  },
592
  {
593
  "epoch": 26.88,
594
- "grad_norm": 0.13316482305526733,
595
- "learning_rate": 2.573180771620432e-05,
596
- "loss": 0.0205,
597
  "step": 1680
598
  },
599
  {
600
  "epoch": 27.2,
601
- "grad_norm": 0.003175324061885476,
602
- "learning_rate": 2.516890154591095e-05,
603
- "loss": 0.0129,
604
  "step": 1700
605
  },
606
  {
607
  "epoch": 27.52,
608
- "grad_norm": 0.28820428252220154,
609
- "learning_rate": 2.4605909717879964e-05,
610
- "loss": 0.0333,
611
  "step": 1720
612
  },
613
  {
614
  "epoch": 27.84,
615
- "grad_norm": 0.08302447199821472,
616
- "learning_rate": 2.4043117751145694e-05,
617
- "loss": 0.0261,
618
  "step": 1740
619
  },
620
  {
621
  "epoch": 28.16,
622
- "grad_norm": 0.37718892097473145,
623
- "learning_rate": 2.34808110633836e-05,
624
- "loss": 0.0418,
625
  "step": 1760
626
  },
627
  {
628
  "epoch": 28.48,
629
- "grad_norm": 0.16185913980007172,
630
- "learning_rate": 2.291927482616191e-05,
631
- "loss": 0.0111,
632
  "step": 1780
633
  },
634
  {
635
  "epoch": 28.8,
636
- "grad_norm": 0.33071696758270264,
637
- "learning_rate": 2.235879382031794e-05,
638
- "loss": 0.0263,
639
  "step": 1800
640
  },
641
  {
642
  "epoch": 29.12,
643
- "grad_norm": 0.002949915360659361,
644
- "learning_rate": 2.179965229153265e-05,
645
- "loss": 0.0299,
646
  "step": 1820
647
  },
648
  {
649
  "epoch": 29.44,
650
- "grad_norm": 0.0035819699987769127,
651
- "learning_rate": 2.1242133806176667e-05,
652
- "loss": 0.0267,
653
  "step": 1840
654
  },
655
  {
656
  "epoch": 29.76,
657
- "grad_norm": 0.0039703804068267345,
658
- "learning_rate": 2.0686521107500638e-05,
659
- "loss": 0.0204,
660
  "step": 1860
661
  },
662
  {
663
  "epoch": 30.08,
664
- "grad_norm": 0.004021900240331888,
665
- "learning_rate": 2.0133095972243233e-05,
666
- "loss": 0.0308,
667
  "step": 1880
668
  },
669
  {
670
  "epoch": 30.4,
671
- "grad_norm": 0.2821226418018341,
672
- "learning_rate": 1.9582139067729117e-05,
673
- "loss": 0.024,
674
  "step": 1900
675
  },
676
  {
677
  "epoch": 30.72,
678
- "grad_norm": 0.5529562830924988,
679
- "learning_rate": 1.90339298095297e-05,
680
- "loss": 0.0113,
681
  "step": 1920
682
  },
683
  {
684
  "epoch": 31.04,
685
- "grad_norm": 0.1492016613483429,
686
- "learning_rate": 1.8488746219758674e-05,
687
- "loss": 0.0301,
688
  "step": 1940
689
  },
690
  {
691
  "epoch": 31.36,
692
- "grad_norm": 0.19194553792476654,
693
- "learning_rate": 1.7946864786074165e-05,
694
- "loss": 0.0293,
695
  "step": 1960
696
  },
697
  {
698
  "epoch": 31.68,
699
- "grad_norm": 0.1448647677898407,
700
- "learning_rate": 1.740856032145917e-05,
701
- "loss": 0.0242,
702
  "step": 1980
703
  },
704
  {
705
  "epoch": 32.0,
706
- "grad_norm": 0.06869282573461533,
707
- "learning_rate": 1.6874105824851267e-05,
708
- "loss": 0.022,
709
  "step": 2000
710
  },
711
  {
712
  "epoch": 32.32,
713
- "grad_norm": 0.002635813085362315,
714
- "learning_rate": 1.634377234269226e-05,
715
- "loss": 0.0264,
716
  "step": 2020
717
  },
718
  {
719
  "epoch": 32.64,
720
- "grad_norm": 0.0027471587527543306,
721
- "learning_rate": 1.5817828831468144e-05,
722
- "loss": 0.0155,
723
  "step": 2040
724
  },
725
  {
726
  "epoch": 32.96,
727
- "grad_norm": 0.28300318121910095,
728
- "learning_rate": 1.5296542021308825e-05,
729
- "loss": 0.0208,
730
  "step": 2060
731
  },
732
  {
733
  "epoch": 33.28,
734
- "grad_norm": 0.0016769981011748314,
735
- "learning_rate": 1.478017628071706e-05,
736
- "loss": 0.0264,
737
  "step": 2080
738
  },
739
  {
740
  "epoch": 33.6,
741
- "grad_norm": 0.0014146752655506134,
742
- "learning_rate": 1.4268993482495055e-05,
743
- "loss": 0.0174,
744
  "step": 2100
745
  },
746
  {
747
  "epoch": 33.92,
748
- "grad_norm": 0.05964767187833786,
749
- "learning_rate": 1.3763252870936649e-05,
750
- "loss": 0.0214,
751
  "step": 2120
752
  },
753
  {
754
  "epoch": 34.24,
755
- "grad_norm": 0.0016291196225211024,
756
- "learning_rate": 1.3263210930352737e-05,
757
- "loss": 0.0306,
758
  "step": 2140
759
  },
760
  {
761
  "epoch": 34.56,
762
- "grad_norm": 0.034806057810783386,
763
- "learning_rate": 1.2769121254996159e-05,
764
- "loss": 0.0146,
765
  "step": 2160
766
  },
767
  {
768
  "epoch": 34.88,
769
- "grad_norm": 0.315729558467865,
770
- "learning_rate": 1.228123442045249e-05,
771
- "loss": 0.0254,
772
  "step": 2180
773
  },
774
  {
775
  "epoch": 35.2,
776
- "grad_norm": 0.5809018015861511,
777
- "learning_rate": 1.1799797856561606e-05,
778
- "loss": 0.0176,
779
  "step": 2200
780
  },
781
  {
782
  "epoch": 35.52,
783
- "grad_norm": 0.18777510523796082,
784
- "learning_rate": 1.1325055721934637e-05,
785
- "loss": 0.0205,
786
  "step": 2220
787
  },
788
  {
789
  "epoch": 35.84,
790
- "grad_norm": 0.11027589440345764,
791
- "learning_rate": 1.0857248780129928e-05,
792
- "loss": 0.0153,
793
  "step": 2240
794
  },
795
  {
796
  "epoch": 36.16,
797
- "grad_norm": 0.18985402584075928,
798
- "learning_rate": 1.0396614277550752e-05,
799
- "loss": 0.0251,
800
  "step": 2260
801
  },
802
  {
803
  "epoch": 36.48,
804
- "grad_norm": 0.011078303679823875,
805
- "learning_rate": 9.943385823126775e-06,
806
- "loss": 0.0224,
807
  "step": 2280
808
  },
809
  {
810
  "epoch": 36.8,
811
- "grad_norm": 0.15364831686019897,
812
- "learning_rate": 9.497793269840211e-06,
813
- "loss": 0.0219,
814
  "step": 2300
815
  },
816
  {
817
  "epoch": 37.12,
818
- "grad_norm": 0.16373781859874725,
819
- "learning_rate": 9.06006259815683e-06,
820
- "loss": 0.021,
821
  "step": 2320
822
  },
823
  {
824
  "epoch": 37.44,
825
- "grad_norm": 0.41937291622161865,
826
- "learning_rate": 8.630415801420835e-06,
827
- "loss": 0.0236,
828
  "step": 2340
829
  },
830
  {
831
  "epoch": 37.76,
832
- "grad_norm": 0.4237123727798462,
833
- "learning_rate": 8.209070773271894e-06,
834
- "loss": 0.0216,
835
  "step": 2360
836
  },
837
  {
838
  "epoch": 38.08,
839
- "grad_norm": 0.08532612025737762,
840
- "learning_rate": 7.79624119714121e-06,
841
- "loss": 0.0268,
842
  "step": 2380
843
  },
844
  {
845
  "epoch": 38.4,
846
- "grad_norm": 0.2872686982154846,
847
- "learning_rate": 7.392136437882855e-06,
848
- "loss": 0.0306,
849
  "step": 2400
850
  },
851
  {
852
  "epoch": 38.72,
853
- "grad_norm": 0.2834513485431671,
854
- "learning_rate": 6.996961435595223e-06,
855
- "loss": 0.0224,
856
  "step": 2420
857
  },
858
  {
859
  "epoch": 39.04,
860
- "grad_norm": 0.0006759735988453031,
861
- "learning_rate": 6.610916601686481e-06,
862
- "loss": 0.0138,
863
  "step": 2440
864
  },
865
  {
866
  "epoch": 39.36,
867
- "grad_norm": 0.24796371161937714,
868
- "learning_rate": 6.234197717236742e-06,
869
- "loss": 0.0234,
870
  "step": 2460
871
  },
872
  {
873
  "epoch": 39.68,
874
- "grad_norm": 0.2657662332057953,
875
- "learning_rate": 5.866995833708464e-06,
876
- "loss": 0.0164,
877
  "step": 2480
878
  },
879
  {
880
  "epoch": 40.0,
881
- "grad_norm": 0.0017559522530063987,
882
- "learning_rate": 5.509497176055492e-06,
883
- "loss": 0.0169,
884
  "step": 2500
885
  },
886
  {
887
  "epoch": 40.32,
888
- "grad_norm": 0.18896016478538513,
889
- "learning_rate": 5.161883048279817e-06,
890
- "loss": 0.0116,
891
  "step": 2520
892
  },
893
  {
894
  "epoch": 40.64,
895
- "grad_norm": 0.2336779683828354,
896
- "learning_rate": 4.824329741483949e-06,
897
- "loss": 0.0219,
898
  "step": 2540
899
  },
900
  {
901
  "epoch": 40.96,
902
- "grad_norm": 0.16521525382995605,
903
- "learning_rate": 4.497008444465681e-06,
904
- "loss": 0.0366,
905
  "step": 2560
906
  },
907
  {
908
  "epoch": 41.28,
909
- "grad_norm": 0.0013411182444542646,
910
- "learning_rate": 4.180085156900274e-06,
911
- "loss": 0.0228,
912
  "step": 2580
913
  },
914
  {
915
  "epoch": 41.6,
916
- "grad_norm": 0.0015802403213456273,
917
- "learning_rate": 3.873720605154468e-06,
918
- "loss": 0.0135,
919
  "step": 2600
920
  },
921
  {
922
  "epoch": 41.92,
923
- "grad_norm": 0.002210975391790271,
924
- "learning_rate": 3.578070160774724e-06,
925
- "loss": 0.0267,
926
  "step": 2620
927
  },
928
  {
929
  "epoch": 42.24,
930
- "grad_norm": 0.23047080636024475,
931
- "learning_rate": 3.293283761691182e-06,
932
- "loss": 0.0218,
933
  "step": 2640
934
  },
935
  {
936
  "epoch": 42.56,
937
- "grad_norm": 0.2625073492527008,
938
- "learning_rate": 3.0195058361772277e-06,
939
- "loss": 0.0315,
940
  "step": 2660
941
  },
942
  {
943
  "epoch": 42.88,
944
- "grad_norm": 0.22485554218292236,
945
- "learning_rate": 2.756875229603295e-06,
946
- "loss": 0.022,
947
  "step": 2680
948
  },
949
  {
950
  "epoch": 43.2,
951
- "grad_norm": 0.001550053246319294,
952
- "learning_rate": 2.5055251340219855e-06,
953
- "loss": 0.016,
954
  "step": 2700
955
  },
956
  {
957
  "epoch": 43.52,
958
- "grad_norm": 0.22042883932590485,
959
- "learning_rate": 2.2655830206202655e-06,
960
- "loss": 0.0267,
961
  "step": 2720
962
  },
963
  {
964
  "epoch": 43.84,
965
- "grad_norm": 0.08822102099657059,
966
- "learning_rate": 2.037170575072944e-06,
967
- "loss": 0.0167,
968
  "step": 2740
969
  },
970
  {
971
  "epoch": 44.16,
972
- "grad_norm": 0.0016505387611687183,
973
- "learning_rate": 1.8204036358303173e-06,
974
- "loss": 0.0492,
975
  "step": 2760
976
  },
977
  {
978
  "epoch": 44.48,
979
- "grad_norm": 0.3908143639564514,
980
- "learning_rate": 1.615392135371116e-06,
981
- "loss": 0.0254,
982
  "step": 2780
983
  },
984
  {
985
  "epoch": 44.8,
986
- "grad_norm": 0.3156016767024994,
987
- "learning_rate": 1.4222400444507318e-06,
988
- "loss": 0.0136,
989
  "step": 2800
990
  },
991
  {
992
  "epoch": 45.12,
993
- "grad_norm": 0.001897096517495811,
994
- "learning_rate": 1.2410453193728493e-06,
995
- "loss": 0.0116,
996
  "step": 2820
997
  },
998
  {
999
  "epoch": 45.44,
1000
- "grad_norm": 0.20051489770412445,
1001
- "learning_rate": 1.0718998523113004e-06,
1002
- "loss": 0.0311,
1003
  "step": 2840
1004
  },
1005
  {
1006
  "epoch": 45.76,
1007
- "grad_norm": 0.0016040581976994872,
1008
- "learning_rate": 9.148894247073298e-07,
1009
- "loss": 0.0283,
1010
  "step": 2860
1011
  },
1012
  {
1013
  "epoch": 46.08,
1014
- "grad_norm": 0.0015667045954614878,
1015
- "learning_rate": 7.700936637658779e-07,
1016
- "loss": 0.0186,
1017
  "step": 2880
1018
  },
1019
  {
1020
  "epoch": 46.4,
1021
- "grad_norm": 0.04447433352470398,
1022
- "learning_rate": 6.375860020729541e-07,
1023
- "loss": 0.0229,
1024
  "step": 2900
1025
  },
1026
  {
1027
  "epoch": 46.72,
1028
- "grad_norm": 0.20941714942455292,
1029
- "learning_rate": 5.174336403546226e-07,
1030
- "loss": 0.0268,
1031
  "step": 2920
1032
  },
1033
  {
1034
  "epoch": 47.04,
1035
- "grad_norm": 0.0015096565475687385,
1036
- "learning_rate": 4.096975133963954e-07,
1037
- "loss": 0.0161,
1038
  "step": 2940
1039
  },
1040
  {
1041
  "epoch": 47.36,
1042
- "grad_norm": 0.19307351112365723,
1043
- "learning_rate": 3.144322591404292e-07,
1044
- "loss": 0.0113,
1045
  "step": 2960
1046
  },
1047
  {
1048
  "epoch": 47.68,
1049
- "grad_norm": 0.002177381655201316,
1050
- "learning_rate": 2.316861909760909e-07,
1051
- "loss": 0.0222,
1052
  "step": 2980
1053
  },
1054
  {
1055
  "epoch": 48.0,
1056
- "grad_norm": 0.09334852546453476,
1057
- "learning_rate": 1.6150127323803222e-07,
1058
- "loss": 0.028,
1059
  "step": 3000
1060
  },
1061
  {
1062
  "epoch": 48.32,
1063
- "grad_norm": 0.09507758915424347,
1064
- "learning_rate": 1.0391309992413833e-07,
1065
- "loss": 0.0217,
1066
  "step": 3020
1067
  },
1068
  {
1069
  "epoch": 48.64,
1070
- "grad_norm": 0.0010601489339023829,
1071
- "learning_rate": 5.895087664417876e-08,
1072
- "loss": 0.0191,
1073
  "step": 3040
1074
  },
1075
  {
1076
  "epoch": 48.96,
1077
- "grad_norm": 0.001158875529654324,
1078
- "learning_rate": 2.6637405808302428e-08,
1079
- "loss": 0.0162,
1080
  "step": 3060
1081
  },
1082
  {
1083
  "epoch": 49.28,
1084
- "grad_norm": 0.0015347091248258948,
1085
- "learning_rate": 6.989075062879824e-09,
1086
- "loss": 0.0146,
1087
  "step": 3080
1088
  },
1089
  {
1090
  "epoch": 49.6,
1091
- "grad_norm": 0.0010455228621140122,
1092
- "learning_rate": 1.584897958428755e-11,
1093
- "loss": 0.0217,
1094
  "step": 3100
1095
  },
1096
  {
1097
- "epoch": 49.6,
1098
- "step": 3100,
1099
- "total_flos": 1.7939255986343117e+17,
1100
- "train_loss": 0.172668604437382,
1101
- "train_runtime": 11274.7291,
1102
- "train_samples_per_second": 2.217,
1103
- "train_steps_per_second": 0.275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1104
  }
1105
  ],
1106
  "logging_steps": 20,
1107
- "max_steps": 3100,
1108
  "num_input_tokens_seen": 0,
1109
- "num_train_epochs": 50,
1110
  "save_steps": 1000,
1111
  "stateful_callbacks": {
1112
  "TrainerControl": {
@@ -1120,7 +1442,7 @@
1120
  "attributes": {}
1121
  }
1122
  },
1123
- "total_flos": 1.7939255986343117e+17,
1124
  "train_batch_size": 2,
1125
  "trial_name": null,
1126
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 64.48,
5
  "eval_steps": 500,
6
+ "global_step": 4030,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.32,
13
+ "grad_norm": 2.246424436569214,
14
+ "learning_rate": 2.3573200992555833e-06,
15
+ "loss": 2.826,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.64,
20
+ "grad_norm": 0.9050242900848389,
21
+ "learning_rate": 4.838709677419355e-06,
22
+ "loss": 2.72,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.96,
27
+ "grad_norm": 2.6034655570983887,
28
+ "learning_rate": 7.320099255583126e-06,
29
+ "loss": 2.4912,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 1.28,
34
+ "grad_norm": 1.3487274646759033,
35
+ "learning_rate": 9.801488833746898e-06,
36
+ "loss": 2.0561,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 1.6,
41
+ "grad_norm": 1.6185756921768188,
42
+ "learning_rate": 1.2282878411910669e-05,
43
+ "loss": 1.7744,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 1.92,
48
+ "grad_norm": 3.017139196395874,
49
+ "learning_rate": 1.4764267990074444e-05,
50
+ "loss": 1.8387,
51
  "step": 120
52
  },
53
  {
54
  "epoch": 2.24,
55
+ "grad_norm": 2.2100813388824463,
56
+ "learning_rate": 1.7245657568238215e-05,
57
+ "loss": 1.4478,
58
  "step": 140
59
  },
60
  {
61
  "epoch": 2.56,
62
+ "grad_norm": 1.574629545211792,
63
+ "learning_rate": 1.9727047146401986e-05,
64
+ "loss": 1.285,
65
  "step": 160
66
  },
67
  {
68
  "epoch": 2.88,
69
+ "grad_norm": 4.586638450622559,
70
+ "learning_rate": 2.2208436724565757e-05,
71
+ "loss": 1.2235,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 3.2,
76
+ "grad_norm": 2.7081515789031982,
77
+ "learning_rate": 2.468982630272953e-05,
78
+ "loss": 0.9575,
79
  "step": 200
80
  },
81
  {
82
  "epoch": 3.52,
83
+ "grad_norm": 0.9670729041099548,
84
+ "learning_rate": 2.7171215880893302e-05,
85
+ "loss": 0.7086,
86
  "step": 220
87
  },
88
  {
89
  "epoch": 3.84,
90
+ "grad_norm": 3.229243040084839,
91
+ "learning_rate": 2.9652605459057077e-05,
92
+ "loss": 0.8587,
93
  "step": 240
94
  },
95
  {
96
  "epoch": 4.16,
97
+ "grad_norm": 1.1293463706970215,
98
+ "learning_rate": 3.2133995037220844e-05,
99
+ "loss": 0.5978,
100
  "step": 260
101
  },
102
  {
103
  "epoch": 4.48,
104
+ "grad_norm": 1.7043830156326294,
105
+ "learning_rate": 3.461538461538462e-05,
106
+ "loss": 0.4668,
107
  "step": 280
108
  },
109
  {
110
  "epoch": 4.8,
111
+ "grad_norm": 2.565268039703369,
112
+ "learning_rate": 3.7096774193548386e-05,
113
+ "loss": 0.5667,
114
  "step": 300
115
  },
116
  {
117
  "epoch": 5.12,
118
+ "grad_norm": 1.158849835395813,
119
+ "learning_rate": 3.957816377171216e-05,
120
+ "loss": 0.4373,
121
  "step": 320
122
  },
123
  {
124
  "epoch": 5.44,
125
+ "grad_norm": 2.714164972305298,
126
+ "learning_rate": 4.205955334987593e-05,
127
+ "loss": 0.3492,
128
  "step": 340
129
  },
130
  {
131
  "epoch": 5.76,
132
+ "grad_norm": 2.2089672088623047,
133
+ "learning_rate": 4.45409429280397e-05,
134
+ "loss": 0.4018,
135
  "step": 360
136
  },
137
  {
138
  "epoch": 6.08,
139
+ "grad_norm": 1.8179335594177246,
140
+ "learning_rate": 4.702233250620348e-05,
141
+ "loss": 0.279,
142
  "step": 380
143
  },
144
  {
145
  "epoch": 6.4,
146
+ "grad_norm": 1.4858269691467285,
147
+ "learning_rate": 4.950372208436725e-05,
148
+ "loss": 0.2362,
149
  "step": 400
150
  },
151
  {
152
  "epoch": 6.72,
153
+ "grad_norm": 1.7704375982284546,
154
+ "learning_rate": 4.99975992459978e-05,
155
+ "loss": 0.2665,
156
  "step": 420
157
  },
158
  {
159
  "epoch": 7.04,
160
+ "grad_norm": 1.2611212730407715,
161
+ "learning_rate": 4.9987846973104825e-05,
162
+ "loss": 0.2029,
163
  "step": 440
164
  },
165
  {
166
  "epoch": 7.36,
167
+ "grad_norm": 2.994542360305786,
168
+ "learning_rate": 4.9970596058519116e-05,
169
+ "loss": 0.1747,
170
  "step": 460
171
  },
172
  {
173
  "epoch": 7.68,
174
+ "grad_norm": 2.7456889152526855,
175
+ "learning_rate": 4.994585167909436e-05,
176
+ "loss": 0.1486,
177
  "step": 480
178
  },
179
  {
180
  "epoch": 8.0,
181
+ "grad_norm": 1.8236416578292847,
182
+ "learning_rate": 4.9913621260409695e-05,
183
+ "loss": 0.1866,
184
  "step": 500
185
  },
186
  {
187
  "epoch": 8.32,
188
+ "grad_norm": 2.636003017425537,
189
+ "learning_rate": 4.987391447454136e-05,
190
+ "loss": 0.1476,
191
  "step": 520
192
  },
193
  {
194
  "epoch": 8.64,
195
+ "grad_norm": 2.879154920578003,
196
+ "learning_rate": 4.982674323716023e-05,
197
+ "loss": 0.1403,
198
  "step": 540
199
  },
200
  {
201
  "epoch": 8.96,
202
+ "grad_norm": 0.9377075433731079,
203
+ "learning_rate": 4.977212170395598e-05,
204
+ "loss": 0.1018,
205
  "step": 560
206
  },
207
  {
208
  "epoch": 9.28,
209
+ "grad_norm": 0.311233788728714,
210
+ "learning_rate": 4.9710066266389074e-05,
211
+ "loss": 0.0992,
212
  "step": 580
213
  },
214
  {
215
  "epoch": 9.6,
216
+ "grad_norm": 0.8316205739974976,
217
+ "learning_rate": 4.964059554677187e-05,
218
+ "loss": 0.1134,
219
  "step": 600
220
  },
221
  {
222
  "epoch": 9.92,
223
+ "grad_norm": 2.567354679107666,
224
+ "learning_rate": 4.956373039268022e-05,
225
+ "loss": 0.0781,
226
  "step": 620
227
  },
228
  {
229
  "epoch": 10.24,
230
+ "grad_norm": 0.0829504132270813,
231
+ "learning_rate": 4.947949387069721e-05,
232
+ "loss": 0.0892,
233
  "step": 640
234
  },
235
  {
236
  "epoch": 10.56,
237
+ "grad_norm": 0.8588472008705139,
238
+ "learning_rate": 4.938791125949119e-05,
239
+ "loss": 0.0499,
240
  "step": 660
241
  },
242
  {
243
  "epoch": 10.88,
244
+ "grad_norm": 1.2792423963546753,
245
+ "learning_rate": 4.9289010042229765e-05,
246
+ "loss": 0.0831,
247
  "step": 680
248
  },
249
  {
250
  "epoch": 11.2,
251
+ "grad_norm": 0.4728279709815979,
252
+ "learning_rate": 4.918281989833238e-05,
253
+ "loss": 0.0715,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 11.52,
258
+ "grad_norm": 2.5855355262756348,
259
+ "learning_rate": 4.9069372694563756e-05,
260
+ "loss": 0.0718,
261
  "step": 720
262
  },
263
  {
264
  "epoch": 11.84,
265
+ "grad_norm": 0.8059779405593872,
266
+ "learning_rate": 4.8948702475470933e-05,
267
+ "loss": 0.0849,
268
  "step": 740
269
  },
270
  {
271
  "epoch": 12.16,
272
+ "grad_norm": 1.2841193675994873,
273
+ "learning_rate": 4.882084545316684e-05,
274
+ "loss": 0.0683,
275
  "step": 760
276
  },
277
  {
278
  "epoch": 12.48,
279
+ "grad_norm": 1.3422589302062988,
280
+ "learning_rate": 4.868583999646329e-05,
281
+ "loss": 0.0808,
282
  "step": 780
283
  },
284
  {
285
  "epoch": 12.8,
286
+ "grad_norm": 1.3376965522766113,
287
+ "learning_rate": 4.8543726619356846e-05,
288
+ "loss": 0.0607,
289
  "step": 800
290
  },
291
  {
292
  "epoch": 13.12,
293
+ "grad_norm": 1.008899450302124,
294
+ "learning_rate": 4.83945479688709e-05,
295
+ "loss": 0.062,
296
  "step": 820
297
  },
298
  {
299
  "epoch": 13.44,
300
+ "grad_norm": 0.441413551568985,
301
+ "learning_rate": 4.8238348812257684e-05,
302
+ "loss": 0.0461,
303
  "step": 840
304
  },
305
  {
306
  "epoch": 13.76,
307
+ "grad_norm": 1.296985149383545,
308
+ "learning_rate": 4.808349953928184e-05,
309
+ "loss": 0.0482,
310
  "step": 860
311
  },
312
  {
313
  "epoch": 14.08,
314
+ "grad_norm": 0.035805635154247284,
315
+ "learning_rate": 4.791374712344622e-05,
316
+ "loss": 0.0388,
317
  "step": 880
318
  },
319
  {
320
  "epoch": 14.4,
321
+ "grad_norm": 0.10618308186531067,
322
+ "learning_rate": 4.7737118485753564e-05,
323
+ "loss": 0.0251,
324
  "step": 900
325
  },
326
  {
327
  "epoch": 14.72,
328
+ "grad_norm": 0.866423487663269,
329
+ "learning_rate": 4.75536666309653e-05,
330
+ "loss": 0.0515,
331
  "step": 920
332
  },
333
  {
334
  "epoch": 15.04,
335
+ "grad_norm": 0.5916399955749512,
336
+ "learning_rate": 4.73634466114326e-05,
337
+ "loss": 0.0536,
338
  "step": 940
339
  },
340
  {
341
  "epoch": 15.36,
342
+ "grad_norm": 0.1653570532798767,
343
+ "learning_rate": 4.7166515510575676e-05,
344
+ "loss": 0.0392,
345
  "step": 960
346
  },
347
  {
348
  "epoch": 15.68,
349
+ "grad_norm": 0.027391331270337105,
350
+ "learning_rate": 4.696293242575356e-05,
351
+ "loss": 0.0369,
352
  "step": 980
353
  },
354
  {
355
  "epoch": 16.0,
356
+ "grad_norm": 2.17256760597229,
357
+ "learning_rate": 4.675275845052942e-05,
358
+ "loss": 0.0651,
359
  "step": 1000
360
  },
361
  {
362
  "epoch": 16.32,
363
+ "grad_norm": 0.8612786531448364,
364
+ "learning_rate": 4.6536056656336947e-05,
365
+ "loss": 0.037,
366
  "step": 1020
367
  },
368
  {
369
  "epoch": 16.64,
370
+ "grad_norm": 4.489969253540039,
371
+ "learning_rate": 4.631289207355313e-05,
372
+ "loss": 0.0272,
373
  "step": 1040
374
  },
375
  {
376
  "epoch": 16.96,
377
+ "grad_norm": 0.4311043918132782,
378
+ "learning_rate": 4.6083331671983185e-05,
379
+ "loss": 0.0507,
380
  "step": 1060
381
  },
382
  {
383
  "epoch": 17.28,
384
+ "grad_norm": 0.4327545762062073,
385
+ "learning_rate": 4.584744434076352e-05,
386
+ "loss": 0.0274,
387
  "step": 1080
388
  },
389
  {
390
  "epoch": 17.6,
391
+ "grad_norm": 0.12099918723106384,
392
+ "learning_rate": 4.560530086768863e-05,
393
+ "loss": 0.0565,
394
  "step": 1100
395
  },
396
  {
397
  "epoch": 17.92,
398
+ "grad_norm": 0.103216253221035,
399
+ "learning_rate": 4.535697391796832e-05,
400
+ "loss": 0.0425,
401
  "step": 1120
402
  },
403
  {
404
  "epoch": 18.24,
405
+ "grad_norm": 0.419209748506546,
406
+ "learning_rate": 4.510253801242147e-05,
407
+ "loss": 0.0273,
408
  "step": 1140
409
  },
410
  {
411
  "epoch": 18.56,
412
+ "grad_norm": 1.3193784952163696,
413
+ "learning_rate": 4.4842069505112984e-05,
414
+ "loss": 0.0438,
415
  "step": 1160
416
  },
417
  {
418
  "epoch": 18.88,
419
+ "grad_norm": 1.5185387134552002,
420
+ "learning_rate": 4.457564656044056e-05,
421
+ "loss": 0.0544,
422
  "step": 1180
423
  },
424
  {
425
  "epoch": 19.2,
426
+ "grad_norm": 0.4024270474910736,
427
+ "learning_rate": 4.430334912967824e-05,
428
+ "loss": 0.0283,
429
  "step": 1200
430
  },
431
  {
432
  "epoch": 19.52,
433
+ "grad_norm": 0.16141988337039948,
434
+ "learning_rate": 4.402525892698367e-05,
435
+ "loss": 0.0393,
436
  "step": 1220
437
  },
438
  {
439
  "epoch": 19.84,
440
+ "grad_norm": 0.07228437811136246,
441
+ "learning_rate": 4.374145940487641e-05,
442
+ "loss": 0.0249,
443
  "step": 1240
444
  },
445
  {
446
  "epoch": 20.16,
447
+ "grad_norm": 0.7919737696647644,
448
+ "learning_rate": 4.345203572919454e-05,
449
+ "loss": 0.0293,
450
  "step": 1260
451
  },
452
  {
453
  "epoch": 20.48,
454
+ "grad_norm": 0.26585039496421814,
455
+ "learning_rate": 4.315707475353706e-05,
456
+ "loss": 0.0287,
457
  "step": 1280
458
  },
459
  {
460
  "epoch": 20.8,
461
+ "grad_norm": 0.5761149525642395,
462
+ "learning_rate": 4.285666499319992e-05,
463
+ "loss": 0.0521,
464
  "step": 1300
465
  },
466
  {
467
  "epoch": 21.12,
468
+ "grad_norm": 0.018601374700665474,
469
+ "learning_rate": 4.25508965986133e-05,
470
+ "loss": 0.0285,
471
  "step": 1320
472
  },
473
  {
474
  "epoch": 21.44,
475
+ "grad_norm": 0.00528874434530735,
476
+ "learning_rate": 4.2239861328288214e-05,
477
+ "loss": 0.0346,
478
  "step": 1340
479
  },
480
  {
481
  "epoch": 21.76,
482
+ "grad_norm": 0.3073647618293762,
483
+ "learning_rate": 4.1923652521280585e-05,
484
+ "loss": 0.022,
485
  "step": 1360
486
  },
487
  {
488
  "epoch": 22.08,
489
+ "grad_norm": 0.42911043763160706,
490
+ "learning_rate": 4.160236506918098e-05,
491
+ "loss": 0.0482,
492
  "step": 1380
493
  },
494
  {
495
  "epoch": 22.4,
496
+ "grad_norm": 0.6457176804542542,
497
+ "learning_rate": 4.127609538763842e-05,
498
+ "loss": 0.019,
499
  "step": 1400
500
  },
501
  {
502
  "epoch": 22.72,
503
+ "grad_norm": 2.3716557025909424,
504
+ "learning_rate": 4.094494138742685e-05,
505
+ "loss": 0.0312,
506
  "step": 1420
507
  },
508
  {
509
  "epoch": 23.04,
510
+ "grad_norm": 0.01667410507798195,
511
+ "learning_rate": 4.0609002445063036e-05,
512
+ "loss": 0.0377,
513
  "step": 1440
514
  },
515
  {
516
  "epoch": 23.36,
517
+ "grad_norm": 0.6381007432937622,
518
+ "learning_rate": 4.02683793729844e-05,
519
+ "loss": 0.0307,
520
  "step": 1460
521
  },
522
  {
523
  "epoch": 23.68,
524
+ "grad_norm": 0.42919328808784485,
525
+ "learning_rate": 3.9923174389296085e-05,
526
+ "loss": 0.0419,
527
  "step": 1480
528
  },
529
  {
530
  "epoch": 24.0,
531
+ "grad_norm": 0.01456019002944231,
532
+ "learning_rate": 3.957349108709623e-05,
533
+ "loss": 0.0223,
534
  "step": 1500
535
  },
536
  {
537
  "epoch": 24.32,
538
+ "grad_norm": 0.31073492765426636,
539
+ "learning_rate": 3.921943440338849e-05,
540
+ "loss": 0.0209,
541
  "step": 1520
542
  },
543
  {
544
  "epoch": 24.64,
545
+ "grad_norm": 0.38279736042022705,
546
+ "learning_rate": 3.886111058759132e-05,
547
+ "loss": 0.0491,
548
  "step": 1540
549
  },
550
  {
551
  "epoch": 24.96,
552
+ "grad_norm": 0.30651962757110596,
553
+ "learning_rate": 3.849862716965352e-05,
554
+ "loss": 0.0298,
555
  "step": 1560
556
  },
557
  {
558
  "epoch": 25.28,
559
+ "grad_norm": 0.4538489580154419,
560
+ "learning_rate": 3.813209292778527e-05,
561
+ "loss": 0.0319,
562
  "step": 1580
563
  },
564
  {
565
  "epoch": 25.6,
566
+ "grad_norm": 0.11643072962760925,
567
+ "learning_rate": 3.776161785581481e-05,
568
+ "loss": 0.0302,
569
  "step": 1600
570
  },
571
  {
572
  "epoch": 25.92,
573
+ "grad_norm": 0.008515519089996815,
574
+ "learning_rate": 3.738731313018019e-05,
575
+ "loss": 0.04,
576
  "step": 1620
577
  },
578
  {
579
  "epoch": 26.24,
580
+ "grad_norm": 0.002214708598330617,
581
+ "learning_rate": 3.700929107656614e-05,
582
  "loss": 0.0354,
583
  "step": 1640
584
  },
585
  {
586
  "epoch": 26.56,
587
+ "grad_norm": 0.02200801856815815,
588
+ "learning_rate": 3.662766513619611e-05,
589
+ "loss": 0.0186,
590
  "step": 1660
591
  },
592
  {
593
  "epoch": 26.88,
594
+ "grad_norm": 0.1882447600364685,
595
+ "learning_rate": 3.62425498317895e-05,
596
+ "loss": 0.022,
597
  "step": 1680
598
  },
599
  {
600
  "epoch": 27.2,
601
+ "grad_norm": 0.004948125686496496,
602
+ "learning_rate": 3.585406073319439e-05,
603
+ "loss": 0.015,
604
  "step": 1700
605
  },
606
  {
607
  "epoch": 27.52,
608
+ "grad_norm": 0.3387264013290405,
609
+ "learning_rate": 3.546231442270596e-05,
610
+ "loss": 0.0381,
611
  "step": 1720
612
  },
613
  {
614
  "epoch": 27.84,
615
+ "grad_norm": 0.09048642963171005,
616
+ "learning_rate": 3.506742846008116e-05,
617
+ "loss": 0.0277,
618
  "step": 1740
619
  },
620
  {
621
  "epoch": 28.16,
622
+ "grad_norm": 0.6405784487724304,
623
+ "learning_rate": 3.4669521347259996e-05,
624
+ "loss": 0.0423,
625
  "step": 1760
626
  },
627
  {
628
  "epoch": 28.48,
629
+ "grad_norm": 0.16012047231197357,
630
+ "learning_rate": 3.426871249280414e-05,
631
+ "loss": 0.0115,
632
  "step": 1780
633
  },
634
  {
635
  "epoch": 28.8,
636
+ "grad_norm": 0.3279825448989868,
637
+ "learning_rate": 3.386512217606339e-05,
638
+ "loss": 0.0275,
639
  "step": 1800
640
  },
641
  {
642
  "epoch": 29.12,
643
+ "grad_norm": 0.005494344513863325,
644
+ "learning_rate": 3.345887151108087e-05,
645
+ "loss": 0.0309,
646
  "step": 1820
647
  },
648
  {
649
  "epoch": 29.44,
650
+ "grad_norm": 0.0037028896622359753,
651
+ "learning_rate": 3.305008241024774e-05,
652
+ "loss": 0.0294,
653
  "step": 1840
654
  },
655
  {
656
  "epoch": 29.76,
657
+ "grad_norm": 0.003084386931732297,
658
+ "learning_rate": 3.2638877547718264e-05,
659
+ "loss": 0.0213,
660
  "step": 1860
661
  },
662
  {
663
  "epoch": 30.08,
664
+ "grad_norm": 0.0017954249633476138,
665
+ "learning_rate": 3.222538032259643e-05,
666
+ "loss": 0.0326,
667
  "step": 1880
668
  },
669
  {
670
  "epoch": 30.4,
671
+ "grad_norm": 0.26840922236442566,
672
+ "learning_rate": 3.1809714821904834e-05,
673
+ "loss": 0.0249,
674
  "step": 1900
675
  },
676
  {
677
  "epoch": 30.72,
678
+ "grad_norm": 0.7214370965957642,
679
+ "learning_rate": 3.1392005783347244e-05,
680
+ "loss": 0.0115,
681
  "step": 1920
682
  },
683
  {
684
  "epoch": 31.04,
685
+ "grad_norm": 0.1613769233226776,
686
+ "learning_rate": 3.0972378557875884e-05,
687
+ "loss": 0.0322,
688
  "step": 1940
689
  },
690
  {
691
  "epoch": 31.36,
692
+ "grad_norm": 0.18066717684268951,
693
+ "learning_rate": 3.055095907207465e-05,
694
+ "loss": 0.0316,
695
  "step": 1960
696
  },
697
  {
698
  "epoch": 31.68,
699
+ "grad_norm": 0.24756371974945068,
700
+ "learning_rate": 3.0127873790369627e-05,
701
+ "loss": 0.0248,
702
  "step": 1980
703
  },
704
  {
705
  "epoch": 32.0,
706
+ "grad_norm": 0.08604203909635544,
707
+ "learning_rate": 2.9703249677078156e-05,
708
+ "loss": 0.0234,
709
  "step": 2000
710
  },
711
  {
712
  "epoch": 32.32,
713
+ "grad_norm": 0.0022385423071682453,
714
+ "learning_rate": 2.9277214158307937e-05,
715
+ "loss": 0.0277,
716
  "step": 2020
717
  },
718
  {
719
  "epoch": 32.64,
720
+ "grad_norm": 0.0020592950750142336,
721
+ "learning_rate": 2.8849895083717537e-05,
722
+ "loss": 0.0162,
723
  "step": 2040
724
  },
725
  {
726
  "epoch": 32.96,
727
+ "grad_norm": 0.20633552968502045,
728
+ "learning_rate": 2.842142068814977e-05,
729
+ "loss": 0.022,
730
  "step": 2060
731
  },
732
  {
733
  "epoch": 33.28,
734
+ "grad_norm": 0.0019172705942764878,
735
+ "learning_rate": 2.7991919553149497e-05,
736
+ "loss": 0.0278,
737
  "step": 2080
738
  },
739
  {
740
  "epoch": 33.6,
741
+ "grad_norm": 0.0013098755152896047,
742
+ "learning_rate": 2.756152056837743e-05,
743
+ "loss": 0.0189,
744
  "step": 2100
745
  },
746
  {
747
  "epoch": 33.92,
748
+ "grad_norm": 0.09349821507930756,
749
+ "learning_rate": 2.7130352892931388e-05,
750
+ "loss": 0.0228,
751
  "step": 2120
752
  },
753
  {
754
  "epoch": 34.24,
755
+ "grad_norm": 0.0017231553792953491,
756
+ "learning_rate": 2.669854591658679e-05,
757
+ "loss": 0.0319,
758
  "step": 2140
759
  },
760
  {
761
  "epoch": 34.56,
762
+ "grad_norm": 0.047173839062452316,
763
+ "learning_rate": 2.6266229220967818e-05,
764
+ "loss": 0.0153,
765
  "step": 2160
766
  },
767
  {
768
  "epoch": 34.88,
769
+ "grad_norm": 0.2877206802368164,
770
+ "learning_rate": 2.5833532540661127e-05,
771
+ "loss": 0.0267,
772
  "step": 2180
773
  },
774
  {
775
  "epoch": 35.2,
776
+ "grad_norm": 0.25823402404785156,
777
+ "learning_rate": 2.540058572428356e-05,
778
+ "loss": 0.0178,
779
  "step": 2200
780
  },
781
  {
782
  "epoch": 35.52,
783
+ "grad_norm": 0.23003694415092468,
784
+ "learning_rate": 2.496751869551567e-05,
785
+ "loss": 0.0217,
786
  "step": 2220
787
  },
788
  {
789
  "epoch": 35.84,
790
+ "grad_norm": 0.23193888366222382,
791
+ "learning_rate": 2.453446141411273e-05,
792
+ "loss": 0.017,
793
  "step": 2240
794
  },
795
  {
796
  "epoch": 36.16,
797
+ "grad_norm": 0.1941184252500534,
798
+ "learning_rate": 2.4101543836904938e-05,
799
+ "loss": 0.0257,
800
  "step": 2260
801
  },
802
  {
803
  "epoch": 36.48,
804
+ "grad_norm": 0.012731954455375671,
805
+ "learning_rate": 2.3668895878798424e-05,
806
+ "loss": 0.0237,
807
  "step": 2280
808
  },
809
  {
810
  "epoch": 36.8,
811
+ "grad_norm": 0.18219026923179626,
812
+ "learning_rate": 2.32366473737889e-05,
813
+ "loss": 0.024,
814
  "step": 2300
815
  },
816
  {
817
  "epoch": 37.12,
818
+ "grad_norm": 0.256547212600708,
819
+ "learning_rate": 2.2804928035999594e-05,
820
+ "loss": 0.0225,
821
  "step": 2320
822
  },
823
  {
824
  "epoch": 37.44,
825
+ "grad_norm": 0.45314905047416687,
826
+ "learning_rate": 2.23738674207551e-05,
827
+ "loss": 0.0239,
828
  "step": 2340
829
  },
830
  {
831
  "epoch": 37.76,
832
+ "grad_norm": 0.3919714689254761,
833
+ "learning_rate": 2.1943594885702984e-05,
834
+ "loss": 0.0235,
835
  "step": 2360
836
  },
837
  {
838
  "epoch": 38.08,
839
+ "grad_norm": 0.0769328773021698,
840
+ "learning_rate": 2.151423955199456e-05,
841
+ "loss": 0.0286,
842
  "step": 2380
843
  },
844
  {
845
  "epoch": 38.4,
846
+ "grad_norm": 0.3520802855491638,
847
+ "learning_rate": 2.108593026553681e-05,
848
+ "loss": 0.0323,
849
  "step": 2400
850
  },
851
  {
852
  "epoch": 38.72,
853
+ "grad_norm": 0.3691672384738922,
854
+ "learning_rate": 2.0658795558326743e-05,
855
+ "loss": 0.0241,
856
  "step": 2420
857
  },
858
  {
859
  "epoch": 39.04,
860
+ "grad_norm": 0.001480752951465547,
861
+ "learning_rate": 2.0232963609880093e-05,
862
+ "loss": 0.0158,
863
  "step": 2440
864
  },
865
  {
866
  "epoch": 39.36,
867
+ "grad_norm": 0.31921085715293884,
868
+ "learning_rate": 1.9808562208765667e-05,
869
+ "loss": 0.0241,
870
  "step": 2460
871
  },
872
  {
873
  "epoch": 39.68,
874
+ "grad_norm": 0.20936931669712067,
875
+ "learning_rate": 1.938571871425715e-05,
876
+ "loss": 0.0174,
877
  "step": 2480
878
  },
879
  {
880
  "epoch": 40.0,
881
+ "grad_norm": 0.0011563162552192807,
882
+ "learning_rate": 1.896456001811357e-05,
883
+ "loss": 0.0183,
884
  "step": 2500
885
  },
886
  {
887
  "epoch": 40.32,
888
+ "grad_norm": 0.19230084121227264,
889
+ "learning_rate": 1.854521250650026e-05,
890
+ "loss": 0.012,
891
  "step": 2520
892
  },
893
  {
894
  "epoch": 40.64,
895
+ "grad_norm": 0.32013317942619324,
896
+ "learning_rate": 1.8127802022061334e-05,
897
+ "loss": 0.0225,
898
  "step": 2540
899
  },
900
  {
901
  "epoch": 40.96,
902
+ "grad_norm": 0.11989307403564453,
903
+ "learning_rate": 1.7712453826155457e-05,
904
+ "loss": 0.0391,
905
  "step": 2560
906
  },
907
  {
908
  "epoch": 41.28,
909
+ "grad_norm": 0.0009496643324382603,
910
+ "learning_rate": 1.72992925612659e-05,
911
+ "loss": 0.0229,
912
  "step": 2580
913
  },
914
  {
915
  "epoch": 41.6,
916
+ "grad_norm": 0.0012078011641278863,
917
+ "learning_rate": 1.688844221359645e-05,
918
+ "loss": 0.015,
919
  "step": 2600
920
  },
921
  {
922
  "epoch": 41.92,
923
+ "grad_norm": 0.0012093032710254192,
924
+ "learning_rate": 1.6480026075864163e-05,
925
+ "loss": 0.0287,
926
  "step": 2620
927
  },
928
  {
929
  "epoch": 42.24,
930
+ "grad_norm": 0.2027181088924408,
931
+ "learning_rate": 1.6074166710300247e-05,
932
+ "loss": 0.0229,
933
  "step": 2640
934
  },
935
  {
936
  "epoch": 42.56,
937
+ "grad_norm": 0.2977555990219116,
938
+ "learning_rate": 1.567098591187021e-05,
939
+ "loss": 0.0352,
940
  "step": 2660
941
  },
942
  {
943
  "epoch": 42.88,
944
+ "grad_norm": 0.36129167675971985,
945
+ "learning_rate": 1.5270604671724188e-05,
946
+ "loss": 0.0242,
947
  "step": 2680
948
  },
949
  {
950
  "epoch": 43.2,
951
+ "grad_norm": 0.001115540275350213,
952
+ "learning_rate": 1.4873143140888538e-05,
953
+ "loss": 0.0165,
954
  "step": 2700
955
  },
956
  {
957
  "epoch": 43.52,
958
+ "grad_norm": 0.19148553907871246,
959
+ "learning_rate": 1.4478720594209532e-05,
960
+ "loss": 0.0274,
961
  "step": 2720
962
  },
963
  {
964
  "epoch": 43.84,
965
+ "grad_norm": 0.057757727801799774,
966
+ "learning_rate": 1.4087455394559984e-05,
967
+ "loss": 0.0185,
968
  "step": 2740
969
  },
970
  {
971
  "epoch": 44.16,
972
+ "grad_norm": 0.0009874219540506601,
973
+ "learning_rate": 1.369946495731954e-05,
974
+ "loss": 0.0509,
975
  "step": 2760
976
  },
977
  {
978
  "epoch": 44.48,
979
+ "grad_norm": 0.3896861672401428,
980
+ "learning_rate": 1.3314865715139346e-05,
981
+ "loss": 0.027,
982
  "step": 2780
983
  },
984
  {
985
  "epoch": 44.8,
986
+ "grad_norm": 0.19004037976264954,
987
+ "learning_rate": 1.2933773083001517e-05,
988
+ "loss": 0.0163,
989
  "step": 2800
990
  },
991
  {
992
  "epoch": 45.12,
993
+ "grad_norm": 0.0009183284710161388,
994
+ "learning_rate": 1.255630142358421e-05,
995
+ "loss": 0.0125,
996
  "step": 2820
997
  },
998
  {
999
  "epoch": 45.44,
1000
+ "grad_norm": 0.1238480657339096,
1001
+ "learning_rate": 1.2182564012942193e-05,
1002
+ "loss": 0.0327,
1003
  "step": 2840
1004
  },
1005
  {
1006
  "epoch": 45.76,
1007
+ "grad_norm": 0.0009572324343025684,
1008
+ "learning_rate": 1.1812673006513789e-05,
1009
+ "loss": 0.0302,
1010
  "step": 2860
1011
  },
1012
  {
1013
  "epoch": 46.08,
1014
+ "grad_norm": 0.0011610776418820024,
1015
+ "learning_rate": 1.14467394054639e-05,
1016
+ "loss": 0.0209,
1017
  "step": 2880
1018
  },
1019
  {
1020
  "epoch": 46.4,
1021
+ "grad_norm": 0.04993343725800514,
1022
+ "learning_rate": 1.108487302337353e-05,
1023
+ "loss": 0.025,
1024
  "step": 2900
1025
  },
1026
  {
1027
  "epoch": 46.72,
1028
+ "grad_norm": 0.1806841343641281,
1029
+ "learning_rate": 1.0727182453285647e-05,
1030
+ "loss": 0.0284,
1031
  "step": 2920
1032
  },
1033
  {
1034
  "epoch": 47.04,
1035
+ "grad_norm": 0.0011777572799474,
1036
+ "learning_rate": 1.0373775035117305e-05,
1037
+ "loss": 0.0174,
1038
  "step": 2940
1039
  },
1040
  {
1041
  "epoch": 47.36,
1042
+ "grad_norm": 0.14497865736484528,
1043
+ "learning_rate": 1.002475682344792e-05,
1044
+ "loss": 0.0115,
1045
  "step": 2960
1046
  },
1047
  {
1048
  "epoch": 47.68,
1049
+ "grad_norm": 0.0014984839363023639,
1050
+ "learning_rate": 9.680232555693067e-06,
1051
+ "loss": 0.0238,
1052
  "step": 2980
1053
  },
1054
  {
1055
  "epoch": 48.0,
1056
+ "grad_norm": 0.07430601865053177,
1057
+ "learning_rate": 9.340305620673778e-06,
1058
+ "loss": 0.0294,
1059
  "step": 3000
1060
  },
1061
  {
1062
  "epoch": 48.32,
1063
+ "grad_norm": 0.07801785320043564,
1064
+ "learning_rate": 9.005078027590375e-06,
1065
+ "loss": 0.0226,
1066
  "step": 3020
1067
  },
1068
  {
1069
  "epoch": 48.64,
1070
+ "grad_norm": 0.0007196432561613619,
1071
+ "learning_rate": 8.67465037541038e-06,
1072
+ "loss": 0.0196,
1073
  "step": 3040
1074
  },
1075
  {
1076
  "epoch": 48.96,
1077
+ "grad_norm": 0.0008374506141990423,
1078
+ "learning_rate": 8.34912182267959e-06,
1079
+ "loss": 0.0175,
1080
  "step": 3060
1081
  },
1082
  {
1083
  "epoch": 49.28,
1084
+ "grad_norm": 0.0010465418454259634,
1085
+ "learning_rate": 8.028590057765523e-06,
1086
+ "loss": 0.015,
1087
  "step": 3080
1088
  },
1089
  {
1090
  "epoch": 49.6,
1091
+ "grad_norm": 0.0007761380402371287,
1092
+ "learning_rate": 7.713151269541844e-06,
1093
+ "loss": 0.0221,
1094
  "step": 3100
1095
  },
1096
  {
1097
+ "epoch": 49.92,
1098
+ "grad_norm": 0.0216947291046381,
1099
+ "learning_rate": 7.402900118522979e-06,
1100
+ "loss": 0.0161,
1101
+ "step": 3120
1102
+ },
1103
+ {
1104
+ "epoch": 50.24,
1105
+ "grad_norm": 0.26546710729599,
1106
+ "learning_rate": 7.097929708457282e-06,
1107
+ "loss": 0.0237,
1108
+ "step": 3140
1109
+ },
1110
+ {
1111
+ "epoch": 50.56,
1112
+ "grad_norm": 0.0011781662469729781,
1113
+ "learning_rate": 6.7983315583873695e-06,
1114
+ "loss": 0.0172,
1115
+ "step": 3160
1116
+ },
1117
+ {
1118
+ "epoch": 50.88,
1119
+ "grad_norm": 0.39518535137176514,
1120
+ "learning_rate": 6.504195575186009e-06,
1121
+ "loss": 0.0198,
1122
+ "step": 3180
1123
+ },
1124
+ {
1125
+ "epoch": 51.2,
1126
+ "grad_norm": 0.3506232500076294,
1127
+ "learning_rate": 6.215610026575916e-06,
1128
+ "loss": 0.0227,
1129
+ "step": 3200
1130
+ },
1131
+ {
1132
+ "epoch": 51.52,
1133
+ "grad_norm": 0.31244903802871704,
1134
+ "learning_rate": 5.93266151464123e-06,
1135
+ "loss": 0.0156,
1136
+ "step": 3220
1137
+ },
1138
+ {
1139
+ "epoch": 51.84,
1140
+ "grad_norm": 0.17840787768363953,
1141
+ "learning_rate": 5.655434949839061e-06,
1142
+ "loss": 0.0268,
1143
+ "step": 3240
1144
+ },
1145
+ {
1146
+ "epoch": 52.16,
1147
+ "grad_norm": 0.1670505702495575,
1148
+ "learning_rate": 5.384013525518541e-06,
1149
+ "loss": 0.0209,
1150
+ "step": 3260
1151
+ },
1152
+ {
1153
+ "epoch": 52.48,
1154
+ "grad_norm": 0.0010594127234071493,
1155
+ "learning_rate": 5.118478692955194e-06,
1156
+ "loss": 0.0202,
1157
+ "step": 3280
1158
+ },
1159
+ {
1160
+ "epoch": 52.8,
1161
+ "grad_norm": 0.0015649694250896573,
1162
+ "learning_rate": 4.858910136908123e-06,
1163
+ "loss": 0.0192,
1164
+ "step": 3300
1165
+ },
1166
+ {
1167
+ "epoch": 53.12,
1168
+ "grad_norm": 0.19762022793293,
1169
+ "learning_rate": 4.605385751707248e-06,
1170
+ "loss": 0.0205,
1171
+ "step": 3320
1172
+ },
1173
+ {
1174
+ "epoch": 53.44,
1175
+ "grad_norm": 0.2010522186756134,
1176
+ "learning_rate": 4.357981617877932e-06,
1177
+ "loss": 0.0129,
1178
+ "step": 3340
1179
+ },
1180
+ {
1181
+ "epoch": 53.76,
1182
+ "grad_norm": 0.19793441891670227,
1183
+ "learning_rate": 4.116771979309797e-06,
1184
+ "loss": 0.0258,
1185
+ "step": 3360
1186
+ },
1187
+ {
1188
+ "epoch": 54.08,
1189
+ "grad_norm": 0.2605569064617157,
1190
+ "learning_rate": 3.881829220976807e-06,
1191
+ "loss": 0.0306,
1192
+ "step": 3380
1193
+ },
1194
+ {
1195
+ "epoch": 54.4,
1196
+ "grad_norm": 0.037421807646751404,
1197
+ "learning_rate": 3.653223847215126e-06,
1198
+ "loss": 0.0198,
1199
+ "step": 3400
1200
+ },
1201
+ {
1202
+ "epoch": 54.72,
1203
+ "grad_norm": 0.0007586870342493057,
1204
+ "learning_rate": 3.4310244605653797e-06,
1205
+ "loss": 0.0257,
1206
+ "step": 3420
1207
+ },
1208
+ {
1209
+ "epoch": 55.04,
1210
+ "grad_norm": 0.27584579586982727,
1211
+ "learning_rate": 3.215297741185572e-06,
1212
+ "loss": 0.0125,
1213
+ "step": 3440
1214
+ },
1215
+ {
1216
+ "epoch": 55.36,
1217
+ "grad_norm": 0.0007228174363262951,
1218
+ "learning_rate": 3.0061084268410006e-06,
1219
+ "loss": 0.0124,
1220
+ "step": 3460
1221
+ },
1222
+ {
1223
+ "epoch": 55.68,
1224
+ "grad_norm": 0.04090801998972893,
1225
+ "learning_rate": 2.8035192934769362e-06,
1226
+ "loss": 0.023,
1227
+ "step": 3480
1228
+ },
1229
+ {
1230
+ "epoch": 56.0,
1231
+ "grad_norm": 0.3518761694431305,
1232
+ "learning_rate": 2.607591136380122e-06,
1233
+ "loss": 0.0194,
1234
+ "step": 3500
1235
+ },
1236
+ {
1237
+ "epoch": 56.32,
1238
+ "grad_norm": 0.06331823766231537,
1239
+ "learning_rate": 2.4183827519346308e-06,
1240
+ "loss": 0.0162,
1241
+ "step": 3520
1242
+ },
1243
+ {
1244
+ "epoch": 56.64,
1245
+ "grad_norm": 0.22303640842437744,
1246
+ "learning_rate": 2.235950919977545e-06,
1247
+ "loss": 0.0337,
1248
+ "step": 3540
1249
+ },
1250
+ {
1251
+ "epoch": 56.96,
1252
+ "grad_norm": 0.08465743064880371,
1253
+ "learning_rate": 2.0603503867598182e-06,
1254
+ "loss": 0.0139,
1255
+ "step": 3560
1256
+ },
1257
+ {
1258
+ "epoch": 57.28,
1259
+ "grad_norm": 0.20135080814361572,
1260
+ "learning_rate": 1.8916338485173823e-06,
1261
+ "loss": 0.0193,
1262
+ "step": 3580
1263
+ },
1264
+ {
1265
+ "epoch": 57.6,
1266
+ "grad_norm": 0.0006721566896885633,
1267
+ "learning_rate": 1.7298519356574727e-06,
1268
+ "loss": 0.0203,
1269
+ "step": 3600
1270
+ },
1271
+ {
1272
+ "epoch": 57.92,
1273
+ "grad_norm": 0.10799671709537506,
1274
+ "learning_rate": 1.5750531975648324e-06,
1275
+ "loss": 0.0212,
1276
+ "step": 3620
1277
+ },
1278
+ {
1279
+ "epoch": 58.24,
1280
+ "grad_norm": 0.0010109569411724806,
1281
+ "learning_rate": 1.4272840880324934e-06,
1282
+ "loss": 0.0173,
1283
+ "step": 3640
1284
+ },
1285
+ {
1286
+ "epoch": 58.56,
1287
+ "grad_norm": 0.0008448906592093408,
1288
+ "learning_rate": 1.286588951321363e-06,
1289
+ "loss": 0.0139,
1290
+ "step": 3660
1291
+ },
1292
+ {
1293
+ "epoch": 58.88,
1294
+ "grad_norm": 0.0010856656590476632,
1295
+ "learning_rate": 1.1530100088528867e-06,
1296
+ "loss": 0.0268,
1297
+ "step": 3680
1298
+ },
1299
+ {
1300
+ "epoch": 59.2,
1301
+ "grad_norm": 0.23958024382591248,
1302
+ "learning_rate": 1.0265873465387516e-06,
1303
+ "loss": 0.0191,
1304
+ "step": 3700
1305
+ },
1306
+ {
1307
+ "epoch": 59.52,
1308
+ "grad_norm": 0.20584586262702942,
1309
+ "learning_rate": 9.073589027514789e-07,
1310
+ "loss": 0.0168,
1311
+ "step": 3720
1312
+ },
1313
+ {
1314
+ "epoch": 59.84,
1315
+ "grad_norm": 0.031580936163663864,
1316
+ "learning_rate": 7.953604569393841e-07,
1317
+ "loss": 0.0246,
1318
+ "step": 3740
1319
+ },
1320
+ {
1321
+ "epoch": 60.16,
1322
+ "grad_norm": 0.14215555787086487,
1323
+ "learning_rate": 6.906256188895038e-07,
1324
+ "loss": 0.019,
1325
+ "step": 3760
1326
+ },
1327
+ {
1328
+ "epoch": 60.48,
1329
+ "grad_norm": 0.0012006442993879318,
1330
+ "learning_rate": 5.931858186415756e-07,
1331
+ "loss": 0.0168,
1332
+ "step": 3780
1333
+ },
1334
+ {
1335
+ "epoch": 60.8,
1336
+ "grad_norm": 0.0063135698437690735,
1337
+ "learning_rate": 5.03070297056149e-07,
1338
+ "loss": 0.0197,
1339
+ "step": 3800
1340
+ },
1341
+ {
1342
+ "epoch": 61.12,
1343
+ "grad_norm": 0.07496818155050278,
1344
+ "learning_rate": 4.203060970396383e-07,
1345
+ "loss": 0.0207,
1346
+ "step": 3820
1347
+ },
1348
+ {
1349
+ "epoch": 61.44,
1350
+ "grad_norm": 0.16551247239112854,
1351
+ "learning_rate": 3.4491805542899157e-07,
1352
+ "loss": 0.0224,
1353
+ "step": 3840
1354
+ },
1355
+ {
1356
+ "epoch": 61.76,
1357
+ "grad_norm": 0.0008456969517283142,
1358
+ "learning_rate": 2.769287955383532e-07,
1359
+ "loss": 0.0151,
1360
+ "step": 3860
1361
+ },
1362
+ {
1363
+ "epoch": 62.08,
1364
+ "grad_norm": 0.0008134017698466778,
1365
+ "learning_rate": 2.1635872037001626e-07,
1366
+ "loss": 0.0284,
1367
+ "step": 3880
1368
+ },
1369
+ {
1370
+ "epoch": 62.4,
1371
+ "grad_norm": 0.18878595530986786,
1372
+ "learning_rate": 1.6322600649162356e-07,
1373
+ "loss": 0.0217,
1374
+ "step": 3900
1375
+ },
1376
+ {
1377
+ "epoch": 62.72,
1378
+ "grad_norm": 0.0008310906123369932,
1379
+ "learning_rate": 1.1754659858156659e-07,
1380
+ "loss": 0.0103,
1381
+ "step": 3920
1382
+ },
1383
+ {
1384
+ "epoch": 63.04,
1385
+ "grad_norm": 0.38621172308921814,
1386
+ "learning_rate": 7.933420464410201e-08,
1387
+ "loss": 0.0333,
1388
+ "step": 3940
1389
+ },
1390
+ {
1391
+ "epoch": 63.36,
1392
+ "grad_norm": 0.016794312745332718,
1393
+ "learning_rate": 4.860029189569237e-08,
1394
+ "loss": 0.0231,
1395
+ "step": 3960
1396
+ },
1397
+ {
1398
+ "epoch": 63.68,
1399
+ "grad_norm": 0.16253815591335297,
1400
+ "learning_rate": 2.535408332381417e-08,
1401
+ "loss": 0.0226,
1402
+ "step": 3980
1403
+ },
1404
+ {
1405
+ "epoch": 64.0,
1406
+ "grad_norm": 0.2387680560350418,
1407
+ "learning_rate": 9.60255491919415e-09,
1408
+ "loss": 0.0218,
1409
+ "step": 4000
1410
+ },
1411
+ {
1412
+ "epoch": 64.32,
1413
+ "grad_norm": 0.16293394565582275,
1414
+ "learning_rate": 1.3504335823810722e-09,
1415
+ "loss": 0.0219,
1416
+ "step": 4020
1417
+ },
1418
+ {
1419
+ "epoch": 64.48,
1420
+ "step": 4030,
1421
+ "total_flos": 2.3325606118844006e+17,
1422
+ "train_loss": 0.1495482857003993,
1423
+ "train_runtime": 6882.5617,
1424
+ "train_samples_per_second": 4.722,
1425
+ "train_steps_per_second": 0.586
1426
  }
1427
  ],
1428
  "logging_steps": 20,
1429
+ "max_steps": 4030,
1430
  "num_input_tokens_seen": 0,
1431
+ "num_train_epochs": 65,
1432
  "save_steps": 1000,
1433
  "stateful_callbacks": {
1434
  "TrainerControl": {
 
1442
  "attributes": {}
1443
  }
1444
  },
1445
+ "total_flos": 2.3325606118844006e+17,
1446
  "train_batch_size": 2,
1447
  "trial_name": null,
1448
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4feb1ab5744e1b2819b309b2001c004a373fe1e946dea49af8b0cd713f11d50f
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85bd8392ac04e1d95cc4bb0c1bc398edcef3309a98ea6502b653c72aeb29939f
3
  size 5240