hugodk-sch committed on
Commit
3be5042
1 Parent(s): 73046c1

Model save

Browse files
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
  library_name: peft
3
  tags:
4
- - alignment-handbook
5
  - trl
6
  - dpo
 
7
  - generated_from_trainer
8
  base_model: NbAiLab/nb-gpt-j-6B-v2
9
- datasets:
10
- - hugodk-sch/aftonposten_title_prefs
11
  model-index:
12
  - name: aftonposten-6b-align-scan
13
  results: []
@@ -18,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # aftonposten-6b-align-scan
20
 
21
- This model is a fine-tuned version of [data/ap-gpt-j-6b-sft-qlora-04-08](https://huggingface.co/data/ap-gpt-j-6b-sft-qlora-04-08) on the hugodk-sch/aftonposten_title_prefs dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.4934
24
- - Rewards/chosen: 0.2139
25
- - Rewards/rejected: 0.1872
26
- - Rewards/accuracies: 0.5457
27
- - Rewards/margins: 0.0267
28
- - Logps/rejected: -37.2826
29
- - Logps/chosen: -33.7672
30
- - Logits/rejected: -2.2262
31
- - Logits/chosen: -2.2310
32
 
33
  ## Model description
34
 
@@ -57,15 +55,27 @@ The following hyperparameters were used during training:
57
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
  - lr_scheduler_type: cosine
59
  - lr_scheduler_warmup_ratio: 0.1
60
- - num_epochs: 1
61
 
62
  ### Training results
63
 
64
- | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
- |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
- | 0.4749 | 0.26 | 100 | 0.4963 | 0.1467 | 0.1303 | 0.5336 | 0.0164 | -37.3537 | -33.8512 | -2.2327 | -2.2375 |
67
- | 0.4376 | 0.52 | 200 | 0.4956 | 0.1959 | 0.1769 | 0.5486 | 0.0191 | -37.2955 | -33.7896 | -2.2291 | -2.2339 |
68
- | 0.3835 | 0.78 | 300 | 0.4950 | 0.2045 | 0.1836 | 0.5245 | 0.0210 | -37.2872 | -33.7789 | -2.2264 | -2.2312 |
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  ### Framework versions
 
1
  ---
2
  library_name: peft
3
  tags:
 
4
  - trl
5
  - dpo
6
+ - alignment-handbook
7
  - generated_from_trainer
8
  base_model: NbAiLab/nb-gpt-j-6B-v2
 
 
9
  model-index:
10
  - name: aftonposten-6b-align-scan
11
  results: []
 
16
 
17
  # aftonposten-6b-align-scan
18
 
19
+ This model is a fine-tuned version of [NbAiLab/nb-gpt-j-6B-v2](https://huggingface.co/NbAiLab/nb-gpt-j-6B-v2) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.4684
22
+ - Rewards/chosen: 0.3669
23
+ - Rewards/rejected: 0.2161
24
+ - Rewards/accuracies: 0.5743
25
+ - Rewards/margins: 0.1508
26
+ - Logps/rejected: -37.2465
27
+ - Logps/chosen: -33.5759
28
+ - Logits/rejected: -2.1622
29
+ - Logits/chosen: -2.1669
30
 
31
  ## Model description
32
 
 
55
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
  - lr_scheduler_type: cosine
57
  - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 4
59
 
60
  ### Training results
61
 
62
+ | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
63
+ |:-------------:|:-----:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
64
+ | 0.4749 | 0.26 | 100 | -2.2375 | -2.2327 | -33.8512 | -37.3537 | 0.4963 | 0.5336 | 0.1467 | 0.0164 | 0.1303 |
65
+ | 0.4376 | 0.52 | 200 | -2.2339 | -2.2291 | -33.7896 | -37.2955 | 0.4956 | 0.5486 | 0.1959 | 0.0191 | 0.1769 |
66
+ | 0.3835 | 0.78 | 300 | -2.2312 | -2.2264 | -33.7789 | -37.2872 | 0.4950 | 0.5245 | 0.2045 | 0.0210 | 0.1836 |
67
+ | 0.3117 | 1.04 | 400 | -2.2160 | -2.2112 | -33.6528 | -37.1934 | 0.4891 | 0.5652 | 0.3054 | 0.0468 | 0.2586 |
68
+ | 0.2459 | 1.3 | 500 | -2.1906 | -2.1858 | -33.6364 | -37.1827 | 0.4885 | 0.5623 | 0.3186 | 0.0514 | 0.2671 |
69
+ | 0.2639 | 1.56 | 600 | -2.1832 | -2.1784 | -33.5817 | -37.2038 | 0.4750 | 0.5855 | 0.3623 | 0.1120 | 0.2503 |
70
+ | 0.2437 | 1.82 | 700 | -2.1786 | -2.1739 | -33.5992 | -37.2294 | 0.4742 | 0.5748 | 0.3483 | 0.1184 | 0.2298 |
71
+ | 0.1567 | 2.08 | 800 | -2.1803 | -2.1755 | -33.5496 | -37.2066 | 0.4695 | 0.5826 | 0.3879 | 0.1399 | 0.2480 |
72
+ | 0.131 | 2.34 | 900 | -2.1705 | -2.1658 | -33.5930 | -37.2408 | 0.4716 | 0.5860 | 0.3533 | 0.1326 | 0.2206 |
73
+ | 0.1784 | 2.6 | 1000 | -2.1693 | -2.1646 | -33.5749 | -37.2504 | 0.4673 | 0.5860 | 0.3677 | 0.1548 | 0.2130 |
74
+ | 0.1956 | 2.86 | 1100 | -2.1669 | -2.1622 | -33.5871 | -37.2442 | 0.4706 | 0.5860 | 0.3580 | 0.1400 | 0.2180 |
75
+ | 0.137 | 3.12 | 1200 | -2.1672 | -2.1625 | -33.5728 | -37.2438 | 0.4680 | 0.6063 | 0.3694 | 0.1511 | 0.2182 |
76
+ | 0.1211 | 3.38 | 1300 | -2.1669 | -2.1622 | -33.5805 | -37.2393 | 0.4705 | 0.5918 | 0.3633 | 0.1414 | 0.2219 |
77
+ | 0.1553 | 3.64 | 1400 | -2.1668 | -2.1621 | -33.5723 | -37.2582 | 0.4654 | 0.6034 | 0.3698 | 0.1630 | 0.2068 |
78
+ | 0.1447 | 3.9 | 1500 | -2.1669 | -2.1622 | -33.5759 | -37.2465 | 0.4684 | 0.5743 | 0.3669 | 0.1508 | 0.2161 |
79
 
80
 
81
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cdb7f9e0e0b705c20536e15c1ca04c77652c7bef18fe3501181e476dfb6b8bd
3
  size 176183216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3847fa5053c205f61f041cbc4e44ddec25eb91ff0ac76b5bc16fcf571960eb4b
3
  size 176183216
all_results.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 1.0,
3
  "eval_logits/chosen": -2.2309982776641846,
4
  "eval_logits/rejected": -2.2261931896209717,
5
  "eval_logps/chosen": -33.767173767089844,
@@ -13,9 +13,9 @@
13
  "eval_samples": 343,
14
  "eval_samples_per_second": 2.356,
15
  "eval_steps_per_second": 0.295,
16
- "train_loss": 0.4538224170734356,
17
- "train_runtime": 3252.427,
18
  "train_samples": 3079,
19
- "train_samples_per_second": 0.947,
20
- "train_steps_per_second": 0.118
21
  }
 
1
  {
2
+ "epoch": 4.0,
3
  "eval_logits/chosen": -2.2309982776641846,
4
  "eval_logits/rejected": -2.2261931896209717,
5
  "eval_logps/chosen": -33.767173767089844,
 
13
  "eval_samples": 343,
14
  "eval_samples_per_second": 2.356,
15
  "eval_steps_per_second": 0.295,
16
+ "train_loss": 0.16076272354497537,
17
+ "train_runtime": 10802.3033,
18
  "train_samples": 3079,
19
+ "train_samples_per_second": 1.14,
20
+ "train_steps_per_second": 0.143
21
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.4538224170734356,
4
- "train_runtime": 3252.427,
5
  "train_samples": 3079,
6
- "train_samples_per_second": 0.947,
7
- "train_steps_per_second": 0.118
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "train_loss": 0.16076272354497537,
4
+ "train_runtime": 10802.3033,
5
  "train_samples": 3079,
6
+ "train_samples_per_second": 1.14,
7
+ "train_steps_per_second": 0.143
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 385,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -492,130 +492,1946 @@
492
  },
493
  {
494
  "epoch": 0.81,
495
- "learning_rate": 5.576113578589035e-07,
496
- "logits/chosen": -1.9173246622085571,
497
- "logits/rejected": -1.9141733646392822,
498
- "logps/chosen": -31.059444427490234,
499
- "logps/rejected": -33.56504440307617,
500
- "loss": 0.4356,
501
  "rewards/accuracies": 0.7124999761581421,
502
- "rewards/chosen": 0.4351249635219574,
503
- "rewards/margins": 0.293440043926239,
504
- "rewards/rejected": 0.141684889793396,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
- "learning_rate": 4.229036944380913e-07,
510
- "logits/chosen": -1.9676717519760132,
511
- "logits/rejected": -1.9555410146713257,
512
- "logps/chosen": -34.03219985961914,
513
- "logps/rejected": -33.442317962646484,
514
- "loss": 0.417,
515
  "rewards/accuracies": 0.7124999761581421,
516
- "rewards/chosen": 0.4510994851589203,
517
- "rewards/margins": 0.37388378381729126,
518
- "rewards/rejected": 0.07721573859453201,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
- "learning_rate": 3.053082288996112e-07,
524
- "logits/chosen": -2.002480983734131,
525
- "logits/rejected": -2.0011117458343506,
526
- "logps/chosen": -32.882102966308594,
527
- "logps/rejected": -32.251502990722656,
528
- "loss": 0.4339,
529
- "rewards/accuracies": 0.6499999761581421,
530
- "rewards/chosen": 0.4829506278038025,
531
- "rewards/margins": 0.30048781633377075,
532
- "rewards/rejected": 0.18246281147003174,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
- "learning_rate": 2.0579377374915805e-07,
538
- "logits/chosen": -2.0899081230163574,
539
- "logits/rejected": -2.0742757320404053,
540
- "logps/chosen": -33.487709045410156,
541
- "logps/rejected": -32.8193359375,
542
- "loss": 0.4286,
543
- "rewards/accuracies": 0.675000011920929,
544
- "rewards/chosen": 0.5533460974693298,
545
- "rewards/margins": 0.3143841624259949,
546
- "rewards/rejected": 0.23896190524101257,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
- "learning_rate": 1.2518018074041684e-07,
552
- "logits/chosen": -1.9617509841918945,
553
- "logits/rejected": -1.9609159231185913,
554
- "logps/chosen": -32.60249710083008,
555
- "logps/rejected": -32.25555419921875,
556
- "loss": 0.418,
557
- "rewards/accuracies": 0.6875,
558
- "rewards/chosen": 0.5436802506446838,
559
- "rewards/margins": 0.37600547075271606,
560
- "rewards/rejected": 0.16767482459545135,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
- "learning_rate": 6.41315865106129e-08,
566
- "logits/chosen": -1.9185073375701904,
567
- "logits/rejected": -1.9287922382354736,
568
- "logps/chosen": -31.57277488708496,
569
- "logps/rejected": -35.039085388183594,
570
- "loss": 0.4304,
571
  "rewards/accuracies": 0.675000011920929,
572
- "rewards/chosen": 0.505532443523407,
573
- "rewards/margins": 0.305793821811676,
574
- "rewards/rejected": 0.19973860681056976,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
- "learning_rate": 2.3150941078050325e-08,
580
- "logits/chosen": -2.0573890209198,
581
- "logits/rejected": -2.0508790016174316,
582
- "logps/chosen": -33.04835891723633,
583
- "logps/rejected": -28.99324607849121,
584
- "loss": 0.4308,
585
- "rewards/accuracies": 0.7124999761581421,
586
- "rewards/chosen": 0.4828890860080719,
587
- "rewards/margins": 0.2963466942310333,
588
- "rewards/rejected": 0.18654237687587738,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
- "learning_rate": 2.575864278703266e-09,
594
- "logits/chosen": -1.9167007207870483,
595
- "logits/rejected": -1.9188674688339233,
596
- "logps/chosen": -33.65839385986328,
597
- "logps/rejected": -30.719829559326172,
598
- "loss": 0.4247,
599
- "rewards/accuracies": 0.699999988079071,
600
- "rewards/chosen": 0.46452397108078003,
601
- "rewards/margins": 0.34228652715682983,
602
- "rewards/rejected": 0.12223746627569199,
603
  "step": 380
604
  },
605
  {
606
- "epoch": 1.0,
607
- "step": 385,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
  "total_flos": 0.0,
609
- "train_loss": 0.4538224170734356,
610
- "train_runtime": 3252.427,
611
- "train_samples_per_second": 0.947,
612
- "train_steps_per_second": 0.118
613
  }
614
  ],
615
  "logging_steps": 10,
616
- "max_steps": 385,
617
  "num_input_tokens_seen": 0,
618
- "num_train_epochs": 1,
619
  "save_steps": 100,
620
  "total_flos": 0.0,
621
  "train_batch_size": 4,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 1540,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
492
  },
493
  {
494
  "epoch": 0.81,
495
+ "learning_rate": 4.84533120650964e-06,
496
+ "logits/chosen": -1.9174331426620483,
497
+ "logits/rejected": -1.9142847061157227,
498
+ "logps/chosen": -31.049779891967773,
499
+ "logps/rejected": -33.54362487792969,
500
+ "loss": 0.4366,
501
  "rewards/accuracies": 0.7124999761581421,
502
+ "rewards/chosen": 0.4428572654724121,
503
+ "rewards/margins": 0.28403452038764954,
504
+ "rewards/rejected": 0.15882274508476257,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
+ "learning_rate": 4.825108134172131e-06,
510
+ "logits/chosen": -1.9670381546020508,
511
+ "logits/rejected": -1.9549226760864258,
512
+ "logps/chosen": -34.00459671020508,
513
+ "logps/rejected": -33.429378509521484,
514
+ "loss": 0.4145,
515
  "rewards/accuracies": 0.7124999761581421,
516
+ "rewards/chosen": 0.4731821119785309,
517
+ "rewards/margins": 0.38561564683914185,
518
+ "rewards/rejected": 0.08756652474403381,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
+ "learning_rate": 4.80369052967602e-06,
524
+ "logits/chosen": -2.001861095428467,
525
+ "logits/rejected": -2.0005335807800293,
526
+ "logps/chosen": -32.78104019165039,
527
+ "logps/rejected": -32.248443603515625,
528
+ "loss": 0.4183,
529
+ "rewards/accuracies": 0.7124999761581421,
530
+ "rewards/chosen": 0.5638058185577393,
531
+ "rewards/margins": 0.37889450788497925,
532
+ "rewards/rejected": 0.18491129577159882,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
+ "learning_rate": 4.781089396387968e-06,
538
+ "logits/chosen": -2.0892767906188965,
539
+ "logits/rejected": -2.073693037033081,
540
+ "logps/chosen": -33.377323150634766,
541
+ "logps/rejected": -32.779090881347656,
542
+ "loss": 0.4167,
543
+ "rewards/accuracies": 0.6875,
544
+ "rewards/chosen": 0.641654908657074,
545
+ "rewards/margins": 0.37049978971481323,
546
+ "rewards/rejected": 0.2711551785469055,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
+ "learning_rate": 4.757316345716554e-06,
552
+ "logits/chosen": -1.9618675708770752,
553
+ "logits/rejected": -1.9610551595687866,
554
+ "logps/chosen": -32.51572799682617,
555
+ "logps/rejected": -32.19974899291992,
556
+ "loss": 0.4137,
557
+ "rewards/accuracies": 0.699999988079071,
558
+ "rewards/chosen": 0.6130937933921814,
559
+ "rewards/margins": 0.4007722735404968,
560
+ "rewards/rejected": 0.21232159435749054,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
+ "learning_rate": 4.73238359114687e-06,
566
+ "logits/chosen": -1.916259765625,
567
+ "logits/rejected": -1.9264857769012451,
568
+ "logps/chosen": -31.4000244140625,
569
+ "logps/rejected": -35.018882751464844,
570
+ "loss": 0.4065,
571
  "rewards/accuracies": 0.675000011920929,
572
+ "rewards/chosen": 0.6437316536903381,
573
+ "rewards/margins": 0.427828311920166,
574
+ "rewards/rejected": 0.21590332686901093,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
+ "learning_rate": 4.706303941965804e-06,
580
+ "logits/chosen": -2.050283908843994,
581
+ "logits/rejected": -2.043849468231201,
582
+ "logps/chosen": -32.88298797607422,
583
+ "logps/rejected": -28.950063705444336,
584
+ "loss": 0.4093,
585
+ "rewards/accuracies": 0.699999988079071,
586
+ "rewards/chosen": 0.6151864528656006,
587
+ "rewards/margins": 0.3940979838371277,
588
+ "rewards/rejected": 0.2210884839296341,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
+ "learning_rate": 4.679090796681225e-06,
594
+ "logits/chosen": -1.9082043170928955,
595
+ "logits/rejected": -1.9103626012802124,
596
+ "logps/chosen": -33.2851676940918,
597
+ "logps/rejected": -30.665771484375,
598
+ "loss": 0.3759,
599
+ "rewards/accuracies": 0.7250000238418579,
600
+ "rewards/chosen": 0.7631045579910278,
601
+ "rewards/margins": 0.5976192355155945,
602
+ "rewards/rejected": 0.1654852330684662,
603
  "step": 380
604
  },
605
  {
606
+ "epoch": 1.01,
607
+ "learning_rate": 4.650758136138454e-06,
608
+ "logits/chosen": -1.9357115030288696,
609
+ "logits/rejected": -1.9344127178192139,
610
+ "logps/chosen": -33.36685562133789,
611
+ "logps/rejected": -35.62567138671875,
612
+ "loss": 0.3409,
613
+ "rewards/accuracies": 0.7583333253860474,
614
+ "rewards/chosen": 0.8143496513366699,
615
+ "rewards/margins": 0.790077805519104,
616
+ "rewards/rejected": 0.02427184209227562,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 1.04,
621
+ "learning_rate": 4.621320516337559e-06,
622
+ "logits/chosen": -1.8698232173919678,
623
+ "logits/rejected": -1.8616269826889038,
624
+ "logps/chosen": -30.623443603515625,
625
+ "logps/rejected": -36.03126907348633,
626
+ "loss": 0.3117,
627
+ "rewards/accuracies": 0.8500000238418579,
628
+ "rewards/chosen": 0.920426070690155,
629
+ "rewards/margins": 0.9929585456848145,
630
+ "rewards/rejected": -0.07253243774175644,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 1.04,
635
+ "eval_logits/chosen": -2.2160162925720215,
636
+ "eval_logits/rejected": -2.2111997604370117,
637
+ "eval_logps/chosen": -33.652793884277344,
638
+ "eval_logps/rejected": -37.193359375,
639
+ "eval_loss": 0.4891091287136078,
640
+ "eval_rewards/accuracies": 0.5651993155479431,
641
+ "eval_rewards/chosen": 0.30540817975997925,
642
+ "eval_rewards/margins": 0.04680241644382477,
643
+ "eval_rewards/rejected": 0.2586057484149933,
644
+ "eval_runtime": 146.2599,
645
+ "eval_samples_per_second": 2.345,
646
+ "eval_steps_per_second": 0.294,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 1.06,
651
+ "learning_rate": 4.590793060955158e-06,
652
+ "logits/chosen": -2.044127941131592,
653
+ "logits/rejected": -2.046945571899414,
654
+ "logps/chosen": -31.79901695251465,
655
+ "logps/rejected": -34.87114334106445,
656
+ "loss": 0.299,
657
+ "rewards/accuracies": 0.9125000238418579,
658
+ "rewards/chosen": 1.003937005996704,
659
+ "rewards/margins": 1.0665733814239502,
660
+ "rewards/rejected": -0.0626363754272461,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 1.09,
665
+ "learning_rate": 4.559191453574582e-06,
666
+ "logits/chosen": -1.8841445446014404,
667
+ "logits/rejected": -1.8827756643295288,
668
+ "logps/chosen": -27.981430053710938,
669
+ "logps/rejected": -32.39961624145508,
670
+ "loss": 0.3151,
671
+ "rewards/accuracies": 0.9624999761581421,
672
+ "rewards/chosen": 0.868902325630188,
673
+ "rewards/margins": 0.9344655871391296,
674
+ "rewards/rejected": -0.06556323915719986,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 1.12,
679
+ "learning_rate": 4.52653192962838e-06,
680
+ "logits/chosen": -1.8422701358795166,
681
+ "logits/rejected": -1.835268259048462,
682
+ "logps/chosen": -32.6194953918457,
683
+ "logps/rejected": -34.0600700378418,
684
+ "loss": 0.2904,
685
+ "rewards/accuracies": 0.9375,
686
+ "rewards/chosen": 1.1636759042739868,
687
+ "rewards/margins": 1.0155143737792969,
688
+ "rewards/rejected": 0.14816154539585114,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 1.14,
693
+ "learning_rate": 4.492831268057307e-06,
694
+ "logits/chosen": -2.0123181343078613,
695
+ "logits/rejected": -2.0072364807128906,
696
+ "logps/chosen": -30.41460609436035,
697
+ "logps/rejected": -32.033443450927734,
698
+ "loss": 0.284,
699
+ "rewards/accuracies": 0.925000011920929,
700
+ "rewards/chosen": 1.0949517488479614,
701
+ "rewards/margins": 1.1476168632507324,
702
+ "rewards/rejected": -0.05266512185335159,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 1.17,
707
+ "learning_rate": 4.458106782690094e-06,
708
+ "logits/chosen": -1.8972448110580444,
709
+ "logits/rejected": -1.9013574123382568,
710
+ "logps/chosen": -33.069793701171875,
711
+ "logps/rejected": -32.7395133972168,
712
+ "loss": 0.263,
713
+ "rewards/accuracies": 0.9375,
714
+ "rewards/chosen": 1.156842589378357,
715
+ "rewards/margins": 1.2775455713272095,
716
+ "rewards/rejected": -0.12070278078317642,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 1.19,
721
+ "learning_rate": 4.422376313348405e-06,
722
+ "logits/chosen": -1.9023644924163818,
723
+ "logits/rejected": -1.8966976404190063,
724
+ "logps/chosen": -33.858001708984375,
725
+ "logps/rejected": -35.19942092895508,
726
+ "loss": 0.2537,
727
+ "rewards/accuracies": 0.9125000238418579,
728
+ "rewards/chosen": 1.249218225479126,
729
+ "rewards/margins": 1.4494789838790894,
730
+ "rewards/rejected": -0.20026080310344696,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 1.22,
735
+ "learning_rate": 4.3856582166815696e-06,
736
+ "logits/chosen": -1.9303083419799805,
737
+ "logits/rejected": -1.9299156665802002,
738
+ "logps/chosen": -32.68412399291992,
739
+ "logps/rejected": -34.166542053222656,
740
+ "loss": 0.2809,
741
+ "rewards/accuracies": 0.887499988079071,
742
+ "rewards/chosen": 1.1873619556427002,
743
+ "rewards/margins": 1.1722075939178467,
744
+ "rewards/rejected": 0.01515440084040165,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.25,
749
+ "learning_rate": 4.347971356735789e-06,
750
+ "logits/chosen": -1.97918701171875,
751
+ "logits/rejected": -1.960694670677185,
752
+ "logps/chosen": -32.545894622802734,
753
+ "logps/rejected": -33.19091033935547,
754
+ "loss": 0.2501,
755
+ "rewards/accuracies": 0.9375,
756
+ "rewards/chosen": 1.291896104812622,
757
+ "rewards/margins": 1.3886420726776123,
758
+ "rewards/rejected": -0.09674612432718277,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.27,
763
+ "learning_rate": 4.309335095262675e-06,
764
+ "logits/chosen": -1.9456336498260498,
765
+ "logits/rejected": -1.9451408386230469,
766
+ "logps/chosen": -30.120983123779297,
767
+ "logps/rejected": -31.22079849243164,
768
+ "loss": 0.2773,
769
+ "rewards/accuracies": 0.875,
770
+ "rewards/chosen": 1.1922577619552612,
771
+ "rewards/margins": 1.1720596551895142,
772
+ "rewards/rejected": 0.020198041573166847,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.3,
777
+ "learning_rate": 4.269769281772082e-06,
778
+ "logits/chosen": -1.9087337255477905,
779
+ "logits/rejected": -1.9019416570663452,
780
+ "logps/chosen": -30.988162994384766,
781
+ "logps/rejected": -34.79794692993164,
782
+ "loss": 0.2459,
783
+ "rewards/accuracies": 0.875,
784
+ "rewards/chosen": 1.345572829246521,
785
+ "rewards/margins": 1.4825364351272583,
786
+ "rewards/rejected": -0.13696344196796417,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.3,
791
+ "eval_logits/chosen": -2.1905555725097656,
792
+ "eval_logits/rejected": -2.1857681274414062,
793
+ "eval_logps/chosen": -33.6363525390625,
794
+ "eval_logps/rejected": -37.182701110839844,
795
+ "eval_loss": 0.4884692430496216,
796
+ "eval_rewards/accuracies": 0.5622923374176025,
797
+ "eval_rewards/chosen": 0.3185593783855438,
798
+ "eval_rewards/margins": 0.05142458155751228,
799
+ "eval_rewards/rejected": 0.26713478565216064,
800
+ "eval_runtime": 145.9464,
801
+ "eval_samples_per_second": 2.35,
802
+ "eval_steps_per_second": 0.295,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.32,
807
+ "learning_rate": 4.22929424333435e-06,
808
+ "logits/chosen": -1.903969168663025,
809
+ "logits/rejected": -1.9078088998794556,
810
+ "logps/chosen": -27.756145477294922,
811
+ "logps/rejected": -33.003501892089844,
812
+ "loss": 0.2721,
813
+ "rewards/accuracies": 0.887499988079071,
814
+ "rewards/chosen": 1.1349422931671143,
815
+ "rewards/margins": 1.2503130435943604,
816
+ "rewards/rejected": -0.11537061631679535,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.35,
821
+ "learning_rate": 4.1879307741372085e-06,
822
+ "logits/chosen": -1.9105024337768555,
823
+ "logits/rejected": -1.9210312366485596,
824
+ "logps/chosen": -31.711151123046875,
825
+ "logps/rejected": -30.852153778076172,
826
+ "loss": 0.2636,
827
+ "rewards/accuracies": 0.8999999761581421,
828
+ "rewards/chosen": 1.240944743156433,
829
+ "rewards/margins": 1.4013227224349976,
830
+ "rewards/rejected": -0.1603778600692749,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.38,
835
+ "learning_rate": 4.145700124802693e-06,
836
+ "logits/chosen": -1.8562660217285156,
837
+ "logits/rejected": -1.8539501428604126,
838
+ "logps/chosen": -30.094646453857422,
839
+ "logps/rejected": -30.31314468383789,
840
+ "loss": 0.2542,
841
+ "rewards/accuracies": 0.887499988079071,
842
+ "rewards/chosen": 1.2579472064971924,
843
+ "rewards/margins": 1.3670498132705688,
844
+ "rewards/rejected": -0.10910268872976303,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.4,
849
+ "learning_rate": 4.102623991469562e-06,
850
+ "logits/chosen": -1.9388301372528076,
851
+ "logits/rejected": -1.931828260421753,
852
+ "logps/chosen": -32.52373504638672,
853
+ "logps/rejected": -33.19480514526367,
854
+ "loss": 0.234,
855
+ "rewards/accuracies": 0.925000011920929,
856
+ "rewards/chosen": 1.3672724962234497,
857
+ "rewards/margins": 1.5091286897659302,
858
+ "rewards/rejected": -0.14185625314712524,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.43,
863
+ "learning_rate": 4.058724504646834e-06,
864
+ "logits/chosen": -1.9054105281829834,
865
+ "logits/rejected": -1.9118045568466187,
866
+ "logps/chosen": -30.23921775817871,
867
+ "logps/rejected": -32.79788589477539,
868
+ "loss": 0.2692,
869
+ "rewards/accuracies": 0.875,
870
+ "rewards/chosen": 1.211173176765442,
871
+ "rewards/margins": 1.2424962520599365,
872
+ "rewards/rejected": -0.03132311627268791,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.45,
877
+ "learning_rate": 4.014024217844167e-06,
878
+ "logits/chosen": -1.9779608249664307,
879
+ "logits/rejected": -1.9552392959594727,
880
+ "logps/chosen": -29.897476196289062,
881
+ "logps/rejected": -33.047569274902344,
882
+ "loss": 0.2672,
883
+ "rewards/accuracies": 0.887499988079071,
884
+ "rewards/chosen": 1.2320712804794312,
885
+ "rewards/margins": 1.273280143737793,
886
+ "rewards/rejected": -0.041208960115909576,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.48,
891
+ "learning_rate": 3.968546095984911e-06,
892
+ "logits/chosen": -1.9123926162719727,
893
+ "logits/rejected": -1.9074643850326538,
894
+ "logps/chosen": -30.69790267944336,
895
+ "logps/rejected": -32.02743911743164,
896
+ "loss": 0.2612,
897
+ "rewards/accuracies": 0.8500000238418579,
898
+ "rewards/chosen": 1.3750633001327515,
899
+ "rewards/margins": 1.3002793788909912,
900
+ "rewards/rejected": 0.07478378713130951,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.51,
905
+ "learning_rate": 3.922313503607806e-06,
906
+ "logits/chosen": -1.9546325206756592,
907
+ "logits/rejected": -1.956498384475708,
908
+ "logps/chosen": -32.70453643798828,
909
+ "logps/rejected": -35.04417037963867,
910
+ "loss": 0.2395,
911
+ "rewards/accuracies": 0.9375,
912
+ "rewards/chosen": 1.337005853652954,
913
+ "rewards/margins": 1.5316107273101807,
914
+ "rewards/rejected": -0.19460472464561462,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.53,
919
+ "learning_rate": 3.875350192863368e-06,
920
+ "logits/chosen": -1.9385898113250732,
921
+ "logits/rejected": -1.9379469156265259,
922
+ "logps/chosen": -28.891321182250977,
923
+ "logps/rejected": -31.504505157470703,
924
+ "loss": 0.2493,
925
+ "rewards/accuracies": 0.887499988079071,
926
+ "rewards/chosen": 1.3605870008468628,
927
+ "rewards/margins": 1.3948794603347778,
928
+ "rewards/rejected": -0.034292541444301605,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.56,
933
+ "learning_rate": 3.8276802913111436e-06,
934
+ "logits/chosen": -1.9496517181396484,
935
+ "logits/rejected": -1.9475892782211304,
936
+ "logps/chosen": -31.307537078857422,
937
+ "logps/rejected": -32.269371032714844,
938
+ "loss": 0.2639,
939
+ "rewards/accuracies": 0.949999988079071,
940
+ "rewards/chosen": 1.3525952100753784,
941
+ "rewards/margins": 1.3281266689300537,
942
+ "rewards/rejected": 0.024468522518873215,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.56,
947
+ "eval_logits/chosen": -2.183182716369629,
948
+ "eval_logits/rejected": -2.178420066833496,
949
+ "eval_logps/chosen": -33.58167266845703,
950
+ "eval_logps/rejected": -37.20375442504883,
951
+ "eval_loss": 0.47496160864830017,
952
+ "eval_rewards/accuracies": 0.5855481624603271,
953
+ "eval_rewards/chosen": 0.3623029887676239,
954
+ "eval_rewards/margins": 0.11201038211584091,
955
+ "eval_rewards/rejected": 0.2502925992012024,
956
+ "eval_runtime": 145.9651,
957
+ "eval_samples_per_second": 2.35,
958
+ "eval_steps_per_second": 0.295,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.58,
963
+ "learning_rate": 3.7793282895240927e-06,
964
+ "logits/chosen": -1.9928886890411377,
965
+ "logits/rejected": -1.9994275569915771,
966
+ "logps/chosen": -30.57297706604004,
967
+ "logps/rejected": -32.0395622253418,
968
+ "loss": 0.2611,
969
+ "rewards/accuracies": 0.9125000238418579,
970
+ "rewards/chosen": 1.3315242528915405,
971
+ "rewards/margins": 1.3126256465911865,
972
+ "rewards/rejected": 0.01889852061867714,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.61,
977
+ "learning_rate": 3.730319028506478e-06,
978
+ "logits/chosen": -1.9484421014785767,
979
+ "logits/rejected": -1.946126937866211,
980
+ "logps/chosen": -32.853248596191406,
981
+ "logps/rejected": -30.977664947509766,
982
+ "loss": 0.2377,
983
+ "rewards/accuracies": 0.887499988079071,
984
+ "rewards/chosen": 1.5221078395843506,
985
+ "rewards/margins": 1.5158333778381348,
986
+ "rewards/rejected": 0.0062743364833295345,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 1.64,
991
+ "learning_rate": 3.6806776869317074e-06,
992
+ "logits/chosen": -1.8969027996063232,
993
+ "logits/rejected": -1.8902422189712524,
994
+ "logps/chosen": -33.44438171386719,
995
+ "logps/rejected": -32.37852478027344,
996
+ "loss": 0.2252,
997
+ "rewards/accuracies": 0.9375,
998
+ "rewards/chosen": 1.6158740520477295,
999
+ "rewards/margins": 1.6329265832901,
1000
+ "rewards/rejected": -0.017052406445145607,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 1.66,
1005
+ "learning_rate": 3.6304297682067146e-06,
1006
+ "logits/chosen": -1.9018688201904297,
1007
+ "logits/rejected": -1.9080841541290283,
1008
+ "logps/chosen": -32.16619110107422,
1009
+ "logps/rejected": -33.13113021850586,
1010
+ "loss": 0.2565,
1011
+ "rewards/accuracies": 0.875,
1012
+ "rewards/chosen": 1.450488567352295,
1013
+ "rewards/margins": 1.3631832599639893,
1014
+ "rewards/rejected": 0.08730525523424149,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 1.69,
1019
+ "learning_rate": 3.579601087369492e-06,
1020
+ "logits/chosen": -1.98688542842865,
1021
+ "logits/rejected": -2.0009474754333496,
1022
+ "logps/chosen": -30.202587127685547,
1023
+ "logps/rejected": -31.93996238708496,
1024
+ "loss": 0.2613,
1025
+ "rewards/accuracies": 0.862500011920929,
1026
+ "rewards/chosen": 1.3199015855789185,
1027
+ "rewards/margins": 1.2737209796905518,
1028
+ "rewards/rejected": 0.046180594712495804,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 1.71,
1033
+ "learning_rate": 3.5282177578265295e-06,
1034
+ "logits/chosen": -1.861903429031372,
1035
+ "logits/rejected": -1.8589446544647217,
1036
+ "logps/chosen": -31.801767349243164,
1037
+ "logps/rejected": -34.892234802246094,
1038
+ "loss": 0.1937,
1039
+ "rewards/accuracies": 0.925000011920929,
1040
+ "rewards/chosen": 1.6969530582427979,
1041
+ "rewards/margins": 1.8505403995513916,
1042
+ "rewards/rejected": -0.15358731150627136,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 1.74,
1047
+ "learning_rate": 3.476306177936961e-06,
1048
+ "logits/chosen": -1.9529634714126587,
1049
+ "logits/rejected": -1.9532558917999268,
1050
+ "logps/chosen": -29.594532012939453,
1051
+ "logps/rejected": -33.946571350097656,
1052
+ "loss": 0.2422,
1053
+ "rewards/accuracies": 0.8999999761581421,
1054
+ "rewards/chosen": 1.3528481721878052,
1055
+ "rewards/margins": 1.449000597000122,
1056
+ "rewards/rejected": -0.09615238755941391,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 1.77,
1061
+ "learning_rate": 3.423893017450324e-06,
1062
+ "logits/chosen": -1.9031528234481812,
1063
+ "logits/rejected": -1.9001739025115967,
1064
+ "logps/chosen": -29.16634178161621,
1065
+ "logps/rejected": -32.86220169067383,
1066
+ "loss": 0.2577,
1067
+ "rewards/accuracies": 0.8500000238418579,
1068
+ "rewards/chosen": 1.3648045063018799,
1069
+ "rewards/margins": 1.40032160282135,
1070
+ "rewards/rejected": -0.03551710769534111,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 1.79,
1075
+ "learning_rate": 3.3710052038048794e-06,
1076
+ "logits/chosen": -1.9385061264038086,
1077
+ "logits/rejected": -1.9385082721710205,
1078
+ "logps/chosen": -27.996692657470703,
1079
+ "logps/rejected": -30.74009132385254,
1080
+ "loss": 0.2085,
1081
+ "rewards/accuracies": 0.9375,
1082
+ "rewards/chosen": 1.7331215143203735,
1083
+ "rewards/margins": 1.7887967824935913,
1084
+ "rewards/rejected": -0.05567514896392822,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 1.82,
1089
+ "learning_rate": 3.3176699082935546e-06,
1090
+ "logits/chosen": -1.8473536968231201,
1091
+ "logits/rejected": -1.850669503211975,
1092
+ "logps/chosen": -32.4169921875,
1093
+ "logps/rejected": -31.4881649017334,
1094
+ "loss": 0.2437,
1095
+ "rewards/accuracies": 0.8500000238418579,
1096
+ "rewards/chosen": 1.6491864919662476,
1097
+ "rewards/margins": 1.6679694652557373,
1098
+ "rewards/rejected": -0.01878293789923191,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 1.82,
1103
+ "eval_logits/chosen": -2.178642988204956,
1104
+ "eval_logits/rejected": -2.173909902572632,
1105
+ "eval_logps/chosen": -33.59923553466797,
1106
+ "eval_logps/rejected": -37.22935485839844,
1107
+ "eval_loss": 0.4741733968257904,
1108
+ "eval_rewards/accuracies": 0.5747508406639099,
1109
+ "eval_rewards/chosen": 0.34825241565704346,
1110
+ "eval_rewards/margins": 0.1184391975402832,
1111
+ "eval_rewards/rejected": 0.22981324791908264,
1112
+ "eval_runtime": 145.9455,
1113
+ "eval_samples_per_second": 2.35,
1114
+ "eval_steps_per_second": 0.295,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 1.84,
1119
+ "learning_rate": 3.2639145321045933e-06,
1120
+ "logits/chosen": -1.9303743839263916,
1121
+ "logits/rejected": -1.9213898181915283,
1122
+ "logps/chosen": -34.50909423828125,
1123
+ "logps/rejected": -31.8797550201416,
1124
+ "loss": 0.2249,
1125
+ "rewards/accuracies": 0.9375,
1126
+ "rewards/chosen": 1.602341890335083,
1127
+ "rewards/margins": 1.5814943313598633,
1128
+ "rewards/rejected": 0.02084733545780182,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 1.87,
1133
+ "learning_rate": 3.2097666922441107e-06,
1134
+ "logits/chosen": -1.9459857940673828,
1135
+ "logits/rejected": -1.9470113515853882,
1136
+ "logps/chosen": -34.359092712402344,
1137
+ "logps/rejected": -33.046634674072266,
1138
+ "loss": 0.2173,
1139
+ "rewards/accuracies": 0.925000011920929,
1140
+ "rewards/chosen": 1.6610389947891235,
1141
+ "rewards/margins": 1.6375271081924438,
1142
+ "rewards/rejected": 0.02351185865700245,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 1.9,
1147
+ "learning_rate": 3.1552542073477554e-06,
1148
+ "logits/chosen": -1.962262511253357,
1149
+ "logits/rejected": -1.9598932266235352,
1150
+ "logps/chosen": -30.564239501953125,
1151
+ "logps/rejected": -32.997047424316406,
1152
+ "loss": 0.23,
1153
+ "rewards/accuracies": 0.9125000238418579,
1154
+ "rewards/chosen": 1.5992707014083862,
1155
+ "rewards/margins": 1.5993889570236206,
1156
+ "rewards/rejected": -0.00011831596202682704,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 1.92,
1161
+ "learning_rate": 3.100405083388799e-06,
1162
+ "logits/chosen": -1.9474010467529297,
1163
+ "logits/rejected": -1.952619194984436,
1164
+ "logps/chosen": -29.694049835205078,
1165
+ "logps/rejected": -33.282554626464844,
1166
+ "loss": 0.2069,
1167
+ "rewards/accuracies": 0.8999999761581421,
1168
+ "rewards/chosen": 1.6733427047729492,
1169
+ "rewards/margins": 1.7755340337753296,
1170
+ "rewards/rejected": -0.10219136625528336,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 1.95,
1175
+ "learning_rate": 3.0452474992899645e-06,
1176
+ "logits/chosen": -1.892179250717163,
1177
+ "logits/rejected": -1.8910681009292603,
1178
+ "logps/chosen": -31.141448974609375,
1179
+ "logps/rejected": -34.69000244140625,
1180
+ "loss": 0.24,
1181
+ "rewards/accuracies": 0.8999999761581421,
1182
+ "rewards/chosen": 1.5868273973464966,
1183
+ "rewards/margins": 1.5900630950927734,
1184
+ "rewards/rejected": -0.0032357454765588045,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 1.97,
1189
+ "learning_rate": 2.989809792446417e-06,
1190
+ "logits/chosen": -1.7668031454086304,
1191
+ "logits/rejected": -1.7617915868759155,
1192
+ "logps/chosen": -33.82783889770508,
1193
+ "logps/rejected": -35.33191680908203,
1194
+ "loss": 0.2193,
1195
+ "rewards/accuracies": 0.8999999761581421,
1196
+ "rewards/chosen": 1.7364822626113892,
1197
+ "rewards/margins": 1.8352861404418945,
1198
+ "rewards/rejected": -0.0988037958741188,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 2.0,
1203
+ "learning_rate": 2.9341204441673267e-06,
1204
+ "logits/chosen": -1.9073164463043213,
1205
+ "logits/rejected": -1.9110107421875,
1206
+ "logps/chosen": -33.157711029052734,
1207
+ "logps/rejected": -33.576969146728516,
1208
+ "loss": 0.2221,
1209
+ "rewards/accuracies": 0.8708333969116211,
1210
+ "rewards/chosen": 1.6447594165802002,
1211
+ "rewards/margins": 1.6304632425308228,
1212
+ "rewards/rejected": 0.014296052046120167,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 2.03,
1217
+ "learning_rate": 2.878208065043501e-06,
1218
+ "logits/chosen": -1.8515024185180664,
1219
+ "logits/rejected": -1.8497060537338257,
1220
+ "logps/chosen": -31.297021865844727,
1221
+ "logps/rejected": -35.01959991455078,
1222
+ "loss": 0.1312,
1223
+ "rewards/accuracies": 0.949999988079071,
1224
+ "rewards/chosen": 2.0836594104766846,
1225
+ "rewards/margins": 2.575618028640747,
1226
+ "rewards/rejected": -0.4919588565826416,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 2.05,
1231
+ "learning_rate": 2.8221013802485974e-06,
1232
+ "logits/chosen": -1.899592638015747,
1233
+ "logits/rejected": -1.8983027935028076,
1234
+ "logps/chosen": -30.751636505126953,
1235
+ "logps/rejected": -33.40520477294922,
1236
+ "loss": 0.1431,
1237
+ "rewards/accuracies": 0.9375,
1238
+ "rewards/chosen": 2.0992822647094727,
1239
+ "rewards/margins": 2.354191541671753,
1240
+ "rewards/rejected": -0.25490933656692505,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 2.08,
1245
+ "learning_rate": 2.76582921478147e-06,
1246
+ "logits/chosen": -1.83371901512146,
1247
+ "logits/rejected": -1.8279550075531006,
1248
+ "logps/chosen": -31.871252059936523,
1249
+ "logps/rejected": -31.73343849182129,
1250
+ "loss": 0.1567,
1251
+ "rewards/accuracies": 0.925000011920929,
1252
+ "rewards/chosen": 2.0094635486602783,
1253
+ "rewards/margins": 2.224806547164917,
1254
+ "rewards/rejected": -0.21534299850463867,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 2.08,
1259
+ "eval_logits/chosen": -2.1802878379821777,
1260
+ "eval_logits/rejected": -2.1755456924438477,
1261
+ "eval_logps/chosen": -33.54963302612305,
1262
+ "eval_logps/rejected": -37.20660400390625,
1263
+ "eval_loss": 0.4695200026035309,
1264
+ "eval_rewards/accuracies": 0.5826411843299866,
1265
+ "eval_rewards/chosen": 0.3879339396953583,
1266
+ "eval_rewards/margins": 0.1399237960577011,
1267
+ "eval_rewards/rejected": 0.24801018834114075,
1268
+ "eval_runtime": 145.8879,
1269
+ "eval_samples_per_second": 2.351,
1270
+ "eval_steps_per_second": 0.295,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 2.1,
1275
+ "learning_rate": 2.7094204786572254e-06,
1276
+ "logits/chosen": -1.9300823211669922,
1277
+ "logits/rejected": -1.9375168085098267,
1278
+ "logps/chosen": -29.62691307067871,
1279
+ "logps/rejected": -34.63849639892578,
1280
+ "loss": 0.1613,
1281
+ "rewards/accuracies": 0.949999988079071,
1282
+ "rewards/chosen": 2.0010743141174316,
1283
+ "rewards/margins": 2.3072001934051514,
1284
+ "rewards/rejected": -0.3061259686946869,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 2.13,
1289
+ "learning_rate": 2.6529041520546072e-06,
1290
+ "logits/chosen": -1.9036821126937866,
1291
+ "logits/rejected": -1.9064573049545288,
1292
+ "logps/chosen": -30.34005355834961,
1293
+ "logps/rejected": -33.30119705200195,
1294
+ "loss": 0.1999,
1295
+ "rewards/accuracies": 0.8999999761581421,
1296
+ "rewards/chosen": 1.784820556640625,
1297
+ "rewards/margins": 1.8419349193572998,
1298
+ "rewards/rejected": -0.05711423233151436,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 2.16,
1303
+ "learning_rate": 2.5963092704273302e-06,
1304
+ "logits/chosen": -1.805069923400879,
1305
+ "logits/rejected": -1.8091821670532227,
1306
+ "logps/chosen": -30.127094268798828,
1307
+ "logps/rejected": -35.30133056640625,
1308
+ "loss": 0.1557,
1309
+ "rewards/accuracies": 0.9750000238418579,
1310
+ "rewards/chosen": 1.9714915752410889,
1311
+ "rewards/margins": 2.2844126224517822,
1312
+ "rewards/rejected": -0.31292054057121277,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 2.18,
1317
+ "learning_rate": 2.53966490958702e-06,
1318
+ "logits/chosen": -1.8699356317520142,
1319
+ "logits/rejected": -1.865962028503418,
1320
+ "logps/chosen": -30.581180572509766,
1321
+ "logps/rejected": -33.02351379394531,
1322
+ "loss": 0.1821,
1323
+ "rewards/accuracies": 0.887499988079071,
1324
+ "rewards/chosen": 1.9099775552749634,
1325
+ "rewards/margins": 1.9330689907073975,
1326
+ "rewards/rejected": -0.02309143915772438,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 2.21,
1331
+ "learning_rate": 2.4830001707654135e-06,
1332
+ "logits/chosen": -1.9513721466064453,
1333
+ "logits/rejected": -1.9533140659332275,
1334
+ "logps/chosen": -30.263479232788086,
1335
+ "logps/rejected": -36.080474853515625,
1336
+ "loss": 0.1402,
1337
+ "rewards/accuracies": 0.987500011920929,
1338
+ "rewards/chosen": 2.0094857215881348,
1339
+ "rewards/margins": 2.344318389892578,
1340
+ "rewards/rejected": -0.3348326086997986,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 2.23,
1345
+ "learning_rate": 2.4263441656635054e-06,
1346
+ "logits/chosen": -1.7577623128890991,
1347
+ "logits/rejected": -1.7520900964736938,
1348
+ "logps/chosen": -33.64806365966797,
1349
+ "logps/rejected": -33.127098083496094,
1350
+ "loss": 0.157,
1351
+ "rewards/accuracies": 0.925000011920929,
1352
+ "rewards/chosen": 2.106886625289917,
1353
+ "rewards/margins": 2.2950429916381836,
1354
+ "rewards/rejected": -0.18815645575523376,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 2.26,
1359
+ "learning_rate": 2.3697260014953107e-06,
1360
+ "logits/chosen": -1.8190982341766357,
1361
+ "logits/rejected": -1.8191255331039429,
1362
+ "logps/chosen": -33.16907501220703,
1363
+ "logps/rejected": -35.244049072265625,
1364
+ "loss": 0.1428,
1365
+ "rewards/accuracies": 0.925000011920929,
1366
+ "rewards/chosen": 2.138986110687256,
1367
+ "rewards/margins": 2.457213878631592,
1368
+ "rewards/rejected": -0.31822749972343445,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 2.29,
1373
+ "learning_rate": 2.3131747660339396e-06,
1374
+ "logits/chosen": -1.867693543434143,
1375
+ "logits/rejected": -1.8564653396606445,
1376
+ "logps/chosen": -31.62196922302246,
1377
+ "logps/rejected": -33.34123992919922,
1378
+ "loss": 0.1484,
1379
+ "rewards/accuracies": 0.9750000238418579,
1380
+ "rewards/chosen": 1.9876899719238281,
1381
+ "rewards/margins": 2.3230578899383545,
1382
+ "rewards/rejected": -0.3353681266307831,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 2.31,
1387
+ "learning_rate": 2.256719512667651e-06,
1388
+ "logits/chosen": -1.9699180126190186,
1389
+ "logits/rejected": -1.9743999242782593,
1390
+ "logps/chosen": -30.770462036132812,
1391
+ "logps/rejected": -32.943992614746094,
1392
+ "loss": 0.1449,
1393
+ "rewards/accuracies": 0.9375,
1394
+ "rewards/chosen": 2.0475802421569824,
1395
+ "rewards/margins": 2.453592300415039,
1396
+ "rewards/rejected": -0.40601205825805664,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 2.34,
1401
+ "learning_rate": 2.2003892454735786e-06,
1402
+ "logits/chosen": -1.8890403509140015,
1403
+ "logits/rejected": -1.881980538368225,
1404
+ "logps/chosen": -32.13774490356445,
1405
+ "logps/rejected": -32.89883041381836,
1406
+ "loss": 0.131,
1407
+ "rewards/accuracies": 0.9750000238418579,
1408
+ "rewards/chosen": 2.173067808151245,
1409
+ "rewards/margins": 2.5704257488250732,
1410
+ "rewards/rejected": -0.397358238697052,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 2.34,
1415
+ "eval_logits/chosen": -2.1704888343811035,
1416
+ "eval_logits/rejected": -2.165771961212158,
1417
+ "eval_logps/chosen": -33.59296798706055,
1418
+ "eval_logps/rejected": -37.2408447265625,
1419
+ "eval_loss": 0.4716451168060303,
1420
+ "eval_rewards/accuracies": 0.5859634280204773,
1421
+ "eval_rewards/chosen": 0.3532681465148926,
1422
+ "eval_rewards/margins": 0.1326470971107483,
1423
+ "eval_rewards/rejected": 0.22062106430530548,
1424
+ "eval_runtime": 145.952,
1425
+ "eval_samples_per_second": 2.35,
1426
+ "eval_steps_per_second": 0.295,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 2.36,
1431
+ "learning_rate": 2.1442129043167877e-06,
1432
+ "logits/chosen": -1.8803346157073975,
1433
+ "logits/rejected": -1.8807852268218994,
1434
+ "logps/chosen": -28.69057846069336,
1435
+ "logps/rejected": -35.19638442993164,
1436
+ "loss": 0.1531,
1437
+ "rewards/accuracies": 0.949999988079071,
1438
+ "rewards/chosen": 1.9812660217285156,
1439
+ "rewards/margins": 2.3866193294525146,
1440
+ "rewards/rejected": -0.4053533673286438,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 2.39,
1445
+ "learning_rate": 2.088219349982323e-06,
1446
+ "logits/chosen": -1.8385648727416992,
1447
+ "logits/rejected": -1.8306169509887695,
1448
+ "logps/chosen": -29.56528091430664,
1449
+ "logps/rejected": -34.04883575439453,
1450
+ "loss": 0.1693,
1451
+ "rewards/accuracies": 0.9375,
1452
+ "rewards/chosen": 1.8189325332641602,
1453
+ "rewards/margins": 2.2009072303771973,
1454
+ "rewards/rejected": -0.3819747567176819,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 2.42,
1459
+ "learning_rate": 2.0324373493478803e-06,
1460
+ "logits/chosen": -2.004300117492676,
1461
+ "logits/rejected": -2.004765033721924,
1462
+ "logps/chosen": -27.645904541015625,
1463
+ "logps/rejected": -33.17176818847656,
1464
+ "loss": 0.1901,
1465
+ "rewards/accuracies": 0.949999988079071,
1466
+ "rewards/chosen": 1.8809070587158203,
1467
+ "rewards/margins": 2.126433849334717,
1468
+ "rewards/rejected": -0.24552695453166962,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 2.44,
1473
+ "learning_rate": 1.976895560604729e-06,
1474
+ "logits/chosen": -1.883927583694458,
1475
+ "logits/rejected": -1.893393874168396,
1476
+ "logps/chosen": -31.66170883178711,
1477
+ "logps/rejected": -33.47935104370117,
1478
+ "loss": 0.1308,
1479
+ "rewards/accuracies": 0.9624999761581421,
1480
+ "rewards/chosen": 2.2106852531433105,
1481
+ "rewards/margins": 2.602447509765625,
1482
+ "rewards/rejected": -0.3917620778083801,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 2.47,
1487
+ "learning_rate": 1.921622518534466e-06,
1488
+ "logits/chosen": -1.9267995357513428,
1489
+ "logits/rejected": -1.9306179285049438,
1490
+ "logps/chosen": -28.294620513916016,
1491
+ "logps/rejected": -32.00605010986328,
1492
+ "loss": 0.1628,
1493
+ "rewards/accuracies": 0.9624999761581421,
1494
+ "rewards/chosen": 1.882483720779419,
1495
+ "rewards/margins": 2.0984606742858887,
1496
+ "rewards/rejected": -0.21597695350646973,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 2.49,
1501
+ "learning_rate": 1.8666466198491794e-06,
1502
+ "logits/chosen": -1.9250777959823608,
1503
+ "logits/rejected": -1.921263337135315,
1504
+ "logps/chosen": -31.393346786499023,
1505
+ "logps/rejected": -34.266326904296875,
1506
+ "loss": 0.1531,
1507
+ "rewards/accuracies": 0.949999988079071,
1508
+ "rewards/chosen": 2.145495891571045,
1509
+ "rewards/margins": 2.4334449768066406,
1510
+ "rewards/rejected": -0.28794899582862854,
1511
+ "step": 960
1512
+ },
1513
+ {
1514
+ "epoch": 2.52,
1515
+ "learning_rate": 1.8119961086025376e-06,
1516
+ "logits/chosen": -1.8407471179962158,
1517
+ "logits/rejected": -1.84335458278656,
1518
+ "logps/chosen": -30.177082061767578,
1519
+ "logps/rejected": -35.5606803894043,
1520
+ "loss": 0.15,
1521
+ "rewards/accuracies": 0.9750000238418579,
1522
+ "rewards/chosen": 2.1085331439971924,
1523
+ "rewards/margins": 2.411801815032959,
1524
+ "rewards/rejected": -0.3032683730125427,
1525
+ "step": 970
1526
+ },
1527
+ {
1528
+ "epoch": 2.55,
1529
+ "learning_rate": 1.7576990616793139e-06,
1530
+ "logits/chosen": -1.8759149312973022,
1531
+ "logits/rejected": -1.8695703744888306,
1532
+ "logps/chosen": -32.879905700683594,
1533
+ "logps/rejected": -36.688812255859375,
1534
+ "loss": 0.1591,
1535
+ "rewards/accuracies": 0.925000011920929,
1536
+ "rewards/chosen": 2.020131826400757,
1537
+ "rewards/margins": 2.3616156578063965,
1538
+ "rewards/rejected": -0.34148353338241577,
1539
+ "step": 980
1540
+ },
1541
+ {
1542
+ "epoch": 2.57,
1543
+ "learning_rate": 1.7037833743707892e-06,
1544
+ "logits/chosen": -1.8502181768417358,
1545
+ "logits/rejected": -1.845171332359314,
1546
+ "logps/chosen": -28.556514739990234,
1547
+ "logps/rejected": -36.38850784301758,
1548
+ "loss": 0.167,
1549
+ "rewards/accuracies": 0.9125000238418579,
1550
+ "rewards/chosen": 1.9571492671966553,
1551
+ "rewards/margins": 2.2760462760925293,
1552
+ "rewards/rejected": -0.31889697909355164,
1553
+ "step": 990
1554
+ },
1555
+ {
1556
+ "epoch": 2.6,
1557
+ "learning_rate": 1.6502767460434588e-06,
1558
+ "logits/chosen": -1.8279926776885986,
1559
+ "logits/rejected": -1.8174269199371338,
1560
+ "logps/chosen": -29.500579833984375,
1561
+ "logps/rejected": -29.74460220336914,
1562
+ "loss": 0.1784,
1563
+ "rewards/accuracies": 0.949999988079071,
1564
+ "rewards/chosen": 1.798485517501831,
1565
+ "rewards/margins": 1.9143791198730469,
1566
+ "rewards/rejected": -0.11589355766773224,
1567
+ "step": 1000
1568
+ },
1569
+ {
1570
+ "epoch": 2.6,
1571
+ "eval_logits/chosen": -2.169346570968628,
1572
+ "eval_logits/rejected": -2.1646294593811035,
1573
+ "eval_logps/chosen": -33.57487106323242,
1574
+ "eval_logps/rejected": -37.250404357910156,
1575
+ "eval_loss": 0.4672938883304596,
1576
+ "eval_rewards/accuracies": 0.5859634280204773,
1577
+ "eval_rewards/chosen": 0.3677443265914917,
1578
+ "eval_rewards/margins": 0.15477292239665985,
1579
+ "eval_rewards/rejected": 0.21297141909599304,
1580
+ "eval_runtime": 145.9529,
1581
+ "eval_samples_per_second": 2.35,
1582
+ "eval_steps_per_second": 0.295,
1583
+ "step": 1000
1584
+ },
1585
+ {
1586
+ "epoch": 2.62,
1587
+ "learning_rate": 1.5972066659083796e-06,
1588
+ "logits/chosen": -1.938071846961975,
1589
+ "logits/rejected": -1.9374967813491821,
1590
+ "logps/chosen": -29.38279151916504,
1591
+ "logps/rejected": -30.51906394958496,
1592
+ "loss": 0.1659,
1593
+ "rewards/accuracies": 0.9125000238418579,
1594
+ "rewards/chosen": 1.953122854232788,
1595
+ "rewards/margins": 2.1615920066833496,
1596
+ "rewards/rejected": -0.20846888422966003,
1597
+ "step": 1010
1598
+ },
1599
+ {
1600
+ "epoch": 2.65,
1601
+ "learning_rate": 1.5446003988985041e-06,
1602
+ "logits/chosen": -1.9777361154556274,
1603
+ "logits/rejected": -1.9785385131835938,
1604
+ "logps/chosen": -29.455398559570312,
1605
+ "logps/rejected": -31.60211753845215,
1606
+ "loss": 0.1457,
1607
+ "rewards/accuracies": 0.9624999761581421,
1608
+ "rewards/chosen": 1.9905421733856201,
1609
+ "rewards/margins": 2.3395895957946777,
1610
+ "rewards/rejected": -0.3490474820137024,
1611
+ "step": 1020
1612
+ },
1613
+ {
1614
+ "epoch": 2.68,
1615
+ "learning_rate": 1.4924849716612211e-06,
1616
+ "logits/chosen": -1.948656439781189,
1617
+ "logits/rejected": -1.951843500137329,
1618
+ "logps/chosen": -29.8292236328125,
1619
+ "logps/rejected": -28.163440704345703,
1620
+ "loss": 0.1712,
1621
+ "rewards/accuracies": 0.949999988079071,
1622
+ "rewards/chosen": 1.960677146911621,
1623
+ "rewards/margins": 2.1044540405273438,
1624
+ "rewards/rejected": -0.14377683401107788,
1625
+ "step": 1030
1626
+ },
1627
+ {
1628
+ "epoch": 2.7,
1629
+ "learning_rate": 1.440887158673332e-06,
1630
+ "logits/chosen": -1.9582103490829468,
1631
+ "logits/rejected": -1.9505630731582642,
1632
+ "logps/chosen": -28.690776824951172,
1633
+ "logps/rejected": -33.91931915283203,
1634
+ "loss": 0.1486,
1635
+ "rewards/accuracies": 0.9375,
1636
+ "rewards/chosen": 1.9896939992904663,
1637
+ "rewards/margins": 2.3249640464782715,
1638
+ "rewards/rejected": -0.33527034521102905,
1639
+ "step": 1040
1640
+ },
1641
+ {
1642
+ "epoch": 2.73,
1643
+ "learning_rate": 1.3898334684855647e-06,
1644
+ "logits/chosen": -1.8943002223968506,
1645
+ "logits/rejected": -1.905225157737732,
1646
+ "logps/chosen": -30.873615264892578,
1647
+ "logps/rejected": -32.47500228881836,
1648
+ "loss": 0.1427,
1649
+ "rewards/accuracies": 0.9375,
1650
+ "rewards/chosen": 2.143594264984131,
1651
+ "rewards/margins": 2.3275885581970215,
1652
+ "rewards/rejected": -0.18399406969547272,
1653
+ "step": 1050
1654
+ },
1655
+ {
1656
+ "epoch": 2.75,
1657
+ "learning_rate": 1.3393501301037245e-06,
1658
+ "logits/chosen": -1.9732818603515625,
1659
+ "logits/rejected": -1.9640705585479736,
1660
+ "logps/chosen": -30.988611221313477,
1661
+ "logps/rejected": -36.728416442871094,
1662
+ "loss": 0.171,
1663
+ "rewards/accuracies": 0.8999999761581421,
1664
+ "rewards/chosen": 2.009733200073242,
1665
+ "rewards/margins": 2.327359199523926,
1666
+ "rewards/rejected": -0.31762614846229553,
1667
+ "step": 1060
1668
+ },
1669
+ {
1670
+ "epoch": 2.78,
1671
+ "learning_rate": 1.2894630795134454e-06,
1672
+ "logits/chosen": -1.8804638385772705,
1673
+ "logits/rejected": -1.8824098110198975,
1674
+ "logps/chosen": -33.24834442138672,
1675
+ "logps/rejected": -33.089820861816406,
1676
+ "loss": 0.1391,
1677
+ "rewards/accuracies": 0.949999988079071,
1678
+ "rewards/chosen": 2.1927542686462402,
1679
+ "rewards/margins": 2.506317615509033,
1680
+ "rewards/rejected": -0.31356340646743774,
1681
+ "step": 1070
1682
+ },
1683
+ {
1684
+ "epoch": 2.81,
1685
+ "learning_rate": 1.2401979463554984e-06,
1686
+ "logits/chosen": -2.0114994049072266,
1687
+ "logits/rejected": -2.012619972229004,
1688
+ "logps/chosen": -30.447467803955078,
1689
+ "logps/rejected": -34.64774703979492,
1690
+ "loss": 0.1376,
1691
+ "rewards/accuracies": 0.987500011920929,
1692
+ "rewards/chosen": 2.088136911392212,
1693
+ "rewards/margins": 2.513786792755127,
1694
+ "rewards/rejected": -0.42564988136291504,
1695
+ "step": 1080
1696
+ },
1697
+ {
1698
+ "epoch": 2.83,
1699
+ "learning_rate": 1.1915800407584705e-06,
1700
+ "logits/chosen": -1.9806941747665405,
1701
+ "logits/rejected": -1.9849655628204346,
1702
+ "logps/chosen": -28.488088607788086,
1703
+ "logps/rejected": -33.84292984008789,
1704
+ "loss": 0.1691,
1705
+ "rewards/accuracies": 0.949999988079071,
1706
+ "rewards/chosen": 1.9241825342178345,
1707
+ "rewards/margins": 2.160259485244751,
1708
+ "rewards/rejected": -0.23607663810253143,
1709
+ "step": 1090
1710
+ },
1711
+ {
1712
+ "epoch": 2.86,
1713
+ "learning_rate": 1.1436343403356019e-06,
1714
+ "logits/chosen": -1.9700733423233032,
1715
+ "logits/rejected": -1.9752953052520752,
1716
+ "logps/chosen": -31.235626220703125,
1717
+ "logps/rejected": -30.409204483032227,
1718
+ "loss": 0.1956,
1719
+ "rewards/accuracies": 0.8999999761581421,
1720
+ "rewards/chosen": 1.8667011260986328,
1721
+ "rewards/margins": 1.8727245330810547,
1722
+ "rewards/rejected": -0.0060233683325350285,
1723
+ "step": 1100
1724
+ },
1725
+ {
1726
+ "epoch": 2.86,
1727
+ "eval_logits/chosen": -2.1669461727142334,
1728
+ "eval_logits/rejected": -2.162247896194458,
1729
+ "eval_logps/chosen": -33.58707809448242,
1730
+ "eval_logps/rejected": -37.24416732788086,
1731
+ "eval_loss": 0.47056397795677185,
1732
+ "eval_rewards/accuracies": 0.5859634280204773,
1733
+ "eval_rewards/chosen": 0.3579804301261902,
1734
+ "eval_rewards/margins": 0.14001740515232086,
1735
+ "eval_rewards/rejected": 0.21796302497386932,
1736
+ "eval_runtime": 145.9563,
1737
+ "eval_samples_per_second": 2.35,
1738
+ "eval_steps_per_second": 0.295,
1739
+ "step": 1100
1740
+ },
1741
+ {
1742
+ "epoch": 2.88,
1743
+ "learning_rate": 1.0963854773524548e-06,
1744
+ "logits/chosen": -1.9610811471939087,
1745
+ "logits/rejected": -1.9610563516616821,
1746
+ "logps/chosen": -30.302623748779297,
1747
+ "logps/rejected": -31.24424171447754,
1748
+ "loss": 0.1495,
1749
+ "rewards/accuracies": 0.949999988079071,
1750
+ "rewards/chosen": 2.1100215911865234,
1751
+ "rewards/margins": 2.2926793098449707,
1752
+ "rewards/rejected": -0.18265783786773682,
1753
+ "step": 1110
1754
+ },
1755
+ {
1756
+ "epoch": 2.91,
1757
+ "learning_rate": 1.049857726072005e-06,
1758
+ "logits/chosen": -1.8055435419082642,
1759
+ "logits/rejected": -1.8071558475494385,
1760
+ "logps/chosen": -31.964313507080078,
1761
+ "logps/rejected": -33.263916015625,
1762
+ "loss": 0.1722,
1763
+ "rewards/accuracies": 0.887499988079071,
1764
+ "rewards/chosen": 2.08052921295166,
1765
+ "rewards/margins": 2.258647918701172,
1766
+ "rewards/rejected": -0.17811879515647888,
1767
+ "step": 1120
1768
+ },
1769
+ {
1770
+ "epoch": 2.94,
1771
+ "learning_rate": 1.0040749902836508e-06,
1772
+ "logits/chosen": -1.8441641330718994,
1773
+ "logits/rejected": -1.8414533138275146,
1774
+ "logps/chosen": -28.680904388427734,
1775
+ "logps/rejected": -31.291534423828125,
1776
+ "loss": 0.1972,
1777
+ "rewards/accuracies": 0.9125000238418579,
1778
+ "rewards/chosen": 1.8646504878997803,
1779
+ "rewards/margins": 1.9652849435806274,
1780
+ "rewards/rejected": -0.10063423216342926,
1781
+ "step": 1130
1782
+ },
1783
+ {
1784
+ "epoch": 2.96,
1785
+ "learning_rate": 9.59060791022566e-07,
1786
+ "logits/chosen": -1.979723334312439,
1787
+ "logits/rejected": -1.9741865396499634,
1788
+ "logps/chosen": -30.334341049194336,
1789
+ "logps/rejected": -33.13202667236328,
1790
+ "loss": 0.1522,
1791
+ "rewards/accuracies": 0.949999988079071,
1792
+ "rewards/chosen": 2.1643624305725098,
1793
+ "rewards/margins": 2.3067753314971924,
1794
+ "rewards/rejected": -0.14241299033164978,
1795
+ "step": 1140
1796
+ },
1797
+ {
1798
+ "epoch": 2.99,
1799
+ "learning_rate": 9.148382544856885e-07,
1800
+ "logits/chosen": -1.8336684703826904,
1801
+ "logits/rejected": -1.823862075805664,
1802
+ "logps/chosen": -30.90520668029785,
1803
+ "logps/rejected": -31.456531524658203,
1804
+ "loss": 0.1763,
1805
+ "rewards/accuracies": 0.9375,
1806
+ "rewards/chosen": 2.037224292755127,
1807
+ "rewards/margins": 2.1685280799865723,
1808
+ "rewards/rejected": -0.13130377233028412,
1809
+ "step": 1150
1810
+ },
1811
+ {
1812
+ "epoch": 3.01,
1813
+ "learning_rate": 8.714301001505568e-07,
1814
+ "logits/chosen": -1.9085521697998047,
1815
+ "logits/rejected": -1.908399224281311,
1816
+ "logps/chosen": -31.05959701538086,
1817
+ "logps/rejected": -31.56801414489746,
1818
+ "loss": 0.1469,
1819
+ "rewards/accuracies": 0.9624999761581421,
1820
+ "rewards/chosen": 2.09771990776062,
1821
+ "rewards/margins": 2.3909716606140137,
1822
+ "rewards/rejected": -0.2932516634464264,
1823
+ "step": 1160
1824
+ },
1825
+ {
1826
+ "epoch": 3.04,
1827
+ "learning_rate": 8.288586291031025e-07,
1828
+ "logits/chosen": -1.9817787408828735,
1829
+ "logits/rejected": -1.9764562845230103,
1830
+ "logps/chosen": -30.92386817932129,
1831
+ "logps/rejected": -33.19076919555664,
1832
+ "loss": 0.1496,
1833
+ "rewards/accuracies": 0.9375,
1834
+ "rewards/chosen": 2.0816802978515625,
1835
+ "rewards/margins": 2.2813220024108887,
1836
+ "rewards/rejected": -0.19964155554771423,
1837
+ "step": 1170
1838
+ },
1839
+ {
1840
+ "epoch": 3.06,
1841
+ "learning_rate": 7.871457125803897e-07,
1842
+ "logits/chosen": -1.8389742374420166,
1843
+ "logits/rejected": -1.8466594219207764,
1844
+ "logps/chosen": -30.842212677001953,
1845
+ "logps/rejected": -32.71555709838867,
1846
+ "loss": 0.1498,
1847
+ "rewards/accuracies": 0.9624999761581421,
1848
+ "rewards/chosen": 1.9926265478134155,
1849
+ "rewards/margins": 2.2759203910827637,
1850
+ "rewards/rejected": -0.28329357504844666,
1851
+ "step": 1180
1852
+ },
1853
+ {
1854
+ "epoch": 3.09,
1855
+ "learning_rate": 7.463127807341966e-07,
1856
+ "logits/chosen": -1.9125378131866455,
1857
+ "logits/rejected": -1.9067026376724243,
1858
+ "logps/chosen": -29.305688858032227,
1859
+ "logps/rejected": -33.628910064697266,
1860
+ "loss": 0.1274,
1861
+ "rewards/accuracies": 0.9375,
1862
+ "rewards/chosen": 2.294342279434204,
1863
+ "rewards/margins": 2.570711612701416,
1864
+ "rewards/rejected": -0.27636927366256714,
1865
+ "step": 1190
1866
+ },
1867
+ {
1868
+ "epoch": 3.12,
1869
+ "learning_rate": 7.063808116212021e-07,
1870
+ "logits/chosen": -1.8545643091201782,
1871
+ "logits/rejected": -1.8566617965698242,
1872
+ "logps/chosen": -30.835662841796875,
1873
+ "logps/rejected": -33.26459503173828,
1874
+ "loss": 0.137,
1875
+ "rewards/accuracies": 0.9375,
1876
+ "rewards/chosen": 2.1238210201263428,
1877
+ "rewards/margins": 2.5274574756622314,
1878
+ "rewards/rejected": -0.403636634349823,
1879
+ "step": 1200
1880
+ },
1881
+ {
1882
+ "epoch": 3.12,
1883
+ "eval_logits/chosen": -2.167179584503174,
1884
+ "eval_logits/rejected": -2.1624755859375,
1885
+ "eval_logps/chosen": -33.572837829589844,
1886
+ "eval_logps/rejected": -37.24382400512695,
1887
+ "eval_loss": 0.4679512083530426,
1888
+ "eval_rewards/accuracies": 0.6063122749328613,
1889
+ "eval_rewards/chosen": 0.36937177181243896,
1890
+ "eval_rewards/margins": 0.15113388001918793,
1891
+ "eval_rewards/rejected": 0.21823787689208984,
1892
+ "eval_runtime": 145.8851,
1893
+ "eval_samples_per_second": 2.351,
1894
+ "eval_steps_per_second": 0.295,
1895
+ "step": 1200
1896
+ },
1897
+ {
1898
+ "epoch": 3.14,
1899
+ "learning_rate": 6.673703204254348e-07,
1900
+ "logits/chosen": -1.7856515645980835,
1901
+ "logits/rejected": -1.784690260887146,
1902
+ "logps/chosen": -33.143531799316406,
1903
+ "logps/rejected": -33.10202407836914,
1904
+ "loss": 0.1273,
1905
+ "rewards/accuracies": 0.9624999761581421,
1906
+ "rewards/chosen": 2.3271608352661133,
1907
+ "rewards/margins": 2.641291618347168,
1908
+ "rewards/rejected": -0.3141304850578308,
1909
+ "step": 1210
1910
+ },
1911
+ {
1912
+ "epoch": 3.17,
1913
+ "learning_rate": 6.293013489185315e-07,
1914
+ "logits/chosen": -1.9604772329330444,
1915
+ "logits/rejected": -1.9546642303466797,
1916
+ "logps/chosen": -28.979385375976562,
1917
+ "logps/rejected": -33.37938690185547,
1918
+ "loss": 0.1352,
1919
+ "rewards/accuracies": 0.949999988079071,
1920
+ "rewards/chosen": 2.1794400215148926,
1921
+ "rewards/margins": 2.5801169872283936,
1922
+ "rewards/rejected": -0.40067729353904724,
1923
+ "step": 1220
1924
+ },
1925
+ {
1926
+ "epoch": 3.19,
1927
+ "learning_rate": 5.921934551632086e-07,
1928
+ "logits/chosen": -1.8098831176757812,
1929
+ "logits/rejected": -1.798370361328125,
1930
+ "logps/chosen": -31.50105857849121,
1931
+ "logps/rejected": -33.25225067138672,
1932
+ "loss": 0.1311,
1933
+ "rewards/accuracies": 0.949999988079071,
1934
+ "rewards/chosen": 2.3076980113983154,
1935
+ "rewards/margins": 2.5944409370422363,
1936
+ "rewards/rejected": -0.2867427468299866,
1937
+ "step": 1230
1938
+ },
1939
+ {
1940
+ "epoch": 3.22,
1941
+ "learning_rate": 5.560657034652405e-07,
1942
+ "logits/chosen": -1.9010969400405884,
1943
+ "logits/rejected": -1.8942989110946655,
1944
+ "logps/chosen": -28.258697509765625,
1945
+ "logps/rejected": -29.34195327758789,
1946
+ "loss": 0.1603,
1947
+ "rewards/accuracies": 0.949999988079071,
1948
+ "rewards/chosen": 1.9532296657562256,
1949
+ "rewards/margins": 2.33274507522583,
1950
+ "rewards/rejected": -0.3795151114463806,
1951
+ "step": 1240
1952
+ },
1953
+ {
1954
+ "epoch": 3.25,
1955
+ "learning_rate": 5.2093665457911e-07,
1956
+ "logits/chosen": -1.9308841228485107,
1957
+ "logits/rejected": -1.9373613595962524,
1958
+ "logps/chosen": -32.524845123291016,
1959
+ "logps/rejected": -31.774621963500977,
1960
+ "loss": 0.1315,
1961
+ "rewards/accuracies": 0.9375,
1962
+ "rewards/chosen": 2.2964370250701904,
1963
+ "rewards/margins": 2.5014843940734863,
1964
+ "rewards/rejected": -0.20504704117774963,
1965
+ "step": 1250
1966
+ },
1967
+ {
1968
+ "epoch": 3.27,
1969
+ "learning_rate": 4.868243561723535e-07,
1970
+ "logits/chosen": -1.9048112630844116,
1971
+ "logits/rejected": -1.9054410457611084,
1972
+ "logps/chosen": -30.393224716186523,
1973
+ "logps/rejected": -33.286048889160156,
1974
+ "loss": 0.1317,
1975
+ "rewards/accuracies": 0.9624999761581421,
1976
+ "rewards/chosen": 2.1082680225372314,
1977
+ "rewards/margins": 2.485210418701172,
1978
+ "rewards/rejected": -0.37694239616394043,
1979
+ "step": 1260
1980
+ },
1981
+ {
1982
+ "epoch": 3.3,
1983
+ "learning_rate": 4.537463335535161e-07,
1984
+ "logits/chosen": -1.846727728843689,
1985
+ "logits/rejected": -1.8463557958602905,
1986
+ "logps/chosen": -30.23922348022461,
1987
+ "logps/rejected": -33.92576217651367,
1988
+ "loss": 0.1224,
1989
+ "rewards/accuracies": 0.9375,
1990
+ "rewards/chosen": 2.2812910079956055,
1991
+ "rewards/margins": 2.6054136753082275,
1992
+ "rewards/rejected": -0.3241223692893982,
1993
+ "step": 1270
1994
+ },
1995
+ {
1996
+ "epoch": 3.32,
1997
+ "learning_rate": 4.217195806684629e-07,
1998
+ "logits/chosen": -1.730064034461975,
1999
+ "logits/rejected": -1.7253376245498657,
2000
+ "logps/chosen": -32.30054473876953,
2001
+ "logps/rejected": -31.246471405029297,
2002
+ "loss": 0.1154,
2003
+ "rewards/accuracies": 0.9375,
2004
+ "rewards/chosen": 2.349520206451416,
2005
+ "rewards/margins": 2.605508327484131,
2006
+ "rewards/rejected": -0.2559877932071686,
2007
+ "step": 1280
2008
+ },
2009
+ {
2010
+ "epoch": 3.35,
2011
+ "learning_rate": 3.907605513696808e-07,
2012
+ "logits/chosen": -1.9358218908309937,
2013
+ "logits/rejected": -1.9203758239746094,
2014
+ "logps/chosen": -31.7402400970459,
2015
+ "logps/rejected": -35.388179779052734,
2016
+ "loss": 0.1234,
2017
+ "rewards/accuracies": 0.987500011920929,
2018
+ "rewards/chosen": 2.085869550704956,
2019
+ "rewards/margins": 2.5841598510742188,
2020
+ "rewards/rejected": -0.4982902407646179,
2021
+ "step": 1290
2022
+ },
2023
+ {
2024
+ "epoch": 3.38,
2025
+ "learning_rate": 3.6088515096305675e-07,
2026
+ "logits/chosen": -1.879227638244629,
2027
+ "logits/rejected": -1.8840656280517578,
2028
+ "logps/chosen": -30.69500732421875,
2029
+ "logps/rejected": -36.70278549194336,
2030
+ "loss": 0.1211,
2031
+ "rewards/accuracies": 0.987500011920929,
2032
+ "rewards/chosen": 2.2580931186676025,
2033
+ "rewards/margins": 2.7377448081970215,
2034
+ "rewards/rejected": -0.4796522259712219,
2035
+ "step": 1300
2036
+ },
2037
+ {
2038
+ "epoch": 3.38,
2039
+ "eval_logits/chosen": -2.1668777465820312,
2040
+ "eval_logits/rejected": -2.1621785163879395,
2041
+ "eval_logps/chosen": -33.58045959472656,
2042
+ "eval_logps/rejected": -37.23927688598633,
2043
+ "eval_loss": 0.47049835324287415,
2044
+ "eval_rewards/accuracies": 0.5917773842811584,
2045
+ "eval_rewards/chosen": 0.3632733225822449,
2046
+ "eval_rewards/margins": 0.14139726758003235,
2047
+ "eval_rewards/rejected": 0.22187604010105133,
2048
+ "eval_runtime": 145.9507,
2049
+ "eval_samples_per_second": 2.35,
2050
+ "eval_steps_per_second": 0.295,
2051
+ "step": 1300
2052
+ },
2053
+ {
2054
+ "epoch": 3.4,
2055
+ "learning_rate": 3.321087280364757e-07,
2056
+ "logits/chosen": -1.8520374298095703,
2057
+ "logits/rejected": -1.8522069454193115,
2058
+ "logps/chosen": -33.18568801879883,
2059
+ "logps/rejected": -37.46868896484375,
2060
+ "loss": 0.134,
2061
+ "rewards/accuracies": 0.949999988079071,
2062
+ "rewards/chosen": 2.446664333343506,
2063
+ "rewards/margins": 2.7543227672576904,
2064
+ "rewards/rejected": -0.30765873193740845,
2065
+ "step": 1310
2066
+ },
2067
+ {
2068
+ "epoch": 3.43,
2069
+ "learning_rate": 3.044460665744284e-07,
2070
+ "logits/chosen": -1.9394071102142334,
2071
+ "logits/rejected": -1.9380232095718384,
2072
+ "logps/chosen": -29.4769344329834,
2073
+ "logps/rejected": -31.30813980102539,
2074
+ "loss": 0.159,
2075
+ "rewards/accuracies": 0.949999988079071,
2076
+ "rewards/chosen": 2.093536853790283,
2077
+ "rewards/margins": 2.3773601055145264,
2078
+ "rewards/rejected": -0.2838229537010193,
2079
+ "step": 1320
2080
+ },
2081
+ {
2082
+ "epoch": 3.45,
2083
+ "learning_rate": 2.779113783626916e-07,
2084
+ "logits/chosen": -1.844186782836914,
2085
+ "logits/rejected": -1.8454091548919678,
2086
+ "logps/chosen": -31.52813720703125,
2087
+ "logps/rejected": -33.989559173583984,
2088
+ "loss": 0.1366,
2089
+ "rewards/accuracies": 0.9624999761581421,
2090
+ "rewards/chosen": 2.276893138885498,
2091
+ "rewards/margins": 2.549605131149292,
2092
+ "rewards/rejected": -0.27271172404289246,
2093
+ "step": 1330
2094
+ },
2095
+ {
2096
+ "epoch": 3.48,
2097
+ "learning_rate": 2.5251829568697204e-07,
2098
+ "logits/chosen": -1.9051072597503662,
2099
+ "logits/rejected": -1.9038896560668945,
2100
+ "logps/chosen": -28.674198150634766,
2101
+ "logps/rejected": -32.49821090698242,
2102
+ "loss": 0.1373,
2103
+ "rewards/accuracies": 0.9750000238418579,
2104
+ "rewards/chosen": 2.01957631111145,
2105
+ "rewards/margins": 2.424564838409424,
2106
+ "rewards/rejected": -0.4049888551235199,
2107
+ "step": 1340
2108
+ },
2109
+ {
2110
+ "epoch": 3.51,
2111
+ "learning_rate": 2.2827986432927774e-07,
2112
+ "logits/chosen": -1.9206342697143555,
2113
+ "logits/rejected": -1.9062645435333252,
2114
+ "logps/chosen": -31.68826675415039,
2115
+ "logps/rejected": -36.69408416748047,
2116
+ "loss": 0.1396,
2117
+ "rewards/accuracies": 0.9624999761581421,
2118
+ "rewards/chosen": 2.102721691131592,
2119
+ "rewards/margins": 2.4464478492736816,
2120
+ "rewards/rejected": -0.34372615814208984,
2121
+ "step": 1350
2122
+ },
2123
+ {
2124
+ "epoch": 3.53,
2125
+ "learning_rate": 2.0520853686560177e-07,
2126
+ "logits/chosen": -1.921002984046936,
2127
+ "logits/rejected": -1.9334627389907837,
2128
+ "logps/chosen": -29.323871612548828,
2129
+ "logps/rejected": -32.485260009765625,
2130
+ "loss": 0.1368,
2131
+ "rewards/accuracies": 0.925000011920929,
2132
+ "rewards/chosen": 2.195680856704712,
2133
+ "rewards/margins": 2.3920950889587402,
2134
+ "rewards/rejected": -0.1964142769575119,
2135
+ "step": 1360
2136
+ },
2137
+ {
2138
+ "epoch": 3.56,
2139
+ "learning_rate": 1.833161662683672e-07,
2140
+ "logits/chosen": -2.0142505168914795,
2141
+ "logits/rejected": -2.0135538578033447,
2142
+ "logps/chosen": -29.253097534179688,
2143
+ "logps/rejected": -36.559532165527344,
2144
+ "loss": 0.1185,
2145
+ "rewards/accuracies": 0.987500011920929,
2146
+ "rewards/chosen": 2.2218658924102783,
2147
+ "rewards/margins": 2.8047871589660645,
2148
+ "rewards/rejected": -0.5829211473464966,
2149
+ "step": 1370
2150
+ },
2151
+ {
2152
+ "epoch": 3.58,
2153
+ "learning_rate": 1.626139998169246e-07,
2154
+ "logits/chosen": -1.8797496557235718,
2155
+ "logits/rejected": -1.8874984979629517,
2156
+ "logps/chosen": -31.32064437866211,
2157
+ "logps/rejected": -37.89208221435547,
2158
+ "loss": 0.144,
2159
+ "rewards/accuracies": 0.925000011920929,
2160
+ "rewards/chosen": 2.287868022918701,
2161
+ "rewards/margins": 2.641819477081299,
2162
+ "rewards/rejected": -0.3539513051509857,
2163
+ "step": 1380
2164
+ },
2165
+ {
2166
+ "epoch": 3.61,
2167
+ "learning_rate": 1.4311267331922535e-07,
2168
+ "logits/chosen": -1.8338594436645508,
2169
+ "logits/rejected": -1.8298187255859375,
2170
+ "logps/chosen": -31.842514038085938,
2171
+ "logps/rejected": -31.77047348022461,
2172
+ "loss": 0.1419,
2173
+ "rewards/accuracies": 0.925000011920929,
2174
+ "rewards/chosen": 2.3397603034973145,
2175
+ "rewards/margins": 2.4782257080078125,
2176
+ "rewards/rejected": -0.13846543431282043,
2177
+ "step": 1390
2178
+ },
2179
+ {
2180
+ "epoch": 3.64,
2181
+ "learning_rate": 1.2482220564763669e-07,
2182
+ "logits/chosen": -1.9958465099334717,
2183
+ "logits/rejected": -1.9933058023452759,
2184
+ "logps/chosen": -28.800048828125,
2185
+ "logps/rejected": -32.510841369628906,
2186
+ "loss": 0.1553,
2187
+ "rewards/accuracies": 0.949999988079071,
2188
+ "rewards/chosen": 1.9972022771835327,
2189
+ "rewards/margins": 2.2903003692626953,
2190
+ "rewards/rejected": -0.29309794306755066,
2191
+ "step": 1400
2192
+ },
2193
+ {
2194
+ "epoch": 3.64,
2195
+ "eval_logits/chosen": -2.1668412685394287,
2196
+ "eval_logits/rejected": -2.162137985229492,
2197
+ "eval_logps/chosen": -33.57229232788086,
2198
+ "eval_logps/rejected": -37.258155822753906,
2199
+ "eval_loss": 0.46543803811073303,
2200
+ "eval_rewards/accuracies": 0.6034052968025208,
2201
+ "eval_rewards/chosen": 0.36980658769607544,
2202
+ "eval_rewards/margins": 0.16303294897079468,
2203
+ "eval_rewards/rejected": 0.20677369832992554,
2204
+ "eval_runtime": 145.9296,
2205
+ "eval_samples_per_second": 2.35,
2206
+ "eval_steps_per_second": 0.295,
2207
+ "step": 1400
2208
+ },
2209
+ {
2210
+ "epoch": 3.66,
2211
+ "learning_rate": 1.0775199359171346e-07,
2212
+ "logits/chosen": -1.9372856616973877,
2213
+ "logits/rejected": -1.9331865310668945,
2214
+ "logps/chosen": -30.8883113861084,
2215
+ "logps/rejected": -29.84084129333496,
2216
+ "loss": 0.1363,
2217
+ "rewards/accuracies": 0.9750000238418579,
2218
+ "rewards/chosen": 2.2033774852752686,
2219
+ "rewards/margins": 2.4324822425842285,
2220
+ "rewards/rejected": -0.22910502552986145,
2221
+ "step": 1410
2222
+ },
2223
+ {
2224
+ "epoch": 3.69,
2225
+ "learning_rate": 9.191080703056604e-08,
2226
+ "logits/chosen": -1.8948686122894287,
2227
+ "logits/rejected": -1.8960151672363281,
2228
+ "logps/chosen": -30.679889678955078,
2229
+ "logps/rejected": -35.14348602294922,
2230
+ "loss": 0.152,
2231
+ "rewards/accuracies": 0.9375,
2232
+ "rewards/chosen": 2.1479902267456055,
2233
+ "rewards/margins": 2.323575735092163,
2234
+ "rewards/rejected": -0.17558541893959045,
2235
+ "step": 1420
2236
+ },
2237
+ {
2238
+ "epoch": 3.71,
2239
+ "learning_rate": 7.730678442730539e-08,
2240
+ "logits/chosen": -1.8465667963027954,
2241
+ "logits/rejected": -1.8398901224136353,
2242
+ "logps/chosen": -31.448232650756836,
2243
+ "logps/rejected": -37.158573150634766,
2244
+ "loss": 0.1367,
2245
+ "rewards/accuracies": 0.9375,
2246
+ "rewards/chosen": 2.1548283100128174,
2247
+ "rewards/margins": 2.530158519744873,
2248
+ "rewards/rejected": -0.3753301799297333,
2249
+ "step": 1430
2250
+ },
2251
+ {
2252
+ "epoch": 3.74,
2253
+ "learning_rate": 6.394742864787806e-08,
2254
+ "logits/chosen": -1.859572172164917,
2255
+ "logits/rejected": -1.8541902303695679,
2256
+ "logps/chosen": -26.713485717773438,
2257
+ "logps/rejected": -32.027252197265625,
2258
+ "loss": 0.1488,
2259
+ "rewards/accuracies": 0.9125000238418579,
2260
+ "rewards/chosen": 2.0798051357269287,
2261
+ "rewards/margins": 2.317080020904541,
2262
+ "rewards/rejected": -0.2372746467590332,
2263
+ "step": 1440
2264
+ },
2265
+ {
2266
+ "epoch": 3.77,
2267
+ "learning_rate": 5.183960310644748e-08,
2268
+ "logits/chosen": -1.8826467990875244,
2269
+ "logits/rejected": -1.8725192546844482,
2270
+ "logps/chosen": -30.08587074279785,
2271
+ "logps/rejected": -36.018455505371094,
2272
+ "loss": 0.1431,
2273
+ "rewards/accuracies": 0.9750000238418579,
2274
+ "rewards/chosen": 1.9165277481079102,
2275
+ "rewards/margins": 2.503002405166626,
2276
+ "rewards/rejected": -0.5864745378494263,
2277
+ "step": 1450
2278
+ },
2279
+ {
2280
+ "epoch": 3.79,
2281
+ "learning_rate": 4.098952823928693e-08,
2282
+ "logits/chosen": -1.8568382263183594,
2283
+ "logits/rejected": -1.8540499210357666,
2284
+ "logps/chosen": -30.78998374938965,
2285
+ "logps/rejected": -31.02651596069336,
2286
+ "loss": 0.1648,
2287
+ "rewards/accuracies": 0.9375,
2288
+ "rewards/chosen": 2.0158348083496094,
2289
+ "rewards/margins": 2.124389886856079,
2290
+ "rewards/rejected": -0.10855510085821152,
2291
+ "step": 1460
2292
+ },
2293
+ {
2294
+ "epoch": 3.82,
2295
+ "learning_rate": 3.1402778309014284e-08,
2296
+ "logits/chosen": -1.9178192615509033,
2297
+ "logits/rejected": -1.924325704574585,
2298
+ "logps/chosen": -29.135822296142578,
2299
+ "logps/rejected": -33.322288513183594,
2300
+ "loss": 0.1271,
2301
+ "rewards/accuracies": 0.9624999761581421,
2302
+ "rewards/chosen": 2.3466856479644775,
2303
+ "rewards/margins": 2.634577989578247,
2304
+ "rewards/rejected": -0.28789228200912476,
2305
+ "step": 1470
2306
+ },
2307
+ {
2308
+ "epoch": 3.84,
2309
+ "learning_rate": 2.3084278540791427e-08,
2310
+ "logits/chosen": -1.9184119701385498,
2311
+ "logits/rejected": -1.9287493228912354,
2312
+ "logps/chosen": -29.0635986328125,
2313
+ "logps/rejected": -29.924457550048828,
2314
+ "loss": 0.1386,
2315
+ "rewards/accuracies": 0.9750000238418579,
2316
+ "rewards/chosen": 2.1053242683410645,
2317
+ "rewards/margins": 2.342036008834839,
2318
+ "rewards/rejected": -0.2367115318775177,
2319
+ "step": 1480
2320
+ },
2321
+ {
2322
+ "epoch": 3.87,
2323
+ "learning_rate": 1.6038302591975807e-08,
2324
+ "logits/chosen": -1.8501653671264648,
2325
+ "logits/rejected": -1.8432512283325195,
2326
+ "logps/chosen": -31.104766845703125,
2327
+ "logps/rejected": -32.60163879394531,
2328
+ "loss": 0.144,
2329
+ "rewards/accuracies": 0.9624999761581421,
2330
+ "rewards/chosen": 2.0413756370544434,
2331
+ "rewards/margins": 2.3543033599853516,
2332
+ "rewards/rejected": -0.31292763352394104,
2333
+ "step": 1490
2334
+ },
2335
+ {
2336
+ "epoch": 3.9,
2337
+ "learning_rate": 1.0268470356514237e-08,
2338
+ "logits/chosen": -1.9025371074676514,
2339
+ "logits/rejected": -1.8992702960968018,
2340
+ "logps/chosen": -31.016992568969727,
2341
+ "logps/rejected": -33.745033264160156,
2342
+ "loss": 0.1447,
2343
+ "rewards/accuracies": 0.9750000238418579,
2344
+ "rewards/chosen": 2.086864709854126,
2345
+ "rewards/margins": 2.474714517593384,
2346
+ "rewards/rejected": -0.3878494203090668,
2347
+ "step": 1500
2348
+ },
2349
+ {
2350
+ "epoch": 3.9,
2351
+ "eval_logits/chosen": -2.1669082641601562,
2352
+ "eval_logits/rejected": -2.1621992588043213,
2353
+ "eval_logps/chosen": -33.575931549072266,
2354
+ "eval_logps/rejected": -37.24650192260742,
2355
+ "eval_loss": 0.46842658519744873,
2356
+ "eval_rewards/accuracies": 0.574335515499115,
2357
+ "eval_rewards/chosen": 0.3668961524963379,
2358
+ "eval_rewards/margins": 0.15079911053180695,
2359
+ "eval_rewards/rejected": 0.21609705686569214,
2360
+ "eval_runtime": 145.9178,
2361
+ "eval_samples_per_second": 2.351,
2362
+ "eval_steps_per_second": 0.295,
2363
+ "step": 1500
2364
+ },
2365
+ {
2366
+ "epoch": 3.92,
2367
+ "learning_rate": 5.777746105209147e-09,
2368
+ "logits/chosen": -1.9817174673080444,
2369
+ "logits/rejected": -1.9823877811431885,
2370
+ "logps/chosen": -27.261791229248047,
2371
+ "logps/rejected": -33.12643814086914,
2372
+ "loss": 0.1654,
2373
+ "rewards/accuracies": 0.875,
2374
+ "rewards/chosen": 2.0435855388641357,
2375
+ "rewards/margins": 2.35237717628479,
2376
+ "rewards/rejected": -0.30879148840904236,
2377
+ "step": 1510
2378
+ },
2379
+ {
2380
+ "epoch": 3.95,
2381
+ "learning_rate": 2.5684369628148352e-09,
2382
+ "logits/chosen": -1.834775686264038,
2383
+ "logits/rejected": -1.8347667455673218,
2384
+ "logps/chosen": -30.5068416595459,
2385
+ "logps/rejected": -34.2733154296875,
2386
+ "loss": 0.1565,
2387
+ "rewards/accuracies": 0.9375,
2388
+ "rewards/chosen": 2.0280237197875977,
2389
+ "rewards/margins": 2.347689151763916,
2390
+ "rewards/rejected": -0.31966525316238403,
2391
+ "step": 1520
2392
+ },
2393
+ {
2394
+ "epoch": 3.97,
2395
+ "learning_rate": 6.421917227455999e-10,
2396
+ "logits/chosen": -1.986581563949585,
2397
+ "logits/rejected": -1.9839556217193604,
2398
+ "logps/chosen": -29.06634521484375,
2399
+ "logps/rejected": -31.9202938079834,
2400
+ "loss": 0.1594,
2401
+ "rewards/accuracies": 0.9750000238418579,
2402
+ "rewards/chosen": 1.926171898841858,
2403
+ "rewards/margins": 2.2754859924316406,
2404
+ "rewards/rejected": -0.34931421279907227,
2405
+ "step": 1530
2406
+ },
2407
+ {
2408
+ "epoch": 4.0,
2409
+ "learning_rate": 0.0,
2410
+ "logits/chosen": -1.9706960916519165,
2411
+ "logits/rejected": -1.9718106985092163,
2412
+ "logps/chosen": -28.249774932861328,
2413
+ "logps/rejected": -30.143047332763672,
2414
+ "loss": 0.1594,
2415
+ "rewards/accuracies": 0.9583333730697632,
2416
+ "rewards/chosen": 1.8961349725723267,
2417
+ "rewards/margins": 2.178429126739502,
2418
+ "rewards/rejected": -0.2822941541671753,
2419
+ "step": 1540
2420
+ },
2421
+ {
2422
+ "epoch": 4.0,
2423
+ "step": 1540,
2424
  "total_flos": 0.0,
2425
+ "train_loss": 0.16076272354497537,
2426
+ "train_runtime": 10802.3033,
2427
+ "train_samples_per_second": 1.14,
2428
+ "train_steps_per_second": 0.143
2429
  }
2430
  ],
2431
  "logging_steps": 10,
2432
+ "max_steps": 1540,
2433
  "num_input_tokens_seen": 0,
2434
+ "num_train_epochs": 4,
2435
  "save_steps": 100,
2436
  "total_flos": 0.0,
2437
  "train_batch_size": 4,