hugodk-sch committed on
Commit
28b0977
1 Parent(s): 317b0fc

Model save

Browse files
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
  library_name: peft
3
  tags:
4
- - alignment-handbook
5
  - trl
6
  - dpo
 
7
  - generated_from_trainer
8
  base_model: NbAiLab/nb-gpt-j-6B-v2
9
- datasets:
10
- - hugodk-sch/aftonposten_title_prefs
11
  model-index:
12
  - name: aftonposten-6b-align-scan
13
  results: []
@@ -18,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # aftonposten-6b-align-scan
20
 
21
- This model is a fine-tuned version of [data/ap-gpt-j-6b-sft-qlora-04-08](https://huggingface.co/data/ap-gpt-j-6b-sft-qlora-04-08) on the hugodk-sch/aftonposten_title_prefs dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9997
24
- - Rewards/chosen: -0.0006
25
- - Rewards/rejected: -0.0009
26
- - Rewards/accuracies: 0.5478
27
- - Rewards/margins: 0.0003
28
- - Logps/rejected: -37.6060
29
- - Logps/chosen: -34.0918
30
- - Logits/rejected: -2.2164
31
- - Logits/chosen: -2.2213
32
 
33
  ## Model description
34
 
@@ -57,15 +55,27 @@ The following hyperparameters were used during training:
57
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
  - lr_scheduler_type: cosine
59
  - lr_scheduler_warmup_ratio: 0.1
60
- - num_epochs: 1
61
 
62
  ### Training results
63
 
64
- | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
- |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
- | 0.9987 | 0.26 | 100 | 0.9998 | -0.0002 | -0.0004 | 0.5336 | 0.0002 | -37.5528 | -34.0499 | -2.2264 | -2.2313 |
67
- | 0.9965 | 0.52 | 200 | 0.9996 | -0.0003 | -0.0006 | 0.5071 | 0.0004 | -37.5790 | -34.0618 | -2.2204 | -2.2252 |
68
- | 0.9925 | 0.78 | 300 | 0.9996 | -0.0005 | -0.0009 | 0.5594 | 0.0004 | -37.6063 | -34.0836 | -2.2166 | -2.2214 |
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  ### Framework versions
 
1
  ---
2
  library_name: peft
3
  tags:
 
4
  - trl
5
  - dpo
6
+ - alignment-handbook
7
  - generated_from_trainer
8
  base_model: NbAiLab/nb-gpt-j-6B-v2
 
 
9
  model-index:
10
  - name: aftonposten-6b-align-scan
11
  results: []
 
16
 
17
  # aftonposten-6b-align-scan
18
 
19
+ This model is a fine-tuned version of [NbAiLab/nb-gpt-j-6B-v2](https://huggingface.co/NbAiLab/nb-gpt-j-6B-v2) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.9913
22
+ - Rewards/chosen: -0.0567
23
+ - Rewards/rejected: -0.0655
24
+ - Rewards/accuracies: 0.5424
25
+ - Rewards/margins: 0.0088
26
+ - Logps/rejected: -44.0648
27
+ - Logps/chosen: -39.7075
28
+ - Logits/rejected: -1.5832
29
+ - Logits/chosen: -1.5871
30
 
31
  ## Model description
32
 
 
55
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
  - lr_scheduler_type: cosine
57
  - lr_scheduler_warmup_ratio: 0.1
58
+ - num_epochs: 4
59
 
60
  ### Training results
61
 
62
+ | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
63
+ |:-------------:|:-----:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
64
+ | 0.9987 | 0.26 | 100 | -2.2313 | -2.2264 | -34.0499 | -37.5528 | 0.9998 | 0.5336 | -0.0002 | 0.0002 | -0.0004 |
65
+ | 0.9965 | 0.52 | 200 | -2.2252 | -2.2204 | -34.0618 | -37.5790 | 0.9996 | 0.5071 | -0.0003 | 0.0004 | -0.0006 |
66
+ | 0.9925 | 0.78 | 300 | -2.2214 | -2.2166 | -34.0836 | -37.6063 | 0.9996 | 0.5594 | -0.0005 | 0.0004 | -0.0009 |
67
+ | 0.986 | 1.04 | 400 | -2.1926 | -2.1877 | -34.1717 | -37.7105 | 0.9994 | 0.5212 | -0.0014 | 0.0006 | -0.0019 |
68
+ | 0.9781 | 1.3 | 500 | -2.1185 | -2.1137 | -34.3418 | -37.9551 | 0.9987 | 0.5855 | -0.0031 | 0.0013 | -0.0044 |
69
+ | 0.9774 | 1.56 | 600 | -2.0208 | -2.0162 | -34.7671 | -38.5228 | 0.9973 | 0.5743 | -0.0073 | 0.0027 | -0.0101 |
70
+ | 0.9688 | 1.82 | 700 | -1.9280 | -1.9235 | -35.4681 | -39.2598 | 0.9969 | 0.5482 | -0.0143 | 0.0031 | -0.0174 |
71
+ | 0.957 | 2.08 | 800 | -1.8407 | -1.8363 | -36.1733 | -40.1194 | 0.9954 | 0.5540 | -0.0214 | 0.0046 | -0.0260 |
72
+ | 0.9358 | 2.34 | 900 | -1.7029 | -1.6988 | -37.6532 | -41.7483 | 0.9939 | 0.5365 | -0.0362 | 0.0061 | -0.0423 |
73
+ | 0.9535 | 2.6 | 1000 | -1.6183 | -1.6143 | -39.1479 | -43.4237 | 0.9921 | 0.5453 | -0.0511 | 0.0079 | -0.0591 |
74
+ | 0.9616 | 2.86 | 1100 | -1.5920 | -1.5880 | -39.6505 | -43.9754 | 0.9916 | 0.5453 | -0.0562 | 0.0084 | -0.0646 |
75
+ | 0.9167 | 3.12 | 1200 | -1.5890 | -1.5851 | -39.6666 | -44.0289 | 0.9912 | 0.5482 | -0.0563 | 0.0088 | -0.0651 |
76
+ | 0.9033 | 3.38 | 1300 | -1.5856 | -1.5817 | -39.7316 | -44.0868 | 0.9913 | 0.5453 | -0.0570 | 0.0087 | -0.0657 |
77
+ | 0.9285 | 3.64 | 1400 | -1.5864 | -1.5825 | -39.7216 | -44.0852 | 0.9912 | 0.5395 | -0.0569 | 0.0088 | -0.0657 |
78
+ | 0.9196 | 3.9 | 1500 | -1.5871 | -1.5832 | -39.7075 | -44.0648 | 0.9913 | 0.5424 | -0.0567 | 0.0088 | -0.0655 |
79
 
80
 
81
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81e9822be818f0506cf3e58408b20367a0f45ac8175d9ac2a0de81f78618ad01
3
  size 176183216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9968a4bb556b78754d04833393473ea61e6cc9ca517cac1ac531d1dbcbe29e5
3
  size 176183216
all_results.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 1.0,
3
  "eval_logits/chosen": -2.221278667449951,
4
  "eval_logits/rejected": -2.2164483070373535,
5
  "eval_logps/chosen": -34.091766357421875,
@@ -13,9 +13,9 @@
13
  "eval_samples": 343,
14
  "eval_samples_per_second": 2.356,
15
  "eval_steps_per_second": 0.295,
16
- "train_loss": 0.9973225085766284,
17
- "train_runtime": 3253.1307,
18
  "train_samples": 3079,
19
- "train_samples_per_second": 0.946,
20
- "train_steps_per_second": 0.118
21
  }
 
1
  {
2
+ "epoch": 4.0,
3
  "eval_logits/chosen": -2.221278667449951,
4
  "eval_logits/rejected": -2.2164483070373535,
5
  "eval_logps/chosen": -34.091766357421875,
 
13
  "eval_samples": 343,
14
  "eval_samples_per_second": 2.356,
15
  "eval_steps_per_second": 0.295,
16
+ "train_loss": 0.7654441864459546,
17
+ "train_runtime": 10797.8934,
18
  "train_samples": 3079,
19
+ "train_samples_per_second": 1.141,
20
+ "train_steps_per_second": 0.143
21
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 0.9973225085766284,
4
- "train_runtime": 3253.1307,
5
  "train_samples": 3079,
6
- "train_samples_per_second": 0.946,
7
- "train_steps_per_second": 0.118
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "train_loss": 0.7654441864459546,
4
+ "train_runtime": 10797.8934,
5
  "train_samples": 3079,
6
+ "train_samples_per_second": 1.141,
7
+ "train_steps_per_second": 0.143
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 385,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -492,130 +492,1946 @@
492
  },
493
  {
494
  "epoch": 0.81,
495
- "learning_rate": 5.576113578589035e-07,
496
- "logits/chosen": -1.9060642719268799,
497
- "logits/rejected": -1.9028133153915405,
498
- "logps/chosen": -31.319162368774414,
499
- "logps/rejected": -33.85043716430664,
500
- "loss": 0.9961,
501
- "rewards/accuracies": 0.737500011920929,
502
- "rewards/chosen": 0.002841859357431531,
503
- "rewards/margins": 0.003924719989299774,
504
- "rewards/rejected": -0.0010828599333763123,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
- "learning_rate": 4.229036944380913e-07,
510
- "logits/chosen": -1.9553836584091187,
511
- "logits/rejected": -1.9432109594345093,
512
- "logps/chosen": -34.27588653564453,
513
- "logps/rejected": -33.672359466552734,
514
- "loss": 0.9955,
515
- "rewards/accuracies": 0.699999988079071,
516
- "rewards/chosen": 0.0032019000500440598,
517
- "rewards/margins": 0.004537059459835291,
518
- "rewards/rejected": -0.0013351596426218748,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
- "learning_rate": 3.053082288996112e-07,
524
- "logits/chosen": -1.9905990362167358,
525
- "logits/rejected": -1.9891618490219116,
526
- "logps/chosen": -33.116233825683594,
527
- "logps/rejected": -32.55724334716797,
528
- "loss": 0.9955,
529
  "rewards/accuracies": 0.737500011920929,
530
- "rewards/chosen": 0.0036955769173800945,
531
- "rewards/margins": 0.004472161643207073,
532
- "rewards/rejected": -0.0007765850750729442,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
- "learning_rate": 2.0579377374915805e-07,
538
- "logits/chosen": -2.0769362449645996,
539
- "logits/rejected": -2.0613036155700684,
540
- "logps/chosen": -33.791297912597656,
541
- "logps/rejected": -33.12422180175781,
542
- "loss": 0.9961,
543
- "rewards/accuracies": 0.699999988079071,
544
- "rewards/chosen": 0.003880967851728201,
545
- "rewards/margins": 0.00394281093031168,
546
- "rewards/rejected": -6.184288213262334e-05,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
- "learning_rate": 1.2518018074041684e-07,
552
- "logits/chosen": -1.950060248374939,
553
- "logits/rejected": -1.9492241144180298,
554
- "logps/chosen": -32.82404327392578,
555
- "logps/rejected": -32.50709915161133,
556
- "loss": 0.995,
557
- "rewards/accuracies": 0.7124999761581421,
558
- "rewards/chosen": 0.004580510314553976,
559
- "rewards/margins": 0.005000022705644369,
560
- "rewards/rejected": -0.000419511750806123,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
- "learning_rate": 6.41315865106129e-08,
566
- "logits/chosen": -1.9050449132919312,
567
- "logits/rejected": -1.915305733680725,
568
- "logps/chosen": -31.87860679626465,
569
- "logps/rejected": -35.34981155395508,
570
- "loss": 0.9961,
571
- "rewards/accuracies": 0.6875,
572
- "rewards/chosen": 0.0032608420588076115,
573
- "rewards/margins": 0.0038713677786290646,
574
- "rewards/rejected": -0.0006105261854827404,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
- "learning_rate": 2.3150941078050325e-08,
580
- "logits/chosen": -2.04546856880188,
581
- "logits/rejected": -2.039043426513672,
582
- "logps/chosen": -33.336219787597656,
583
- "logps/rejected": -29.269311904907227,
584
- "loss": 0.9964,
585
- "rewards/accuracies": 0.6625000238418579,
586
- "rewards/chosen": 0.0031574335880577564,
587
- "rewards/margins": 0.003586276201531291,
588
- "rewards/rejected": -0.0004288425261620432,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
- "learning_rate": 2.575864278703266e-09,
594
- "logits/chosen": -1.905160665512085,
595
- "logits/rejected": -1.907360315322876,
596
- "logps/chosen": -33.86741256713867,
597
- "logps/rejected": -30.982807159423828,
598
- "loss": 0.9952,
599
- "rewards/accuracies": 0.7250000238418579,
600
- "rewards/chosen": 0.0037163912784308195,
601
- "rewards/margins": 0.004818186163902283,
602
- "rewards/rejected": -0.0011017953511327505,
603
  "step": 380
604
  },
605
  {
606
- "epoch": 1.0,
607
- "step": 385,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
  "total_flos": 0.0,
609
- "train_loss": 0.9973225085766284,
610
- "train_runtime": 3253.1307,
611
- "train_samples_per_second": 0.946,
612
- "train_steps_per_second": 0.118
613
  }
614
  ],
615
  "logging_steps": 10,
616
- "max_steps": 385,
617
  "num_input_tokens_seen": 0,
618
- "num_train_epochs": 1,
619
  "save_steps": 100,
620
  "total_flos": 0.0,
621
  "train_batch_size": 4,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 1540,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
492
  },
493
  {
494
  "epoch": 0.81,
495
+ "learning_rate": 4.84533120650964e-06,
496
+ "logits/chosen": -1.9058300256729126,
497
+ "logits/rejected": -1.902580976486206,
498
+ "logps/chosen": -31.3160400390625,
499
+ "logps/rejected": -33.86355972290039,
500
+ "loss": 0.9959,
501
+ "rewards/accuracies": 0.7250000238418579,
502
+ "rewards/chosen": 0.0028731029015034437,
503
+ "rewards/margins": 0.004087135195732117,
504
+ "rewards/rejected": -0.0012140319449827075,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
+ "learning_rate": 4.825108134172131e-06,
510
+ "logits/chosen": -1.9523051977157593,
511
+ "logits/rejected": -1.9401578903198242,
512
+ "logps/chosen": -34.279632568359375,
513
+ "logps/rejected": -33.72241973876953,
514
+ "loss": 0.995,
515
+ "rewards/accuracies": 0.675000011920929,
516
+ "rewards/chosen": 0.003164437832310796,
517
+ "rewards/margins": 0.005000208970159292,
518
+ "rewards/rejected": -0.0018357712542638183,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
+ "learning_rate": 4.80369052967602e-06,
524
+ "logits/chosen": -1.985467553138733,
525
+ "logits/rejected": -1.9840377569198608,
526
+ "logps/chosen": -33.06474685668945,
527
+ "logps/rejected": -32.610504150390625,
528
+ "loss": 0.9945,
529
  "rewards/accuracies": 0.737500011920929,
530
+ "rewards/chosen": 0.004210514482110739,
531
+ "rewards/margins": 0.005519717000424862,
532
+ "rewards/rejected": -0.0013092018198221922,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
+ "learning_rate": 4.781089396387968e-06,
538
+ "logits/chosen": -2.0711922645568848,
539
+ "logits/rejected": -2.0555691719055176,
540
+ "logps/chosen": -33.74380874633789,
541
+ "logps/rejected": -33.10470199584961,
542
+ "loss": 0.9958,
543
+ "rewards/accuracies": 0.637499988079071,
544
+ "rewards/chosen": 0.0043558296747505665,
545
+ "rewards/margins": 0.004222516901791096,
546
+ "rewards/rejected": 0.00013331293303053826,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
+ "learning_rate": 4.757316345716554e-06,
552
+ "logits/chosen": -1.94492506980896,
553
+ "logits/rejected": -1.9441289901733398,
554
+ "logps/chosen": -32.790000915527344,
555
+ "logps/rejected": -32.51346969604492,
556
+ "loss": 0.9946,
557
+ "rewards/accuracies": 0.675000011920929,
558
+ "rewards/chosen": 0.004920940846204758,
559
+ "rewards/margins": 0.005404182709753513,
560
+ "rewards/rejected": -0.0004832421545870602,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
+ "learning_rate": 4.73238359114687e-06,
566
+ "logits/chosen": -1.8960033655166626,
567
+ "logits/rejected": -1.9061963558197021,
568
+ "logps/chosen": -31.744653701782227,
569
+ "logps/rejected": -35.42363739013672,
570
+ "loss": 0.9941,
571
+ "rewards/accuracies": 0.699999988079071,
572
+ "rewards/chosen": 0.004600368440151215,
573
+ "rewards/margins": 0.0059491791762411594,
574
+ "rewards/rejected": -0.0013488102704286575,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
+ "learning_rate": 4.706303941965804e-06,
580
+ "logits/chosen": -2.029266834259033,
581
+ "logits/rejected": -2.0229573249816895,
582
+ "logps/chosen": -33.29435729980469,
583
+ "logps/rejected": -29.330352783203125,
584
+ "loss": 0.9954,
585
+ "rewards/accuracies": 0.675000011920929,
586
+ "rewards/chosen": 0.003576122224330902,
587
+ "rewards/margins": 0.004615402314811945,
588
+ "rewards/rejected": -0.0010392797412350774,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
+ "learning_rate": 4.679090796681225e-06,
594
+ "logits/chosen": -1.8853864669799805,
595
+ "logits/rejected": -1.8876174688339233,
596
+ "logps/chosen": -33.62360382080078,
597
+ "logps/rejected": -31.0107479095459,
598
+ "loss": 0.9925,
599
+ "rewards/accuracies": 0.7875000238418579,
600
+ "rewards/chosen": 0.00615445151925087,
601
+ "rewards/margins": 0.007535641081631184,
602
+ "rewards/rejected": -0.0013811895623803139,
603
  "step": 380
604
  },
605
  {
606
+ "epoch": 1.01,
607
+ "learning_rate": 4.650758136138454e-06,
608
+ "logits/chosen": -1.9119482040405273,
609
+ "logits/rejected": -1.910658597946167,
610
+ "logps/chosen": -33.74114227294922,
611
+ "logps/rejected": -36.10308074951172,
612
+ "loss": 0.9891,
613
+ "rewards/accuracies": 0.7333333492279053,
614
+ "rewards/chosen": 0.006436466239392757,
615
+ "rewards/margins": 0.010907178744673729,
616
+ "rewards/rejected": -0.004470714367926121,
617
+ "step": 390
618
+ },
619
+ {
620
+ "epoch": 1.04,
621
+ "learning_rate": 4.621320516337559e-06,
622
+ "logits/chosen": -1.845229148864746,
623
+ "logits/rejected": -1.8368085622787476,
624
+ "logps/chosen": -30.943897247314453,
625
+ "logps/rejected": -36.51512908935547,
626
+ "loss": 0.986,
627
+ "rewards/accuracies": 0.8500000238418579,
628
+ "rewards/chosen": 0.00830076914280653,
629
+ "rewards/margins": 0.014046055264770985,
630
+ "rewards/rejected": -0.005745284259319305,
631
+ "step": 400
632
+ },
633
+ {
634
+ "epoch": 1.04,
635
+ "eval_logits/chosen": -2.1925649642944336,
636
+ "eval_logits/rejected": -2.1877267360687256,
637
+ "eval_logps/chosen": -34.17170333862305,
638
+ "eval_logps/rejected": -37.71051788330078,
639
+ "eval_loss": 0.9994248747825623,
640
+ "eval_rewards/accuracies": 0.5211793780326843,
641
+ "eval_rewards/chosen": -0.0013715263921767473,
642
+ "eval_rewards/margins": 0.0005674380226992071,
643
+ "eval_rewards/rejected": -0.0019389643566682935,
644
+ "eval_runtime": 146.3059,
645
+ "eval_samples_per_second": 2.344,
646
+ "eval_steps_per_second": 0.294,
647
+ "step": 400
648
+ },
649
+ {
650
+ "epoch": 1.06,
651
+ "learning_rate": 4.590793060955158e-06,
652
+ "logits/chosen": -2.012714385986328,
653
+ "logits/rejected": -2.015584945678711,
654
+ "logps/chosen": -32.184898376464844,
655
+ "logps/rejected": -35.35422897338867,
656
+ "loss": 0.9857,
657
+ "rewards/accuracies": 0.8374999761581421,
658
+ "rewards/chosen": 0.00869043916463852,
659
+ "rewards/margins": 0.014304302632808685,
660
+ "rewards/rejected": -0.005613864399492741,
661
+ "step": 410
662
+ },
663
+ {
664
+ "epoch": 1.09,
665
+ "learning_rate": 4.559191453574582e-06,
666
+ "logits/chosen": -1.8472423553466797,
667
+ "logits/rejected": -1.8458467721939087,
668
+ "logps/chosen": -28.332477569580078,
669
+ "logps/rejected": -32.81929397583008,
670
+ "loss": 0.9876,
671
+ "rewards/accuracies": 0.9125000238418579,
672
+ "rewards/chosen": 0.0073507861234247684,
673
+ "rewards/margins": 0.012367108836770058,
674
+ "rewards/rejected": -0.005016324110329151,
675
+ "step": 420
676
+ },
677
+ {
678
+ "epoch": 1.12,
679
+ "learning_rate": 4.52653192962838e-06,
680
+ "logits/chosen": -1.8031494617462158,
681
+ "logits/rejected": -1.796241044998169,
682
+ "logps/chosen": -33.12567901611328,
683
+ "logps/rejected": -34.58681106567383,
684
+ "loss": 0.9871,
685
+ "rewards/accuracies": 0.9125000238418579,
686
+ "rewards/chosen": 0.0094840619713068,
687
+ "rewards/margins": 0.012899428606033325,
688
+ "rewards/rejected": -0.003415366169065237,
689
+ "step": 430
690
+ },
691
+ {
692
+ "epoch": 1.14,
693
+ "learning_rate": 4.492831268057307e-06,
694
+ "logits/chosen": -1.9694416522979736,
695
+ "logits/rejected": -1.9643234014511108,
696
+ "logps/chosen": -30.745025634765625,
697
+ "logps/rejected": -32.631690979003906,
698
+ "loss": 0.983,
699
+ "rewards/accuracies": 0.9125000238418579,
700
+ "rewards/chosen": 0.010382669046521187,
701
+ "rewards/margins": 0.017023462802171707,
702
+ "rewards/rejected": -0.0066407956182956696,
703
+ "step": 440
704
+ },
705
+ {
706
+ "epoch": 1.17,
707
+ "learning_rate": 4.458106782690094e-06,
708
+ "logits/chosen": -1.8479034900665283,
709
+ "logits/rejected": -1.8521947860717773,
710
+ "logps/chosen": -33.47832107543945,
711
+ "logps/rejected": -33.297340393066406,
712
+ "loss": 0.9825,
713
+ "rewards/accuracies": 0.925000011920929,
714
+ "rewards/chosen": 0.010375277139246464,
715
+ "rewards/margins": 0.017462292686104774,
716
+ "rewards/rejected": -0.007087015546858311,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 1.19,
721
+ "learning_rate": 4.422376313348405e-06,
722
+ "logits/chosen": -1.848722219467163,
723
+ "logits/rejected": -1.8431631326675415,
724
+ "logps/chosen": -34.28997039794922,
725
+ "logps/rejected": -35.908599853515625,
726
+ "loss": 0.9791,
727
+ "rewards/accuracies": 0.925000011920929,
728
+ "rewards/chosen": 0.011295564472675323,
729
+ "rewards/margins": 0.0208906102925539,
730
+ "rewards/rejected": -0.009595044888556004,
731
+ "step": 460
732
+ },
733
+ {
734
+ "epoch": 1.22,
735
+ "learning_rate": 4.3856582166815696e-06,
736
+ "logits/chosen": -1.8663924932479858,
737
+ "logits/rejected": -1.8662691116333008,
738
+ "logps/chosen": -33.07716369628906,
739
+ "logps/rejected": -34.7927131652832,
740
+ "loss": 0.983,
741
+ "rewards/accuracies": 0.875,
742
+ "rewards/chosen": 0.010911665856838226,
743
+ "rewards/margins": 0.016983961686491966,
744
+ "rewards/rejected": -0.006072297692298889,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 1.25,
749
+ "learning_rate": 4.347971356735789e-06,
750
+ "logits/chosen": -1.9087026119232178,
751
+ "logits/rejected": -1.8900855779647827,
752
+ "logps/chosen": -32.9819450378418,
753
+ "logps/rejected": -33.98347854614258,
754
+ "loss": 0.9791,
755
+ "rewards/accuracies": 0.9125000238418579,
756
+ "rewards/chosen": 0.011788198724389076,
757
+ "rewards/margins": 0.02092314139008522,
758
+ "rewards/rejected": -0.009134944528341293,
759
+ "step": 480
760
+ },
761
+ {
762
+ "epoch": 1.27,
763
+ "learning_rate": 4.309335095262675e-06,
764
+ "logits/chosen": -1.8713194131851196,
765
+ "logits/rejected": -1.870599389076233,
766
+ "logps/chosen": -30.50040054321289,
767
+ "logps/rejected": -31.814233779907227,
768
+ "loss": 0.9832,
769
+ "rewards/accuracies": 0.875,
770
+ "rewards/chosen": 0.011109036393463612,
771
+ "rewards/margins": 0.016790907829999924,
772
+ "rewards/rejected": -0.005681873299181461,
773
+ "step": 490
774
+ },
775
+ {
776
+ "epoch": 1.3,
777
+ "learning_rate": 4.269769281772082e-06,
778
+ "logits/chosen": -1.8269096612930298,
779
+ "logits/rejected": -1.820021629333496,
780
+ "logps/chosen": -31.442739486694336,
781
+ "logps/rejected": -35.589012145996094,
782
+ "loss": 0.9781,
783
+ "rewards/accuracies": 0.862500011920929,
784
+ "rewards/chosen": 0.012273869477212429,
785
+ "rewards/margins": 0.02189657650887966,
786
+ "rewards/rejected": -0.009622708894312382,
787
+ "step": 500
788
+ },
789
+ {
790
+ "epoch": 1.3,
791
+ "eval_logits/chosen": -2.118459939956665,
792
+ "eval_logits/rejected": -2.113690137863159,
793
+ "eval_logps/chosen": -34.34184646606445,
794
+ "eval_logps/rejected": -37.955074310302734,
795
+ "eval_loss": 0.9986757040023804,
796
+ "eval_rewards/accuracies": 0.5855481624603271,
797
+ "eval_rewards/chosen": -0.0030729183927178383,
798
+ "eval_rewards/margins": 0.0013115763431414962,
799
+ "eval_rewards/rejected": -0.004384494852274656,
800
+ "eval_runtime": 145.8957,
801
+ "eval_samples_per_second": 2.351,
802
+ "eval_steps_per_second": 0.295,
803
+ "step": 500
804
+ },
805
+ {
806
+ "epoch": 1.32,
807
+ "learning_rate": 4.22929424333435e-06,
808
+ "logits/chosen": -1.818110466003418,
809
+ "logits/rejected": -1.821658730506897,
810
+ "logps/chosen": -28.320688247680664,
811
+ "logps/rejected": -33.88779830932617,
812
+ "loss": 0.9812,
813
+ "rewards/accuracies": 0.8500000238418579,
814
+ "rewards/chosen": 0.008541365154087543,
815
+ "rewards/margins": 0.01882646046578884,
816
+ "rewards/rejected": -0.010285094380378723,
817
+ "step": 510
818
+ },
819
+ {
820
+ "epoch": 1.35,
821
+ "learning_rate": 4.1879307741372085e-06,
822
+ "logits/chosen": -1.8071190118789673,
823
+ "logits/rejected": -1.817929983139038,
824
+ "logps/chosen": -32.183380126953125,
825
+ "logps/rejected": -31.759241104125977,
826
+ "loss": 0.9781,
827
+ "rewards/accuracies": 0.862500011920929,
828
+ "rewards/chosen": 0.010789523832499981,
829
+ "rewards/margins": 0.021865110844373703,
830
+ "rewards/rejected": -0.011075586080551147,
831
+ "step": 520
832
+ },
833
+ {
834
+ "epoch": 1.38,
835
+ "learning_rate": 4.145700124802693e-06,
836
+ "logits/chosen": -1.7446613311767578,
837
+ "logits/rejected": -1.74239182472229,
838
+ "logps/chosen": -30.616100311279297,
839
+ "logps/rejected": -31.322246551513672,
840
+ "loss": 0.978,
841
+ "rewards/accuracies": 0.7875000238418579,
842
+ "rewards/chosen": 0.010509822517633438,
843
+ "rewards/margins": 0.02196466363966465,
844
+ "rewards/rejected": -0.011454842053353786,
845
+ "step": 530
846
+ },
847
+ {
848
+ "epoch": 1.4,
849
+ "learning_rate": 4.102623991469562e-06,
850
+ "logits/chosen": -1.8112165927886963,
851
+ "logits/rejected": -1.8044904470443726,
852
+ "logps/chosen": -33.239593505859375,
853
+ "logps/rejected": -34.19357681274414,
854
+ "loss": 0.9783,
855
+ "rewards/accuracies": 0.8500000238418579,
856
+ "rewards/chosen": 0.009932359680533409,
857
+ "rewards/margins": 0.021693259477615356,
858
+ "rewards/rejected": -0.011760897003114223,
859
+ "step": 540
860
+ },
861
+ {
862
+ "epoch": 1.43,
863
+ "learning_rate": 4.058724504646834e-06,
864
+ "logits/chosen": -1.77358078956604,
865
+ "logits/rejected": -1.7800006866455078,
866
+ "logps/chosen": -30.977636337280273,
867
+ "logps/rejected": -33.71123123168945,
868
+ "loss": 0.9827,
869
+ "rewards/accuracies": 0.762499988079071,
870
+ "rewards/chosen": 0.007755486760288477,
871
+ "rewards/margins": 0.01728045754134655,
872
+ "rewards/rejected": -0.00952497310936451,
873
+ "step": 550
874
+ },
875
+ {
876
+ "epoch": 1.45,
877
+ "learning_rate": 4.014024217844167e-06,
878
+ "logits/chosen": -1.8399150371551514,
879
+ "logits/rejected": -1.8170970678329468,
880
+ "logps/chosen": -30.506546020507812,
881
+ "logps/rejected": -33.8461799621582,
882
+ "loss": 0.9822,
883
+ "rewards/accuracies": 0.8374999761581421,
884
+ "rewards/chosen": 0.009310225024819374,
885
+ "rewards/margins": 0.01781143620610237,
886
+ "rewards/rejected": -0.008501212112605572,
887
+ "step": 560
888
+ },
889
+ {
890
+ "epoch": 1.48,
891
+ "learning_rate": 3.968546095984911e-06,
892
+ "logits/chosen": -1.768343210220337,
893
+ "logits/rejected": -1.7633851766586304,
894
+ "logps/chosen": -31.467700958251953,
895
+ "logps/rejected": -33.090362548828125,
896
+ "loss": 0.9808,
897
+ "rewards/accuracies": 0.8125,
898
+ "rewards/chosen": 0.009490304626524448,
899
+ "rewards/margins": 0.019184768199920654,
900
+ "rewards/rejected": -0.009694463573396206,
901
+ "step": 570
902
+ },
903
+ {
904
+ "epoch": 1.51,
905
+ "learning_rate": 3.922313503607806e-06,
906
+ "logits/chosen": -1.7968637943267822,
907
+ "logits/rejected": -1.798661231994629,
908
+ "logps/chosen": -33.596519470214844,
909
+ "logps/rejected": -36.36685562133789,
910
+ "loss": 0.9765,
911
+ "rewards/accuracies": 0.8374999761581421,
912
+ "rewards/chosen": 0.007792733609676361,
913
+ "rewards/margins": 0.02345210127532482,
914
+ "rewards/rejected": -0.01565936766564846,
915
+ "step": 580
916
+ },
917
+ {
918
+ "epoch": 1.53,
919
+ "learning_rate": 3.875350192863368e-06,
920
+ "logits/chosen": -1.7735790014266968,
921
+ "logits/rejected": -1.773097038269043,
922
+ "logps/chosen": -29.556873321533203,
923
+ "logps/rejected": -32.799781799316406,
924
+ "loss": 0.9763,
925
+ "rewards/accuracies": 0.862500011920929,
926
+ "rewards/chosen": 0.01035183947533369,
927
+ "rewards/margins": 0.02373330667614937,
928
+ "rewards/rejected": -0.013381466269493103,
929
+ "step": 590
930
+ },
931
+ {
932
+ "epoch": 1.56,
933
+ "learning_rate": 3.8276802913111436e-06,
934
+ "logits/chosen": -1.77323317527771,
935
+ "logits/rejected": -1.770957589149475,
936
+ "logps/chosen": -32.05171585083008,
937
+ "logps/rejected": -33.61422348022461,
938
+ "loss": 0.9774,
939
+ "rewards/accuracies": 0.862500011920929,
940
+ "rewards/chosen": 0.009465712122619152,
941
+ "rewards/margins": 0.02260836958885193,
942
+ "rewards/rejected": -0.013142657466232777,
943
+ "step": 600
944
+ },
945
+ {
946
+ "epoch": 1.56,
947
+ "eval_logits/chosen": -2.0208382606506348,
948
+ "eval_logits/rejected": -2.0161898136138916,
949
+ "eval_logps/chosen": -34.76713562011719,
950
+ "eval_logps/rejected": -38.52279281616211,
951
+ "eval_loss": 0.997253954410553,
952
+ "eval_rewards/accuracies": 0.574335515499115,
953
+ "eval_rewards/chosen": -0.007325790822505951,
954
+ "eval_rewards/margins": 0.002735937014222145,
955
+ "eval_rewards/rejected": -0.01006172876805067,
956
+ "eval_runtime": 145.8309,
957
+ "eval_samples_per_second": 2.352,
958
+ "eval_steps_per_second": 0.295,
959
+ "step": 600
960
+ },
961
+ {
962
+ "epoch": 1.58,
963
+ "learning_rate": 3.7793282895240927e-06,
964
+ "logits/chosen": -1.7979720830917358,
965
+ "logits/rejected": -1.8042287826538086,
966
+ "logps/chosen": -31.683053970336914,
967
+ "logps/rejected": -33.631996154785156,
968
+ "loss": 0.9788,
969
+ "rewards/accuracies": 0.862500011920929,
970
+ "rewards/chosen": 0.005543296225368977,
971
+ "rewards/margins": 0.02123137377202511,
972
+ "rewards/rejected": -0.015688076615333557,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 1.61,
977
+ "learning_rate": 3.730319028506478e-06,
978
+ "logits/chosen": -1.741254210472107,
979
+ "logits/rejected": -1.7390741109848022,
980
+ "logps/chosen": -33.84577178955078,
981
+ "logps/rejected": -32.38778305053711,
982
+ "loss": 0.9769,
983
+ "rewards/accuracies": 0.800000011920929,
984
+ "rewards/chosen": 0.009101121686398983,
985
+ "rewards/margins": 0.023123882710933685,
986
+ "rewards/rejected": -0.014022761955857277,
987
+ "step": 620
988
+ },
989
+ {
990
+ "epoch": 1.64,
991
+ "learning_rate": 3.6806776869317074e-06,
992
+ "logits/chosen": -1.681199312210083,
993
+ "logits/rejected": -1.6748110055923462,
994
+ "logps/chosen": -34.46784210205078,
995
+ "logps/rejected": -34.00514221191406,
996
+ "loss": 0.9736,
997
+ "rewards/accuracies": 0.800000011920929,
998
+ "rewards/chosen": 0.00996384583413601,
999
+ "rewards/margins": 0.02644318714737892,
1000
+ "rewards/rejected": -0.016479339450597763,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 1.66,
1005
+ "learning_rate": 3.6304297682067146e-06,
1006
+ "logits/chosen": -1.6963602304458618,
1007
+ "logits/rejected": -1.7026523351669312,
1008
+ "logps/chosen": -33.33148193359375,
1009
+ "logps/rejected": -34.69775390625,
1010
+ "loss": 0.9789,
1011
+ "rewards/accuracies": 0.7875000238418579,
1012
+ "rewards/chosen": 0.00647822767496109,
1013
+ "rewards/margins": 0.021053150296211243,
1014
+ "rewards/rejected": -0.014574920758605003,
1015
+ "step": 640
1016
+ },
1017
+ {
1018
+ "epoch": 1.69,
1019
+ "learning_rate": 3.579601087369492e-06,
1020
+ "logits/chosen": -1.767584204673767,
1021
+ "logits/rejected": -1.7815086841583252,
1022
+ "logps/chosen": -31.229949951171875,
1023
+ "logps/rejected": -33.55590057373047,
1024
+ "loss": 0.9782,
1025
+ "rewards/accuracies": 0.8125,
1026
+ "rewards/chosen": 0.006225164048373699,
1027
+ "rewards/margins": 0.02180730551481247,
1028
+ "rewards/rejected": -0.015582139603793621,
1029
+ "step": 650
1030
+ },
1031
+ {
1032
+ "epoch": 1.71,
1033
+ "learning_rate": 3.5282177578265295e-06,
1034
+ "logits/chosen": -1.6283632516860962,
1035
+ "logits/rejected": -1.6250991821289062,
1036
+ "logps/chosen": -32.852622985839844,
1037
+ "logps/rejected": -36.96779251098633,
1038
+ "loss": 0.9666,
1039
+ "rewards/accuracies": 0.8500000238418579,
1040
+ "rewards/chosen": 0.01070336066186428,
1041
+ "rewards/margins": 0.03337875381112099,
1042
+ "rewards/rejected": -0.022675391286611557,
1043
+ "step": 660
1044
+ },
1045
+ {
1046
+ "epoch": 1.74,
1047
+ "learning_rate": 3.476306177936961e-06,
1048
+ "logits/chosen": -1.7106907367706299,
1049
+ "logits/rejected": -1.710681676864624,
1050
+ "logps/chosen": -30.867549896240234,
1051
+ "logps/rejected": -36.25127410888672,
1052
+ "loss": 0.9716,
1053
+ "rewards/accuracies": 0.7875000238418579,
1054
+ "rewards/chosen": 0.00418041180819273,
1055
+ "rewards/margins": 0.028429334983229637,
1056
+ "rewards/rejected": -0.024248924106359482,
1057
+ "step": 670
1058
+ },
1059
+ {
1060
+ "epoch": 1.77,
1061
+ "learning_rate": 3.423893017450324e-06,
1062
+ "logits/chosen": -1.6477956771850586,
1063
+ "logits/rejected": -1.6444308757781982,
1064
+ "logps/chosen": -30.36174964904785,
1065
+ "logps/rejected": -34.880332946777344,
1066
+ "loss": 0.9743,
1067
+ "rewards/accuracies": 0.8125,
1068
+ "rewards/chosen": 0.005105969496071339,
1069
+ "rewards/margins": 0.025731250643730164,
1070
+ "rewards/rejected": -0.0206252820789814,
1071
+ "step": 680
1072
+ },
1073
+ {
1074
+ "epoch": 1.79,
1075
+ "learning_rate": 3.3710052038048794e-06,
1076
+ "logits/chosen": -1.6614688634872437,
1077
+ "logits/rejected": -1.6616519689559937,
1078
+ "logps/chosen": -29.40378189086914,
1079
+ "logps/rejected": -32.798709869384766,
1080
+ "loss": 0.9711,
1081
+ "rewards/accuracies": 0.824999988079071,
1082
+ "rewards/chosen": 0.007593109272420406,
1083
+ "rewards/margins": 0.028875216841697693,
1084
+ "rewards/rejected": -0.021282104775309563,
1085
+ "step": 690
1086
+ },
1087
+ {
1088
+ "epoch": 1.82,
1089
+ "learning_rate": 3.3176699082935546e-06,
1090
+ "logits/chosen": -1.579671025276184,
1091
+ "logits/rejected": -1.583088994026184,
1092
+ "logps/chosen": -33.53199005126953,
1093
+ "logps/rejected": -33.63783264160156,
1094
+ "loss": 0.9688,
1095
+ "rewards/accuracies": 0.675000011920929,
1096
+ "rewards/chosen": 0.009464827366173267,
1097
+ "rewards/margins": 0.031196285039186478,
1098
+ "rewards/rejected": -0.021731454879045486,
1099
+ "step": 700
1100
+ },
1101
+ {
1102
+ "epoch": 1.82,
1103
+ "eval_logits/chosen": -1.9279817342758179,
1104
+ "eval_logits/rejected": -1.923504114151001,
1105
+ "eval_logps/chosen": -35.46812438964844,
1106
+ "eval_logps/rejected": -39.259788513183594,
1107
+ "eval_loss": 0.9969056844711304,
1108
+ "eval_rewards/accuracies": 0.5481727719306946,
1109
+ "eval_rewards/chosen": -0.014335726387798786,
1110
+ "eval_rewards/margins": 0.0030959637369960546,
1111
+ "eval_rewards/rejected": -0.01743169128894806,
1112
+ "eval_runtime": 145.8801,
1113
+ "eval_samples_per_second": 2.351,
1114
+ "eval_steps_per_second": 0.295,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 1.84,
1119
+ "learning_rate": 3.2639145321045933e-06,
1120
+ "logits/chosen": -1.6492347717285156,
1121
+ "logits/rejected": -1.6412155628204346,
1122
+ "logps/chosen": -36.10542297363281,
1123
+ "logps/rejected": -34.03607940673828,
1124
+ "loss": 0.9746,
1125
+ "rewards/accuracies": 0.7875000238418579,
1126
+ "rewards/chosen": 0.004065921995788813,
1127
+ "rewards/margins": 0.025368575006723404,
1128
+ "rewards/rejected": -0.02130264975130558,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 1.87,
1133
+ "learning_rate": 3.2097666922441107e-06,
1134
+ "logits/chosen": -1.649939775466919,
1135
+ "logits/rejected": -1.651545524597168,
1136
+ "logps/chosen": -36.18703842163086,
1137
+ "logps/rejected": -35.8452262878418,
1138
+ "loss": 0.9698,
1139
+ "rewards/accuracies": 0.8374999761581421,
1140
+ "rewards/chosen": 0.002483480144292116,
1141
+ "rewards/margins": 0.03017548657953739,
1142
+ "rewards/rejected": -0.027692005038261414,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 1.9,
1147
+ "learning_rate": 3.1552542073477554e-06,
1148
+ "logits/chosen": -1.6672731637954712,
1149
+ "logits/rejected": -1.6649030447006226,
1150
+ "logps/chosen": -31.656982421875,
1151
+ "logps/rejected": -35.12821578979492,
1152
+ "loss": 0.9696,
1153
+ "rewards/accuracies": 0.8125,
1154
+ "rewards/chosen": 0.009063473902642727,
1155
+ "rewards/margins": 0.030376678332686424,
1156
+ "rewards/rejected": -0.02131320908665657,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 1.92,
1161
+ "learning_rate": 3.100405083388799e-06,
1162
+ "logits/chosen": -1.6315555572509766,
1163
+ "logits/rejected": -1.6366676092147827,
1164
+ "logps/chosen": -30.99042320251465,
1165
+ "logps/rejected": -35.70110321044922,
1166
+ "loss": 0.9666,
1167
+ "rewards/accuracies": 0.862500011920929,
1168
+ "rewards/chosen": 0.007953016087412834,
1169
+ "rewards/margins": 0.03341587260365486,
1170
+ "rewards/rejected": -0.025462854653596878,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 1.95,
1175
+ "learning_rate": 3.0452474992899645e-06,
1176
+ "logits/chosen": -1.5864739418029785,
1177
+ "logits/rejected": -1.5849554538726807,
1178
+ "logps/chosen": -32.77734375,
1179
+ "logps/rejected": -37.71215057373047,
1180
+ "loss": 0.9663,
1181
+ "rewards/accuracies": 0.75,
1182
+ "rewards/chosen": 0.0034763626754283905,
1183
+ "rewards/margins": 0.03373824805021286,
1184
+ "rewards/rejected": -0.03026188537478447,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 1.97,
1189
+ "learning_rate": 2.989809792446417e-06,
1190
+ "logits/chosen": -1.453434705734253,
1191
+ "logits/rejected": -1.448988676071167,
1192
+ "logps/chosen": -35.56426239013672,
1193
+ "logps/rejected": -38.61144256591797,
1194
+ "loss": 0.9616,
1195
+ "rewards/accuracies": 0.7875000238418579,
1196
+ "rewards/chosen": 0.004341802094131708,
1197
+ "rewards/margins": 0.03837207704782486,
1198
+ "rewards/rejected": -0.03403027355670929,
1199
+ "step": 760
1200
+ },
1201
+ {
1202
+ "epoch": 2.0,
1203
+ "learning_rate": 2.9341204441673267e-06,
1204
+ "logits/chosen": -1.5692594051361084,
1205
+ "logits/rejected": -1.5738866329193115,
1206
+ "logps/chosen": -35.17799758911133,
1207
+ "logps/rejected": -36.24814224243164,
1208
+ "loss": 0.9731,
1209
+ "rewards/accuracies": 0.7208333611488342,
1210
+ "rewards/chosen": 0.0003566344385035336,
1211
+ "rewards/margins": 0.02688964083790779,
1212
+ "rewards/rejected": -0.026533011347055435,
1213
+ "step": 770
1214
+ },
1215
+ {
1216
+ "epoch": 2.03,
1217
+ "learning_rate": 2.878208065043501e-06,
1218
+ "logits/chosen": -1.5177220106124878,
1219
+ "logits/rejected": -1.5161765813827515,
1220
+ "logps/chosen": -32.8176155090332,
1221
+ "logps/rejected": -39.03325653076172,
1222
+ "loss": 0.9429,
1223
+ "rewards/accuracies": 0.887499988079071,
1224
+ "rewards/chosen": 0.010839789174497128,
1225
+ "rewards/margins": 0.0571257583796978,
1226
+ "rewards/rejected": -0.0462859682738781,
1227
+ "step": 780
1228
+ },
1229
+ {
1230
+ "epoch": 2.05,
1231
+ "learning_rate": 2.8221013802485974e-06,
1232
+ "logits/chosen": -1.5499645471572876,
1233
+ "logits/rejected": -1.5473253726959229,
1234
+ "logps/chosen": -32.2547607421875,
1235
+ "logps/rejected": -36.87131881713867,
1236
+ "loss": 0.9509,
1237
+ "rewards/accuracies": 0.8500000238418579,
1238
+ "rewards/chosen": 0.011209758929908276,
1239
+ "rewards/margins": 0.04905728995800018,
1240
+ "rewards/rejected": -0.037847526371479034,
1241
+ "step": 790
1242
+ },
1243
+ {
1244
+ "epoch": 2.08,
1245
+ "learning_rate": 2.76582921478147e-06,
1246
+ "logits/chosen": -1.4648706912994385,
1247
+ "logits/rejected": -1.459632396697998,
1248
+ "logps/chosen": -33.919517517089844,
1249
+ "logps/rejected": -35.301055908203125,
1250
+ "loss": 0.957,
1251
+ "rewards/accuracies": 0.875,
1252
+ "rewards/chosen": 0.004635663237422705,
1253
+ "rewards/margins": 0.04300360754132271,
1254
+ "rewards/rejected": -0.038367945700883865,
1255
+ "step": 800
1256
+ },
1257
+ {
1258
+ "epoch": 2.08,
1259
+ "eval_logits/chosen": -1.840654969215393,
1260
+ "eval_logits/rejected": -1.8362884521484375,
1261
+ "eval_logps/chosen": -36.17332458496094,
1262
+ "eval_logps/rejected": -40.119380950927734,
1263
+ "eval_loss": 0.9953566193580627,
1264
+ "eval_rewards/accuracies": 0.5539867281913757,
1265
+ "eval_rewards/chosen": -0.021387748420238495,
1266
+ "eval_rewards/margins": 0.004639865830540657,
1267
+ "eval_rewards/rejected": -0.0260276161134243,
1268
+ "eval_runtime": 145.904,
1269
+ "eval_samples_per_second": 2.351,
1270
+ "eval_steps_per_second": 0.295,
1271
+ "step": 800
1272
+ },
1273
+ {
1274
+ "epoch": 2.1,
1275
+ "learning_rate": 2.7094204786572254e-06,
1276
+ "logits/chosen": -1.547839641571045,
1277
+ "logits/rejected": -1.5548707246780396,
1278
+ "logps/chosen": -31.346927642822266,
1279
+ "logps/rejected": -39.03870391845703,
1280
+ "loss": 0.9444,
1281
+ "rewards/accuracies": 0.875,
1282
+ "rewards/chosen": 0.007813268341124058,
1283
+ "rewards/margins": 0.05564187094569206,
1284
+ "rewards/rejected": -0.04782859981060028,
1285
+ "step": 810
1286
+ },
1287
+ {
1288
+ "epoch": 2.13,
1289
+ "learning_rate": 2.6529041520546072e-06,
1290
+ "logits/chosen": -1.514336347579956,
1291
+ "logits/rejected": -1.5161477327346802,
1292
+ "logps/chosen": -32.147666931152344,
1293
+ "logps/rejected": -36.56987762451172,
1294
+ "loss": 0.9624,
1295
+ "rewards/accuracies": 0.8125,
1296
+ "rewards/chosen": 0.004234147723764181,
1297
+ "rewards/margins": 0.03763490542769432,
1298
+ "rewards/rejected": -0.0334007553756237,
1299
+ "step": 820
1300
+ },
1301
+ {
1302
+ "epoch": 2.16,
1303
+ "learning_rate": 2.5963092704273302e-06,
1304
+ "logits/chosen": -1.3985604047775269,
1305
+ "logits/rejected": -1.4026976823806763,
1306
+ "logps/chosen": -32.4110221862793,
1307
+ "logps/rejected": -40.09288787841797,
1308
+ "loss": 0.9464,
1309
+ "rewards/accuracies": 0.875,
1310
+ "rewards/chosen": 0.0018043376039713621,
1311
+ "rewards/margins": 0.053631413727998734,
1312
+ "rewards/rejected": -0.051827073097229004,
1313
+ "step": 830
1314
+ },
1315
+ {
1316
+ "epoch": 2.18,
1317
+ "learning_rate": 2.53966490958702e-06,
1318
+ "logits/chosen": -1.4642579555511475,
1319
+ "logits/rejected": -1.4605650901794434,
1320
+ "logps/chosen": -32.868621826171875,
1321
+ "logps/rejected": -37.319786071777344,
1322
+ "loss": 0.9557,
1323
+ "rewards/accuracies": 0.800000011920929,
1324
+ "rewards/chosen": 0.0010002745548263192,
1325
+ "rewards/margins": 0.04425168037414551,
1326
+ "rewards/rejected": -0.0432514064013958,
1327
+ "step": 840
1328
+ },
1329
+ {
1330
+ "epoch": 2.21,
1331
+ "learning_rate": 2.4830001707654135e-06,
1332
+ "logits/chosen": -1.5074342489242554,
1333
+ "logits/rejected": -1.5097802877426147,
1334
+ "logps/chosen": -32.264610290527344,
1335
+ "logps/rejected": -41.21857833862305,
1336
+ "loss": 0.9393,
1337
+ "rewards/accuracies": 0.8999999761581421,
1338
+ "rewards/chosen": 0.005107272416353226,
1339
+ "rewards/margins": 0.06067372113466263,
1340
+ "rewards/rejected": -0.0555664524435997,
1341
+ "step": 850
1342
+ },
1343
+ {
1344
+ "epoch": 2.23,
1345
+ "learning_rate": 2.4263441656635054e-06,
1346
+ "logits/chosen": -1.3380223512649536,
1347
+ "logits/rejected": -1.3343451023101807,
1348
+ "logps/chosen": -37.09096908569336,
1349
+ "logps/rejected": -38.383792877197266,
1350
+ "loss": 0.9532,
1351
+ "rewards/accuracies": 0.7749999761581421,
1352
+ "rewards/chosen": -0.008092949166893959,
1353
+ "rewards/margins": 0.04682592675089836,
1354
+ "rewards/rejected": -0.05491887778043747,
1355
+ "step": 860
1356
+ },
1357
+ {
1358
+ "epoch": 2.26,
1359
+ "learning_rate": 2.3697260014953107e-06,
1360
+ "logits/chosen": -1.360643744468689,
1361
+ "logits/rejected": -1.3605453968048096,
1362
+ "logps/chosen": -36.00204086303711,
1363
+ "logps/rejected": -40.74755096435547,
1364
+ "loss": 0.9426,
1365
+ "rewards/accuracies": 0.8125,
1366
+ "rewards/chosen": -0.0015923639293760061,
1367
+ "rewards/margins": 0.057420529425144196,
1368
+ "rewards/rejected": -0.059012897312641144,
1369
+ "step": 870
1370
+ },
1371
+ {
1372
+ "epoch": 2.29,
1373
+ "learning_rate": 2.3131747660339396e-06,
1374
+ "logits/chosen": -1.3869389295578003,
1375
+ "logits/rejected": -1.3750654458999634,
1376
+ "logps/chosen": -34.1107292175293,
1377
+ "logps/rejected": -38.999202728271484,
1378
+ "loss": 0.9393,
1379
+ "rewards/accuracies": 0.8500000238418579,
1380
+ "rewards/chosen": -4.148450534557924e-05,
1381
+ "rewards/margins": 0.06073024123907089,
1382
+ "rewards/rejected": -0.06077173352241516,
1383
+ "step": 880
1384
+ },
1385
+ {
1386
+ "epoch": 2.31,
1387
+ "learning_rate": 2.256719512667651e-06,
1388
+ "logits/chosen": -1.476839303970337,
1389
+ "logits/rejected": -1.4818614721298218,
1390
+ "logps/chosen": -34.29611587524414,
1391
+ "logps/rejected": -39.490264892578125,
1392
+ "loss": 0.9391,
1393
+ "rewards/accuracies": 0.762499988079071,
1394
+ "rewards/chosen": -0.009661799296736717,
1395
+ "rewards/margins": 0.06087601184844971,
1396
+ "rewards/rejected": -0.07053781300783157,
1397
+ "step": 890
1398
+ },
1399
+ {
1400
+ "epoch": 2.34,
1401
+ "learning_rate": 2.2003892454735786e-06,
1402
+ "logits/chosen": -1.4013174772262573,
1403
+ "logits/rejected": -1.3942521810531616,
1404
+ "logps/chosen": -35.21047592163086,
1405
+ "logps/rejected": -39.179725646972656,
1406
+ "loss": 0.9358,
1407
+ "rewards/accuracies": 0.862500011920929,
1408
+ "rewards/chosen": -0.0035639957059174776,
1409
+ "rewards/margins": 0.064211905002594,
1410
+ "rewards/rejected": -0.0677758976817131,
1411
+ "step": 900
1412
+ },
1413
+ {
1414
+ "epoch": 2.34,
1415
+ "eval_logits/chosen": -1.7029222249984741,
1416
+ "eval_logits/rejected": -1.6987833976745605,
1417
+ "eval_logps/chosen": -37.65321350097656,
1418
+ "eval_logps/rejected": -41.74834060668945,
1419
+ "eval_loss": 0.9938713312149048,
1420
+ "eval_rewards/accuracies": 0.5365448594093323,
1421
+ "eval_rewards/chosen": -0.03618660196661949,
1422
+ "eval_rewards/margins": 0.00613059988245368,
1423
+ "eval_rewards/rejected": -0.04231720417737961,
1424
+ "eval_runtime": 145.9089,
1425
+ "eval_samples_per_second": 2.351,
1426
+ "eval_steps_per_second": 0.295,
1427
+ "step": 900
1428
+ },
1429
+ {
1430
+ "epoch": 2.36,
1431
+ "learning_rate": 2.1442129043167877e-06,
1432
+ "logits/chosen": -1.3909523487091064,
1433
+ "logits/rejected": -1.3909533023834229,
1434
+ "logps/chosen": -31.57651138305664,
1435
+ "logps/rejected": -42.50416564941406,
1436
+ "loss": 0.9259,
1437
+ "rewards/accuracies": 0.8374999761581421,
1438
+ "rewards/chosen": -0.004093508701771498,
1439
+ "rewards/margins": 0.07405127584934235,
1440
+ "rewards/rejected": -0.07814478874206543,
1441
+ "step": 910
1442
+ },
1443
+ {
1444
+ "epoch": 2.39,
1445
+ "learning_rate": 2.088219349982323e-06,
1446
+ "logits/chosen": -1.3331390619277954,
1447
+ "logits/rejected": -1.3248008489608765,
1448
+ "logps/chosen": -33.280738830566406,
1449
+ "logps/rejected": -41.13130569458008,
1450
+ "loss": 0.9388,
1451
+ "rewards/accuracies": 0.8125,
1452
+ "rewards/chosen": -0.014417916536331177,
1453
+ "rewards/margins": 0.06118142604827881,
1454
+ "rewards/rejected": -0.07559934258460999,
1455
+ "step": 920
1456
+ },
1457
+ {
1458
+ "epoch": 2.42,
1459
+ "learning_rate": 2.0324373493478803e-06,
1460
+ "logits/chosen": -1.474799394607544,
1461
+ "logits/rejected": -1.473085641860962,
1462
+ "logps/chosen": -30.739898681640625,
1463
+ "logps/rejected": -40.02002716064453,
1464
+ "loss": 0.9359,
1465
+ "rewards/accuracies": 0.7749999761581421,
1466
+ "rewards/chosen": -0.007428646087646484,
1467
+ "rewards/margins": 0.06412303447723389,
1468
+ "rewards/rejected": -0.07155168056488037,
1469
+ "step": 930
1470
+ },
1471
+ {
1472
+ "epoch": 2.44,
1473
+ "learning_rate": 1.976895560604729e-06,
1474
+ "logits/chosen": -1.3616539239883423,
1475
+ "logits/rejected": -1.372312307357788,
1476
+ "logps/chosen": -36.05510330200195,
1477
+ "logps/rejected": -41.12104034423828,
1478
+ "loss": 0.935,
1479
+ "rewards/accuracies": 0.7875000238418579,
1480
+ "rewards/chosen": -0.016300354152917862,
1481
+ "rewards/margins": 0.06501360237598419,
1482
+ "rewards/rejected": -0.08131395280361176,
1483
+ "step": 940
1484
+ },
1485
+ {
1486
+ "epoch": 2.47,
1487
+ "learning_rate": 1.921622518534466e-06,
1488
+ "logits/chosen": -1.3951683044433594,
1489
+ "logits/rejected": -1.3979324102401733,
1490
+ "logps/chosen": -32.248931884765625,
1491
+ "logps/rejected": -39.3216552734375,
1492
+ "loss": 0.9402,
1493
+ "rewards/accuracies": 0.7124999761581421,
1494
+ "rewards/chosen": -0.016012068837881088,
1495
+ "rewards/margins": 0.05984366685152054,
1496
+ "rewards/rejected": -0.07585573941469193,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 2.49,
1501
+ "learning_rate": 1.8666466198491794e-06,
1502
+ "logits/chosen": -1.3717939853668213,
1503
+ "logits/rejected": -1.3667659759521484,
1504
+ "logps/chosen": -35.57506561279297,
1505
+ "logps/rejected": -42.31549835205078,
1506
+ "loss": 0.9309,
1507
+ "rewards/accuracies": 0.762499988079071,
1508
+ "rewards/chosen": -0.014998522587120533,
1509
+ "rewards/margins": 0.06909255683422089,
1510
+ "rewards/rejected": -0.08409108221530914,
1511
+ "step": 960
1512
+ },
1513
+ {
1514
+ "epoch": 2.52,
1515
+ "learning_rate": 1.8119961086025376e-06,
1516
+ "logits/chosen": -1.294325590133667,
1517
+ "logits/rejected": -1.2963002920150757,
1518
+ "logps/chosen": -34.213260650634766,
1519
+ "logps/rejected": -43.49393081665039,
1520
+ "loss": 0.9309,
1521
+ "rewards/accuracies": 0.800000011920929,
1522
+ "rewards/chosen": -0.014005112461745739,
1523
+ "rewards/margins": 0.06911824643611908,
1524
+ "rewards/rejected": -0.0831233561038971,
1525
+ "step": 970
1526
+ },
1527
+ {
1528
+ "epoch": 2.55,
1529
+ "learning_rate": 1.7576990616793139e-06,
1530
+ "logits/chosen": -1.3165125846862793,
1531
+ "logits/rejected": -1.310859203338623,
1532
+ "logps/chosen": -38.791927337646484,
1533
+ "logps/rejected": -45.34046936035156,
1534
+ "loss": 0.9431,
1535
+ "rewards/accuracies": 0.762499988079071,
1536
+ "rewards/chosen": -0.03386862203478813,
1537
+ "rewards/margins": 0.05691651254892349,
1538
+ "rewards/rejected": -0.09078512340784073,
1539
+ "step": 980
1540
+ },
1541
+ {
1542
+ "epoch": 2.57,
1543
+ "learning_rate": 1.7037833743707892e-06,
1544
+ "logits/chosen": -1.3041751384735107,
1545
+ "logits/rejected": -1.2975685596466064,
1546
+ "logps/chosen": -32.80427551269531,
1547
+ "logps/rejected": -45.00722885131836,
1548
+ "loss": 0.9278,
1549
+ "rewards/accuracies": 0.8125,
1550
+ "rewards/chosen": -0.018013231456279755,
1551
+ "rewards/margins": 0.07216015458106995,
1552
+ "rewards/rejected": -0.0901733785867691,
1553
+ "step": 990
1554
+ },
1555
+ {
1556
+ "epoch": 2.6,
1557
+ "learning_rate": 1.6502767460434588e-06,
1558
+ "logits/chosen": -1.2888100147247314,
1559
+ "logits/rejected": -1.2791659832000732,
1560
+ "logps/chosen": -34.17406463623047,
1561
+ "logps/rejected": -36.677513122558594,
1562
+ "loss": 0.9535,
1563
+ "rewards/accuracies": 0.699999988079071,
1564
+ "rewards/chosen": -0.024253757670521736,
1565
+ "rewards/margins": 0.04652407765388489,
1566
+ "rewards/rejected": -0.07077784091234207,
1567
+ "step": 1000
1568
+ },
1569
+ {
1570
+ "epoch": 2.6,
1571
+ "eval_logits/chosen": -1.6182585954666138,
1572
+ "eval_logits/rejected": -1.614295244216919,
1573
+ "eval_logps/chosen": -39.14792251586914,
1574
+ "eval_logps/rejected": -43.423744201660156,
1575
+ "eval_loss": 0.9920738935470581,
1576
+ "eval_rewards/accuracies": 0.545265793800354,
1577
+ "eval_rewards/chosen": -0.05113373324275017,
1578
+ "eval_rewards/margins": 0.007937532849609852,
1579
+ "eval_rewards/rejected": -0.059071265161037445,
1580
+ "eval_runtime": 145.6856,
1581
+ "eval_samples_per_second": 2.354,
1582
+ "eval_steps_per_second": 0.295,
1583
+ "step": 1000
1584
+ },
1585
+ {
1586
+ "epoch": 2.62,
1587
+ "learning_rate": 1.5972066659083796e-06,
1588
+ "logits/chosen": -1.353121042251587,
1589
+ "logits/rejected": -1.35282301902771,
1590
+ "logps/chosen": -34.075523376464844,
1591
+ "logps/rejected": -38.43246078491211,
1592
+ "loss": 0.9408,
1593
+ "rewards/accuracies": 0.699999988079071,
1594
+ "rewards/chosen": -0.022513313218951225,
1595
+ "rewards/margins": 0.059226490557193756,
1596
+ "rewards/rejected": -0.08173979073762894,
1597
+ "step": 1010
1598
+ },
1599
+ {
1600
+ "epoch": 2.65,
1601
+ "learning_rate": 1.5446003988985041e-06,
1602
+ "logits/chosen": -1.403580904006958,
1603
+ "logits/rejected": -1.4037662744522095,
1604
+ "logps/chosen": -33.98898696899414,
1605
+ "logps/rejected": -39.30225372314453,
1606
+ "loss": 0.9391,
1607
+ "rewards/accuracies": 0.762499988079071,
1608
+ "rewards/chosen": -0.020454101264476776,
1609
+ "rewards/margins": 0.06091039255261421,
1610
+ "rewards/rejected": -0.08136448264122009,
1611
+ "step": 1020
1612
+ },
1613
+ {
1614
+ "epoch": 2.68,
1615
+ "learning_rate": 1.4924849716612211e-06,
1616
+ "logits/chosen": -1.3622567653656006,
1617
+ "logits/rejected": -1.3672521114349365,
1618
+ "logps/chosen": -35.09298324584961,
1619
+ "logps/rejected": -35.331912994384766,
1620
+ "loss": 0.9546,
1621
+ "rewards/accuracies": 0.6875,
1622
+ "rewards/chosen": -0.02812912128865719,
1623
+ "rewards/margins": 0.04535282403230667,
1624
+ "rewards/rejected": -0.07348194718360901,
1625
+ "step": 1030
1626
+ },
1627
+ {
1628
+ "epoch": 2.7,
1629
+ "learning_rate": 1.440887158673332e-06,
1630
+ "logits/chosen": -1.3483327627182007,
1631
+ "logits/rejected": -1.3397239446640015,
1632
+ "logps/chosen": -33.612525939941406,
1633
+ "logps/rejected": -43.52740478515625,
1634
+ "loss": 0.9241,
1635
+ "rewards/accuracies": 0.7749999761581421,
1636
+ "rewards/chosen": -0.024346303194761276,
1637
+ "rewards/margins": 0.07592549920082092,
1638
+ "rewards/rejected": -0.1002717837691307,
1639
+ "step": 1040
1640
+ },
1641
+ {
1642
+ "epoch": 2.73,
1643
+ "learning_rate": 1.3898334684855647e-06,
1644
+ "logits/chosen": -1.3182249069213867,
1645
+ "logits/rejected": -1.3285300731658936,
1646
+ "logps/chosen": -36.65705108642578,
1647
+ "logps/rejected": -41.1858024597168,
1648
+ "loss": 0.9416,
1649
+ "rewards/accuracies": 0.824999988079071,
1650
+ "rewards/chosen": -0.0310394074767828,
1651
+ "rewards/margins": 0.05836848169565201,
1652
+ "rewards/rejected": -0.08940788358449936,
1653
+ "step": 1050
1654
+ },
1655
+ {
1656
+ "epoch": 2.75,
1657
+ "learning_rate": 1.3393501301037245e-06,
1658
+ "logits/chosen": -1.3787955045700073,
1659
+ "logits/rejected": -1.3705497980117798,
1660
+ "logps/chosen": -35.9283561706543,
1661
+ "logps/rejected": -47.05866241455078,
1662
+ "loss": 0.917,
1663
+ "rewards/accuracies": 0.7250000238418579,
1664
+ "rewards/chosen": -0.024275777861475945,
1665
+ "rewards/margins": 0.08299694955348969,
1666
+ "rewards/rejected": -0.10727272927761078,
1667
+ "step": 1060
1668
+ },
1669
+ {
1670
+ "epoch": 2.78,
1671
+ "learning_rate": 1.2894630795134454e-06,
1672
+ "logits/chosen": -1.2876031398773193,
1673
+ "logits/rejected": -1.2903258800506592,
1674
+ "logps/chosen": -38.158668518066406,
1675
+ "logps/rejected": -41.497459411621094,
1676
+ "loss": 0.9337,
1677
+ "rewards/accuracies": 0.762499988079071,
1678
+ "rewards/chosen": -0.021693874150514603,
1679
+ "rewards/margins": 0.06630205363035202,
1680
+ "rewards/rejected": -0.08799593150615692,
1681
+ "step": 1070
1682
+ },
1683
+ {
1684
+ "epoch": 2.81,
1685
+ "learning_rate": 1.2401979463554984e-06,
1686
+ "logits/chosen": -1.3835922479629517,
1687
+ "logits/rejected": -1.38229238986969,
1688
+ "logps/chosen": -35.45813751220703,
1689
+ "logps/rejected": -44.54060363769531,
1690
+ "loss": 0.9198,
1691
+ "rewards/accuracies": 0.800000011920929,
1692
+ "rewards/chosen": -0.024004962295293808,
1693
+ "rewards/margins": 0.08024422824382782,
1694
+ "rewards/rejected": -0.10424919426441193,
1695
+ "step": 1080
1696
+ },
1697
+ {
1698
+ "epoch": 2.83,
1699
+ "learning_rate": 1.1915800407584705e-06,
1700
+ "logits/chosen": -1.3808025121688843,
1701
+ "logits/rejected": -1.3838340044021606,
1702
+ "logps/chosen": -33.26921463012695,
1703
+ "logps/rejected": -42.89200210571289,
1704
+ "loss": 0.9303,
1705
+ "rewards/accuracies": 0.800000011920929,
1706
+ "rewards/chosen": -0.023758988827466965,
1707
+ "rewards/margins": 0.06968275457620621,
1708
+ "rewards/rejected": -0.09344174712896347,
1709
+ "step": 1090
1710
+ },
1711
+ {
1712
+ "epoch": 2.86,
1713
+ "learning_rate": 1.1436343403356019e-06,
1714
+ "logits/chosen": -1.3661361932754517,
1715
+ "logits/rejected": -1.3705623149871826,
1716
+ "logps/chosen": -36.603370666503906,
1717
+ "logps/rejected": -37.271080017089844,
1718
+ "loss": 0.9616,
1719
+ "rewards/accuracies": 0.7250000238418579,
1720
+ "rewards/chosen": -0.030343661084771156,
1721
+ "rewards/margins": 0.03835037350654602,
1722
+ "rewards/rejected": -0.06869403272867203,
1723
+ "step": 1100
1724
+ },
1725
+ {
1726
+ "epoch": 2.86,
1727
+ "eval_logits/chosen": -1.5919773578643799,
1728
+ "eval_logits/rejected": -1.5880482196807861,
1729
+ "eval_logps/chosen": -39.65049743652344,
1730
+ "eval_logps/rejected": -43.97543716430664,
1731
+ "eval_loss": 0.9915903806686401,
1732
+ "eval_rewards/accuracies": 0.545265793800354,
1733
+ "eval_rewards/chosen": -0.05615944042801857,
1734
+ "eval_rewards/margins": 0.008428744971752167,
1735
+ "eval_rewards/rejected": -0.06458818912506104,
1736
+ "eval_runtime": 145.863,
1737
+ "eval_samples_per_second": 2.352,
1738
+ "eval_steps_per_second": 0.295,
1739
+ "step": 1100
1740
+ },
1741
+ {
1742
+ "epoch": 2.88,
1743
+ "learning_rate": 1.0963854773524548e-06,
1744
+ "logits/chosen": -1.355208158493042,
1745
+ "logits/rejected": -1.3564389944076538,
1746
+ "logps/chosen": -34.99750900268555,
1747
+ "logps/rejected": -39.34065628051758,
1748
+ "loss": 0.9373,
1749
+ "rewards/accuracies": 0.7749999761581421,
1750
+ "rewards/chosen": -0.020573578774929047,
1751
+ "rewards/margins": 0.06267382204532623,
1752
+ "rewards/rejected": -0.08324739336967468,
1753
+ "step": 1110
1754
+ },
1755
+ {
1756
+ "epoch": 2.91,
1757
+ "learning_rate": 1.049857726072005e-06,
1758
+ "logits/chosen": -1.227752923965454,
1759
+ "logits/rejected": -1.2306647300720215,
1760
+ "logps/chosen": -37.04521942138672,
1761
+ "logps/rejected": -41.808509826660156,
1762
+ "loss": 0.9371,
1763
+ "rewards/accuracies": 0.699999988079071,
1764
+ "rewards/chosen": -0.02480238489806652,
1765
+ "rewards/margins": 0.06287000328302383,
1766
+ "rewards/rejected": -0.0876723974943161,
1767
+ "step": 1120
1768
+ },
1769
+ {
1770
+ "epoch": 2.94,
1771
+ "learning_rate": 1.0040749902836508e-06,
1772
+ "logits/chosen": -1.250138521194458,
1773
+ "logits/rejected": -1.2487337589263916,
1774
+ "logps/chosen": -34.29961013793945,
1775
+ "logps/rejected": -39.423301696777344,
1776
+ "loss": 0.9503,
1777
+ "rewards/accuracies": 0.7124999761581421,
1778
+ "rewards/chosen": -0.032878875732421875,
1779
+ "rewards/margins": 0.04969673603773117,
1780
+ "rewards/rejected": -0.08257561177015305,
1781
+ "step": 1130
1782
+ },
1783
+ {
1784
+ "epoch": 2.96,
1785
+ "learning_rate": 9.59060791022566e-07,
1786
+ "logits/chosen": -1.363937258720398,
1787
+ "logits/rejected": -1.360128402709961,
1788
+ "logps/chosen": -34.89466094970703,
1789
+ "logps/rejected": -41.99665832519531,
1790
+ "loss": 0.9281,
1791
+ "rewards/accuracies": 0.8125,
1792
+ "rewards/chosen": -0.018548697233200073,
1793
+ "rewards/margins": 0.07187779247760773,
1794
+ "rewards/rejected": -0.0904264897108078,
1795
+ "step": 1140
1796
+ },
1797
+ {
1798
+ "epoch": 2.99,
1799
+ "learning_rate": 9.148382544856885e-07,
1800
+ "logits/chosen": -1.2763797044754028,
1801
+ "logits/rejected": -1.2683088779449463,
1802
+ "logps/chosen": -36.54337692260742,
1803
+ "logps/rejected": -40.25540542602539,
1804
+ "loss": 0.9413,
1805
+ "rewards/accuracies": 0.7250000238418579,
1806
+ "rewards/chosen": -0.030916398391127586,
1807
+ "rewards/margins": 0.05871362239122391,
1808
+ "rewards/rejected": -0.08963000774383545,
1809
+ "step": 1150
1810
+ },
1811
+ {
1812
+ "epoch": 3.01,
1813
+ "learning_rate": 8.714301001505568e-07,
1814
+ "logits/chosen": -1.301361083984375,
1815
+ "logits/rejected": -1.3027942180633545,
1816
+ "logps/chosen": -36.6050910949707,
1817
+ "logps/rejected": -39.68968963623047,
1818
+ "loss": 0.9444,
1819
+ "rewards/accuracies": 0.7666667103767395,
1820
+ "rewards/chosen": -0.029233410954475403,
1821
+ "rewards/margins": 0.05564901977777481,
1822
+ "rewards/rejected": -0.08488242328166962,
1823
+ "step": 1160
1824
+ },
1825
+ {
1826
+ "epoch": 3.04,
1827
+ "learning_rate": 8.288586291031025e-07,
1828
+ "logits/chosen": -1.3797643184661865,
1829
+ "logits/rejected": -1.3745005130767822,
1830
+ "logps/chosen": -36.120018005371094,
1831
+ "logps/rejected": -41.199337005615234,
1832
+ "loss": 0.9434,
1833
+ "rewards/accuracies": 0.7124999761581421,
1834
+ "rewards/chosen": -0.02594051882624626,
1835
+ "rewards/margins": 0.05664067342877388,
1836
+ "rewards/rejected": -0.08258119225502014,
1837
+ "step": 1170
1838
+ },
1839
+ {
1840
+ "epoch": 3.06,
1841
+ "learning_rate": 7.871457125803897e-07,
1842
+ "logits/chosen": -1.2806499004364014,
1843
+ "logits/rejected": -1.289930820465088,
1844
+ "logps/chosen": -36.737998962402344,
1845
+ "logps/rejected": -41.14373016357422,
1846
+ "loss": 0.9462,
1847
+ "rewards/accuracies": 0.7124999761581421,
1848
+ "rewards/chosen": -0.03404999524354935,
1849
+ "rewards/margins": 0.053772974759340286,
1850
+ "rewards/rejected": -0.08782295882701874,
1851
+ "step": 1180
1852
+ },
1853
+ {
1854
+ "epoch": 3.09,
1855
+ "learning_rate": 7.463127807341966e-07,
1856
+ "logits/chosen": -1.2992810010910034,
1857
+ "logits/rejected": -1.2938917875289917,
1858
+ "logps/chosen": -33.985870361328125,
1859
+ "logps/rejected": -42.187801361083984,
1860
+ "loss": 0.9291,
1861
+ "rewards/accuracies": 0.8125,
1862
+ "rewards/chosen": -0.018122535198926926,
1863
+ "rewards/margins": 0.07092103362083435,
1864
+ "rewards/rejected": -0.08904357254505157,
1865
+ "step": 1190
1866
+ },
1867
+ {
1868
+ "epoch": 3.12,
1869
+ "learning_rate": 7.063808116212021e-07,
1870
+ "logits/chosen": -1.261099934577942,
1871
+ "logits/rejected": -1.262424349784851,
1872
+ "logps/chosen": -35.9713249206543,
1873
+ "logps/rejected": -43.56898880004883,
1874
+ "loss": 0.9167,
1875
+ "rewards/accuracies": 0.7124999761581421,
1876
+ "rewards/chosen": -0.02480882592499256,
1877
+ "rewards/margins": 0.08328056335449219,
1878
+ "rewards/rejected": -0.1080893874168396,
1879
+ "step": 1200
1880
+ },
1881
+ {
1882
+ "epoch": 3.12,
1883
+ "eval_logits/chosen": -1.5890045166015625,
1884
+ "eval_logits/rejected": -1.585091471672058,
1885
+ "eval_logps/chosen": -39.66655349731445,
1886
+ "eval_logps/rejected": -44.02885818481445,
1887
+ "eval_loss": 0.9912154078483582,
1888
+ "eval_rewards/accuracies": 0.5481727719306946,
1889
+ "eval_rewards/chosen": -0.056320033967494965,
1890
+ "eval_rewards/margins": 0.00880234595388174,
1891
+ "eval_rewards/rejected": -0.06512238085269928,
1892
+ "eval_runtime": 145.8882,
1893
+ "eval_samples_per_second": 2.351,
1894
+ "eval_steps_per_second": 0.295,
1895
+ "step": 1200
1896
+ },
1897
+ {
1898
+ "epoch": 3.14,
1899
+ "learning_rate": 6.673703204254348e-07,
1900
+ "logits/chosen": -1.2175233364105225,
1901
+ "logits/rejected": -1.2172149419784546,
1902
+ "logps/chosen": -37.891151428222656,
1903
+ "logps/rejected": -42.90294647216797,
1904
+ "loss": 0.9165,
1905
+ "rewards/accuracies": 0.8374999761581421,
1906
+ "rewards/chosen": -0.01838667131960392,
1907
+ "rewards/margins": 0.08354915678501129,
1908
+ "rewards/rejected": -0.10193584114313126,
1909
+ "step": 1210
1910
+ },
1911
+ {
1912
+ "epoch": 3.17,
1913
+ "learning_rate": 6.293013489185315e-07,
1914
+ "logits/chosen": -1.3444710969924927,
1915
+ "logits/rejected": -1.3372966051101685,
1916
+ "logps/chosen": -34.07860565185547,
1917
+ "logps/rejected": -43.33241653442383,
1918
+ "loss": 0.9192,
1919
+ "rewards/accuracies": 0.8125,
1920
+ "rewards/chosen": -0.02374918945133686,
1921
+ "rewards/margins": 0.08078955113887787,
1922
+ "rewards/rejected": -0.10453873872756958,
1923
+ "step": 1220
1924
+ },
1925
+ {
1926
+ "epoch": 3.19,
1927
+ "learning_rate": 5.921934551632086e-07,
1928
+ "logits/chosen": -1.2269846200942993,
1929
+ "logits/rejected": -1.2168152332305908,
1930
+ "logps/chosen": -36.28278350830078,
1931
+ "logps/rejected": -42.86933517456055,
1932
+ "loss": 0.9192,
1933
+ "rewards/accuracies": 0.824999988079071,
1934
+ "rewards/chosen": -0.01897098869085312,
1935
+ "rewards/margins": 0.08078411966562271,
1936
+ "rewards/rejected": -0.09975510835647583,
1937
+ "step": 1230
1938
+ },
1939
+ {
1940
+ "epoch": 3.22,
1941
+ "learning_rate": 5.560657034652405e-07,
1942
+ "logits/chosen": -1.3140621185302734,
1943
+ "logits/rejected": -1.308857798576355,
1944
+ "logps/chosen": -33.93239212036133,
1945
+ "logps/rejected": -37.61920928955078,
1946
+ "loss": 0.9448,
1947
+ "rewards/accuracies": 0.675000011920929,
1948
+ "rewards/chosen": -0.03232160210609436,
1949
+ "rewards/margins": 0.0551949217915535,
1950
+ "rewards/rejected": -0.08751652389764786,
1951
+ "step": 1240
1952
+ },
1953
+ {
1954
+ "epoch": 3.25,
1955
+ "learning_rate": 5.2093665457911e-07,
1956
+ "logits/chosen": -1.3194572925567627,
1957
+ "logits/rejected": -1.3273426294326782,
1958
+ "logps/chosen": -38.09012985229492,
1959
+ "logps/rejected": -40.47280502319336,
1960
+ "loss": 0.9374,
1961
+ "rewards/accuracies": 0.7250000238418579,
1962
+ "rewards/chosen": -0.026947390288114548,
1963
+ "rewards/margins": 0.06259752810001373,
1964
+ "rewards/rejected": -0.08954491466283798,
1965
+ "step": 1250
1966
+ },
1967
+ {
1968
+ "epoch": 3.27,
1969
+ "learning_rate": 4.868243561723535e-07,
1970
+ "logits/chosen": -1.3223885297775269,
1971
+ "logits/rejected": -1.3221849203109741,
1972
+ "logps/chosen": -36.7206916809082,
1973
+ "logps/rejected": -43.311153411865234,
1974
+ "loss": 0.932,
1975
+ "rewards/accuracies": 0.8374999761581421,
1976
+ "rewards/chosen": -0.03692134469747543,
1977
+ "rewards/margins": 0.06804148107767105,
1978
+ "rewards/rejected": -0.10496282577514648,
1979
+ "step": 1260
1980
+ },
1981
+ {
1982
+ "epoch": 3.3,
1983
+ "learning_rate": 4.537463335535161e-07,
1984
+ "logits/chosen": -1.2348535060882568,
1985
+ "logits/rejected": -1.2325520515441895,
1986
+ "logps/chosen": -34.88422393798828,
1987
+ "logps/rejected": -44.12788009643555,
1988
+ "loss": 0.9119,
1989
+ "rewards/accuracies": 0.8125,
1990
+ "rewards/chosen": -0.01793385110795498,
1991
+ "rewards/margins": 0.08813884109258652,
1992
+ "rewards/rejected": -0.10607268661260605,
1993
+ "step": 1270
1994
+ },
1995
+ {
1996
+ "epoch": 3.32,
1997
+ "learning_rate": 4.217195806684629e-07,
1998
+ "logits/chosen": -1.1531862020492554,
1999
+ "logits/rejected": -1.1495788097381592,
2000
+ "logps/chosen": -37.46773147583008,
2001
+ "logps/rejected": -39.666324615478516,
2002
+ "loss": 0.9349,
2003
+ "rewards/accuracies": 0.737500011920929,
2004
+ "rewards/chosen": -0.022302847355604172,
2005
+ "rewards/margins": 0.06509546935558319,
2006
+ "rewards/rejected": -0.08739831298589706,
2007
+ "step": 1280
2008
+ },
2009
+ {
2010
+ "epoch": 3.35,
2011
+ "learning_rate": 3.907605513696808e-07,
2012
+ "logits/chosen": -1.3214530944824219,
2013
+ "logits/rejected": -1.3079793453216553,
2014
+ "logps/chosen": -37.78839111328125,
2015
+ "logps/rejected": -46.277854919433594,
2016
+ "loss": 0.9193,
2017
+ "rewards/accuracies": 0.7749999761581421,
2018
+ "rewards/chosen": -0.03440812602639198,
2019
+ "rewards/margins": 0.0807172879576683,
2020
+ "rewards/rejected": -0.11512543261051178,
2021
+ "step": 1290
2022
+ },
2023
+ {
2024
+ "epoch": 3.38,
2025
+ "learning_rate": 3.6088515096305675e-07,
2026
+ "logits/chosen": -1.274196982383728,
2027
+ "logits/rejected": -1.2778257131576538,
2028
+ "logps/chosen": -36.133689880371094,
2029
+ "logps/rejected": -48.38513946533203,
2030
+ "loss": 0.9033,
2031
+ "rewards/accuracies": 0.875,
2032
+ "rewards/chosen": -0.026160676032304764,
2033
+ "rewards/margins": 0.09665848314762115,
2034
+ "rewards/rejected": -0.12281917035579681,
2035
+ "step": 1300
2036
+ },
2037
+ {
2038
+ "epoch": 3.38,
2039
+ "eval_logits/chosen": -1.5856225490570068,
2040
+ "eval_logits/rejected": -1.5817017555236816,
2041
+ "eval_logps/chosen": -39.73164367675781,
2042
+ "eval_logps/rejected": -44.086830139160156,
2043
+ "eval_loss": 0.9912906289100647,
2044
+ "eval_rewards/accuracies": 0.545265793800354,
2045
+ "eval_rewards/chosen": -0.056970901787281036,
2046
+ "eval_rewards/margins": 0.00873124971985817,
2047
+ "eval_rewards/rejected": -0.06570214778184891,
2048
+ "eval_runtime": 145.8872,
2049
+ "eval_samples_per_second": 2.351,
2050
+ "eval_steps_per_second": 0.295,
2051
+ "step": 1300
2052
+ },
2053
+ {
2054
+ "epoch": 3.4,
2055
+ "learning_rate": 3.321087280364757e-07,
2056
+ "logits/chosen": -1.2609808444976807,
2057
+ "logits/rejected": -1.2618134021759033,
2058
+ "logps/chosen": -38.979774475097656,
2059
+ "logps/rejected": -48.166404724121094,
2060
+ "loss": 0.9165,
2061
+ "rewards/accuracies": 0.7875000238418579,
2062
+ "rewards/chosen": -0.02735757827758789,
2063
+ "rewards/margins": 0.0834653228521347,
2064
+ "rewards/rejected": -0.11082291603088379,
2065
+ "step": 1310
2066
+ },
2067
+ {
2068
+ "epoch": 3.43,
2069
+ "learning_rate": 3.044460665744284e-07,
2070
+ "logits/chosen": -1.3282499313354492,
2071
+ "logits/rejected": -1.3269433975219727,
2072
+ "logps/chosen": -34.594669342041016,
2073
+ "logps/rejected": -40.66201400756836,
2074
+ "loss": 0.9279,
2075
+ "rewards/accuracies": 0.75,
2076
+ "rewards/chosen": -0.02500814199447632,
2077
+ "rewards/margins": 0.07207842171192169,
2078
+ "rewards/rejected": -0.09708657115697861,
2079
+ "step": 1320
2080
+ },
2081
+ {
2082
+ "epoch": 3.45,
2083
+ "learning_rate": 2.779113783626916e-07,
2084
+ "logits/chosen": -1.2696067094802856,
2085
+ "logits/rejected": -1.2711015939712524,
2086
+ "logps/chosen": -36.31203079223633,
2087
+ "logps/rejected": -43.704010009765625,
2088
+ "loss": 0.9188,
2089
+ "rewards/accuracies": 0.8125,
2090
+ "rewards/chosen": -0.019377777352929115,
2091
+ "rewards/margins": 0.08117558807134628,
2092
+ "rewards/rejected": -0.10055337101221085,
2093
+ "step": 1330
2094
+ },
2095
+ {
2096
+ "epoch": 3.48,
2097
+ "learning_rate": 2.5251829568697204e-07,
2098
+ "logits/chosen": -1.312336802482605,
2099
+ "logits/rejected": -1.3112585544586182,
2100
+ "logps/chosen": -33.062835693359375,
2101
+ "logps/rejected": -41.20447540283203,
2102
+ "loss": 0.9265,
2103
+ "rewards/accuracies": 0.800000011920929,
2104
+ "rewards/chosen": -0.018641695380210876,
2105
+ "rewards/margins": 0.07348335534334183,
2106
+ "rewards/rejected": -0.0921250432729721,
2107
+ "step": 1340
2108
+ },
2109
+ {
2110
+ "epoch": 3.51,
2111
+ "learning_rate": 2.2827986432927774e-07,
2112
+ "logits/chosen": -1.3330471515655518,
2113
+ "logits/rejected": -1.318232774734497,
2114
+ "logps/chosen": -37.70689392089844,
2115
+ "logps/rejected": -48.62843322753906,
2116
+ "loss": 0.9103,
2117
+ "rewards/accuracies": 0.862500011920929,
2118
+ "rewards/chosen": -0.033902253955602646,
2119
+ "rewards/margins": 0.08973778784275055,
2120
+ "rewards/rejected": -0.1236400380730629,
2121
+ "step": 1350
2122
+ },
2123
+ {
2124
+ "epoch": 3.53,
2125
+ "learning_rate": 2.0520853686560177e-07,
2126
+ "logits/chosen": -1.2979753017425537,
2127
+ "logits/rejected": -1.3076903820037842,
2128
+ "logps/chosen": -33.92186737060547,
2129
+ "logps/rejected": -42.06645965576172,
2130
+ "loss": 0.9203,
2131
+ "rewards/accuracies": 0.800000011920929,
2132
+ "rewards/chosen": -0.018533948808908463,
2133
+ "rewards/margins": 0.07973320782184601,
2134
+ "rewards/rejected": -0.09826715290546417,
2135
+ "step": 1360
2136
+ },
2137
+ {
2138
+ "epoch": 3.56,
2139
+ "learning_rate": 1.833161662683672e-07,
2140
+ "logits/chosen": -1.4156075716018677,
2141
+ "logits/rejected": -1.4154436588287354,
2142
+ "logps/chosen": -33.43849563598633,
2143
+ "logps/rejected": -48.292179107666016,
2144
+ "loss": 0.8895,
2145
+ "rewards/accuracies": 0.887499988079071,
2146
+ "rewards/chosen": -0.014080649241805077,
2147
+ "rewards/margins": 0.11053232103586197,
2148
+ "rewards/rejected": -0.1246129646897316,
2149
+ "step": 1370
2150
+ },
2151
+ {
2152
+ "epoch": 3.58,
2153
+ "learning_rate": 1.626139998169246e-07,
2154
+ "logits/chosen": -1.2881263494491577,
2155
+ "logits/rejected": -1.295348882675171,
2156
+ "logps/chosen": -36.35157775878906,
2157
+ "logps/rejected": -48.889339447021484,
2158
+ "loss": 0.9073,
2159
+ "rewards/accuracies": 0.862500011920929,
2160
+ "rewards/chosen": -0.021710969507694244,
2161
+ "rewards/margins": 0.09268596768379211,
2162
+ "rewards/rejected": -0.11439694464206696,
2163
+ "step": 1380
2164
+ },
2165
+ {
2166
+ "epoch": 3.61,
2167
+ "learning_rate": 1.4311267331922535e-07,
2168
+ "logits/chosen": -1.2490915060043335,
2169
+ "logits/rejected": -1.2452831268310547,
2170
+ "logps/chosen": -36.50374984741211,
2171
+ "logps/rejected": -40.645694732666016,
2172
+ "loss": 0.9269,
2173
+ "rewards/accuracies": 0.8125,
2174
+ "rewards/chosen": -0.01736537739634514,
2175
+ "rewards/margins": 0.07311762124300003,
2176
+ "rewards/rejected": -0.09048298746347427,
2177
+ "step": 1390
2178
+ },
2179
+ {
2180
+ "epoch": 3.64,
2181
+ "learning_rate": 1.2482220564763669e-07,
2182
+ "logits/chosen": -1.37045419216156,
2183
+ "logits/rejected": -1.3693218231201172,
2184
+ "logps/chosen": -33.22849655151367,
2185
+ "logps/rejected": -41.22184371948242,
2186
+ "loss": 0.9285,
2187
+ "rewards/accuracies": 0.762499988079071,
2188
+ "rewards/chosen": -0.01931949146091938,
2189
+ "rewards/margins": 0.07145430892705917,
2190
+ "rewards/rejected": -0.090773805975914,
2191
+ "step": 1400
2192
+ },
2193
+ {
2194
+ "epoch": 3.64,
2195
+ "eval_logits/chosen": -1.586363673210144,
2196
+ "eval_logits/rejected": -1.5824512243270874,
2197
+ "eval_logps/chosen": -39.72161102294922,
2198
+ "eval_logps/rejected": -44.085166931152344,
2199
+ "eval_loss": 0.9912044405937195,
2200
+ "eval_rewards/accuracies": 0.5394518375396729,
2201
+ "eval_rewards/chosen": -0.056870583444833755,
2202
+ "eval_rewards/margins": 0.0088148582726717,
2203
+ "eval_rewards/rejected": -0.06568543612957001,
2204
+ "eval_runtime": 145.8196,
2205
+ "eval_samples_per_second": 2.352,
2206
+ "eval_steps_per_second": 0.295,
2207
+ "step": 1400
2208
+ },
2209
+ {
2210
+ "epoch": 3.66,
2211
+ "learning_rate": 1.0775199359171346e-07,
2212
+ "logits/chosen": -1.3426971435546875,
2213
+ "logits/rejected": -1.337646484375,
2214
+ "logps/chosen": -36.17827224731445,
2215
+ "logps/rejected": -37.915504455566406,
2216
+ "loss": 0.9417,
2217
+ "rewards/accuracies": 0.675000011920929,
2218
+ "rewards/chosen": -0.025357365608215332,
2219
+ "rewards/margins": 0.05825306102633476,
2220
+ "rewards/rejected": -0.0836104229092598,
2221
+ "step": 1410
2222
+ },
2223
+ {
2224
+ "epoch": 3.69,
2225
+ "learning_rate": 9.191080703056604e-08,
2226
+ "logits/chosen": -1.2913800477981567,
2227
+ "logits/rejected": -1.2924379110336304,
2228
+ "logps/chosen": -35.52798843383789,
2229
+ "logps/rejected": -44.0432243347168,
2230
+ "loss": 0.9304,
2231
+ "rewards/accuracies": 0.8125,
2232
+ "rewards/chosen": -0.021631117910146713,
2233
+ "rewards/margins": 0.06956102699041367,
2234
+ "rewards/rejected": -0.09119214862585068,
2235
+ "step": 1420
2236
+ },
2237
+ {
2238
+ "epoch": 3.71,
2239
+ "learning_rate": 7.730678442730539e-08,
2240
+ "logits/chosen": -1.241626501083374,
2241
+ "logits/rejected": -1.2360563278198242,
2242
+ "logps/chosen": -35.85869216918945,
2243
+ "logps/rejected": -48.056610107421875,
2244
+ "loss": 0.9035,
2245
+ "rewards/accuracies": 0.8125,
2246
+ "rewards/chosen": -0.017169218510389328,
2247
+ "rewards/margins": 0.09650282561779022,
2248
+ "rewards/rejected": -0.11367203295230865,
2249
+ "step": 1430
2250
+ },
2251
+ {
2252
+ "epoch": 3.74,
2253
+ "learning_rate": 6.394742864787806e-08,
2254
+ "logits/chosen": -1.2561070919036865,
2255
+ "logits/rejected": -1.2504056692123413,
2256
+ "logps/chosen": -31.515949249267578,
2257
+ "logps/rejected": -41.77183151245117,
2258
+ "loss": 0.9216,
2259
+ "rewards/accuracies": 0.7749999761581421,
2260
+ "rewards/chosen": -0.02202703058719635,
2261
+ "rewards/margins": 0.07838472723960876,
2262
+ "rewards/rejected": -0.10041175782680511,
2263
+ "step": 1440
2264
+ },
2265
+ {
2266
+ "epoch": 3.77,
2267
+ "learning_rate": 5.183960310644748e-08,
2268
+ "logits/chosen": -1.3071388006210327,
2269
+ "logits/rejected": -1.2970083951950073,
2270
+ "logps/chosen": -35.57398986816406,
2271
+ "logps/rejected": -45.840171813964844,
2272
+ "loss": 0.9254,
2273
+ "rewards/accuracies": 0.7875000238418579,
2274
+ "rewards/chosen": -0.030924629420042038,
2275
+ "rewards/margins": 0.0746234804391861,
2276
+ "rewards/rejected": -0.10554809868335724,
2277
+ "step": 1450
2278
+ },
2279
+ {
2280
+ "epoch": 3.79,
2281
+ "learning_rate": 4.098952823928693e-08,
2282
+ "logits/chosen": -1.2653499841690063,
2283
+ "logits/rejected": -1.2617781162261963,
2284
+ "logps/chosen": -35.99557876586914,
2285
+ "logps/rejected": -39.989891052246094,
2286
+ "loss": 0.9359,
2287
+ "rewards/accuracies": 0.75,
2288
+ "rewards/chosen": -0.026857968419790268,
2289
+ "rewards/margins": 0.06413265317678452,
2290
+ "rewards/rejected": -0.09099061787128448,
2291
+ "step": 1460
2292
+ },
2293
+ {
2294
+ "epoch": 3.82,
2295
+ "learning_rate": 3.1402778309014284e-08,
2296
+ "logits/chosen": -1.3204753398895264,
2297
+ "logits/rejected": -1.3249455690383911,
2298
+ "logps/chosen": -33.896812438964844,
2299
+ "logps/rejected": -42.878475189208984,
2300
+ "loss": 0.9191,
2301
+ "rewards/accuracies": 0.8125,
2302
+ "rewards/chosen": -0.018276356160640717,
2303
+ "rewards/margins": 0.08088420331478119,
2304
+ "rewards/rejected": -0.0991605594754219,
2305
+ "step": 1470
2306
+ },
2307
+ {
2308
+ "epoch": 3.84,
2309
+ "learning_rate": 2.3084278540791427e-08,
2310
+ "logits/chosen": -1.3220521211624146,
2311
+ "logits/rejected": -1.3311010599136353,
2312
+ "logps/chosen": -33.49136734008789,
2313
+ "logps/rejected": -38.53743362426758,
2314
+ "loss": 0.9289,
2315
+ "rewards/accuracies": 0.824999988079071,
2316
+ "rewards/chosen": -0.017961082980036736,
2317
+ "rewards/margins": 0.07112761586904526,
2318
+ "rewards/rejected": -0.08908869326114655,
2319
+ "step": 1480
2320
+ },
2321
+ {
2322
+ "epoch": 3.87,
2323
+ "learning_rate": 1.6038302591975807e-08,
2324
+ "logits/chosen": -1.258320689201355,
2325
+ "logits/rejected": -1.2530874013900757,
2326
+ "logps/chosen": -36.1824951171875,
2327
+ "logps/rejected": -41.13605880737305,
2328
+ "loss": 0.936,
2329
+ "rewards/accuracies": 0.7749999761581421,
2330
+ "rewards/chosen": -0.02526010572910309,
2331
+ "rewards/margins": 0.06399567425251007,
2332
+ "rewards/rejected": -0.08925577253103256,
2333
+ "step": 1490
2334
+ },
2335
+ {
2336
+ "epoch": 3.9,
2337
+ "learning_rate": 1.0268470356514237e-08,
2338
+ "logits/chosen": -1.3191821575164795,
2339
+ "logits/rejected": -1.3164719343185425,
2340
+ "logps/chosen": -36.40666961669922,
2341
+ "logps/rejected": -44.079254150390625,
2342
+ "loss": 0.9196,
2343
+ "rewards/accuracies": 0.7749999761581421,
2344
+ "rewards/chosen": -0.02781098522245884,
2345
+ "rewards/margins": 0.08037930727005005,
2346
+ "rewards/rejected": -0.10819028317928314,
2347
+ "step": 1500
2348
+ },
2349
+ {
2350
+ "epoch": 3.9,
2351
+ "eval_logits/chosen": -1.587085485458374,
2352
+ "eval_logits/rejected": -1.5831539630889893,
2353
+ "eval_logps/chosen": -39.70748519897461,
2354
+ "eval_logps/rejected": -44.0648307800293,
2355
+ "eval_loss": 0.9912670850753784,
2356
+ "eval_rewards/accuracies": 0.5423588156700134,
2357
+ "eval_rewards/chosen": -0.0567292720079422,
2358
+ "eval_rewards/margins": 0.008752820082008839,
2359
+ "eval_rewards/rejected": -0.06548209488391876,
2360
+ "eval_runtime": 145.8973,
2361
+ "eval_samples_per_second": 2.351,
2362
+ "eval_steps_per_second": 0.295,
2363
+ "step": 1500
2364
+ },
2365
+ {
2366
+ "epoch": 3.92,
2367
+ "learning_rate": 5.777746105209147e-09,
2368
+ "logits/chosen": -1.3802618980407715,
2369
+ "logits/rejected": -1.3801379203796387,
2370
+ "logps/chosen": -31.544485092163086,
2371
+ "logps/rejected": -42.76384735107422,
2372
+ "loss": 0.917,
2373
+ "rewards/accuracies": 0.75,
2374
+ "rewards/chosen": -0.017282087355852127,
2375
+ "rewards/margins": 0.08295184373855591,
2376
+ "rewards/rejected": -0.10023393481969833,
2377
+ "step": 1510
2378
+ },
2379
+ {
2380
+ "epoch": 3.95,
2381
+ "learning_rate": 2.5684369628148352e-09,
2382
+ "logits/chosen": -1.2462044954299927,
2383
+ "logits/rejected": -1.2448112964630127,
2384
+ "logps/chosen": -34.93328857421875,
2385
+ "logps/rejected": -43.264244079589844,
2386
+ "loss": 0.925,
2387
+ "rewards/accuracies": 0.800000011920929,
2388
+ "rewards/chosen": -0.01891419291496277,
2389
+ "rewards/margins": 0.07499094307422638,
2390
+ "rewards/rejected": -0.09390512108802795,
2391
+ "step": 1520
2392
+ },
2393
+ {
2394
+ "epoch": 3.97,
2395
+ "learning_rate": 6.421917227455999e-10,
2396
+ "logits/chosen": -1.3795892000198364,
2397
+ "logits/rejected": -1.3771770000457764,
2398
+ "logps/chosen": -33.931644439697266,
2399
+ "logps/rejected": -40.94723892211914,
2400
+ "loss": 0.9299,
2401
+ "rewards/accuracies": 0.8125,
2402
+ "rewards/chosen": -0.024575814604759216,
2403
+ "rewards/margins": 0.07006003707647324,
2404
+ "rewards/rejected": -0.09463585913181305,
2405
+ "step": 1530
2406
+ },
2407
+ {
2408
+ "epoch": 4.0,
2409
+ "learning_rate": 0.0,
2410
+ "logits/chosen": -1.3671410083770752,
2411
+ "logits/rejected": -1.3676393032073975,
2412
+ "logps/chosen": -33.82143020629883,
2413
+ "logps/rejected": -37.72132873535156,
2414
+ "loss": 0.9527,
2415
+ "rewards/accuracies": 0.762499988079071,
2416
+ "rewards/chosen": -0.0320148840546608,
2417
+ "rewards/margins": 0.04729658365249634,
2418
+ "rewards/rejected": -0.07931147515773773,
2419
+ "step": 1540
2420
+ },
2421
+ {
2422
+ "epoch": 4.0,
2423
+ "step": 1540,
2424
  "total_flos": 0.0,
2425
+ "train_loss": 0.7654441864459546,
2426
+ "train_runtime": 10797.8934,
2427
+ "train_samples_per_second": 1.141,
2428
+ "train_steps_per_second": 0.143
2429
  }
2430
  ],
2431
  "logging_steps": 10,
2432
+ "max_steps": 1540,
2433
  "num_input_tokens_seen": 0,
2434
+ "num_train_epochs": 4,
2435
  "save_steps": 100,
2436
  "total_flos": 0.0,
2437
  "train_batch_size": 4,