alex-prutko commited on
Commit
492a2e5
·
1 Parent(s): 0000f2f
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6093a5b775df40fd252098bbd99075cdb6a5d9ed155662a597d86f6b7a873cb
3
  size 27297544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703557f4188aa469524b4f3e4e0c1ec07378f5b46172241444e83d8b22ff18d2
3
  size 27297544
compressa-config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "model_name": "llama_3_fp_epoch_4",
3
  "torch_dtype": "torch.float16",
4
  "device_map": "auto",
5
  "trust_remote_code": false,
 
1
  {
2
+ "model_name": "llama_3_fp_epoch_3",
3
  "torch_dtype": "torch.float16",
4
  "device_map": "auto",
5
  "trust_remote_code": false,
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2db7654e77aaa548cc3911fa5d2bd7b1bb3357776d5c7dc2edbc222ce8dea4a2
3
  size 54741498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5cf4a89f17db7f467339252db1577234ba169051f4361835f08564e5cc20904
3
  size 54741498
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4a7d167cc0b54c93d76464aeb107ff1a05a7a83704a1406c15657be54bfd689
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eeeb62a720c4b597a880e07f8cb750bf18c118e1971a65ea3ef58761ce474cc
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2da190a923ee76753674b1d1c5a16cda1ee1e8ebbc5962294eb015f30fcb76d8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99dd3c2fb764f31a43e0a9282ffd1057f18692ce958664acb9811c310e4a5fe0
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.0,
5
  "eval_steps": 20,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4321,1452 +4321,6 @@
4321
  "learning_rate": 0.0001,
4322
  "loss": 0.716,
4323
  "step": 675
4324
- },
4325
- {
4326
- "epoch": 3.0,
4327
- "learning_rate": 0.0001,
4328
- "loss": 0.7913,
4329
- "step": 676
4330
- },
4331
- {
4332
- "epoch": 3.01,
4333
- "learning_rate": 0.0001,
4334
- "loss": 0.6763,
4335
- "step": 677
4336
- },
4337
- {
4338
- "epoch": 3.01,
4339
- "learning_rate": 0.0001,
4340
- "loss": 0.6952,
4341
- "step": 678
4342
- },
4343
- {
4344
- "epoch": 3.02,
4345
- "learning_rate": 0.0001,
4346
- "loss": 0.6933,
4347
- "step": 679
4348
- },
4349
- {
4350
- "epoch": 3.02,
4351
- "learning_rate": 0.0001,
4352
- "loss": 0.7001,
4353
- "step": 680
4354
- },
4355
- {
4356
- "epoch": 3.02,
4357
- "eval_loss": 0.7300973534584045,
4358
- "eval_runtime": 17.9948,
4359
- "eval_samples_per_second": 4.501,
4360
- "eval_steps_per_second": 1.167,
4361
- "step": 680
4362
- },
4363
- {
4364
- "epoch": 3.03,
4365
- "learning_rate": 0.0001,
4366
- "loss": 0.6507,
4367
- "step": 681
4368
- },
4369
- {
4370
- "epoch": 3.03,
4371
- "learning_rate": 0.0001,
4372
- "loss": 0.6635,
4373
- "step": 682
4374
- },
4375
- {
4376
- "epoch": 3.04,
4377
- "learning_rate": 0.0001,
4378
- "loss": 0.6627,
4379
- "step": 683
4380
- },
4381
- {
4382
- "epoch": 3.04,
4383
- "learning_rate": 0.0001,
4384
- "loss": 0.6461,
4385
- "step": 684
4386
- },
4387
- {
4388
- "epoch": 3.04,
4389
- "learning_rate": 0.0001,
4390
- "loss": 0.6493,
4391
- "step": 685
4392
- },
4393
- {
4394
- "epoch": 3.05,
4395
- "learning_rate": 0.0001,
4396
- "loss": 0.6392,
4397
- "step": 686
4398
- },
4399
- {
4400
- "epoch": 3.05,
4401
- "learning_rate": 0.0001,
4402
- "loss": 0.6457,
4403
- "step": 687
4404
- },
4405
- {
4406
- "epoch": 3.06,
4407
- "learning_rate": 0.0001,
4408
- "loss": 0.6702,
4409
- "step": 688
4410
- },
4411
- {
4412
- "epoch": 3.06,
4413
- "learning_rate": 0.0001,
4414
- "loss": 0.6652,
4415
- "step": 689
4416
- },
4417
- {
4418
- "epoch": 3.07,
4419
- "learning_rate": 0.0001,
4420
- "loss": 0.6266,
4421
- "step": 690
4422
- },
4423
- {
4424
- "epoch": 3.07,
4425
- "learning_rate": 0.0001,
4426
- "loss": 0.6648,
4427
- "step": 691
4428
- },
4429
- {
4430
- "epoch": 3.08,
4431
- "learning_rate": 0.0001,
4432
- "loss": 0.5778,
4433
- "step": 692
4434
- },
4435
- {
4436
- "epoch": 3.08,
4437
- "learning_rate": 0.0001,
4438
- "loss": 0.6581,
4439
- "step": 693
4440
- },
4441
- {
4442
- "epoch": 3.08,
4443
- "learning_rate": 0.0001,
4444
- "loss": 0.7089,
4445
- "step": 694
4446
- },
4447
- {
4448
- "epoch": 3.09,
4449
- "learning_rate": 0.0001,
4450
- "loss": 0.6478,
4451
- "step": 695
4452
- },
4453
- {
4454
- "epoch": 3.09,
4455
- "learning_rate": 0.0001,
4456
- "loss": 0.5843,
4457
- "step": 696
4458
- },
4459
- {
4460
- "epoch": 3.1,
4461
- "learning_rate": 0.0001,
4462
- "loss": 0.5995,
4463
- "step": 697
4464
- },
4465
- {
4466
- "epoch": 3.1,
4467
- "learning_rate": 0.0001,
4468
- "loss": 0.6076,
4469
- "step": 698
4470
- },
4471
- {
4472
- "epoch": 3.11,
4473
- "learning_rate": 0.0001,
4474
- "loss": 0.7328,
4475
- "step": 699
4476
- },
4477
- {
4478
- "epoch": 3.11,
4479
- "learning_rate": 0.0001,
4480
- "loss": 0.5725,
4481
- "step": 700
4482
- },
4483
- {
4484
- "epoch": 3.11,
4485
- "eval_loss": 0.7330772280693054,
4486
- "eval_runtime": 18.8269,
4487
- "eval_samples_per_second": 4.302,
4488
- "eval_steps_per_second": 1.115,
4489
- "step": 700
4490
- },
4491
- {
4492
- "epoch": 3.12,
4493
- "learning_rate": 0.0001,
4494
- "loss": 0.6693,
4495
- "step": 701
4496
- },
4497
- {
4498
- "epoch": 3.12,
4499
- "learning_rate": 0.0001,
4500
- "loss": 0.6445,
4501
- "step": 702
4502
- },
4503
- {
4504
- "epoch": 3.12,
4505
- "learning_rate": 0.0001,
4506
- "loss": 0.6478,
4507
- "step": 703
4508
- },
4509
- {
4510
- "epoch": 3.13,
4511
- "learning_rate": 0.0001,
4512
- "loss": 0.661,
4513
- "step": 704
4514
- },
4515
- {
4516
- "epoch": 3.13,
4517
- "learning_rate": 0.0001,
4518
- "loss": 0.6351,
4519
- "step": 705
4520
- },
4521
- {
4522
- "epoch": 3.14,
4523
- "learning_rate": 0.0001,
4524
- "loss": 0.5796,
4525
- "step": 706
4526
- },
4527
- {
4528
- "epoch": 3.14,
4529
- "learning_rate": 0.0001,
4530
- "loss": 0.607,
4531
- "step": 707
4532
- },
4533
- {
4534
- "epoch": 3.15,
4535
- "learning_rate": 0.0001,
4536
- "loss": 0.6227,
4537
- "step": 708
4538
- },
4539
- {
4540
- "epoch": 3.15,
4541
- "learning_rate": 0.0001,
4542
- "loss": 0.6283,
4543
- "step": 709
4544
- },
4545
- {
4546
- "epoch": 3.16,
4547
- "learning_rate": 0.0001,
4548
- "loss": 0.6654,
4549
- "step": 710
4550
- },
4551
- {
4552
- "epoch": 3.16,
4553
- "learning_rate": 0.0001,
4554
- "loss": 0.6348,
4555
- "step": 711
4556
- },
4557
- {
4558
- "epoch": 3.16,
4559
- "learning_rate": 0.0001,
4560
- "loss": 0.5861,
4561
- "step": 712
4562
- },
4563
- {
4564
- "epoch": 3.17,
4565
- "learning_rate": 0.0001,
4566
- "loss": 0.5794,
4567
- "step": 713
4568
- },
4569
- {
4570
- "epoch": 3.17,
4571
- "learning_rate": 0.0001,
4572
- "loss": 0.6011,
4573
- "step": 714
4574
- },
4575
- {
4576
- "epoch": 3.18,
4577
- "learning_rate": 0.0001,
4578
- "loss": 0.5992,
4579
- "step": 715
4580
- },
4581
- {
4582
- "epoch": 3.18,
4583
- "learning_rate": 0.0001,
4584
- "loss": 0.624,
4585
- "step": 716
4586
- },
4587
- {
4588
- "epoch": 3.19,
4589
- "learning_rate": 0.0001,
4590
- "loss": 0.636,
4591
- "step": 717
4592
- },
4593
- {
4594
- "epoch": 3.19,
4595
- "learning_rate": 0.0001,
4596
- "loss": 0.5255,
4597
- "step": 718
4598
- },
4599
- {
4600
- "epoch": 3.2,
4601
- "learning_rate": 0.0001,
4602
- "loss": 0.6403,
4603
- "step": 719
4604
- },
4605
- {
4606
- "epoch": 3.2,
4607
- "learning_rate": 0.0001,
4608
- "loss": 0.5832,
4609
- "step": 720
4610
- },
4611
- {
4612
- "epoch": 3.2,
4613
- "eval_loss": 0.7388717532157898,
4614
- "eval_runtime": 18.8118,
4615
- "eval_samples_per_second": 4.306,
4616
- "eval_steps_per_second": 1.116,
4617
- "step": 720
4618
- },
4619
- {
4620
- "epoch": 3.2,
4621
- "learning_rate": 0.0001,
4622
- "loss": 0.5699,
4623
- "step": 721
4624
- },
4625
- {
4626
- "epoch": 3.21,
4627
- "learning_rate": 0.0001,
4628
- "loss": 0.6878,
4629
- "step": 722
4630
- },
4631
- {
4632
- "epoch": 3.21,
4633
- "learning_rate": 0.0001,
4634
- "loss": 0.5529,
4635
- "step": 723
4636
- },
4637
- {
4638
- "epoch": 3.22,
4639
- "learning_rate": 0.0001,
4640
- "loss": 0.5619,
4641
- "step": 724
4642
- },
4643
- {
4644
- "epoch": 3.22,
4645
- "learning_rate": 0.0001,
4646
- "loss": 0.5764,
4647
- "step": 725
4648
- },
4649
- {
4650
- "epoch": 3.23,
4651
- "learning_rate": 0.0001,
4652
- "loss": 0.7349,
4653
- "step": 726
4654
- },
4655
- {
4656
- "epoch": 3.23,
4657
- "learning_rate": 0.0001,
4658
- "loss": 0.7237,
4659
- "step": 727
4660
- },
4661
- {
4662
- "epoch": 3.24,
4663
- "learning_rate": 0.0001,
4664
- "loss": 0.6804,
4665
- "step": 728
4666
- },
4667
- {
4668
- "epoch": 3.24,
4669
- "learning_rate": 0.0001,
4670
- "loss": 0.6911,
4671
- "step": 729
4672
- },
4673
- {
4674
- "epoch": 3.24,
4675
- "learning_rate": 0.0001,
4676
- "loss": 0.6582,
4677
- "step": 730
4678
- },
4679
- {
4680
- "epoch": 3.25,
4681
- "learning_rate": 0.0001,
4682
- "loss": 0.6994,
4683
- "step": 731
4684
- },
4685
- {
4686
- "epoch": 3.25,
4687
- "learning_rate": 0.0001,
4688
- "loss": 0.6864,
4689
- "step": 732
4690
- },
4691
- {
4692
- "epoch": 3.26,
4693
- "learning_rate": 0.0001,
4694
- "loss": 0.7323,
4695
- "step": 733
4696
- },
4697
- {
4698
- "epoch": 3.26,
4699
- "learning_rate": 0.0001,
4700
- "loss": 0.6921,
4701
- "step": 734
4702
- },
4703
- {
4704
- "epoch": 3.27,
4705
- "learning_rate": 0.0001,
4706
- "loss": 0.6115,
4707
- "step": 735
4708
- },
4709
- {
4710
- "epoch": 3.27,
4711
- "learning_rate": 0.0001,
4712
- "loss": 0.6423,
4713
- "step": 736
4714
- },
4715
- {
4716
- "epoch": 3.28,
4717
- "learning_rate": 0.0001,
4718
- "loss": 0.629,
4719
- "step": 737
4720
- },
4721
- {
4722
- "epoch": 3.28,
4723
- "learning_rate": 0.0001,
4724
- "loss": 0.6512,
4725
- "step": 738
4726
- },
4727
- {
4728
- "epoch": 3.28,
4729
- "learning_rate": 0.0001,
4730
- "loss": 0.6384,
4731
- "step": 739
4732
- },
4733
- {
4734
- "epoch": 3.29,
4735
- "learning_rate": 0.0001,
4736
- "loss": 0.629,
4737
- "step": 740
4738
- },
4739
- {
4740
- "epoch": 3.29,
4741
- "eval_loss": 0.7320723533630371,
4742
- "eval_runtime": 18.8594,
4743
- "eval_samples_per_second": 4.295,
4744
- "eval_steps_per_second": 1.114,
4745
- "step": 740
4746
- },
4747
- {
4748
- "epoch": 3.29,
4749
- "learning_rate": 0.0001,
4750
- "loss": 0.6447,
4751
- "step": 741
4752
- },
4753
- {
4754
- "epoch": 3.3,
4755
- "learning_rate": 0.0001,
4756
- "loss": 0.5919,
4757
- "step": 742
4758
- },
4759
- {
4760
- "epoch": 3.3,
4761
- "learning_rate": 0.0001,
4762
- "loss": 0.6678,
4763
- "step": 743
4764
- },
4765
- {
4766
- "epoch": 3.31,
4767
- "learning_rate": 0.0001,
4768
- "loss": 0.5893,
4769
- "step": 744
4770
- },
4771
- {
4772
- "epoch": 3.31,
4773
- "learning_rate": 0.0001,
4774
- "loss": 0.593,
4775
- "step": 745
4776
- },
4777
- {
4778
- "epoch": 3.32,
4779
- "learning_rate": 0.0001,
4780
- "loss": 0.6514,
4781
- "step": 746
4782
- },
4783
- {
4784
- "epoch": 3.32,
4785
- "learning_rate": 0.0001,
4786
- "loss": 0.6722,
4787
- "step": 747
4788
- },
4789
- {
4790
- "epoch": 3.32,
4791
- "learning_rate": 0.0001,
4792
- "loss": 0.5961,
4793
- "step": 748
4794
- },
4795
- {
4796
- "epoch": 3.33,
4797
- "learning_rate": 0.0001,
4798
- "loss": 0.6213,
4799
- "step": 749
4800
- },
4801
- {
4802
- "epoch": 3.33,
4803
- "learning_rate": 0.0001,
4804
- "loss": 0.6091,
4805
- "step": 750
4806
- },
4807
- {
4808
- "epoch": 3.34,
4809
- "learning_rate": 0.0001,
4810
- "loss": 0.6327,
4811
- "step": 751
4812
- },
4813
- {
4814
- "epoch": 3.34,
4815
- "learning_rate": 0.0001,
4816
- "loss": 0.6781,
4817
- "step": 752
4818
- },
4819
- {
4820
- "epoch": 3.35,
4821
- "learning_rate": 0.0001,
4822
- "loss": 0.609,
4823
- "step": 753
4824
- },
4825
- {
4826
- "epoch": 3.35,
4827
- "learning_rate": 0.0001,
4828
- "loss": 0.6491,
4829
- "step": 754
4830
- },
4831
- {
4832
- "epoch": 3.36,
4833
- "learning_rate": 0.0001,
4834
- "loss": 0.6231,
4835
- "step": 755
4836
- },
4837
- {
4838
- "epoch": 3.36,
4839
- "learning_rate": 0.0001,
4840
- "loss": 0.6353,
4841
- "step": 756
4842
- },
4843
- {
4844
- "epoch": 3.36,
4845
- "learning_rate": 0.0001,
4846
- "loss": 0.7171,
4847
- "step": 757
4848
- },
4849
- {
4850
- "epoch": 3.37,
4851
- "learning_rate": 0.0001,
4852
- "loss": 0.702,
4853
- "step": 758
4854
- },
4855
- {
4856
- "epoch": 3.37,
4857
- "learning_rate": 0.0001,
4858
- "loss": 0.6205,
4859
- "step": 759
4860
- },
4861
- {
4862
- "epoch": 3.38,
4863
- "learning_rate": 0.0001,
4864
- "loss": 0.6037,
4865
- "step": 760
4866
- },
4867
- {
4868
- "epoch": 3.38,
4869
- "eval_loss": 0.735625147819519,
4870
- "eval_runtime": 18.9293,
4871
- "eval_samples_per_second": 4.279,
4872
- "eval_steps_per_second": 1.109,
4873
- "step": 760
4874
- },
4875
- {
4876
- "epoch": 3.38,
4877
- "learning_rate": 0.0001,
4878
- "loss": 0.6143,
4879
- "step": 761
4880
- },
4881
- {
4882
- "epoch": 3.39,
4883
- "learning_rate": 0.0001,
4884
- "loss": 0.6124,
4885
- "step": 762
4886
- },
4887
- {
4888
- "epoch": 3.39,
4889
- "learning_rate": 0.0001,
4890
- "loss": 0.5528,
4891
- "step": 763
4892
- },
4893
- {
4894
- "epoch": 3.4,
4895
- "learning_rate": 0.0001,
4896
- "loss": 0.5531,
4897
- "step": 764
4898
- },
4899
- {
4900
- "epoch": 3.4,
4901
- "learning_rate": 0.0001,
4902
- "loss": 0.5917,
4903
- "step": 765
4904
- },
4905
- {
4906
- "epoch": 3.4,
4907
- "learning_rate": 0.0001,
4908
- "loss": 0.5962,
4909
- "step": 766
4910
- },
4911
- {
4912
- "epoch": 3.41,
4913
- "learning_rate": 0.0001,
4914
- "loss": 0.6423,
4915
- "step": 767
4916
- },
4917
- {
4918
- "epoch": 3.41,
4919
- "learning_rate": 0.0001,
4920
- "loss": 0.613,
4921
- "step": 768
4922
- },
4923
- {
4924
- "epoch": 3.42,
4925
- "learning_rate": 0.0001,
4926
- "loss": 0.6326,
4927
- "step": 769
4928
- },
4929
- {
4930
- "epoch": 3.42,
4931
- "learning_rate": 0.0001,
4932
- "loss": 0.6229,
4933
- "step": 770
4934
- },
4935
- {
4936
- "epoch": 3.43,
4937
- "learning_rate": 0.0001,
4938
- "loss": 0.6613,
4939
- "step": 771
4940
- },
4941
- {
4942
- "epoch": 3.43,
4943
- "learning_rate": 0.0001,
4944
- "loss": 0.6048,
4945
- "step": 772
4946
- },
4947
- {
4948
- "epoch": 3.44,
4949
- "learning_rate": 0.0001,
4950
- "loss": 0.5452,
4951
- "step": 773
4952
- },
4953
- {
4954
- "epoch": 3.44,
4955
- "learning_rate": 0.0001,
4956
- "loss": 0.5207,
4957
- "step": 774
4958
- },
4959
- {
4960
- "epoch": 3.44,
4961
- "learning_rate": 0.0001,
4962
- "loss": 0.5988,
4963
- "step": 775
4964
- },
4965
- {
4966
- "epoch": 3.45,
4967
- "learning_rate": 0.0001,
4968
- "loss": 0.7064,
4969
- "step": 776
4970
- },
4971
- {
4972
- "epoch": 3.45,
4973
- "learning_rate": 0.0001,
4974
- "loss": 0.6414,
4975
- "step": 777
4976
- },
4977
- {
4978
- "epoch": 3.46,
4979
- "learning_rate": 0.0001,
4980
- "loss": 0.7127,
4981
- "step": 778
4982
- },
4983
- {
4984
- "epoch": 3.46,
4985
- "learning_rate": 0.0001,
4986
- "loss": 0.7062,
4987
- "step": 779
4988
- },
4989
- {
4990
- "epoch": 3.47,
4991
- "learning_rate": 0.0001,
4992
- "loss": 0.7391,
4993
- "step": 780
4994
- },
4995
- {
4996
- "epoch": 3.47,
4997
- "eval_loss": 0.7382710576057434,
4998
- "eval_runtime": 19.0816,
4999
- "eval_samples_per_second": 4.245,
5000
- "eval_steps_per_second": 1.101,
5001
- "step": 780
5002
- },
5003
- {
5004
- "epoch": 3.47,
5005
- "learning_rate": 0.0001,
5006
- "loss": 0.7217,
5007
- "step": 781
5008
- },
5009
- {
5010
- "epoch": 3.48,
5011
- "learning_rate": 0.0001,
5012
- "loss": 0.6948,
5013
- "step": 782
5014
- },
5015
- {
5016
- "epoch": 3.48,
5017
- "learning_rate": 0.0001,
5018
- "loss": 0.7053,
5019
- "step": 783
5020
- },
5021
- {
5022
- "epoch": 3.48,
5023
- "learning_rate": 0.0001,
5024
- "loss": 0.7944,
5025
- "step": 784
5026
- },
5027
- {
5028
- "epoch": 3.49,
5029
- "learning_rate": 0.0001,
5030
- "loss": 0.683,
5031
- "step": 785
5032
- },
5033
- {
5034
- "epoch": 3.49,
5035
- "learning_rate": 0.0001,
5036
- "loss": 0.7693,
5037
- "step": 786
5038
- },
5039
- {
5040
- "epoch": 3.5,
5041
- "learning_rate": 0.0001,
5042
- "loss": 0.6613,
5043
- "step": 787
5044
- },
5045
- {
5046
- "epoch": 3.5,
5047
- "learning_rate": 0.0001,
5048
- "loss": 0.6312,
5049
- "step": 788
5050
- },
5051
- {
5052
- "epoch": 3.51,
5053
- "learning_rate": 0.0001,
5054
- "loss": 0.7292,
5055
- "step": 789
5056
- },
5057
- {
5058
- "epoch": 3.51,
5059
- "learning_rate": 0.0001,
5060
- "loss": 0.6532,
5061
- "step": 790
5062
- },
5063
- {
5064
- "epoch": 3.52,
5065
- "learning_rate": 0.0001,
5066
- "loss": 0.6847,
5067
- "step": 791
5068
- },
5069
- {
5070
- "epoch": 3.52,
5071
- "learning_rate": 0.0001,
5072
- "loss": 0.6814,
5073
- "step": 792
5074
- },
5075
- {
5076
- "epoch": 3.52,
5077
- "learning_rate": 0.0001,
5078
- "loss": 0.6137,
5079
- "step": 793
5080
- },
5081
- {
5082
- "epoch": 3.53,
5083
- "learning_rate": 0.0001,
5084
- "loss": 0.6355,
5085
- "step": 794
5086
- },
5087
- {
5088
- "epoch": 3.53,
5089
- "learning_rate": 0.0001,
5090
- "loss": 0.5924,
5091
- "step": 795
5092
- },
5093
- {
5094
- "epoch": 3.54,
5095
- "learning_rate": 0.0001,
5096
- "loss": 0.5793,
5097
- "step": 796
5098
- },
5099
- {
5100
- "epoch": 3.54,
5101
- "learning_rate": 0.0001,
5102
- "loss": 0.5977,
5103
- "step": 797
5104
- },
5105
- {
5106
- "epoch": 3.55,
5107
- "learning_rate": 0.0001,
5108
- "loss": 0.7161,
5109
- "step": 798
5110
- },
5111
- {
5112
- "epoch": 3.55,
5113
- "learning_rate": 0.0001,
5114
- "loss": 0.6635,
5115
- "step": 799
5116
- },
5117
- {
5118
- "epoch": 3.56,
5119
- "learning_rate": 0.0001,
5120
- "loss": 0.6454,
5121
- "step": 800
5122
- },
5123
- {
5124
- "epoch": 3.56,
5125
- "eval_loss": 0.7320914268493652,
5126
- "eval_runtime": 19.2836,
5127
- "eval_samples_per_second": 4.2,
5128
- "eval_steps_per_second": 1.089,
5129
- "step": 800
5130
- },
5131
- {
5132
- "epoch": 3.56,
5133
- "learning_rate": 0.0001,
5134
- "loss": 0.642,
5135
- "step": 801
5136
- },
5137
- {
5138
- "epoch": 3.56,
5139
- "learning_rate": 0.0001,
5140
- "loss": 0.5794,
5141
- "step": 802
5142
- },
5143
- {
5144
- "epoch": 3.57,
5145
- "learning_rate": 0.0001,
5146
- "loss": 0.637,
5147
- "step": 803
5148
- },
5149
- {
5150
- "epoch": 3.57,
5151
- "learning_rate": 0.0001,
5152
- "loss": 0.6212,
5153
- "step": 804
5154
- },
5155
- {
5156
- "epoch": 3.58,
5157
- "learning_rate": 0.0001,
5158
- "loss": 0.6635,
5159
- "step": 805
5160
- },
5161
- {
5162
- "epoch": 3.58,
5163
- "learning_rate": 0.0001,
5164
- "loss": 0.6311,
5165
- "step": 806
5166
- },
5167
- {
5168
- "epoch": 3.59,
5169
- "learning_rate": 0.0001,
5170
- "loss": 0.5901,
5171
- "step": 807
5172
- },
5173
- {
5174
- "epoch": 3.59,
5175
- "learning_rate": 0.0001,
5176
- "loss": 0.648,
5177
- "step": 808
5178
- },
5179
- {
5180
- "epoch": 3.6,
5181
- "learning_rate": 0.0001,
5182
- "loss": 0.6348,
5183
- "step": 809
5184
- },
5185
- {
5186
- "epoch": 3.6,
5187
- "learning_rate": 0.0001,
5188
- "loss": 0.6346,
5189
- "step": 810
5190
- },
5191
- {
5192
- "epoch": 3.6,
5193
- "learning_rate": 0.0001,
5194
- "loss": 0.6892,
5195
- "step": 811
5196
- },
5197
- {
5198
- "epoch": 3.61,
5199
- "learning_rate": 0.0001,
5200
- "loss": 0.6337,
5201
- "step": 812
5202
- },
5203
- {
5204
- "epoch": 3.61,
5205
- "learning_rate": 0.0001,
5206
- "loss": 0.6348,
5207
- "step": 813
5208
- },
5209
- {
5210
- "epoch": 3.62,
5211
- "learning_rate": 0.0001,
5212
- "loss": 0.6576,
5213
- "step": 814
5214
- },
5215
- {
5216
- "epoch": 3.62,
5217
- "learning_rate": 0.0001,
5218
- "loss": 0.6202,
5219
- "step": 815
5220
- },
5221
- {
5222
- "epoch": 3.63,
5223
- "learning_rate": 0.0001,
5224
- "loss": 0.6148,
5225
- "step": 816
5226
- },
5227
- {
5228
- "epoch": 3.63,
5229
- "learning_rate": 0.0001,
5230
- "loss": 0.668,
5231
- "step": 817
5232
- },
5233
- {
5234
- "epoch": 3.64,
5235
- "learning_rate": 0.0001,
5236
- "loss": 0.6663,
5237
- "step": 818
5238
- },
5239
- {
5240
- "epoch": 3.64,
5241
- "learning_rate": 0.0001,
5242
- "loss": 0.6004,
5243
- "step": 819
5244
- },
5245
- {
5246
- "epoch": 3.64,
5247
- "learning_rate": 0.0001,
5248
- "loss": 0.5944,
5249
- "step": 820
5250
- },
5251
- {
5252
- "epoch": 3.64,
5253
- "eval_loss": 0.7350344061851501,
5254
- "eval_runtime": 19.0874,
5255
- "eval_samples_per_second": 4.244,
5256
- "eval_steps_per_second": 1.1,
5257
- "step": 820
5258
- },
5259
- {
5260
- "epoch": 3.65,
5261
- "learning_rate": 0.0001,
5262
- "loss": 0.5598,
5263
- "step": 821
5264
- },
5265
- {
5266
- "epoch": 3.65,
5267
- "learning_rate": 0.0001,
5268
- "loss": 0.5836,
5269
- "step": 822
5270
- },
5271
- {
5272
- "epoch": 3.66,
5273
- "learning_rate": 0.0001,
5274
- "loss": 0.5664,
5275
- "step": 823
5276
- },
5277
- {
5278
- "epoch": 3.66,
5279
- "learning_rate": 0.0001,
5280
- "loss": 0.7259,
5281
- "step": 824
5282
- },
5283
- {
5284
- "epoch": 3.67,
5285
- "learning_rate": 0.0001,
5286
- "loss": 0.5605,
5287
- "step": 825
5288
- },
5289
- {
5290
- "epoch": 3.67,
5291
- "learning_rate": 0.0001,
5292
- "loss": 0.7895,
5293
- "step": 826
5294
- },
5295
- {
5296
- "epoch": 3.68,
5297
- "learning_rate": 0.0001,
5298
- "loss": 0.6594,
5299
- "step": 827
5300
- },
5301
- {
5302
- "epoch": 3.68,
5303
- "learning_rate": 0.0001,
5304
- "loss": 0.6306,
5305
- "step": 828
5306
- },
5307
- {
5308
- "epoch": 3.68,
5309
- "learning_rate": 0.0001,
5310
- "loss": 0.7212,
5311
- "step": 829
5312
- },
5313
- {
5314
- "epoch": 3.69,
5315
- "learning_rate": 0.0001,
5316
- "loss": 0.6964,
5317
- "step": 830
5318
- },
5319
- {
5320
- "epoch": 3.69,
5321
- "learning_rate": 0.0001,
5322
- "loss": 0.7085,
5323
- "step": 831
5324
- },
5325
- {
5326
- "epoch": 3.7,
5327
- "learning_rate": 0.0001,
5328
- "loss": 0.7088,
5329
- "step": 832
5330
- },
5331
- {
5332
- "epoch": 3.7,
5333
- "learning_rate": 0.0001,
5334
- "loss": 0.7106,
5335
- "step": 833
5336
- },
5337
- {
5338
- "epoch": 3.71,
5339
- "learning_rate": 0.0001,
5340
- "loss": 0.6695,
5341
- "step": 834
5342
- },
5343
- {
5344
- "epoch": 3.71,
5345
- "learning_rate": 0.0001,
5346
- "loss": 0.6925,
5347
- "step": 835
5348
- },
5349
- {
5350
- "epoch": 3.72,
5351
- "learning_rate": 0.0001,
5352
- "loss": 0.6677,
5353
- "step": 836
5354
- },
5355
- {
5356
- "epoch": 3.72,
5357
- "learning_rate": 0.0001,
5358
- "loss": 0.6597,
5359
- "step": 837
5360
- },
5361
- {
5362
- "epoch": 3.72,
5363
- "learning_rate": 0.0001,
5364
- "loss": 0.6428,
5365
- "step": 838
5366
- },
5367
- {
5368
- "epoch": 3.73,
5369
- "learning_rate": 0.0001,
5370
- "loss": 0.6708,
5371
- "step": 839
5372
- },
5373
- {
5374
- "epoch": 3.73,
5375
- "learning_rate": 0.0001,
5376
- "loss": 0.6382,
5377
- "step": 840
5378
- },
5379
- {
5380
- "epoch": 3.73,
5381
- "eval_loss": 0.7277283668518066,
5382
- "eval_runtime": 19.254,
5383
- "eval_samples_per_second": 4.207,
5384
- "eval_steps_per_second": 1.091,
5385
- "step": 840
5386
- },
5387
- {
5388
- "epoch": 3.74,
5389
- "learning_rate": 0.0001,
5390
- "loss": 0.6431,
5391
- "step": 841
5392
- },
5393
- {
5394
- "epoch": 3.74,
5395
- "learning_rate": 0.0001,
5396
- "loss": 0.5893,
5397
- "step": 842
5398
- },
5399
- {
5400
- "epoch": 3.75,
5401
- "learning_rate": 0.0001,
5402
- "loss": 0.5862,
5403
- "step": 843
5404
- },
5405
- {
5406
- "epoch": 3.75,
5407
- "learning_rate": 0.0001,
5408
- "loss": 0.608,
5409
- "step": 844
5410
- },
5411
- {
5412
- "epoch": 3.76,
5413
- "learning_rate": 0.0001,
5414
- "loss": 0.5948,
5415
- "step": 845
5416
- },
5417
- {
5418
- "epoch": 3.76,
5419
- "learning_rate": 0.0001,
5420
- "loss": 0.7078,
5421
- "step": 846
5422
- },
5423
- {
5424
- "epoch": 3.76,
5425
- "learning_rate": 0.0001,
5426
- "loss": 0.6372,
5427
- "step": 847
5428
- },
5429
- {
5430
- "epoch": 3.77,
5431
- "learning_rate": 0.0001,
5432
- "loss": 0.6611,
5433
- "step": 848
5434
- },
5435
- {
5436
- "epoch": 3.77,
5437
- "learning_rate": 0.0001,
5438
- "loss": 0.6026,
5439
- "step": 849
5440
- },
5441
- {
5442
- "epoch": 3.78,
5443
- "learning_rate": 0.0001,
5444
- "loss": 0.651,
5445
- "step": 850
5446
- },
5447
- {
5448
- "epoch": 3.78,
5449
- "learning_rate": 0.0001,
5450
- "loss": 0.6261,
5451
- "step": 851
5452
- },
5453
- {
5454
- "epoch": 3.79,
5455
- "learning_rate": 0.0001,
5456
- "loss": 0.6362,
5457
- "step": 852
5458
- },
5459
- {
5460
- "epoch": 3.79,
5461
- "learning_rate": 0.0001,
5462
- "loss": 0.5957,
5463
- "step": 853
5464
- },
5465
- {
5466
- "epoch": 3.8,
5467
- "learning_rate": 0.0001,
5468
- "loss": 0.6254,
5469
- "step": 854
5470
- },
5471
- {
5472
- "epoch": 3.8,
5473
- "learning_rate": 0.0001,
5474
- "loss": 0.6521,
5475
- "step": 855
5476
- },
5477
- {
5478
- "epoch": 3.8,
5479
- "learning_rate": 0.0001,
5480
- "loss": 0.6361,
5481
- "step": 856
5482
- },
5483
- {
5484
- "epoch": 3.81,
5485
- "learning_rate": 0.0001,
5486
- "loss": 0.6474,
5487
- "step": 857
5488
- },
5489
- {
5490
- "epoch": 3.81,
5491
- "learning_rate": 0.0001,
5492
- "loss": 0.6121,
5493
- "step": 858
5494
- },
5495
- {
5496
- "epoch": 3.82,
5497
- "learning_rate": 0.0001,
5498
- "loss": 0.5983,
5499
- "step": 859
5500
- },
5501
- {
5502
- "epoch": 3.82,
5503
- "learning_rate": 0.0001,
5504
- "loss": 0.6431,
5505
- "step": 860
5506
- },
5507
- {
5508
- "epoch": 3.82,
5509
- "eval_loss": 0.729424774646759,
5510
- "eval_runtime": 19.1466,
5511
- "eval_samples_per_second": 4.231,
5512
- "eval_steps_per_second": 1.097,
5513
- "step": 860
5514
- },
5515
- {
5516
- "epoch": 3.83,
5517
- "learning_rate": 0.0001,
5518
- "loss": 0.5838,
5519
- "step": 861
5520
- },
5521
- {
5522
- "epoch": 3.83,
5523
- "learning_rate": 0.0001,
5524
- "loss": 0.7613,
5525
- "step": 862
5526
- },
5527
- {
5528
- "epoch": 3.84,
5529
- "learning_rate": 0.0001,
5530
- "loss": 0.6654,
5531
- "step": 863
5532
- },
5533
- {
5534
- "epoch": 3.84,
5535
- "learning_rate": 0.0001,
5536
- "loss": 0.5563,
5537
- "step": 864
5538
- },
5539
- {
5540
- "epoch": 3.84,
5541
- "learning_rate": 0.0001,
5542
- "loss": 0.6706,
5543
- "step": 865
5544
- },
5545
- {
5546
- "epoch": 3.85,
5547
- "learning_rate": 0.0001,
5548
- "loss": 0.6083,
5549
- "step": 866
5550
- },
5551
- {
5552
- "epoch": 3.85,
5553
- "learning_rate": 0.0001,
5554
- "loss": 0.5665,
5555
- "step": 867
5556
- },
5557
- {
5558
- "epoch": 3.86,
5559
- "learning_rate": 0.0001,
5560
- "loss": 0.6089,
5561
- "step": 868
5562
- },
5563
- {
5564
- "epoch": 3.86,
5565
- "learning_rate": 0.0001,
5566
- "loss": 0.5937,
5567
- "step": 869
5568
- },
5569
- {
5570
- "epoch": 3.87,
5571
- "learning_rate": 0.0001,
5572
- "loss": 0.5958,
5573
- "step": 870
5574
- },
5575
- {
5576
- "epoch": 3.87,
5577
- "learning_rate": 0.0001,
5578
- "loss": 0.5194,
5579
- "step": 871
5580
- },
5581
- {
5582
- "epoch": 3.88,
5583
- "learning_rate": 0.0001,
5584
- "loss": 0.5701,
5585
- "step": 872
5586
- },
5587
- {
5588
- "epoch": 3.88,
5589
- "learning_rate": 0.0001,
5590
- "loss": 0.6191,
5591
- "step": 873
5592
- },
5593
- {
5594
- "epoch": 3.88,
5595
- "learning_rate": 0.0001,
5596
- "loss": 0.6032,
5597
- "step": 874
5598
- },
5599
- {
5600
- "epoch": 3.89,
5601
- "learning_rate": 0.0001,
5602
- "loss": 0.5789,
5603
- "step": 875
5604
- },
5605
- {
5606
- "epoch": 3.89,
5607
- "learning_rate": 0.0001,
5608
- "loss": 0.6796,
5609
- "step": 876
5610
- },
5611
- {
5612
- "epoch": 3.9,
5613
- "learning_rate": 0.0001,
5614
- "loss": 0.7132,
5615
- "step": 877
5616
- },
5617
- {
5618
- "epoch": 3.9,
5619
- "learning_rate": 0.0001,
5620
- "loss": 0.7038,
5621
- "step": 878
5622
- },
5623
- {
5624
- "epoch": 3.91,
5625
- "learning_rate": 0.0001,
5626
- "loss": 0.737,
5627
- "step": 879
5628
- },
5629
- {
5630
- "epoch": 3.91,
5631
- "learning_rate": 0.0001,
5632
- "loss": 0.6938,
5633
- "step": 880
5634
- },
5635
- {
5636
- "epoch": 3.91,
5637
- "eval_loss": 0.7355621457099915,
5638
- "eval_runtime": 19.0965,
5639
- "eval_samples_per_second": 4.242,
5640
- "eval_steps_per_second": 1.1,
5641
- "step": 880
5642
- },
5643
- {
5644
- "epoch": 3.92,
5645
- "learning_rate": 0.0001,
5646
- "loss": 0.7009,
5647
- "step": 881
5648
- },
5649
- {
5650
- "epoch": 3.92,
5651
- "learning_rate": 0.0001,
5652
- "loss": 0.6425,
5653
- "step": 882
5654
- },
5655
- {
5656
- "epoch": 3.92,
5657
- "learning_rate": 0.0001,
5658
- "loss": 0.6306,
5659
- "step": 883
5660
- },
5661
- {
5662
- "epoch": 3.93,
5663
- "learning_rate": 0.0001,
5664
- "loss": 0.6771,
5665
- "step": 884
5666
- },
5667
- {
5668
- "epoch": 3.93,
5669
- "learning_rate": 0.0001,
5670
- "loss": 0.6381,
5671
- "step": 885
5672
- },
5673
- {
5674
- "epoch": 3.94,
5675
- "learning_rate": 0.0001,
5676
- "loss": 0.6741,
5677
- "step": 886
5678
- },
5679
- {
5680
- "epoch": 3.94,
5681
- "learning_rate": 0.0001,
5682
- "loss": 0.6783,
5683
- "step": 887
5684
- },
5685
- {
5686
- "epoch": 3.95,
5687
- "learning_rate": 0.0001,
5688
- "loss": 0.6619,
5689
- "step": 888
5690
- },
5691
- {
5692
- "epoch": 3.95,
5693
- "learning_rate": 0.0001,
5694
- "loss": 0.6493,
5695
- "step": 889
5696
- },
5697
- {
5698
- "epoch": 3.96,
5699
- "learning_rate": 0.0001,
5700
- "loss": 0.6248,
5701
- "step": 890
5702
- },
5703
- {
5704
- "epoch": 3.96,
5705
- "learning_rate": 0.0001,
5706
- "loss": 0.5975,
5707
- "step": 891
5708
- },
5709
- {
5710
- "epoch": 3.96,
5711
- "learning_rate": 0.0001,
5712
- "loss": 0.6597,
5713
- "step": 892
5714
- },
5715
- {
5716
- "epoch": 3.97,
5717
- "learning_rate": 0.0001,
5718
- "loss": 0.5691,
5719
- "step": 893
5720
- },
5721
- {
5722
- "epoch": 3.97,
5723
- "learning_rate": 0.0001,
5724
- "loss": 0.6303,
5725
- "step": 894
5726
- },
5727
- {
5728
- "epoch": 3.98,
5729
- "learning_rate": 0.0001,
5730
- "loss": 0.6042,
5731
- "step": 895
5732
- },
5733
- {
5734
- "epoch": 3.98,
5735
- "learning_rate": 0.0001,
5736
- "loss": 0.5876,
5737
- "step": 896
5738
- },
5739
- {
5740
- "epoch": 3.99,
5741
- "learning_rate": 0.0001,
5742
- "loss": 0.6245,
5743
- "step": 897
5744
- },
5745
- {
5746
- "epoch": 3.99,
5747
- "learning_rate": 0.0001,
5748
- "loss": 0.5628,
5749
- "step": 898
5750
- },
5751
- {
5752
- "epoch": 4.0,
5753
- "learning_rate": 0.0001,
5754
- "loss": 0.5753,
5755
- "step": 899
5756
- },
5757
- {
5758
- "epoch": 4.0,
5759
- "learning_rate": 0.0001,
5760
- "loss": 0.5886,
5761
- "step": 900
5762
- },
5763
- {
5764
- "epoch": 4.0,
5765
- "eval_loss": 0.7348082661628723,
5766
- "eval_runtime": 18.8837,
5767
- "eval_samples_per_second": 4.289,
5768
- "eval_steps_per_second": 1.112,
5769
- "step": 900
5770
  }
5771
  ],
5772
  "logging_steps": 1,
@@ -5774,7 +4328,7 @@
5774
  "num_input_tokens_seen": 0,
5775
  "num_train_epochs": 10,
5776
  "save_steps": 20,
5777
- "total_flos": 2.1376566399074304e+17,
5778
  "train_batch_size": 4,
5779
  "trial_name": null,
5780
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 20,
6
+ "global_step": 675,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4321
  "learning_rate": 0.0001,
4322
  "loss": 0.716,
4323
  "step": 675
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4324
  }
4325
  ],
4326
  "logging_steps": 1,
 
4328
  "num_input_tokens_seen": 0,
4329
  "num_train_epochs": 10,
4330
  "save_steps": 20,
4331
+ "total_flos": 1.6025567876087808e+17,
4332
  "train_batch_size": 4,
4333
  "trial_name": null,
4334
  "trial_params": null