Femboyuwu2000 commited on
Commit
72e7ceb
1 Parent(s): 7c4ee5e

Training in progress, step 2320, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fafd0f157d14e49e4739550f3f127d870e4c0caf60624531f00c5d7035d51299
3
  size 13982248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fca914b2057aa482eb7a6841c80c2efe9c097a9f40f833d983af4537dc40f6bb
3
  size 13982248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68070370254082a9da3aeac4075c069ea60def96eef668a1fc6d9c0196fc0554
3
  size 7062522
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0648084ec0eae35cf7dd3a294bee444ad33f083ffa2a37e0110a6f2612622237
3
  size 7062522
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98d3711de76e8c84da6b52967c082fa25514ac6013e7232a9fdcdac089584cc1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db6383b70031a203d8f79911eab63df2eb868f69d22da66a6d7b3fea0a2b6f6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dde20670de2d0a6327803d0e03f03b8ccee00551dc9309464bb5c115b97010c1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4fa7f675de7d8161961ca1ab082c7d8425357c049f1a3dc57c1e35db98971b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0832,
5
  "eval_steps": 500,
6
- "global_step": 1040,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -371,6 +371,454 @@
371
  "learning_rate": 2.996193909122197e-05,
372
  "loss": 3.7447,
373
  "step": 1040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  }
375
  ],
376
  "logging_steps": 20,
@@ -378,7 +826,7 @@
378
  "num_input_tokens_seen": 0,
379
  "num_train_epochs": 2,
380
  "save_steps": 20,
381
- "total_flos": 2473139331366912.0,
382
  "train_batch_size": 8,
383
  "trial_name": null,
384
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.1856,
5
  "eval_steps": 500,
6
+ "global_step": 2320,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
371
  "learning_rate": 2.996193909122197e-05,
372
  "loss": 3.7447,
373
  "step": 1040
374
+ },
375
+ {
376
+ "epoch": 0.08,
377
+ "grad_norm": 26.41890525817871,
378
+ "learning_rate": 2.995840200683028e-05,
379
+ "loss": 3.5839,
380
+ "step": 1060
381
+ },
382
+ {
383
+ "epoch": 0.09,
384
+ "grad_norm": 32.88002014160156,
385
+ "learning_rate": 2.995470801587973e-05,
386
+ "loss": 3.6606,
387
+ "step": 1080
388
+ },
389
+ {
390
+ "epoch": 0.09,
391
+ "grad_norm": 32.0895881652832,
392
+ "learning_rate": 2.9950857157118544e-05,
393
+ "loss": 3.677,
394
+ "step": 1100
395
+ },
396
+ {
397
+ "epoch": 0.09,
398
+ "grad_norm": 37.726783752441406,
399
+ "learning_rate": 2.9946849470940395e-05,
400
+ "loss": 3.5546,
401
+ "step": 1120
402
+ },
403
+ {
404
+ "epoch": 0.09,
405
+ "grad_norm": 56.246299743652344,
406
+ "learning_rate": 2.9942684999384034e-05,
407
+ "loss": 3.6391,
408
+ "step": 1140
409
+ },
410
+ {
411
+ "epoch": 0.09,
412
+ "grad_norm": 35.675662994384766,
413
+ "learning_rate": 2.993836378613278e-05,
414
+ "loss": 3.5918,
415
+ "step": 1160
416
+ },
417
+ {
418
+ "epoch": 0.09,
419
+ "grad_norm": 26.685134887695312,
420
+ "learning_rate": 2.993388587651412e-05,
421
+ "loss": 3.6331,
422
+ "step": 1180
423
+ },
424
+ {
425
+ "epoch": 0.1,
426
+ "grad_norm": 27.400333404541016,
427
+ "learning_rate": 2.992925131749921e-05,
428
+ "loss": 3.6214,
429
+ "step": 1200
430
+ },
431
+ {
432
+ "epoch": 0.1,
433
+ "grad_norm": 28.501314163208008,
434
+ "learning_rate": 2.9924460157702378e-05,
435
+ "loss": 3.6619,
436
+ "step": 1220
437
+ },
438
+ {
439
+ "epoch": 0.1,
440
+ "grad_norm": 30.773778915405273,
441
+ "learning_rate": 2.991951244738063e-05,
442
+ "loss": 3.6453,
443
+ "step": 1240
444
+ },
445
+ {
446
+ "epoch": 0.1,
447
+ "grad_norm": 24.701374053955078,
448
+ "learning_rate": 2.9914408238433095e-05,
449
+ "loss": 3.7282,
450
+ "step": 1260
451
+ },
452
+ {
453
+ "epoch": 0.1,
454
+ "grad_norm": 27.605117797851562,
455
+ "learning_rate": 2.990914758440052e-05,
456
+ "loss": 3.6635,
457
+ "step": 1280
458
+ },
459
+ {
460
+ "epoch": 0.1,
461
+ "grad_norm": 27.829086303710938,
462
+ "learning_rate": 2.9903730540464668e-05,
463
+ "loss": 3.5293,
464
+ "step": 1300
465
+ },
466
+ {
467
+ "epoch": 0.11,
468
+ "grad_norm": 40.916263580322266,
469
+ "learning_rate": 2.9898157163447767e-05,
470
+ "loss": 3.6976,
471
+ "step": 1320
472
+ },
473
+ {
474
+ "epoch": 0.11,
475
+ "grad_norm": 33.31068420410156,
476
+ "learning_rate": 2.9892427511811912e-05,
477
+ "loss": 3.548,
478
+ "step": 1340
479
+ },
480
+ {
481
+ "epoch": 0.11,
482
+ "grad_norm": 29.932533264160156,
483
+ "learning_rate": 2.9886541645658435e-05,
484
+ "loss": 3.7486,
485
+ "step": 1360
486
+ },
487
+ {
488
+ "epoch": 0.11,
489
+ "grad_norm": 35.59455490112305,
490
+ "learning_rate": 2.9880499626727284e-05,
491
+ "loss": 3.6342,
492
+ "step": 1380
493
+ },
494
+ {
495
+ "epoch": 0.11,
496
+ "grad_norm": 29.93869400024414,
497
+ "learning_rate": 2.9874301518396377e-05,
498
+ "loss": 3.6615,
499
+ "step": 1400
500
+ },
501
+ {
502
+ "epoch": 0.11,
503
+ "grad_norm": 43.417213439941406,
504
+ "learning_rate": 2.986794738568094e-05,
505
+ "loss": 3.607,
506
+ "step": 1420
507
+ },
508
+ {
509
+ "epoch": 0.12,
510
+ "grad_norm": 52.483917236328125,
511
+ "learning_rate": 2.9861437295232825e-05,
512
+ "loss": 3.5937,
513
+ "step": 1440
514
+ },
515
+ {
516
+ "epoch": 0.12,
517
+ "grad_norm": 30.312334060668945,
518
+ "learning_rate": 2.9854771315339787e-05,
519
+ "loss": 3.5991,
520
+ "step": 1460
521
+ },
522
+ {
523
+ "epoch": 0.12,
524
+ "grad_norm": 49.459136962890625,
525
+ "learning_rate": 2.984794951592481e-05,
526
+ "loss": 3.5261,
527
+ "step": 1480
528
+ },
529
+ {
530
+ "epoch": 0.12,
531
+ "grad_norm": 34.81111526489258,
532
+ "learning_rate": 2.984097196854534e-05,
533
+ "loss": 3.6818,
534
+ "step": 1500
535
+ },
536
+ {
537
+ "epoch": 0.12,
538
+ "grad_norm": 34.721946716308594,
539
+ "learning_rate": 2.9833838746392544e-05,
540
+ "loss": 3.5636,
541
+ "step": 1520
542
+ },
543
+ {
544
+ "epoch": 0.12,
545
+ "grad_norm": 31.46621322631836,
546
+ "learning_rate": 2.982654992429056e-05,
547
+ "loss": 3.5597,
548
+ "step": 1540
549
+ },
550
+ {
551
+ "epoch": 0.12,
552
+ "grad_norm": 38.78512191772461,
553
+ "learning_rate": 2.981910557869566e-05,
554
+ "loss": 3.661,
555
+ "step": 1560
556
+ },
557
+ {
558
+ "epoch": 0.13,
559
+ "grad_norm": 27.38837432861328,
560
+ "learning_rate": 2.981150578769553e-05,
561
+ "loss": 3.6173,
562
+ "step": 1580
563
+ },
564
+ {
565
+ "epoch": 0.13,
566
+ "grad_norm": 45.619632720947266,
567
+ "learning_rate": 2.980375063100836e-05,
568
+ "loss": 3.6632,
569
+ "step": 1600
570
+ },
571
+ {
572
+ "epoch": 0.13,
573
+ "grad_norm": 30.708433151245117,
574
+ "learning_rate": 2.979584018998209e-05,
575
+ "loss": 3.5165,
576
+ "step": 1620
577
+ },
578
+ {
579
+ "epoch": 0.13,
580
+ "grad_norm": 35.472938537597656,
581
+ "learning_rate": 2.97877745475935e-05,
582
+ "loss": 3.5157,
583
+ "step": 1640
584
+ },
585
+ {
586
+ "epoch": 0.13,
587
+ "grad_norm": 39.029415130615234,
588
+ "learning_rate": 2.9779553788447358e-05,
589
+ "loss": 3.6259,
590
+ "step": 1660
591
+ },
592
+ {
593
+ "epoch": 0.13,
594
+ "grad_norm": 57.90769577026367,
595
+ "learning_rate": 2.977117799877554e-05,
596
+ "loss": 3.6378,
597
+ "step": 1680
598
+ },
599
+ {
600
+ "epoch": 0.14,
601
+ "grad_norm": 36.95255661010742,
602
+ "learning_rate": 2.9762647266436115e-05,
603
+ "loss": 3.5845,
604
+ "step": 1700
605
+ },
606
+ {
607
+ "epoch": 0.14,
608
+ "grad_norm": 27.456787109375,
609
+ "learning_rate": 2.9753961680912432e-05,
610
+ "loss": 3.6647,
611
+ "step": 1720
612
+ },
613
+ {
614
+ "epoch": 0.14,
615
+ "grad_norm": 27.383285522460938,
616
+ "learning_rate": 2.9745121333312166e-05,
617
+ "loss": 3.6668,
618
+ "step": 1740
619
+ },
620
+ {
621
+ "epoch": 0.14,
622
+ "grad_norm": 26.555049896240234,
623
+ "learning_rate": 2.9736126316366385e-05,
624
+ "loss": 3.6617,
625
+ "step": 1760
626
+ },
627
+ {
628
+ "epoch": 0.14,
629
+ "grad_norm": 34.009620666503906,
630
+ "learning_rate": 2.9726976724428563e-05,
631
+ "loss": 3.572,
632
+ "step": 1780
633
+ },
634
+ {
635
+ "epoch": 0.14,
636
+ "grad_norm": 45.44181823730469,
637
+ "learning_rate": 2.9717672653473588e-05,
638
+ "loss": 3.6354,
639
+ "step": 1800
640
+ },
641
+ {
642
+ "epoch": 0.15,
643
+ "grad_norm": 30.79588508605957,
644
+ "learning_rate": 2.9708214201096758e-05,
645
+ "loss": 3.6953,
646
+ "step": 1820
647
+ },
648
+ {
649
+ "epoch": 0.15,
650
+ "grad_norm": 46.61872482299805,
651
+ "learning_rate": 2.9698601466512767e-05,
652
+ "loss": 3.5373,
653
+ "step": 1840
654
+ },
655
+ {
656
+ "epoch": 0.15,
657
+ "grad_norm": 42.86500930786133,
658
+ "learning_rate": 2.9688834550554647e-05,
659
+ "loss": 3.5982,
660
+ "step": 1860
661
+ },
662
+ {
663
+ "epoch": 0.15,
664
+ "grad_norm": 33.480289459228516,
665
+ "learning_rate": 2.9678913555672733e-05,
666
+ "loss": 3.6024,
667
+ "step": 1880
668
+ },
669
+ {
670
+ "epoch": 0.15,
671
+ "grad_norm": 36.41415786743164,
672
+ "learning_rate": 2.966883858593356e-05,
673
+ "loss": 3.4843,
674
+ "step": 1900
675
+ },
676
+ {
677
+ "epoch": 0.15,
678
+ "grad_norm": 41.39873123168945,
679
+ "learning_rate": 2.9658609747018796e-05,
680
+ "loss": 3.5257,
681
+ "step": 1920
682
+ },
683
+ {
684
+ "epoch": 0.16,
685
+ "grad_norm": 31.24024200439453,
686
+ "learning_rate": 2.964822714622412e-05,
687
+ "loss": 3.5927,
688
+ "step": 1940
689
+ },
690
+ {
691
+ "epoch": 0.16,
692
+ "grad_norm": 52.78026580810547,
693
+ "learning_rate": 2.9637690892458103e-05,
694
+ "loss": 3.4678,
695
+ "step": 1960
696
+ },
697
+ {
698
+ "epoch": 0.16,
699
+ "grad_norm": 27.40117835998535,
700
+ "learning_rate": 2.962700109624106e-05,
701
+ "loss": 3.5541,
702
+ "step": 1980
703
+ },
704
+ {
705
+ "epoch": 0.16,
706
+ "grad_norm": 23.172683715820312,
707
+ "learning_rate": 2.961615786970389e-05,
708
+ "loss": 3.5713,
709
+ "step": 2000
710
+ },
711
+ {
712
+ "epoch": 0.16,
713
+ "grad_norm": 24.177541732788086,
714
+ "learning_rate": 2.960516132658692e-05,
715
+ "loss": 3.585,
716
+ "step": 2020
717
+ },
718
+ {
719
+ "epoch": 0.16,
720
+ "grad_norm": 44.673912048339844,
721
+ "learning_rate": 2.9594011582238672e-05,
722
+ "loss": 3.5035,
723
+ "step": 2040
724
+ },
725
+ {
726
+ "epoch": 0.16,
727
+ "grad_norm": 35.91664505004883,
728
+ "learning_rate": 2.95827087536147e-05,
729
+ "loss": 3.6404,
730
+ "step": 2060
731
+ },
732
+ {
733
+ "epoch": 0.17,
734
+ "grad_norm": 27.3450870513916,
735
+ "learning_rate": 2.9571252959276313e-05,
736
+ "loss": 3.5121,
737
+ "step": 2080
738
+ },
739
+ {
740
+ "epoch": 0.17,
741
+ "grad_norm": 25.66405487060547,
742
+ "learning_rate": 2.955964431938939e-05,
743
+ "loss": 3.5009,
744
+ "step": 2100
745
+ },
746
+ {
747
+ "epoch": 0.17,
748
+ "grad_norm": 24.1674861907959,
749
+ "learning_rate": 2.9547882955723052e-05,
750
+ "loss": 3.5482,
751
+ "step": 2120
752
+ },
753
+ {
754
+ "epoch": 0.17,
755
+ "grad_norm": 49.72268295288086,
756
+ "learning_rate": 2.953596899164846e-05,
757
+ "loss": 3.4969,
758
+ "step": 2140
759
+ },
760
+ {
761
+ "epoch": 0.17,
762
+ "grad_norm": 26.238168716430664,
763
+ "learning_rate": 2.9523902552137436e-05,
764
+ "loss": 3.5541,
765
+ "step": 2160
766
+ },
767
+ {
768
+ "epoch": 0.17,
769
+ "grad_norm": 30.524545669555664,
770
+ "learning_rate": 2.951168376376124e-05,
771
+ "loss": 3.6343,
772
+ "step": 2180
773
+ },
774
+ {
775
+ "epoch": 0.18,
776
+ "grad_norm": 38.179908752441406,
777
+ "learning_rate": 2.9499312754689168e-05,
778
+ "loss": 3.4795,
779
+ "step": 2200
780
+ },
781
+ {
782
+ "epoch": 0.18,
783
+ "grad_norm": 32.98453903198242,
784
+ "learning_rate": 2.9486789654687256e-05,
785
+ "loss": 3.6333,
786
+ "step": 2220
787
+ },
788
+ {
789
+ "epoch": 0.18,
790
+ "grad_norm": 26.77848243713379,
791
+ "learning_rate": 2.94741145951169e-05,
792
+ "loss": 3.5654,
793
+ "step": 2240
794
+ },
795
+ {
796
+ "epoch": 0.18,
797
+ "grad_norm": 27.737852096557617,
798
+ "learning_rate": 2.9461287708933475e-05,
799
+ "loss": 3.5044,
800
+ "step": 2260
801
+ },
802
+ {
803
+ "epoch": 0.18,
804
+ "grad_norm": 34.2584342956543,
805
+ "learning_rate": 2.9448309130684944e-05,
806
+ "loss": 3.5979,
807
+ "step": 2280
808
+ },
809
+ {
810
+ "epoch": 0.18,
811
+ "grad_norm": 57.86616897583008,
812
+ "learning_rate": 2.9435178996510456e-05,
813
+ "loss": 3.5726,
814
+ "step": 2300
815
+ },
816
+ {
817
+ "epoch": 0.19,
818
+ "grad_norm": 37.64597702026367,
819
+ "learning_rate": 2.9421897444138902e-05,
820
+ "loss": 3.5913,
821
+ "step": 2320
822
  }
823
  ],
824
  "logging_steps": 20,
 
826
  "num_input_tokens_seen": 0,
827
  "num_train_epochs": 2,
828
  "save_steps": 20,
829
+ "total_flos": 5493973373190144.0,
830
  "train_batch_size": 8,
831
  "trial_name": null,
832
  "trial_params": null