aleegis committed
Commit 0d02e86 · verified · Parent: 743d011

Training in progress, epoch 1, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:95c516fe4d7ba1b8df18d4e76c625fa01d2ce34ffe0a062438bec237a014244b
+ oid sha256:4054fc3341df3ec62ad23eb40df504bf559ae3fe109c9358a6fe2fbe13c2b5db
  size 203456160
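
The adapter_model.safetensors pointer above tracks the updated adapter weights for this checkpoint. Assuming the checkpoint was written with PEFT (which the adapter_model.safetensors naming suggests), a minimal sketch of loading it on top of its base model follows; the base model id is a placeholder, since this commit does not name it.

from transformers import AutoModelForCausalLM
from peft import PeftModel

# Placeholder id: the base model is not named in this commit.
base = AutoModelForCausalLM.from_pretrained("base-model-name")

# Loads last-checkpoint/adapter_model.safetensors onto the base model.
model = PeftModel.from_pretrained(base, "last-checkpoint")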
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:888abf7621e98b6dff03f8f532216dec008c8ba5cbf838bb0ff7a81c8f50bbd6
+ oid sha256:f7de5b56bb44d80a59d57ae0d74443cf71e7c3a04fcea2547755d8ae3f6ab98b
  size 407127126
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d1ba92e1c99bef2498a249fd40a486215006468f687654840f88285270cef3c8
+ oid sha256:d4d2e3c9c2c299605f839dd9681034a9a825180b34e15be15b76272f4fb00418
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:30d4acc946a913917ad083137cc06e69b7e71ebdbeee53948832810de27f25fc
+ oid sha256:6dfefdd350a9c50a5ce8f17f77222cc43ade97bd1bc4dfe20825b1649c6776a6
  size 1064
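
All four files above are Git LFS pointer files: the repository stores only the three-line pointer shown in each diff (spec version, sha256 oid, byte size), while the binary blob lives in LFS storage, so each new checkpoint simply swaps the oid. A minimal, illustrative sketch of checking a downloaded blob against such a pointer (real LFS clients do this automatically):

import hashlib

def read_pointer(path):
    # An LFS pointer is three "key value" lines: version, oid, size.
    fields = dict(line.split(" ", 1) for line in open(path).read().splitlines() if line)
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

def matches_pointer(pointer_path, blob_path):
    oid, size = read_pointer(pointer_path)
    data = open(blob_path, "rb").read()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid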
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.9994088669950739,
+ "epoch": 1.9988177339901478,
  "eval_steps": 500,
- "global_step": 634,
+ "global_step": 1268,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -637,6 +637,643 @@
  "learning_rate": 6.861468292009727e-05,
  "loss": 0.7384,
  "step": 630
640
+ },
641
+ {
642
+ "epoch": 1.0041379310344827,
643
+ "grad_norm": 0.28151699900627136,
644
+ "learning_rate": 6.788347664153447e-05,
645
+ "loss": 0.7612,
646
+ "step": 637
647
+ },
648
+ {
649
+ "epoch": 1.0151724137931035,
650
+ "grad_norm": 0.2590833604335785,
651
+ "learning_rate": 6.714785788270658e-05,
652
+ "loss": 0.6778,
653
+ "step": 644
654
+ },
655
+ {
656
+ "epoch": 1.0262068965517241,
657
+ "grad_norm": 0.268964558839798,
658
+ "learning_rate": 6.640800814653503e-05,
659
+ "loss": 0.6337,
660
+ "step": 651
661
+ },
662
+ {
663
+ "epoch": 1.0372413793103448,
664
+ "grad_norm": 0.26795223355293274,
665
+ "learning_rate": 6.566410997987163e-05,
666
+ "loss": 0.6548,
667
+ "step": 658
668
+ },
669
+ {
670
+ "epoch": 1.0482758620689656,
671
+ "grad_norm": 0.25789910554885864,
672
+ "learning_rate": 6.49163469284578e-05,
673
+ "loss": 0.6494,
674
+ "step": 665
675
+ },
676
+ {
677
+ "epoch": 1.0593103448275862,
678
+ "grad_norm": 0.27773144841194153,
679
+ "learning_rate": 6.416490349163748e-05,
680
+ "loss": 0.6541,
681
+ "step": 672
682
+ },
683
+ {
684
+ "epoch": 1.0703448275862069,
685
+ "grad_norm": 0.28650370240211487,
686
+ "learning_rate": 6.340996507683458e-05,
687
+ "loss": 0.6677,
688
+ "step": 679
689
+ },
690
+ {
691
+ "epoch": 1.0813793103448275,
692
+ "grad_norm": 0.2758565843105316,
693
+ "learning_rate": 6.265171795380659e-05,
694
+ "loss": 0.6299,
695
+ "step": 686
696
+ },
697
+ {
698
+ "epoch": 1.0924137931034483,
699
+ "grad_norm": 0.2588457465171814,
700
+ "learning_rate": 6.189034920868522e-05,
701
+ "loss": 0.6391,
702
+ "step": 693
703
+ },
704
+ {
705
+ "epoch": 1.103448275862069,
706
+ "grad_norm": 0.2821789085865021,
707
+ "learning_rate": 6.112604669781572e-05,
708
+ "loss": 0.6239,
709
+ "step": 700
710
+ },
711
+ {
712
+ "epoch": 1.1144827586206896,
713
+ "grad_norm": 0.2735387086868286,
714
+ "learning_rate": 6.0358999001406156e-05,
715
+ "loss": 0.6581,
716
+ "step": 707
717
+ },
718
+ {
719
+ "epoch": 1.1255172413793104,
720
+ "grad_norm": 0.2622285485267639,
721
+ "learning_rate": 5.9589395376998e-05,
722
+ "loss": 0.6396,
723
+ "step": 714
724
+ },
725
+ {
726
+ "epoch": 1.136551724137931,
727
+ "grad_norm": 0.27206316590309143,
728
+ "learning_rate": 5.8817425712769794e-05,
729
+ "loss": 0.6305,
730
+ "step": 721
731
+ },
732
+ {
733
+ "epoch": 1.1475862068965517,
734
+ "grad_norm": 0.2703116536140442,
735
+ "learning_rate": 5.804328048068492e-05,
736
+ "loss": 0.6498,
737
+ "step": 728
738
+ },
739
+ {
740
+ "epoch": 1.1586206896551725,
741
+ "grad_norm": 0.2688596844673157,
742
+ "learning_rate": 5.7267150689495644e-05,
743
+ "loss": 0.6453,
744
+ "step": 735
745
+ },
746
+ {
747
+ "epoch": 1.1696551724137931,
748
+ "grad_norm": 0.27926501631736755,
749
+ "learning_rate": 5.648922783761443e-05,
750
+ "loss": 0.6715,
751
+ "step": 742
752
+ },
753
+ {
754
+ "epoch": 1.1806896551724138,
755
+ "grad_norm": 0.2714243531227112,
756
+ "learning_rate": 5.570970386586469e-05,
757
+ "loss": 0.6502,
758
+ "step": 749
759
+ },
760
+ {
761
+ "epoch": 1.1917241379310344,
762
+ "grad_norm": 0.27098771929740906,
763
+ "learning_rate": 5.492877111012218e-05,
764
+ "loss": 0.6382,
765
+ "step": 756
766
+ },
767
+ {
768
+ "epoch": 1.2027586206896552,
769
+ "grad_norm": 0.28337958455085754,
770
+ "learning_rate": 5.414662225385903e-05,
771
+ "loss": 0.6383,
772
+ "step": 763
773
+ },
774
+ {
775
+ "epoch": 1.2137931034482758,
776
+ "grad_norm": 0.28135356307029724,
777
+ "learning_rate": 5.336345028060199e-05,
778
+ "loss": 0.6433,
779
+ "step": 770
780
+ },
781
+ {
782
+ "epoch": 1.2248275862068965,
783
+ "grad_norm": 0.268600195646286,
784
+ "learning_rate": 5.257944842631658e-05,
785
+ "loss": 0.6503,
786
+ "step": 777
787
+ },
788
+ {
789
+ "epoch": 1.2358620689655173,
790
+ "grad_norm": 0.268028199672699,
791
+ "learning_rate": 5.179481013172912e-05,
792
+ "loss": 0.64,
793
+ "step": 784
794
+ },
795
+ {
796
+ "epoch": 1.246896551724138,
797
+ "grad_norm": 0.26292359828948975,
798
+ "learning_rate": 5.100972899459796e-05,
799
+ "loss": 0.6211,
800
+ "step": 791
801
+ },
802
+ {
803
+ "epoch": 1.2579310344827586,
804
+ "grad_norm": 0.2784179151058197,
805
+ "learning_rate": 5.022439872194629e-05,
806
+ "loss": 0.6466,
807
+ "step": 798
808
+ },
809
+ {
810
+ "epoch": 1.2689655172413792,
811
+ "grad_norm": 0.28252890706062317,
812
+ "learning_rate": 4.943901308226771e-05,
813
+ "loss": 0.6368,
814
+ "step": 805
815
+ },
816
+ {
817
+ "epoch": 1.28,
818
+ "grad_norm": 0.2834304869174957,
819
+ "learning_rate": 4.865376585771687e-05,
820
+ "loss": 0.6336,
821
+ "step": 812
822
+ },
823
+ {
824
+ "epoch": 1.2910344827586206,
825
+ "grad_norm": 0.2688741981983185,
826
+ "learning_rate": 4.7868850796296495e-05,
827
+ "loss": 0.653,
828
+ "step": 819
829
+ },
830
+ {
831
+ "epoch": 1.3020689655172415,
832
+ "grad_norm": 0.2756083905696869,
833
+ "learning_rate": 4.708446156405307e-05,
834
+ "loss": 0.6581,
835
+ "step": 826
836
+ },
837
+ {
838
+ "epoch": 1.3131034482758621,
839
+ "grad_norm": 0.2754676640033722,
840
+ "learning_rate": 4.630079169729257e-05,
841
+ "loss": 0.6393,
842
+ "step": 833
843
+ },
844
+ {
845
+ "epoch": 1.3241379310344827,
846
+ "grad_norm": 0.26543307304382324,
847
+ "learning_rate": 4.551803455482833e-05,
848
+ "loss": 0.6264,
849
+ "step": 840
850
+ },
851
+ {
852
+ "epoch": 1.3351724137931034,
853
+ "grad_norm": 0.2671962380409241,
854
+ "learning_rate": 4.473638327027259e-05,
855
+ "loss": 0.6266,
856
+ "step": 847
857
+ },
858
+ {
859
+ "epoch": 1.3462068965517242,
860
+ "grad_norm": 0.2819383442401886,
861
+ "learning_rate": 4.395603070438373e-05,
862
+ "loss": 0.6426,
863
+ "step": 854
864
+ },
865
+ {
866
+ "epoch": 1.3572413793103448,
867
+ "grad_norm": 0.2790848910808563,
868
+ "learning_rate": 4.31771693974807e-05,
869
+ "loss": 0.633,
870
+ "step": 861
871
+ },
872
+ {
873
+ "epoch": 1.3682758620689655,
874
+ "grad_norm": 0.2653134763240814,
875
+ "learning_rate": 4.239999152193664e-05,
876
+ "loss": 0.637,
877
+ "step": 868
878
+ },
879
+ {
880
+ "epoch": 1.3793103448275863,
881
+ "grad_norm": 0.27828529477119446,
882
+ "learning_rate": 4.162468883476319e-05,
883
+ "loss": 0.6273,
884
+ "step": 875
885
+ },
886
+ {
887
+ "epoch": 1.390344827586207,
888
+ "grad_norm": 0.27933961153030396,
889
+ "learning_rate": 4.085145263029726e-05,
890
+ "loss": 0.6477,
891
+ "step": 882
892
+ },
893
+ {
894
+ "epoch": 1.4013793103448275,
895
+ "grad_norm": 0.2893773317337036,
896
+ "learning_rate": 4.008047369300218e-05,
897
+ "loss": 0.6374,
898
+ "step": 889
899
+ },
900
+ {
901
+ "epoch": 1.4124137931034482,
902
+ "grad_norm": 0.29266366362571716,
903
+ "learning_rate": 3.9311942250394276e-05,
904
+ "loss": 0.6051,
905
+ "step": 896
906
+ },
907
+ {
908
+ "epoch": 1.423448275862069,
909
+ "grad_norm": 0.28573766350746155,
910
+ "learning_rate": 3.8546047926107256e-05,
911
+ "loss": 0.6371,
912
+ "step": 903
913
+ },
914
+ {
915
+ "epoch": 1.4344827586206896,
916
+ "grad_norm": 0.2910100817680359,
917
+ "learning_rate": 3.778297969310529e-05,
918
+ "loss": 0.6491,
919
+ "step": 910
920
+ },
921
+ {
922
+ "epoch": 1.4455172413793105,
923
+ "grad_norm": 0.2896603047847748,
924
+ "learning_rate": 3.7022925827056884e-05,
925
+ "loss": 0.6332,
926
+ "step": 917
927
+ },
928
+ {
929
+ "epoch": 1.456551724137931,
930
+ "grad_norm": 0.26403993368148804,
931
+ "learning_rate": 3.62660738598805e-05,
932
+ "loss": 0.6387,
933
+ "step": 924
934
+ },
935
+ {
936
+ "epoch": 1.4675862068965517,
937
+ "grad_norm": 0.27510949969291687,
938
+ "learning_rate": 3.551261053347404e-05,
939
+ "loss": 0.6217,
940
+ "step": 931
941
+ },
942
+ {
943
+ "epoch": 1.4786206896551723,
944
+ "grad_norm": 0.28636762499809265,
945
+ "learning_rate": 3.4762721753638995e-05,
946
+ "loss": 0.6186,
947
+ "step": 938
948
+ },
949
+ {
950
+ "epoch": 1.489655172413793,
951
+ "grad_norm": 0.2784745991230011,
952
+ "learning_rate": 3.401659254421094e-05,
953
+ "loss": 0.6392,
954
+ "step": 945
955
+ },
956
+ {
957
+ "epoch": 1.5006896551724138,
958
+ "grad_norm": 0.2818538546562195,
959
+ "learning_rate": 3.3274407001407735e-05,
960
+ "loss": 0.6311,
961
+ "step": 952
962
+ },
963
+ {
964
+ "epoch": 1.5117241379310344,
965
+ "grad_norm": 0.2834908068180084,
966
+ "learning_rate": 3.2536348248406534e-05,
967
+ "loss": 0.6205,
968
+ "step": 959
969
+ },
970
+ {
971
+ "epoch": 1.5227586206896553,
972
+ "grad_norm": 0.27675577998161316,
973
+ "learning_rate": 3.1802598390160784e-05,
974
+ "loss": 0.6191,
975
+ "step": 966
976
+ },
977
+ {
978
+ "epoch": 1.533793103448276,
979
+ "grad_norm": 0.27893996238708496,
980
+ "learning_rate": 3.107333846846872e-05,
981
+ "loss": 0.6263,
982
+ "step": 973
983
+ },
984
+ {
985
+ "epoch": 1.5448275862068965,
986
+ "grad_norm": 0.2641468048095703,
987
+ "learning_rate": 3.0348748417303823e-05,
988
+ "loss": 0.6163,
989
+ "step": 980
990
+ },
991
+ {
992
+ "epoch": 1.5558620689655172,
993
+ "grad_norm": 0.2818867862224579,
994
+ "learning_rate": 2.9629007018418985e-05,
995
+ "loss": 0.6201,
996
+ "step": 987
997
+ },
998
+ {
999
+ "epoch": 1.5668965517241378,
1000
+ "grad_norm": 0.28501781821250916,
1001
+ "learning_rate": 2.8914291857234636e-05,
1002
+ "loss": 0.6308,
1003
+ "step": 994
1004
+ },
1005
+ {
1006
+ "epoch": 1.5779310344827586,
1007
+ "grad_norm": 0.2769615352153778,
1008
+ "learning_rate": 2.8204779279022276e-05,
1009
+ "loss": 0.6063,
1010
+ "step": 1001
1011
+ },
1012
+ {
1013
+ "epoch": 1.5889655172413795,
1014
+ "grad_norm": 0.2790084481239319,
1015
+ "learning_rate": 2.7500644345393943e-05,
1016
+ "loss": 0.6325,
1017
+ "step": 1008
1018
+ },
1019
+ {
1020
+ "epoch": 1.6,
1021
+ "grad_norm": 0.2709619402885437,
1022
+ "learning_rate": 2.68020607911083e-05,
1023
+ "loss": 0.5982,
1024
+ "step": 1015
1025
+ },
1026
+ {
1027
+ "epoch": 1.6110344827586207,
1028
+ "grad_norm": 0.2806256115436554,
1029
+ "learning_rate": 2.610920098120424e-05,
1030
+ "loss": 0.6453,
1031
+ "step": 1022
1032
+ },
1033
+ {
1034
+ "epoch": 1.6220689655172413,
1035
+ "grad_norm": 0.2840186059474945,
1036
+ "learning_rate": 2.5422235868472345e-05,
1037
+ "loss": 0.6243,
1038
+ "step": 1029
1039
+ },
1040
+ {
1041
+ "epoch": 1.633103448275862,
1042
+ "grad_norm": 0.2787083089351654,
1043
+ "learning_rate": 2.4741334951274947e-05,
1044
+ "loss": 0.6097,
1045
+ "step": 1036
1046
+ },
1047
+ {
1048
+ "epoch": 1.6441379310344828,
1049
+ "grad_norm": 0.27798426151275635,
1050
+ "learning_rate": 2.40666662317248e-05,
1051
+ "loss": 0.616,
1052
+ "step": 1043
1053
+ },
1054
+ {
1055
+ "epoch": 1.6551724137931034,
1056
+ "grad_norm": 0.27529022097587585,
1057
+ "learning_rate": 2.3398396174233178e-05,
1058
+ "loss": 0.6207,
1059
+ "step": 1050
1060
+ },
1061
+ {
1062
+ "epoch": 1.6662068965517243,
1063
+ "grad_norm": 0.2800416946411133,
1064
+ "learning_rate": 2.2736689664437217e-05,
1065
+ "loss": 0.6273,
1066
+ "step": 1057
1067
+ },
1068
+ {
1069
+ "epoch": 1.677241379310345,
1070
+ "grad_norm": 0.2721683979034424,
1071
+ "learning_rate": 2.2081709968516866e-05,
1072
+ "loss": 0.6,
1073
+ "step": 1064
1074
+ },
1075
+ {
1076
+ "epoch": 1.6882758620689655,
1077
+ "grad_norm": 0.29215288162231445,
1078
+ "learning_rate": 2.1433618692911467e-05,
1079
+ "loss": 0.6105,
1080
+ "step": 1071
1081
+ },
1082
+ {
1083
+ "epoch": 1.6993103448275861,
1084
+ "grad_norm": 0.2790670692920685,
1085
+ "learning_rate": 2.0792575744445653e-05,
1086
+ "loss": 0.6028,
1087
+ "step": 1078
1088
+ },
1089
+ {
1090
+ "epoch": 1.7103448275862068,
1091
+ "grad_norm": 0.28281012177467346,
1092
+ "learning_rate": 2.015873929087482e-05,
1093
+ "loss": 0.6168,
1094
+ "step": 1085
1095
+ },
1096
+ {
1097
+ "epoch": 1.7213793103448276,
1098
+ "grad_norm": 0.2883938252925873,
1099
+ "learning_rate": 1.95322657218596e-05,
1100
+ "loss": 0.6045,
1101
+ "step": 1092
1102
+ },
1103
+ {
1104
+ "epoch": 1.7324137931034482,
1105
+ "grad_norm": 0.27805426716804504,
1106
+ "learning_rate": 1.8913309610379015e-05,
1107
+ "loss": 0.6154,
1108
+ "step": 1099
1109
+ },
1110
+ {
1111
+ "epoch": 1.743448275862069,
1112
+ "grad_norm": 0.27731558680534363,
1113
+ "learning_rate": 1.8302023674591935e-05,
1114
+ "loss": 0.6098,
1115
+ "step": 1106
1116
+ },
1117
+ {
1118
+ "epoch": 1.7544827586206897,
1119
+ "grad_norm": 0.2910196781158447,
1120
+ "learning_rate": 1.7698558740156135e-05,
1121
+ "loss": 0.6106,
1122
+ "step": 1113
1123
+ },
1124
+ {
1125
+ "epoch": 1.7655172413793103,
1126
+ "grad_norm": 0.27517473697662354,
1127
+ "learning_rate": 1.7103063703014372e-05,
1128
+ "loss": 0.6239,
1129
+ "step": 1120
1130
+ },
1131
+ {
1132
+ "epoch": 1.776551724137931,
1133
+ "grad_norm": 0.26991525292396545,
1134
+ "learning_rate": 1.6515685492656467e-05,
1135
+ "loss": 0.602,
1136
+ "step": 1127
1137
+ },
1138
+ {
1139
+ "epoch": 1.7875862068965516,
1140
+ "grad_norm": 0.28398916125297546,
1141
+ "learning_rate": 1.59365690358667e-05,
1142
+ "loss": 0.6138,
1143
+ "step": 1134
1144
+ },
1145
+ {
1146
+ "epoch": 1.7986206896551724,
1147
+ "grad_norm": 0.2737048268318176,
1148
+ "learning_rate": 1.5365857220965275e-05,
1149
+ "loss": 0.6034,
1150
+ "step": 1141
1151
+ },
1152
+ {
1153
+ "epoch": 1.8096551724137933,
1154
+ "grad_norm": 0.27919578552246094,
1155
+ "learning_rate": 1.4803690862552755e-05,
1156
+ "loss": 0.6126,
1157
+ "step": 1148
1158
+ },
1159
+ {
1160
+ "epoch": 1.8206896551724139,
1161
+ "grad_norm": 0.2809952199459076,
1162
+ "learning_rate": 1.4250208666766235e-05,
1163
+ "loss": 0.607,
1164
+ "step": 1155
1165
+ },
1166
+ {
1167
+ "epoch": 1.8317241379310345,
1168
+ "grad_norm": 0.28192582726478577,
1169
+ "learning_rate": 1.3705547197055584e-05,
1170
+ "loss": 0.6064,
1171
+ "step": 1162
1172
+ },
1173
+ {
1174
+ "epoch": 1.8427586206896551,
1175
+ "grad_norm": 0.2873040735721588,
1176
+ "learning_rate": 1.3169840840488501e-05,
1177
+ "loss": 0.6173,
1178
+ "step": 1169
1179
+ },
1180
+ {
1181
+ "epoch": 1.8537931034482757,
1182
+ "grad_norm": 0.2937031388282776,
1183
+ "learning_rate": 1.2643221774592518e-05,
1184
+ "loss": 0.6148,
1185
+ "step": 1176
1186
+ },
1187
+ {
1188
+ "epoch": 1.8648275862068966,
1189
+ "grad_norm": 0.28401997685432434,
1190
+ "learning_rate": 1.2125819934742188e-05,
1191
+ "loss": 0.6027,
1192
+ "step": 1183
1193
+ },
1194
+ {
1195
+ "epoch": 1.8758620689655172,
1196
+ "grad_norm": 0.2690780758857727,
1197
+ "learning_rate": 1.1617762982099446e-05,
1198
+ "loss": 0.6089,
1199
+ "step": 1190
1200
+ },
1201
+ {
1202
+ "epoch": 1.886896551724138,
1203
+ "grad_norm": 0.28273099660873413,
1204
+ "learning_rate": 1.1119176272115128e-05,
1205
+ "loss": 0.6017,
1206
+ "step": 1197
1207
+ },
1208
+ {
1209
+ "epoch": 1.8979310344827587,
1210
+ "grad_norm": 0.2761295437812805,
1211
+ "learning_rate": 1.0630182823599399e-05,
1212
+ "loss": 0.6156,
1213
+ "step": 1204
1214
+ },
1215
+ {
1216
+ "epoch": 1.9089655172413793,
1217
+ "grad_norm": 0.2870751917362213,
1218
+ "learning_rate": 1.0150903288368741e-05,
1219
+ "loss": 0.6038,
1220
+ "step": 1211
1221
+ },
1222
+ {
1223
+ "epoch": 1.92,
1224
+ "grad_norm": 0.2764483690261841,
1225
+ "learning_rate": 9.681455921476839e-06,
1226
+ "loss": 0.603,
1227
+ "step": 1218
1228
+ },
1229
+ {
1230
+ "epoch": 1.9310344827586206,
1231
+ "grad_norm": 0.2793201208114624,
1232
+ "learning_rate": 9.221956552036992e-06,
1233
+ "loss": 0.6194,
1234
+ "step": 1225
1235
+ },
1236
+ {
1237
+ "epoch": 1.9420689655172414,
1238
+ "grad_norm": 0.27977249026298523,
1239
+ "learning_rate": 8.772518554642973e-06,
1240
+ "loss": 0.622,
1241
+ "step": 1232
1242
+ },
1243
+ {
1244
+ "epoch": 1.953103448275862,
1245
+ "grad_norm": 0.27671122550964355,
1246
+ "learning_rate": 8.333252821395526e-06,
1247
+ "loss": 0.6083,
1248
+ "step": 1239
1249
+ },
1250
+ {
1251
+ "epoch": 1.9641379310344829,
1252
+ "grad_norm": 0.2808462679386139,
1253
+ "learning_rate": 7.904267734541498e-06,
1254
+ "loss": 0.6155,
1255
+ "step": 1246
1256
+ },
1257
+ {
1258
+ "epoch": 1.9751724137931035,
1259
+ "grad_norm": 0.2745875418186188,
1260
+ "learning_rate": 7.485669139732004e-06,
1261
+ "loss": 0.5934,
1262
+ "step": 1253
1263
+ },
1264
+ {
1265
+ "epoch": 1.986206896551724,
1266
+ "grad_norm": 0.2853251099586487,
1267
+ "learning_rate": 7.077560319906695e-06,
1268
+ "loss": 0.5992,
1269
+ "step": 1260
1270
+ },
1271
+ {
1272
+ "epoch": 1.9972413793103447,
1273
+ "grad_norm": 0.2884569764137268,
1274
+ "learning_rate": 6.680041969810203e-06,
1275
+ "loss": 0.5927,
1276
+ "step": 1267
  }
  ],
  "logging_steps": 7,
@@ -656,7 +1293,7 @@
  "attributes": {}
  }
  },
- "total_flos": 4.20931500638208e+17,
+ "total_flos": 8.41863001276416e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null