oliverchang commited on
Commit
85e98b5
·
verified ·
1 Parent(s): e47f614

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cae1214709ea8470efc2b34c46b75323420040ae64c043d801b33038d1180507
3
  size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d578bb35c317e7e525db203822ae41cce5d685a3cde394257758d4000ede4c6
3
  size 80013120
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb971f7a2b9f6711cef8ea9affec03fcd50bd861777143766b5cb7e0bcb63ecf
3
  size 41119636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d5812584bde22bf01fc0c56c6dbe8f3c19965476007b408bc249ebf8cf4edbd
3
  size 41119636
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:139ba3629a1c48e660d6a05bd55c717dfb1aea59399165fe4f210e37b4e7af4e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eafb18d98f4c54a0a319ed9fd7490c4afd4dd5b2d57902a27826238fb340a960
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1daea96750ba9f8c361f7e6b8ab82396a1c3edd4a36217a7c055be604f422b5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2231a499fc6249ec0c6a54e630ff27aff6a281425b76ecea459adede6c9680b9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6309148264984227,
5
  "eval_steps": 500,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -707,6 +707,356 @@
707
  "learning_rate": 9.999991128530895e-05,
708
  "loss": 0.5321,
709
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
  }
711
  ],
712
  "logging_steps": 1,
@@ -726,7 +1076,7 @@
726
  "attributes": {}
727
  }
728
  },
729
- "total_flos": 7.459671528544666e+16,
730
  "train_batch_size": 4,
731
  "trial_name": null,
732
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9463722397476341,
5
  "eval_steps": 500,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
707
  "learning_rate": 9.999991128530895e-05,
708
  "loss": 0.5321,
709
  "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.637223974763407,
713
+ "grad_norm": 0.3380476236343384,
714
+ "learning_rate": 9.999990940780191e-05,
715
+ "loss": 0.6164,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.6435331230283912,
720
+ "grad_norm": 0.3839210867881775,
721
+ "learning_rate": 9.999990751063516e-05,
722
+ "loss": 0.6901,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.6498422712933754,
727
+ "grad_norm": 0.36936458945274353,
728
+ "learning_rate": 9.999990559380867e-05,
729
+ "loss": 0.7658,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.6561514195583596,
734
+ "grad_norm": 0.37085050344467163,
735
+ "learning_rate": 9.999990365732244e-05,
736
+ "loss": 0.7844,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.6624605678233438,
741
+ "grad_norm": 0.36542758345603943,
742
+ "learning_rate": 9.999990170117648e-05,
743
+ "loss": 0.7223,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.668769716088328,
748
+ "grad_norm": 0.38479429483413696,
749
+ "learning_rate": 9.999989972537079e-05,
750
+ "loss": 0.7429,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.6750788643533123,
755
+ "grad_norm": 0.4295928180217743,
756
+ "learning_rate": 9.999989772990536e-05,
757
+ "loss": 0.6426,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.6813880126182965,
762
+ "grad_norm": 0.4287479817867279,
763
+ "learning_rate": 9.999989571478021e-05,
764
+ "loss": 0.7149,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.6876971608832808,
769
+ "grad_norm": 0.4698425531387329,
770
+ "learning_rate": 9.999989367999532e-05,
771
+ "loss": 0.707,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.694006309148265,
776
+ "grad_norm": 0.48254162073135376,
777
+ "learning_rate": 9.999989162555071e-05,
778
+ "loss": 0.7056,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.7003154574132492,
783
+ "grad_norm": 0.5717018842697144,
784
+ "learning_rate": 9.999988955144637e-05,
785
+ "loss": 0.6857,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.7066246056782335,
790
+ "grad_norm": 0.60892254114151,
791
+ "learning_rate": 9.99998874576823e-05,
792
+ "loss": 0.6499,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.7129337539432177,
797
+ "grad_norm": 0.7676037549972534,
798
+ "learning_rate": 9.99998853442585e-05,
799
+ "loss": 0.7463,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.7192429022082019,
804
+ "grad_norm": 0.9815330505371094,
805
+ "learning_rate": 9.999988321117497e-05,
806
+ "loss": 0.8505,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.7255520504731862,
811
+ "grad_norm": 1.2696208953857422,
812
+ "learning_rate": 9.999988105843173e-05,
813
+ "loss": 0.7409,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.7318611987381703,
818
+ "grad_norm": 1.1130510568618774,
819
+ "learning_rate": 9.999987888602875e-05,
820
+ "loss": 0.3319,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.7381703470031545,
825
+ "grad_norm": 0.7321246862411499,
826
+ "learning_rate": 9.999987669396606e-05,
827
+ "loss": 0.38,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.7444794952681388,
832
+ "grad_norm": 0.27002546191215515,
833
+ "learning_rate": 9.999987448224363e-05,
834
+ "loss": 0.8287,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.750788643533123,
839
+ "grad_norm": 0.24602839350700378,
840
+ "learning_rate": 9.99998722508615e-05,
841
+ "loss": 0.9131,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.7570977917981072,
846
+ "grad_norm": 0.25864341855049133,
847
+ "learning_rate": 9.999986999981963e-05,
848
+ "loss": 0.9195,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.7634069400630915,
853
+ "grad_norm": 0.2722395956516266,
854
+ "learning_rate": 9.999986772911804e-05,
855
+ "loss": 0.9736,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.7697160883280757,
860
+ "grad_norm": 0.27622660994529724,
861
+ "learning_rate": 9.999986543875674e-05,
862
+ "loss": 0.867,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.7760252365930599,
867
+ "grad_norm": 0.3594439625740051,
868
+ "learning_rate": 9.999986312873572e-05,
869
+ "loss": 0.9813,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.7823343848580442,
874
+ "grad_norm": 0.3490639925003052,
875
+ "learning_rate": 9.999986079905499e-05,
876
+ "loss": 0.8835,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.7886435331230284,
881
+ "grad_norm": 0.34569522738456726,
882
+ "learning_rate": 9.999985844971453e-05,
883
+ "loss": 0.8828,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.7949526813880127,
888
+ "grad_norm": 0.3032309412956238,
889
+ "learning_rate": 9.999985608071438e-05,
890
+ "loss": 0.7422,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.8012618296529969,
895
+ "grad_norm": 0.2762659192085266,
896
+ "learning_rate": 9.99998536920545e-05,
897
+ "loss": 0.7403,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.807570977917981,
902
+ "grad_norm": 0.29897361993789673,
903
+ "learning_rate": 9.999985128373489e-05,
904
+ "loss": 0.852,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.8138801261829653,
909
+ "grad_norm": 0.2828134000301361,
910
+ "learning_rate": 9.999984885575557e-05,
911
+ "loss": 0.7355,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.8201892744479495,
916
+ "grad_norm": 0.28995025157928467,
917
+ "learning_rate": 9.999984640811656e-05,
918
+ "loss": 0.8051,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.8264984227129337,
923
+ "grad_norm": 0.2877126932144165,
924
+ "learning_rate": 9.999984394081783e-05,
925
+ "loss": 0.684,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.832807570977918,
930
+ "grad_norm": 0.32978740334510803,
931
+ "learning_rate": 9.999984145385939e-05,
932
+ "loss": 0.7361,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.8391167192429022,
937
+ "grad_norm": 0.2849004566669464,
938
+ "learning_rate": 9.999983894724123e-05,
939
+ "loss": 0.8116,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.8454258675078864,
944
+ "grad_norm": 0.2859273850917816,
945
+ "learning_rate": 9.999983642096338e-05,
946
+ "loss": 0.5324,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.8517350157728707,
951
+ "grad_norm": 0.3232719302177429,
952
+ "learning_rate": 9.999983387502581e-05,
953
+ "loss": 0.7514,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.8580441640378549,
958
+ "grad_norm": 0.3699396252632141,
959
+ "learning_rate": 9.999983130942854e-05,
960
+ "loss": 0.9407,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.8643533123028391,
965
+ "grad_norm": 0.3257301449775696,
966
+ "learning_rate": 9.999982872417156e-05,
967
+ "loss": 0.6552,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.8706624605678234,
972
+ "grad_norm": 0.34637680649757385,
973
+ "learning_rate": 9.999982611925488e-05,
974
+ "loss": 0.7539,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.8769716088328076,
979
+ "grad_norm": 0.38663750886917114,
980
+ "learning_rate": 9.99998234946785e-05,
981
+ "loss": 0.7163,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.8832807570977917,
986
+ "grad_norm": 0.33887526392936707,
987
+ "learning_rate": 9.999982085044242e-05,
988
+ "loss": 0.6362,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.889589905362776,
993
+ "grad_norm": 0.3487693667411804,
994
+ "learning_rate": 9.999981818654662e-05,
995
+ "loss": 0.7196,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.8958990536277602,
1000
+ "grad_norm": 0.3876301348209381,
1001
+ "learning_rate": 9.999981550299115e-05,
1002
+ "loss": 0.8779,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.9022082018927445,
1007
+ "grad_norm": 0.39342001080513,
1008
+ "learning_rate": 9.999981279977596e-05,
1009
+ "loss": 0.6961,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.9085173501577287,
1014
+ "grad_norm": 0.37053290009498596,
1015
+ "learning_rate": 9.999981007690108e-05,
1016
+ "loss": 0.5283,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.9148264984227129,
1021
+ "grad_norm": 0.39710113406181335,
1022
+ "learning_rate": 9.99998073343665e-05,
1023
+ "loss": 0.6993,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.9211356466876972,
1028
+ "grad_norm": 0.3932151794433594,
1029
+ "learning_rate": 9.999980457217224e-05,
1030
+ "loss": 0.5497,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.9274447949526814,
1035
+ "grad_norm": 0.41066697239875793,
1036
+ "learning_rate": 9.999980179031826e-05,
1037
+ "loss": 0.5879,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.9337539432176656,
1042
+ "grad_norm": 0.5037790536880493,
1043
+ "learning_rate": 9.99997989888046e-05,
1044
+ "loss": 0.8422,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.9400630914826499,
1049
+ "grad_norm": 0.5321910381317139,
1050
+ "learning_rate": 9.999979616763126e-05,
1051
+ "loss": 0.7902,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.9463722397476341,
1056
+ "grad_norm": 0.5393354892730713,
1057
+ "learning_rate": 9.99997933267982e-05,
1058
+ "loss": 0.6235,
1059
+ "step": 150
1060
  }
1061
  ],
1062
  "logging_steps": 1,
 
1076
  "attributes": {}
1077
  }
1078
  },
1079
+ "total_flos": 1.1026632368612966e+17,
1080
  "train_batch_size": 4,
1081
  "trial_name": null,
1082
  "trial_params": null