oliverchang commited on
Commit
3eba4da
·
verified ·
1 Parent(s): 7fa3523

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a91b27e568f3b3dc70a1d8c75412159ad33bd5aba0e7b22f6619b8f582318aa8
3
  size 47235968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abcb05ca92ec06fa9d499ee86e392b068c0b4a16b306c8b0b8ea0aa1596b0e2d
3
  size 47235968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10ffc27ba58cde7e8c21991475e2219e2eb942aa9d560b90a5a52bc8e4ed68be
3
  size 24411220
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cbe2fca5d55fd4900aac8ab5212444f1c730addc7623246ee262e01f7108055
3
  size 24411220
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbbe9bba5389ba64ff06ef6b3cdf88b7c65f4059eaa91f597c1c9ca6154ecf7b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca3766410a42bb4ce032456de09de4a9260806e2a23e53e8723474208864451
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fea967412bd83a775b4d93f7216aebd2bc7764ecddeb461a0505a72239f0d159
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbad99671fe9e8fad7c9e5d1bd2f0b78776dc12ff7758703eb68552f8e997bf1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5766245287203371,
5
  "eval_steps": 50,
6
- "global_step": 650,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4669,6 +4669,364 @@
4669
  "eval_samples_per_second": 27.077,
4670
  "eval_steps_per_second": 13.567,
4671
  "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4672
  }
4673
  ],
4674
  "logging_steps": 1,
@@ -4688,7 +5046,7 @@
4688
  "attributes": {}
4689
  }
4690
  },
4691
- "total_flos": 4.2498426667008e+16,
4692
  "train_batch_size": 2,
4693
  "trial_name": null,
4694
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6209802616988246,
5
  "eval_steps": 50,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4669
  "eval_samples_per_second": 27.077,
4670
  "eval_steps_per_second": 13.567,
4671
  "step": 650
4672
+ },
4673
+ {
4674
+ "epoch": 0.5775116433799069,
4675
+ "grad_norm": 10.354761123657227,
4676
+ "learning_rate": 0.00019999984039399245,
4677
+ "loss": 0.3899,
4678
+ "step": 651
4679
+ },
4680
+ {
4681
+ "epoch": 0.5783987580394766,
4682
+ "grad_norm": 8.909992218017578,
4683
+ "learning_rate": 0.00019999983989561347,
4684
+ "loss": 0.398,
4685
+ "step": 652
4686
+ },
4687
+ {
4688
+ "epoch": 0.5792858726990463,
4689
+ "grad_norm": 7.4923906326293945,
4690
+ "learning_rate": 0.00019999983939645758,
4691
+ "loss": 0.3956,
4692
+ "step": 653
4693
+ },
4694
+ {
4695
+ "epoch": 0.5801729873586161,
4696
+ "grad_norm": 5.873891353607178,
4697
+ "learning_rate": 0.00019999983889652483,
4698
+ "loss": 0.368,
4699
+ "step": 654
4700
+ },
4701
+ {
4702
+ "epoch": 0.5810601020181858,
4703
+ "grad_norm": 7.360598087310791,
4704
+ "learning_rate": 0.00019999983839581517,
4705
+ "loss": 0.4163,
4706
+ "step": 655
4707
+ },
4708
+ {
4709
+ "epoch": 0.5819472166777556,
4710
+ "grad_norm": 10.175065994262695,
4711
+ "learning_rate": 0.0001999998378943286,
4712
+ "loss": 0.4937,
4713
+ "step": 656
4714
+ },
4715
+ {
4716
+ "epoch": 0.5828343313373253,
4717
+ "grad_norm": 10.308756828308105,
4718
+ "learning_rate": 0.00019999983739206516,
4719
+ "loss": 0.4576,
4720
+ "step": 657
4721
+ },
4722
+ {
4723
+ "epoch": 0.5837214459968951,
4724
+ "grad_norm": 6.791981220245361,
4725
+ "learning_rate": 0.0001999998368890248,
4726
+ "loss": 0.455,
4727
+ "step": 658
4728
+ },
4729
+ {
4730
+ "epoch": 0.5846085606564648,
4731
+ "grad_norm": 9.735336303710938,
4732
+ "learning_rate": 0.00019999983638520753,
4733
+ "loss": 0.5393,
4734
+ "step": 659
4735
+ },
4736
+ {
4737
+ "epoch": 0.5854956753160346,
4738
+ "grad_norm": 7.667912006378174,
4739
+ "learning_rate": 0.0001999998358806134,
4740
+ "loss": 0.5499,
4741
+ "step": 660
4742
+ },
4743
+ {
4744
+ "epoch": 0.5863827899756043,
4745
+ "grad_norm": 9.633892059326172,
4746
+ "learning_rate": 0.00019999983537524236,
4747
+ "loss": 0.4529,
4748
+ "step": 661
4749
+ },
4750
+ {
4751
+ "epoch": 0.5872699046351741,
4752
+ "grad_norm": 6.23357629776001,
4753
+ "learning_rate": 0.00019999983486909445,
4754
+ "loss": 0.414,
4755
+ "step": 662
4756
+ },
4757
+ {
4758
+ "epoch": 0.5881570192947438,
4759
+ "grad_norm": 11.319755554199219,
4760
+ "learning_rate": 0.0001999998343621696,
4761
+ "loss": 0.5764,
4762
+ "step": 663
4763
+ },
4764
+ {
4765
+ "epoch": 0.5890441339543135,
4766
+ "grad_norm": 13.453693389892578,
4767
+ "learning_rate": 0.0001999998338544679,
4768
+ "loss": 0.6609,
4769
+ "step": 664
4770
+ },
4771
+ {
4772
+ "epoch": 0.5899312486138834,
4773
+ "grad_norm": 18.999950408935547,
4774
+ "learning_rate": 0.00019999983334598927,
4775
+ "loss": 0.5154,
4776
+ "step": 665
4777
+ },
4778
+ {
4779
+ "epoch": 0.590818363273453,
4780
+ "grad_norm": 10.19577693939209,
4781
+ "learning_rate": 0.00019999983283673376,
4782
+ "loss": 0.5638,
4783
+ "step": 666
4784
+ },
4785
+ {
4786
+ "epoch": 0.5917054779330229,
4787
+ "grad_norm": 15.789012908935547,
4788
+ "learning_rate": 0.00019999983232670134,
4789
+ "loss": 0.5514,
4790
+ "step": 667
4791
+ },
4792
+ {
4793
+ "epoch": 0.5925925925925926,
4794
+ "grad_norm": 7.380650997161865,
4795
+ "learning_rate": 0.00019999983181589204,
4796
+ "loss": 0.5238,
4797
+ "step": 668
4798
+ },
4799
+ {
4800
+ "epoch": 0.5934797072521624,
4801
+ "grad_norm": 12.285819053649902,
4802
+ "learning_rate": 0.00019999983130430585,
4803
+ "loss": 0.6751,
4804
+ "step": 669
4805
+ },
4806
+ {
4807
+ "epoch": 0.5943668219117321,
4808
+ "grad_norm": 10.537954330444336,
4809
+ "learning_rate": 0.00019999983079194272,
4810
+ "loss": 0.5944,
4811
+ "step": 670
4812
+ },
4813
+ {
4814
+ "epoch": 0.5952539365713019,
4815
+ "grad_norm": 7.963987827301025,
4816
+ "learning_rate": 0.00019999983027880274,
4817
+ "loss": 0.5648,
4818
+ "step": 671
4819
+ },
4820
+ {
4821
+ "epoch": 0.5961410512308716,
4822
+ "grad_norm": 13.050141334533691,
4823
+ "learning_rate": 0.00019999982976488586,
4824
+ "loss": 0.8043,
4825
+ "step": 672
4826
+ },
4827
+ {
4828
+ "epoch": 0.5970281658904414,
4829
+ "grad_norm": 17.245141983032227,
4830
+ "learning_rate": 0.00019999982925019208,
4831
+ "loss": 0.5685,
4832
+ "step": 673
4833
+ },
4834
+ {
4835
+ "epoch": 0.5979152805500111,
4836
+ "grad_norm": 9.825310707092285,
4837
+ "learning_rate": 0.00019999982873472139,
4838
+ "loss": 0.403,
4839
+ "step": 674
4840
+ },
4841
+ {
4842
+ "epoch": 0.5988023952095808,
4843
+ "grad_norm": 10.039129257202148,
4844
+ "learning_rate": 0.00019999982821847383,
4845
+ "loss": 0.6127,
4846
+ "step": 675
4847
+ },
4848
+ {
4849
+ "epoch": 0.5996895098691506,
4850
+ "grad_norm": 7.191605091094971,
4851
+ "learning_rate": 0.00019999982770144937,
4852
+ "loss": 0.4647,
4853
+ "step": 676
4854
+ },
4855
+ {
4856
+ "epoch": 0.6005766245287203,
4857
+ "grad_norm": 6.5292181968688965,
4858
+ "learning_rate": 0.000199999827183648,
4859
+ "loss": 0.5513,
4860
+ "step": 677
4861
+ },
4862
+ {
4863
+ "epoch": 0.6014637391882901,
4864
+ "grad_norm": 7.8640666007995605,
4865
+ "learning_rate": 0.00019999982666506972,
4866
+ "loss": 0.3423,
4867
+ "step": 678
4868
+ },
4869
+ {
4870
+ "epoch": 0.6023508538478598,
4871
+ "grad_norm": 36.11716842651367,
4872
+ "learning_rate": 0.00019999982614571458,
4873
+ "loss": 0.5195,
4874
+ "step": 679
4875
+ },
4876
+ {
4877
+ "epoch": 0.6032379685074296,
4878
+ "grad_norm": 11.68887996673584,
4879
+ "learning_rate": 0.00019999982562558252,
4880
+ "loss": 0.4774,
4881
+ "step": 680
4882
+ },
4883
+ {
4884
+ "epoch": 0.6041250831669993,
4885
+ "grad_norm": 11.403864860534668,
4886
+ "learning_rate": 0.00019999982510467357,
4887
+ "loss": 0.7514,
4888
+ "step": 681
4889
+ },
4890
+ {
4891
+ "epoch": 0.6050121978265691,
4892
+ "grad_norm": 6.176864147186279,
4893
+ "learning_rate": 0.00019999982458298774,
4894
+ "loss": 0.4794,
4895
+ "step": 682
4896
+ },
4897
+ {
4898
+ "epoch": 0.6058993124861388,
4899
+ "grad_norm": 9.716928482055664,
4900
+ "learning_rate": 0.000199999824060525,
4901
+ "loss": 0.5677,
4902
+ "step": 683
4903
+ },
4904
+ {
4905
+ "epoch": 0.6067864271457086,
4906
+ "grad_norm": 6.658013343811035,
4907
+ "learning_rate": 0.00019999982353728537,
4908
+ "loss": 0.4459,
4909
+ "step": 684
4910
+ },
4911
+ {
4912
+ "epoch": 0.6076735418052783,
4913
+ "grad_norm": 4.054922580718994,
4914
+ "learning_rate": 0.00019999982301326886,
4915
+ "loss": 0.296,
4916
+ "step": 685
4917
+ },
4918
+ {
4919
+ "epoch": 0.608560656464848,
4920
+ "grad_norm": 7.6882710456848145,
4921
+ "learning_rate": 0.0001999998224884754,
4922
+ "loss": 0.5358,
4923
+ "step": 686
4924
+ },
4925
+ {
4926
+ "epoch": 0.6094477711244178,
4927
+ "grad_norm": 10.745440483093262,
4928
+ "learning_rate": 0.00019999982196290512,
4929
+ "loss": 0.4966,
4930
+ "step": 687
4931
+ },
4932
+ {
4933
+ "epoch": 0.6103348857839875,
4934
+ "grad_norm": 6.231947422027588,
4935
+ "learning_rate": 0.0001999998214365579,
4936
+ "loss": 0.4244,
4937
+ "step": 688
4938
+ },
4939
+ {
4940
+ "epoch": 0.6112220004435573,
4941
+ "grad_norm": 6.25615930557251,
4942
+ "learning_rate": 0.0001999998209094338,
4943
+ "loss": 0.3549,
4944
+ "step": 689
4945
+ },
4946
+ {
4947
+ "epoch": 0.612109115103127,
4948
+ "grad_norm": 5.80368185043335,
4949
+ "learning_rate": 0.00019999982038153277,
4950
+ "loss": 0.4522,
4951
+ "step": 690
4952
+ },
4953
+ {
4954
+ "epoch": 0.6129962297626969,
4955
+ "grad_norm": 8.078680038452148,
4956
+ "learning_rate": 0.0001999998198528549,
4957
+ "loss": 0.5348,
4958
+ "step": 691
4959
+ },
4960
+ {
4961
+ "epoch": 0.6138833444222666,
4962
+ "grad_norm": 8.55868148803711,
4963
+ "learning_rate": 0.00019999981932340008,
4964
+ "loss": 0.5859,
4965
+ "step": 692
4966
+ },
4967
+ {
4968
+ "epoch": 0.6147704590818364,
4969
+ "grad_norm": 10.13481616973877,
4970
+ "learning_rate": 0.00019999981879316838,
4971
+ "loss": 0.5749,
4972
+ "step": 693
4973
+ },
4974
+ {
4975
+ "epoch": 0.6156575737414061,
4976
+ "grad_norm": 9.58573055267334,
4977
+ "learning_rate": 0.00019999981826215982,
4978
+ "loss": 0.4413,
4979
+ "step": 694
4980
+ },
4981
+ {
4982
+ "epoch": 0.6165446884009759,
4983
+ "grad_norm": 10.3484525680542,
4984
+ "learning_rate": 0.00019999981773037432,
4985
+ "loss": 0.3737,
4986
+ "step": 695
4987
+ },
4988
+ {
4989
+ "epoch": 0.6174318030605456,
4990
+ "grad_norm": 5.4196672439575195,
4991
+ "learning_rate": 0.00019999981719781196,
4992
+ "loss": 0.3963,
4993
+ "step": 696
4994
+ },
4995
+ {
4996
+ "epoch": 0.6183189177201154,
4997
+ "grad_norm": 11.233104705810547,
4998
+ "learning_rate": 0.00019999981666447267,
4999
+ "loss": 0.6738,
5000
+ "step": 697
5001
+ },
5002
+ {
5003
+ "epoch": 0.6192060323796851,
5004
+ "grad_norm": 7.30211067199707,
5005
+ "learning_rate": 0.0001999998161303565,
5006
+ "loss": 0.4252,
5007
+ "step": 698
5008
+ },
5009
+ {
5010
+ "epoch": 0.6200931470392548,
5011
+ "grad_norm": 12.224394798278809,
5012
+ "learning_rate": 0.00019999981559546345,
5013
+ "loss": 0.3892,
5014
+ "step": 699
5015
+ },
5016
+ {
5017
+ "epoch": 0.6209802616988246,
5018
+ "grad_norm": 8.769978523254395,
5019
+ "learning_rate": 0.0001999998150597935,
5020
+ "loss": 0.4835,
5021
+ "step": 700
5022
+ },
5023
+ {
5024
+ "epoch": 0.6209802616988246,
5025
+ "eval_loss": 0.4917793571949005,
5026
+ "eval_runtime": 17.5512,
5027
+ "eval_samples_per_second": 27.064,
5028
+ "eval_steps_per_second": 13.56,
5029
+ "step": 700
5030
  }
5031
  ],
5032
  "logging_steps": 1,
 
5046
  "attributes": {}
5047
  }
5048
  },
5049
+ "total_flos": 4.5767536410624e+16,
5050
  "train_batch_size": 2,
5051
  "trial_name": null,
5052
  "trial_params": null