oliverchang commited on
Commit
26625b1
·
verified ·
1 Parent(s): f852845

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a06a0e47ba651e599e4b4766058ec931903a2cc546d08844cdbe699276f5f83a
3
  size 59933632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:418bcebcdfb7bb46077a0a8d3c77de02f80d94b6485b8050123d8dc674da1fd0
3
  size 59933632
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dde81ccfc1d6e7f89434439a1d6f64f76e1c9379d5ef6122b86e48182afe1d2
3
  size 31823460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27cac1d8665684f8fb9ae47b03b08212872a04f9d41c081c6f66d7ddc18d1571
3
  size 31823460
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4cdd7aec1abfd0c0395a28df47d40423c06a84d0d1ffe7f8ccd7c936e92670e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:401e8bcffbfaba4d317e2a89edb4f1073b7d8a172738af37a2a13688139c01d3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:514265d9316b4b9174cfc4ba2a301feeeb4433551f601594176fbaa00014d4c4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d8a72170922eb22149c61e4763d188f7c858219b1f34e4777fff2bb46fb290
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03460714238177002,
5
  "eval_steps": 500,
6
- "global_step": 650,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4557,6 +4557,356 @@
4557
  "learning_rate": 9.999999970901543e-05,
4558
  "loss": 3.4294,
4559
  "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4560
  }
4561
  ],
4562
  "logging_steps": 1,
@@ -4576,7 +4926,7 @@
4576
  "attributes": {}
4577
  }
4578
  },
4579
- "total_flos": 1.7832628604790374e+17,
4580
  "train_batch_size": 4,
4581
  "trial_name": null,
4582
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03726923025729079,
5
  "eval_steps": 500,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4557
  "learning_rate": 9.999999970901543e-05,
4558
  "loss": 3.4294,
4559
  "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 0.03466038413928044,
4563
+ "grad_norm": 0.3088901937007904,
4564
+ "learning_rate": 9.999999970811245e-05,
4565
+ "loss": 2.7029,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 0.034713625896790855,
4570
+ "grad_norm": 0.41135066747665405,
4571
+ "learning_rate": 9.999999970720808e-05,
4572
+ "loss": 2.7839,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 0.034766867654301266,
4577
+ "grad_norm": 0.42486026883125305,
4578
+ "learning_rate": 9.999999970630231e-05,
4579
+ "loss": 2.8996,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 0.034820109411811684,
4584
+ "grad_norm": 0.4523933529853821,
4585
+ "learning_rate": 9.999999970539513e-05,
4586
+ "loss": 2.8055,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 0.0348733511693221,
4591
+ "grad_norm": 0.4108598828315735,
4592
+ "learning_rate": 9.999999970448655e-05,
4593
+ "loss": 2.7709,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 0.03492659292683251,
4598
+ "grad_norm": 0.38042157888412476,
4599
+ "learning_rate": 9.999999970357658e-05,
4600
+ "loss": 2.8717,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 0.03497983468434293,
4605
+ "grad_norm": 0.37943702936172485,
4606
+ "learning_rate": 9.999999970266523e-05,
4607
+ "loss": 2.6056,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 0.03503307644185335,
4612
+ "grad_norm": 0.3546702265739441,
4613
+ "learning_rate": 9.999999970175244e-05,
4614
+ "loss": 2.8449,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 0.03508631819936376,
4619
+ "grad_norm": 0.37441372871398926,
4620
+ "learning_rate": 9.999999970083828e-05,
4621
+ "loss": 2.7897,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 0.03513955995687418,
4626
+ "grad_norm": 0.3513989746570587,
4627
+ "learning_rate": 9.999999969992271e-05,
4628
+ "loss": 2.7763,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 0.03519280171438459,
4633
+ "grad_norm": 0.35834649205207825,
4634
+ "learning_rate": 9.999999969900575e-05,
4635
+ "loss": 2.7976,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 0.03524604347189501,
4640
+ "grad_norm": 0.37155264616012573,
4641
+ "learning_rate": 9.999999969808738e-05,
4642
+ "loss": 2.8712,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 0.035299285229405425,
4647
+ "grad_norm": 0.3758937120437622,
4648
+ "learning_rate": 9.999999969716762e-05,
4649
+ "loss": 2.7646,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 0.035352526986915836,
4654
+ "grad_norm": 0.3764578700065613,
4655
+ "learning_rate": 9.999999969624645e-05,
4656
+ "loss": 2.8118,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 0.035405768744426254,
4661
+ "grad_norm": 0.380533903837204,
4662
+ "learning_rate": 9.99999996953239e-05,
4663
+ "loss": 2.9565,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 0.03545901050193667,
4668
+ "grad_norm": 0.36998698115348816,
4669
+ "learning_rate": 9.999999969439992e-05,
4670
+ "loss": 2.783,
4671
+ "step": 666
4672
+ },
4673
+ {
4674
+ "epoch": 0.03551225225944708,
4675
+ "grad_norm": 0.399178147315979,
4676
+ "learning_rate": 9.999999969347458e-05,
4677
+ "loss": 2.9048,
4678
+ "step": 667
4679
+ },
4680
+ {
4681
+ "epoch": 0.0355654940169575,
4682
+ "grad_norm": 0.40703439712524414,
4683
+ "learning_rate": 9.999999969254782e-05,
4684
+ "loss": 2.8838,
4685
+ "step": 668
4686
+ },
4687
+ {
4688
+ "epoch": 0.03561873577446791,
4689
+ "grad_norm": 0.39457255601882935,
4690
+ "learning_rate": 9.999999969161966e-05,
4691
+ "loss": 2.6732,
4692
+ "step": 669
4693
+ },
4694
+ {
4695
+ "epoch": 0.03567197753197833,
4696
+ "grad_norm": 0.4186328053474426,
4697
+ "learning_rate": 9.999999969069011e-05,
4698
+ "loss": 2.8089,
4699
+ "step": 670
4700
+ },
4701
+ {
4702
+ "epoch": 0.03572521928948875,
4703
+ "grad_norm": 0.4049818813800812,
4704
+ "learning_rate": 9.999999968975914e-05,
4705
+ "loss": 2.7544,
4706
+ "step": 671
4707
+ },
4708
+ {
4709
+ "epoch": 0.03577846104699916,
4710
+ "grad_norm": 0.41349250078201294,
4711
+ "learning_rate": 9.99999996888268e-05,
4712
+ "loss": 2.8557,
4713
+ "step": 672
4714
+ },
4715
+ {
4716
+ "epoch": 0.03583170280450958,
4717
+ "grad_norm": 0.384772390127182,
4718
+ "learning_rate": 9.999999968789304e-05,
4719
+ "loss": 2.7758,
4720
+ "step": 673
4721
+ },
4722
+ {
4723
+ "epoch": 0.035884944562019995,
4724
+ "grad_norm": 0.39242029190063477,
4725
+ "learning_rate": 9.999999968695789e-05,
4726
+ "loss": 2.8561,
4727
+ "step": 674
4728
+ },
4729
+ {
4730
+ "epoch": 0.035938186319530406,
4731
+ "grad_norm": 0.4232184886932373,
4732
+ "learning_rate": 9.999999968602134e-05,
4733
+ "loss": 2.7526,
4734
+ "step": 675
4735
+ },
4736
+ {
4737
+ "epoch": 0.035991428077040824,
4738
+ "grad_norm": 0.3954784870147705,
4739
+ "learning_rate": 9.999999968508339e-05,
4740
+ "loss": 2.8118,
4741
+ "step": 676
4742
+ },
4743
+ {
4744
+ "epoch": 0.036044669834551235,
4745
+ "grad_norm": 0.4440658688545227,
4746
+ "learning_rate": 9.999999968414405e-05,
4747
+ "loss": 2.8997,
4748
+ "step": 677
4749
+ },
4750
+ {
4751
+ "epoch": 0.03609791159206165,
4752
+ "grad_norm": 0.4090384244918823,
4753
+ "learning_rate": 9.99999996832033e-05,
4754
+ "loss": 2.7277,
4755
+ "step": 678
4756
+ },
4757
+ {
4758
+ "epoch": 0.03615115334957207,
4759
+ "grad_norm": 0.4622509479522705,
4760
+ "learning_rate": 9.999999968226114e-05,
4761
+ "loss": 2.8473,
4762
+ "step": 679
4763
+ },
4764
+ {
4765
+ "epoch": 0.03620439510708248,
4766
+ "grad_norm": 0.44071659445762634,
4767
+ "learning_rate": 9.999999968131761e-05,
4768
+ "loss": 2.942,
4769
+ "step": 680
4770
+ },
4771
+ {
4772
+ "epoch": 0.0362576368645929,
4773
+ "grad_norm": 0.5004546046257019,
4774
+ "learning_rate": 9.999999968037266e-05,
4775
+ "loss": 2.8794,
4776
+ "step": 681
4777
+ },
4778
+ {
4779
+ "epoch": 0.03631087862210332,
4780
+ "grad_norm": 0.4791366159915924,
4781
+ "learning_rate": 9.999999967942633e-05,
4782
+ "loss": 2.9765,
4783
+ "step": 682
4784
+ },
4785
+ {
4786
+ "epoch": 0.03636412037961373,
4787
+ "grad_norm": 0.4310838580131531,
4788
+ "learning_rate": 9.999999967847858e-05,
4789
+ "loss": 2.6606,
4790
+ "step": 683
4791
+ },
4792
+ {
4793
+ "epoch": 0.03641736213712415,
4794
+ "grad_norm": 0.43610477447509766,
4795
+ "learning_rate": 9.999999967752944e-05,
4796
+ "loss": 2.7102,
4797
+ "step": 684
4798
+ },
4799
+ {
4800
+ "epoch": 0.036470603894634565,
4801
+ "grad_norm": 0.5011301040649414,
4802
+ "learning_rate": 9.999999967657889e-05,
4803
+ "loss": 2.9686,
4804
+ "step": 685
4805
+ },
4806
+ {
4807
+ "epoch": 0.036523845652144976,
4808
+ "grad_norm": 0.48820289969444275,
4809
+ "learning_rate": 9.999999967562696e-05,
4810
+ "loss": 2.8583,
4811
+ "step": 686
4812
+ },
4813
+ {
4814
+ "epoch": 0.036577087409655394,
4815
+ "grad_norm": 0.49313730001449585,
4816
+ "learning_rate": 9.999999967467362e-05,
4817
+ "loss": 2.741,
4818
+ "step": 687
4819
+ },
4820
+ {
4821
+ "epoch": 0.036630329167165805,
4822
+ "grad_norm": 0.4905647933483124,
4823
+ "learning_rate": 9.999999967371889e-05,
4824
+ "loss": 2.8032,
4825
+ "step": 688
4826
+ },
4827
+ {
4828
+ "epoch": 0.03668357092467622,
4829
+ "grad_norm": 0.5020934343338013,
4830
+ "learning_rate": 9.999999967276276e-05,
4831
+ "loss": 2.8226,
4832
+ "step": 689
4833
+ },
4834
+ {
4835
+ "epoch": 0.03673681268218664,
4836
+ "grad_norm": 0.5333757400512695,
4837
+ "learning_rate": 9.999999967180522e-05,
4838
+ "loss": 2.8236,
4839
+ "step": 690
4840
+ },
4841
+ {
4842
+ "epoch": 0.03679005443969705,
4843
+ "grad_norm": 0.5454348921775818,
4844
+ "learning_rate": 9.99999996708463e-05,
4845
+ "loss": 2.9117,
4846
+ "step": 691
4847
+ },
4848
+ {
4849
+ "epoch": 0.03684329619720747,
4850
+ "grad_norm": 0.5079708099365234,
4851
+ "learning_rate": 9.999999966988596e-05,
4852
+ "loss": 2.8137,
4853
+ "step": 692
4854
+ },
4855
+ {
4856
+ "epoch": 0.03689653795471789,
4857
+ "grad_norm": 0.5753440260887146,
4858
+ "learning_rate": 9.999999966892422e-05,
4859
+ "loss": 2.9181,
4860
+ "step": 693
4861
+ },
4862
+ {
4863
+ "epoch": 0.0369497797122283,
4864
+ "grad_norm": 0.5435117483139038,
4865
+ "learning_rate": 9.99999996679611e-05,
4866
+ "loss": 2.9129,
4867
+ "step": 694
4868
+ },
4869
+ {
4870
+ "epoch": 0.03700302146973872,
4871
+ "grad_norm": 0.5819733142852783,
4872
+ "learning_rate": 9.999999966699656e-05,
4873
+ "loss": 3.0462,
4874
+ "step": 695
4875
+ },
4876
+ {
4877
+ "epoch": 0.03705626322724913,
4878
+ "grad_norm": 0.6745149493217468,
4879
+ "learning_rate": 9.999999966603065e-05,
4880
+ "loss": 3.2039,
4881
+ "step": 696
4882
+ },
4883
+ {
4884
+ "epoch": 0.037109504984759546,
4885
+ "grad_norm": 0.6611133217811584,
4886
+ "learning_rate": 9.999999966506331e-05,
4887
+ "loss": 2.9349,
4888
+ "step": 697
4889
+ },
4890
+ {
4891
+ "epoch": 0.037162746742269964,
4892
+ "grad_norm": 0.7144906520843506,
4893
+ "learning_rate": 9.999999966409459e-05,
4894
+ "loss": 2.9977,
4895
+ "step": 698
4896
+ },
4897
+ {
4898
+ "epoch": 0.037215988499780375,
4899
+ "grad_norm": 1.030449628829956,
4900
+ "learning_rate": 9.999999966312447e-05,
4901
+ "loss": 3.3691,
4902
+ "step": 699
4903
+ },
4904
+ {
4905
+ "epoch": 0.03726923025729079,
4906
+ "grad_norm": 1.110541820526123,
4907
+ "learning_rate": 9.999999966215294e-05,
4908
+ "loss": 3.0989,
4909
+ "step": 700
4910
  }
4911
  ],
4912
  "logging_steps": 1,
 
4926
  "attributes": {}
4927
  }
4928
  },
4929
+ "total_flos": 1.9203841876977254e+17,
4930
  "train_batch_size": 4,
4931
  "trial_name": null,
4932
  "trial_params": null