oliverchang commited on
Commit
976b851
·
verified ·
1 Parent(s): 7ff8c9c

Training in progress, step 650, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7882eadd685fdcce6d3734404c09814f09c0a2a5662a860276ddcb723598343b
3
  size 59933632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a06a0e47ba651e599e4b4766058ec931903a2cc546d08844cdbe699276f5f83a
3
  size 59933632
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1f4a8e6947f1864feb49c9578ca5f609069497536dcff795f15673a3a394767
3
  size 31823460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dde81ccfc1d6e7f89434439a1d6f64f76e1c9379d5ef6122b86e48182afe1d2
3
  size 31823460
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebf46231b6528c7bb0c6907bfba1c53ba3622e0e9426b49c9bc7e496484a865e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4cdd7aec1abfd0c0395a28df47d40423c06a84d0d1ffe7f8ccd7c936e92670e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f007a15a43c93fe4f2be9f96951ebe7b1bfcc8190c27975766ecbd42149d0f2e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514265d9316b4b9174cfc4ba2a301feeeb4433551f601594176fbaa00014d4c4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.03194505450624925,
5
  "eval_steps": 500,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4207,6 +4207,356 @@
4207
  "learning_rate": 9.999999975238072e-05,
4208
  "loss": 3.2113,
4209
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4210
  }
4211
  ],
4212
  "logging_steps": 1,
@@ -4226,7 +4576,7 @@
4226
  "attributes": {}
4227
  }
4228
  },
4229
- "total_flos": 1.6461415332603494e+17,
4230
  "train_batch_size": 4,
4231
  "trial_name": null,
4232
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03460714238177002,
5
  "eval_steps": 500,
6
+ "global_step": 650,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4207
  "learning_rate": 9.999999975238072e-05,
4208
  "loss": 3.2113,
4209
  "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 0.03199829626375967,
4213
+ "grad_norm": 0.2847670614719391,
4214
+ "learning_rate": 9.999999975154769e-05,
4215
+ "loss": 2.7965,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 0.03205153802127008,
4220
+ "grad_norm": 0.3811182677745819,
4221
+ "learning_rate": 9.999999975071326e-05,
4222
+ "loss": 2.7836,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 0.0321047797787805,
4227
+ "grad_norm": 0.4415830969810486,
4228
+ "learning_rate": 9.999999974987742e-05,
4229
+ "loss": 2.7183,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 0.032158021536290916,
4234
+ "grad_norm": 0.3968278765678406,
4235
+ "learning_rate": 9.99999997490402e-05,
4236
+ "loss": 2.7068,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 0.03221126329380133,
4241
+ "grad_norm": 0.455646276473999,
4242
+ "learning_rate": 9.999999974820157e-05,
4243
+ "loss": 2.7694,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 0.032264505051311745,
4248
+ "grad_norm": 0.394357293844223,
4249
+ "learning_rate": 9.999999974736155e-05,
4250
+ "loss": 2.7312,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 0.032317746808822156,
4255
+ "grad_norm": 0.3707529604434967,
4256
+ "learning_rate": 9.999999974652011e-05,
4257
+ "loss": 2.9084,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 0.032370988566332574,
4262
+ "grad_norm": 0.3821130692958832,
4263
+ "learning_rate": 9.999999974567729e-05,
4264
+ "loss": 2.8708,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 0.03242423032384299,
4269
+ "grad_norm": 0.3351576328277588,
4270
+ "learning_rate": 9.999999974483306e-05,
4271
+ "loss": 2.7368,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 0.0324774720813534,
4276
+ "grad_norm": 0.35292184352874756,
4277
+ "learning_rate": 9.999999974398744e-05,
4278
+ "loss": 2.729,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 0.03253071383886382,
4283
+ "grad_norm": 0.35947975516319275,
4284
+ "learning_rate": 9.999999974314042e-05,
4285
+ "loss": 2.819,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 0.03258395559637424,
4290
+ "grad_norm": 0.35223957896232605,
4291
+ "learning_rate": 9.999999974229199e-05,
4292
+ "loss": 2.8256,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 0.03263719735388465,
4297
+ "grad_norm": 0.38854068517684937,
4298
+ "learning_rate": 9.999999974144218e-05,
4299
+ "loss": 2.9578,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 0.03269043911139507,
4304
+ "grad_norm": 0.3561096489429474,
4305
+ "learning_rate": 9.999999974059096e-05,
4306
+ "loss": 2.7325,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 0.03274368086890548,
4311
+ "grad_norm": 0.35364630818367004,
4312
+ "learning_rate": 9.999999973973834e-05,
4313
+ "loss": 2.695,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 0.0327969226264159,
4318
+ "grad_norm": 0.36281758546829224,
4319
+ "learning_rate": 9.999999973888432e-05,
4320
+ "loss": 2.6452,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 0.032850164383926315,
4325
+ "grad_norm": 0.3692990243434906,
4326
+ "learning_rate": 9.999999973802891e-05,
4327
+ "loss": 2.9116,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 0.03290340614143673,
4332
+ "grad_norm": 0.37901854515075684,
4333
+ "learning_rate": 9.999999973717209e-05,
4334
+ "loss": 2.7462,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 0.032956647898947145,
4339
+ "grad_norm": 0.39527255296707153,
4340
+ "learning_rate": 9.999999973631389e-05,
4341
+ "loss": 2.8559,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 0.03300988965645756,
4346
+ "grad_norm": 0.4052393436431885,
4347
+ "learning_rate": 9.999999973545427e-05,
4348
+ "loss": 2.8604,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 0.033063131413967974,
4353
+ "grad_norm": 0.4052470624446869,
4354
+ "learning_rate": 9.999999973459326e-05,
4355
+ "loss": 2.9266,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 0.03311637317147839,
4360
+ "grad_norm": 0.40139713883399963,
4361
+ "learning_rate": 9.999999973373085e-05,
4362
+ "loss": 2.8272,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 0.0331696149289888,
4367
+ "grad_norm": 0.3807532787322998,
4368
+ "learning_rate": 9.999999973286705e-05,
4369
+ "loss": 2.813,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 0.03322285668649922,
4374
+ "grad_norm": 0.4124998450279236,
4375
+ "learning_rate": 9.999999973200184e-05,
4376
+ "loss": 2.8132,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 0.03327609844400964,
4381
+ "grad_norm": 0.4142961800098419,
4382
+ "learning_rate": 9.999999973113523e-05,
4383
+ "loss": 3.0106,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 0.03332934020152005,
4388
+ "grad_norm": 0.4193595349788666,
4389
+ "learning_rate": 9.999999973026723e-05,
4390
+ "loss": 2.7762,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 0.03338258195903047,
4395
+ "grad_norm": 0.41629183292388916,
4396
+ "learning_rate": 9.999999972939782e-05,
4397
+ "loss": 2.8948,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 0.033435823716540886,
4402
+ "grad_norm": 0.41228872537612915,
4403
+ "learning_rate": 9.999999972852702e-05,
4404
+ "loss": 2.7374,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 0.0334890654740513,
4409
+ "grad_norm": 0.5310066342353821,
4410
+ "learning_rate": 9.999999972765482e-05,
4411
+ "loss": 2.8636,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 0.033542307231561715,
4416
+ "grad_norm": 0.4347386658191681,
4417
+ "learning_rate": 9.99999997267812e-05,
4418
+ "loss": 2.9061,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 0.03359554898907213,
4423
+ "grad_norm": 0.42458465695381165,
4424
+ "learning_rate": 9.999999972590621e-05,
4425
+ "loss": 2.7978,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 0.033648790746582544,
4430
+ "grad_norm": 0.43726715445518494,
4431
+ "learning_rate": 9.999999972502981e-05,
4432
+ "loss": 2.9151,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 0.03370203250409296,
4437
+ "grad_norm": 0.43290823698043823,
4438
+ "learning_rate": 9.999999972415202e-05,
4439
+ "loss": 2.9395,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 0.03375527426160337,
4444
+ "grad_norm": 0.518913984298706,
4445
+ "learning_rate": 9.999999972327282e-05,
4446
+ "loss": 2.8681,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 0.03380851601911379,
4451
+ "grad_norm": 0.49017536640167236,
4452
+ "learning_rate": 9.999999972239222e-05,
4453
+ "loss": 3.0101,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 0.03386175777662421,
4458
+ "grad_norm": 0.49527156352996826,
4459
+ "learning_rate": 9.999999972151024e-05,
4460
+ "loss": 2.7499,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 0.03391499953413462,
4465
+ "grad_norm": 0.4943864345550537,
4466
+ "learning_rate": 9.999999972062684e-05,
4467
+ "loss": 3.0001,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 0.03396824129164504,
4472
+ "grad_norm": 0.4815324544906616,
4473
+ "learning_rate": 9.999999971974205e-05,
4474
+ "loss": 2.7287,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 0.034021483049155456,
4479
+ "grad_norm": 0.4747610092163086,
4480
+ "learning_rate": 9.999999971885585e-05,
4481
+ "loss": 2.857,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 0.03407472480666587,
4486
+ "grad_norm": 0.5065243244171143,
4487
+ "learning_rate": 9.999999971796827e-05,
4488
+ "loss": 2.894,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 0.034127966564176285,
4493
+ "grad_norm": 0.5166441202163696,
4494
+ "learning_rate": 9.999999971707928e-05,
4495
+ "loss": 2.8062,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 0.034181208321686696,
4500
+ "grad_norm": 0.5336162447929382,
4501
+ "learning_rate": 9.99999997161889e-05,
4502
+ "loss": 3.0662,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 0.034234450079197114,
4507
+ "grad_norm": 0.5441266298294067,
4508
+ "learning_rate": 9.999999971529709e-05,
4509
+ "loss": 3.0668,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 0.03428769183670753,
4514
+ "grad_norm": 0.6290764212608337,
4515
+ "learning_rate": 9.999999971440393e-05,
4516
+ "loss": 2.8475,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 0.03434093359421794,
4521
+ "grad_norm": 0.5840023756027222,
4522
+ "learning_rate": 9.999999971350932e-05,
4523
+ "loss": 2.9856,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 0.03439417535172836,
4528
+ "grad_norm": 0.5803622007369995,
4529
+ "learning_rate": 9.999999971261335e-05,
4530
+ "loss": 2.9473,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 0.03444741710923878,
4535
+ "grad_norm": 0.6593179702758789,
4536
+ "learning_rate": 9.999999971171597e-05,
4537
+ "loss": 3.1348,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 0.03450065886674919,
4542
+ "grad_norm": 0.8158584833145142,
4543
+ "learning_rate": 9.99999997108172e-05,
4544
+ "loss": 3.0948,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 0.03455390062425961,
4549
+ "grad_norm": 0.7752049565315247,
4550
+ "learning_rate": 9.999999970991701e-05,
4551
+ "loss": 2.9799,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 0.03460714238177002,
4556
+ "grad_norm": 0.9332935214042664,
4557
+ "learning_rate": 9.999999970901543e-05,
4558
+ "loss": 3.4294,
4559
+ "step": 650
4560
  }
4561
  ],
4562
  "logging_steps": 1,
 
4576
  "attributes": {}
4577
  }
4578
  },
4579
+ "total_flos": 1.7832628604790374e+17,
4580
  "train_batch_size": 4,
4581
  "trial_name": null,
4582
  "trial_params": null