{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 25000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 1.1523245573043823,
      "learning_rate": 0.00019959999999999997,
      "loss": 0.4793,
      "step": 500
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.207442283630371,
      "learning_rate": 0.0002999219633608753,
      "loss": 0.365,
      "step": 1000
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.608832836151123,
      "learning_rate": 0.0002992943970692375,
      "loss": 0.3408,
      "step": 1500
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.6748807430267334,
      "learning_rate": 0.0002980406381928192,
      "loss": 0.3432,
      "step": 2000
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.1549451351165771,
      "learning_rate": 0.0002961659454320601,
      "loss": 0.3337,
      "step": 2500
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.3943244218826294,
      "learning_rate": 0.0002936781818999006,
      "loss": 0.328,
      "step": 3000
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8221294283866882,
      "learning_rate": 0.0002905877821411536,
      "loss": 0.3184,
      "step": 3500
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8499279022216797,
      "learning_rate": 0.00028690770836639715,
      "loss": 0.312,
      "step": 4000
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.1005589962005615,
      "learning_rate": 0.0002826533960839586,
      "loss": 0.3175,
      "step": 4500
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.239676594734192,
      "learning_rate": 0.0002778426893580286,
      "loss": 0.3139,
      "step": 5000
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5692813396453857,
      "learning_rate": 0.00027249576596445455,
      "loss": 0.3065,
      "step": 5500
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6223649382591248,
      "learning_rate": 0.00026663505275813633,
      "loss": 0.3015,
      "step": 6000
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.6326464414596558,
      "learning_rate": 0.0002602851316070032,
      "loss": 0.298,
      "step": 6500
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.44571417570114136,
      "learning_rate": 0.0002534726362871166,
      "loss": 0.3043,
      "step": 7000
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7266321182250977,
      "learning_rate": 0.00024622614077135773,
      "loss": 0.2949,
      "step": 7500
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.133889079093933,
      "learning_rate": 0.00023857603938025494,
      "loss": 0.2826,
      "step": 8000
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.7410082817077637,
      "learning_rate": 0.00023055441929764077,
      "loss": 0.2836,
      "step": 8500
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.5899648070335388,
      "learning_rate": 0.00022219492598585185,
      "loss": 0.2868,
      "step": 9000
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3609607219696045,
      "learning_rate": 0.00021353262206496714,
      "loss": 0.3002,
      "step": 9500
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.7143217325210571,
      "learning_rate": 0.0002046038402479944,
      "loss": 0.2734,
      "step": 10000
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.9960472583770752,
      "learning_rate": 0.0001954460309488451,
      "loss": 0.2824,
      "step": 10500
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.9902260899543762,
      "learning_rate": 0.00018609760520228364,
      "loss": 0.2802,
      "step": 11000
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.1902357339859009,
      "learning_rate": 0.00017659777355469886,
      "loss": 0.2848,
      "step": 11500
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.7066996097564697,
      "learning_rate": 0.0001669863816014457,
      "loss": 0.2723,
      "step": 12000
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.8586142063140869,
      "learning_rate": 0.00015730374286057098,
      "loss": 0.2661,
      "step": 12500
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.3139828443527222,
      "learning_rate": 0.00014759046968390891,
      "loss": 0.27,
      "step": 13000
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.092033624649048,
      "learning_rate": 0.00013788730291476348,
      "loss": 0.2626,
      "step": 13500
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6759353876113892,
      "learning_rate": 0.00012823494100665345,
      "loss": 0.2713,
      "step": 14000
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7745094299316406,
      "learning_rate": 0.0001186738693198545,
      "loss": 0.2659,
      "step": 14500
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.1789864301681519,
      "learning_rate": 0.00010924419031172836,
      "loss": 0.2621,
      "step": 15000
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.7983564138412476,
      "learning_rate": 9.998545533308028e-05,
      "loss": 0.2485,
      "step": 15500
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.2566275596618652,
      "learning_rate": 9.093649873604878e-05,
      "loss": 0.2499,
      "step": 16000
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.405757188796997,
      "learning_rate": 8.213527498933654e-05,
      "loss": 0.2648,
      "step": 16500
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.706387996673584,
      "learning_rate": 7.361869948397888e-05,
      "loss": 0.248,
      "step": 17000
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.9889776110649109,
      "learning_rate": 6.542249369736452e-05,
      "loss": 0.2533,
      "step": 17500
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.3792021572589874,
      "learning_rate": 5.758103536494548e-05,
      "loss": 0.2464,
      "step": 18000
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.4570865631103516,
      "learning_rate": 5.012721428806742e-05,
      "loss": 0.2469,
      "step": 18500
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.3031163811683655,
      "learning_rate": 4.30922943827128e-05,
      "loss": 0.2563,
      "step": 19000
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.5329792499542236,
      "learning_rate": 3.6505782547772205e-05,
      "loss": 0.2409,
      "step": 19500
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.900152325630188,
      "learning_rate": 3.0395304902856126e-05,
      "loss": 0.2413,
      "step": 20000
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.79344642162323,
      "learning_rate": 2.4786490914748508e-05,
      "loss": 0.241,
      "step": 20500
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.6573524475097656,
      "learning_rate": 1.970286589851618e-05,
      "loss": 0.2447,
      "step": 21000
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.92716383934021,
      "learning_rate": 1.516575234416183e-05,
      "loss": 0.2513,
      "step": 21500
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.8475908041000366,
      "learning_rate": 1.1194180482690935e-05,
      "loss": 0.2425,
      "step": 22000
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.6743385195732117,
      "learning_rate": 7.804808466709245e-06,
      "loss": 0.2372,
      "step": 22500
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.7676027417182922,
      "learning_rate": 5.011852500341956e-06,
      "loss": 0.2429,
      "step": 23000
    },
    {
      "epoch": 0.94,
      "grad_norm": 2.600247621536255,
      "learning_rate": 2.827027211532812e-06,
      "loss": 0.2351,
      "step": 23500
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.2377513349056244,
      "learning_rate": 1.2594965168235604e-06,
      "loss": 0.2464,
      "step": 24000
    },
    {
      "epoch": 0.98,
      "grad_norm": 3.111140012741089,
      "learning_rate": 3.1583518470324476e-07,
      "loss": 0.2387,
      "step": 24500
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.134462833404541,
      "learning_rate": 1.2587443087852533e-12,
      "loss": 0.2547,
      "step": 25000
    },
    {
      "epoch": 1.0,
      "step": 25000,
      "total_flos": 2.8066019659471258e+17,
      "train_loss": 0.2806466256713867,
      "train_runtime": 4375.9007,
      "train_samples_per_second": 5.713,
      "train_steps_per_second": 5.713
    }
  ],
  "logging_steps": 500,
  "max_steps": 25000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 12500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.8066019659471258e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}