{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9997034576834114,
  "eval_steps": 500,
  "global_step": 25290,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.059308463317715436,
      "grad_norm": 36.79362869262695,
      "learning_rate": 1.9609331751680507e-05,
      "loss": 2.1261,
      "step": 500
    },
    {
      "epoch": 0.11861692663543087,
      "grad_norm": 28.276079177856445,
      "learning_rate": 1.92139185448794e-05,
      "loss": 1.4346,
      "step": 1000
    },
    {
      "epoch": 0.1779253899531463,
      "grad_norm": 27.88380241394043,
      "learning_rate": 1.8818505338078293e-05,
      "loss": 1.2696,
      "step": 1500
    },
    {
      "epoch": 0.23723385327086174,
      "grad_norm": 21.309072494506836,
      "learning_rate": 1.8423092131277186e-05,
      "loss": 1.1835,
      "step": 2000
    },
    {
      "epoch": 0.2965423165885772,
      "grad_norm": 34.47601318359375,
      "learning_rate": 1.802767892447608e-05,
      "loss": 1.1527,
      "step": 2500
    },
    {
      "epoch": 0.3558507799062926,
      "grad_norm": 30.186681747436523,
      "learning_rate": 1.7633056544088573e-05,
      "loss": 1.0756,
      "step": 3000
    },
    {
      "epoch": 0.41515924322400805,
      "grad_norm": 19.00593376159668,
      "learning_rate": 1.7237643337287466e-05,
      "loss": 1.0807,
      "step": 3500
    },
    {
      "epoch": 0.4744677065417235,
      "grad_norm": 27.36587142944336,
      "learning_rate": 1.684223013048636e-05,
      "loss": 1.0658,
      "step": 4000
    },
    {
      "epoch": 0.533776169859439,
      "grad_norm": 28.576007843017578,
      "learning_rate": 1.644681692368525e-05,
      "loss": 1.0234,
      "step": 4500
    },
    {
      "epoch": 0.5930846331771544,
      "grad_norm": 24.95502471923828,
      "learning_rate": 1.6052194543297746e-05,
      "loss": 1.0081,
      "step": 5000
    },
    {
      "epoch": 0.6523930964948699,
      "grad_norm": 26.936307907104492,
      "learning_rate": 1.565678133649664e-05,
      "loss": 0.9975,
      "step": 5500
    },
    {
      "epoch": 0.7117015598125852,
      "grad_norm": 20.501012802124023,
      "learning_rate": 1.5261368129695532e-05,
      "loss": 0.975,
      "step": 6000
    },
    {
      "epoch": 0.7710100231303006,
      "grad_norm": 18.4880428314209,
      "learning_rate": 1.4865954922894426e-05,
      "loss": 0.968,
      "step": 6500
    },
    {
      "epoch": 0.8303184864480161,
      "grad_norm": 27.225852966308594,
      "learning_rate": 1.4470541716093318e-05,
      "loss": 0.9638,
      "step": 7000
    },
    {
      "epoch": 0.8896269497657315,
      "grad_norm": 19.646434783935547,
      "learning_rate": 1.407512850929221e-05,
      "loss": 0.9294,
      "step": 7500
    },
    {
      "epoch": 0.948935413083447,
      "grad_norm": 19.823829650878906,
      "learning_rate": 1.3679715302491103e-05,
      "loss": 0.9074,
      "step": 8000
    },
    {
      "epoch": 1.0081845679378447,
      "grad_norm": 30.65908432006836,
      "learning_rate": 1.3284302095689998e-05,
      "loss": 0.8917,
      "step": 8500
    },
    {
      "epoch": 1.0674930312555602,
      "grad_norm": 15.392027854919434,
      "learning_rate": 1.288888888888889e-05,
      "loss": 0.7416,
      "step": 9000
    },
    {
      "epoch": 1.1268014945732756,
      "grad_norm": 11.21274185180664,
      "learning_rate": 1.2493475682087784e-05,
      "loss": 0.7167,
      "step": 9500
    },
    {
      "epoch": 1.186109957890991,
      "grad_norm": 13.336421012878418,
      "learning_rate": 1.2098062475286676e-05,
      "loss": 0.7167,
      "step": 10000
    },
    {
      "epoch": 1.2454184212087065,
      "grad_norm": 11.942096710205078,
      "learning_rate": 1.170264926848557e-05,
      "loss": 0.7162,
      "step": 10500
    },
    {
      "epoch": 1.304726884526422,
      "grad_norm": 11.245469093322754,
      "learning_rate": 1.1308026888098062e-05,
      "loss": 0.7256,
      "step": 11000
    },
    {
      "epoch": 1.3640353478441374,
      "grad_norm": 59.7520751953125,
      "learning_rate": 1.0913404507710558e-05,
      "loss": 0.7336,
      "step": 11500
    },
    {
      "epoch": 1.4233438111618528,
      "grad_norm": 22.972858428955078,
      "learning_rate": 1.0517991300909451e-05,
      "loss": 0.6973,
      "step": 12000
    },
    {
      "epoch": 1.4826522744795683,
      "grad_norm": 19.813735961914062,
      "learning_rate": 1.0122578094108344e-05,
      "loss": 0.6968,
      "step": 12500
    },
    {
      "epoch": 1.5419607377972837,
      "grad_norm": 20.160003662109375,
      "learning_rate": 9.727164887307237e-06,
      "loss": 0.6998,
      "step": 13000
    },
    {
      "epoch": 1.6012692011149992,
      "grad_norm": 14.531295776367188,
      "learning_rate": 9.33175168050613e-06,
      "loss": 0.6966,
      "step": 13500
    },
    {
      "epoch": 1.6605776644327146,
      "grad_norm": 23.200489044189453,
      "learning_rate": 8.937129300118624e-06,
      "loss": 0.6846,
      "step": 14000
    },
    {
      "epoch": 1.71988612775043,
      "grad_norm": 29.29988670349121,
      "learning_rate": 8.541716093317518e-06,
      "loss": 0.7078,
      "step": 14500
    },
    {
      "epoch": 1.7791945910681455,
      "grad_norm": 15.779882431030273,
      "learning_rate": 8.147093712930011e-06,
      "loss": 0.7016,
      "step": 15000
    },
    {
      "epoch": 1.838503054385861,
      "grad_norm": 14.096702575683594,
      "learning_rate": 7.751680506128906e-06,
      "loss": 0.686,
      "step": 15500
    },
    {
      "epoch": 1.8978115177035764,
      "grad_norm": 22.493389129638672,
      "learning_rate": 7.356267299327799e-06,
      "loss": 0.7019,
      "step": 16000
    },
    {
      "epoch": 1.9571199810212918,
      "grad_norm": 33.955657958984375,
      "learning_rate": 6.960854092526691e-06,
      "loss": 0.6723,
      "step": 16500
    },
    {
      "epoch": 2.0163691358756894,
      "grad_norm": 19.76311683654785,
      "learning_rate": 6.5654408857255835e-06,
      "loss": 0.6539,
      "step": 17000
    },
    {
      "epoch": 2.075677599193405,
      "grad_norm": 10.439183235168457,
      "learning_rate": 6.170818505338079e-06,
      "loss": 0.5251,
      "step": 17500
    },
    {
      "epoch": 2.1349860625111203,
      "grad_norm": 9.628398895263672,
      "learning_rate": 5.775405298536972e-06,
      "loss": 0.521,
      "step": 18000
    },
    {
      "epoch": 2.1942945258288358,
      "grad_norm": 23.81959342956543,
      "learning_rate": 5.379992091735864e-06,
      "loss": 0.5243,
      "step": 18500
    },
    {
      "epoch": 2.253602989146551,
      "grad_norm": 88.61954498291016,
      "learning_rate": 4.984578884934757e-06,
      "loss": 0.5298,
      "step": 19000
    },
    {
      "epoch": 2.3129114524642667,
      "grad_norm": 34.159732818603516,
      "learning_rate": 4.58916567813365e-06,
      "loss": 0.5132,
      "step": 19500
    },
    {
      "epoch": 2.372219915781982,
      "grad_norm": 13.070930480957031,
      "learning_rate": 4.193752471332543e-06,
      "loss": 0.5221,
      "step": 20000
    },
    {
      "epoch": 2.4315283790996975,
      "grad_norm": 15.99543571472168,
      "learning_rate": 3.7983392645314355e-06,
      "loss": 0.53,
      "step": 20500
    },
    {
      "epoch": 2.490836842417413,
      "grad_norm": 35.39397430419922,
      "learning_rate": 3.4029260577303288e-06,
      "loss": 0.516,
      "step": 21000
    },
    {
      "epoch": 2.5501453057351284,
      "grad_norm": 20.889413833618164,
      "learning_rate": 3.0083036773428236e-06,
      "loss": 0.5134,
      "step": 21500
    },
    {
      "epoch": 2.609453769052844,
      "grad_norm": 14.608019828796387,
      "learning_rate": 2.6128904705417165e-06,
      "loss": 0.508,
      "step": 22000
    },
    {
      "epoch": 2.6687622323705593,
      "grad_norm": 44.29560852050781,
      "learning_rate": 2.2174772637406093e-06,
      "loss": 0.5055,
      "step": 22500
    },
    {
      "epoch": 2.7280706956882748,
      "grad_norm": 22.466794967651367,
      "learning_rate": 1.822064056939502e-06,
      "loss": 0.4965,
      "step": 23000
    },
    {
      "epoch": 2.78737915900599,
      "grad_norm": 61.90739822387695,
      "learning_rate": 1.4274416765519968e-06,
      "loss": 0.519,
      "step": 23500
    },
    {
      "epoch": 2.8466876223237056,
      "grad_norm": 27.325544357299805,
      "learning_rate": 1.032819296164492e-06,
      "loss": 0.5097,
      "step": 24000
    },
    {
      "epoch": 2.905996085641421,
      "grad_norm": 45.56763458251953,
      "learning_rate": 6.374060893633847e-07,
      "loss": 0.4979,
      "step": 24500
    },
    {
      "epoch": 2.9653045489591365,
      "grad_norm": 29.870397567749023,
      "learning_rate": 2.419928825622776e-07,
      "loss": 0.51,
      "step": 25000
    },
    {
      "epoch": 2.9997034576834114,
      "step": 25290,
      "total_flos": 8.259596752704e+16,
      "train_loss": 0.7820950991931099,
      "train_runtime": 45983.6556,
      "train_samples_per_second": 8.8,
      "train_steps_per_second": 0.55
    }
  ],
  "logging_steps": 500,
  "max_steps": 25290,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.259596752704e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}