{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9997034576834114, "eval_steps": 500, "global_step": 25290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059308463317715436, "grad_norm": 36.79362869262695, "learning_rate": 1.9609331751680507e-05, "loss": 2.1261, "step": 500 }, { "epoch": 0.11861692663543087, "grad_norm": 28.276079177856445, "learning_rate": 1.92139185448794e-05, "loss": 1.4346, "step": 1000 }, { "epoch": 0.1779253899531463, "grad_norm": 27.88380241394043, "learning_rate": 1.8818505338078293e-05, "loss": 1.2696, "step": 1500 }, { "epoch": 0.23723385327086174, "grad_norm": 21.309072494506836, "learning_rate": 1.8423092131277186e-05, "loss": 1.1835, "step": 2000 }, { "epoch": 0.2965423165885772, "grad_norm": 34.47601318359375, "learning_rate": 1.802767892447608e-05, "loss": 1.1527, "step": 2500 }, { "epoch": 0.3558507799062926, "grad_norm": 30.186681747436523, "learning_rate": 1.7633056544088573e-05, "loss": 1.0756, "step": 3000 }, { "epoch": 0.41515924322400805, "grad_norm": 19.00593376159668, "learning_rate": 1.7237643337287466e-05, "loss": 1.0807, "step": 3500 }, { "epoch": 0.4744677065417235, "grad_norm": 27.36587142944336, "learning_rate": 1.684223013048636e-05, "loss": 1.0658, "step": 4000 }, { "epoch": 0.533776169859439, "grad_norm": 28.576007843017578, "learning_rate": 1.644681692368525e-05, "loss": 1.0234, "step": 4500 }, { "epoch": 0.5930846331771544, "grad_norm": 24.95502471923828, "learning_rate": 1.6052194543297746e-05, "loss": 1.0081, "step": 5000 }, { "epoch": 0.6523930964948699, "grad_norm": 26.936307907104492, "learning_rate": 1.565678133649664e-05, "loss": 0.9975, "step": 5500 }, { "epoch": 0.7117015598125852, "grad_norm": 20.501012802124023, "learning_rate": 1.5261368129695532e-05, "loss": 0.975, "step": 6000 }, { "epoch": 0.7710100231303006, "grad_norm": 18.4880428314209, "learning_rate": 1.4865954922894426e-05, "loss": 0.968, "step": 6500 }, { "epoch": 0.8303184864480161, "grad_norm": 27.225852966308594, "learning_rate": 1.4470541716093318e-05, "loss": 0.9638, "step": 7000 }, { "epoch": 0.8896269497657315, "grad_norm": 19.646434783935547, "learning_rate": 1.407512850929221e-05, "loss": 0.9294, "step": 7500 }, { "epoch": 0.948935413083447, "grad_norm": 19.823829650878906, "learning_rate": 1.3679715302491103e-05, "loss": 0.9074, "step": 8000 }, { "epoch": 1.0081845679378447, "grad_norm": 30.65908432006836, "learning_rate": 1.3284302095689998e-05, "loss": 0.8917, "step": 8500 }, { "epoch": 1.0674930312555602, "grad_norm": 15.392027854919434, "learning_rate": 1.288888888888889e-05, "loss": 0.7416, "step": 9000 }, { "epoch": 1.1268014945732756, "grad_norm": 11.21274185180664, "learning_rate": 1.2493475682087784e-05, "loss": 0.7167, "step": 9500 }, { "epoch": 1.186109957890991, "grad_norm": 13.336421012878418, "learning_rate": 1.2098062475286676e-05, "loss": 0.7167, "step": 10000 }, { "epoch": 1.2454184212087065, "grad_norm": 11.942096710205078, "learning_rate": 1.170264926848557e-05, "loss": 0.7162, "step": 10500 }, { "epoch": 1.304726884526422, "grad_norm": 11.245469093322754, "learning_rate": 1.1308026888098062e-05, "loss": 0.7256, "step": 11000 }, { "epoch": 1.3640353478441374, "grad_norm": 59.7520751953125, "learning_rate": 1.0913404507710558e-05, "loss": 0.7336, "step": 11500 }, { "epoch": 1.4233438111618528, "grad_norm": 22.972858428955078, "learning_rate": 1.0517991300909451e-05, "loss": 0.6973, "step": 12000 }, { "epoch": 1.4826522744795683, "grad_norm": 
19.813735961914062, "learning_rate": 1.0122578094108344e-05, "loss": 0.6968, "step": 12500 }, { "epoch": 1.5419607377972837, "grad_norm": 20.160003662109375, "learning_rate": 9.727164887307237e-06, "loss": 0.6998, "step": 13000 }, { "epoch": 1.6012692011149992, "grad_norm": 14.531295776367188, "learning_rate": 9.33175168050613e-06, "loss": 0.6966, "step": 13500 }, { "epoch": 1.6605776644327146, "grad_norm": 23.200489044189453, "learning_rate": 8.937129300118624e-06, "loss": 0.6846, "step": 14000 }, { "epoch": 1.71988612775043, "grad_norm": 29.29988670349121, "learning_rate": 8.541716093317518e-06, "loss": 0.7078, "step": 14500 }, { "epoch": 1.7791945910681455, "grad_norm": 15.779882431030273, "learning_rate": 8.147093712930011e-06, "loss": 0.7016, "step": 15000 }, { "epoch": 1.838503054385861, "grad_norm": 14.096702575683594, "learning_rate": 7.751680506128906e-06, "loss": 0.686, "step": 15500 }, { "epoch": 1.8978115177035764, "grad_norm": 22.493389129638672, "learning_rate": 7.356267299327799e-06, "loss": 0.7019, "step": 16000 }, { "epoch": 1.9571199810212918, "grad_norm": 33.955657958984375, "learning_rate": 6.960854092526691e-06, "loss": 0.6723, "step": 16500 }, { "epoch": 2.0163691358756894, "grad_norm": 19.76311683654785, "learning_rate": 6.5654408857255835e-06, "loss": 0.6539, "step": 17000 }, { "epoch": 2.075677599193405, "grad_norm": 10.439183235168457, "learning_rate": 6.170818505338079e-06, "loss": 0.5251, "step": 17500 }, { "epoch": 2.1349860625111203, "grad_norm": 9.628398895263672, "learning_rate": 5.775405298536972e-06, "loss": 0.521, "step": 18000 }, { "epoch": 2.1942945258288358, "grad_norm": 23.81959342956543, "learning_rate": 5.379992091735864e-06, "loss": 0.5243, "step": 18500 }, { "epoch": 2.253602989146551, "grad_norm": 88.61954498291016, "learning_rate": 4.984578884934757e-06, "loss": 0.5298, "step": 19000 }, { "epoch": 2.3129114524642667, "grad_norm": 34.159732818603516, "learning_rate": 4.58916567813365e-06, "loss": 0.5132, "step": 19500 }, { "epoch": 2.372219915781982, "grad_norm": 13.070930480957031, "learning_rate": 4.193752471332543e-06, "loss": 0.5221, "step": 20000 }, { "epoch": 2.4315283790996975, "grad_norm": 15.99543571472168, "learning_rate": 3.7983392645314355e-06, "loss": 0.53, "step": 20500 }, { "epoch": 2.490836842417413, "grad_norm": 35.39397430419922, "learning_rate": 3.4029260577303288e-06, "loss": 0.516, "step": 21000 }, { "epoch": 2.5501453057351284, "grad_norm": 20.889413833618164, "learning_rate": 3.0083036773428236e-06, "loss": 0.5134, "step": 21500 }, { "epoch": 2.609453769052844, "grad_norm": 14.608019828796387, "learning_rate": 2.6128904705417165e-06, "loss": 0.508, "step": 22000 }, { "epoch": 2.6687622323705593, "grad_norm": 44.29560852050781, "learning_rate": 2.2174772637406093e-06, "loss": 0.5055, "step": 22500 }, { "epoch": 2.7280706956882748, "grad_norm": 22.466794967651367, "learning_rate": 1.822064056939502e-06, "loss": 0.4965, "step": 23000 }, { "epoch": 2.78737915900599, "grad_norm": 61.90739822387695, "learning_rate": 1.4274416765519968e-06, "loss": 0.519, "step": 23500 }, { "epoch": 2.8466876223237056, "grad_norm": 27.325544357299805, "learning_rate": 1.032819296164492e-06, "loss": 0.5097, "step": 24000 }, { "epoch": 2.905996085641421, "grad_norm": 45.56763458251953, "learning_rate": 6.374060893633847e-07, "loss": 0.4979, "step": 24500 }, { "epoch": 2.9653045489591365, "grad_norm": 29.870397567749023, "learning_rate": 2.419928825622776e-07, "loss": 0.51, "step": 25000 }, { "epoch": 2.9997034576834114, "step": 25290, "total_flos": 
8.259596752704e+16, "train_loss": 0.7820950991931099, "train_runtime": 45983.6556, "train_samples_per_second": 8.8, "train_steps_per_second": 0.55 } ], "logging_steps": 500, "max_steps": 25290, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.259596752704e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }
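The file above is the trainer_state.json that the Hugging Face Trainer writes alongside each checkpoint: every entry in "log_history" records the training loss, gradient norm, and learning rate at one logging step (here every 500 steps), and the final entry summarizes the run. As a minimal sketch of how one might inspect it, the Python below loads the file and plots the loss curve; the file path and the use of matplotlib are assumptions for illustration, not part of the state file itself.

# Minimal sketch: plot the loss curve from a Hugging Face trainer_state.json.
# The path below is an assumption; point it at the actual checkpoint directory.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step logging entries carry a "loss" key; the final summary entry does not.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"{state['num_train_epochs']}-epoch run, {state['global_step']} steps")
plt.show()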