{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 515, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0970873786407767, "grad_norm": 6.964529721458028, "learning_rate": 8.653846153846154e-07, "loss": 0.9101, "step": 10 }, { "epoch": 0.1941747572815534, "grad_norm": 3.566929368993238, "learning_rate": 1.826923076923077e-06, "loss": 0.785, "step": 20 }, { "epoch": 0.2912621359223301, "grad_norm": 2.572419911943158, "learning_rate": 2.7884615384615386e-06, "loss": 0.6303, "step": 30 }, { "epoch": 0.3883495145631068, "grad_norm": 0.6433805154127846, "learning_rate": 3.7500000000000005e-06, "loss": 0.5371, "step": 40 }, { "epoch": 0.4854368932038835, "grad_norm": 0.5152897860452502, "learning_rate": 4.711538461538462e-06, "loss": 0.4788, "step": 50 }, { "epoch": 0.5825242718446602, "grad_norm": 0.7787775149779967, "learning_rate": 4.997180564209414e-06, "loss": 0.4451, "step": 60 }, { "epoch": 0.6796116504854369, "grad_norm": 0.8091799603982648, "learning_rate": 4.9833863897161715e-06, "loss": 0.4145, "step": 70 }, { "epoch": 0.7766990291262136, "grad_norm": 0.8426511044153906, "learning_rate": 4.95816302585984e-06, "loss": 0.3935, "step": 80 }, { "epoch": 0.8737864077669902, "grad_norm": 0.8115389627762832, "learning_rate": 4.9216265571140565e-06, "loss": 0.3753, "step": 90 }, { "epoch": 0.970873786407767, "grad_norm": 0.5597270195195797, "learning_rate": 4.8739451338003675e-06, "loss": 0.3679, "step": 100 }, { "epoch": 1.0679611650485437, "grad_norm": 0.41701001823219247, "learning_rate": 4.815338198216762e-06, "loss": 0.3288, "step": 110 }, { "epoch": 1.1650485436893203, "grad_norm": 22.448277335293643, "learning_rate": 4.746075474707204e-06, "loss": 0.3192, "step": 120 }, { "epoch": 1.262135922330097, "grad_norm": 0.4010902102419476, "learning_rate": 4.666475728320124e-06, "loss": 0.3189, "step": 130 }, { "epoch": 1.3592233009708738, "grad_norm": 0.48930786140814836, "learning_rate": 4.576905297768856e-06, "loss": 0.3132, "step": 140 }, { "epoch": 1.4563106796116505, "grad_norm": 0.3541487037094227, "learning_rate": 4.477776409445692e-06, "loss": 0.3038, "step": 150 }, { "epoch": 1.5533980582524272, "grad_norm": 31.33298474339703, "learning_rate": 4.369545280248932e-06, "loss": 0.2957, "step": 160 }, { "epoch": 1.650485436893204, "grad_norm": 0.3578518918951451, "learning_rate": 4.252710017954191e-06, "loss": 0.2966, "step": 170 }, { "epoch": 1.7475728155339807, "grad_norm": 0.33938843089336523, "learning_rate": 4.127808328793e-06, "loss": 0.3004, "step": 180 }, { "epoch": 1.8446601941747574, "grad_norm": 0.32000682056080126, "learning_rate": 3.995415042789034e-06, "loss": 0.3086, "step": 190 }, { "epoch": 1.941747572815534, "grad_norm": 0.3829047834599722, "learning_rate": 3.856139468240996e-06, "loss": 0.2868, "step": 200 }, { "epoch": 2.0388349514563107, "grad_norm": 0.4031366118857969, "learning_rate": 3.7106225875275257e-06, "loss": 0.2779, "step": 210 }, { "epoch": 2.1359223300970873, "grad_norm": 0.37263381399379847, "learning_rate": 3.5595341071397627e-06, "loss": 0.2504, "step": 220 }, { "epoch": 2.233009708737864, "grad_norm": 0.3332345121427193, "learning_rate": 3.4035693755180817e-06, "loss": 0.2466, "step": 230 }, { "epoch": 2.3300970873786406, "grad_norm": 0.2989895988940855, "learning_rate": 3.2434461828779096e-06, "loss": 0.2387, "step": 240 }, { "epoch": 2.4271844660194173, "grad_norm": 0.3583335796728048, "learning_rate": 3.0799014577526735e-06, "loss": 0.2418, "step": 250 }, { "epoch": 2.524271844660194, "grad_norm": 0.38853396086729947, "learning_rate": 2.9136878754572317e-06, "loss": 0.2542, "step": 260 }, { "epoch": 2.6213592233009706, "grad_norm": 0.3547407166001253, "learning_rate": 2.7455703940805228e-06, "loss": 0.2419, "step": 270 }, { "epoch": 2.7184466019417477, "grad_norm": 0.33729197409950573, "learning_rate": 2.5763227339496984e-06, "loss": 0.24, "step": 280 }, { "epoch": 2.8155339805825244, "grad_norm": 0.33325471675524143, "learning_rate": 2.4067238167681655e-06, "loss": 0.2414, "step": 290 }, { "epoch": 2.912621359223301, "grad_norm": 0.3516852631876031, "learning_rate": 2.237554180815538e-06, "loss": 0.2413, "step": 300 }, { "epoch": 3.0097087378640777, "grad_norm": 0.3860131260501091, "learning_rate": 2.0695923887076824e-06, "loss": 0.241, "step": 310 }, { "epoch": 3.1067961165048543, "grad_norm": 0.3824808831772379, "learning_rate": 1.9036114442492901e-06, "loss": 0.2095, "step": 320 }, { "epoch": 3.203883495145631, "grad_norm": 0.39411524156521255, "learning_rate": 1.7403752348695296e-06, "loss": 0.211, "step": 330 }, { "epoch": 3.3009708737864076, "grad_norm": 0.3829443147282107, "learning_rate": 1.5806350160136446e-06, "loss": 0.2072, "step": 340 }, { "epoch": 3.3980582524271843, "grad_norm": 0.36458747976200806, "learning_rate": 1.4251259536702078e-06, "loss": 0.1964, "step": 350 }, { "epoch": 3.4951456310679614, "grad_norm": 0.37575548194010133, "learning_rate": 1.2745637409462447e-06, "loss": 0.2088, "step": 360 }, { "epoch": 3.592233009708738, "grad_norm": 0.3652962786931115, "learning_rate": 1.1296413042616115e-06, "loss": 0.1994, "step": 370 }, { "epoch": 3.6893203883495147, "grad_norm": 0.3712933316004299, "learning_rate": 9.910256143215882e-07, "loss": 0.2045, "step": 380 }, { "epoch": 3.7864077669902914, "grad_norm": 0.3461802470195759, "learning_rate": 8.593546165444078e-07, "loss": 0.2045, "step": 390 }, { "epoch": 3.883495145631068, "grad_norm": 0.35407836747337357, "learning_rate": 7.352342950706964e-07, "loss": 0.2067, "step": 400 }, { "epoch": 3.9805825242718447, "grad_norm": 0.35720973588507643, "learning_rate": 6.192358838670293e-07, "loss": 0.1983, "step": 410 }, { "epoch": 4.077669902912621, "grad_norm": 0.49136621176532774, "learning_rate": 5.118932377587984e-07, "loss": 0.186, "step": 420 }, { "epoch": 4.174757281553398, "grad_norm": 0.3964877004137412, "learning_rate": 4.137003754916105e-07, "loss": 0.1931, "step": 430 }, { "epoch": 4.271844660194175, "grad_norm": 0.34705190574049327, "learning_rate": 3.2510920612867284e-07, "loss": 0.1775, "step": 440 }, { "epoch": 4.368932038834951, "grad_norm": 0.37657649802404225, "learning_rate": 2.4652744924787253e-07, "loss": 0.1913, "step": 450 }, { "epoch": 4.466019417475728, "grad_norm": 0.3797812117921156, "learning_rate": 1.7831675851035264e-07, "loss": 0.193, "step": 460 }, { "epoch": 4.563106796116505, "grad_norm": 0.35942347600355457, "learning_rate": 1.207910572364046e-07, "loss": 0.1828, "step": 470 }, { "epoch": 4.660194174757281, "grad_norm": 0.34568138491238304, "learning_rate": 7.421509364878927e-08, "loss": 0.1833, "step": 480 }, { "epoch": 4.757281553398058, "grad_norm": 0.6159835033744901, "learning_rate": 3.8803222432630685e-08, "loss": 0.1832, "step": 490 }, { "epoch": 4.854368932038835, "grad_norm": 0.38886425151937265, "learning_rate": 1.4718418219468178e-08, "loss": 0.1827, "step": 500 }, { "epoch": 4.951456310679612, "grad_norm": 0.4089928939727998, "learning_rate": 2.0715255356559826e-09, "loss": 0.1829, "step": 510 }, { "epoch": 5.0, "step": 515, "total_flos": 944499472728064.0, "train_loss": 0.29363987642584494, "train_runtime": 38623.7413, "train_samples_per_second": 0.426, "train_steps_per_second": 0.013 } ], "logging_steps": 10, "max_steps": 515, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 944499472728064.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }