{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 515,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 6.964529721458028,
      "learning_rate": 8.653846153846154e-07,
      "loss": 0.9101,
      "step": 10
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 3.566929368993238,
      "learning_rate": 1.826923076923077e-06,
      "loss": 0.785,
      "step": 20
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 2.572419911943158,
      "learning_rate": 2.7884615384615386e-06,
      "loss": 0.6303,
      "step": 30
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.6433805154127846,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5371,
      "step": 40
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.5152897860452502,
      "learning_rate": 4.711538461538462e-06,
      "loss": 0.4788,
      "step": 50
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.7787775149779967,
      "learning_rate": 4.997180564209414e-06,
      "loss": 0.4451,
      "step": 60
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.8091799603982648,
      "learning_rate": 4.9833863897161715e-06,
      "loss": 0.4145,
      "step": 70
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.8426511044153906,
      "learning_rate": 4.95816302585984e-06,
      "loss": 0.3935,
      "step": 80
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.8115389627762832,
      "learning_rate": 4.9216265571140565e-06,
      "loss": 0.3753,
      "step": 90
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.5597270195195797,
      "learning_rate": 4.8739451338003675e-06,
      "loss": 0.3679,
      "step": 100
    },
    {
      "epoch": 1.0679611650485437,
      "grad_norm": 0.41701001823219247,
      "learning_rate": 4.815338198216762e-06,
      "loss": 0.3288,
      "step": 110
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 22.448277335293643,
      "learning_rate": 4.746075474707204e-06,
      "loss": 0.3192,
      "step": 120
    },
    {
      "epoch": 1.262135922330097,
      "grad_norm": 0.4010902102419476,
      "learning_rate": 4.666475728320124e-06,
      "loss": 0.3189,
      "step": 130
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.48930786140814836,
      "learning_rate": 4.576905297768856e-06,
      "loss": 0.3132,
      "step": 140
    },
    {
      "epoch": 1.4563106796116505,
      "grad_norm": 0.3541487037094227,
      "learning_rate": 4.477776409445692e-06,
      "loss": 0.3038,
      "step": 150
    },
    {
      "epoch": 1.5533980582524272,
      "grad_norm": 31.33298474339703,
      "learning_rate": 4.369545280248932e-06,
      "loss": 0.2957,
      "step": 160
    },
    {
      "epoch": 1.650485436893204,
      "grad_norm": 0.3578518918951451,
      "learning_rate": 4.252710017954191e-06,
      "loss": 0.2966,
      "step": 170
    },
    {
      "epoch": 1.7475728155339807,
      "grad_norm": 0.33938843089336523,
      "learning_rate": 4.127808328793e-06,
      "loss": 0.3004,
      "step": 180
    },
    {
      "epoch": 1.8446601941747574,
      "grad_norm": 0.32000682056080126,
      "learning_rate": 3.995415042789034e-06,
      "loss": 0.3086,
      "step": 190
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 0.3829047834599722,
      "learning_rate": 3.856139468240996e-06,
      "loss": 0.2868,
      "step": 200
    },
    {
      "epoch": 2.0388349514563107,
      "grad_norm": 0.4031366118857969,
      "learning_rate": 3.7106225875275257e-06,
      "loss": 0.2779,
      "step": 210
    },
    {
      "epoch": 2.1359223300970873,
      "grad_norm": 0.37263381399379847,
      "learning_rate": 3.5595341071397627e-06,
      "loss": 0.2504,
      "step": 220
    },
    {
      "epoch": 2.233009708737864,
      "grad_norm": 0.3332345121427193,
      "learning_rate": 3.4035693755180817e-06,
      "loss": 0.2466,
      "step": 230
    },
    {
      "epoch": 2.3300970873786406,
      "grad_norm": 0.2989895988940855,
      "learning_rate": 3.2434461828779096e-06,
      "loss": 0.2387,
      "step": 240
    },
    {
      "epoch": 2.4271844660194173,
      "grad_norm": 0.3583335796728048,
      "learning_rate": 3.0799014577526735e-06,
      "loss": 0.2418,
      "step": 250
    },
    {
      "epoch": 2.524271844660194,
      "grad_norm": 0.38853396086729947,
      "learning_rate": 2.9136878754572317e-06,
      "loss": 0.2542,
      "step": 260
    },
    {
      "epoch": 2.6213592233009706,
      "grad_norm": 0.3547407166001253,
      "learning_rate": 2.7455703940805228e-06,
      "loss": 0.2419,
      "step": 270
    },
    {
      "epoch": 2.7184466019417477,
      "grad_norm": 0.33729197409950573,
      "learning_rate": 2.5763227339496984e-06,
      "loss": 0.24,
      "step": 280
    },
    {
      "epoch": 2.8155339805825244,
      "grad_norm": 0.33325471675524143,
      "learning_rate": 2.4067238167681655e-06,
      "loss": 0.2414,
      "step": 290
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 0.3516852631876031,
      "learning_rate": 2.237554180815538e-06,
      "loss": 0.2413,
      "step": 300
    },
    {
      "epoch": 3.0097087378640777,
      "grad_norm": 0.3860131260501091,
      "learning_rate": 2.0695923887076824e-06,
      "loss": 0.241,
      "step": 310
    },
    {
      "epoch": 3.1067961165048543,
      "grad_norm": 0.3824808831772379,
      "learning_rate": 1.9036114442492901e-06,
      "loss": 0.2095,
      "step": 320
    },
    {
      "epoch": 3.203883495145631,
      "grad_norm": 0.39411524156521255,
      "learning_rate": 1.7403752348695296e-06,
      "loss": 0.211,
      "step": 330
    },
    {
      "epoch": 3.3009708737864076,
      "grad_norm": 0.3829443147282107,
      "learning_rate": 1.5806350160136446e-06,
      "loss": 0.2072,
      "step": 340
    },
    {
      "epoch": 3.3980582524271843,
      "grad_norm": 0.36458747976200806,
      "learning_rate": 1.4251259536702078e-06,
      "loss": 0.1964,
      "step": 350
    },
    {
      "epoch": 3.4951456310679614,
      "grad_norm": 0.37575548194010133,
      "learning_rate": 1.2745637409462447e-06,
      "loss": 0.2088,
      "step": 360
    },
    {
      "epoch": 3.592233009708738,
      "grad_norm": 0.3652962786931115,
      "learning_rate": 1.1296413042616115e-06,
      "loss": 0.1994,
      "step": 370
    },
    {
      "epoch": 3.6893203883495147,
      "grad_norm": 0.3712933316004299,
      "learning_rate": 9.910256143215882e-07,
      "loss": 0.2045,
      "step": 380
    },
    {
      "epoch": 3.7864077669902914,
      "grad_norm": 0.3461802470195759,
      "learning_rate": 8.593546165444078e-07,
      "loss": 0.2045,
      "step": 390
    },
    {
      "epoch": 3.883495145631068,
      "grad_norm": 0.35407836747337357,
      "learning_rate": 7.352342950706964e-07,
      "loss": 0.2067,
      "step": 400
    },
    {
      "epoch": 3.9805825242718447,
      "grad_norm": 0.35720973588507643,
      "learning_rate": 6.192358838670293e-07,
      "loss": 0.1983,
      "step": 410
    },
    {
      "epoch": 4.077669902912621,
      "grad_norm": 0.49136621176532774,
      "learning_rate": 5.118932377587984e-07,
      "loss": 0.186,
      "step": 420
    },
    {
      "epoch": 4.174757281553398,
      "grad_norm": 0.3964877004137412,
      "learning_rate": 4.137003754916105e-07,
      "loss": 0.1931,
      "step": 430
    },
    {
      "epoch": 4.271844660194175,
      "grad_norm": 0.34705190574049327,
      "learning_rate": 3.2510920612867284e-07,
      "loss": 0.1775,
      "step": 440
    },
    {
      "epoch": 4.368932038834951,
      "grad_norm": 0.37657649802404225,
      "learning_rate": 2.4652744924787253e-07,
      "loss": 0.1913,
      "step": 450
    },
    {
      "epoch": 4.466019417475728,
      "grad_norm": 0.3797812117921156,
      "learning_rate": 1.7831675851035264e-07,
      "loss": 0.193,
      "step": 460
    },
    {
      "epoch": 4.563106796116505,
      "grad_norm": 0.35942347600355457,
      "learning_rate": 1.207910572364046e-07,
      "loss": 0.1828,
      "step": 470
    },
    {
      "epoch": 4.660194174757281,
      "grad_norm": 0.34568138491238304,
      "learning_rate": 7.421509364878927e-08,
      "loss": 0.1833,
      "step": 480
    },
    {
      "epoch": 4.757281553398058,
      "grad_norm": 0.6159835033744901,
      "learning_rate": 3.8803222432630685e-08,
      "loss": 0.1832,
      "step": 490
    },
    {
      "epoch": 4.854368932038835,
      "grad_norm": 0.38886425151937265,
      "learning_rate": 1.4718418219468178e-08,
      "loss": 0.1827,
      "step": 500
    },
    {
      "epoch": 4.951456310679612,
      "grad_norm": 0.4089928939727998,
      "learning_rate": 2.0715255356559826e-09,
      "loss": 0.1829,
      "step": 510
    },
    {
      "epoch": 5.0,
      "step": 515,
      "total_flos": 944499472728064.0,
      "train_loss": 0.29363987642584494,
      "train_runtime": 38623.7413,
      "train_samples_per_second": 0.426,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 10,
  "max_steps": 515,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 944499472728064.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}