| { | |
| "best_metric": 0.5025785565376282, | |
| "best_model_checkpoint": "models/E-Coli-FFT/KCYHSM/checkpoint-21500", | |
| "epoch": 8.96, | |
| "eval_steps": 500, | |
| "global_step": 28000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.560947835445404, | |
| "learning_rate": 4.996016e-05, | |
| "loss": 2.0872, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_accuracy_per_token": 0.30261626839637756, | |
| "eval_loss": 2.0467002391815186, | |
| "eval_runtime": 226.6763, | |
| "eval_samples_per_second": 110.289, | |
| "eval_steps_per_second": 6.895, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.5276467204093933, | |
| "learning_rate": 4.992016000000001e-05, | |
| "loss": 2.033, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_accuracy_per_token": 0.31899595260620117, | |
| "eval_loss": 2.0086700916290283, | |
| "eval_runtime": 215.1833, | |
| "eval_samples_per_second": 116.18, | |
| "eval_steps_per_second": 7.264, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.3343302011489868, | |
| "learning_rate": 4.988016e-05, | |
| "loss": 1.9439, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_accuracy_per_token": 0.3831162750720978, | |
| "eval_loss": 1.8512518405914307, | |
| "eval_runtime": 215.5299, | |
| "eval_samples_per_second": 115.993, | |
| "eval_steps_per_second": 7.252, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.4562740325927734, | |
| "learning_rate": 4.984016e-05, | |
| "loss": 1.7198, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_accuracy_per_token": 0.47227564454078674, | |
| "eval_loss": 1.6024138927459717, | |
| "eval_runtime": 215.7037, | |
| "eval_samples_per_second": 115.9, | |
| "eval_steps_per_second": 7.246, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.136140823364258, | |
| "learning_rate": 4.980016e-05, | |
| "loss": 1.5124, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_accuracy_per_token": 0.5323951840400696, | |
| "eval_loss": 1.4281255006790161, | |
| "eval_runtime": 215.8041, | |
| "eval_samples_per_second": 115.846, | |
| "eval_steps_per_second": 7.243, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.6090725660324097, | |
| "learning_rate": 4.976016e-05, | |
| "loss": 1.3463, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_accuracy_per_token": 0.5815901160240173, | |
| "eval_loss": 1.2842084169387817, | |
| "eval_runtime": 215.7888, | |
| "eval_samples_per_second": 115.854, | |
| "eval_steps_per_second": 7.243, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.4967901706695557, | |
| "learning_rate": 4.972016e-05, | |
| "loss": 1.1657, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_accuracy_per_token": 0.6198335886001587, | |
| "eval_loss": 1.1727232933044434, | |
| "eval_runtime": 215.8392, | |
| "eval_samples_per_second": 115.827, | |
| "eval_steps_per_second": 7.242, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.750229597091675, | |
| "learning_rate": 4.968016e-05, | |
| "loss": 1.0624, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_accuracy_per_token": 0.6496000289916992, | |
| "eval_loss": 1.0832889080047607, | |
| "eval_runtime": 216.0152, | |
| "eval_samples_per_second": 115.733, | |
| "eval_steps_per_second": 7.236, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.4956400394439697, | |
| "learning_rate": 4.9640160000000003e-05, | |
| "loss": 0.9957, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_accuracy_per_token": 0.6765108108520508, | |
| "eval_loss": 1.0036195516586304, | |
| "eval_runtime": 215.7199, | |
| "eval_samples_per_second": 115.891, | |
| "eval_steps_per_second": 7.246, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.816002368927002, | |
| "learning_rate": 4.9600160000000004e-05, | |
| "loss": 0.913, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_accuracy_per_token": 0.6984472274780273, | |
| "eval_loss": 0.9418182969093323, | |
| "eval_runtime": 215.8333, | |
| "eval_samples_per_second": 115.83, | |
| "eval_steps_per_second": 7.242, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.293124198913574, | |
| "learning_rate": 4.956016e-05, | |
| "loss": 0.8747, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_accuracy_per_token": 0.7186679244041443, | |
| "eval_loss": 0.8788002133369446, | |
| "eval_runtime": 215.9244, | |
| "eval_samples_per_second": 115.781, | |
| "eval_steps_per_second": 7.239, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.6259233951568604, | |
| "learning_rate": 4.9520160000000005e-05, | |
| "loss": 0.8064, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_accuracy_per_token": 0.7395341992378235, | |
| "eval_loss": 0.8194286823272705, | |
| "eval_runtime": 215.8802, | |
| "eval_samples_per_second": 115.805, | |
| "eval_steps_per_second": 7.24, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.7774460315704346, | |
| "learning_rate": 4.9480160000000005e-05, | |
| "loss": 0.7329, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_accuracy_per_token": 0.7511833310127258, | |
| "eval_loss": 0.7874646782875061, | |
| "eval_runtime": 215.9936, | |
| "eval_samples_per_second": 115.744, | |
| "eval_steps_per_second": 7.236, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.8111910820007324, | |
| "learning_rate": 4.9440160000000005e-05, | |
| "loss": 0.6541, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_accuracy_per_token": 0.7654264569282532, | |
| "eval_loss": 0.7462261915206909, | |
| "eval_runtime": 215.7774, | |
| "eval_samples_per_second": 115.86, | |
| "eval_steps_per_second": 7.244, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.734023332595825, | |
| "learning_rate": 4.940016e-05, | |
| "loss": 0.6368, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_accuracy_per_token": 0.7762272953987122, | |
| "eval_loss": 0.7165001630783081, | |
| "eval_runtime": 215.5714, | |
| "eval_samples_per_second": 115.971, | |
| "eval_steps_per_second": 7.25, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 3.085864782333374, | |
| "learning_rate": 4.936016e-05, | |
| "loss": 0.6019, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_accuracy_per_token": 0.7840808033943176, | |
| "eval_loss": 0.6941312551498413, | |
| "eval_runtime": 215.5763, | |
| "eval_samples_per_second": 115.968, | |
| "eval_steps_per_second": 7.25, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 2.5468504428863525, | |
| "learning_rate": 4.9320320000000004e-05, | |
| "loss": 0.5855, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_accuracy_per_token": 0.7905924916267395, | |
| "eval_loss": 0.6726610064506531, | |
| "eval_runtime": 215.5941, | |
| "eval_samples_per_second": 115.959, | |
| "eval_steps_per_second": 7.25, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.7991392612457275, | |
| "learning_rate": 4.9280320000000005e-05, | |
| "loss": 0.565, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_accuracy_per_token": 0.7970213890075684, | |
| "eval_loss": 0.6574403047561646, | |
| "eval_runtime": 215.6086, | |
| "eval_samples_per_second": 115.951, | |
| "eval_steps_per_second": 7.249, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 2.0875983238220215, | |
| "learning_rate": 4.924032e-05, | |
| "loss": 0.5437, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_accuracy_per_token": 0.8036227226257324, | |
| "eval_loss": 0.6450381278991699, | |
| "eval_runtime": 215.7698, | |
| "eval_samples_per_second": 115.864, | |
| "eval_steps_per_second": 7.244, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 2.820004940032959, | |
| "learning_rate": 4.920032e-05, | |
| "loss": 0.4618, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_accuracy_per_token": 0.8065482378005981, | |
| "eval_loss": 0.6290712952613831, | |
| "eval_runtime": 215.5858, | |
| "eval_samples_per_second": 115.963, | |
| "eval_steps_per_second": 7.25, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 2.534975051879883, | |
| "learning_rate": 4.916048e-05, | |
| "loss": 0.4836, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "eval_accuracy_per_token": 0.8109478950500488, | |
| "eval_loss": 0.621654748916626, | |
| "eval_runtime": 215.6832, | |
| "eval_samples_per_second": 115.911, | |
| "eval_steps_per_second": 7.247, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 2.2555694580078125, | |
| "learning_rate": 4.9120480000000004e-05, | |
| "loss": 0.4793, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "eval_accuracy_per_token": 0.8153020739555359, | |
| "eval_loss": 0.6032074093818665, | |
| "eval_runtime": 215.7864, | |
| "eval_samples_per_second": 115.855, | |
| "eval_steps_per_second": 7.243, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 2.6011011600494385, | |
| "learning_rate": 4.9080480000000004e-05, | |
| "loss": 0.4639, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "eval_accuracy_per_token": 0.8189014792442322, | |
| "eval_loss": 0.5884661078453064, | |
| "eval_runtime": 215.9181, | |
| "eval_samples_per_second": 115.785, | |
| "eval_steps_per_second": 7.239, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 2.201638698577881, | |
| "learning_rate": 4.904048e-05, | |
| "loss": 0.4734, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "eval_accuracy_per_token": 0.8225154280662537, | |
| "eval_loss": 0.5901506543159485, | |
| "eval_runtime": 215.7481, | |
| "eval_samples_per_second": 115.876, | |
| "eval_steps_per_second": 7.245, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 2.115771770477295, | |
| "learning_rate": 4.9000480000000005e-05, | |
| "loss": 0.4589, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy_per_token": 0.824912428855896, | |
| "eval_loss": 0.57036292552948, | |
| "eval_runtime": 215.9409, | |
| "eval_samples_per_second": 115.772, | |
| "eval_steps_per_second": 7.238, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 2.13425350189209, | |
| "learning_rate": 4.8960480000000005e-05, | |
| "loss": 0.3846, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "eval_accuracy_per_token": 0.8280689716339111, | |
| "eval_loss": 0.5732296705245972, | |
| "eval_runtime": 215.8629, | |
| "eval_samples_per_second": 115.814, | |
| "eval_steps_per_second": 7.241, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 2.227823257446289, | |
| "learning_rate": 4.892048e-05, | |
| "loss": 0.3907, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "eval_accuracy_per_token": 0.8309552073478699, | |
| "eval_loss": 0.5606500506401062, | |
| "eval_runtime": 215.8455, | |
| "eval_samples_per_second": 115.824, | |
| "eval_steps_per_second": 7.241, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 1.9877572059631348, | |
| "learning_rate": 4.888048e-05, | |
| "loss": 0.3781, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "eval_accuracy_per_token": 0.833513617515564, | |
| "eval_loss": 0.5528165102005005, | |
| "eval_runtime": 216.0002, | |
| "eval_samples_per_second": 115.741, | |
| "eval_steps_per_second": 7.236, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 2.0553159713745117, | |
| "learning_rate": 4.884048e-05, | |
| "loss": 0.3914, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "eval_accuracy_per_token": 0.8335487246513367, | |
| "eval_loss": 0.549156904220581, | |
| "eval_runtime": 215.7201, | |
| "eval_samples_per_second": 115.891, | |
| "eval_steps_per_second": 7.246, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 2.2741925716400146, | |
| "learning_rate": 4.880048000000001e-05, | |
| "loss": 0.3894, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "eval_accuracy_per_token": 0.8362560272216797, | |
| "eval_loss": 0.5504783391952515, | |
| "eval_runtime": 215.7979, | |
| "eval_samples_per_second": 115.849, | |
| "eval_steps_per_second": 7.243, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 2.08858585357666, | |
| "learning_rate": 4.876048e-05, | |
| "loss": 0.3944, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "eval_accuracy_per_token": 0.8395103812217712, | |
| "eval_loss": 0.5324631929397583, | |
| "eval_runtime": 215.9188, | |
| "eval_samples_per_second": 115.784, | |
| "eval_steps_per_second": 7.239, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 2.1368110179901123, | |
| "learning_rate": 4.872048e-05, | |
| "loss": 0.3307, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "eval_accuracy_per_token": 0.839469850063324, | |
| "eval_loss": 0.5465147495269775, | |
| "eval_runtime": 216.142, | |
| "eval_samples_per_second": 115.665, | |
| "eval_steps_per_second": 7.231, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 2.2641913890838623, | |
| "learning_rate": 4.868048e-05, | |
| "loss": 0.3125, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "eval_accuracy_per_token": 0.8418903946876526, | |
| "eval_loss": 0.5389049053192139, | |
| "eval_runtime": 216.0271, | |
| "eval_samples_per_second": 115.726, | |
| "eval_steps_per_second": 7.235, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 1.6883608102798462, | |
| "learning_rate": 4.864048e-05, | |
| "loss": 0.3213, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "eval_accuracy_per_token": 0.8448163270950317, | |
| "eval_loss": 0.5376200675964355, | |
| "eval_runtime": 216.0239, | |
| "eval_samples_per_second": 115.728, | |
| "eval_steps_per_second": 7.235, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 2.2113685607910156, | |
| "learning_rate": 4.860048e-05, | |
| "loss": 0.3269, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "eval_accuracy_per_token": 0.8447655439376831, | |
| "eval_loss": 0.5422470569610596, | |
| "eval_runtime": 215.062, | |
| "eval_samples_per_second": 116.246, | |
| "eval_steps_per_second": 7.268, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 1.9853556156158447, | |
| "learning_rate": 4.856048e-05, | |
| "loss": 0.3374, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "eval_accuracy_per_token": 0.847917914390564, | |
| "eval_loss": 0.5255292057991028, | |
| "eval_runtime": 213.7043, | |
| "eval_samples_per_second": 116.984, | |
| "eval_steps_per_second": 7.314, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 2.584245204925537, | |
| "learning_rate": 4.852048e-05, | |
| "loss": 0.3229, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "eval_accuracy_per_token": 0.8491668701171875, | |
| "eval_loss": 0.5113908052444458, | |
| "eval_runtime": 214.1955, | |
| "eval_samples_per_second": 116.716, | |
| "eval_steps_per_second": 7.297, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 2.0866963863372803, | |
| "learning_rate": 4.848056e-05, | |
| "loss": 0.3061, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "eval_accuracy_per_token": 0.8502768874168396, | |
| "eval_loss": 0.5418649911880493, | |
| "eval_runtime": 214.0933, | |
| "eval_samples_per_second": 116.771, | |
| "eval_steps_per_second": 7.301, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 2.0000994205474854, | |
| "learning_rate": 4.844056e-05, | |
| "loss": 0.2556, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "eval_accuracy_per_token": 0.85043865442276, | |
| "eval_loss": 0.542536199092865, | |
| "eval_runtime": 213.1241, | |
| "eval_samples_per_second": 117.303, | |
| "eval_steps_per_second": 7.334, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 2.849112033843994, | |
| "learning_rate": 4.840064e-05, | |
| "loss": 0.2672, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "eval_accuracy_per_token": 0.8512768149375916, | |
| "eval_loss": 0.5348747372627258, | |
| "eval_runtime": 215.5865, | |
| "eval_samples_per_second": 115.963, | |
| "eval_steps_per_second": 7.25, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 6.5600000000000005, | |
| "grad_norm": 2.216937303543091, | |
| "learning_rate": 4.836064e-05, | |
| "loss": 0.2791, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 6.5600000000000005, | |
| "eval_accuracy_per_token": 0.8536346554756165, | |
| "eval_loss": 0.5185777544975281, | |
| "eval_runtime": 215.5365, | |
| "eval_samples_per_second": 115.99, | |
| "eval_steps_per_second": 7.252, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 2.2237415313720703, | |
| "learning_rate": 4.832064e-05, | |
| "loss": 0.2792, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "eval_accuracy_per_token": 0.8542323112487793, | |
| "eval_loss": 0.5137789249420166, | |
| "eval_runtime": 215.6897, | |
| "eval_samples_per_second": 115.907, | |
| "eval_steps_per_second": 7.247, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 2.6220571994781494, | |
| "learning_rate": 4.828064e-05, | |
| "loss": 0.2867, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "eval_accuracy_per_token": 0.8552057147026062, | |
| "eval_loss": 0.5025785565376282, | |
| "eval_runtime": 215.6135, | |
| "eval_samples_per_second": 115.948, | |
| "eval_steps_per_second": 7.249, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 2.4813005924224854, | |
| "learning_rate": 4.82408e-05, | |
| "loss": 0.2628, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "eval_accuracy_per_token": 0.8553439974784851, | |
| "eval_loss": 0.5604137778282166, | |
| "eval_runtime": 215.6781, | |
| "eval_samples_per_second": 115.913, | |
| "eval_steps_per_second": 7.247, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 2.316831588745117, | |
| "learning_rate": 4.82008e-05, | |
| "loss": 0.2124, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "eval_accuracy_per_token": 0.8566614389419556, | |
| "eval_loss": 0.5436919927597046, | |
| "eval_runtime": 215.5335, | |
| "eval_samples_per_second": 115.991, | |
| "eval_steps_per_second": 7.252, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "grad_norm": 2.0445761680603027, | |
| "learning_rate": 4.81608e-05, | |
| "loss": 0.2246, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "eval_accuracy_per_token": 0.8579392433166504, | |
| "eval_loss": 0.5327755212783813, | |
| "eval_runtime": 215.6442, | |
| "eval_samples_per_second": 115.932, | |
| "eval_steps_per_second": 7.248, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 1.8908203840255737, | |
| "learning_rate": 4.81208e-05, | |
| "loss": 0.2266, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "eval_accuracy_per_token": 0.8593510389328003, | |
| "eval_loss": 0.5242588520050049, | |
| "eval_runtime": 215.5467, | |
| "eval_samples_per_second": 115.984, | |
| "eval_steps_per_second": 7.251, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 1.698264241218567, | |
| "learning_rate": 4.80808e-05, | |
| "loss": 0.2335, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "eval_accuracy_per_token": 0.8600181341171265, | |
| "eval_loss": 0.5188168883323669, | |
| "eval_runtime": 215.5491, | |
| "eval_samples_per_second": 115.983, | |
| "eval_steps_per_second": 7.251, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 2.196866035461426, | |
| "learning_rate": 4.80408e-05, | |
| "loss": 0.2375, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "eval_accuracy_per_token": 0.8607679605484009, | |
| "eval_loss": 0.5174950957298279, | |
| "eval_runtime": 215.872, | |
| "eval_samples_per_second": 115.809, | |
| "eval_steps_per_second": 7.24, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 2.599553346633911, | |
| "learning_rate": 4.800088e-05, | |
| "loss": 0.2426, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy_per_token": 0.8615555763244629, | |
| "eval_loss": 0.5121429562568665, | |
| "eval_runtime": 215.6323, | |
| "eval_samples_per_second": 115.938, | |
| "eval_steps_per_second": 7.248, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "grad_norm": 2.2151308059692383, | |
| "learning_rate": 4.796088e-05, | |
| "loss": 0.1708, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "eval_accuracy_per_token": 0.8606916666030884, | |
| "eval_loss": 0.5612675547599792, | |
| "eval_runtime": 215.6202, | |
| "eval_samples_per_second": 115.945, | |
| "eval_steps_per_second": 7.249, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 2.0478286743164062, | |
| "learning_rate": 4.792088e-05, | |
| "loss": 0.1739, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "eval_accuracy_per_token": 0.8610015511512756, | |
| "eval_loss": 0.5620591640472412, | |
| "eval_runtime": 215.5503, | |
| "eval_samples_per_second": 115.982, | |
| "eval_steps_per_second": 7.251, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 2.2109131813049316, | |
| "learning_rate": 4.788088e-05, | |
| "loss": 0.1893, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "eval_accuracy_per_token": 0.861874520778656, | |
| "eval_loss": 0.5581731200218201, | |
| "eval_runtime": 215.6555, | |
| "eval_samples_per_second": 115.926, | |
| "eval_steps_per_second": 7.248, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 2.688985586166382, | |
| "learning_rate": 4.784096e-05, | |
| "loss": 0.1948, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "eval_accuracy_per_token": 0.864142119884491, | |
| "eval_loss": 0.5396531820297241, | |
| "eval_runtime": 215.5944, | |
| "eval_samples_per_second": 115.959, | |
| "eval_steps_per_second": 7.25, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 2.5816333293914795, | |
| "learning_rate": 4.780096e-05, | |
| "loss": 0.2022, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "eval_accuracy_per_token": 0.8647276759147644, | |
| "eval_loss": 0.537101149559021, | |
| "eval_runtime": 215.522, | |
| "eval_samples_per_second": 115.997, | |
| "eval_steps_per_second": 7.252, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 2.262164354324341, | |
| "learning_rate": 4.776096e-05, | |
| "loss": 0.2027, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "eval_accuracy_per_token": 0.8652188777923584, | |
| "eval_loss": 0.5241075158119202, | |
| "eval_runtime": 215.8588, | |
| "eval_samples_per_second": 115.816, | |
| "eval_steps_per_second": 7.241, | |
| "step": 28000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 625000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 200, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 19, | |
| "early_stopping_threshold": 0.01 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.043038124900352e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |