diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,56019 +2,22675 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 5.233889434085705, - "eval_steps": 500, - "global_step": 80000, + "epoch": 8.372652927706902, + "eval_steps": 1000, + "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0006542361792607131, - "grad_norm": 2.3467204570770264, - "learning_rate": 1.0000000000000001e-07, - "loss": 0.208, + "epoch": 0.0026169447170428526, + "grad_norm": 0.5513588786125183, + "learning_rate": 2.25e-07, + "loss": 0.051, "step": 10 }, { - "epoch": 0.0013084723585214263, - "grad_norm": 2.2877755165100098, - "learning_rate": 2.111111111111111e-07, - "loss": 0.2277, + "epoch": 0.005233889434085705, + "grad_norm": 0.45345941185951233, + "learning_rate": 4.75e-07, + "loss": 0.0552, "step": 20 }, { - "epoch": 0.001962708537782139, - "grad_norm": 2.4381093978881836, - "learning_rate": 3.222222222222222e-07, - "loss": 0.2252, + "epoch": 0.007850834151128557, + "grad_norm": 0.5479740500450134, + "learning_rate": 7.25e-07, + "loss": 0.0607, "step": 30 }, { - "epoch": 0.0026169447170428526, - "grad_norm": 2.922689914703369, - "learning_rate": 4.3333333333333335e-07, - "loss": 0.2161, + "epoch": 0.01046777886817141, + "grad_norm": 0.36328455805778503, + "learning_rate": 9.75e-07, + "loss": 0.0478, "step": 40 }, { - "epoch": 0.0032711808963035655, - "grad_norm": 2.4064548015594482, - "learning_rate": 5.444444444444444e-07, - "loss": 0.2299, + "epoch": 0.013084723585214262, + "grad_norm": 0.6090478301048279, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.05, "step": 50 }, { - "epoch": 0.003925417075564278, - "grad_norm": 2.436875581741333, - "learning_rate": 6.555555555555556e-07, - "loss": 0.2218, + "epoch": 0.015701668302257114, + "grad_norm": 0.4246404469013214, + "learning_rate": 1.475e-06, + "loss": 0.0514, "step": 60 }, { - "epoch": 0.004579653254824992, - "grad_norm": 2.463653802871704, - "learning_rate": 7.666666666666667e-07, - "loss": 0.2234, + "epoch": 0.01831861301929997, + "grad_norm": 0.3821658194065094, + "learning_rate": 1.7250000000000002e-06, + "loss": 0.0576, "step": 70 }, { - "epoch": 0.005233889434085705, - "grad_norm": 1.9082249402999878, - "learning_rate": 8.777777777777779e-07, - "loss": 0.2155, + "epoch": 0.02093555773634282, + "grad_norm": 0.4736561179161072, + "learning_rate": 1.975e-06, + "loss": 0.0503, "step": 80 }, { - "epoch": 0.005888125613346418, - "grad_norm": 1.9711594581604004, - "learning_rate": 9.888888888888888e-07, - "loss": 0.2167, + "epoch": 0.023552502453385672, + "grad_norm": 0.3493232727050781, + "learning_rate": 2.225e-06, + "loss": 0.0521, "step": 90 }, { - "epoch": 0.006542361792607131, - "grad_norm": 2.022458076477051, - "learning_rate": 1.1e-06, - "loss": 0.2089, + "epoch": 0.026169447170428524, + "grad_norm": 0.3130541145801544, + "learning_rate": 2.4750000000000004e-06, + "loss": 0.049, "step": 100 }, { - "epoch": 0.007196597971867844, - "grad_norm": 2.3458659648895264, - "learning_rate": 1.2111111111111111e-06, - "loss": 0.2131, + "epoch": 0.028786391887471376, + "grad_norm": 0.40587979555130005, + "learning_rate": 2.725e-06, + "loss": 0.0569, "step": 110 }, { - "epoch": 0.007850834151128557, - "grad_norm": 2.2194032669067383, - "learning_rate": 1.3222222222222222e-06, - "loss": 0.2217, + "epoch": 0.03140333660451423, + "grad_norm": 0.2859586179256439, + "learning_rate": 2.975e-06, + "loss": 0.0504, "step": 120 }, { - "epoch": 0.00850507033038927, - "grad_norm": 2.2126388549804688, - "learning_rate": 1.4333333333333333e-06, - "loss": 0.2191, + "epoch": 0.03402028132155708, + "grad_norm": 0.38783594965934753, + "learning_rate": 3.225e-06, + "loss": 0.0447, "step": 130 }, { - "epoch": 0.009159306509649984, - "grad_norm": 2.212867021560669, - "learning_rate": 1.5444444444444446e-06, - "loss": 0.2201, + "epoch": 0.03663722603859994, + "grad_norm": 0.24415196478366852, + "learning_rate": 3.4750000000000006e-06, + "loss": 0.0534, "step": 140 }, { - "epoch": 0.009813542688910697, - "grad_norm": 2.3964221477508545, - "learning_rate": 1.6555555555555557e-06, - "loss": 0.2154, + "epoch": 0.03925417075564279, + "grad_norm": 0.4478183388710022, + "learning_rate": 3.725e-06, + "loss": 0.057, "step": 150 }, { - "epoch": 0.01046777886817141, - "grad_norm": 2.0090246200561523, - "learning_rate": 1.7666666666666668e-06, - "loss": 0.2154, + "epoch": 0.04187111547268564, + "grad_norm": 0.4929760694503784, + "learning_rate": 3.975e-06, + "loss": 0.0522, "step": 160 }, { - "epoch": 0.011122015047432123, - "grad_norm": 2.2598977088928223, - "learning_rate": 1.877777777777778e-06, - "loss": 0.2057, + "epoch": 0.04448806018972849, + "grad_norm": 0.4541402757167816, + "learning_rate": 4.225e-06, + "loss": 0.0484, "step": 170 }, { - "epoch": 0.011776251226692836, - "grad_norm": 2.0792648792266846, - "learning_rate": 1.988888888888889e-06, - "loss": 0.1998, + "epoch": 0.047105004906771344, + "grad_norm": 0.34477248787879944, + "learning_rate": 4.475e-06, + "loss": 0.044, "step": 180 }, { - "epoch": 0.012430487405953549, - "grad_norm": 2.0464093685150146, - "learning_rate": 2.1000000000000002e-06, - "loss": 0.1958, + "epoch": 0.049721949623814196, + "grad_norm": 0.4181000292301178, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.0489, "step": 190 }, { - "epoch": 0.013084723585214262, - "grad_norm": 1.8751633167266846, - "learning_rate": 2.2111111111111113e-06, - "loss": 0.1874, + "epoch": 0.05233889434085705, + "grad_norm": 0.38417401909828186, + "learning_rate": 4.975000000000001e-06, + "loss": 0.0479, "step": 200 }, { - "epoch": 0.013738959764474975, - "grad_norm": 1.9019556045532227, - "learning_rate": 2.322222222222222e-06, - "loss": 0.191, + "epoch": 0.0549558390578999, + "grad_norm": 0.34942829608917236, + "learning_rate": 5.225e-06, + "loss": 0.0481, "step": 210 }, { - "epoch": 0.014393195943735688, - "grad_norm": 1.7033251523971558, - "learning_rate": 2.4333333333333335e-06, - "loss": 0.1798, + "epoch": 0.05757278377494275, + "grad_norm": 0.5058625936508179, + "learning_rate": 5.475e-06, + "loss": 0.0508, "step": 220 }, { - "epoch": 0.015047432122996402, - "grad_norm": 1.5006437301635742, - "learning_rate": 2.5444444444444446e-06, - "loss": 0.1863, + "epoch": 0.06018972849198561, + "grad_norm": 0.39895838499069214, + "learning_rate": 5.725e-06, + "loss": 0.0518, "step": 230 }, { - "epoch": 0.015701668302257114, - "grad_norm": 1.7847758531570435, - "learning_rate": 2.6555555555555556e-06, - "loss": 0.1897, + "epoch": 0.06280667320902845, + "grad_norm": 0.32901522517204285, + "learning_rate": 5.975e-06, + "loss": 0.0552, "step": 240 }, { - "epoch": 0.016355904481517827, - "grad_norm": 2.0124351978302, - "learning_rate": 2.7666666666666667e-06, - "loss": 0.1816, + "epoch": 0.0654236179260713, + "grad_norm": 0.3974279761314392, + "learning_rate": 6.2250000000000005e-06, + "loss": 0.0468, "step": 250 }, { - "epoch": 0.01701014066077854, - "grad_norm": 1.3791583776474, - "learning_rate": 2.877777777777778e-06, - "loss": 0.1895, + "epoch": 0.06804056264311416, + "grad_norm": 0.4717833697795868, + "learning_rate": 6.475000000000001e-06, + "loss": 0.05, "step": 260 }, { - "epoch": 0.017664376840039256, - "grad_norm": 1.6091564893722534, - "learning_rate": 2.988888888888889e-06, - "loss": 0.174, + "epoch": 0.07065750736015702, + "grad_norm": 0.3713611960411072, + "learning_rate": 6.725000000000001e-06, + "loss": 0.0486, "step": 270 }, { - "epoch": 0.01831861301929997, - "grad_norm": 1.2440049648284912, - "learning_rate": 3.1e-06, - "loss": 0.1737, + "epoch": 0.07327445207719988, + "grad_norm": 0.3582926094532013, + "learning_rate": 6.975000000000001e-06, + "loss": 0.0451, "step": 280 }, { - "epoch": 0.018972849198560682, - "grad_norm": 1.4476827383041382, - "learning_rate": 3.2111111111111115e-06, - "loss": 0.1656, + "epoch": 0.07589139679424273, + "grad_norm": 0.3125925660133362, + "learning_rate": 7.2249999999999994e-06, + "loss": 0.0463, "step": 290 }, { - "epoch": 0.019627085377821395, - "grad_norm": 1.3262630701065063, - "learning_rate": 3.3222222222222226e-06, - "loss": 0.1588, + "epoch": 0.07850834151128558, + "grad_norm": 0.33565473556518555, + "learning_rate": 7.4750000000000004e-06, + "loss": 0.0512, "step": 300 }, { - "epoch": 0.020281321557082108, - "grad_norm": 1.3778283596038818, - "learning_rate": 3.4333333333333336e-06, - "loss": 0.1677, + "epoch": 0.08112528622832843, + "grad_norm": 0.6160638928413391, + "learning_rate": 7.725e-06, + "loss": 0.0529, "step": 310 }, { - "epoch": 0.02093555773634282, - "grad_norm": 1.322625756263733, - "learning_rate": 3.5444444444444447e-06, - "loss": 0.1776, + "epoch": 0.08374223094537128, + "grad_norm": 0.3819540739059448, + "learning_rate": 7.975e-06, + "loss": 0.0417, "step": 320 }, { - "epoch": 0.021589793915603533, - "grad_norm": 1.2029178142547607, - "learning_rate": 3.655555555555556e-06, - "loss": 0.1759, + "epoch": 0.08635917566241413, + "grad_norm": 0.34822750091552734, + "learning_rate": 8.225e-06, + "loss": 0.0464, "step": 330 }, { - "epoch": 0.022244030094864246, - "grad_norm": 1.393506646156311, - "learning_rate": 3.766666666666667e-06, - "loss": 0.1781, + "epoch": 0.08897612037945699, + "grad_norm": 0.47799152135849, + "learning_rate": 8.475000000000001e-06, + "loss": 0.0559, "step": 340 }, { - "epoch": 0.02289826627412496, - "grad_norm": 1.2977313995361328, - "learning_rate": 3.877777777777778e-06, - "loss": 0.1674, + "epoch": 0.09159306509649984, + "grad_norm": 0.33403629064559937, + "learning_rate": 8.725e-06, + "loss": 0.0473, "step": 350 }, { - "epoch": 0.023552502453385672, - "grad_norm": 1.1824816465377808, - "learning_rate": 3.9888888888888895e-06, - "loss": 0.165, + "epoch": 0.09421000981354269, + "grad_norm": 0.4290122091770172, + "learning_rate": 8.975e-06, + "loss": 0.0512, "step": 360 }, { - "epoch": 0.024206738632646385, - "grad_norm": 1.364259123802185, - "learning_rate": 4.1000000000000006e-06, - "loss": 0.1652, + "epoch": 0.09682695453058554, + "grad_norm": 0.46314916014671326, + "learning_rate": 9.225e-06, + "loss": 0.0451, "step": 370 }, { - "epoch": 0.024860974811907098, - "grad_norm": 1.3480299711227417, - "learning_rate": 4.211111111111112e-06, - "loss": 0.156, + "epoch": 0.09944389924762839, + "grad_norm": 0.5091361403465271, + "learning_rate": 9.475e-06, + "loss": 0.0451, "step": 380 }, { - "epoch": 0.02551521099116781, - "grad_norm": 1.1253418922424316, - "learning_rate": 4.322222222222223e-06, - "loss": 0.1666, + "epoch": 0.10206084396467124, + "grad_norm": 0.4810822308063507, + "learning_rate": 9.725000000000001e-06, + "loss": 0.0435, "step": 390 }, { - "epoch": 0.026169447170428524, - "grad_norm": 0.9541553258895874, - "learning_rate": 4.433333333333334e-06, - "loss": 0.1679, + "epoch": 0.1046777886817141, + "grad_norm": 0.5177111625671387, + "learning_rate": 9.975e-06, + "loss": 0.0476, "step": 400 }, { - "epoch": 0.026823683349689237, - "grad_norm": 1.3034117221832275, - "learning_rate": 4.544444444444445e-06, - "loss": 0.167, + "epoch": 0.10729473339875695, + "grad_norm": 0.4763142764568329, + "learning_rate": 1.0225e-05, + "loss": 0.0493, "step": 410 }, { - "epoch": 0.02747791952894995, - "grad_norm": 1.1535122394561768, - "learning_rate": 4.655555555555556e-06, - "loss": 0.1562, + "epoch": 0.1099116781157998, + "grad_norm": 0.4892769753932953, + "learning_rate": 1.0475e-05, + "loss": 0.048, "step": 420 }, { - "epoch": 0.028132155708210663, - "grad_norm": 1.2631272077560425, - "learning_rate": 4.766666666666667e-06, - "loss": 0.1656, + "epoch": 0.11252862283284265, + "grad_norm": 0.41679662466049194, + "learning_rate": 1.0725e-05, + "loss": 0.052, "step": 430 }, { - "epoch": 0.028786391887471376, - "grad_norm": 0.9639325737953186, - "learning_rate": 4.877777777777778e-06, - "loss": 0.1696, + "epoch": 0.1151455675498855, + "grad_norm": 0.5526682734489441, + "learning_rate": 1.0975e-05, + "loss": 0.046, "step": 440 }, { - "epoch": 0.029440628066732092, - "grad_norm": 1.205918550491333, - "learning_rate": 4.988888888888889e-06, - "loss": 0.161, + "epoch": 0.11776251226692837, + "grad_norm": 0.46023502945899963, + "learning_rate": 1.1225e-05, + "loss": 0.0478, "step": 450 }, { - "epoch": 0.030094864245992805, - "grad_norm": 1.0042939186096191, - "learning_rate": 5.1e-06, - "loss": 0.1651, + "epoch": 0.12037945698397122, + "grad_norm": 0.41873809695243835, + "learning_rate": 1.1475000000000001e-05, + "loss": 0.0439, "step": 460 }, { - "epoch": 0.030749100425253518, - "grad_norm": 1.0380401611328125, - "learning_rate": 5.211111111111111e-06, - "loss": 0.162, + "epoch": 0.12299640170101407, + "grad_norm": 0.3857043981552124, + "learning_rate": 1.1725e-05, + "loss": 0.0409, "step": 470 }, { - "epoch": 0.03140333660451423, - "grad_norm": 0.9904937148094177, - "learning_rate": 5.3222222222222225e-06, - "loss": 0.1663, + "epoch": 0.1256133464180569, + "grad_norm": 0.4027094841003418, + "learning_rate": 1.1975e-05, + "loss": 0.045, "step": 480 }, { - "epoch": 0.03205757278377494, - "grad_norm": 1.1420738697052002, - "learning_rate": 5.4333333333333335e-06, - "loss": 0.1566, + "epoch": 0.12823029113509976, + "grad_norm": 0.45145806670188904, + "learning_rate": 1.2225e-05, + "loss": 0.0453, "step": 490 }, { - "epoch": 0.03271180896303565, - "grad_norm": 1.0128347873687744, - "learning_rate": 5.544444444444445e-06, - "loss": 0.1576, + "epoch": 0.1308472358521426, + "grad_norm": 0.3627549409866333, + "learning_rate": 1.2475e-05, + "loss": 0.045, "step": 500 }, { - "epoch": 0.033366045142296366, - "grad_norm": 0.9660897254943848, - "learning_rate": 5.655555555555556e-06, - "loss": 0.1549, + "epoch": 0.13346418056918546, + "grad_norm": 0.4891456067562103, + "learning_rate": 1.2725000000000001e-05, + "loss": 0.0415, "step": 510 }, { - "epoch": 0.03402028132155708, - "grad_norm": 1.0083342790603638, - "learning_rate": 5.766666666666667e-06, - "loss": 0.1575, + "epoch": 0.13608112528622832, + "grad_norm": 0.4617407023906708, + "learning_rate": 1.2975e-05, + "loss": 0.0454, "step": 520 }, { - "epoch": 0.03467451750081779, - "grad_norm": 1.092995047569275, - "learning_rate": 5.877777777777778e-06, - "loss": 0.162, + "epoch": 0.13869807000327117, + "grad_norm": 0.5147842764854431, + "learning_rate": 1.3225000000000001e-05, + "loss": 0.047, "step": 530 }, { - "epoch": 0.03532875368007851, - "grad_norm": 0.9964866638183594, - "learning_rate": 5.988888888888889e-06, - "loss": 0.1518, + "epoch": 0.14131501472031405, + "grad_norm": 0.47616279125213623, + "learning_rate": 1.3475000000000002e-05, + "loss": 0.0481, "step": 540 }, { - "epoch": 0.035982989859339225, - "grad_norm": 1.2313237190246582, - "learning_rate": 6.1e-06, - "loss": 0.151, + "epoch": 0.1439319594373569, + "grad_norm": 0.4826867878437042, + "learning_rate": 1.3725000000000002e-05, + "loss": 0.0381, "step": 550 }, { - "epoch": 0.03663722603859994, - "grad_norm": 1.1173750162124634, - "learning_rate": 6.211111111111111e-06, - "loss": 0.1651, + "epoch": 0.14654890415439975, + "grad_norm": 0.4853934943675995, + "learning_rate": 1.3975000000000003e-05, + "loss": 0.0479, "step": 560 }, { - "epoch": 0.03729146221786065, - "grad_norm": 1.0656248331069946, - "learning_rate": 6.322222222222222e-06, - "loss": 0.151, + "epoch": 0.1491658488714426, + "grad_norm": 0.7997105121612549, + "learning_rate": 1.4225e-05, + "loss": 0.0519, "step": 570 }, { - "epoch": 0.037945698397121363, - "grad_norm": 1.1378906965255737, - "learning_rate": 6.433333333333334e-06, - "loss": 0.1666, + "epoch": 0.15178279358848545, + "grad_norm": 0.352081298828125, + "learning_rate": 1.4475e-05, + "loss": 0.0469, "step": 580 }, { - "epoch": 0.038599934576382076, - "grad_norm": 1.271768569946289, - "learning_rate": 6.544444444444444e-06, - "loss": 0.1557, + "epoch": 0.1543997383055283, + "grad_norm": 0.5584960579872131, + "learning_rate": 1.4725e-05, + "loss": 0.045, "step": 590 }, { - "epoch": 0.03925417075564279, - "grad_norm": 1.0535553693771362, - "learning_rate": 6.655555555555556e-06, - "loss": 0.1423, + "epoch": 0.15701668302257116, + "grad_norm": 0.5723614692687988, + "learning_rate": 1.4975e-05, + "loss": 0.0506, "step": 600 }, { - "epoch": 0.0399084069349035, - "grad_norm": 1.0603430271148682, - "learning_rate": 6.766666666666667e-06, - "loss": 0.1556, + "epoch": 0.159633627739614, + "grad_norm": 0.608233630657196, + "learning_rate": 1.5225e-05, + "loss": 0.047, "step": 610 }, { - "epoch": 0.040562643114164215, - "grad_norm": 3.6667890548706055, - "learning_rate": 6.877777777777778e-06, - "loss": 0.1488, + "epoch": 0.16225057245665686, + "grad_norm": 0.42129021883010864, + "learning_rate": 1.5475e-05, + "loss": 0.0462, "step": 620 }, { - "epoch": 0.04121687929342493, - "grad_norm": 0.9498468041419983, - "learning_rate": 6.9888888888888895e-06, - "loss": 0.155, + "epoch": 0.1648675171736997, + "grad_norm": 0.2876960039138794, + "learning_rate": 1.5725e-05, + "loss": 0.0504, "step": 630 }, { - "epoch": 0.04187111547268564, - "grad_norm": 1.061012625694275, - "learning_rate": 7.1e-06, - "loss": 0.1559, + "epoch": 0.16748446189074256, + "grad_norm": 1.045732021331787, + "learning_rate": 1.5975000000000002e-05, + "loss": 0.0519, "step": 640 }, { - "epoch": 0.042525351651946354, - "grad_norm": 1.0795965194702148, - "learning_rate": 7.211111111111112e-06, - "loss": 0.1494, + "epoch": 0.17010140660778542, + "grad_norm": 0.707153856754303, + "learning_rate": 1.6225e-05, + "loss": 0.0486, "step": 650 }, { - "epoch": 0.04317958783120707, - "grad_norm": 0.8958123922348022, - "learning_rate": 7.322222222222222e-06, - "loss": 0.1435, + "epoch": 0.17271835132482827, + "grad_norm": 0.5502765774726868, + "learning_rate": 1.6475e-05, + "loss": 0.0515, "step": 660 }, { - "epoch": 0.04383382401046778, - "grad_norm": 0.9391831755638123, - "learning_rate": 7.433333333333334e-06, - "loss": 0.1466, + "epoch": 0.17533529604187112, + "grad_norm": 0.6845604181289673, + "learning_rate": 1.6725000000000003e-05, + "loss": 0.0443, "step": 670 }, { - "epoch": 0.04448806018972849, - "grad_norm": 1.4169477224349976, - "learning_rate": 7.544444444444444e-06, - "loss": 0.1394, + "epoch": 0.17795224075891397, + "grad_norm": 0.7459421157836914, + "learning_rate": 1.6975000000000003e-05, + "loss": 0.0428, "step": 680 }, { - "epoch": 0.045142296368989206, - "grad_norm": 1.0921765565872192, - "learning_rate": 7.655555555555556e-06, - "loss": 0.1522, + "epoch": 0.18056918547595682, + "grad_norm": 0.4575180411338806, + "learning_rate": 1.7225e-05, + "loss": 0.042, "step": 690 }, { - "epoch": 0.04579653254824992, - "grad_norm": 0.8971364498138428, - "learning_rate": 7.766666666666666e-06, - "loss": 0.1567, + "epoch": 0.18318613019299967, + "grad_norm": 0.8352943062782288, + "learning_rate": 1.7475e-05, + "loss": 0.0448, "step": 700 }, { - "epoch": 0.04645076872751063, - "grad_norm": 0.9771297574043274, - "learning_rate": 7.877777777777778e-06, - "loss": 0.1562, + "epoch": 0.18580307491004253, + "grad_norm": 0.6489511728286743, + "learning_rate": 1.7725e-05, + "loss": 0.0463, "step": 710 }, { - "epoch": 0.047105004906771344, - "grad_norm": 1.0776768922805786, - "learning_rate": 7.988888888888888e-06, - "loss": 0.1485, + "epoch": 0.18842001962708538, + "grad_norm": 0.5922821164131165, + "learning_rate": 1.7975e-05, + "loss": 0.0487, "step": 720 }, { - "epoch": 0.04775924108603206, - "grad_norm": 1.0369359254837036, - "learning_rate": 8.1e-06, - "loss": 0.1524, + "epoch": 0.19103696434412823, + "grad_norm": 0.6881849765777588, + "learning_rate": 1.8225e-05, + "loss": 0.0492, "step": 730 }, { - "epoch": 0.04841347726529277, - "grad_norm": 1.057263731956482, - "learning_rate": 8.21111111111111e-06, - "loss": 0.1542, + "epoch": 0.19365390906117108, + "grad_norm": 0.6800899505615234, + "learning_rate": 1.8475000000000002e-05, + "loss": 0.0521, "step": 740 }, { - "epoch": 0.04906771344455348, - "grad_norm": 1.065109372138977, - "learning_rate": 8.322222222222223e-06, - "loss": 0.1539, + "epoch": 0.19627085377821393, + "grad_norm": 0.578558623790741, + "learning_rate": 1.8725e-05, + "loss": 0.0489, "step": 750 }, { - "epoch": 0.049721949623814196, - "grad_norm": 1.0507162809371948, - "learning_rate": 8.433333333333333e-06, - "loss": 0.1383, + "epoch": 0.19888779849525678, + "grad_norm": 0.5541056990623474, + "learning_rate": 1.8975e-05, + "loss": 0.0487, "step": 760 }, { - "epoch": 0.05037618580307491, - "grad_norm": 0.9928821325302124, - "learning_rate": 8.544444444444445e-06, - "loss": 0.1459, + "epoch": 0.20150474321229964, + "grad_norm": 0.6745200157165527, + "learning_rate": 1.9225e-05, + "loss": 0.0542, "step": 770 }, { - "epoch": 0.05103042198233562, - "grad_norm": 1.1054047346115112, - "learning_rate": 8.655555555555555e-06, - "loss": 0.1455, + "epoch": 0.2041216879293425, + "grad_norm": 0.7663201093673706, + "learning_rate": 1.9475000000000002e-05, + "loss": 0.0502, "step": 780 }, { - "epoch": 0.051684658161596335, - "grad_norm": 0.8906415104866028, - "learning_rate": 8.766666666666667e-06, - "loss": 0.1351, + "epoch": 0.20673863264638534, + "grad_norm": 0.5854843854904175, + "learning_rate": 1.9725000000000002e-05, + "loss": 0.0441, "step": 790 }, { - "epoch": 0.05233889434085705, - "grad_norm": 0.891897439956665, - "learning_rate": 8.877777777777777e-06, - "loss": 0.1536, + "epoch": 0.2093555773634282, + "grad_norm": 0.580001175403595, + "learning_rate": 1.9975e-05, + "loss": 0.0496, "step": 800 }, { - "epoch": 0.05299313052011776, - "grad_norm": 0.8980157971382141, - "learning_rate": 8.988888888888889e-06, - "loss": 0.1366, + "epoch": 0.21197252208047104, + "grad_norm": 0.6295838356018066, + "learning_rate": 2.0225000000000004e-05, + "loss": 0.0415, "step": 810 }, { - "epoch": 0.053647366699378474, - "grad_norm": 0.9560266733169556, - "learning_rate": 9.100000000000001e-06, - "loss": 0.1432, + "epoch": 0.2145894667975139, + "grad_norm": 0.656004786491394, + "learning_rate": 2.0475e-05, + "loss": 0.0472, "step": 820 }, { - "epoch": 0.05430160287863919, - "grad_norm": 0.9071044921875, - "learning_rate": 9.211111111111111e-06, - "loss": 0.1437, + "epoch": 0.21720641151455675, + "grad_norm": 0.5602745413780212, + "learning_rate": 2.0725e-05, + "loss": 0.0469, "step": 830 }, { - "epoch": 0.0549558390578999, - "grad_norm": 1.1102005243301392, - "learning_rate": 9.322222222222223e-06, - "loss": 0.1446, + "epoch": 0.2198233562315996, + "grad_norm": 0.7688363790512085, + "learning_rate": 2.0975e-05, + "loss": 0.0503, "step": 840 }, { - "epoch": 0.05561007523716061, - "grad_norm": 0.9674533605575562, - "learning_rate": 9.433333333333335e-06, - "loss": 0.1534, + "epoch": 0.22244030094864245, + "grad_norm": 0.5913244485855103, + "learning_rate": 2.1225e-05, + "loss": 0.0493, "step": 850 }, { - "epoch": 0.056264311416421325, - "grad_norm": 0.8930101990699768, - "learning_rate": 9.544444444444445e-06, - "loss": 0.154, + "epoch": 0.2250572456656853, + "grad_norm": 0.7088301777839661, + "learning_rate": 2.1475e-05, + "loss": 0.0498, "step": 860 }, { - "epoch": 0.05691854759568204, - "grad_norm": 1.0106406211853027, - "learning_rate": 9.655555555555557e-06, - "loss": 0.1438, + "epoch": 0.22767419038272815, + "grad_norm": 0.5226425528526306, + "learning_rate": 2.1725e-05, + "loss": 0.0456, "step": 870 }, { - "epoch": 0.05757278377494275, - "grad_norm": 1.0581188201904297, - "learning_rate": 9.766666666666667e-06, - "loss": 0.1465, + "epoch": 0.230291135099771, + "grad_norm": 0.5275906324386597, + "learning_rate": 2.1975000000000002e-05, + "loss": 0.0508, "step": 880 }, { - "epoch": 0.058227019954203464, - "grad_norm": 1.138856291770935, - "learning_rate": 9.87777777777778e-06, - "loss": 0.1439, + "epoch": 0.23290807981681386, + "grad_norm": 0.36874884366989136, + "learning_rate": 2.2225e-05, + "loss": 0.0506, "step": 890 }, { - "epoch": 0.058881256133464184, - "grad_norm": 1.0272846221923828, - "learning_rate": 9.98888888888889e-06, - "loss": 0.1627, + "epoch": 0.23552502453385674, + "grad_norm": 0.44395896792411804, + "learning_rate": 2.2475e-05, + "loss": 0.0524, "step": 900 }, { - "epoch": 0.0595354923127249, - "grad_norm": 1.071536898612976, - "learning_rate": 1.0100000000000002e-05, - "loss": 0.1544, + "epoch": 0.2381419692508996, + "grad_norm": 0.5883681178092957, + "learning_rate": 2.2725000000000003e-05, + "loss": 0.0465, "step": 910 }, { - "epoch": 0.06018972849198561, - "grad_norm": 0.9497654438018799, - "learning_rate": 1.0211111111111112e-05, - "loss": 0.1433, + "epoch": 0.24075891396794244, + "grad_norm": 0.48441359400749207, + "learning_rate": 2.2975000000000003e-05, + "loss": 0.0477, "step": 920 }, { - "epoch": 0.06084396467124632, - "grad_norm": 0.9319833517074585, - "learning_rate": 1.0322222222222224e-05, - "loss": 0.151, + "epoch": 0.2433758586849853, + "grad_norm": 0.6230258345603943, + "learning_rate": 2.3225000000000002e-05, + "loss": 0.0535, "step": 930 }, { - "epoch": 0.061498200850507036, - "grad_norm": 0.8812747001647949, - "learning_rate": 1.0433333333333334e-05, - "loss": 0.1307, + "epoch": 0.24599280340202814, + "grad_norm": 0.7106624245643616, + "learning_rate": 2.3475e-05, + "loss": 0.0471, "step": 940 }, { - "epoch": 0.06215243702976775, - "grad_norm": 0.9672985672950745, - "learning_rate": 1.0544444444444444e-05, - "loss": 0.1453, + "epoch": 0.248609748119071, + "grad_norm": 0.7645530700683594, + "learning_rate": 2.3725e-05, + "loss": 0.0485, "step": 950 }, { - "epoch": 0.06280667320902845, - "grad_norm": 0.9534893035888672, - "learning_rate": 1.0655555555555556e-05, - "loss": 0.1365, + "epoch": 0.2512266928361138, + "grad_norm": 0.6872208714485168, + "learning_rate": 2.3975e-05, + "loss": 0.0516, "step": 960 }, { - "epoch": 0.06346090938828917, - "grad_norm": 0.9244397282600403, - "learning_rate": 1.0766666666666666e-05, - "loss": 0.1472, + "epoch": 0.2538436375531567, + "grad_norm": 0.705649733543396, + "learning_rate": 2.4225e-05, + "loss": 0.0535, "step": 970 }, { - "epoch": 0.06411514556754988, - "grad_norm": 0.9510551691055298, - "learning_rate": 1.0877777777777778e-05, - "loss": 0.1315, + "epoch": 0.2564605822701995, + "grad_norm": 0.533315896987915, + "learning_rate": 2.4475000000000002e-05, + "loss": 0.0469, "step": 980 }, { - "epoch": 0.0647693817468106, - "grad_norm": 1.0477503538131714, - "learning_rate": 1.0988888888888889e-05, - "loss": 0.137, + "epoch": 0.2590775269872424, + "grad_norm": 0.5918316841125488, + "learning_rate": 2.4725e-05, + "loss": 0.0575, "step": 990 }, { - "epoch": 0.0654236179260713, - "grad_norm": 0.7996601462364197, - "learning_rate": 1.11e-05, - "loss": 0.1334, + "epoch": 0.2616944717042852, + "grad_norm": 0.5549744963645935, + "learning_rate": 2.4975e-05, + "loss": 0.0475, "step": 1000 }, { - "epoch": 0.06607785410533203, - "grad_norm": 0.9080948829650879, - "learning_rate": 1.121111111111111e-05, - "loss": 0.1443, + "epoch": 0.2616944717042852, + "eval_loss": 0.05487126120450602, + "eval_runtime": 8.7333, + "eval_samples_per_second": 117.252, + "eval_steps_per_second": 1.832, + "step": 1000 + }, + { + "epoch": 0.2643114164213281, + "grad_norm": 0.82178795337677, + "learning_rate": 2.5225e-05, + "loss": 0.0504, "step": 1010 }, { - "epoch": 0.06673209028459273, - "grad_norm": 1.1308685541152954, - "learning_rate": 1.1322222222222223e-05, - "loss": 0.1424, + "epoch": 0.26692836113837093, + "grad_norm": 0.7854476571083069, + "learning_rate": 2.5475e-05, + "loss": 0.0549, "step": 1020 }, { - "epoch": 0.06738632646385345, - "grad_norm": 0.9781653881072998, - "learning_rate": 1.1433333333333333e-05, - "loss": 0.1332, + "epoch": 0.2695453058554138, + "grad_norm": 0.43605050444602966, + "learning_rate": 2.5725e-05, + "loss": 0.0482, "step": 1030 }, { - "epoch": 0.06804056264311416, - "grad_norm": 0.9940928220748901, - "learning_rate": 1.1544444444444445e-05, - "loss": 0.1449, + "epoch": 0.27216225057245663, + "grad_norm": 0.4977242350578308, + "learning_rate": 2.5974999999999998e-05, + "loss": 0.0432, "step": 1040 }, { - "epoch": 0.06869479882237488, - "grad_norm": 1.109440803527832, - "learning_rate": 1.1655555555555555e-05, - "loss": 0.1373, + "epoch": 0.2747791952894995, + "grad_norm": 0.5008284449577332, + "learning_rate": 2.6225e-05, + "loss": 0.0445, "step": 1050 }, { - "epoch": 0.06934903500163558, - "grad_norm": 0.7790347337722778, - "learning_rate": 1.1766666666666667e-05, - "loss": 0.1441, + "epoch": 0.27739614000654234, + "grad_norm": 0.5448734164237976, + "learning_rate": 2.6475e-05, + "loss": 0.0456, "step": 1060 }, { - "epoch": 0.0700032711808963, - "grad_norm": 0.9548224210739136, - "learning_rate": 1.1877777777777777e-05, - "loss": 0.1512, + "epoch": 0.2800130847235852, + "grad_norm": 0.6299842596054077, + "learning_rate": 2.6725e-05, + "loss": 0.0525, "step": 1070 }, { - "epoch": 0.07065750736015702, - "grad_norm": 0.9746776223182678, - "learning_rate": 1.1988888888888889e-05, - "loss": 0.1412, + "epoch": 0.2826300294406281, + "grad_norm": 0.5707956552505493, + "learning_rate": 2.6975000000000002e-05, + "loss": 0.0545, "step": 1080 }, { - "epoch": 0.07131174353941773, - "grad_norm": 1.2580838203430176, - "learning_rate": 1.2100000000000001e-05, - "loss": 0.153, + "epoch": 0.2852469741576709, + "grad_norm": 0.9552313685417175, + "learning_rate": 2.7225e-05, + "loss": 0.0496, "step": 1090 }, { - "epoch": 0.07196597971867845, - "grad_norm": 1.0998780727386475, - "learning_rate": 1.2211111111111111e-05, - "loss": 0.1396, + "epoch": 0.2878639188747138, + "grad_norm": 0.6557886600494385, + "learning_rate": 2.7475e-05, + "loss": 0.0447, "step": 1100 }, { - "epoch": 0.07262021589793916, - "grad_norm": 0.9261326789855957, - "learning_rate": 1.2322222222222223e-05, - "loss": 0.1431, + "epoch": 0.2904808635917566, + "grad_norm": 0.6832493543624878, + "learning_rate": 2.7725e-05, + "loss": 0.0461, "step": 1110 }, { - "epoch": 0.07327445207719988, - "grad_norm": 0.9783452749252319, - "learning_rate": 1.2433333333333335e-05, - "loss": 0.147, + "epoch": 0.2930978083087995, + "grad_norm": 0.38393524289131165, + "learning_rate": 2.7975000000000002e-05, + "loss": 0.0486, "step": 1120 }, { - "epoch": 0.07392868825646058, - "grad_norm": 0.9785774946212769, - "learning_rate": 1.2544444444444445e-05, - "loss": 0.1418, + "epoch": 0.2957147530258423, + "grad_norm": 0.894242525100708, + "learning_rate": 2.8225e-05, + "loss": 0.0555, "step": 1130 }, { - "epoch": 0.0745829244357213, - "grad_norm": 1.0473101139068604, - "learning_rate": 1.2655555555555557e-05, - "loss": 0.1397, + "epoch": 0.2983316977428852, + "grad_norm": 0.8547376990318298, + "learning_rate": 2.8475e-05, + "loss": 0.0515, "step": 1140 }, { - "epoch": 0.07523716061498201, - "grad_norm": 0.972935140132904, - "learning_rate": 1.276666666666667e-05, - "loss": 0.1394, + "epoch": 0.30094864245992803, + "grad_norm": 0.48917195200920105, + "learning_rate": 2.8725e-05, + "loss": 0.0479, "step": 1150 }, { - "epoch": 0.07589139679424273, - "grad_norm": 0.9036497473716736, - "learning_rate": 1.2877777777777778e-05, - "loss": 0.1465, + "epoch": 0.3035655871769709, + "grad_norm": 0.6105035543441772, + "learning_rate": 2.8975000000000003e-05, + "loss": 0.0527, "step": 1160 }, { - "epoch": 0.07654563297350343, - "grad_norm": 1.1967897415161133, - "learning_rate": 1.298888888888889e-05, - "loss": 0.1332, + "epoch": 0.30618253189401373, + "grad_norm": 0.6663628816604614, + "learning_rate": 2.9225000000000002e-05, + "loss": 0.0526, "step": 1170 }, { - "epoch": 0.07719986915276415, - "grad_norm": 0.875632643699646, - "learning_rate": 1.3100000000000002e-05, - "loss": 0.14, + "epoch": 0.3087994766110566, + "grad_norm": 0.6550115942955017, + "learning_rate": 2.9475e-05, + "loss": 0.0506, "step": 1180 }, { - "epoch": 0.07785410533202486, - "grad_norm": 0.9649753570556641, - "learning_rate": 1.3211111111111114e-05, - "loss": 0.1324, + "epoch": 0.31141642132809944, + "grad_norm": 0.5986365675926208, + "learning_rate": 2.9725000000000004e-05, + "loss": 0.0459, "step": 1190 }, { - "epoch": 0.07850834151128558, - "grad_norm": 0.8261013627052307, - "learning_rate": 1.3322222222222222e-05, - "loss": 0.1333, + "epoch": 0.3140333660451423, + "grad_norm": 0.6375890970230103, + "learning_rate": 2.9975000000000004e-05, + "loss": 0.0543, "step": 1200 }, { - "epoch": 0.07916257769054628, - "grad_norm": 1.0080054998397827, - "learning_rate": 1.3433333333333334e-05, - "loss": 0.1367, + "epoch": 0.31665031076218514, + "grad_norm": 0.7755595445632935, + "learning_rate": 3.0225000000000003e-05, + "loss": 0.0466, "step": 1210 }, { - "epoch": 0.079816813869807, - "grad_norm": 1.0562273263931274, - "learning_rate": 1.3544444444444446e-05, - "loss": 0.1329, + "epoch": 0.319267255479228, + "grad_norm": 0.8417050838470459, + "learning_rate": 3.0475000000000002e-05, + "loss": 0.0516, "step": 1220 }, { - "epoch": 0.08047105004906771, - "grad_norm": 0.9354407787322998, - "learning_rate": 1.3655555555555558e-05, - "loss": 0.134, + "epoch": 0.32188420019627084, + "grad_norm": 0.6634261608123779, + "learning_rate": 3.0725e-05, + "loss": 0.0544, "step": 1230 }, { - "epoch": 0.08112528622832843, - "grad_norm": 0.9830518364906311, - "learning_rate": 1.3766666666666666e-05, - "loss": 0.135, + "epoch": 0.3245011449133137, + "grad_norm": 0.8552030920982361, + "learning_rate": 3.0975e-05, + "loss": 0.0538, "step": 1240 }, { - "epoch": 0.08177952240758914, - "grad_norm": 0.951457679271698, - "learning_rate": 1.3877777777777778e-05, - "loss": 0.1339, + "epoch": 0.32711808963035655, + "grad_norm": 0.6499819755554199, + "learning_rate": 3.122500000000001e-05, + "loss": 0.0507, "step": 1250 }, { - "epoch": 0.08243375858684986, - "grad_norm": 0.887077271938324, - "learning_rate": 1.398888888888889e-05, - "loss": 0.1436, + "epoch": 0.3297350343473994, + "grad_norm": 0.8389486074447632, + "learning_rate": 3.1475e-05, + "loss": 0.0513, "step": 1260 }, { - "epoch": 0.08308799476611056, - "grad_norm": 0.9000831842422485, - "learning_rate": 1.4099999999999999e-05, - "loss": 0.1367, + "epoch": 0.33235197906444225, + "grad_norm": 0.7545788288116455, + "learning_rate": 3.1725e-05, + "loss": 0.0508, "step": 1270 }, { - "epoch": 0.08374223094537128, - "grad_norm": 1.0145295858383179, - "learning_rate": 1.421111111111111e-05, - "loss": 0.1491, + "epoch": 0.33496892378148513, + "grad_norm": 0.739088773727417, + "learning_rate": 3.1975e-05, + "loss": 0.0592, "step": 1280 }, { - "epoch": 0.08439646712463199, - "grad_norm": 0.8529645800590515, - "learning_rate": 1.4322222222222223e-05, - "loss": 0.1278, + "epoch": 0.33758586849852795, + "grad_norm": 0.629649817943573, + "learning_rate": 3.2225e-05, + "loss": 0.0537, "step": 1290 }, { - "epoch": 0.08505070330389271, - "grad_norm": 1.141903042793274, - "learning_rate": 1.4433333333333335e-05, - "loss": 0.1319, + "epoch": 0.34020281321557083, + "grad_norm": 0.6471114158630371, + "learning_rate": 3.2474999999999997e-05, + "loss": 0.0569, "step": 1300 }, { - "epoch": 0.08570493948315341, - "grad_norm": 1.0339152812957764, - "learning_rate": 1.4544444444444443e-05, - "loss": 0.1362, + "epoch": 0.34281975793261366, + "grad_norm": 0.5596538782119751, + "learning_rate": 3.2725e-05, + "loss": 0.052, "step": 1310 }, { - "epoch": 0.08635917566241413, - "grad_norm": 0.9347769021987915, - "learning_rate": 1.4655555555555555e-05, - "loss": 0.1382, + "epoch": 0.34543670264965654, + "grad_norm": 0.46181052923202515, + "learning_rate": 3.2975e-05, + "loss": 0.0451, "step": 1320 }, { - "epoch": 0.08701341184167484, - "grad_norm": 1.0565035343170166, - "learning_rate": 1.4766666666666667e-05, - "loss": 0.1321, + "epoch": 0.34805364736669936, + "grad_norm": 0.6883523464202881, + "learning_rate": 3.3225e-05, + "loss": 0.0634, "step": 1330 }, { - "epoch": 0.08766764802093556, - "grad_norm": 0.9495023488998413, - "learning_rate": 1.4877777777777779e-05, - "loss": 0.1486, + "epoch": 0.35067059208374224, + "grad_norm": 0.7209715247154236, + "learning_rate": 3.3475e-05, + "loss": 0.0586, "step": 1340 }, { - "epoch": 0.08832188420019627, - "grad_norm": 0.9335034489631653, - "learning_rate": 1.498888888888889e-05, - "loss": 0.1413, + "epoch": 0.35328753680078506, + "grad_norm": 0.7685849666595459, + "learning_rate": 3.3725e-05, + "loss": 0.0572, "step": 1350 }, { - "epoch": 0.08897612037945699, - "grad_norm": 0.8525314331054688, - "learning_rate": 1.51e-05, - "loss": 0.1265, + "epoch": 0.35590448151782794, + "grad_norm": 0.538578450679779, + "learning_rate": 3.3975e-05, + "loss": 0.0545, "step": 1360 }, { - "epoch": 0.08963035655871769, - "grad_norm": 1.1363369226455688, - "learning_rate": 1.5211111111111111e-05, - "loss": 0.1397, + "epoch": 0.35852142623487077, + "grad_norm": 0.6326640844345093, + "learning_rate": 3.4225e-05, + "loss": 0.0589, "step": 1370 }, { - "epoch": 0.09028459273797841, - "grad_norm": 0.892582893371582, - "learning_rate": 1.5322222222222225e-05, - "loss": 0.1314, + "epoch": 0.36113837095191365, + "grad_norm": 0.8394850492477417, + "learning_rate": 3.4475000000000005e-05, + "loss": 0.049, "step": 1380 }, { - "epoch": 0.09093882891723912, - "grad_norm": 1.058522343635559, - "learning_rate": 1.5433333333333334e-05, - "loss": 0.1356, + "epoch": 0.36375531566895647, + "grad_norm": 0.7793498635292053, + "learning_rate": 3.4725000000000004e-05, + "loss": 0.0543, "step": 1390 }, { - "epoch": 0.09159306509649984, - "grad_norm": 0.9127554297447205, - "learning_rate": 1.5544444444444445e-05, - "loss": 0.1368, + "epoch": 0.36637226038599935, + "grad_norm": 0.8102224469184875, + "learning_rate": 3.4975e-05, + "loss": 0.0524, "step": 1400 }, { - "epoch": 0.09224730127576054, - "grad_norm": 1.5529781579971313, - "learning_rate": 1.5655555555555557e-05, - "loss": 0.1406, + "epoch": 0.3689892051030422, + "grad_norm": 0.6351240873336792, + "learning_rate": 3.5225e-05, + "loss": 0.0514, "step": 1410 }, { - "epoch": 0.09290153745502126, - "grad_norm": 0.9582400321960449, - "learning_rate": 1.576666666666667e-05, - "loss": 0.1315, + "epoch": 0.37160614982008505, + "grad_norm": 0.7605561017990112, + "learning_rate": 3.5475e-05, + "loss": 0.0566, "step": 1420 }, { - "epoch": 0.09355577363428197, - "grad_norm": 0.9931496977806091, - "learning_rate": 1.5877777777777778e-05, - "loss": 0.1327, + "epoch": 0.3742230945371279, + "grad_norm": 0.5509055852890015, + "learning_rate": 3.5725e-05, + "loss": 0.0546, "step": 1430 }, { - "epoch": 0.09421000981354269, - "grad_norm": 0.9021315574645996, - "learning_rate": 1.598888888888889e-05, - "loss": 0.1335, + "epoch": 0.37684003925417076, + "grad_norm": 0.5754609107971191, + "learning_rate": 3.5975e-05, + "loss": 0.0564, "step": 1440 }, { - "epoch": 0.09486424599280341, - "grad_norm": 0.9401276707649231, - "learning_rate": 1.6100000000000002e-05, - "loss": 0.1272, + "epoch": 0.37945698397121363, + "grad_norm": 0.6741311550140381, + "learning_rate": 3.6225000000000006e-05, + "loss": 0.058, "step": 1450 }, { - "epoch": 0.09551848217206411, - "grad_norm": 0.9633158445358276, - "learning_rate": 1.6211111111111114e-05, - "loss": 0.1359, + "epoch": 0.38207392868825646, + "grad_norm": 0.7794651985168457, + "learning_rate": 3.6475000000000006e-05, + "loss": 0.069, "step": 1460 }, { - "epoch": 0.09617271835132483, - "grad_norm": 1.0396912097930908, - "learning_rate": 1.6322222222222222e-05, - "loss": 0.1298, + "epoch": 0.38469087340529934, + "grad_norm": 0.6684126853942871, + "learning_rate": 3.6725000000000005e-05, + "loss": 0.0588, "step": 1470 }, { - "epoch": 0.09682695453058554, - "grad_norm": 0.9414869546890259, - "learning_rate": 1.6433333333333334e-05, - "loss": 0.1288, + "epoch": 0.38730781812234216, + "grad_norm": 0.6147798299789429, + "learning_rate": 3.6975000000000004e-05, + "loss": 0.0614, "step": 1480 }, { - "epoch": 0.09748119070984626, - "grad_norm": 0.8435006737709045, - "learning_rate": 1.6544444444444446e-05, - "loss": 0.1303, + "epoch": 0.38992476283938504, + "grad_norm": 0.5154690146446228, + "learning_rate": 3.7225000000000004e-05, + "loss": 0.0595, "step": 1490 }, { - "epoch": 0.09813542688910697, - "grad_norm": 0.9831594824790955, - "learning_rate": 1.6655555555555558e-05, - "loss": 0.1439, + "epoch": 0.39254170755642787, + "grad_norm": 0.6523647308349609, + "learning_rate": 3.7475e-05, + "loss": 0.0597, "step": 1500 }, { - "epoch": 0.09878966306836769, - "grad_norm": 1.0466912984848022, - "learning_rate": 1.6766666666666667e-05, - "loss": 0.1277, + "epoch": 0.39515865227347075, + "grad_norm": 0.6172918081283569, + "learning_rate": 3.7725e-05, + "loss": 0.0626, "step": 1510 }, { - "epoch": 0.09944389924762839, - "grad_norm": 0.9995728731155396, - "learning_rate": 1.687777777777778e-05, - "loss": 0.1297, + "epoch": 0.39777559699051357, + "grad_norm": 0.41199785470962524, + "learning_rate": 3.7975e-05, + "loss": 0.0551, "step": 1520 }, { - "epoch": 0.10009813542688911, - "grad_norm": 1.018913745880127, - "learning_rate": 1.698888888888889e-05, - "loss": 0.1341, + "epoch": 0.40039254170755645, + "grad_norm": 0.4674221873283386, + "learning_rate": 3.8225e-05, + "loss": 0.0631, "step": 1530 }, { - "epoch": 0.10075237160614982, - "grad_norm": 1.1046700477600098, - "learning_rate": 1.7100000000000002e-05, - "loss": 0.1332, + "epoch": 0.4030094864245993, + "grad_norm": 0.7040983438491821, + "learning_rate": 3.8475e-05, + "loss": 0.0647, "step": 1540 }, { - "epoch": 0.10140660778541054, - "grad_norm": 1.0834091901779175, - "learning_rate": 1.721111111111111e-05, - "loss": 0.1349, + "epoch": 0.40562643114164215, + "grad_norm": 0.912530243396759, + "learning_rate": 3.8725e-05, + "loss": 0.0643, "step": 1550 }, { - "epoch": 0.10206084396467124, - "grad_norm": 1.079143762588501, - "learning_rate": 1.7322222222222223e-05, - "loss": 0.1398, + "epoch": 0.408243375858685, + "grad_norm": 0.6816163063049316, + "learning_rate": 3.8975e-05, + "loss": 0.0657, "step": 1560 }, { - "epoch": 0.10271508014393196, - "grad_norm": 1.0492314100265503, - "learning_rate": 1.7433333333333335e-05, - "loss": 0.1411, + "epoch": 0.41086032057572786, + "grad_norm": 0.9752461910247803, + "learning_rate": 3.9225e-05, + "loss": 0.0705, "step": 1570 }, { - "epoch": 0.10336931632319267, - "grad_norm": 1.171302318572998, - "learning_rate": 1.7544444444444443e-05, - "loss": 0.1312, + "epoch": 0.4134772652927707, + "grad_norm": 0.910144031047821, + "learning_rate": 3.9475000000000004e-05, + "loss": 0.0653, "step": 1580 }, { - "epoch": 0.10402355250245339, - "grad_norm": 1.0378974676132202, - "learning_rate": 1.7655555555555555e-05, - "loss": 0.1254, + "epoch": 0.41609421000981356, + "grad_norm": 0.611270010471344, + "learning_rate": 3.9725e-05, + "loss": 0.0612, "step": 1590 }, { - "epoch": 0.1046777886817141, - "grad_norm": 0.9647935032844543, - "learning_rate": 1.7766666666666667e-05, - "loss": 0.1444, + "epoch": 0.4187111547268564, + "grad_norm": 0.6135270595550537, + "learning_rate": 3.9975e-05, + "loss": 0.0697, "step": 1600 }, { - "epoch": 0.10533202486097482, - "grad_norm": 0.9934017658233643, - "learning_rate": 1.787777777777778e-05, - "loss": 0.1344, + "epoch": 0.42132809944389926, + "grad_norm": 0.6670326590538025, + "learning_rate": 4.0225e-05, + "loss": 0.0632, "step": 1610 }, { - "epoch": 0.10598626104023552, - "grad_norm": 1.0105055570602417, - "learning_rate": 1.7988888888888888e-05, - "loss": 0.1397, + "epoch": 0.4239450441609421, + "grad_norm": 0.6105315089225769, + "learning_rate": 4.0475e-05, + "loss": 0.0604, "step": 1620 }, { - "epoch": 0.10664049721949624, - "grad_norm": 1.05049729347229, - "learning_rate": 1.81e-05, - "loss": 0.1348, + "epoch": 0.42656198887798497, + "grad_norm": 0.8678605556488037, + "learning_rate": 4.0725e-05, + "loss": 0.0642, "step": 1630 }, { - "epoch": 0.10729473339875695, - "grad_norm": 1.0454671382904053, - "learning_rate": 1.821111111111111e-05, - "loss": 0.1239, + "epoch": 0.4291789335950278, + "grad_norm": 0.7068043351173401, + "learning_rate": 4.0975e-05, + "loss": 0.0598, "step": 1640 }, { - "epoch": 0.10794896957801767, - "grad_norm": 1.0142239332199097, - "learning_rate": 1.8322222222222223e-05, - "loss": 0.1148, + "epoch": 0.43179587831207067, + "grad_norm": 0.9618980288505554, + "learning_rate": 4.1225e-05, + "loss": 0.0617, "step": 1650 }, { - "epoch": 0.10860320575727837, - "grad_norm": 1.1496508121490479, - "learning_rate": 1.8433333333333332e-05, - "loss": 0.1308, + "epoch": 0.4344128230291135, + "grad_norm": 0.6300309300422668, + "learning_rate": 4.1475000000000005e-05, + "loss": 0.0567, "step": 1660 }, { - "epoch": 0.1092574419365391, - "grad_norm": 0.9551987051963806, - "learning_rate": 1.8544444444444444e-05, - "loss": 0.1339, + "epoch": 0.4370297677461564, + "grad_norm": 0.7122251391410828, + "learning_rate": 4.1725000000000005e-05, + "loss": 0.0617, "step": 1670 }, { - "epoch": 0.1099116781157998, - "grad_norm": 0.8923566937446594, - "learning_rate": 1.8655555555555556e-05, - "loss": 0.1286, + "epoch": 0.4396467124631992, + "grad_norm": 0.5705268383026123, + "learning_rate": 4.1975000000000004e-05, + "loss": 0.0634, "step": 1680 }, { - "epoch": 0.11056591429506052, - "grad_norm": 1.0510790348052979, - "learning_rate": 1.8766666666666668e-05, - "loss": 0.1362, + "epoch": 0.4422636571802421, + "grad_norm": 0.9508737921714783, + "learning_rate": 4.2225e-05, + "loss": 0.07, "step": 1690 }, { - "epoch": 0.11122015047432122, - "grad_norm": 0.9121619462966919, - "learning_rate": 1.8877777777777776e-05, - "loss": 0.1273, + "epoch": 0.4448806018972849, + "grad_norm": 0.9834522604942322, + "learning_rate": 4.2475e-05, + "loss": 0.0727, "step": 1700 }, { - "epoch": 0.11187438665358194, - "grad_norm": 1.1024173498153687, - "learning_rate": 1.8988888888888888e-05, - "loss": 0.1253, + "epoch": 0.4474975466143278, + "grad_norm": 1.0555498600006104, + "learning_rate": 4.2725e-05, + "loss": 0.066, "step": 1710 }, { - "epoch": 0.11252862283284265, - "grad_norm": 1.032799482345581, - "learning_rate": 1.91e-05, - "loss": 0.1313, + "epoch": 0.4501144913313706, + "grad_norm": 1.0131648778915405, + "learning_rate": 4.2975e-05, + "loss": 0.0628, "step": 1720 }, { - "epoch": 0.11318285901210337, - "grad_norm": 1.0584205389022827, - "learning_rate": 1.9211111111111112e-05, - "loss": 0.1342, + "epoch": 0.4527314360484135, + "grad_norm": 0.7705113291740417, + "learning_rate": 4.322500000000001e-05, + "loss": 0.0645, "step": 1730 }, { - "epoch": 0.11383709519136408, - "grad_norm": 0.8857603669166565, - "learning_rate": 1.932222222222222e-05, - "loss": 0.1308, + "epoch": 0.4553483807654563, + "grad_norm": 0.6077147722244263, + "learning_rate": 4.3475000000000006e-05, + "loss": 0.0626, "step": 1740 }, { - "epoch": 0.1144913313706248, - "grad_norm": 0.9453005194664001, - "learning_rate": 1.9433333333333332e-05, - "loss": 0.1181, + "epoch": 0.4579653254824992, + "grad_norm": 1.067004919052124, + "learning_rate": 4.3725000000000006e-05, + "loss": 0.0692, "step": 1750 }, { - "epoch": 0.1151455675498855, - "grad_norm": 1.0007144212722778, - "learning_rate": 1.9544444444444444e-05, - "loss": 0.1233, + "epoch": 0.460582270199542, + "grad_norm": 1.199952483177185, + "learning_rate": 4.3975e-05, + "loss": 0.065, "step": 1760 }, { - "epoch": 0.11579980372914622, - "grad_norm": 1.0010242462158203, - "learning_rate": 1.9655555555555556e-05, - "loss": 0.1218, + "epoch": 0.4631992149165849, + "grad_norm": 1.3196426630020142, + "learning_rate": 4.4225e-05, + "loss": 0.0672, "step": 1770 }, { - "epoch": 0.11645403990840693, - "grad_norm": 0.9568318128585815, - "learning_rate": 1.9766666666666668e-05, - "loss": 0.127, + "epoch": 0.4658161596336277, + "grad_norm": 1.0404473543167114, + "learning_rate": 4.4475e-05, + "loss": 0.0719, "step": 1780 }, { - "epoch": 0.11710827608766765, - "grad_norm": 0.9792290925979614, - "learning_rate": 1.9877777777777777e-05, - "loss": 0.135, + "epoch": 0.4684331043506706, + "grad_norm": 0.7192089557647705, + "learning_rate": 4.4725e-05, + "loss": 0.0709, "step": 1790 }, { - "epoch": 0.11776251226692837, - "grad_norm": 1.0170482397079468, - "learning_rate": 1.998888888888889e-05, - "loss": 0.1401, + "epoch": 0.47105004906771347, + "grad_norm": 0.7947105765342712, + "learning_rate": 4.4975e-05, + "loss": 0.0662, "step": 1800 }, { - "epoch": 0.11841674844618907, - "grad_norm": 0.949272632598877, - "learning_rate": 2.01e-05, - "loss": 0.1319, + "epoch": 0.4736669937847563, + "grad_norm": 0.8048547506332397, + "learning_rate": 4.5225e-05, + "loss": 0.0637, "step": 1810 }, { - "epoch": 0.1190709846254498, - "grad_norm": 0.8818572163581848, - "learning_rate": 2.0211111111111113e-05, - "loss": 0.1277, + "epoch": 0.4762839385017992, + "grad_norm": 0.5961185097694397, + "learning_rate": 4.5475e-05, + "loss": 0.069, "step": 1820 }, { - "epoch": 0.1197252208047105, - "grad_norm": 0.9168890118598938, - "learning_rate": 2.0322222222222225e-05, - "loss": 0.1168, + "epoch": 0.478900883218842, + "grad_norm": 0.5182297825813293, + "learning_rate": 4.5725e-05, + "loss": 0.0621, "step": 1830 }, { - "epoch": 0.12037945698397122, - "grad_norm": 1.180184006690979, - "learning_rate": 2.0433333333333336e-05, - "loss": 0.1281, + "epoch": 0.4815178279358849, + "grad_norm": 0.7439857125282288, + "learning_rate": 4.5975e-05, + "loss": 0.0676, "step": 1840 }, { - "epoch": 0.12103369316323193, - "grad_norm": 1.0016313791275024, - "learning_rate": 2.054444444444445e-05, - "loss": 0.1351, + "epoch": 0.4841347726529277, + "grad_norm": 0.5764941573143005, + "learning_rate": 4.6225e-05, + "loss": 0.0685, "step": 1850 }, { - "epoch": 0.12168792934249265, - "grad_norm": 0.9246931672096252, - "learning_rate": 2.0655555555555557e-05, - "loss": 0.1311, + "epoch": 0.4867517173699706, + "grad_norm": 0.8225212693214417, + "learning_rate": 4.6475000000000005e-05, + "loss": 0.0691, "step": 1860 }, { - "epoch": 0.12234216552175335, - "grad_norm": 1.3594558238983154, - "learning_rate": 2.076666666666667e-05, - "loss": 0.1414, + "epoch": 0.4893686620870134, + "grad_norm": 0.6442469954490662, + "learning_rate": 4.6725000000000004e-05, + "loss": 0.0639, "step": 1870 }, { - "epoch": 0.12299640170101407, - "grad_norm": 0.9774754643440247, - "learning_rate": 2.087777777777778e-05, - "loss": 0.1155, + "epoch": 0.4919856068040563, + "grad_norm": 0.5695465803146362, + "learning_rate": 4.6975000000000003e-05, + "loss": 0.0659, "step": 1880 }, { - "epoch": 0.12365063788027478, - "grad_norm": 1.1299134492874146, - "learning_rate": 2.0988888888888893e-05, - "loss": 0.1167, + "epoch": 0.4946025515210991, + "grad_norm": 0.7405710220336914, + "learning_rate": 4.7225e-05, + "loss": 0.075, "step": 1890 }, { - "epoch": 0.1243048740595355, - "grad_norm": 0.9047524333000183, - "learning_rate": 2.11e-05, - "loss": 0.1306, + "epoch": 0.497219496238142, + "grad_norm": 0.7905144691467285, + "learning_rate": 4.7475e-05, + "loss": 0.0655, "step": 1900 }, { - "epoch": 0.1249591102387962, - "grad_norm": 1.080834150314331, - "learning_rate": 2.1211111111111113e-05, - "loss": 0.1234, + "epoch": 0.4998364409551848, + "grad_norm": 0.3834249973297119, + "learning_rate": 4.7725e-05, + "loss": 0.0664, "step": 1910 }, { - "epoch": 0.1256133464180569, - "grad_norm": 0.8716378211975098, - "learning_rate": 2.1322222222222225e-05, - "loss": 0.1265, + "epoch": 0.5024533856722276, + "grad_norm": 0.7903843522071838, + "learning_rate": 4.7975e-05, + "loss": 0.0699, "step": 1920 }, { - "epoch": 0.12626758259731763, - "grad_norm": 0.9985718727111816, - "learning_rate": 2.1433333333333334e-05, - "loss": 0.1284, + "epoch": 0.5050703303892705, + "grad_norm": 1.1027508974075317, + "learning_rate": 4.822500000000001e-05, + "loss": 0.0717, "step": 1930 }, { - "epoch": 0.12692181877657835, - "grad_norm": 1.0242950916290283, - "learning_rate": 2.1544444444444446e-05, - "loss": 0.1267, + "epoch": 0.5076872751063134, + "grad_norm": 0.6772246360778809, + "learning_rate": 4.8475000000000006e-05, + "loss": 0.0742, "step": 1940 }, { - "epoch": 0.12757605495583907, - "grad_norm": 1.1003845930099487, - "learning_rate": 2.1655555555555558e-05, - "loss": 0.1273, + "epoch": 0.5103042198233563, + "grad_norm": 0.8637296557426453, + "learning_rate": 4.8725000000000005e-05, + "loss": 0.0719, "step": 1950 }, { - "epoch": 0.12823029113509976, - "grad_norm": 0.8267995119094849, - "learning_rate": 2.176666666666667e-05, - "loss": 0.1261, + "epoch": 0.512921164540399, + "grad_norm": 0.7059396505355835, + "learning_rate": 4.8975000000000005e-05, + "loss": 0.0722, "step": 1960 }, { - "epoch": 0.12888452731436048, - "grad_norm": 1.2606829404830933, - "learning_rate": 2.1877777777777778e-05, - "loss": 0.1166, + "epoch": 0.5155381092574419, + "grad_norm": 0.6927570104598999, + "learning_rate": 4.9225000000000004e-05, + "loss": 0.0765, "step": 1970 }, { - "epoch": 0.1295387634936212, - "grad_norm": 0.8288832306861877, - "learning_rate": 2.198888888888889e-05, - "loss": 0.1128, + "epoch": 0.5181550539744848, + "grad_norm": 0.8184845447540283, + "learning_rate": 4.9475e-05, + "loss": 0.0751, "step": 1980 }, { - "epoch": 0.13019299967288192, - "grad_norm": 1.1182180643081665, - "learning_rate": 2.2100000000000002e-05, - "loss": 0.1281, + "epoch": 0.5207719986915277, + "grad_norm": 0.6966920495033264, + "learning_rate": 4.9725e-05, + "loss": 0.0743, "step": 1990 }, { - "epoch": 0.1308472358521426, - "grad_norm": 0.8783948421478271, - "learning_rate": 2.2211111111111114e-05, - "loss": 0.1277, + "epoch": 0.5233889434085705, + "grad_norm": 0.6769987940788269, + "learning_rate": 4.9975e-05, + "loss": 0.0731, + "step": 2000 + }, + { + "epoch": 0.5233889434085705, + "eval_loss": 0.07937794011851888, + "eval_runtime": 8.4909, + "eval_samples_per_second": 120.6, + "eval_steps_per_second": 1.884, "step": 2000 }, { - "epoch": 0.13150147203140333, - "grad_norm": 1.0937625169754028, - "learning_rate": 2.2322222222222222e-05, - "loss": 0.1294, + "epoch": 0.5260058881256133, + "grad_norm": 0.6470193862915039, + "learning_rate": 4.99999930796579e-05, + "loss": 0.0711, "step": 2010 }, { - "epoch": 0.13215570821066405, - "grad_norm": 0.9571072459220886, - "learning_rate": 2.2433333333333334e-05, - "loss": 0.1109, + "epoch": 0.5286228328426562, + "grad_norm": 0.8470696210861206, + "learning_rate": 4.999996915749259e-05, + "loss": 0.077, "step": 2020 }, { - "epoch": 0.13280994438992477, - "grad_norm": 1.1977931261062622, - "learning_rate": 2.2544444444444446e-05, - "loss": 0.1331, + "epoch": 0.5312397775596991, + "grad_norm": 1.0474553108215332, + "learning_rate": 4.99999281480841e-05, + "loss": 0.0727, "step": 2030 }, { - "epoch": 0.13346418056918546, - "grad_norm": 0.9299104809761047, - "learning_rate": 2.2655555555555558e-05, - "loss": 0.1351, + "epoch": 0.5338567222767419, + "grad_norm": 0.7926612496376038, + "learning_rate": 4.999987005146045e-05, + "loss": 0.071, "step": 2040 }, { - "epoch": 0.13411841674844618, - "grad_norm": 0.8753429651260376, - "learning_rate": 2.2766666666666667e-05, - "loss": 0.1365, + "epoch": 0.5364736669937847, + "grad_norm": 0.593268096446991, + "learning_rate": 4.9999794867661356e-05, + "loss": 0.0732, "step": 2050 }, { - "epoch": 0.1347726529277069, - "grad_norm": 1.0041842460632324, - "learning_rate": 2.287777777777778e-05, - "loss": 0.1303, + "epoch": 0.5390906117108276, + "grad_norm": 0.7809154391288757, + "learning_rate": 4.999970259673821e-05, + "loss": 0.076, "step": 2060 }, { - "epoch": 0.13542688910696762, - "grad_norm": 1.1017142534255981, - "learning_rate": 2.298888888888889e-05, - "loss": 0.124, + "epoch": 0.5417075564278705, + "grad_norm": 0.7197739481925964, + "learning_rate": 4.999959323875406e-05, + "loss": 0.0766, "step": 2070 }, { - "epoch": 0.13608112528622832, - "grad_norm": 1.038955569267273, - "learning_rate": 2.3100000000000002e-05, - "loss": 0.1275, + "epoch": 0.5443245011449133, + "grad_norm": 0.5898247361183167, + "learning_rate": 4.999946679378368e-05, + "loss": 0.0639, "step": 2080 }, { - "epoch": 0.13673536146548904, - "grad_norm": 0.8835306763648987, - "learning_rate": 2.321111111111111e-05, - "loss": 0.1292, + "epoch": 0.5469414458619561, + "grad_norm": 1.1134157180786133, + "learning_rate": 4.999932326191346e-05, + "loss": 0.0764, "step": 2090 }, { - "epoch": 0.13738959764474976, - "grad_norm": 1.067949652671814, - "learning_rate": 2.3322222222222223e-05, - "loss": 0.1308, + "epoch": 0.549558390578999, + "grad_norm": 0.8103418946266174, + "learning_rate": 4.999916264324153e-05, + "loss": 0.0727, "step": 2100 }, { - "epoch": 0.13804383382401048, - "grad_norm": 0.9632567167282104, - "learning_rate": 2.3433333333333335e-05, - "loss": 0.1389, + "epoch": 0.5521753352960419, + "grad_norm": 0.7031409740447998, + "learning_rate": 4.999898493787766e-05, + "loss": 0.0673, "step": 2110 }, { - "epoch": 0.13869807000327117, - "grad_norm": 0.9398028254508972, - "learning_rate": 2.3544444444444447e-05, - "loss": 0.1242, + "epoch": 0.5547922800130847, + "grad_norm": 0.8002007603645325, + "learning_rate": 4.999879014594331e-05, + "loss": 0.0722, "step": 2120 }, { - "epoch": 0.1393523061825319, - "grad_norm": 0.9722338914871216, - "learning_rate": 2.3655555555555555e-05, - "loss": 0.1179, + "epoch": 0.5574092247301276, + "grad_norm": 0.6864317655563354, + "learning_rate": 4.999857826757162e-05, + "loss": 0.0746, "step": 2130 }, { - "epoch": 0.1400065423617926, - "grad_norm": 0.8309763669967651, - "learning_rate": 2.3766666666666667e-05, - "loss": 0.1232, + "epoch": 0.5600261694471704, + "grad_norm": 0.5315617322921753, + "learning_rate": 4.999834930290741e-05, + "loss": 0.0682, "step": 2140 }, { - "epoch": 0.14066077854105333, - "grad_norm": 0.8105255961418152, - "learning_rate": 2.387777777777778e-05, - "loss": 0.1308, + "epoch": 0.5626431141642133, + "grad_norm": 0.6081879734992981, + "learning_rate": 4.9998103252107166e-05, + "loss": 0.0693, "step": 2150 }, { - "epoch": 0.14131501472031405, - "grad_norm": 0.8585493564605713, - "learning_rate": 2.398888888888889e-05, - "loss": 0.1196, + "epoch": 0.5652600588812562, + "grad_norm": 0.44701310992240906, + "learning_rate": 4.999784011533907e-05, + "loss": 0.0672, "step": 2160 }, { - "epoch": 0.14196925089957474, - "grad_norm": 0.9327337145805359, - "learning_rate": 2.41e-05, - "loss": 0.1212, + "epoch": 0.567877003598299, + "grad_norm": 0.8048236966133118, + "learning_rate": 4.999755989278298e-05, + "loss": 0.0681, "step": 2170 }, { - "epoch": 0.14262348707883546, - "grad_norm": 0.867504358291626, - "learning_rate": 2.421111111111111e-05, - "loss": 0.1131, + "epoch": 0.5704939483153418, + "grad_norm": 0.7597357034683228, + "learning_rate": 4.99972625846304e-05, + "loss": 0.0681, "step": 2180 }, { - "epoch": 0.14327772325809618, - "grad_norm": 0.8426870703697205, - "learning_rate": 2.4322222222222224e-05, - "loss": 0.1239, + "epoch": 0.5731108930323847, + "grad_norm": 0.6289070248603821, + "learning_rate": 4.999694819108456e-05, + "loss": 0.0705, "step": 2190 }, { - "epoch": 0.1439319594373569, - "grad_norm": 0.9085756540298462, - "learning_rate": 2.4433333333333335e-05, - "loss": 0.1288, + "epoch": 0.5757278377494276, + "grad_norm": 1.0394102334976196, + "learning_rate": 4.999661671236034e-05, + "loss": 0.0823, "step": 2200 }, { - "epoch": 0.1445861956166176, - "grad_norm": 0.9379426836967468, - "learning_rate": 2.4544444444444444e-05, - "loss": 0.1168, + "epoch": 0.5783447824664704, + "grad_norm": 0.9317861199378967, + "learning_rate": 4.99962681486843e-05, + "loss": 0.0725, "step": 2210 }, { - "epoch": 0.1452404317958783, - "grad_norm": 0.9728325009346008, - "learning_rate": 2.4655555555555556e-05, - "loss": 0.1237, + "epoch": 0.5809617271835132, + "grad_norm": 1.0117568969726562, + "learning_rate": 4.9995902500294676e-05, + "loss": 0.0751, "step": 2220 }, { - "epoch": 0.14589466797513903, - "grad_norm": 0.9717603921890259, - "learning_rate": 2.4766666666666668e-05, - "loss": 0.1172, + "epoch": 0.5835786719005561, + "grad_norm": 0.7560598254203796, + "learning_rate": 4.99955197674414e-05, + "loss": 0.0723, "step": 2230 }, { - "epoch": 0.14654890415439975, - "grad_norm": 0.9177740812301636, - "learning_rate": 2.4877777777777776e-05, - "loss": 0.1172, + "epoch": 0.586195616617599, + "grad_norm": 0.7627548575401306, + "learning_rate": 4.999511995038605e-05, + "loss": 0.0744, "step": 2240 }, { - "epoch": 0.14720314033366044, - "grad_norm": 1.4036056995391846, - "learning_rate": 2.498888888888889e-05, - "loss": 0.125, + "epoch": 0.5888125613346418, + "grad_norm": 0.5133088827133179, + "learning_rate": 4.99947030494019e-05, + "loss": 0.071, "step": 2250 }, { - "epoch": 0.14785737651292116, - "grad_norm": 1.0242791175842285, - "learning_rate": 2.51e-05, - "loss": 0.1333, + "epoch": 0.5914295060516847, + "grad_norm": 0.8383373022079468, + "learning_rate": 4.9994269064773916e-05, + "loss": 0.0816, "step": 2260 }, { - "epoch": 0.14851161269218188, - "grad_norm": 0.8376342058181763, - "learning_rate": 2.5211111111111112e-05, - "loss": 0.1227, + "epoch": 0.5940464507687275, + "grad_norm": 0.4755108058452606, + "learning_rate": 4.9993817996798695e-05, + "loss": 0.0709, "step": 2270 }, { - "epoch": 0.1491658488714426, - "grad_norm": 0.9228038191795349, - "learning_rate": 2.5322222222222224e-05, - "loss": 0.1296, + "epoch": 0.5966633954857704, + "grad_norm": 0.8120405077934265, + "learning_rate": 4.999334984578456e-05, + "loss": 0.0648, "step": 2280 }, { - "epoch": 0.1498200850507033, - "grad_norm": 0.8739307522773743, - "learning_rate": 2.5433333333333336e-05, - "loss": 0.1086, + "epoch": 0.5992803402028132, + "grad_norm": 0.67301344871521, + "learning_rate": 4.999286461205147e-05, + "loss": 0.0734, "step": 2290 }, { - "epoch": 0.15047432122996401, - "grad_norm": 0.9078662991523743, - "learning_rate": 2.5544444444444445e-05, - "loss": 0.1307, + "epoch": 0.6018972849198561, + "grad_norm": 1.0979734659194946, + "learning_rate": 4.9992362295931094e-05, + "loss": 0.075, "step": 2300 }, { - "epoch": 0.15112855740922473, - "grad_norm": 1.0658289194107056, - "learning_rate": 2.5655555555555557e-05, - "loss": 0.128, + "epoch": 0.6045142296368989, + "grad_norm": 0.9093306660652161, + "learning_rate": 4.999184289776675e-05, + "loss": 0.0762, "step": 2310 }, { - "epoch": 0.15178279358848545, - "grad_norm": 0.8755213618278503, - "learning_rate": 2.5766666666666665e-05, - "loss": 0.1305, + "epoch": 0.6071311743539418, + "grad_norm": 1.5600154399871826, + "learning_rate": 4.999130641791344e-05, + "loss": 0.0744, "step": 2320 }, { - "epoch": 0.15243702976774615, - "grad_norm": 1.0628185272216797, - "learning_rate": 2.5877777777777777e-05, - "loss": 0.1244, + "epoch": 0.6097481190709846, + "grad_norm": 0.7541422247886658, + "learning_rate": 4.9990752856737856e-05, + "loss": 0.0827, "step": 2330 }, { - "epoch": 0.15309126594700687, - "grad_norm": 1.0583535432815552, - "learning_rate": 2.598888888888889e-05, - "loss": 0.1257, + "epoch": 0.6123650637880275, + "grad_norm": 0.8271247744560242, + "learning_rate": 4.9990182214618334e-05, + "loss": 0.0688, "step": 2340 }, { - "epoch": 0.15374550212626759, - "grad_norm": 0.9537503719329834, - "learning_rate": 2.61e-05, - "loss": 0.1241, + "epoch": 0.6149820085050703, + "grad_norm": 0.7597739100456238, + "learning_rate": 4.9989594491944915e-05, + "loss": 0.0743, "step": 2350 }, { - "epoch": 0.1543997383055283, - "grad_norm": 1.0985156297683716, - "learning_rate": 2.6211111111111113e-05, - "loss": 0.1359, + "epoch": 0.6175989532221132, + "grad_norm": 0.8055959343910217, + "learning_rate": 4.9988989689119296e-05, + "loss": 0.079, "step": 2360 }, { - "epoch": 0.155053974484789, - "grad_norm": 0.9252329468727112, - "learning_rate": 2.6322222222222225e-05, - "loss": 0.1317, + "epoch": 0.620215897939156, + "grad_norm": 1.167317271232605, + "learning_rate": 4.9988367806554856e-05, + "loss": 0.0697, "step": 2370 }, { - "epoch": 0.15570821066404972, - "grad_norm": 1.1681864261627197, - "learning_rate": 2.6433333333333333e-05, - "loss": 0.1242, + "epoch": 0.6228328426561989, + "grad_norm": 0.5930063128471375, + "learning_rate": 4.9987728844676657e-05, + "loss": 0.0755, "step": 2380 }, { - "epoch": 0.15636244684331044, - "grad_norm": 1.2066115140914917, - "learning_rate": 2.6544444444444445e-05, - "loss": 0.1152, + "epoch": 0.6254497873732418, + "grad_norm": 0.9787731170654297, + "learning_rate": 4.998707280392141e-05, + "loss": 0.0759, "step": 2390 }, { - "epoch": 0.15701668302257116, - "grad_norm": 1.1337898969650269, - "learning_rate": 2.6655555555555557e-05, - "loss": 0.1334, + "epoch": 0.6280667320902846, + "grad_norm": 0.7472729682922363, + "learning_rate": 4.998639968473751e-05, + "loss": 0.0741, "step": 2400 }, { - "epoch": 0.15767091920183185, - "grad_norm": 0.9998272657394409, - "learning_rate": 2.676666666666667e-05, - "loss": 0.1189, + "epoch": 0.6306836768073274, + "grad_norm": 0.6840028166770935, + "learning_rate": 4.998570948758503e-05, + "loss": 0.0692, "step": 2410 }, { - "epoch": 0.15832515538109257, - "grad_norm": 0.8952450752258301, - "learning_rate": 2.687777777777778e-05, - "loss": 0.1227, + "epoch": 0.6333006215243703, + "grad_norm": 0.7901447415351868, + "learning_rate": 4.998500221293572e-05, + "loss": 0.0843, "step": 2420 }, { - "epoch": 0.1589793915603533, - "grad_norm": 1.0793391466140747, - "learning_rate": 2.6988888888888893e-05, - "loss": 0.1198, + "epoch": 0.6359175662414132, + "grad_norm": 0.6261860132217407, + "learning_rate": 4.9984277861273e-05, + "loss": 0.0709, "step": 2430 }, { - "epoch": 0.159633627739614, - "grad_norm": 1.016587734222412, - "learning_rate": 2.7100000000000005e-05, - "loss": 0.1208, + "epoch": 0.638534510958456, + "grad_norm": 0.6743974089622498, + "learning_rate": 4.9983536433091936e-05, + "loss": 0.0739, "step": 2440 }, { - "epoch": 0.1602878639188747, - "grad_norm": 1.0260379314422607, - "learning_rate": 2.7211111111111113e-05, - "loss": 0.1258, + "epoch": 0.6411514556754988, + "grad_norm": 0.9257513880729675, + "learning_rate": 4.998277792889931e-05, + "loss": 0.0783, "step": 2450 }, { - "epoch": 0.16094210009813542, - "grad_norm": 0.9341796636581421, - "learning_rate": 2.7322222222222222e-05, - "loss": 0.1172, + "epoch": 0.6437684003925417, + "grad_norm": 0.5694191455841064, + "learning_rate": 4.998200234921354e-05, + "loss": 0.0694, "step": 2460 }, { - "epoch": 0.16159633627739614, - "grad_norm": 1.0528295040130615, - "learning_rate": 2.7433333333333334e-05, - "loss": 0.124, + "epoch": 0.6463853451095846, + "grad_norm": 0.8103319406509399, + "learning_rate": 4.9981209694564726e-05, + "loss": 0.076, "step": 2470 }, { - "epoch": 0.16225057245665686, - "grad_norm": 0.831092894077301, - "learning_rate": 2.7544444444444446e-05, - "loss": 0.1143, + "epoch": 0.6490022898266274, + "grad_norm": 0.5619904398918152, + "learning_rate": 4.998039996549465e-05, + "loss": 0.0706, "step": 2480 }, { - "epoch": 0.16290480863591755, - "grad_norm": 1.0132439136505127, - "learning_rate": 2.7655555555555558e-05, - "loss": 0.134, + "epoch": 0.6516192345436702, + "grad_norm": 1.2966636419296265, + "learning_rate": 4.997957316255675e-05, + "loss": 0.0772, "step": 2490 }, { - "epoch": 0.16355904481517827, - "grad_norm": 1.041785717010498, - "learning_rate": 2.776666666666667e-05, - "loss": 0.1328, + "epoch": 0.6542361792607131, + "grad_norm": 0.7027465105056763, + "learning_rate": 4.997872928631613e-05, + "loss": 0.0746, "step": 2500 }, { - "epoch": 0.164213280994439, - "grad_norm": 1.1009939908981323, - "learning_rate": 2.787777777777778e-05, - "loss": 0.1262, + "epoch": 0.656853123977756, + "grad_norm": 0.8370850682258606, + "learning_rate": 4.997786833734959e-05, + "loss": 0.0762, "step": 2510 }, { - "epoch": 0.1648675171736997, - "grad_norm": 1.0970548391342163, - "learning_rate": 2.7988888888888893e-05, - "loss": 0.132, + "epoch": 0.6594700686947988, + "grad_norm": 0.8642465472221375, + "learning_rate": 4.997699031624556e-05, + "loss": 0.0747, "step": 2520 }, { - "epoch": 0.16552175335296043, - "grad_norm": 1.0594844818115234, - "learning_rate": 2.8100000000000005e-05, - "loss": 0.1267, + "epoch": 0.6620870134118417, + "grad_norm": 0.7573094964027405, + "learning_rate": 4.9976095223604166e-05, + "loss": 0.0742, "step": 2530 }, { - "epoch": 0.16617598953222112, - "grad_norm": 1.1462485790252686, - "learning_rate": 2.821111111111111e-05, - "loss": 0.1214, + "epoch": 0.6647039581288845, + "grad_norm": 0.8384932279586792, + "learning_rate": 4.99751830600372e-05, + "loss": 0.0675, "step": 2540 }, { - "epoch": 0.16683022571148184, - "grad_norm": 1.044067621231079, - "learning_rate": 2.8322222222222222e-05, - "loss": 0.1287, + "epoch": 0.6673209028459274, + "grad_norm": 0.8319125771522522, + "learning_rate": 4.997425382616812e-05, + "loss": 0.0792, "step": 2550 }, { - "epoch": 0.16748446189074256, - "grad_norm": 1.0372158288955688, - "learning_rate": 2.8433333333333334e-05, - "loss": 0.1297, + "epoch": 0.6699378475629703, + "grad_norm": 0.8607365489006042, + "learning_rate": 4.997330752263203e-05, + "loss": 0.0796, "step": 2560 }, { - "epoch": 0.16813869807000328, - "grad_norm": 1.0640093088150024, - "learning_rate": 2.8544444444444446e-05, - "loss": 0.1266, + "epoch": 0.6725547922800131, + "grad_norm": 0.699475884437561, + "learning_rate": 4.997234415007574e-05, + "loss": 0.067, "step": 2570 }, { - "epoch": 0.16879293424926398, - "grad_norm": 0.9317982792854309, - "learning_rate": 2.8655555555555558e-05, - "loss": 0.1283, + "epoch": 0.6751717369970559, + "grad_norm": 0.9081335067749023, + "learning_rate": 4.99713637091577e-05, + "loss": 0.0774, "step": 2580 }, { - "epoch": 0.1694471704285247, - "grad_norm": 1.5074639320373535, - "learning_rate": 2.876666666666667e-05, - "loss": 0.1105, + "epoch": 0.6777886817140988, + "grad_norm": 0.5712231993675232, + "learning_rate": 4.997036620054803e-05, + "loss": 0.0644, "step": 2590 }, { - "epoch": 0.17010140660778542, - "grad_norm": 1.1034104824066162, - "learning_rate": 2.8877777777777782e-05, - "loss": 0.1186, + "epoch": 0.6804056264311417, + "grad_norm": 0.9103120565414429, + "learning_rate": 4.996935162492852e-05, + "loss": 0.0769, "step": 2600 }, { - "epoch": 0.17075564278704614, - "grad_norm": 0.9475632905960083, - "learning_rate": 2.8988888888888887e-05, - "loss": 0.1191, + "epoch": 0.6830225711481845, + "grad_norm": 0.6050474643707275, + "learning_rate": 4.996831998299262e-05, + "loss": 0.0784, "step": 2610 }, { - "epoch": 0.17140987896630683, - "grad_norm": 0.9753337502479553, - "learning_rate": 2.91e-05, - "loss": 0.115, + "epoch": 0.6856395158652273, + "grad_norm": 0.5857220888137817, + "learning_rate": 4.9967271275445444e-05, + "loss": 0.0736, "step": 2620 }, { - "epoch": 0.17206411514556755, - "grad_norm": 1.0091173648834229, - "learning_rate": 2.921111111111111e-05, - "loss": 0.1132, + "epoch": 0.6882564605822702, + "grad_norm": 0.5730482339859009, + "learning_rate": 4.996620550300378e-05, + "loss": 0.0727, "step": 2630 }, { - "epoch": 0.17271835132482827, - "grad_norm": 0.8938674330711365, - "learning_rate": 2.9322222222222223e-05, - "loss": 0.1221, + "epoch": 0.6908734052993131, + "grad_norm": 0.791704535484314, + "learning_rate": 4.996512266639608e-05, + "loss": 0.0809, "step": 2640 }, { - "epoch": 0.173372587504089, - "grad_norm": 0.9849511384963989, - "learning_rate": 2.9433333333333335e-05, - "loss": 0.1161, + "epoch": 0.693490350016356, + "grad_norm": 0.6655763983726501, + "learning_rate": 4.9964022766362436e-05, + "loss": 0.0793, "step": 2650 }, { - "epoch": 0.17402682368334968, - "grad_norm": 0.9725368618965149, - "learning_rate": 2.9544444444444447e-05, - "loss": 0.1117, + "epoch": 0.6961072947333987, + "grad_norm": 0.9879754185676575, + "learning_rate": 4.996290580365463e-05, + "loss": 0.0751, "step": 2660 }, { - "epoch": 0.1746810598626104, - "grad_norm": 0.8807811737060547, - "learning_rate": 2.965555555555556e-05, - "loss": 0.1251, + "epoch": 0.6987242394504416, + "grad_norm": 0.8520436882972717, + "learning_rate": 4.996177177903609e-05, + "loss": 0.0739, "step": 2670 }, { - "epoch": 0.17533529604187112, - "grad_norm": 1.1785404682159424, - "learning_rate": 2.976666666666667e-05, - "loss": 0.1219, + "epoch": 0.7013411841674845, + "grad_norm": 0.7003749012947083, + "learning_rate": 4.9960620693281924e-05, + "loss": 0.0761, "step": 2680 }, { - "epoch": 0.17598953222113184, - "grad_norm": 0.9174340963363647, - "learning_rate": 2.9877777777777776e-05, - "loss": 0.1099, + "epoch": 0.7039581288845274, + "grad_norm": 0.8380037546157837, + "learning_rate": 4.995945254717887e-05, + "loss": 0.0759, "step": 2690 }, { - "epoch": 0.17664376840039253, - "grad_norm": 1.0297743082046509, - "learning_rate": 2.9988888888888888e-05, - "loss": 0.1163, + "epoch": 0.7065750736015701, + "grad_norm": 0.756373941898346, + "learning_rate": 4.9958267341525353e-05, + "loss": 0.073, "step": 2700 }, { - "epoch": 0.17729800457965325, - "grad_norm": 0.8429158926010132, - "learning_rate": 3.01e-05, - "loss": 0.1147, + "epoch": 0.709192018318613, + "grad_norm": 0.6666063070297241, + "learning_rate": 4.995706507713146e-05, + "loss": 0.0712, "step": 2710 }, { - "epoch": 0.17795224075891397, - "grad_norm": 1.0993781089782715, - "learning_rate": 3.0211111111111112e-05, - "loss": 0.1241, + "epoch": 0.7118089630356559, + "grad_norm": 0.743940532207489, + "learning_rate": 4.9955845754818906e-05, + "loss": 0.0746, "step": 2720 }, { - "epoch": 0.1786064769381747, - "grad_norm": 1.0947730541229248, - "learning_rate": 3.0322222222222224e-05, - "loss": 0.1137, + "epoch": 0.7144259077526988, + "grad_norm": 0.5994930267333984, + "learning_rate": 4.9954609375421105e-05, + "loss": 0.0733, "step": 2730 }, { - "epoch": 0.17926071311743538, - "grad_norm": 1.0554438829421997, - "learning_rate": 3.0433333333333336e-05, - "loss": 0.1211, + "epoch": 0.7170428524697415, + "grad_norm": 0.6286001205444336, + "learning_rate": 4.995335593978311e-05, + "loss": 0.0684, "step": 2740 }, { - "epoch": 0.1799149492966961, - "grad_norm": 0.987632155418396, - "learning_rate": 3.054444444444445e-05, - "loss": 0.1207, + "epoch": 0.7196597971867844, + "grad_norm": 0.6213799715042114, + "learning_rate": 4.995208544876162e-05, + "loss": 0.0684, "step": 2750 }, { - "epoch": 0.18056918547595682, - "grad_norm": 1.0157499313354492, - "learning_rate": 3.065555555555556e-05, - "loss": 0.1182, + "epoch": 0.7222767419038273, + "grad_norm": 0.5822627544403076, + "learning_rate": 4.9950797903225006e-05, + "loss": 0.0684, "step": 2760 }, { - "epoch": 0.18122342165521754, - "grad_norm": 0.8833321332931519, - "learning_rate": 3.0766666666666665e-05, - "loss": 0.1202, + "epoch": 0.7248936866208702, + "grad_norm": 0.7613430619239807, + "learning_rate": 4.99494933040533e-05, + "loss": 0.0731, "step": 2770 }, { - "epoch": 0.18187765783447823, - "grad_norm": 1.0868935585021973, - "learning_rate": 3.087777777777778e-05, - "loss": 0.1207, + "epoch": 0.7275106313379129, + "grad_norm": 1.0139579772949219, + "learning_rate": 4.994817165213818e-05, + "loss": 0.0701, "step": 2780 }, { - "epoch": 0.18253189401373895, - "grad_norm": 1.1105583906173706, - "learning_rate": 3.098888888888889e-05, - "loss": 0.1242, + "epoch": 0.7301275760549558, + "grad_norm": 0.5469045639038086, + "learning_rate": 4.994683294838298e-05, + "loss": 0.0716, "step": 2790 }, { - "epoch": 0.18318613019299967, - "grad_norm": 1.014053225517273, - "learning_rate": 3.1100000000000004e-05, - "loss": 0.1176, + "epoch": 0.7327445207719987, + "grad_norm": 0.6303797960281372, + "learning_rate": 4.99454771937027e-05, + "loss": 0.0744, "step": 2800 }, { - "epoch": 0.1838403663722604, - "grad_norm": 1.2380222082138062, - "learning_rate": 3.121111111111111e-05, - "loss": 0.1231, + "epoch": 0.7353614654890416, + "grad_norm": 0.5600771307945251, + "learning_rate": 4.994410438902398e-05, + "loss": 0.081, "step": 2810 }, { - "epoch": 0.1844946025515211, - "grad_norm": 0.9214850664138794, - "learning_rate": 3.132222222222223e-05, - "loss": 0.1258, + "epoch": 0.7379784102060843, + "grad_norm": 0.6783274412155151, + "learning_rate": 4.994271453528511e-05, + "loss": 0.0695, "step": 2820 }, { - "epoch": 0.1851488387307818, - "grad_norm": 0.9159259796142578, - "learning_rate": 3.1433333333333336e-05, - "loss": 0.1199, + "epoch": 0.7405953549231272, + "grad_norm": 0.7181297540664673, + "learning_rate": 4.994130763343606e-05, + "loss": 0.0704, "step": 2830 }, { - "epoch": 0.18580307491004253, - "grad_norm": 1.0562981367111206, - "learning_rate": 3.154444444444445e-05, - "loss": 0.12, + "epoch": 0.7432122996401701, + "grad_norm": 0.6178589463233948, + "learning_rate": 4.993988368443843e-05, + "loss": 0.0711, "step": 2840 }, { - "epoch": 0.18645731108930325, - "grad_norm": 0.9648650288581848, - "learning_rate": 3.165555555555555e-05, - "loss": 0.1168, + "epoch": 0.745829244357213, + "grad_norm": 0.40977856516838074, + "learning_rate": 4.9938442689265456e-05, + "loss": 0.0726, "step": 2850 }, { - "epoch": 0.18711154726856394, - "grad_norm": 1.028402328491211, - "learning_rate": 3.176666666666667e-05, - "loss": 0.1281, + "epoch": 0.7484461890742558, + "grad_norm": 0.6271493434906006, + "learning_rate": 4.9936984648902064e-05, + "loss": 0.0821, "step": 2860 }, { - "epoch": 0.18776578344782466, - "grad_norm": 0.9670931696891785, - "learning_rate": 3.187777777777778e-05, - "loss": 0.1236, + "epoch": 0.7510631337912986, + "grad_norm": 0.5872370600700378, + "learning_rate": 4.993550956434481e-05, + "loss": 0.0742, "step": 2870 }, { - "epoch": 0.18842001962708538, - "grad_norm": 0.9317355751991272, - "learning_rate": 3.198888888888889e-05, - "loss": 0.113, + "epoch": 0.7536800785083415, + "grad_norm": 0.5435254573822021, + "learning_rate": 4.99340174366019e-05, + "loss": 0.0703, "step": 2880 }, { - "epoch": 0.1890742558063461, - "grad_norm": 0.978927493095398, - "learning_rate": 3.21e-05, - "loss": 0.1223, + "epoch": 0.7562970232253844, + "grad_norm": 0.6672695875167847, + "learning_rate": 4.993250826669318e-05, + "loss": 0.0786, "step": 2890 }, { - "epoch": 0.18972849198560682, - "grad_norm": 0.9678372740745544, - "learning_rate": 3.2211111111111116e-05, - "loss": 0.1246, + "epoch": 0.7589139679424273, + "grad_norm": 0.6329795718193054, + "learning_rate": 4.993098205565016e-05, + "loss": 0.0695, "step": 2900 }, { - "epoch": 0.1903827281648675, - "grad_norm": 0.856540322303772, - "learning_rate": 3.2322222222222225e-05, - "loss": 0.1192, + "epoch": 0.76153091265947, + "grad_norm": 0.47968053817749023, + "learning_rate": 4.992943880451599e-05, + "loss": 0.0672, "step": 2910 }, { - "epoch": 0.19103696434412823, - "grad_norm": 1.0986219644546509, - "learning_rate": 3.243333333333333e-05, - "loss": 0.1189, + "epoch": 0.7641478573765129, + "grad_norm": 0.5885211825370789, + "learning_rate": 4.992787851434546e-05, + "loss": 0.0786, "step": 2920 }, { - "epoch": 0.19169120052338895, - "grad_norm": 0.9947769641876221, - "learning_rate": 3.254444444444444e-05, - "loss": 0.1222, + "epoch": 0.7667648020935558, + "grad_norm": 0.8792977929115295, + "learning_rate": 4.992630118620504e-05, + "loss": 0.0769, "step": 2930 }, { - "epoch": 0.19234543670264967, - "grad_norm": 0.9733787178993225, - "learning_rate": 3.265555555555556e-05, - "loss": 0.1214, + "epoch": 0.7693817468105987, + "grad_norm": 1.083542823791504, + "learning_rate": 4.9924706821172784e-05, + "loss": 0.0802, "step": 2940 }, { - "epoch": 0.19299967288191036, - "grad_norm": 1.0199573040008545, - "learning_rate": 3.2766666666666666e-05, - "loss": 0.1304, + "epoch": 0.7719986915276414, + "grad_norm": 0.811312198638916, + "learning_rate": 4.992309542033845e-05, + "loss": 0.0735, "step": 2950 }, { - "epoch": 0.19365390906117108, - "grad_norm": 0.8536534905433655, - "learning_rate": 3.287777777777778e-05, - "loss": 0.1093, + "epoch": 0.7746156362446843, + "grad_norm": 0.9727795124053955, + "learning_rate": 4.99214669848034e-05, + "loss": 0.073, "step": 2960 }, { - "epoch": 0.1943081452404318, - "grad_norm": 0.8914840817451477, - "learning_rate": 3.298888888888889e-05, - "loss": 0.1064, + "epoch": 0.7772325809617272, + "grad_norm": 0.8784989714622498, + "learning_rate": 4.9919821515680665e-05, + "loss": 0.0699, "step": 2970 }, { - "epoch": 0.19496238141969252, - "grad_norm": 0.9319343566894531, - "learning_rate": 3.3100000000000005e-05, - "loss": 0.118, + "epoch": 0.7798495256787701, + "grad_norm": 0.744993269443512, + "learning_rate": 4.9918159014094906e-05, + "loss": 0.0714, "step": 2980 }, { - "epoch": 0.1956166175989532, - "grad_norm": 1.0728380680084229, - "learning_rate": 3.3211111111111114e-05, - "loss": 0.1202, + "epoch": 0.7824664703958129, + "grad_norm": 0.7093546390533447, + "learning_rate": 4.991647948118242e-05, + "loss": 0.0703, "step": 2990 }, { - "epoch": 0.19627085377821393, - "grad_norm": 0.9401952624320984, - "learning_rate": 3.332222222222222e-05, - "loss": 0.1173, + "epoch": 0.7850834151128557, + "grad_norm": 0.6334063410758972, + "learning_rate": 4.991478291809116e-05, + "loss": 0.0683, + "step": 3000 + }, + { + "epoch": 0.7850834151128557, + "eval_loss": 0.07849323315593094, + "eval_runtime": 8.5756, + "eval_samples_per_second": 119.408, + "eval_steps_per_second": 1.866, "step": 3000 }, { - "epoch": 0.19692508995747465, - "grad_norm": 0.9090434312820435, - "learning_rate": 3.343333333333333e-05, - "loss": 0.1285, + "epoch": 0.7877003598298986, + "grad_norm": 0.7186859846115112, + "learning_rate": 4.991306932598071e-05, + "loss": 0.0701, "step": 3010 }, { - "epoch": 0.19757932613673537, - "grad_norm": 1.0409756898880005, - "learning_rate": 3.3544444444444446e-05, - "loss": 0.1172, + "epoch": 0.7903173045469415, + "grad_norm": 0.7815176844596863, + "learning_rate": 4.991133870602229e-05, + "loss": 0.0702, "step": 3020 }, { - "epoch": 0.19823356231599606, - "grad_norm": 1.026477336883545, - "learning_rate": 3.3655555555555554e-05, - "loss": 0.1277, + "epoch": 0.7929342492639843, + "grad_norm": 0.8299248814582825, + "learning_rate": 4.9909591059398764e-05, + "loss": 0.0637, "step": 3030 }, { - "epoch": 0.19888779849525678, - "grad_norm": 1.1095457077026367, - "learning_rate": 3.376666666666667e-05, - "loss": 0.1131, + "epoch": 0.7955511939810271, + "grad_norm": 0.6321790814399719, + "learning_rate": 4.990782638730464e-05, + "loss": 0.0734, "step": 3040 }, { - "epoch": 0.1995420346745175, - "grad_norm": 1.1334766149520874, - "learning_rate": 3.387777777777778e-05, - "loss": 0.1327, + "epoch": 0.79816813869807, + "grad_norm": 0.7430514097213745, + "learning_rate": 4.990604469094603e-05, + "loss": 0.0657, "step": 3050 }, { - "epoch": 0.20019627085377822, - "grad_norm": 0.8890497088432312, - "learning_rate": 3.3988888888888894e-05, - "loss": 0.1209, + "epoch": 0.8007850834151129, + "grad_norm": 0.9191706776618958, + "learning_rate": 4.9904245971540745e-05, + "loss": 0.0718, "step": 3060 }, { - "epoch": 0.20085050703303892, - "grad_norm": 0.8564671874046326, - "learning_rate": 3.41e-05, - "loss": 0.1148, + "epoch": 0.8034020281321557, + "grad_norm": 0.6048433184623718, + "learning_rate": 4.990243023031815e-05, + "loss": 0.0688, "step": 3070 }, { - "epoch": 0.20150474321229964, - "grad_norm": 0.9905887842178345, - "learning_rate": 3.421111111111111e-05, - "loss": 0.1045, + "epoch": 0.8060189728491985, + "grad_norm": 0.5070809721946716, + "learning_rate": 4.990059746851932e-05, + "loss": 0.0687, "step": 3080 }, { - "epoch": 0.20215897939156036, - "grad_norm": 1.105401873588562, - "learning_rate": 3.432222222222222e-05, - "loss": 0.1101, + "epoch": 0.8086359175662414, + "grad_norm": 0.8282752633094788, + "learning_rate": 4.9898747687396916e-05, + "loss": 0.0788, "step": 3090 }, { - "epoch": 0.20281321557082108, - "grad_norm": 1.2804734706878662, - "learning_rate": 3.4433333333333335e-05, - "loss": 0.1264, + "epoch": 0.8112528622832843, + "grad_norm": 0.6389812231063843, + "learning_rate": 4.9896880888215254e-05, + "loss": 0.0782, "step": 3100 }, { - "epoch": 0.20346745175008177, - "grad_norm": 0.8999707698822021, - "learning_rate": 3.454444444444444e-05, - "loss": 0.1089, + "epoch": 0.8138698070003271, + "grad_norm": 1.0264029502868652, + "learning_rate": 4.989499707225026e-05, + "loss": 0.0738, "step": 3110 }, { - "epoch": 0.2041216879293425, - "grad_norm": 1.0667651891708374, - "learning_rate": 3.465555555555556e-05, - "loss": 0.1171, + "epoch": 0.81648675171737, + "grad_norm": 0.8186160326004028, + "learning_rate": 4.989309624078952e-05, + "loss": 0.0732, "step": 3120 }, { - "epoch": 0.2047759241086032, - "grad_norm": 0.9714770913124084, - "learning_rate": 3.476666666666667e-05, - "loss": 0.1118, + "epoch": 0.8191036964344128, + "grad_norm": 0.7047196626663208, + "learning_rate": 4.9891178395132224e-05, + "loss": 0.0764, "step": 3130 }, { - "epoch": 0.20543016028786393, - "grad_norm": 1.0278006792068481, - "learning_rate": 3.487777777777778e-05, - "loss": 0.1239, + "epoch": 0.8217206411514557, + "grad_norm": 0.7498276233673096, + "learning_rate": 4.98892435365892e-05, + "loss": 0.0755, "step": 3140 }, { - "epoch": 0.20608439646712462, - "grad_norm": 1.0025670528411865, - "learning_rate": 3.498888888888889e-05, - "loss": 0.1171, + "epoch": 0.8243375858684985, + "grad_norm": 0.8340325951576233, + "learning_rate": 4.988729166648292e-05, + "loss": 0.073, "step": 3150 }, { - "epoch": 0.20673863264638534, - "grad_norm": 1.191396713256836, - "learning_rate": 3.51e-05, - "loss": 0.1208, + "epoch": 0.8269545305855414, + "grad_norm": 0.7537565231323242, + "learning_rate": 4.988532278614746e-05, + "loss": 0.0693, "step": 3160 }, { - "epoch": 0.20739286882564606, - "grad_norm": 0.9984032511711121, - "learning_rate": 3.5211111111111115e-05, - "loss": 0.1164, + "epoch": 0.8295714753025842, + "grad_norm": 0.5033385157585144, + "learning_rate": 4.988333689692852e-05, + "loss": 0.0693, "step": 3170 }, { - "epoch": 0.20804710500490678, - "grad_norm": 0.863865852355957, - "learning_rate": 3.532222222222222e-05, - "loss": 0.1043, + "epoch": 0.8321884200196271, + "grad_norm": 0.5596148371696472, + "learning_rate": 4.988133400018345e-05, + "loss": 0.0698, "step": 3180 }, { - "epoch": 0.20870134118416747, - "grad_norm": 1.0260671377182007, - "learning_rate": 3.543333333333333e-05, - "loss": 0.1231, + "epoch": 0.8348053647366699, + "grad_norm": 0.9153453707695007, + "learning_rate": 4.987931409728121e-05, + "loss": 0.0697, "step": 3190 }, { - "epoch": 0.2093555773634282, - "grad_norm": 1.0903806686401367, - "learning_rate": 3.554444444444445e-05, - "loss": 0.1176, + "epoch": 0.8374223094537128, + "grad_norm": 0.8965064287185669, + "learning_rate": 4.9877277189602384e-05, + "loss": 0.0772, "step": 3200 }, { - "epoch": 0.2100098135426889, - "grad_norm": 1.0024579763412476, - "learning_rate": 3.5655555555555556e-05, - "loss": 0.1153, + "epoch": 0.8400392541707556, + "grad_norm": 0.4553247392177582, + "learning_rate": 4.987522327853917e-05, + "loss": 0.0744, "step": 3210 }, { - "epoch": 0.21066404972194963, - "grad_norm": 1.0972620248794556, - "learning_rate": 3.576666666666667e-05, - "loss": 0.1191, + "epoch": 0.8426561988877985, + "grad_norm": 0.9072389006614685, + "learning_rate": 4.987315236549541e-05, + "loss": 0.0706, "step": 3220 }, { - "epoch": 0.21131828590121032, - "grad_norm": 0.980950653553009, - "learning_rate": 3.587777777777778e-05, - "loss": 0.1071, + "epoch": 0.8452731436048413, + "grad_norm": 0.7078306674957275, + "learning_rate": 4.9871064451886554e-05, + "loss": 0.0627, "step": 3230 }, { - "epoch": 0.21197252208047104, - "grad_norm": 1.0454057455062866, - "learning_rate": 3.598888888888889e-05, - "loss": 0.1213, + "epoch": 0.8478900883218842, + "grad_norm": 0.5441957712173462, + "learning_rate": 4.986895953913966e-05, + "loss": 0.0692, "step": 3240 }, { - "epoch": 0.21262675825973176, - "grad_norm": 1.0699424743652344, - "learning_rate": 3.61e-05, - "loss": 0.1195, + "epoch": 0.850507033038927, + "grad_norm": 0.4400569796562195, + "learning_rate": 4.9866837628693416e-05, + "loss": 0.0669, "step": 3250 }, { - "epoch": 0.21328099443899248, - "grad_norm": 0.956729531288147, - "learning_rate": 3.621111111111111e-05, - "loss": 0.1157, + "epoch": 0.8531239777559699, + "grad_norm": 0.5146138072013855, + "learning_rate": 4.9864698721998136e-05, + "loss": 0.0729, "step": 3260 }, { - "epoch": 0.2139352306182532, - "grad_norm": 1.1061006784439087, - "learning_rate": 3.632222222222223e-05, - "loss": 0.1204, + "epoch": 0.8557409224730128, + "grad_norm": 0.6510049700737, + "learning_rate": 4.986254282051575e-05, + "loss": 0.0712, "step": 3270 }, { - "epoch": 0.2145894667975139, - "grad_norm": 1.060990571975708, - "learning_rate": 3.6433333333333336e-05, - "loss": 0.117, + "epoch": 0.8583578671900556, + "grad_norm": 0.698809027671814, + "learning_rate": 4.986036992571978e-05, + "loss": 0.0736, "step": 3280 }, { - "epoch": 0.21524370297677461, - "grad_norm": 1.000768780708313, - "learning_rate": 3.654444444444445e-05, - "loss": 0.1206, + "epoch": 0.8609748119070985, + "grad_norm": 0.6784025430679321, + "learning_rate": 4.985818003909537e-05, + "loss": 0.0825, "step": 3290 }, { - "epoch": 0.21589793915603533, - "grad_norm": 0.9216058850288391, - "learning_rate": 3.665555555555556e-05, - "loss": 0.1127, + "epoch": 0.8635917566241413, + "grad_norm": 0.7913259267807007, + "learning_rate": 4.9855973162139316e-05, + "loss": 0.0721, "step": 3300 }, { - "epoch": 0.21655217533529605, - "grad_norm": 0.9647179841995239, - "learning_rate": 3.676666666666667e-05, - "loss": 0.1204, + "epoch": 0.8662087013411842, + "grad_norm": 0.7243340611457825, + "learning_rate": 4.985374929635998e-05, + "loss": 0.0707, "step": 3310 }, { - "epoch": 0.21720641151455675, - "grad_norm": 1.077235221862793, - "learning_rate": 3.687777777777778e-05, - "loss": 0.1343, + "epoch": 0.868825646058227, + "grad_norm": 0.5413879156112671, + "learning_rate": 4.985150844327736e-05, + "loss": 0.0779, "step": 3320 }, { - "epoch": 0.21786064769381747, - "grad_norm": 1.0138746500015259, - "learning_rate": 3.698888888888889e-05, - "loss": 0.1187, + "epoch": 0.8714425907752699, + "grad_norm": 0.87245112657547, + "learning_rate": 4.984925060442306e-05, + "loss": 0.0687, "step": 3330 }, { - "epoch": 0.2185148838730782, - "grad_norm": 0.9533920884132385, - "learning_rate": 3.71e-05, - "loss": 0.1069, + "epoch": 0.8740595354923127, + "grad_norm": 0.6047831773757935, + "learning_rate": 4.9846975781340274e-05, + "loss": 0.0728, "step": 3340 }, { - "epoch": 0.2191691200523389, - "grad_norm": 0.8543539643287659, - "learning_rate": 3.7211111111111116e-05, - "loss": 0.1246, + "epoch": 0.8766764802093556, + "grad_norm": 0.5928580164909363, + "learning_rate": 4.984468397558384e-05, + "loss": 0.0729, "step": 3350 }, { - "epoch": 0.2198233562315996, - "grad_norm": 0.88579922914505, - "learning_rate": 3.7322222222222224e-05, - "loss": 0.1059, + "epoch": 0.8792934249263984, + "grad_norm": 0.68870609998703, + "learning_rate": 4.984237518872018e-05, + "loss": 0.072, "step": 3360 }, { - "epoch": 0.22047759241086032, - "grad_norm": 0.9485725164413452, - "learning_rate": 3.743333333333334e-05, - "loss": 0.1074, + "epoch": 0.8819103696434413, + "grad_norm": 0.6248053908348083, + "learning_rate": 4.9840049422327325e-05, + "loss": 0.0787, "step": 3370 }, { - "epoch": 0.22113182859012104, - "grad_norm": 1.2649180889129639, - "learning_rate": 3.754444444444445e-05, - "loss": 0.1173, + "epoch": 0.8845273143604842, + "grad_norm": 0.589297890663147, + "learning_rate": 4.983770667799492e-05, + "loss": 0.066, "step": 3380 }, { - "epoch": 0.22178606476938176, - "grad_norm": 0.9635019302368164, - "learning_rate": 3.765555555555556e-05, - "loss": 0.1057, + "epoch": 0.887144259077527, + "grad_norm": 0.7025781273841858, + "learning_rate": 4.9835346957324206e-05, + "loss": 0.0675, "step": 3390 }, { - "epoch": 0.22244030094864245, - "grad_norm": 1.0199334621429443, - "learning_rate": 3.7766666666666665e-05, - "loss": 0.1113, + "epoch": 0.8897612037945698, + "grad_norm": 0.7033712267875671, + "learning_rate": 4.983297026192804e-05, + "loss": 0.081, "step": 3400 }, { - "epoch": 0.22309453712790317, - "grad_norm": 1.4301012754440308, - "learning_rate": 3.787777777777778e-05, - "loss": 0.1258, + "epoch": 0.8923781485116127, + "grad_norm": 0.5622373223304749, + "learning_rate": 4.983057659343085e-05, + "loss": 0.0617, "step": 3410 }, { - "epoch": 0.2237487733071639, - "grad_norm": 1.0970239639282227, - "learning_rate": 3.798888888888889e-05, - "loss": 0.1142, + "epoch": 0.8949950932286556, + "grad_norm": 0.8491391539573669, + "learning_rate": 4.98281659534687e-05, + "loss": 0.0623, "step": 3420 }, { - "epoch": 0.2244030094864246, - "grad_norm": 1.0526765584945679, - "learning_rate": 3.8100000000000005e-05, - "loss": 0.1119, + "epoch": 0.8976120379456984, + "grad_norm": 0.6941475868225098, + "learning_rate": 4.982573834368923e-05, + "loss": 0.0722, "step": 3430 }, { - "epoch": 0.2250572456656853, - "grad_norm": 0.9532793164253235, - "learning_rate": 3.821111111111111e-05, - "loss": 0.1206, + "epoch": 0.9002289826627412, + "grad_norm": 0.6664544939994812, + "learning_rate": 4.98232937657517e-05, + "loss": 0.0772, "step": 3440 }, { - "epoch": 0.22571148184494602, - "grad_norm": 1.0817819833755493, - "learning_rate": 3.832222222222223e-05, - "loss": 0.1198, + "epoch": 0.9028459273797841, + "grad_norm": 0.6295298933982849, + "learning_rate": 4.982083222132695e-05, + "loss": 0.0746, "step": 3450 }, { - "epoch": 0.22636571802420674, - "grad_norm": 1.0234626531600952, - "learning_rate": 3.843333333333334e-05, - "loss": 0.117, + "epoch": 0.905462872096827, + "grad_norm": 0.5968911051750183, + "learning_rate": 4.981835371209742e-05, + "loss": 0.0769, "step": 3460 }, { - "epoch": 0.22701995420346746, - "grad_norm": 0.9513489007949829, - "learning_rate": 3.8544444444444445e-05, - "loss": 0.1212, + "epoch": 0.9080798168138698, + "grad_norm": 0.5266377925872803, + "learning_rate": 4.981585823975715e-05, + "loss": 0.0645, "step": 3470 }, { - "epoch": 0.22767419038272815, - "grad_norm": 0.869707465171814, - "learning_rate": 3.8655555555555554e-05, - "loss": 0.1036, + "epoch": 0.9106967615309126, + "grad_norm": 0.965641975402832, + "learning_rate": 4.981334580601178e-05, + "loss": 0.0795, "step": 3480 }, { - "epoch": 0.22832842656198887, - "grad_norm": 0.8939826488494873, - "learning_rate": 3.876666666666667e-05, - "loss": 0.1139, + "epoch": 0.9133137062479555, + "grad_norm": 0.6652019619941711, + "learning_rate": 4.9810816412578525e-05, + "loss": 0.0669, "step": 3490 }, { - "epoch": 0.2289826627412496, - "grad_norm": 0.8776119351387024, - "learning_rate": 3.887777777777778e-05, - "loss": 0.1133, + "epoch": 0.9159306509649984, + "grad_norm": 0.921328067779541, + "learning_rate": 4.9808270061186204e-05, + "loss": 0.0722, "step": 3500 }, { - "epoch": 0.2296368989205103, - "grad_norm": 1.042763590812683, - "learning_rate": 3.898888888888889e-05, - "loss": 0.106, + "epoch": 0.9185475956820413, + "grad_norm": 0.6966074705123901, + "learning_rate": 4.980570675357522e-05, + "loss": 0.0663, "step": 3510 }, { - "epoch": 0.230291135099771, - "grad_norm": 1.07195246219635, - "learning_rate": 3.91e-05, - "loss": 0.1103, + "epoch": 0.921164540399084, + "grad_norm": 0.7490382194519043, + "learning_rate": 4.980312649149758e-05, + "loss": 0.0764, "step": 3520 }, { - "epoch": 0.23094537127903172, - "grad_norm": 0.9736886024475098, - "learning_rate": 3.921111111111112e-05, - "loss": 0.1118, + "epoch": 0.9237814851161269, + "grad_norm": 0.6709177494049072, + "learning_rate": 4.980052927671686e-05, + "loss": 0.0743, "step": 3530 }, { - "epoch": 0.23159960745829244, - "grad_norm": 1.0525206327438354, - "learning_rate": 3.932222222222222e-05, - "loss": 0.1201, + "epoch": 0.9263984298331698, + "grad_norm": 0.81973797082901, + "learning_rate": 4.9797915111008236e-05, + "loss": 0.0705, "step": 3540 }, { - "epoch": 0.23225384363755316, - "grad_norm": 0.9918597340583801, - "learning_rate": 3.9433333333333334e-05, - "loss": 0.1199, + "epoch": 0.9290153745502127, + "grad_norm": 0.8974931240081787, + "learning_rate": 4.979528399615846e-05, + "loss": 0.0703, "step": 3550 }, { - "epoch": 0.23290807981681386, - "grad_norm": 1.1127550601959229, - "learning_rate": 3.954444444444444e-05, - "loss": 0.1211, + "epoch": 0.9316323192672554, + "grad_norm": 1.0583653450012207, + "learning_rate": 4.979263593396588e-05, + "loss": 0.0726, "step": 3560 }, { - "epoch": 0.23356231599607458, - "grad_norm": 1.1743565797805786, - "learning_rate": 3.965555555555556e-05, - "loss": 0.1142, + "epoch": 0.9342492639842983, + "grad_norm": 0.7668379545211792, + "learning_rate": 4.978997092624043e-05, + "loss": 0.0683, "step": 3570 }, { - "epoch": 0.2342165521753353, - "grad_norm": 1.09585702419281, - "learning_rate": 3.9766666666666667e-05, - "loss": 0.1281, + "epoch": 0.9368662087013412, + "grad_norm": 0.8339925408363342, + "learning_rate": 4.978728897480359e-05, + "loss": 0.0753, "step": 3580 }, { - "epoch": 0.23487078835459602, - "grad_norm": 0.9824903011322021, - "learning_rate": 3.987777777777778e-05, - "loss": 0.1139, + "epoch": 0.9394831534183841, + "grad_norm": 0.6908124685287476, + "learning_rate": 4.978459008148847e-05, + "loss": 0.0719, "step": 3590 }, { - "epoch": 0.23552502453385674, - "grad_norm": 1.1848162412643433, - "learning_rate": 3.998888888888889e-05, - "loss": 0.1074, + "epoch": 0.9421000981354269, + "grad_norm": 0.6138983964920044, + "learning_rate": 4.978187424813974e-05, + "loss": 0.0767, "step": 3600 }, { - "epoch": 0.23617926071311743, - "grad_norm": 1.1790823936462402, - "learning_rate": 4.0100000000000006e-05, - "loss": 0.1141, + "epoch": 0.9447170428524697, + "grad_norm": 0.6764789819717407, + "learning_rate": 4.977914147661364e-05, + "loss": 0.07, "step": 3610 }, { - "epoch": 0.23683349689237815, - "grad_norm": 0.9774477481842041, - "learning_rate": 4.021111111111111e-05, - "loss": 0.1095, + "epoch": 0.9473339875695126, + "grad_norm": 0.5771927833557129, + "learning_rate": 4.977639176877799e-05, + "loss": 0.0647, "step": 3620 }, { - "epoch": 0.23748773307163887, - "grad_norm": 1.0515410900115967, - "learning_rate": 4.032222222222222e-05, - "loss": 0.1156, + "epoch": 0.9499509322865555, + "grad_norm": 0.5599648952484131, + "learning_rate": 4.977362512651219e-05, + "loss": 0.0663, "step": 3630 }, { - "epoch": 0.2381419692508996, - "grad_norm": 0.9490527510643005, - "learning_rate": 4.043333333333333e-05, - "loss": 0.1243, + "epoch": 0.9525678770035984, + "grad_norm": 0.7062062621116638, + "learning_rate": 4.9770841551707226e-05, + "loss": 0.0699, "step": 3640 }, { - "epoch": 0.23879620543016028, - "grad_norm": 0.9454949498176575, - "learning_rate": 4.054444444444445e-05, - "loss": 0.1138, + "epoch": 0.9551848217206411, + "grad_norm": 0.4688286781311035, + "learning_rate": 4.976804104626563e-05, + "loss": 0.0721, "step": 3650 }, { - "epoch": 0.239450441609421, - "grad_norm": 0.9614177346229553, - "learning_rate": 4.0655555555555555e-05, - "loss": 0.1097, + "epoch": 0.957801766437684, + "grad_norm": 0.5337697863578796, + "learning_rate": 4.9765223612101534e-05, + "loss": 0.0656, "step": 3660 }, { - "epoch": 0.24010467778868172, - "grad_norm": 0.9757283329963684, - "learning_rate": 4.076666666666667e-05, - "loss": 0.1151, + "epoch": 0.9604187111547269, + "grad_norm": 0.619052529335022, + "learning_rate": 4.976238925114062e-05, + "loss": 0.0707, "step": 3670 }, { - "epoch": 0.24075891396794244, - "grad_norm": 0.9931315779685974, - "learning_rate": 4.087777777777778e-05, - "loss": 0.1203, + "epoch": 0.9630356558717698, + "grad_norm": 0.6267631649971008, + "learning_rate": 4.975953796532015e-05, + "loss": 0.0655, "step": 3680 }, { - "epoch": 0.24141315014720313, - "grad_norm": 0.9971386194229126, - "learning_rate": 4.0988888888888894e-05, - "loss": 0.1139, + "epoch": 0.9656526005888125, + "grad_norm": 0.5680310726165771, + "learning_rate": 4.9756669756588944e-05, + "loss": 0.0677, "step": 3690 }, { - "epoch": 0.24206738632646385, - "grad_norm": 1.0257045030593872, - "learning_rate": 4.11e-05, - "loss": 0.1319, + "epoch": 0.9682695453058554, + "grad_norm": 0.6664942502975464, + "learning_rate": 4.9753784626907395e-05, + "loss": 0.0721, "step": 3700 }, { - "epoch": 0.24272162250572457, - "grad_norm": 1.130571722984314, - "learning_rate": 4.121111111111111e-05, - "loss": 0.1159, + "epoch": 0.9708864900228983, + "grad_norm": 0.7929763197898865, + "learning_rate": 4.975088257824748e-05, + "loss": 0.0685, "step": 3710 }, { - "epoch": 0.2433758586849853, - "grad_norm": 0.893582284450531, - "learning_rate": 4.132222222222222e-05, - "loss": 0.1225, + "epoch": 0.9735034347399412, + "grad_norm": 0.6787649393081665, + "learning_rate": 4.974796361259271e-05, + "loss": 0.0681, "step": 3720 }, { - "epoch": 0.24403009486424598, - "grad_norm": 0.9939897656440735, - "learning_rate": 4.1433333333333335e-05, - "loss": 0.1016, + "epoch": 0.9761203794569839, + "grad_norm": 0.7067198753356934, + "learning_rate": 4.974502773193816e-05, + "loss": 0.066, "step": 3730 }, { - "epoch": 0.2446843310435067, - "grad_norm": 0.9984415173530579, - "learning_rate": 4.1544444444444444e-05, - "loss": 0.1094, + "epoch": 0.9787373241740268, + "grad_norm": 0.6574423313140869, + "learning_rate": 4.974207493829049e-05, + "loss": 0.0697, "step": 3740 }, { - "epoch": 0.24533856722276742, - "grad_norm": 1.0292317867279053, - "learning_rate": 4.165555555555556e-05, - "loss": 0.1147, + "epoch": 0.9813542688910697, + "grad_norm": 0.6540109515190125, + "learning_rate": 4.97391052336679e-05, + "loss": 0.0734, "step": 3750 }, { - "epoch": 0.24599280340202814, - "grad_norm": 0.9206924438476562, - "learning_rate": 4.176666666666667e-05, - "loss": 0.1269, + "epoch": 0.9839712136081126, + "grad_norm": 0.6842678785324097, + "learning_rate": 4.973611862010017e-05, + "loss": 0.0699, "step": 3760 }, { - "epoch": 0.24664703958128883, - "grad_norm": 1.1015186309814453, - "learning_rate": 4.187777777777778e-05, - "loss": 0.1162, + "epoch": 0.9865881583251553, + "grad_norm": 0.6524052619934082, + "learning_rate": 4.97331150996286e-05, + "loss": 0.0724, "step": 3770 }, { - "epoch": 0.24730127576054955, - "grad_norm": 0.950849175453186, - "learning_rate": 4.198888888888889e-05, - "loss": 0.1153, + "epoch": 0.9892051030421982, + "grad_norm": 0.7590875625610352, + "learning_rate": 4.973009467430608e-05, + "loss": 0.0681, "step": 3780 }, { - "epoch": 0.24795551193981027, - "grad_norm": 1.0211883783340454, - "learning_rate": 4.21e-05, - "loss": 0.1259, + "epoch": 0.9918220477592411, + "grad_norm": 1.2194914817810059, + "learning_rate": 4.9727057346197046e-05, + "loss": 0.0757, "step": 3790 }, { - "epoch": 0.248609748119071, - "grad_norm": 1.113376259803772, - "learning_rate": 4.2211111111111115e-05, - "loss": 0.1198, + "epoch": 0.994438992476284, + "grad_norm": 0.7816717624664307, + "learning_rate": 4.9724003117377484e-05, + "loss": 0.0743, "step": 3800 }, { - "epoch": 0.2492639842983317, - "grad_norm": 1.0689711570739746, - "learning_rate": 4.2322222222222224e-05, - "loss": 0.1199, + "epoch": 0.9970559371933267, + "grad_norm": 0.5530228614807129, + "learning_rate": 4.972093198993492e-05, + "loss": 0.063, "step": 3810 }, { - "epoch": 0.2499182204775924, - "grad_norm": 1.1541417837142944, - "learning_rate": 4.243333333333334e-05, - "loss": 0.1148, + "epoch": 0.9996728819103696, + "grad_norm": 0.6032941341400146, + "learning_rate": 4.971784396596843e-05, + "loss": 0.07, "step": 3820 }, { - "epoch": 0.2505724566568531, - "grad_norm": 0.9707340598106384, - "learning_rate": 4.254444444444445e-05, - "loss": 0.1298, + "epoch": 1.0020935557736343, + "grad_norm": 0.7758064866065979, + "learning_rate": 4.971473904758868e-05, + "loss": 0.0711, "step": 3830 }, { - "epoch": 0.2512266928361138, - "grad_norm": 1.2857780456542969, - "learning_rate": 4.2655555555555556e-05, - "loss": 0.1298, + "epoch": 1.0047105004906771, + "grad_norm": 1.45048189163208, + "learning_rate": 4.971161723691784e-05, + "loss": 0.0701, "step": 3840 }, { - "epoch": 0.25188092901537457, - "grad_norm": 0.8688610792160034, - "learning_rate": 4.2766666666666665e-05, - "loss": 0.1169, + "epoch": 1.0073274452077199, + "grad_norm": 1.0233992338180542, + "learning_rate": 4.9708478536089626e-05, + "loss": 0.0687, "step": 3850 }, { - "epoch": 0.25253516519463526, - "grad_norm": 1.0786139965057373, - "learning_rate": 4.287777777777778e-05, - "loss": 0.1279, + "epoch": 1.0099443899247629, + "grad_norm": 1.0815229415893555, + "learning_rate": 4.9705322947249325e-05, + "loss": 0.0716, "step": 3860 }, { - "epoch": 0.25318940137389595, - "grad_norm": 1.1422791481018066, - "learning_rate": 4.298888888888889e-05, - "loss": 0.1114, + "epoch": 1.0125613346418056, + "grad_norm": 0.981233537197113, + "learning_rate": 4.970215047255374e-05, + "loss": 0.0733, "step": 3870 }, { - "epoch": 0.2538436375531567, - "grad_norm": 0.9638531804084778, - "learning_rate": 4.3100000000000004e-05, - "loss": 0.1144, + "epoch": 1.0151782793588486, + "grad_norm": 1.0590592622756958, + "learning_rate": 4.969896111417124e-05, + "loss": 0.0668, "step": 3880 }, { - "epoch": 0.2544978737324174, - "grad_norm": 1.1392393112182617, - "learning_rate": 4.321111111111111e-05, - "loss": 0.1087, + "epoch": 1.0177952240758914, + "grad_norm": 1.0366718769073486, + "learning_rate": 4.969575487428171e-05, + "loss": 0.0696, "step": 3890 }, { - "epoch": 0.25515210991167814, - "grad_norm": 0.9770271182060242, - "learning_rate": 4.332222222222223e-05, - "loss": 0.1305, + "epoch": 1.0204121687929342, + "grad_norm": 0.6779992580413818, + "learning_rate": 4.96925317550766e-05, + "loss": 0.0688, "step": 3900 }, { - "epoch": 0.25580634609093883, - "grad_norm": 0.9613223075866699, - "learning_rate": 4.3433333333333336e-05, - "loss": 0.1093, + "epoch": 1.0230291135099772, + "grad_norm": 0.4735548496246338, + "learning_rate": 4.9689291758758874e-05, + "loss": 0.0703, "step": 3910 }, { - "epoch": 0.2564605822701995, - "grad_norm": 0.9954821467399597, - "learning_rate": 4.354444444444445e-05, - "loss": 0.1201, + "epoch": 1.02564605822702, + "grad_norm": 0.6989257335662842, + "learning_rate": 4.968603488754302e-05, + "loss": 0.0771, "step": 3920 }, { - "epoch": 0.25711481844946027, - "grad_norm": 1.0677788257598877, - "learning_rate": 4.3655555555555554e-05, - "loss": 0.1102, + "epoch": 1.0282630029440627, + "grad_norm": 0.7250983119010925, + "learning_rate": 4.968276114365511e-05, + "loss": 0.0706, "step": 3930 }, { - "epoch": 0.25776905462872096, - "grad_norm": 1.1282657384872437, - "learning_rate": 4.376666666666667e-05, - "loss": 0.118, + "epoch": 1.0308799476611057, + "grad_norm": 0.9411719441413879, + "learning_rate": 4.96794705293327e-05, + "loss": 0.0697, "step": 3940 }, { - "epoch": 0.25842329080798165, - "grad_norm": 1.0023534297943115, - "learning_rate": 4.387777777777778e-05, - "loss": 0.1144, + "epoch": 1.0334968923781485, + "grad_norm": 0.7651953101158142, + "learning_rate": 4.96761630468249e-05, + "loss": 0.0711, "step": 3950 }, { - "epoch": 0.2590775269872424, - "grad_norm": 0.9646608233451843, - "learning_rate": 4.398888888888889e-05, - "loss": 0.1106, + "epoch": 1.0361138370951914, + "grad_norm": 0.4986521303653717, + "learning_rate": 4.967283869839233e-05, + "loss": 0.0749, "step": 3960 }, { - "epoch": 0.2597317631665031, - "grad_norm": 1.139906406402588, - "learning_rate": 4.41e-05, - "loss": 0.1134, + "epoch": 1.0387307818122342, + "grad_norm": 0.8632932901382446, + "learning_rate": 4.966949748630716e-05, + "loss": 0.0715, "step": 3970 }, { - "epoch": 0.26038599934576384, - "grad_norm": 0.960753858089447, - "learning_rate": 4.4211111111111117e-05, - "loss": 0.1142, + "epoch": 1.041347726529277, + "grad_norm": 0.6957460641860962, + "learning_rate": 4.966613941285308e-05, + "loss": 0.0827, "step": 3980 }, { - "epoch": 0.26104023552502453, - "grad_norm": 1.0693498849868774, - "learning_rate": 4.4322222222222225e-05, - "loss": 0.1091, + "epoch": 1.04396467124632, + "grad_norm": 0.61822909116745, + "learning_rate": 4.966276448032531e-05, + "loss": 0.0744, "step": 3990 }, { - "epoch": 0.2616944717042852, - "grad_norm": 1.1489149332046509, - "learning_rate": 4.443333333333334e-05, - "loss": 0.107, + "epoch": 1.0465816159633627, + "grad_norm": 0.9637846946716309, + "learning_rate": 4.965937269103057e-05, + "loss": 0.0755, + "step": 4000 + }, + { + "epoch": 1.0465816159633627, + "eval_loss": 0.07746633274647106, + "eval_runtime": 8.6612, + "eval_samples_per_second": 118.229, + "eval_steps_per_second": 1.847, "step": 4000 }, { - "epoch": 0.262348707883546, - "grad_norm": 0.9810730218887329, - "learning_rate": 4.454444444444444e-05, - "loss": 0.1197, + "epoch": 1.0491985606804057, + "grad_norm": 0.7809803485870361, + "learning_rate": 4.9655964047287114e-05, + "loss": 0.0753, "step": 4010 }, { - "epoch": 0.26300294406280667, - "grad_norm": 0.9745346307754517, - "learning_rate": 4.465555555555556e-05, - "loss": 0.1158, + "epoch": 1.0518155053974485, + "grad_norm": 0.6005727052688599, + "learning_rate": 4.965253855142472e-05, + "loss": 0.0677, "step": 4020 }, { - "epoch": 0.2636571802420674, - "grad_norm": 1.056600570678711, - "learning_rate": 4.4766666666666666e-05, - "loss": 0.1102, + "epoch": 1.0544324501144913, + "grad_norm": 0.8859987258911133, + "learning_rate": 4.96490962057847e-05, + "loss": 0.0701, "step": 4030 }, { - "epoch": 0.2643114164213281, - "grad_norm": 1.090558409690857, - "learning_rate": 4.487777777777778e-05, - "loss": 0.1151, + "epoch": 1.0570493948315343, + "grad_norm": 0.49039119482040405, + "learning_rate": 4.964563701271984e-05, + "loss": 0.0684, "step": 4040 }, { - "epoch": 0.2649656526005888, - "grad_norm": 0.9222038984298706, - "learning_rate": 4.498888888888889e-05, - "loss": 0.1103, + "epoch": 1.059666339548577, + "grad_norm": 0.5293424725532532, + "learning_rate": 4.964216097459448e-05, + "loss": 0.0603, "step": 4050 }, { - "epoch": 0.26561988877984954, - "grad_norm": 1.0304709672927856, - "learning_rate": 4.5100000000000005e-05, - "loss": 0.1169, + "epoch": 1.0622832842656198, + "grad_norm": 0.535614550113678, + "learning_rate": 4.9638668093784445e-05, + "loss": 0.0701, "step": 4060 }, { - "epoch": 0.26627412495911024, - "grad_norm": 1.0132149457931519, - "learning_rate": 4.5211111111111114e-05, - "loss": 0.1074, + "epoch": 1.0649002289826628, + "grad_norm": 0.7680052518844604, + "learning_rate": 4.96351583726771e-05, + "loss": 0.0644, "step": 4070 }, { - "epoch": 0.26692836113837093, - "grad_norm": 1.0685970783233643, - "learning_rate": 4.532222222222223e-05, - "loss": 0.1087, + "epoch": 1.0675171736997056, + "grad_norm": 0.6915044784545898, + "learning_rate": 4.9631631813671314e-05, + "loss": 0.0728, "step": 4080 }, { - "epoch": 0.2675825973176317, - "grad_norm": 1.0187596082687378, - "learning_rate": 4.543333333333333e-05, - "loss": 0.1225, + "epoch": 1.0701341184167485, + "grad_norm": 0.5453565120697021, + "learning_rate": 4.962808841917744e-05, + "loss": 0.0691, "step": 4090 }, { - "epoch": 0.26823683349689237, - "grad_norm": 1.0164580345153809, - "learning_rate": 4.5544444444444446e-05, - "loss": 0.119, + "epoch": 1.0727510631337913, + "grad_norm": 0.5886152982711792, + "learning_rate": 4.962452819161736e-05, + "loss": 0.0719, "step": 4100 }, { - "epoch": 0.2688910696761531, - "grad_norm": 0.9964419007301331, - "learning_rate": 4.5655555555555555e-05, - "loss": 0.1077, + "epoch": 1.075368007850834, + "grad_norm": 0.747417151927948, + "learning_rate": 4.962095113342445e-05, + "loss": 0.0719, "step": 4110 }, { - "epoch": 0.2695453058554138, - "grad_norm": 1.1001616716384888, - "learning_rate": 4.576666666666667e-05, - "loss": 0.1203, + "epoch": 1.077984952567877, + "grad_norm": 0.9508110284805298, + "learning_rate": 4.9617357247043616e-05, + "loss": 0.0723, "step": 4120 }, { - "epoch": 0.2701995420346745, - "grad_norm": 0.984094500541687, - "learning_rate": 4.587777777777778e-05, - "loss": 0.1219, + "epoch": 1.0806018972849198, + "grad_norm": 0.8973356485366821, + "learning_rate": 4.961374653493122e-05, + "loss": 0.0643, "step": 4130 }, { - "epoch": 0.27085377821393525, - "grad_norm": 0.8646033406257629, - "learning_rate": 4.5988888888888894e-05, - "loss": 0.1144, + "epoch": 1.0832188420019626, + "grad_norm": 0.740419328212738, + "learning_rate": 4.9610118999555165e-05, + "loss": 0.0765, "step": 4140 }, { - "epoch": 0.27150801439319594, - "grad_norm": 0.8888686299324036, - "learning_rate": 4.61e-05, - "loss": 0.1216, + "epoch": 1.0858357867190056, + "grad_norm": 0.7163695096969604, + "learning_rate": 4.960647464339484e-05, + "loss": 0.0688, "step": 4150 }, { - "epoch": 0.27216225057245663, - "grad_norm": 0.9775686264038086, - "learning_rate": 4.621111111111111e-05, - "loss": 0.1119, + "epoch": 1.0884527314360484, + "grad_norm": 0.5579230189323425, + "learning_rate": 4.960281346894111e-05, + "loss": 0.0732, "step": 4160 }, { - "epoch": 0.2728164867517174, - "grad_norm": 1.1479732990264893, - "learning_rate": 4.632222222222222e-05, - "loss": 0.1159, + "epoch": 1.0910696761530914, + "grad_norm": 0.8381157517433167, + "learning_rate": 4.959913547869637e-05, + "loss": 0.0756, "step": 4170 }, { - "epoch": 0.27347072293097807, - "grad_norm": 0.9950860738754272, - "learning_rate": 4.6433333333333335e-05, - "loss": 0.1151, + "epoch": 1.0936866208701341, + "grad_norm": 0.8548111319541931, + "learning_rate": 4.959544067517449e-05, + "loss": 0.0694, "step": 4180 }, { - "epoch": 0.2741249591102388, - "grad_norm": 1.044150948524475, - "learning_rate": 4.6544444444444443e-05, - "loss": 0.1029, + "epoch": 1.096303565587177, + "grad_norm": 0.5482849478721619, + "learning_rate": 4.959172906090082e-05, + "loss": 0.0672, "step": 4190 }, { - "epoch": 0.2747791952894995, - "grad_norm": 0.9072064757347107, - "learning_rate": 4.665555555555556e-05, - "loss": 0.1217, + "epoch": 1.0989205103042199, + "grad_norm": 0.6070835590362549, + "learning_rate": 4.958800063841223e-05, + "loss": 0.0639, "step": 4200 }, { - "epoch": 0.2754334314687602, - "grad_norm": 1.0930536985397339, - "learning_rate": 4.676666666666667e-05, - "loss": 0.1161, + "epoch": 1.1015374550212627, + "grad_norm": 0.7139147520065308, + "learning_rate": 4.958425541025705e-05, + "loss": 0.0659, "step": 4210 }, { - "epoch": 0.27608766764802095, - "grad_norm": 1.104744791984558, - "learning_rate": 4.687777777777778e-05, - "loss": 0.1127, + "epoch": 1.1041543997383054, + "grad_norm": 0.49353158473968506, + "learning_rate": 4.958049337899512e-05, + "loss": 0.058, "step": 4220 }, { - "epoch": 0.27674190382728164, - "grad_norm": 0.8702093958854675, - "learning_rate": 4.698888888888889e-05, - "loss": 0.1117, + "epoch": 1.1067713444553484, + "grad_norm": 0.7460055351257324, + "learning_rate": 4.957671454719774e-05, + "loss": 0.0652, "step": 4230 }, { - "epoch": 0.27739614000654234, - "grad_norm": 1.0562191009521484, - "learning_rate": 4.71e-05, - "loss": 0.1172, + "epoch": 1.1093882891723912, + "grad_norm": 0.8104005455970764, + "learning_rate": 4.9572918917447715e-05, + "loss": 0.0669, "step": 4240 }, { - "epoch": 0.2780503761858031, - "grad_norm": 1.0193010568618774, - "learning_rate": 4.721111111111111e-05, - "loss": 0.1117, + "epoch": 1.1120052338894342, + "grad_norm": 0.6421072483062744, + "learning_rate": 4.956910649233931e-05, + "loss": 0.066, "step": 4250 }, { - "epoch": 0.2787046123650638, - "grad_norm": 1.2876737117767334, - "learning_rate": 4.7322222222222224e-05, - "loss": 0.1174, + "epoch": 1.114622178606477, + "grad_norm": 0.5011855363845825, + "learning_rate": 4.9565277274478304e-05, + "loss": 0.0734, "step": 4260 }, { - "epoch": 0.2793588485443245, - "grad_norm": 1.0730646848678589, - "learning_rate": 4.743333333333333e-05, - "loss": 0.1136, + "epoch": 1.1172391233235197, + "grad_norm": 0.46482545137405396, + "learning_rate": 4.9561431266481906e-05, + "loss": 0.0621, "step": 4270 }, { - "epoch": 0.2800130847235852, - "grad_norm": 1.0272183418273926, - "learning_rate": 4.754444444444445e-05, - "loss": 0.1042, + "epoch": 1.1198560680405627, + "grad_norm": 0.545685887336731, + "learning_rate": 4.955756847097884e-05, + "loss": 0.0695, "step": 4280 }, { - "epoch": 0.2806673209028459, - "grad_norm": 0.9593018293380737, - "learning_rate": 4.7655555555555556e-05, - "loss": 0.1068, + "epoch": 1.1224730127576055, + "grad_norm": 0.6811407208442688, + "learning_rate": 4.9553688890609296e-05, + "loss": 0.0691, "step": 4290 }, { - "epoch": 0.28132155708210665, - "grad_norm": 1.1273571252822876, - "learning_rate": 4.776666666666667e-05, - "loss": 0.1183, + "epoch": 1.1250899574746485, + "grad_norm": 0.8716004490852356, + "learning_rate": 4.954979252802492e-05, + "loss": 0.0733, "step": 4300 }, { - "epoch": 0.28197579326136735, - "grad_norm": 0.9116014838218689, - "learning_rate": 4.787777777777778e-05, - "loss": 0.1114, + "epoch": 1.1277069021916912, + "grad_norm": 0.8074702024459839, + "learning_rate": 4.9545879385888836e-05, + "loss": 0.0739, "step": 4310 }, { - "epoch": 0.2826300294406281, - "grad_norm": 0.9448710680007935, - "learning_rate": 4.798888888888889e-05, - "loss": 0.1155, + "epoch": 1.130323846908734, + "grad_norm": 0.9537233710289001, + "learning_rate": 4.9541949466875644e-05, + "loss": 0.0687, "step": 4320 }, { - "epoch": 0.2832842656198888, - "grad_norm": 0.9872847199440002, - "learning_rate": 4.8100000000000004e-05, - "loss": 0.1072, + "epoch": 1.132940791625777, + "grad_norm": 0.8007683753967285, + "learning_rate": 4.9538002773671415e-05, + "loss": 0.0681, "step": 4330 }, { - "epoch": 0.2839385017991495, - "grad_norm": 0.9878488779067993, - "learning_rate": 4.821111111111111e-05, - "loss": 0.1187, + "epoch": 1.1355577363428198, + "grad_norm": 0.8221684098243713, + "learning_rate": 4.953403930897367e-05, + "loss": 0.0741, "step": 4340 }, { - "epoch": 0.2845927379784102, - "grad_norm": 1.1943411827087402, - "learning_rate": 4.832222222222223e-05, - "loss": 0.1299, + "epoch": 1.1381746810598625, + "grad_norm": 0.4814731180667877, + "learning_rate": 4.9530059075491395e-05, + "loss": 0.0714, "step": 4350 }, { - "epoch": 0.2852469741576709, - "grad_norm": 1.0091590881347656, - "learning_rate": 4.8433333333333336e-05, - "loss": 0.1205, + "epoch": 1.1407916257769055, + "grad_norm": 0.6261118054389954, + "learning_rate": 4.952606207594505e-05, + "loss": 0.0674, "step": 4360 }, { - "epoch": 0.2859012103369316, - "grad_norm": 1.0240885019302368, - "learning_rate": 4.8544444444444445e-05, - "loss": 0.11, + "epoch": 1.1434085704939483, + "grad_norm": 0.6069873571395874, + "learning_rate": 4.952204831306654e-05, + "loss": 0.0666, "step": 4370 }, { - "epoch": 0.28655544651619236, - "grad_norm": 0.8469128012657166, - "learning_rate": 4.865555555555556e-05, - "loss": 0.1223, + "epoch": 1.146025515210991, + "grad_norm": 0.5223729610443115, + "learning_rate": 4.9518017789599244e-05, + "loss": 0.0745, "step": 4380 }, { - "epoch": 0.28720968269545305, - "grad_norm": 0.932327389717102, - "learning_rate": 4.876666666666667e-05, - "loss": 0.1061, + "epoch": 1.148642459928034, + "grad_norm": 0.5413695573806763, + "learning_rate": 4.951397050829797e-05, + "loss": 0.0696, "step": 4390 }, { - "epoch": 0.2878639188747138, - "grad_norm": 0.9431477785110474, - "learning_rate": 4.887777777777778e-05, - "loss": 0.1064, + "epoch": 1.1512594046450768, + "grad_norm": 0.7739647626876831, + "learning_rate": 4.9509906471929016e-05, + "loss": 0.0755, "step": 4400 }, { - "epoch": 0.2885181550539745, - "grad_norm": 0.9958844780921936, - "learning_rate": 4.898888888888889e-05, - "loss": 0.112, + "epoch": 1.1538763493621198, + "grad_norm": 0.8038027286529541, + "learning_rate": 4.950582568327009e-05, + "loss": 0.0686, "step": 4410 }, { - "epoch": 0.2891723912332352, - "grad_norm": 1.146600365638733, - "learning_rate": 4.91e-05, - "loss": 0.1189, + "epoch": 1.1564932940791626, + "grad_norm": 0.5534743070602417, + "learning_rate": 4.9501728145110395e-05, + "loss": 0.0751, "step": 4420 }, { - "epoch": 0.28982662741249593, - "grad_norm": 1.0450646877288818, - "learning_rate": 4.9211111111111116e-05, - "loss": 0.1268, + "epoch": 1.1591102387962053, + "grad_norm": 0.5017138123512268, + "learning_rate": 4.949761386025055e-05, + "loss": 0.0745, "step": 4430 }, { - "epoch": 0.2904808635917566, - "grad_norm": 1.2781046628952026, - "learning_rate": 4.9322222222222225e-05, - "loss": 0.123, + "epoch": 1.1617271835132483, + "grad_norm": 0.5460235476493835, + "learning_rate": 4.949348283150263e-05, + "loss": 0.067, "step": 4440 }, { - "epoch": 0.2911350997710173, - "grad_norm": 0.9840906858444214, - "learning_rate": 4.943333333333334e-05, - "loss": 0.1139, + "epoch": 1.164344128230291, + "grad_norm": 0.7793023586273193, + "learning_rate": 4.948933506169016e-05, + "loss": 0.0689, "step": 4450 }, { - "epoch": 0.29178933595027806, - "grad_norm": 0.9794589281082153, - "learning_rate": 4.954444444444445e-05, - "loss": 0.1176, + "epoch": 1.166961072947334, + "grad_norm": 0.7321499586105347, + "learning_rate": 4.9485170553648086e-05, + "loss": 0.0679, "step": 4460 }, { - "epoch": 0.29244357212953875, - "grad_norm": 1.2850165367126465, - "learning_rate": 4.965555555555556e-05, - "loss": 0.1245, + "epoch": 1.1695780176643769, + "grad_norm": 0.793841540813446, + "learning_rate": 4.948098931022282e-05, + "loss": 0.066, "step": 4470 }, { - "epoch": 0.2930978083087995, - "grad_norm": 0.9761756062507629, - "learning_rate": 4.9766666666666666e-05, - "loss": 0.1052, + "epoch": 1.1721949623814196, + "grad_norm": 0.8915519118309021, + "learning_rate": 4.947679133427221e-05, + "loss": 0.0672, "step": 4480 }, { - "epoch": 0.2937520444880602, - "grad_norm": 0.9061872959136963, - "learning_rate": 4.987777777777778e-05, - "loss": 0.1102, + "epoch": 1.1748119070984626, + "grad_norm": 0.5249608159065247, + "learning_rate": 4.9472576628665515e-05, + "loss": 0.0669, "step": 4490 }, { - "epoch": 0.2944062806673209, - "grad_norm": 1.1952383518218994, - "learning_rate": 4.998888888888889e-05, - "loss": 0.1046, + "epoch": 1.1774288518155054, + "grad_norm": 0.5474559664726257, + "learning_rate": 4.9468345196283465e-05, + "loss": 0.0711, "step": 4500 }, { - "epoch": 0.29506051684658163, - "grad_norm": 1.018266201019287, - "learning_rate": 5.0100000000000005e-05, - "loss": 0.1126, + "epoch": 1.1800457965325482, + "grad_norm": 0.8466582894325256, + "learning_rate": 4.94640970400182e-05, + "loss": 0.0619, "step": 4510 }, { - "epoch": 0.2957147530258423, - "grad_norm": 1.0872063636779785, - "learning_rate": 5.0211111111111107e-05, - "loss": 0.1123, + "epoch": 1.1826627412495911, + "grad_norm": 0.7379885315895081, + "learning_rate": 4.9459832162773276e-05, + "loss": 0.0643, "step": 4520 }, { - "epoch": 0.296368989205103, - "grad_norm": 1.1259245872497559, - "learning_rate": 5.032222222222223e-05, - "loss": 0.1138, + "epoch": 1.185279685966634, + "grad_norm": 0.6102809309959412, + "learning_rate": 4.9455550567463724e-05, + "loss": 0.0654, "step": 4530 }, { - "epoch": 0.29702322538436376, - "grad_norm": 0.939315676689148, - "learning_rate": 5.043333333333333e-05, - "loss": 0.112, + "epoch": 1.187896630683677, + "grad_norm": 0.6338376998901367, + "learning_rate": 4.945125225701595e-05, + "loss": 0.0639, "step": 4540 }, { - "epoch": 0.29767746156362446, - "grad_norm": 0.9471316933631897, - "learning_rate": 5.054444444444445e-05, - "loss": 0.116, + "epoch": 1.1905135754007197, + "grad_norm": 0.6943968534469604, + "learning_rate": 4.9446937234367834e-05, + "loss": 0.0669, "step": 4550 }, { - "epoch": 0.2983316977428852, - "grad_norm": 1.0100241899490356, - "learning_rate": 5.0655555555555554e-05, - "loss": 0.105, + "epoch": 1.1931305201177624, + "grad_norm": 0.5532180070877075, + "learning_rate": 4.944260550246863e-05, + "loss": 0.0717, "step": 4560 }, { - "epoch": 0.2989859339221459, - "grad_norm": 1.010973572731018, - "learning_rate": 5.0766666666666676e-05, - "loss": 0.0968, + "epoch": 1.1957474648348054, + "grad_norm": 0.727317750453949, + "learning_rate": 4.9438257064279046e-05, + "loss": 0.061, "step": 4570 }, { - "epoch": 0.2996401701014066, - "grad_norm": 1.0246537923812866, - "learning_rate": 5.087777777777778e-05, - "loss": 0.1301, + "epoch": 1.1983644095518482, + "grad_norm": 0.6897532343864441, + "learning_rate": 4.94338919227712e-05, + "loss": 0.0716, "step": 4580 }, { - "epoch": 0.30029440628066734, - "grad_norm": 1.0651960372924805, - "learning_rate": 5.098888888888889e-05, - "loss": 0.1184, + "epoch": 1.2009813542688912, + "grad_norm": 0.9191898703575134, + "learning_rate": 4.9429510080928624e-05, + "loss": 0.0751, "step": 4590 }, { - "epoch": 0.30094864245992803, - "grad_norm": 1.0061384439468384, - "learning_rate": 5.11e-05, - "loss": 0.1238, + "epoch": 1.203598298985934, + "grad_norm": 0.6276187896728516, + "learning_rate": 4.942511154174626e-05, + "loss": 0.0696, "step": 4600 }, { - "epoch": 0.3016028786391887, - "grad_norm": 1.0411393642425537, - "learning_rate": 5.121111111111111e-05, - "loss": 0.1066, + "epoch": 1.2062152437029767, + "grad_norm": 0.7370391488075256, + "learning_rate": 4.942069630823047e-05, + "loss": 0.0706, "step": 4610 }, { - "epoch": 0.30225711481844947, - "grad_norm": 1.1767691373825073, - "learning_rate": 5.1322222222222226e-05, - "loss": 0.1176, + "epoch": 1.2088321884200197, + "grad_norm": 1.0085150003433228, + "learning_rate": 4.941626438339903e-05, + "loss": 0.0699, "step": 4620 }, { - "epoch": 0.30291135099771016, - "grad_norm": 1.1108002662658691, - "learning_rate": 5.1433333333333334e-05, - "loss": 0.1231, + "epoch": 1.2114491331370625, + "grad_norm": 0.8513169884681702, + "learning_rate": 4.9411815770281125e-05, + "loss": 0.0723, "step": 4630 }, { - "epoch": 0.3035655871769709, - "grad_norm": 0.9178494811058044, - "learning_rate": 5.154444444444445e-05, - "loss": 0.1219, + "epoch": 1.2140660778541053, + "grad_norm": 0.7845053672790527, + "learning_rate": 4.9407350471917326e-05, + "loss": 0.0691, "step": 4640 }, { - "epoch": 0.3042198233562316, - "grad_norm": 1.046704649925232, - "learning_rate": 5.165555555555556e-05, - "loss": 0.108, + "epoch": 1.2166830225711482, + "grad_norm": 0.5463152527809143, + "learning_rate": 4.940286849135962e-05, + "loss": 0.0649, "step": 4650 }, { - "epoch": 0.3048740595354923, - "grad_norm": 1.1278116703033447, - "learning_rate": 5.1766666666666674e-05, - "loss": 0.1205, + "epoch": 1.219299967288191, + "grad_norm": 0.5815234780311584, + "learning_rate": 4.939836983167141e-05, + "loss": 0.0652, "step": 4660 }, { - "epoch": 0.30552829571475304, - "grad_norm": 1.049989104270935, - "learning_rate": 5.187777777777778e-05, - "loss": 0.1195, + "epoch": 1.2219169120052338, + "grad_norm": 0.6902991533279419, + "learning_rate": 4.939385449592748e-05, + "loss": 0.075, "step": 4670 }, { - "epoch": 0.30618253189401373, - "grad_norm": 1.1788783073425293, - "learning_rate": 5.1988888888888884e-05, - "loss": 0.112, + "epoch": 1.2245338567222768, + "grad_norm": 0.49586668610572815, + "learning_rate": 4.938932248721402e-05, + "loss": 0.068, "step": 4680 }, { - "epoch": 0.3068367680732745, - "grad_norm": 1.094664454460144, - "learning_rate": 5.2100000000000006e-05, - "loss": 0.1104, + "epoch": 1.2271508014393195, + "grad_norm": 0.7140547633171082, + "learning_rate": 4.938477380862862e-05, + "loss": 0.0755, "step": 4690 }, { - "epoch": 0.30749100425253517, - "grad_norm": 1.0641672611236572, - "learning_rate": 5.221111111111111e-05, - "loss": 0.1209, + "epoch": 1.2297677461563625, + "grad_norm": 0.799771249294281, + "learning_rate": 4.938020846328026e-05, + "loss": 0.0689, "step": 4700 }, { - "epoch": 0.30814524043179586, - "grad_norm": 0.9625852108001709, - "learning_rate": 5.232222222222223e-05, - "loss": 0.1171, + "epoch": 1.2323846908734053, + "grad_norm": 0.6713014245033264, + "learning_rate": 4.93756264542893e-05, + "loss": 0.0694, "step": 4710 }, { - "epoch": 0.3087994766110566, - "grad_norm": 0.9965457916259766, - "learning_rate": 5.243333333333333e-05, - "loss": 0.1152, + "epoch": 1.235001635590448, + "grad_norm": 0.8662418723106384, + "learning_rate": 4.937102778478752e-05, + "loss": 0.0702, "step": 4720 }, { - "epoch": 0.3094537127903173, - "grad_norm": 1.1874282360076904, - "learning_rate": 5.2544444444444454e-05, - "loss": 0.1069, + "epoch": 1.237618580307491, + "grad_norm": 0.5999091863632202, + "learning_rate": 4.936641245791804e-05, + "loss": 0.0694, "step": 4730 }, { - "epoch": 0.310107948969578, - "grad_norm": 1.071972370147705, - "learning_rate": 5.2655555555555555e-05, - "loss": 0.1207, + "epoch": 1.2402355250245338, + "grad_norm": 0.674126386642456, + "learning_rate": 4.936178047683542e-05, + "loss": 0.0726, "step": 4740 }, { - "epoch": 0.31076218514883874, - "grad_norm": 1.1899669170379639, - "learning_rate": 5.2766666666666664e-05, - "loss": 0.1003, + "epoch": 1.2428524697415768, + "grad_norm": 0.8433323502540588, + "learning_rate": 4.935713184470555e-05, + "loss": 0.0716, "step": 4750 }, { - "epoch": 0.31141642132809944, - "grad_norm": 1.1113231182098389, - "learning_rate": 5.287777777777778e-05, - "loss": 0.1101, + "epoch": 1.2454694144586196, + "grad_norm": 1.1719890832901, + "learning_rate": 4.935246656470574e-05, + "loss": 0.0733, "step": 4760 }, { - "epoch": 0.3120706575073602, - "grad_norm": 0.9851582646369934, - "learning_rate": 5.298888888888889e-05, - "loss": 0.1246, + "epoch": 1.2480863591756624, + "grad_norm": 0.7315360307693481, + "learning_rate": 4.9347784640024666e-05, + "loss": 0.0714, "step": 4770 }, { - "epoch": 0.3127248936866209, - "grad_norm": 1.1461477279663086, - "learning_rate": 5.31e-05, - "loss": 0.1136, + "epoch": 1.2507033038927053, + "grad_norm": 0.7105130553245544, + "learning_rate": 4.934308607386238e-05, + "loss": 0.0782, "step": 4780 }, { - "epoch": 0.31337912986588157, - "grad_norm": 1.0127012729644775, - "learning_rate": 5.321111111111111e-05, - "loss": 0.1082, + "epoch": 1.253320248609748, + "grad_norm": 0.8442756533622742, + "learning_rate": 4.9338370869430294e-05, + "loss": 0.0721, "step": 4790 }, { - "epoch": 0.3140333660451423, - "grad_norm": 1.0114226341247559, - "learning_rate": 5.332222222222223e-05, - "loss": 0.123, + "epoch": 1.2559371933267909, + "grad_norm": 0.550815224647522, + "learning_rate": 4.9333639029951225e-05, + "loss": 0.0648, "step": 4800 }, { - "epoch": 0.314687602224403, - "grad_norm": 1.0806419849395752, - "learning_rate": 5.3433333333333336e-05, - "loss": 0.1239, + "epoch": 1.2585541380438339, + "grad_norm": 0.7204784750938416, + "learning_rate": 4.932889055865933e-05, + "loss": 0.0794, "step": 4810 }, { - "epoch": 0.3153418384036637, - "grad_norm": 0.9436666369438171, - "learning_rate": 5.3544444444444444e-05, - "loss": 0.1127, + "epoch": 1.2611710827608766, + "grad_norm": 0.738088846206665, + "learning_rate": 4.932412545880014e-05, + "loss": 0.0682, "step": 4820 }, { - "epoch": 0.31599607458292445, - "grad_norm": 0.9551024436950684, - "learning_rate": 5.365555555555556e-05, - "loss": 0.1146, + "epoch": 1.2637880274779194, + "grad_norm": 0.6076630353927612, + "learning_rate": 4.931934373363056e-05, + "loss": 0.0715, "step": 4830 }, { - "epoch": 0.31665031076218514, - "grad_norm": 0.9468205571174622, - "learning_rate": 5.376666666666667e-05, - "loss": 0.1099, + "epoch": 1.2664049721949624, + "grad_norm": 0.7719533443450928, + "learning_rate": 4.931454538641886e-05, + "loss": 0.0681, "step": 4840 }, { - "epoch": 0.3173045469414459, - "grad_norm": 1.0313448905944824, - "learning_rate": 5.387777777777778e-05, - "loss": 0.1108, + "epoch": 1.2690219169120052, + "grad_norm": 0.8249597549438477, + "learning_rate": 4.9309730420444667e-05, + "loss": 0.0673, "step": 4850 }, { - "epoch": 0.3179587831207066, - "grad_norm": 1.025217890739441, - "learning_rate": 5.398888888888889e-05, - "loss": 0.1245, + "epoch": 1.2716388616290482, + "grad_norm": 0.6375730037689209, + "learning_rate": 4.930489883899896e-05, + "loss": 0.0704, "step": 4860 }, { - "epoch": 0.31861301929996727, - "grad_norm": 0.9557814002037048, - "learning_rate": 5.410000000000001e-05, - "loss": 0.1099, + "epoch": 1.274255806346091, + "grad_norm": 0.8313435316085815, + "learning_rate": 4.9300050645384065e-05, + "loss": 0.0723, "step": 4870 }, { - "epoch": 0.319267255479228, - "grad_norm": 1.0116934776306152, - "learning_rate": 5.4211111111111116e-05, - "loss": 0.1068, + "epoch": 1.276872751063134, + "grad_norm": 0.7468934655189514, + "learning_rate": 4.9295185842913705e-05, + "loss": 0.0692, "step": 4880 }, { - "epoch": 0.3199214916584887, - "grad_norm": 1.0298535823822021, - "learning_rate": 5.432222222222223e-05, - "loss": 0.1261, + "epoch": 1.2794896957801767, + "grad_norm": 0.6839153170585632, + "learning_rate": 4.92903044349129e-05, + "loss": 0.0697, "step": 4890 }, { - "epoch": 0.3205757278377494, - "grad_norm": 1.6107592582702637, - "learning_rate": 5.443333333333333e-05, - "loss": 0.1166, + "epoch": 1.2821066404972195, + "grad_norm": 0.5754392147064209, + "learning_rate": 4.928540642471806e-05, + "loss": 0.0656, "step": 4900 }, { - "epoch": 0.32122996401701015, - "grad_norm": 0.9493816494941711, - "learning_rate": 5.454444444444444e-05, - "loss": 0.1025, + "epoch": 1.2847235852142624, + "grad_norm": 0.832221269607544, + "learning_rate": 4.9280491815676925e-05, + "loss": 0.0668, "step": 4910 }, { - "epoch": 0.32188420019627084, - "grad_norm": 1.0891478061676025, - "learning_rate": 5.465555555555556e-05, - "loss": 0.1259, + "epoch": 1.2873405299313052, + "grad_norm": 0.7316128015518188, + "learning_rate": 4.9275560611148587e-05, + "loss": 0.0763, "step": 4920 }, { - "epoch": 0.3225384363755316, - "grad_norm": 1.0789752006530762, - "learning_rate": 5.4766666666666665e-05, - "loss": 0.1092, + "epoch": 1.289957474648348, + "grad_norm": 0.5915740132331848, + "learning_rate": 4.927061281450348e-05, + "loss": 0.0719, "step": 4930 }, { - "epoch": 0.3231926725547923, - "grad_norm": 0.8456171154975891, - "learning_rate": 5.487777777777778e-05, - "loss": 0.1094, + "epoch": 1.292574419365391, + "grad_norm": 0.6216297149658203, + "learning_rate": 4.926564842912337e-05, + "loss": 0.0648, "step": 4940 }, { - "epoch": 0.323846908734053, - "grad_norm": 1.1265510320663452, - "learning_rate": 5.498888888888889e-05, - "loss": 0.1105, + "epoch": 1.2951913640824337, + "grad_norm": 0.6205251812934875, + "learning_rate": 4.926066745840137e-05, + "loss": 0.0736, "step": 4950 }, { - "epoch": 0.3245011449133137, - "grad_norm": 1.0033330917358398, - "learning_rate": 5.5100000000000004e-05, - "loss": 0.1138, + "epoch": 1.2978083087994765, + "grad_norm": 0.6216917037963867, + "learning_rate": 4.9255669905741924e-05, + "loss": 0.0678, "step": 4960 }, { - "epoch": 0.3251553810925744, - "grad_norm": 0.9532373547554016, - "learning_rate": 5.521111111111111e-05, - "loss": 0.1134, + "epoch": 1.3004252535165195, + "grad_norm": 0.6178058981895447, + "learning_rate": 4.925065577456082e-05, + "loss": 0.0697, "step": 4970 }, { - "epoch": 0.3258096172718351, - "grad_norm": 1.070513367652893, - "learning_rate": 5.532222222222222e-05, - "loss": 0.1054, + "epoch": 1.3030421982335623, + "grad_norm": 0.7660165429115295, + "learning_rate": 4.924562506828516e-05, + "loss": 0.0704, "step": 4980 }, { - "epoch": 0.32646385345109585, - "grad_norm": 1.1604113578796387, - "learning_rate": 5.543333333333334e-05, - "loss": 0.1089, + "epoch": 1.305659142950605, + "grad_norm": 0.44091275334358215, + "learning_rate": 4.924057779035338e-05, + "loss": 0.065, "step": 4990 }, { - "epoch": 0.32711808963035655, - "grad_norm": 0.9671838283538818, - "learning_rate": 5.5544444444444445e-05, - "loss": 0.1114, + "epoch": 1.308276087667648, + "grad_norm": 0.4674038290977478, + "learning_rate": 4.9235513944215276e-05, + "loss": 0.0716, + "step": 5000 + }, + { + "epoch": 1.308276087667648, + "eval_loss": 0.07460379635236306, + "eval_runtime": 8.5112, + "eval_samples_per_second": 120.313, + "eval_steps_per_second": 1.88, "step": 5000 }, { - "epoch": 0.3277723258096173, - "grad_norm": 1.0246025323867798, - "learning_rate": 5.565555555555556e-05, - "loss": 0.1155, + "epoch": 1.3108930323846908, + "grad_norm": 0.46065524220466614, + "learning_rate": 4.92304335333319e-05, + "loss": 0.0669, "step": 5010 }, { - "epoch": 0.328426561988878, - "grad_norm": 1.1723190546035767, - "learning_rate": 5.576666666666667e-05, - "loss": 0.1086, + "epoch": 1.3135099771017338, + "grad_norm": 0.5088114738464355, + "learning_rate": 4.922533656117569e-05, + "loss": 0.0687, "step": 5020 }, { - "epoch": 0.3290807981681387, - "grad_norm": 1.158227801322937, - "learning_rate": 5.5877777777777785e-05, - "loss": 0.1142, + "epoch": 1.3161269218187766, + "grad_norm": 0.573154091835022, + "learning_rate": 4.922022303123037e-05, + "loss": 0.0703, "step": 5030 }, { - "epoch": 0.3297350343473994, - "grad_norm": 1.070424199104309, - "learning_rate": 5.598888888888889e-05, - "loss": 0.1075, + "epoch": 1.3187438665358195, + "grad_norm": 0.5735311508178711, + "learning_rate": 4.9215092946990994e-05, + "loss": 0.0646, "step": 5040 }, { - "epoch": 0.3303892705266601, - "grad_norm": 1.1170357465744019, - "learning_rate": 5.610000000000001e-05, - "loss": 0.1242, + "epoch": 1.3213608112528623, + "grad_norm": 0.6008470058441162, + "learning_rate": 4.9209946311963926e-05, + "loss": 0.0625, "step": 5050 }, { - "epoch": 0.33104350670592086, - "grad_norm": 1.009589672088623, - "learning_rate": 5.621111111111112e-05, - "loss": 0.1076, + "epoch": 1.323977755969905, + "grad_norm": 0.6273025274276733, + "learning_rate": 4.920478312966683e-05, + "loss": 0.0717, "step": 5060 }, { - "epoch": 0.33169774288518156, - "grad_norm": 1.002823829650879, - "learning_rate": 5.632222222222222e-05, - "loss": 0.1146, + "epoch": 1.326594700686948, + "grad_norm": 0.40354007482528687, + "learning_rate": 4.919960340362871e-05, + "loss": 0.0668, "step": 5070 }, { - "epoch": 0.33235197906444225, - "grad_norm": 0.9945969581604004, - "learning_rate": 5.643333333333334e-05, - "loss": 0.1155, + "epoch": 1.3292116454039908, + "grad_norm": 0.7045165300369263, + "learning_rate": 4.919440713738985e-05, + "loss": 0.0682, "step": 5080 }, { - "epoch": 0.333006215243703, - "grad_norm": 0.9207086563110352, - "learning_rate": 5.654444444444444e-05, - "loss": 0.116, + "epoch": 1.3318285901210336, + "grad_norm": 0.6253055334091187, + "learning_rate": 4.918919433450185e-05, + "loss": 0.0669, "step": 5090 }, { - "epoch": 0.3336604514229637, - "grad_norm": 0.8761827945709229, - "learning_rate": 5.6655555555555565e-05, - "loss": 0.1119, + "epoch": 1.3344455348380766, + "grad_norm": 0.6311752796173096, + "learning_rate": 4.918396499852762e-05, + "loss": 0.0676, "step": 5100 }, { - "epoch": 0.3343146876022244, - "grad_norm": 1.128303050994873, - "learning_rate": 5.6766666666666666e-05, - "loss": 0.1134, + "epoch": 1.3370624795551194, + "grad_norm": 0.6613034605979919, + "learning_rate": 4.9178719133041353e-05, + "loss": 0.0683, "step": 5110 }, { - "epoch": 0.33496892378148513, - "grad_norm": 0.9229829907417297, - "learning_rate": 5.687777777777779e-05, - "loss": 0.1085, + "epoch": 1.3396794242721621, + "grad_norm": 0.6443865299224854, + "learning_rate": 4.9173456741628546e-05, + "loss": 0.0696, "step": 5120 }, { - "epoch": 0.3356231599607458, - "grad_norm": 1.0717655420303345, - "learning_rate": 5.698888888888889e-05, - "loss": 0.1154, + "epoch": 1.3422963689892051, + "grad_norm": 0.6435128450393677, + "learning_rate": 4.9168177827885996e-05, + "loss": 0.0657, "step": 5130 }, { - "epoch": 0.33627739614000657, - "grad_norm": 1.0167633295059204, - "learning_rate": 5.71e-05, - "loss": 0.1185, + "epoch": 1.344913313706248, + "grad_norm": 0.628079891204834, + "learning_rate": 4.9162882395421794e-05, + "loss": 0.0733, "step": 5140 }, { - "epoch": 0.33693163231926726, - "grad_norm": 1.1209583282470703, - "learning_rate": 5.7211111111111114e-05, - "loss": 0.1182, + "epoch": 1.3475302584232909, + "grad_norm": 0.49360260367393494, + "learning_rate": 4.915757044785532e-05, + "loss": 0.0729, "step": 5150 }, { - "epoch": 0.33758586849852795, - "grad_norm": 0.9364834427833557, - "learning_rate": 5.732222222222222e-05, - "loss": 0.1024, + "epoch": 1.3501472031403337, + "grad_norm": 0.49952232837677, + "learning_rate": 4.9152241988817236e-05, + "loss": 0.0663, "step": 5160 }, { - "epoch": 0.3382401046777887, - "grad_norm": 1.077996850013733, - "learning_rate": 5.743333333333334e-05, - "loss": 0.1076, + "epoch": 1.3527641478573766, + "grad_norm": 0.4128344655036926, + "learning_rate": 4.91468970219495e-05, + "loss": 0.0609, "step": 5170 }, { - "epoch": 0.3388943408570494, - "grad_norm": 0.946433424949646, - "learning_rate": 5.7544444444444447e-05, - "loss": 0.1176, + "epoch": 1.3553810925744194, + "grad_norm": 0.6112388968467712, + "learning_rate": 4.914153555090533e-05, + "loss": 0.0612, "step": 5180 }, { - "epoch": 0.3395485770363101, - "grad_norm": 0.9579144716262817, - "learning_rate": 5.765555555555556e-05, - "loss": 0.1077, + "epoch": 1.3579980372914622, + "grad_norm": 0.6395546197891235, + "learning_rate": 4.913615757934926e-05, + "loss": 0.0697, "step": 5190 }, { - "epoch": 0.34020281321557083, - "grad_norm": 0.9725630283355713, - "learning_rate": 5.776666666666667e-05, - "loss": 0.1123, + "epoch": 1.3606149820085052, + "grad_norm": 0.6269518136978149, + "learning_rate": 4.913076311095707e-05, + "loss": 0.0672, "step": 5200 }, { - "epoch": 0.3408570493948315, - "grad_norm": 1.1612457036972046, - "learning_rate": 5.787777777777777e-05, - "loss": 0.1079, + "epoch": 1.363231926725548, + "grad_norm": 0.6928361058235168, + "learning_rate": 4.912535214941584e-05, + "loss": 0.063, "step": 5210 }, { - "epoch": 0.34151128557409227, - "grad_norm": 1.1368802785873413, - "learning_rate": 5.7988888888888894e-05, - "loss": 0.1145, + "epoch": 1.3658488714425907, + "grad_norm": 0.541039228439331, + "learning_rate": 4.91199246984239e-05, + "loss": 0.0656, "step": 5220 }, { - "epoch": 0.34216552175335296, - "grad_norm": 1.130491852760315, - "learning_rate": 5.8099999999999996e-05, - "loss": 0.1127, + "epoch": 1.3684658161596337, + "grad_norm": 0.9675135612487793, + "learning_rate": 4.9114480761690865e-05, + "loss": 0.0652, "step": 5230 }, { - "epoch": 0.34281975793261366, - "grad_norm": 1.1472586393356323, - "learning_rate": 5.821111111111112e-05, - "loss": 0.1067, + "epoch": 1.3710827608766765, + "grad_norm": 0.9641793966293335, + "learning_rate": 4.9109020342937614e-05, + "loss": 0.0676, "step": 5240 }, { - "epoch": 0.3434739941118744, - "grad_norm": 1.2415672540664673, - "learning_rate": 5.832222222222222e-05, - "loss": 0.1035, + "epoch": 1.3736997055937192, + "grad_norm": 0.7403941750526428, + "learning_rate": 4.9103543445896296e-05, + "loss": 0.0663, "step": 5250 }, { - "epoch": 0.3441282302911351, - "grad_norm": 0.952216625213623, - "learning_rate": 5.843333333333334e-05, - "loss": 0.1211, + "epoch": 1.3763166503107622, + "grad_norm": 0.4992590844631195, + "learning_rate": 4.9098050074310316e-05, + "loss": 0.0696, "step": 5260 }, { - "epoch": 0.3447824664703958, - "grad_norm": 0.9842739701271057, - "learning_rate": 5.8544444444444444e-05, - "loss": 0.1081, + "epoch": 1.378933595027805, + "grad_norm": 0.7306753993034363, + "learning_rate": 4.909254023193434e-05, + "loss": 0.07, "step": 5270 }, { - "epoch": 0.34543670264965654, - "grad_norm": 0.921035885810852, - "learning_rate": 5.8655555555555566e-05, - "loss": 0.1063, + "epoch": 1.3815505397448478, + "grad_norm": 0.7848047018051147, + "learning_rate": 4.9087013922534295e-05, + "loss": 0.0646, "step": 5280 }, { - "epoch": 0.3460909388289172, - "grad_norm": 1.028622031211853, - "learning_rate": 5.876666666666667e-05, - "loss": 0.1114, + "epoch": 1.3841674844618908, + "grad_norm": 0.7645529508590698, + "learning_rate": 4.908147114988736e-05, + "loss": 0.0687, "step": 5290 }, { - "epoch": 0.346745175008178, - "grad_norm": 1.103540062904358, - "learning_rate": 5.8877777777777776e-05, - "loss": 0.1144, + "epoch": 1.3867844291789335, + "grad_norm": 0.8650171160697937, + "learning_rate": 4.907591191778197e-05, + "loss": 0.0702, "step": 5300 }, { - "epoch": 0.34739941118743867, - "grad_norm": 0.9521011710166931, - "learning_rate": 5.898888888888889e-05, - "loss": 0.1104, + "epoch": 1.3894013738959765, + "grad_norm": 0.8469581604003906, + "learning_rate": 4.90703362300178e-05, + "loss": 0.0646, "step": 5310 }, { - "epoch": 0.34805364736669936, - "grad_norm": 1.1249808073043823, - "learning_rate": 5.91e-05, - "loss": 0.1235, + "epoch": 1.3920183186130193, + "grad_norm": 0.7733801603317261, + "learning_rate": 4.9064744090405803e-05, + "loss": 0.0691, "step": 5320 }, { - "epoch": 0.3487078835459601, - "grad_norm": 1.0736548900604248, - "learning_rate": 5.9211111111111115e-05, - "loss": 0.1114, + "epoch": 1.3946352633300623, + "grad_norm": 0.8647816777229309, + "learning_rate": 4.905913550276812e-05, + "loss": 0.0659, "step": 5330 }, { - "epoch": 0.3493621197252208, - "grad_norm": 0.9119420647621155, - "learning_rate": 5.9322222222222224e-05, - "loss": 0.1234, + "epoch": 1.397252208047105, + "grad_norm": 0.7563399076461792, + "learning_rate": 4.905351047093819e-05, + "loss": 0.0667, "step": 5340 }, { - "epoch": 0.3500163559044815, - "grad_norm": 0.9707286953926086, - "learning_rate": 5.943333333333334e-05, - "loss": 0.1139, + "epoch": 1.3998691527641478, + "grad_norm": 0.7720489501953125, + "learning_rate": 4.904786899876067e-05, + "loss": 0.065, "step": 5350 }, { - "epoch": 0.35067059208374224, - "grad_norm": 0.8778756260871887, - "learning_rate": 5.954444444444445e-05, - "loss": 0.1119, + "epoch": 1.4024860974811908, + "grad_norm": 0.6422615051269531, + "learning_rate": 4.904221109009144e-05, + "loss": 0.068, "step": 5360 }, { - "epoch": 0.35132482826300293, - "grad_norm": 0.9317549467086792, - "learning_rate": 5.9655555555555556e-05, - "loss": 0.1162, + "epoch": 1.4051030421982336, + "grad_norm": 0.526404082775116, + "learning_rate": 4.903653674879763e-05, + "loss": 0.0634, "step": 5370 }, { - "epoch": 0.3519790644422637, - "grad_norm": 0.9935291409492493, - "learning_rate": 5.976666666666667e-05, - "loss": 0.115, + "epoch": 1.4077199869152763, + "grad_norm": 0.5538908243179321, + "learning_rate": 4.9030845978757624e-05, + "loss": 0.0719, "step": 5380 }, { - "epoch": 0.35263330062152437, - "grad_norm": 0.8970289826393127, - "learning_rate": 5.987777777777778e-05, - "loss": 0.1094, + "epoch": 1.4103369316323193, + "grad_norm": 0.5565630793571472, + "learning_rate": 4.902513878386097e-05, + "loss": 0.0689, "step": 5390 }, { - "epoch": 0.35328753680078506, - "grad_norm": 1.034538745880127, - "learning_rate": 5.9988888888888895e-05, - "loss": 0.1097, + "epoch": 1.412953876349362, + "grad_norm": 0.4883444011211395, + "learning_rate": 4.901941516800851e-05, + "loss": 0.064, "step": 5400 }, { - "epoch": 0.3539417729800458, - "grad_norm": 0.9625052809715271, - "learning_rate": 6.0100000000000004e-05, - "loss": 0.1254, + "epoch": 1.4155708210664049, + "grad_norm": 0.5586369037628174, + "learning_rate": 4.9013675135112265e-05, + "loss": 0.0708, "step": 5410 }, { - "epoch": 0.3545960091593065, - "grad_norm": 1.0226140022277832, - "learning_rate": 6.021111111111112e-05, - "loss": 0.1089, + "epoch": 1.4181877657834479, + "grad_norm": 0.6188867092132568, + "learning_rate": 4.90079186890955e-05, + "loss": 0.073, "step": 5420 }, { - "epoch": 0.35525024533856725, - "grad_norm": 0.9254044890403748, - "learning_rate": 6.032222222222222e-05, - "loss": 0.1233, + "epoch": 1.4208047105004906, + "grad_norm": 0.672395646572113, + "learning_rate": 4.90021458338927e-05, + "loss": 0.0612, "step": 5430 }, { - "epoch": 0.35590448151782794, - "grad_norm": 0.9888849854469299, - "learning_rate": 6.043333333333333e-05, - "loss": 0.1031, + "epoch": 1.4234216552175336, + "grad_norm": 0.6133560538291931, + "learning_rate": 4.899635657344954e-05, + "loss": 0.062, "step": 5440 }, { - "epoch": 0.35655871769708863, - "grad_norm": 0.9793877601623535, - "learning_rate": 6.0544444444444445e-05, - "loss": 0.1127, + "epoch": 1.4260385999345764, + "grad_norm": 0.8154978156089783, + "learning_rate": 4.899055091172294e-05, + "loss": 0.0634, "step": 5450 }, { - "epoch": 0.3572129538763494, - "grad_norm": 0.9079664945602417, - "learning_rate": 6.0655555555555553e-05, - "loss": 0.1088, + "epoch": 1.4286555446516194, + "grad_norm": 0.6268131732940674, + "learning_rate": 4.898472885268102e-05, + "loss": 0.0664, "step": 5460 }, { - "epoch": 0.3578671900556101, - "grad_norm": 1.0613657236099243, - "learning_rate": 6.076666666666667e-05, - "loss": 0.1047, + "epoch": 1.4312724893686621, + "grad_norm": 0.6232008934020996, + "learning_rate": 4.8978890400303074e-05, + "loss": 0.0682, "step": 5470 }, { - "epoch": 0.35852142623487077, - "grad_norm": 1.1257914304733276, - "learning_rate": 6.087777777777778e-05, - "loss": 0.1197, + "epoch": 1.433889434085705, + "grad_norm": 0.8692842125892639, + "learning_rate": 4.897303555857965e-05, + "loss": 0.0687, "step": 5480 }, { - "epoch": 0.3591756624141315, - "grad_norm": 0.9039554595947266, - "learning_rate": 6.098888888888889e-05, - "loss": 0.1103, + "epoch": 1.436506378802748, + "grad_norm": 0.7789962291717529, + "learning_rate": 4.896716433151248e-05, + "loss": 0.0652, "step": 5490 }, { - "epoch": 0.3598298985933922, - "grad_norm": 1.026854395866394, - "learning_rate": 6.110000000000001e-05, - "loss": 0.1116, + "epoch": 1.4391233235197907, + "grad_norm": 0.49732038378715515, + "learning_rate": 4.896127672311448e-05, + "loss": 0.0775, "step": 5500 }, { - "epoch": 0.36048413477265295, - "grad_norm": 1.1896244287490845, - "learning_rate": 6.121111111111112e-05, - "loss": 0.1136, + "epoch": 1.4417402682368334, + "grad_norm": 0.7146192789077759, + "learning_rate": 4.8955372737409786e-05, + "loss": 0.0635, "step": 5510 }, { - "epoch": 0.36113837095191365, - "grad_norm": 0.9056310057640076, - "learning_rate": 6.132222222222223e-05, - "loss": 0.1134, + "epoch": 1.4443572129538764, + "grad_norm": 0.5541284084320068, + "learning_rate": 4.894945237843371e-05, + "loss": 0.0717, "step": 5520 }, { - "epoch": 0.36179260713117434, - "grad_norm": 1.078660249710083, - "learning_rate": 6.143333333333333e-05, - "loss": 0.1063, + "epoch": 1.4469741576709192, + "grad_norm": 0.5081911087036133, + "learning_rate": 4.894351565023276e-05, + "loss": 0.0628, "step": 5530 }, { - "epoch": 0.3624468433104351, - "grad_norm": 1.1922471523284912, - "learning_rate": 6.154444444444446e-05, - "loss": 0.1116, + "epoch": 1.449591102387962, + "grad_norm": 0.9104524850845337, + "learning_rate": 4.893756255686465e-05, + "loss": 0.0699, "step": 5540 }, { - "epoch": 0.3631010794896958, - "grad_norm": 1.298832893371582, - "learning_rate": 6.165555555555556e-05, - "loss": 0.1252, + "epoch": 1.452208047105005, + "grad_norm": 0.9743561148643494, + "learning_rate": 4.893159310239823e-05, + "loss": 0.0716, "step": 5550 }, { - "epoch": 0.36375531566895647, - "grad_norm": 0.9783083200454712, - "learning_rate": 6.176666666666667e-05, - "loss": 0.1143, + "epoch": 1.4548249918220477, + "grad_norm": 0.7123206257820129, + "learning_rate": 4.89256072909136e-05, + "loss": 0.0739, "step": 5560 }, { - "epoch": 0.3644095518482172, - "grad_norm": 1.0835789442062378, - "learning_rate": 6.187777777777777e-05, - "loss": 0.1189, + "epoch": 1.4574419365390905, + "grad_norm": 0.7350765466690063, + "learning_rate": 4.891960512650199e-05, + "loss": 0.0693, "step": 5570 }, { - "epoch": 0.3650637880274779, - "grad_norm": 1.1260126829147339, - "learning_rate": 6.198888888888889e-05, - "loss": 0.1092, + "epoch": 1.4600588812561335, + "grad_norm": 0.9752265810966492, + "learning_rate": 4.891358661326582e-05, + "loss": 0.0746, "step": 5580 }, { - "epoch": 0.36571802420673866, - "grad_norm": 1.1071107387542725, - "learning_rate": 6.21e-05, - "loss": 0.1108, + "epoch": 1.4626758259731762, + "grad_norm": 0.47263097763061523, + "learning_rate": 4.890755175531869e-05, + "loss": 0.0715, "step": 5590 }, { - "epoch": 0.36637226038599935, - "grad_norm": 1.383540153503418, - "learning_rate": 6.221111111111111e-05, - "loss": 0.1208, + "epoch": 1.4652927706902192, + "grad_norm": 0.9160968661308289, + "learning_rate": 4.890150055678538e-05, + "loss": 0.0734, "step": 5600 }, { - "epoch": 0.36702649656526004, - "grad_norm": 1.0578371286392212, - "learning_rate": 6.232222222222222e-05, - "loss": 0.1105, + "epoch": 1.467909715407262, + "grad_norm": 0.8024541735649109, + "learning_rate": 4.8895433021801804e-05, + "loss": 0.0709, "step": 5610 }, { - "epoch": 0.3676807327445208, - "grad_norm": 0.9242531061172485, - "learning_rate": 6.243333333333334e-05, - "loss": 0.1026, + "epoch": 1.470526660124305, + "grad_norm": 0.8918023109436035, + "learning_rate": 4.888934915451509e-05, + "loss": 0.0713, "step": 5620 }, { - "epoch": 0.3683349689237815, - "grad_norm": 1.0212069749832153, - "learning_rate": 6.254444444444445e-05, - "loss": 0.1121, + "epoch": 1.4731436048413478, + "grad_norm": 0.5163362622261047, + "learning_rate": 4.888324895908349e-05, + "loss": 0.07, "step": 5630 }, { - "epoch": 0.3689892051030422, - "grad_norm": 1.075881004333496, - "learning_rate": 6.265555555555555e-05, - "loss": 0.107, + "epoch": 1.4757605495583905, + "grad_norm": 0.7050215005874634, + "learning_rate": 4.8877132439676427e-05, + "loss": 0.0649, "step": 5640 }, { - "epoch": 0.3696434412823029, - "grad_norm": 0.993894100189209, - "learning_rate": 6.276666666666667e-05, - "loss": 0.1202, + "epoch": 1.4783774942754335, + "grad_norm": 0.7626054883003235, + "learning_rate": 4.887099960047449e-05, + "loss": 0.0709, "step": 5650 }, { - "epoch": 0.3702976774615636, - "grad_norm": 1.1158571243286133, - "learning_rate": 6.287777777777779e-05, - "loss": 0.1151, + "epoch": 1.4809944389924763, + "grad_norm": 0.645378589630127, + "learning_rate": 4.886485044566942e-05, + "loss": 0.0624, "step": 5660 }, { - "epoch": 0.37095191364082436, - "grad_norm": 0.9299028515815735, - "learning_rate": 6.29888888888889e-05, - "loss": 0.1203, + "epoch": 1.483611383709519, + "grad_norm": 0.6905105113983154, + "learning_rate": 4.88586849794641e-05, + "loss": 0.0727, "step": 5670 }, { - "epoch": 0.37160614982008505, - "grad_norm": 0.9454100131988525, - "learning_rate": 6.31e-05, - "loss": 0.1105, + "epoch": 1.486228328426562, + "grad_norm": 0.7395666837692261, + "learning_rate": 4.885250320607257e-05, + "loss": 0.0662, "step": 5680 }, { - "epoch": 0.37226038599934574, - "grad_norm": 0.9291905164718628, - "learning_rate": 6.32111111111111e-05, - "loss": 0.1141, + "epoch": 1.4888452731436048, + "grad_norm": 0.6373220682144165, + "learning_rate": 4.884630512972001e-05, + "loss": 0.0684, "step": 5690 }, { - "epoch": 0.3729146221786065, - "grad_norm": 0.9003360867500305, - "learning_rate": 6.332222222222223e-05, - "loss": 0.1115, + "epoch": 1.4914622178606476, + "grad_norm": 0.5650008320808411, + "learning_rate": 4.884009075464276e-05, + "loss": 0.0575, "step": 5700 }, { - "epoch": 0.3735688583578672, - "grad_norm": 1.0188775062561035, - "learning_rate": 6.343333333333333e-05, - "loss": 0.1199, + "epoch": 1.4940791625776906, + "grad_norm": 0.9049765467643738, + "learning_rate": 4.883386008508827e-05, + "loss": 0.0709, "step": 5710 }, { - "epoch": 0.3742230945371279, - "grad_norm": 0.9012176990509033, - "learning_rate": 6.354444444444445e-05, - "loss": 0.1091, + "epoch": 1.4966961072947333, + "grad_norm": 0.6606411337852478, + "learning_rate": 4.882761312531516e-05, + "loss": 0.0649, "step": 5720 }, { - "epoch": 0.3748773307163886, - "grad_norm": 0.977641224861145, - "learning_rate": 6.365555555555555e-05, - "loss": 0.1138, + "epoch": 1.4993130520117761, + "grad_norm": 0.8762325048446655, + "learning_rate": 4.882134987959315e-05, + "loss": 0.0741, "step": 5730 }, { - "epoch": 0.3755315668956493, - "grad_norm": 0.9474180340766907, - "learning_rate": 6.376666666666668e-05, - "loss": 0.1085, + "epoch": 1.501929996728819, + "grad_norm": 0.6389805674552917, + "learning_rate": 4.881507035220313e-05, + "loss": 0.0705, "step": 5740 }, { - "epoch": 0.37618580307491006, - "grad_norm": 0.8891827464103699, - "learning_rate": 6.387777777777778e-05, - "loss": 0.1183, + "epoch": 1.504546941445862, + "grad_norm": 0.7237274646759033, + "learning_rate": 4.880877454743708e-05, + "loss": 0.0634, "step": 5750 }, { - "epoch": 0.37684003925417076, - "grad_norm": 1.0365437269210815, - "learning_rate": 6.398888888888888e-05, - "loss": 0.1166, + "epoch": 1.5071638861629046, + "grad_norm": 0.8704226613044739, + "learning_rate": 4.880246246959813e-05, + "loss": 0.0666, "step": 5760 }, { - "epoch": 0.37749427543343145, - "grad_norm": 0.8813347220420837, - "learning_rate": 6.41e-05, - "loss": 0.1007, + "epoch": 1.5097808308799476, + "grad_norm": 0.9800268411636353, + "learning_rate": 4.8796134123000526e-05, + "loss": 0.0665, "step": 5770 }, { - "epoch": 0.3781485116126922, - "grad_norm": 0.888383150100708, - "learning_rate": 6.421111111111111e-05, - "loss": 0.1133, + "epoch": 1.5123977755969906, + "grad_norm": 0.9698657989501953, + "learning_rate": 4.878978951196964e-05, + "loss": 0.0698, "step": 5780 }, { - "epoch": 0.3788027477919529, - "grad_norm": 1.1097888946533203, - "learning_rate": 6.432222222222223e-05, - "loss": 0.1318, + "epoch": 1.5150147203140334, + "grad_norm": 0.4997177720069885, + "learning_rate": 4.8783428640841934e-05, + "loss": 0.0635, "step": 5790 }, { - "epoch": 0.37945698397121363, - "grad_norm": 0.928402304649353, - "learning_rate": 6.443333333333333e-05, - "loss": 0.1141, + "epoch": 1.5176316650310762, + "grad_norm": 0.4548600912094116, + "learning_rate": 4.877705151396502e-05, + "loss": 0.0673, "step": 5800 }, { - "epoch": 0.3801112201504743, - "grad_norm": 1.0633147954940796, - "learning_rate": 6.454444444444445e-05, - "loss": 0.1198, + "epoch": 1.5202486097481192, + "grad_norm": 0.5970808267593384, + "learning_rate": 4.877065813569761e-05, + "loss": 0.0667, "step": 5810 }, { - "epoch": 0.380765456329735, - "grad_norm": 1.11026132106781, - "learning_rate": 6.465555555555556e-05, - "loss": 0.1234, + "epoch": 1.522865554465162, + "grad_norm": 0.7120151519775391, + "learning_rate": 4.8764248510409505e-05, + "loss": 0.0652, "step": 5820 }, { - "epoch": 0.38141969250899577, - "grad_norm": 0.9502913951873779, - "learning_rate": 6.476666666666666e-05, - "loss": 0.1043, + "epoch": 1.5254824991822047, + "grad_norm": 0.4894607365131378, + "learning_rate": 4.875782264248162e-05, + "loss": 0.0653, "step": 5830 }, { - "epoch": 0.38207392868825646, - "grad_norm": 1.1175204515457153, - "learning_rate": 6.487777777777778e-05, - "loss": 0.1091, + "epoch": 1.5280994438992477, + "grad_norm": 0.6062511205673218, + "learning_rate": 4.8751380536305986e-05, + "loss": 0.0658, "step": 5840 }, { - "epoch": 0.38272816486751715, - "grad_norm": 0.9505729079246521, - "learning_rate": 6.498888888888888e-05, - "loss": 0.1067, + "epoch": 1.5307163886162904, + "grad_norm": 1.0854140520095825, + "learning_rate": 4.874492219628571e-05, + "loss": 0.0664, "step": 5850 }, { - "epoch": 0.3833824010467779, - "grad_norm": 1.11201810836792, - "learning_rate": 6.510000000000001e-05, - "loss": 0.1123, + "epoch": 1.5333333333333332, + "grad_norm": 0.6886491775512695, + "learning_rate": 4.8738447626835026e-05, + "loss": 0.0715, "step": 5860 }, { - "epoch": 0.3840366372260386, - "grad_norm": 1.0022423267364502, - "learning_rate": 6.521111111111111e-05, - "loss": 0.1076, + "epoch": 1.5359502780503762, + "grad_norm": 0.6680225133895874, + "learning_rate": 4.873195683237922e-05, + "loss": 0.0617, "step": 5870 }, { - "epoch": 0.38469087340529934, - "grad_norm": 1.1039308309555054, - "learning_rate": 6.532222222222223e-05, - "loss": 0.1086, + "epoch": 1.5385672227674192, + "grad_norm": 0.8506457805633545, + "learning_rate": 4.872544981735471e-05, + "loss": 0.0669, "step": 5880 }, { - "epoch": 0.38534510958456003, - "grad_norm": 0.970061182975769, - "learning_rate": 6.543333333333333e-05, - "loss": 0.1192, + "epoch": 1.5411841674844617, + "grad_norm": 0.5290977358818054, + "learning_rate": 4.8718926586208955e-05, + "loss": 0.0603, "step": 5890 }, { - "epoch": 0.3859993457638207, - "grad_norm": 1.0175315141677856, - "learning_rate": 6.554444444444446e-05, - "loss": 0.1301, + "epoch": 1.5438011122015047, + "grad_norm": 0.6317195892333984, + "learning_rate": 4.871238714340054e-05, + "loss": 0.0682, "step": 5900 }, { - "epoch": 0.38665358194308147, - "grad_norm": 1.1553181409835815, - "learning_rate": 6.565555555555556e-05, - "loss": 0.1175, + "epoch": 1.5464180569185477, + "grad_norm": 0.8768609762191772, + "learning_rate": 4.8705831493399106e-05, + "loss": 0.0677, "step": 5910 }, { - "epoch": 0.38730781812234216, - "grad_norm": 0.9421599507331848, - "learning_rate": 6.576666666666666e-05, - "loss": 0.1147, + "epoch": 1.5490350016355905, + "grad_norm": 0.8061237335205078, + "learning_rate": 4.869925964068538e-05, + "loss": 0.0678, "step": 5920 }, { - "epoch": 0.38796205430160285, - "grad_norm": 1.103137731552124, - "learning_rate": 6.587777777777778e-05, - "loss": 0.1268, + "epoch": 1.5516519463526333, + "grad_norm": 0.5746549367904663, + "learning_rate": 4.869267158975116e-05, + "loss": 0.0694, "step": 5930 }, { - "epoch": 0.3886162904808636, - "grad_norm": 1.0151023864746094, - "learning_rate": 6.598888888888889e-05, - "loss": 0.1103, + "epoch": 1.5542688910696763, + "grad_norm": 0.6497983336448669, + "learning_rate": 4.868606734509932e-05, + "loss": 0.0682, "step": 5940 }, { - "epoch": 0.3892705266601243, - "grad_norm": 1.017514944076538, - "learning_rate": 6.610000000000001e-05, - "loss": 0.1168, + "epoch": 1.556885835786719, + "grad_norm": 0.751058042049408, + "learning_rate": 4.8679446911243783e-05, + "loss": 0.0691, "step": 5950 }, { - "epoch": 0.38992476283938504, - "grad_norm": 0.9031379222869873, - "learning_rate": 6.621111111111111e-05, - "loss": 0.1191, + "epoch": 1.5595027805037618, + "grad_norm": 0.5331182479858398, + "learning_rate": 4.867281029270958e-05, + "loss": 0.067, "step": 5960 }, { - "epoch": 0.39057899901864573, - "grad_norm": 1.1167620420455933, - "learning_rate": 6.632222222222222e-05, - "loss": 0.1032, + "epoch": 1.5621197252208048, + "grad_norm": 0.6946902871131897, + "learning_rate": 4.866615749403276e-05, + "loss": 0.0652, "step": 5970 }, { - "epoch": 0.3912332351979064, - "grad_norm": 0.9715782403945923, - "learning_rate": 6.643333333333334e-05, - "loss": 0.1231, + "epoch": 1.5647366699378475, + "grad_norm": 0.581596314907074, + "learning_rate": 4.865948851976044e-05, + "loss": 0.0648, "step": 5980 }, { - "epoch": 0.3918874713771672, - "grad_norm": 1.0662617683410645, - "learning_rate": 6.654444444444444e-05, - "loss": 0.1122, + "epoch": 1.5673536146548903, + "grad_norm": 0.8583618402481079, + "learning_rate": 4.865280337445083e-05, + "loss": 0.0655, "step": 5990 }, { - "epoch": 0.39254170755642787, - "grad_norm": 0.9010646343231201, - "learning_rate": 6.665555555555556e-05, - "loss": 0.1109, + "epoch": 1.5699705593719333, + "grad_norm": 0.4247710704803467, + "learning_rate": 4.864610206267314e-05, + "loss": 0.0633, + "step": 6000 + }, + { + "epoch": 1.5699705593719333, + "eval_loss": 0.07125227067872947, + "eval_runtime": 8.5376, + "eval_samples_per_second": 119.939, + "eval_steps_per_second": 1.874, "step": 6000 }, { - "epoch": 0.39319594373568856, - "grad_norm": 0.9664989709854126, - "learning_rate": 6.676666666666667e-05, - "loss": 0.1195, + "epoch": 1.572587504088976, + "grad_norm": 0.7412828207015991, + "learning_rate": 4.863938458900765e-05, + "loss": 0.0688, "step": 6010 }, { - "epoch": 0.3938501799149493, - "grad_norm": 1.0604381561279297, - "learning_rate": 6.687777777777779e-05, - "loss": 0.1013, + "epoch": 1.5752044488060188, + "grad_norm": 0.7785113453865051, + "learning_rate": 4.863265095804571e-05, + "loss": 0.0679, "step": 6020 }, { - "epoch": 0.39450441609421, - "grad_norm": 1.0185221433639526, - "learning_rate": 6.698888888888889e-05, - "loss": 0.1087, + "epoch": 1.5778213935230618, + "grad_norm": 0.7265563607215881, + "learning_rate": 4.8625901174389685e-05, + "loss": 0.0704, "step": 6030 }, { - "epoch": 0.39515865227347075, - "grad_norm": 1.0667742490768433, - "learning_rate": 6.71e-05, - "loss": 0.1108, + "epoch": 1.5804383382401048, + "grad_norm": 0.6599150896072388, + "learning_rate": 4.861913524265298e-05, + "loss": 0.0705, "step": 6040 }, { - "epoch": 0.39581288845273144, - "grad_norm": 0.9968790411949158, - "learning_rate": 6.721111111111112e-05, - "loss": 0.1254, + "epoch": 1.5830552829571474, + "grad_norm": 0.4436973035335541, + "learning_rate": 4.8612353167460054e-05, + "loss": 0.0657, "step": 6050 }, { - "epoch": 0.39646712463199213, - "grad_norm": 0.9908486604690552, - "learning_rate": 6.732222222222224e-05, - "loss": 0.1098, + "epoch": 1.5856722276741904, + "grad_norm": 0.6332281231880188, + "learning_rate": 4.860555495344639e-05, + "loss": 0.0671, "step": 6060 }, { - "epoch": 0.3971213608112529, - "grad_norm": 1.073398470878601, - "learning_rate": 6.743333333333334e-05, - "loss": 0.1034, + "epoch": 1.5882891723912334, + "grad_norm": 0.5939836502075195, + "learning_rate": 4.8598740605258494e-05, + "loss": 0.0679, "step": 6070 }, { - "epoch": 0.39777559699051357, - "grad_norm": 0.9481722712516785, - "learning_rate": 6.754444444444444e-05, - "loss": 0.1141, + "epoch": 1.5909061171082761, + "grad_norm": 0.8490774035453796, + "learning_rate": 4.8591910127553925e-05, + "loss": 0.0749, "step": 6080 }, { - "epoch": 0.39842983316977426, - "grad_norm": 1.1808781623840332, - "learning_rate": 6.765555555555555e-05, - "loss": 0.113, + "epoch": 1.5935230618253189, + "grad_norm": 0.7159541845321655, + "learning_rate": 4.858506352500124e-05, + "loss": 0.0675, "step": 6090 }, { - "epoch": 0.399084069349035, - "grad_norm": 1.1544404029846191, - "learning_rate": 6.776666666666667e-05, - "loss": 0.1118, + "epoch": 1.5961400065423619, + "grad_norm": 0.5728774070739746, + "learning_rate": 4.857820080228003e-05, + "loss": 0.0693, "step": 6100 }, { - "epoch": 0.3997383055282957, - "grad_norm": 1.1222143173217773, - "learning_rate": 6.787777777777778e-05, - "loss": 0.1001, + "epoch": 1.5987569512594046, + "grad_norm": 0.7324960231781006, + "learning_rate": 4.8571321964080904e-05, + "loss": 0.0771, "step": 6110 }, { - "epoch": 0.40039254170755645, - "grad_norm": 0.9140275716781616, - "learning_rate": 6.798888888888889e-05, - "loss": 0.1146, + "epoch": 1.6013738959764474, + "grad_norm": 1.0330255031585693, + "learning_rate": 4.856442701510548e-05, + "loss": 0.073, "step": 6120 }, { - "epoch": 0.40104677788681714, - "grad_norm": 0.9539370536804199, - "learning_rate": 6.81e-05, - "loss": 0.1135, + "epoch": 1.6039908406934904, + "grad_norm": 0.634896457195282, + "learning_rate": 4.855751596006638e-05, + "loss": 0.0693, "step": 6130 }, { - "epoch": 0.40170101406607783, - "grad_norm": 1.0647083520889282, - "learning_rate": 6.821111111111112e-05, - "loss": 0.1102, + "epoch": 1.6066077854105332, + "grad_norm": 0.46513810753822327, + "learning_rate": 4.855058880368727e-05, + "loss": 0.0694, "step": 6140 }, { - "epoch": 0.4023552502453386, - "grad_norm": 0.9687780141830444, - "learning_rate": 6.832222222222222e-05, - "loss": 0.1164, + "epoch": 1.609224730127576, + "grad_norm": 0.6653358340263367, + "learning_rate": 4.854364555070277e-05, + "loss": 0.0648, "step": 6150 }, { - "epoch": 0.4030094864245993, - "grad_norm": 1.0673967599868774, - "learning_rate": 6.843333333333333e-05, - "loss": 0.1166, + "epoch": 1.611841674844619, + "grad_norm": 0.6717187762260437, + "learning_rate": 4.8536686205858545e-05, + "loss": 0.0647, "step": 6160 }, { - "epoch": 0.40366372260386, - "grad_norm": 1.0471779108047485, - "learning_rate": 6.854444444444445e-05, - "loss": 0.11, + "epoch": 1.614458619561662, + "grad_norm": 0.521497905254364, + "learning_rate": 4.852971077391123e-05, + "loss": 0.0612, "step": 6170 }, { - "epoch": 0.4043179587831207, - "grad_norm": 1.0822899341583252, - "learning_rate": 6.865555555555556e-05, - "loss": 0.1246, + "epoch": 1.6170755642787045, + "grad_norm": 0.6874605417251587, + "learning_rate": 4.852271925962848e-05, + "loss": 0.063, "step": 6180 }, { - "epoch": 0.4049721949623814, - "grad_norm": 0.7718848586082458, - "learning_rate": 6.876666666666667e-05, - "loss": 0.1116, + "epoch": 1.6196925089957475, + "grad_norm": 0.5139410495758057, + "learning_rate": 4.851571166778892e-05, + "loss": 0.064, "step": 6190 }, { - "epoch": 0.40562643114164215, - "grad_norm": 1.1657593250274658, - "learning_rate": 6.887777777777778e-05, - "loss": 0.1087, + "epoch": 1.6223094537127905, + "grad_norm": 0.5792190432548523, + "learning_rate": 4.850868800318218e-05, + "loss": 0.072, "step": 6200 }, { - "epoch": 0.40628066732090284, - "grad_norm": 0.8355668783187866, - "learning_rate": 6.89888888888889e-05, - "loss": 0.1003, + "epoch": 1.6249263984298332, + "grad_norm": 0.6859474778175354, + "learning_rate": 4.8501648270608854e-05, + "loss": 0.0716, "step": 6210 }, { - "epoch": 0.40693490350016354, - "grad_norm": 0.9746671319007874, - "learning_rate": 6.91e-05, - "loss": 0.1282, + "epoch": 1.627543343146876, + "grad_norm": 0.6160419583320618, + "learning_rate": 4.8494592474880544e-05, + "loss": 0.0669, "step": 6220 }, { - "epoch": 0.4075891396794243, - "grad_norm": 1.1945754289627075, - "learning_rate": 6.921111111111111e-05, - "loss": 0.1219, + "epoch": 1.630160287863919, + "grad_norm": 0.7906918525695801, + "learning_rate": 4.848752062081982e-05, + "loss": 0.0634, "step": 6230 }, { - "epoch": 0.408243375858685, - "grad_norm": 1.1995118856430054, - "learning_rate": 6.932222222222222e-05, - "loss": 0.1212, + "epoch": 1.6327772325809617, + "grad_norm": 0.47414660453796387, + "learning_rate": 4.8480432713260226e-05, + "loss": 0.0709, "step": 6240 }, { - "epoch": 0.4088976120379457, - "grad_norm": 1.1618099212646484, - "learning_rate": 6.943333333333335e-05, - "loss": 0.117, + "epoch": 1.6353941772980045, + "grad_norm": 0.6333305239677429, + "learning_rate": 4.847332875704628e-05, + "loss": 0.0666, "step": 6250 }, { - "epoch": 0.4095518482172064, - "grad_norm": 0.8268874883651733, - "learning_rate": 6.954444444444445e-05, - "loss": 0.1141, + "epoch": 1.6380111220150475, + "grad_norm": 0.6713711619377136, + "learning_rate": 4.846620875703347e-05, + "loss": 0.0659, "step": 6260 }, { - "epoch": 0.4102060843964671, - "grad_norm": 0.8862093091011047, - "learning_rate": 6.965555555555556e-05, - "loss": 0.122, + "epoch": 1.6406280667320903, + "grad_norm": 0.844963550567627, + "learning_rate": 4.845907271808825e-05, + "loss": 0.062, "step": 6270 }, { - "epoch": 0.41086032057572786, - "grad_norm": 1.2067736387252808, - "learning_rate": 6.976666666666666e-05, - "loss": 0.1089, + "epoch": 1.643245011449133, + "grad_norm": 0.7952145338058472, + "learning_rate": 4.8451920645088025e-05, + "loss": 0.0627, "step": 6280 }, { - "epoch": 0.41151455675498855, - "grad_norm": 1.0685234069824219, - "learning_rate": 6.987777777777779e-05, - "loss": 0.1147, + "epoch": 1.645861956166176, + "grad_norm": 0.9742829203605652, + "learning_rate": 4.8444752542921186e-05, + "loss": 0.0707, "step": 6290 }, { - "epoch": 0.41216879293424924, - "grad_norm": 0.9467248916625977, - "learning_rate": 6.99888888888889e-05, - "loss": 0.1282, + "epoch": 1.6484789008832188, + "grad_norm": 0.8199586272239685, + "learning_rate": 4.843756841648705e-05, + "loss": 0.0658, "step": 6300 }, { - "epoch": 0.41282302911351, - "grad_norm": 1.2121899127960205, - "learning_rate": 7.01e-05, - "loss": 0.1052, + "epoch": 1.6510958456002616, + "grad_norm": 0.8059208989143372, + "learning_rate": 4.84303682706959e-05, + "loss": 0.0659, "step": 6310 }, { - "epoch": 0.4134772652927707, - "grad_norm": 1.1744288206100464, - "learning_rate": 7.021111111111111e-05, - "loss": 0.1119, + "epoch": 1.6537127903173046, + "grad_norm": 0.7988175749778748, + "learning_rate": 4.842315211046898e-05, + "loss": 0.0644, "step": 6320 }, { - "epoch": 0.4141315014720314, - "grad_norm": 1.219565749168396, - "learning_rate": 7.032222222222223e-05, - "loss": 0.1203, + "epoch": 1.6563297350343476, + "grad_norm": 0.8034182786941528, + "learning_rate": 4.8415919940738464e-05, + "loss": 0.0664, "step": 6330 }, { - "epoch": 0.4147857376512921, - "grad_norm": 0.9562080502510071, - "learning_rate": 7.043333333333334e-05, - "loss": 0.1157, + "epoch": 1.65894667975139, + "grad_norm": 0.7371190786361694, + "learning_rate": 4.8408671766447456e-05, + "loss": 0.0648, "step": 6340 }, { - "epoch": 0.4154399738305528, - "grad_norm": 1.188444972038269, - "learning_rate": 7.054444444444444e-05, - "loss": 0.1153, + "epoch": 1.661563624468433, + "grad_norm": 0.9478877782821655, + "learning_rate": 4.840140759255003e-05, + "loss": 0.0757, "step": 6350 }, { - "epoch": 0.41609421000981356, - "grad_norm": 1.1564666032791138, - "learning_rate": 7.065555555555556e-05, - "loss": 0.1135, + "epoch": 1.664180569185476, + "grad_norm": 0.7560123205184937, + "learning_rate": 4.839412742401118e-05, + "loss": 0.0673, "step": 6360 }, { - "epoch": 0.41674844618907425, - "grad_norm": 0.9498192071914673, - "learning_rate": 7.076666666666667e-05, - "loss": 0.119, + "epoch": 1.6667975139025188, + "grad_norm": 1.1563645601272583, + "learning_rate": 4.838683126580683e-05, + "loss": 0.068, "step": 6370 }, { - "epoch": 0.41740268236833494, - "grad_norm": 1.0807279348373413, - "learning_rate": 7.087777777777778e-05, - "loss": 0.1176, + "epoch": 1.6694144586195616, + "grad_norm": 0.8656620979309082, + "learning_rate": 4.8379519122923825e-05, + "loss": 0.0667, "step": 6380 }, { - "epoch": 0.4180569185475957, - "grad_norm": 1.169202446937561, - "learning_rate": 7.098888888888889e-05, - "loss": 0.1206, + "epoch": 1.6720314033366046, + "grad_norm": 0.7312747240066528, + "learning_rate": 4.8372191000359955e-05, + "loss": 0.0666, "step": 6390 }, { - "epoch": 0.4187111547268564, - "grad_norm": 1.1617976427078247, - "learning_rate": 7.11e-05, - "loss": 0.1017, + "epoch": 1.6746483480536474, + "grad_norm": 0.8066172003746033, + "learning_rate": 4.836484690312393e-05, + "loss": 0.0692, "step": 6400 }, { - "epoch": 0.41936539090611713, - "grad_norm": 0.8993281126022339, - "learning_rate": 7.121111111111112e-05, - "loss": 0.1053, + "epoch": 1.6772652927706901, + "grad_norm": 0.8848495483398438, + "learning_rate": 4.8357486836235365e-05, + "loss": 0.0754, "step": 6410 }, { - "epoch": 0.4200196270853778, - "grad_norm": 1.145491600036621, - "learning_rate": 7.132222222222222e-05, - "loss": 0.1247, + "epoch": 1.6798822374877331, + "grad_norm": 0.6636145114898682, + "learning_rate": 4.8350110804724794e-05, + "loss": 0.0721, "step": 6420 }, { - "epoch": 0.4206738632646385, - "grad_norm": 1.1302369832992554, - "learning_rate": 7.143333333333334e-05, - "loss": 0.1165, + "epoch": 1.682499182204776, + "grad_norm": 0.6970325708389282, + "learning_rate": 4.834271881363367e-05, + "loss": 0.0644, "step": 6430 }, { - "epoch": 0.42132809944389926, - "grad_norm": 1.070312261581421, - "learning_rate": 7.154444444444444e-05, - "loss": 0.112, + "epoch": 1.6851161269218187, + "grad_norm": 0.6332338452339172, + "learning_rate": 4.833531086801434e-05, + "loss": 0.0663, "step": 6440 }, { - "epoch": 0.42198233562315995, - "grad_norm": 1.261004090309143, - "learning_rate": 7.165555555555556e-05, - "loss": 0.1202, + "epoch": 1.6877330716388617, + "grad_norm": 0.703227698802948, + "learning_rate": 4.832788697293007e-05, + "loss": 0.0641, "step": 6450 }, { - "epoch": 0.42263657180242065, - "grad_norm": 1.149681806564331, - "learning_rate": 7.176666666666667e-05, - "loss": 0.1116, + "epoch": 1.6903500163559044, + "grad_norm": 0.8867107033729553, + "learning_rate": 4.832044713345503e-05, + "loss": 0.0703, "step": 6460 }, { - "epoch": 0.4232908079816814, - "grad_norm": 1.0995161533355713, - "learning_rate": 7.187777777777777e-05, - "loss": 0.1068, + "epoch": 1.6929669610729472, + "grad_norm": 0.7940611243247986, + "learning_rate": 4.831299135467426e-05, + "loss": 0.0632, "step": 6470 }, { - "epoch": 0.4239450441609421, - "grad_norm": 1.2060518264770508, - "learning_rate": 7.198888888888889e-05, - "loss": 0.1178, + "epoch": 1.6955839057899902, + "grad_norm": 0.7674934267997742, + "learning_rate": 4.830551964168374e-05, + "loss": 0.0648, "step": 6480 }, { - "epoch": 0.42459928034020283, - "grad_norm": 0.885783851146698, - "learning_rate": 7.21e-05, - "loss": 0.1114, + "epoch": 1.6982008505070332, + "grad_norm": 0.6497439742088318, + "learning_rate": 4.829803199959029e-05, + "loss": 0.0697, "step": 6490 }, { - "epoch": 0.4252535165194635, - "grad_norm": 1.0212535858154297, - "learning_rate": 7.221111111111112e-05, - "loss": 0.102, + "epoch": 1.700817795224076, + "grad_norm": 0.6304329633712769, + "learning_rate": 4.829052843351167e-05, + "loss": 0.0657, "step": 6500 }, { - "epoch": 0.4259077526987242, - "grad_norm": 0.9369866847991943, - "learning_rate": 7.232222222222222e-05, - "loss": 0.1018, + "epoch": 1.7034347399411187, + "grad_norm": 0.621741533279419, + "learning_rate": 4.828300894857647e-05, + "loss": 0.0727, "step": 6510 }, { - "epoch": 0.42656198887798497, - "grad_norm": 1.102473497390747, - "learning_rate": 7.243333333333334e-05, - "loss": 0.1163, + "epoch": 1.7060516846581617, + "grad_norm": 0.7226573824882507, + "learning_rate": 4.827547354992421e-05, + "loss": 0.065, "step": 6520 }, { - "epoch": 0.42721622505724566, - "grad_norm": 0.9343268871307373, - "learning_rate": 7.254444444444445e-05, - "loss": 0.1118, + "epoch": 1.7086686293752045, + "grad_norm": 0.9050750732421875, + "learning_rate": 4.826792224270524e-05, + "loss": 0.0662, "step": 6530 }, { - "epoch": 0.4278704612365064, - "grad_norm": 0.9922140836715698, - "learning_rate": 7.265555555555555e-05, - "loss": 0.1312, + "epoch": 1.7112855740922472, + "grad_norm": 0.659066915512085, + "learning_rate": 4.826035503208083e-05, + "loss": 0.0584, "step": 6540 }, { - "epoch": 0.4285246974157671, - "grad_norm": 0.92384934425354, - "learning_rate": 7.276666666666667e-05, - "loss": 0.1152, + "epoch": 1.7139025188092902, + "grad_norm": 0.580033004283905, + "learning_rate": 4.825277192322309e-05, + "loss": 0.0682, "step": 6550 }, { - "epoch": 0.4291789335950278, - "grad_norm": 0.8126742243766785, - "learning_rate": 7.287777777777778e-05, - "loss": 0.1132, + "epoch": 1.716519463526333, + "grad_norm": 0.7463915348052979, + "learning_rate": 4.8245172921315e-05, + "loss": 0.067, "step": 6560 }, { - "epoch": 0.42983316977428854, - "grad_norm": 0.8990784287452698, - "learning_rate": 7.29888888888889e-05, - "loss": 0.1172, + "epoch": 1.7191364082433758, + "grad_norm": 0.6378515958786011, + "learning_rate": 4.82375580315504e-05, + "loss": 0.0655, "step": 6570 }, { - "epoch": 0.43048740595354923, - "grad_norm": 0.7989428043365479, - "learning_rate": 7.31e-05, - "loss": 0.1146, + "epoch": 1.7217533529604188, + "grad_norm": 0.4390958845615387, + "learning_rate": 4.8229927259134014e-05, + "loss": 0.0582, "step": 6580 }, { - "epoch": 0.4311416421328099, - "grad_norm": 1.0440112352371216, - "learning_rate": 7.321111111111112e-05, - "loss": 0.1257, + "epoch": 1.7243702976774615, + "grad_norm": 0.4259874224662781, + "learning_rate": 4.8222280609281376e-05, + "loss": 0.0666, "step": 6590 }, { - "epoch": 0.43179587831207067, - "grad_norm": 0.9922072887420654, - "learning_rate": 7.332222222222223e-05, - "loss": 0.116, + "epoch": 1.7269872423945043, + "grad_norm": 0.4031057357788086, + "learning_rate": 4.821461808721892e-05, + "loss": 0.0666, "step": 6600 }, { - "epoch": 0.43245011449133136, - "grad_norm": 0.951362133026123, - "learning_rate": 7.343333333333333e-05, - "loss": 0.1205, + "epoch": 1.7296041871115473, + "grad_norm": 0.9084675312042236, + "learning_rate": 4.820693969818391e-05, + "loss": 0.0707, "step": 6610 }, { - "epoch": 0.4331043506705921, - "grad_norm": 1.096359133720398, - "learning_rate": 7.354444444444445e-05, - "loss": 0.1092, + "epoch": 1.7322211318285903, + "grad_norm": 0.6418400406837463, + "learning_rate": 4.819924544742444e-05, + "loss": 0.0741, "step": 6620 }, { - "epoch": 0.4337585868498528, - "grad_norm": 0.9648379683494568, - "learning_rate": 7.365555555555555e-05, - "loss": 0.1121, + "epoch": 1.7348380765456328, + "grad_norm": 0.7338866591453552, + "learning_rate": 4.8191535340199464e-05, + "loss": 0.068, "step": 6630 }, { - "epoch": 0.4344128230291135, - "grad_norm": 0.9251676797866821, - "learning_rate": 7.376666666666667e-05, - "loss": 0.1139, + "epoch": 1.7374550212626758, + "grad_norm": 0.6052848696708679, + "learning_rate": 4.8183809381778765e-05, + "loss": 0.0727, "step": 6640 }, { - "epoch": 0.43506705920837424, - "grad_norm": 1.0393186807632446, - "learning_rate": 7.387777777777778e-05, - "loss": 0.1136, + "epoch": 1.7400719659797188, + "grad_norm": 0.7197098731994629, + "learning_rate": 4.8176067577442964e-05, + "loss": 0.0707, "step": 6650 }, { - "epoch": 0.43572129538763493, - "grad_norm": 0.870494544506073, - "learning_rate": 7.39888888888889e-05, - "loss": 0.1052, + "epoch": 1.7426889106967616, + "grad_norm": 0.7238598465919495, + "learning_rate": 4.816830993248351e-05, + "loss": 0.0613, "step": 6660 }, { - "epoch": 0.4363755315668956, - "grad_norm": 0.9582804441452026, - "learning_rate": 7.41e-05, - "loss": 0.1218, + "epoch": 1.7453058554138043, + "grad_norm": 0.6141265630722046, + "learning_rate": 4.8160536452202673e-05, + "loss": 0.0694, "step": 6670 }, { - "epoch": 0.4370297677461564, - "grad_norm": 1.166326642036438, - "learning_rate": 7.421111111111111e-05, - "loss": 0.1126, + "epoch": 1.7479228001308473, + "grad_norm": 0.6801651120185852, + "learning_rate": 4.815274714191357e-05, + "loss": 0.0627, "step": 6680 }, { - "epoch": 0.43768400392541706, - "grad_norm": 0.9711355566978455, - "learning_rate": 7.432222222222223e-05, - "loss": 0.1186, + "epoch": 1.75053974484789, + "grad_norm": 0.8193888664245605, + "learning_rate": 4.814494200694012e-05, + "loss": 0.0653, "step": 6690 }, { - "epoch": 0.4383382401046778, - "grad_norm": 1.110071063041687, - "learning_rate": 7.443333333333333e-05, - "loss": 0.1117, + "epoch": 1.7531566895649329, + "grad_norm": 0.6166638731956482, + "learning_rate": 4.813712105261704e-05, + "loss": 0.0601, "step": 6700 }, { - "epoch": 0.4389924762839385, - "grad_norm": 0.836465060710907, - "learning_rate": 7.454444444444445e-05, - "loss": 0.1108, + "epoch": 1.7557736342819759, + "grad_norm": 0.7112689018249512, + "learning_rate": 4.81292842842899e-05, + "loss": 0.0708, "step": 6710 }, { - "epoch": 0.4396467124631992, - "grad_norm": 1.120985507965088, - "learning_rate": 7.465555555555556e-05, - "loss": 0.1172, + "epoch": 1.7583905789990186, + "grad_norm": 0.5664924383163452, + "learning_rate": 4.812143170731504e-05, + "loss": 0.0718, "step": 6720 }, { - "epoch": 0.44030094864245994, - "grad_norm": 1.0334186553955078, - "learning_rate": 7.476666666666668e-05, - "loss": 0.1142, + "epoch": 1.7610075237160614, + "grad_norm": 0.5493945479393005, + "learning_rate": 4.811356332705963e-05, + "loss": 0.0711, "step": 6730 }, { - "epoch": 0.44095518482172064, - "grad_norm": 1.0058670043945312, - "learning_rate": 7.487777777777778e-05, - "loss": 0.1191, + "epoch": 1.7636244684331044, + "grad_norm": 0.8429237604141235, + "learning_rate": 4.810567914890164e-05, + "loss": 0.063, "step": 6740 }, { - "epoch": 0.44160942100098133, - "grad_norm": 1.1377954483032227, - "learning_rate": 7.49888888888889e-05, - "loss": 0.1083, + "epoch": 1.7662414131501472, + "grad_norm": 0.6945361495018005, + "learning_rate": 4.809777917822982e-05, + "loss": 0.0738, "step": 6750 }, { - "epoch": 0.4422636571802421, - "grad_norm": 1.0749398469924927, - "learning_rate": 7.510000000000001e-05, - "loss": 0.1275, + "epoch": 1.76885835786719, + "grad_norm": 0.5371482968330383, + "learning_rate": 4.808986342044374e-05, + "loss": 0.0673, "step": 6760 }, { - "epoch": 0.44291789335950277, - "grad_norm": 1.0065230131149292, - "learning_rate": 7.521111111111111e-05, - "loss": 0.1192, + "epoch": 1.771475302584233, + "grad_norm": 0.7007372975349426, + "learning_rate": 4.8081931880953726e-05, + "loss": 0.0667, "step": 6770 }, { - "epoch": 0.4435721295387635, - "grad_norm": 1.1602001190185547, - "learning_rate": 7.532222222222223e-05, - "loss": 0.1137, + "epoch": 1.774092247301276, + "grad_norm": 0.5731061100959778, + "learning_rate": 4.807398456518092e-05, + "loss": 0.0664, "step": 6780 }, { - "epoch": 0.4442263657180242, - "grad_norm": 1.0304006338119507, - "learning_rate": 7.543333333333333e-05, - "loss": 0.1206, + "epoch": 1.7767091920183185, + "grad_norm": 0.845836341381073, + "learning_rate": 4.806602147855725e-05, + "loss": 0.0655, "step": 6790 }, { - "epoch": 0.4448806018972849, - "grad_norm": 1.0605281591415405, - "learning_rate": 7.554444444444446e-05, - "loss": 0.1171, + "epoch": 1.7793261367353614, + "grad_norm": 0.6107457280158997, + "learning_rate": 4.805804262652539e-05, + "loss": 0.0666, "step": 6800 }, { - "epoch": 0.44553483807654565, - "grad_norm": 0.8925848007202148, - "learning_rate": 7.565555555555556e-05, - "loss": 0.1091, + "epoch": 1.7819430814524044, + "grad_norm": 1.2480820417404175, + "learning_rate": 4.805004801453882e-05, + "loss": 0.0662, "step": 6810 }, { - "epoch": 0.44618907425580634, - "grad_norm": 0.9806022047996521, - "learning_rate": 7.576666666666667e-05, - "loss": 0.1168, + "epoch": 1.7845600261694472, + "grad_norm": 0.5964035391807556, + "learning_rate": 4.8042037648061784e-05, + "loss": 0.0733, "step": 6820 }, { - "epoch": 0.44684331043506703, - "grad_norm": 1.2420053482055664, - "learning_rate": 7.587777777777778e-05, - "loss": 0.117, + "epoch": 1.78717697088649, + "grad_norm": 0.8358713388442993, + "learning_rate": 4.803401153256929e-05, + "loss": 0.0672, "step": 6830 }, { - "epoch": 0.4474975466143278, - "grad_norm": 1.1187241077423096, - "learning_rate": 7.598888888888889e-05, - "loss": 0.1286, + "epoch": 1.789793915603533, + "grad_norm": 0.7865737676620483, + "learning_rate": 4.802596967354711e-05, + "loss": 0.0649, "step": 6840 }, { - "epoch": 0.44815178279358847, - "grad_norm": 1.2027949094772339, - "learning_rate": 7.61e-05, - "loss": 0.1178, + "epoch": 1.7924108603205757, + "grad_norm": 0.6207346320152283, + "learning_rate": 4.801791207649177e-05, + "loss": 0.072, "step": 6850 }, { - "epoch": 0.4488060189728492, - "grad_norm": 1.1143661737442017, - "learning_rate": 7.621111111111111e-05, - "loss": 0.115, + "epoch": 1.7950278050376185, + "grad_norm": 0.5982876420021057, + "learning_rate": 4.800983874691058e-05, + "loss": 0.0646, "step": 6860 }, { - "epoch": 0.4494602551521099, - "grad_norm": 0.9064378142356873, - "learning_rate": 7.632222222222222e-05, - "loss": 0.1141, + "epoch": 1.7976447497546615, + "grad_norm": 0.6681666374206543, + "learning_rate": 4.800174969032158e-05, + "loss": 0.0691, "step": 6870 }, { - "epoch": 0.4501144913313706, - "grad_norm": 1.2292486429214478, - "learning_rate": 7.643333333333334e-05, - "loss": 0.1234, + "epoch": 1.8002616944717043, + "grad_norm": 0.6014638543128967, + "learning_rate": 4.799364491225356e-05, + "loss": 0.0693, "step": 6880 }, { - "epoch": 0.45076872751063135, - "grad_norm": 1.2885360717773438, - "learning_rate": 7.654444444444445e-05, - "loss": 0.1205, + "epoch": 1.802878639188747, + "grad_norm": 0.5622276067733765, + "learning_rate": 4.7985524418246054e-05, + "loss": 0.065, "step": 6890 }, { - "epoch": 0.45142296368989204, - "grad_norm": 0.9081943035125732, - "learning_rate": 7.665555555555556e-05, - "loss": 0.1226, + "epoch": 1.80549558390579, + "grad_norm": 0.567729115486145, + "learning_rate": 4.797738821384935e-05, + "loss": 0.0679, "step": 6900 }, { - "epoch": 0.4520771998691528, - "grad_norm": 1.1508632898330688, - "learning_rate": 7.676666666666667e-05, - "loss": 0.1219, + "epoch": 1.808112528622833, + "grad_norm": 0.6296717524528503, + "learning_rate": 4.796923630462446e-05, + "loss": 0.0728, "step": 6910 }, { - "epoch": 0.4527314360484135, - "grad_norm": 0.971246063709259, - "learning_rate": 7.687777777777779e-05, - "loss": 0.1315, + "epoch": 1.8107294733398756, + "grad_norm": 0.8331077098846436, + "learning_rate": 4.796106869614315e-05, + "loss": 0.0687, "step": 6920 }, { - "epoch": 0.4533856722276742, - "grad_norm": 1.2282114028930664, - "learning_rate": 7.698888888888889e-05, - "loss": 0.1183, + "epoch": 1.8133464180569185, + "grad_norm": 0.7066493034362793, + "learning_rate": 4.79528853939879e-05, + "loss": 0.0654, "step": 6930 }, { - "epoch": 0.4540399084069349, - "grad_norm": 1.111584186553955, - "learning_rate": 7.71e-05, - "loss": 0.1193, + "epoch": 1.8159633627739615, + "grad_norm": 0.8010251522064209, + "learning_rate": 4.794468640375191e-05, + "loss": 0.0625, "step": 6940 }, { - "epoch": 0.4546941445861956, - "grad_norm": 1.0776195526123047, - "learning_rate": 7.72111111111111e-05, - "loss": 0.1157, + "epoch": 1.8185803074910043, + "grad_norm": 0.633176863193512, + "learning_rate": 4.793647173103912e-05, + "loss": 0.0665, "step": 6950 }, { - "epoch": 0.4553483807654563, - "grad_norm": 1.1044821739196777, - "learning_rate": 7.732222222222223e-05, - "loss": 0.1191, + "epoch": 1.821197252208047, + "grad_norm": 0.9934688806533813, + "learning_rate": 4.792824138146418e-05, + "loss": 0.0671, "step": 6960 }, { - "epoch": 0.45600261694471705, - "grad_norm": 0.9981603622436523, - "learning_rate": 7.743333333333334e-05, - "loss": 0.1231, + "epoch": 1.82381419692509, + "grad_norm": 0.4914017617702484, + "learning_rate": 4.791999536065246e-05, + "loss": 0.0662, "step": 6970 }, { - "epoch": 0.45665685312397775, - "grad_norm": 1.0244512557983398, - "learning_rate": 7.754444444444445e-05, - "loss": 0.1175, + "epoch": 1.8264311416421328, + "grad_norm": 0.6178949475288391, + "learning_rate": 4.791173367424002e-05, + "loss": 0.0692, "step": 6980 }, { - "epoch": 0.4573110893032385, - "grad_norm": 0.9602757692337036, - "learning_rate": 7.765555555555555e-05, - "loss": 0.1095, + "epoch": 1.8290480863591756, + "grad_norm": 0.805231511592865, + "learning_rate": 4.790345632787367e-05, + "loss": 0.0623, "step": 6990 }, { - "epoch": 0.4579653254824992, - "grad_norm": 1.00235116481781, - "learning_rate": 7.776666666666667e-05, - "loss": 0.1066, + "epoch": 1.8316650310762186, + "grad_norm": 0.5471294522285461, + "learning_rate": 4.789516332721089e-05, + "loss": 0.0594, + "step": 7000 + }, + { + "epoch": 1.8316650310762186, + "eval_loss": 0.07114872934573006, + "eval_runtime": 8.5303, + "eval_samples_per_second": 120.043, + "eval_steps_per_second": 1.876, "step": 7000 }, { - "epoch": 0.4586195616617599, - "grad_norm": 0.9261903762817383, - "learning_rate": 7.787777777777778e-05, - "loss": 0.1073, + "epoch": 1.8342819757932614, + "grad_norm": 0.7086074948310852, + "learning_rate": 4.7886854677919856e-05, + "loss": 0.0649, "step": 7010 }, { - "epoch": 0.4592737978410206, - "grad_norm": 0.9496639370918274, - "learning_rate": 7.798888888888889e-05, - "loss": 0.1291, + "epoch": 1.8368989205103041, + "grad_norm": 0.5640758275985718, + "learning_rate": 4.7878530385679466e-05, + "loss": 0.0599, "step": 7020 }, { - "epoch": 0.4599280340202813, - "grad_norm": 0.983691394329071, - "learning_rate": 7.81e-05, - "loss": 0.1113, + "epoch": 1.8395158652273471, + "grad_norm": 0.598824679851532, + "learning_rate": 4.7870190456179284e-05, + "loss": 0.0636, "step": 7030 }, { - "epoch": 0.460582270199542, - "grad_norm": 1.006894826889038, - "learning_rate": 7.821111111111112e-05, - "loss": 0.1102, + "epoch": 1.8421328099443899, + "grad_norm": 0.5776214003562927, + "learning_rate": 4.786183489511958e-05, + "loss": 0.0677, "step": 7040 }, { - "epoch": 0.46123650637880276, - "grad_norm": 0.907843291759491, - "learning_rate": 7.832222222222223e-05, - "loss": 0.1161, + "epoch": 1.8447497546614327, + "grad_norm": 0.6187028288841248, + "learning_rate": 4.78534637082113e-05, + "loss": 0.0607, "step": 7050 }, { - "epoch": 0.46189074255806345, - "grad_norm": 1.1818556785583496, - "learning_rate": 7.843333333333333e-05, - "loss": 0.1193, + "epoch": 1.8473666993784756, + "grad_norm": 0.5059308409690857, + "learning_rate": 4.784507690117607e-05, + "loss": 0.0614, "step": 7060 }, { - "epoch": 0.4625449787373242, - "grad_norm": 1.0127677917480469, - "learning_rate": 7.854444444444445e-05, - "loss": 0.1149, + "epoch": 1.8499836440955186, + "grad_norm": 0.6319045424461365, + "learning_rate": 4.783667447974619e-05, + "loss": 0.069, "step": 7070 }, { - "epoch": 0.4631992149165849, - "grad_norm": 0.9791194796562195, - "learning_rate": 7.865555555555556e-05, - "loss": 0.1251, + "epoch": 1.8526005888125612, + "grad_norm": 0.7948163747787476, + "learning_rate": 4.782825644966464e-05, + "loss": 0.0598, "step": 7080 }, { - "epoch": 0.4638534510958456, - "grad_norm": 1.0158543586730957, - "learning_rate": 7.876666666666667e-05, - "loss": 0.1137, + "epoch": 1.8552175335296042, + "grad_norm": 0.5553324818611145, + "learning_rate": 4.781982281668508e-05, + "loss": 0.0652, "step": 7090 }, { - "epoch": 0.46450768727510633, - "grad_norm": 1.2994464635849, - "learning_rate": 7.887777777777778e-05, - "loss": 0.1277, + "epoch": 1.8578344782466472, + "grad_norm": 0.49766290187835693, + "learning_rate": 4.781137358657179e-05, + "loss": 0.0638, "step": 7100 }, { - "epoch": 0.465161923454367, - "grad_norm": 1.0386182069778442, - "learning_rate": 7.89888888888889e-05, - "loss": 0.1398, + "epoch": 1.86045142296369, + "grad_norm": 0.4596664309501648, + "learning_rate": 4.780290876509975e-05, + "loss": 0.0672, "step": 7110 }, { - "epoch": 0.4658161596336277, - "grad_norm": 0.8721251487731934, - "learning_rate": 7.910000000000001e-05, - "loss": 0.1043, + "epoch": 1.8630683676807327, + "grad_norm": 0.5042189359664917, + "learning_rate": 4.779442835805459e-05, + "loss": 0.065, "step": 7120 }, { - "epoch": 0.46647039581288846, - "grad_norm": 1.1872791051864624, - "learning_rate": 7.921111111111111e-05, - "loss": 0.1087, + "epoch": 1.8656853123977757, + "grad_norm": 0.6542985439300537, + "learning_rate": 4.7785932371232586e-05, + "loss": 0.07, "step": 7130 }, { - "epoch": 0.46712463199214915, - "grad_norm": 1.0702382326126099, - "learning_rate": 7.932222222222223e-05, - "loss": 0.1136, + "epoch": 1.8683022571148185, + "grad_norm": 0.5698999762535095, + "learning_rate": 4.7777420810440666e-05, + "loss": 0.062, "step": 7140 }, { - "epoch": 0.4677788681714099, - "grad_norm": 1.5441776514053345, - "learning_rate": 7.943333333333333e-05, - "loss": 0.1156, + "epoch": 1.8709192018318612, + "grad_norm": 0.48582544922828674, + "learning_rate": 4.7768893681496394e-05, + "loss": 0.0571, "step": 7150 }, { - "epoch": 0.4684331043506706, - "grad_norm": 1.2157565355300903, - "learning_rate": 7.954444444444445e-05, - "loss": 0.1146, + "epoch": 1.8735361465489042, + "grad_norm": 0.7398406863212585, + "learning_rate": 4.7760350990227995e-05, + "loss": 0.065, "step": 7160 }, { - "epoch": 0.4690873405299313, - "grad_norm": 0.9208630323410034, - "learning_rate": 7.965555555555556e-05, - "loss": 0.1057, + "epoch": 1.876153091265947, + "grad_norm": 0.5045884251594543, + "learning_rate": 4.7751792742474317e-05, + "loss": 0.0668, "step": 7170 }, { - "epoch": 0.46974157670919203, - "grad_norm": 1.2033475637435913, - "learning_rate": 7.976666666666666e-05, - "loss": 0.1173, + "epoch": 1.8787700359829898, + "grad_norm": 0.6144871711730957, + "learning_rate": 4.774321894408483e-05, + "loss": 0.0683, "step": 7180 }, { - "epoch": 0.4703958128884527, - "grad_norm": 1.084647297859192, - "learning_rate": 7.987777777777778e-05, - "loss": 0.1103, + "epoch": 1.8813869807000327, + "grad_norm": 0.7012941837310791, + "learning_rate": 4.7734629600919645e-05, + "loss": 0.0678, "step": 7190 }, { - "epoch": 0.47105004906771347, - "grad_norm": 1.0333460569381714, - "learning_rate": 7.99888888888889e-05, - "loss": 0.1151, + "epoch": 1.8840039254170755, + "grad_norm": 0.5341391563415527, + "learning_rate": 4.772602471884951e-05, + "loss": 0.0619, "step": 7200 }, { - "epoch": 0.47170428524697416, - "grad_norm": 0.7924304604530334, - "learning_rate": 8.010000000000001e-05, - "loss": 0.1198, + "epoch": 1.8866208701341183, + "grad_norm": 0.761932909488678, + "learning_rate": 4.7717404303755775e-05, + "loss": 0.0647, "step": 7210 }, { - "epoch": 0.47235852142623486, - "grad_norm": 0.9112861156463623, - "learning_rate": 8.021111111111111e-05, - "loss": 0.1116, + "epoch": 1.8892378148511613, + "grad_norm": 0.7804669737815857, + "learning_rate": 4.7708768361530405e-05, + "loss": 0.0579, "step": 7220 }, { - "epoch": 0.4730127576054956, - "grad_norm": 0.8361260890960693, - "learning_rate": 8.032222222222223e-05, - "loss": 0.1169, + "epoch": 1.8918547595682043, + "grad_norm": 0.505577564239502, + "learning_rate": 4.770011689807599e-05, + "loss": 0.0692, "step": 7230 }, { - "epoch": 0.4736669937847563, - "grad_norm": 1.0955017805099487, - "learning_rate": 8.043333333333334e-05, - "loss": 0.1171, + "epoch": 1.894471704285247, + "grad_norm": 0.6581014394760132, + "learning_rate": 4.769144991930573e-05, + "loss": 0.0695, "step": 7240 }, { - "epoch": 0.474321229964017, - "grad_norm": 0.775754988193512, - "learning_rate": 8.054444444444444e-05, - "loss": 0.1109, + "epoch": 1.8970886490022898, + "grad_norm": 0.9491140842437744, + "learning_rate": 4.7682767431143416e-05, + "loss": 0.063, "step": 7250 }, { - "epoch": 0.47497546614327774, - "grad_norm": 0.9245556592941284, - "learning_rate": 8.065555555555556e-05, - "loss": 0.1185, + "epoch": 1.8997055937193328, + "grad_norm": 0.6348735690116882, + "learning_rate": 4.7674069439523445e-05, + "loss": 0.06, "step": 7260 }, { - "epoch": 0.47562970232253843, - "grad_norm": 0.9954939484596252, - "learning_rate": 8.076666666666667e-05, - "loss": 0.1199, + "epoch": 1.9023225384363756, + "grad_norm": 0.48946839570999146, + "learning_rate": 4.766535595039082e-05, + "loss": 0.0673, "step": 7270 }, { - "epoch": 0.4762839385017992, - "grad_norm": 0.9574825167655945, - "learning_rate": 8.087777777777779e-05, - "loss": 0.1154, + "epoch": 1.9049394831534183, + "grad_norm": 1.8328133821487427, + "learning_rate": 4.7656626969701124e-05, + "loss": 0.0674, "step": 7280 }, { - "epoch": 0.47693817468105987, - "grad_norm": 1.0296094417572021, - "learning_rate": 8.098888888888889e-05, - "loss": 0.1226, + "epoch": 1.9075564278704613, + "grad_norm": 0.7238545417785645, + "learning_rate": 4.7647882503420526e-05, + "loss": 0.0686, "step": 7290 }, { - "epoch": 0.47759241086032056, - "grad_norm": 1.061394453048706, - "learning_rate": 8.11e-05, - "loss": 0.1142, + "epoch": 1.910173372587504, + "grad_norm": 0.5402998328208923, + "learning_rate": 4.76391225575258e-05, + "loss": 0.0605, "step": 7300 }, { - "epoch": 0.4782466470395813, - "grad_norm": 1.1966540813446045, - "learning_rate": 8.121111111111112e-05, - "loss": 0.1252, + "epoch": 1.9127903173045468, + "grad_norm": 0.9406218528747559, + "learning_rate": 4.7630347138004285e-05, + "loss": 0.0645, "step": 7310 }, { - "epoch": 0.478900883218842, - "grad_norm": 1.1094386577606201, - "learning_rate": 8.132222222222222e-05, - "loss": 0.12, + "epoch": 1.9154072620215898, + "grad_norm": 0.729573130607605, + "learning_rate": 4.762155625085388e-05, + "loss": 0.0749, "step": 7320 }, { - "epoch": 0.4795551193981027, - "grad_norm": 1.1346668004989624, - "learning_rate": 8.143333333333334e-05, - "loss": 0.121, + "epoch": 1.9180242067386326, + "grad_norm": 0.5499677062034607, + "learning_rate": 4.7612749902083095e-05, + "loss": 0.0639, "step": 7330 }, { - "epoch": 0.48020935557736344, - "grad_norm": 1.08058500289917, - "learning_rate": 8.154444444444444e-05, - "loss": 0.1175, + "epoch": 1.9206411514556754, + "grad_norm": 0.7047156691551208, + "learning_rate": 4.760392809771098e-05, + "loss": 0.059, "step": 7340 }, { - "epoch": 0.48086359175662413, - "grad_norm": 1.0835899114608765, - "learning_rate": 8.165555555555557e-05, - "loss": 0.1139, + "epoch": 1.9232580961727184, + "grad_norm": 0.4866229295730591, + "learning_rate": 4.759509084376714e-05, + "loss": 0.0595, "step": 7350 }, { - "epoch": 0.4815178279358849, - "grad_norm": 1.1137175559997559, - "learning_rate": 8.176666666666667e-05, - "loss": 0.1182, + "epoch": 1.9258750408897614, + "grad_norm": 1.0535590648651123, + "learning_rate": 4.7586238146291785e-05, + "loss": 0.0725, "step": 7360 }, { - "epoch": 0.48217206411514557, - "grad_norm": 0.8468176126480103, - "learning_rate": 8.187777777777779e-05, - "loss": 0.1112, + "epoch": 1.928491985606804, + "grad_norm": 1.2395167350769043, + "learning_rate": 4.757737001133562e-05, + "loss": 0.0693, "step": 7370 }, { - "epoch": 0.48282630029440626, - "grad_norm": 0.984216034412384, - "learning_rate": 8.198888888888889e-05, - "loss": 0.1158, + "epoch": 1.931108930323847, + "grad_norm": 0.5386394262313843, + "learning_rate": 4.7568486444959945e-05, + "loss": 0.0679, "step": 7380 }, { - "epoch": 0.483480536473667, - "grad_norm": 1.0112940073013306, - "learning_rate": 8.21e-05, - "loss": 0.1184, + "epoch": 1.9337258750408899, + "grad_norm": 0.6242868900299072, + "learning_rate": 4.75595874532366e-05, + "loss": 0.0708, "step": 7390 }, { - "epoch": 0.4841347726529277, - "grad_norm": 1.1188663244247437, - "learning_rate": 8.221111111111112e-05, - "loss": 0.1205, + "epoch": 1.9363428197579327, + "grad_norm": 0.6938721537590027, + "learning_rate": 4.755067304224795e-05, + "loss": 0.0597, "step": 7400 }, { - "epoch": 0.4847890088321884, - "grad_norm": 1.0517669916152954, - "learning_rate": 8.232222222222222e-05, - "loss": 0.1175, + "epoch": 1.9389597644749754, + "grad_norm": 0.4725117087364197, + "learning_rate": 4.754174321808691e-05, + "loss": 0.0585, "step": 7410 }, { - "epoch": 0.48544324501144914, - "grad_norm": 0.9220474362373352, - "learning_rate": 8.243333333333334e-05, - "loss": 0.1072, + "epoch": 1.9415767091920184, + "grad_norm": 0.5444639325141907, + "learning_rate": 4.753279798685695e-05, + "loss": 0.0686, "step": 7420 }, { - "epoch": 0.48609748119070983, - "grad_norm": 0.9419307708740234, - "learning_rate": 8.254444444444445e-05, - "loss": 0.1173, + "epoch": 1.9441936539090612, + "grad_norm": 0.48581692576408386, + "learning_rate": 4.752383735467202e-05, + "loss": 0.0645, "step": 7430 }, { - "epoch": 0.4867517173699706, - "grad_norm": 0.7772601842880249, - "learning_rate": 8.265555555555557e-05, - "loss": 0.113, + "epoch": 1.946810598626104, + "grad_norm": 0.643066942691803, + "learning_rate": 4.751486132765666e-05, + "loss": 0.062, "step": 7440 }, { - "epoch": 0.4874059535492313, - "grad_norm": 0.9354446530342102, - "learning_rate": 8.276666666666667e-05, - "loss": 0.1158, + "epoch": 1.949427543343147, + "grad_norm": 0.34584930539131165, + "learning_rate": 4.750586991194588e-05, + "loss": 0.0612, "step": 7450 }, { - "epoch": 0.48806018972849197, - "grad_norm": 1.0952316522598267, - "learning_rate": 8.287777777777777e-05, - "loss": 0.1092, + "epoch": 1.9520444880601897, + "grad_norm": 0.7048997282981873, + "learning_rate": 4.749686311368523e-05, + "loss": 0.0678, "step": 7460 }, { - "epoch": 0.4887144259077527, - "grad_norm": 0.9121578335762024, - "learning_rate": 8.29888888888889e-05, - "loss": 0.1232, + "epoch": 1.9546614327772325, + "grad_norm": 0.7045301198959351, + "learning_rate": 4.748784093903078e-05, + "loss": 0.0656, "step": 7470 }, { - "epoch": 0.4893686620870134, - "grad_norm": 1.195021152496338, - "learning_rate": 8.31e-05, - "loss": 0.117, + "epoch": 1.9572783774942755, + "grad_norm": 0.6391457319259644, + "learning_rate": 4.7478803394149094e-05, + "loss": 0.0628, "step": 7480 }, { - "epoch": 0.4900228982662741, - "grad_norm": 1.1168640851974487, - "learning_rate": 8.321111111111112e-05, - "loss": 0.1202, + "epoch": 1.9598953222113182, + "grad_norm": 0.699043869972229, + "learning_rate": 4.746975048521725e-05, + "loss": 0.0575, "step": 7490 }, { - "epoch": 0.49067713444553485, - "grad_norm": 1.1179988384246826, - "learning_rate": 8.332222222222222e-05, - "loss": 0.1173, + "epoch": 1.962512266928361, + "grad_norm": 0.7100141048431396, + "learning_rate": 4.746068221842282e-05, + "loss": 0.0649, "step": 7500 }, { - "epoch": 0.49133137062479554, - "grad_norm": 1.026973843574524, - "learning_rate": 8.343333333333335e-05, - "loss": 0.1139, + "epoch": 1.965129211645404, + "grad_norm": 0.6790652871131897, + "learning_rate": 4.74515985999639e-05, + "loss": 0.0617, "step": 7510 }, { - "epoch": 0.4919856068040563, - "grad_norm": 1.1670565605163574, - "learning_rate": 8.354444444444445e-05, - "loss": 0.12, + "epoch": 1.967746156362447, + "grad_norm": 0.7839459180831909, + "learning_rate": 4.744249963604903e-05, + "loss": 0.0668, "step": 7520 }, { - "epoch": 0.492639842983317, - "grad_norm": 0.9871100187301636, - "learning_rate": 8.365555555555556e-05, - "loss": 0.1218, + "epoch": 1.9703631010794895, + "grad_norm": 0.7605068683624268, + "learning_rate": 4.743338533289728e-05, + "loss": 0.0641, "step": 7530 }, { - "epoch": 0.49329407916257767, - "grad_norm": 1.137722134590149, - "learning_rate": 8.376666666666667e-05, - "loss": 0.1178, + "epoch": 1.9729800457965325, + "grad_norm": 0.6718177795410156, + "learning_rate": 4.7424255696738195e-05, + "loss": 0.0615, "step": 7540 }, { - "epoch": 0.4939483153418384, - "grad_norm": 1.0345990657806396, - "learning_rate": 8.387777777777778e-05, - "loss": 0.1226, + "epoch": 1.9755969905135755, + "grad_norm": 0.47757506370544434, + "learning_rate": 4.741511073381179e-05, + "loss": 0.0606, "step": 7550 }, { - "epoch": 0.4946025515210991, - "grad_norm": 1.140154480934143, - "learning_rate": 8.39888888888889e-05, - "loss": 0.1297, + "epoch": 1.9782139352306183, + "grad_norm": 0.7829891443252563, + "learning_rate": 4.740595045036855e-05, + "loss": 0.073, "step": 7560 }, { - "epoch": 0.49525678770035986, - "grad_norm": 0.97530198097229, - "learning_rate": 8.41e-05, - "loss": 0.1194, + "epoch": 1.980830879947661, + "grad_norm": 0.4041369557380676, + "learning_rate": 4.739677485266946e-05, + "loss": 0.0623, "step": 7570 }, { - "epoch": 0.49591102387962055, - "grad_norm": 1.1377121210098267, - "learning_rate": 8.421111111111111e-05, - "loss": 0.1295, + "epoch": 1.983447824664704, + "grad_norm": 0.46687090396881104, + "learning_rate": 4.7387583946985946e-05, + "loss": 0.0581, "step": 7580 }, { - "epoch": 0.49656526005888124, - "grad_norm": 1.4145655632019043, - "learning_rate": 8.432222222222223e-05, - "loss": 0.1145, + "epoch": 1.9860647693817468, + "grad_norm": 0.5236268639564514, + "learning_rate": 4.7378377739599914e-05, + "loss": 0.0573, "step": 7590 }, { - "epoch": 0.497219496238142, - "grad_norm": 1.2647418975830078, - "learning_rate": 8.443333333333334e-05, - "loss": 0.1254, + "epoch": 1.9886817140987896, + "grad_norm": 0.46081119775772095, + "learning_rate": 4.73691562368037e-05, + "loss": 0.0662, "step": 7600 }, { - "epoch": 0.4978737324174027, - "grad_norm": 1.0249871015548706, - "learning_rate": 8.454444444444445e-05, - "loss": 0.1249, + "epoch": 1.9912986588158326, + "grad_norm": 0.764456033706665, + "learning_rate": 4.735991944490014e-05, + "loss": 0.0593, "step": 7610 }, { - "epoch": 0.4985279685966634, - "grad_norm": 0.8847797513008118, - "learning_rate": 8.465555555555556e-05, - "loss": 0.1039, + "epoch": 1.9939156035328753, + "grad_norm": 0.5772601962089539, + "learning_rate": 4.735066737020247e-05, + "loss": 0.0623, "step": 7620 }, { - "epoch": 0.4991822047759241, - "grad_norm": 1.2074919939041138, - "learning_rate": 8.476666666666668e-05, - "loss": 0.1109, + "epoch": 1.996532548249918, + "grad_norm": 0.5460513234138489, + "learning_rate": 4.734140001903441e-05, + "loss": 0.0633, "step": 7630 }, { - "epoch": 0.4998364409551848, - "grad_norm": 0.9957761764526367, - "learning_rate": 8.487777777777778e-05, - "loss": 0.1224, + "epoch": 1.999149492966961, + "grad_norm": 0.8035596609115601, + "learning_rate": 4.73321173977301e-05, + "loss": 0.0666, "step": 7640 }, { - "epoch": 0.5004906771344455, - "grad_norm": 0.8962732553482056, - "learning_rate": 8.498888888888889e-05, - "loss": 0.1247, + "epoch": 2.0015701668302257, + "grad_norm": 0.638861358165741, + "learning_rate": 4.732281951263413e-05, + "loss": 0.0611, "step": 7650 }, { - "epoch": 0.5011449133137063, - "grad_norm": 0.8505379557609558, - "learning_rate": 8.510000000000001e-05, - "loss": 0.1152, + "epoch": 2.0041871115472687, + "grad_norm": 0.455998033285141, + "learning_rate": 4.7313506370101515e-05, + "loss": 0.0681, "step": 7660 }, { - "epoch": 0.501799149492967, - "grad_norm": 1.0067418813705444, - "learning_rate": 8.521111111111112e-05, - "loss": 0.1152, + "epoch": 2.0068040562643112, + "grad_norm": 0.7048136591911316, + "learning_rate": 4.73041779764977e-05, + "loss": 0.0731, "step": 7670 }, { - "epoch": 0.5024533856722276, - "grad_norm": 0.9401522278785706, - "learning_rate": 8.532222222222223e-05, - "loss": 0.1306, + "epoch": 2.0094210009813542, + "grad_norm": 0.5252156257629395, + "learning_rate": 4.729483433819856e-05, + "loss": 0.0605, "step": 7680 }, { - "epoch": 0.5031076218514884, - "grad_norm": 0.9951933026313782, - "learning_rate": 8.543333333333333e-05, - "loss": 0.1126, + "epoch": 2.0120379456983972, + "grad_norm": 0.6412255167961121, + "learning_rate": 4.728547546159037e-05, + "loss": 0.0623, "step": 7690 }, { - "epoch": 0.5037618580307491, - "grad_norm": 1.0896522998809814, - "learning_rate": 8.554444444444444e-05, - "loss": 0.1158, + "epoch": 2.0146548904154398, + "grad_norm": 0.8844165205955505, + "learning_rate": 4.7276101353069843e-05, + "loss": 0.0688, "step": 7700 }, { - "epoch": 0.5044160942100098, - "grad_norm": 0.9648086428642273, - "learning_rate": 8.565555555555556e-05, - "loss": 0.1115, + "epoch": 2.0172718351324828, + "grad_norm": 0.4701058864593506, + "learning_rate": 4.72667120190441e-05, + "loss": 0.0675, "step": 7710 }, { - "epoch": 0.5050703303892705, - "grad_norm": 1.1799818277359009, - "learning_rate": 8.576666666666667e-05, - "loss": 0.1303, + "epoch": 2.0198887798495258, + "grad_norm": 0.5482307076454163, + "learning_rate": 4.7257307465930644e-05, + "loss": 0.0628, "step": 7720 }, { - "epoch": 0.5057245665685313, - "grad_norm": 1.237389326095581, - "learning_rate": 8.587777777777778e-05, - "loss": 0.1238, + "epoch": 2.0225057245665687, + "grad_norm": 0.7286236882209778, + "learning_rate": 4.724788770015741e-05, + "loss": 0.0613, "step": 7730 }, { - "epoch": 0.5063788027477919, - "grad_norm": 0.7409398555755615, - "learning_rate": 8.598888888888889e-05, - "loss": 0.1184, + "epoch": 2.0251226692836113, + "grad_norm": 0.49437442421913147, + "learning_rate": 4.723845272816272e-05, + "loss": 0.0611, "step": 7740 }, { - "epoch": 0.5070330389270526, - "grad_norm": 0.9186582565307617, - "learning_rate": 8.61e-05, - "loss": 0.1135, + "epoch": 2.0277396140006543, + "grad_norm": 0.4415714740753174, + "learning_rate": 4.722900255639529e-05, + "loss": 0.0639, "step": 7750 }, { - "epoch": 0.5076872751063134, - "grad_norm": 1.130561351776123, - "learning_rate": 8.621111111111112e-05, - "loss": 0.1213, + "epoch": 2.0303565587176973, + "grad_norm": 0.4924696683883667, + "learning_rate": 4.721953719131422e-05, + "loss": 0.0647, "step": 7760 }, { - "epoch": 0.5083415112855741, - "grad_norm": 1.1196184158325195, - "learning_rate": 8.632222222222222e-05, - "loss": 0.1171, + "epoch": 2.03297350343474, + "grad_norm": 0.4595443606376648, + "learning_rate": 4.721005663938899e-05, + "loss": 0.0686, "step": 7770 }, { - "epoch": 0.5089957474648348, - "grad_norm": 0.9427690505981445, - "learning_rate": 8.643333333333334e-05, - "loss": 0.1145, + "epoch": 2.035590448151783, + "grad_norm": 0.6738218069076538, + "learning_rate": 4.7200560907099476e-05, + "loss": 0.0614, "step": 7780 }, { - "epoch": 0.5096499836440955, - "grad_norm": 0.9436381459236145, - "learning_rate": 8.654444444444445e-05, - "loss": 0.113, + "epoch": 2.038207392868826, + "grad_norm": 0.7287262082099915, + "learning_rate": 4.719105000093593e-05, + "loss": 0.06, "step": 7790 }, { - "epoch": 0.5103042198233563, - "grad_norm": 1.0007132291793823, - "learning_rate": 8.665555555555556e-05, - "loss": 0.1174, + "epoch": 2.0408243375858683, + "grad_norm": 0.43699556589126587, + "learning_rate": 4.718152392739895e-05, + "loss": 0.0668, "step": 7800 }, { - "epoch": 0.5109584560026169, - "grad_norm": 1.1906782388687134, - "learning_rate": 8.676666666666667e-05, - "loss": 0.1363, + "epoch": 2.0434412823029113, + "grad_norm": 0.8281365633010864, + "learning_rate": 4.717198269299953e-05, + "loss": 0.0621, "step": 7810 }, { - "epoch": 0.5116126921818777, - "grad_norm": 0.9627518653869629, - "learning_rate": 8.687777777777779e-05, - "loss": 0.1123, + "epoch": 2.0460582270199543, + "grad_norm": 0.626327633857727, + "learning_rate": 4.7162426304259e-05, + "loss": 0.056, "step": 7820 }, { - "epoch": 0.5122669283611384, - "grad_norm": 1.0318536758422852, - "learning_rate": 8.69888888888889e-05, - "loss": 0.1138, + "epoch": 2.048675171736997, + "grad_norm": 0.49688369035720825, + "learning_rate": 4.715285476770908e-05, + "loss": 0.0634, "step": 7830 }, { - "epoch": 0.512921164540399, - "grad_norm": 0.9794100522994995, - "learning_rate": 8.71e-05, - "loss": 0.1184, + "epoch": 2.05129211645404, + "grad_norm": 0.4797987937927246, + "learning_rate": 4.714326808989181e-05, + "loss": 0.062, "step": 7840 }, { - "epoch": 0.5135754007196598, - "grad_norm": 1.2829060554504395, - "learning_rate": 8.72111111111111e-05, - "loss": 0.1206, + "epoch": 2.053909061171083, + "grad_norm": 0.4971781075000763, + "learning_rate": 4.71336662773596e-05, + "loss": 0.0617, "step": 7850 }, { - "epoch": 0.5142296368989205, - "grad_norm": 0.9050964117050171, - "learning_rate": 8.732222222222223e-05, - "loss": 0.1117, + "epoch": 2.0565260058881254, + "grad_norm": 0.5491329431533813, + "learning_rate": 4.71240493366752e-05, + "loss": 0.0654, "step": 7860 }, { - "epoch": 0.5148838730781812, - "grad_norm": 1.2302874326705933, - "learning_rate": 8.743333333333334e-05, - "loss": 0.1118, + "epoch": 2.0591429506051684, + "grad_norm": 0.5640793442726135, + "learning_rate": 4.711441727441169e-05, + "loss": 0.0598, "step": 7870 }, { - "epoch": 0.5155381092574419, - "grad_norm": 1.3270689249038696, - "learning_rate": 8.754444444444445e-05, - "loss": 0.1188, + "epoch": 2.0617598953222114, + "grad_norm": 0.7192339301109314, + "learning_rate": 4.71047700971525e-05, + "loss": 0.0633, "step": 7880 }, { - "epoch": 0.5161923454367027, - "grad_norm": 1.0173133611679077, - "learning_rate": 8.765555555555555e-05, - "loss": 0.1187, + "epoch": 2.0643768400392544, + "grad_norm": 0.5545005202293396, + "learning_rate": 4.709510781149139e-05, + "loss": 0.0686, "step": 7890 }, { - "epoch": 0.5168465816159633, - "grad_norm": 1.0799261331558228, - "learning_rate": 8.776666666666668e-05, - "loss": 0.1172, + "epoch": 2.066993784756297, + "grad_norm": 0.7110175490379333, + "learning_rate": 4.708543042403243e-05, + "loss": 0.0642, "step": 7900 }, { - "epoch": 0.517500817795224, - "grad_norm": 1.013592004776001, - "learning_rate": 8.787777777777778e-05, - "loss": 0.1156, + "epoch": 2.06961072947334, + "grad_norm": 0.5228858590126038, + "learning_rate": 4.707573794139003e-05, + "loss": 0.0693, "step": 7910 }, { - "epoch": 0.5181550539744848, - "grad_norm": 1.0565035343170166, - "learning_rate": 8.79888888888889e-05, - "loss": 0.1271, + "epoch": 2.072227674190383, + "grad_norm": 0.4876290261745453, + "learning_rate": 4.706603037018891e-05, + "loss": 0.0619, "step": 7920 }, { - "epoch": 0.5188092901537456, - "grad_norm": 0.8640610575675964, - "learning_rate": 8.81e-05, - "loss": 0.1136, + "epoch": 2.0748446189074254, + "grad_norm": 0.5120744705200195, + "learning_rate": 4.7056307717064094e-05, + "loss": 0.0605, "step": 7930 }, { - "epoch": 0.5194635263330062, - "grad_norm": 0.942816972732544, - "learning_rate": 8.821111111111112e-05, - "loss": 0.114, + "epoch": 2.0774615636244684, + "grad_norm": 0.6547344326972961, + "learning_rate": 4.704656998866094e-05, + "loss": 0.0674, "step": 7940 }, { - "epoch": 0.5201177625122669, - "grad_norm": 1.1157722473144531, - "learning_rate": 8.832222222222223e-05, - "loss": 0.1148, + "epoch": 2.0800785083415114, + "grad_norm": 0.7206751108169556, + "learning_rate": 4.703681719163509e-05, + "loss": 0.0669, "step": 7950 }, { - "epoch": 0.5207719986915277, - "grad_norm": 1.1118143796920776, - "learning_rate": 8.843333333333333e-05, - "loss": 0.1162, + "epoch": 2.082695453058554, + "grad_norm": 0.6250445246696472, + "learning_rate": 4.702704933265248e-05, + "loss": 0.0661, "step": 7960 }, { - "epoch": 0.5214262348707883, - "grad_norm": 1.1708115339279175, - "learning_rate": 8.854444444444445e-05, - "loss": 0.1273, + "epoch": 2.085312397775597, + "grad_norm": 0.39483657479286194, + "learning_rate": 4.701726641838935e-05, + "loss": 0.0721, "step": 7970 }, { - "epoch": 0.5220804710500491, - "grad_norm": 1.163822054862976, - "learning_rate": 8.865555555555556e-05, - "loss": 0.1164, + "epoch": 2.08792934249264, + "grad_norm": 0.8020210266113281, + "learning_rate": 4.700746845553223e-05, + "loss": 0.0721, "step": 7980 }, { - "epoch": 0.5227347072293098, - "grad_norm": 0.847159206867218, - "learning_rate": 8.876666666666668e-05, - "loss": 0.1161, + "epoch": 2.0905462872096825, + "grad_norm": 0.7316377758979797, + "learning_rate": 4.699765545077795e-05, + "loss": 0.0663, "step": 7990 }, { - "epoch": 0.5233889434085705, - "grad_norm": 0.9535149335861206, - "learning_rate": 8.887777777777778e-05, - "loss": 0.1193, + "epoch": 2.0931632319267255, + "grad_norm": 0.6087160706520081, + "learning_rate": 4.698782741083359e-05, + "loss": 0.0655, + "step": 8000 + }, + { + "epoch": 2.0931632319267255, + "eval_loss": 0.06413635517794444, + "eval_runtime": 8.6044, + "eval_samples_per_second": 119.01, + "eval_steps_per_second": 1.86, "step": 8000 }, { - "epoch": 0.5240431795878312, - "grad_norm": 0.9705567359924316, - "learning_rate": 8.898888888888888e-05, - "loss": 0.1217, + "epoch": 2.0957801766437685, + "grad_norm": 0.7263542413711548, + "learning_rate": 4.6977984342416524e-05, + "loss": 0.06, "step": 8010 }, { - "epoch": 0.524697415767092, - "grad_norm": 1.0848075151443481, - "learning_rate": 8.910000000000001e-05, - "loss": 0.1274, + "epoch": 2.0983971213608115, + "grad_norm": 0.6284407377243042, + "learning_rate": 4.696812625225441e-05, + "loss": 0.0647, "step": 8020 }, { - "epoch": 0.5253516519463526, - "grad_norm": 1.0525822639465332, - "learning_rate": 8.921111111111111e-05, - "loss": 0.1471, + "epoch": 2.101014066077854, + "grad_norm": 0.594032347202301, + "learning_rate": 4.695825314708514e-05, + "loss": 0.0657, "step": 8030 }, { - "epoch": 0.5260058881256133, - "grad_norm": 1.1972851753234863, - "learning_rate": 8.932222222222223e-05, - "loss": 0.1224, + "epoch": 2.103631010794897, + "grad_norm": 0.6179496645927429, + "learning_rate": 4.69483650336569e-05, + "loss": 0.0645, "step": 8040 }, { - "epoch": 0.5266601243048741, - "grad_norm": 1.103018045425415, - "learning_rate": 8.943333333333333e-05, - "loss": 0.1312, + "epoch": 2.10624795551194, + "grad_norm": 0.8740331530570984, + "learning_rate": 4.693846191872812e-05, + "loss": 0.0669, "step": 8050 }, { - "epoch": 0.5273143604841348, - "grad_norm": 1.3622784614562988, - "learning_rate": 8.954444444444446e-05, - "loss": 0.12, + "epoch": 2.1088649002289825, + "grad_norm": 0.4465997517108917, + "learning_rate": 4.692854380906748e-05, + "loss": 0.0605, "step": 8060 }, { - "epoch": 0.5279685966633955, - "grad_norm": 1.1434608697891235, - "learning_rate": 8.965555555555556e-05, - "loss": 0.1083, + "epoch": 2.1114818449460255, + "grad_norm": 0.46578449010849, + "learning_rate": 4.6918610711453936e-05, + "loss": 0.0636, "step": 8070 }, { - "epoch": 0.5286228328426562, - "grad_norm": 1.039299726486206, - "learning_rate": 8.976666666666666e-05, - "loss": 0.1276, + "epoch": 2.1140987896630685, + "grad_norm": 0.6705744862556458, + "learning_rate": 4.690866263267664e-05, + "loss": 0.0654, "step": 8080 }, { - "epoch": 0.529277069021917, - "grad_norm": 1.0581910610198975, - "learning_rate": 8.987777777777778e-05, - "loss": 0.1217, + "epoch": 2.116715734380111, + "grad_norm": 0.5965505242347717, + "learning_rate": 4.689869957953502e-05, + "loss": 0.0643, "step": 8090 }, { - "epoch": 0.5299313052011776, - "grad_norm": 1.0329546928405762, - "learning_rate": 8.998888888888889e-05, - "loss": 0.1211, + "epoch": 2.119332679097154, + "grad_norm": 0.5712394714355469, + "learning_rate": 4.688872155883873e-05, + "loss": 0.0656, "step": 8100 }, { - "epoch": 0.5305855413804383, - "grad_norm": 1.1167445182800293, - "learning_rate": 9.010000000000001e-05, - "loss": 0.1129, + "epoch": 2.121949623814197, + "grad_norm": 0.9391905665397644, + "learning_rate": 4.687872857740766e-05, + "loss": 0.0591, "step": 8110 }, { - "epoch": 0.5312397775596991, - "grad_norm": 0.8185030817985535, - "learning_rate": 9.021111111111111e-05, - "loss": 0.1084, + "epoch": 2.1245665685312396, + "grad_norm": 0.769233763217926, + "learning_rate": 4.686872064207191e-05, + "loss": 0.0639, "step": 8120 }, { - "epoch": 0.5318940137389597, - "grad_norm": 0.9566468000411987, - "learning_rate": 9.032222222222223e-05, - "loss": 0.117, + "epoch": 2.1271835132482826, + "grad_norm": 0.7062404751777649, + "learning_rate": 4.6858697759671796e-05, + "loss": 0.0663, "step": 8130 }, { - "epoch": 0.5325482499182205, - "grad_norm": 1.1805304288864136, - "learning_rate": 9.043333333333334e-05, - "loss": 0.1243, + "epoch": 2.1298004579653256, + "grad_norm": 0.59275883436203, + "learning_rate": 4.68486599370579e-05, + "loss": 0.0619, "step": 8140 }, { - "epoch": 0.5332024860974812, - "grad_norm": 1.2267868518829346, - "learning_rate": 9.054444444444446e-05, - "loss": 0.1197, + "epoch": 2.1324174026823686, + "grad_norm": 0.5705215930938721, + "learning_rate": 4.683860718109094e-05, + "loss": 0.0648, "step": 8150 }, { - "epoch": 0.5338567222767419, - "grad_norm": 1.0790307521820068, - "learning_rate": 9.065555555555556e-05, - "loss": 0.114, + "epoch": 2.135034347399411, + "grad_norm": 0.6727226972579956, + "learning_rate": 4.6828539498641913e-05, + "loss": 0.0676, "step": 8160 }, { - "epoch": 0.5345109584560026, - "grad_norm": 0.9891142249107361, - "learning_rate": 9.076666666666667e-05, - "loss": 0.1138, + "epoch": 2.137651292116454, + "grad_norm": 0.5288172364234924, + "learning_rate": 4.6818456896591956e-05, + "loss": 0.0647, "step": 8170 }, { - "epoch": 0.5351651946352634, - "grad_norm": 0.8925236463546753, - "learning_rate": 9.087777777777779e-05, - "loss": 0.1327, + "epoch": 2.140268236833497, + "grad_norm": 0.7117987871170044, + "learning_rate": 4.6808359381832456e-05, + "loss": 0.0589, "step": 8180 }, { - "epoch": 0.535819430814524, - "grad_norm": 1.0021766424179077, - "learning_rate": 9.098888888888889e-05, - "loss": 0.1213, + "epoch": 2.1428851815505396, + "grad_norm": 0.41700848937034607, + "learning_rate": 4.679824696126495e-05, + "loss": 0.062, "step": 8190 }, { - "epoch": 0.5364736669937847, - "grad_norm": 0.9427801966667175, - "learning_rate": 9.11e-05, - "loss": 0.1073, + "epoch": 2.1455021262675826, + "grad_norm": 0.7077834606170654, + "learning_rate": 4.67881196418012e-05, + "loss": 0.0647, "step": 8200 }, { - "epoch": 0.5371279031730455, - "grad_norm": 1.0449068546295166, - "learning_rate": 9.121111111111112e-05, - "loss": 0.1078, + "epoch": 2.1481190709846256, + "grad_norm": 0.4548968970775604, + "learning_rate": 4.677797743036312e-05, + "loss": 0.0592, "step": 8210 }, { - "epoch": 0.5377821393523062, - "grad_norm": 1.049320936203003, - "learning_rate": 9.132222222222224e-05, - "loss": 0.1147, + "epoch": 2.150736015701668, + "grad_norm": 0.6429287791252136, + "learning_rate": 4.6767820333882815e-05, + "loss": 0.0576, "step": 8220 }, { - "epoch": 0.5384363755315669, - "grad_norm": 1.2729623317718506, - "learning_rate": 9.143333333333334e-05, - "loss": 0.1382, + "epoch": 2.153352960418711, + "grad_norm": 0.6636003255844116, + "learning_rate": 4.675764835930258e-05, + "loss": 0.0579, "step": 8230 }, { - "epoch": 0.5390906117108276, - "grad_norm": 0.9762771725654602, - "learning_rate": 9.154444444444444e-05, - "loss": 0.1162, + "epoch": 2.155969905135754, + "grad_norm": 0.5616292953491211, + "learning_rate": 4.6747461513574845e-05, + "loss": 0.0662, "step": 8240 }, { - "epoch": 0.5397448478900884, - "grad_norm": 0.8867229223251343, - "learning_rate": 9.165555555555555e-05, - "loss": 0.1203, + "epoch": 2.1585868498527967, + "grad_norm": 0.4538356065750122, + "learning_rate": 4.6737259803662236e-05, + "loss": 0.0701, "step": 8250 }, { - "epoch": 0.540399084069349, - "grad_norm": 0.9900445938110352, - "learning_rate": 9.176666666666667e-05, - "loss": 0.131, + "epoch": 2.1612037945698397, + "grad_norm": 0.6878339648246765, + "learning_rate": 4.672704323653753e-05, + "loss": 0.0592, "step": 8260 }, { - "epoch": 0.5410533202486097, - "grad_norm": 1.091282844543457, - "learning_rate": 9.187777777777779e-05, - "loss": 0.1243, + "epoch": 2.1638207392868827, + "grad_norm": 0.586982786655426, + "learning_rate": 4.671681181918363e-05, + "loss": 0.0652, "step": 8270 }, { - "epoch": 0.5417075564278705, - "grad_norm": 0.931505024433136, - "learning_rate": 9.198888888888889e-05, - "loss": 0.1051, + "epoch": 2.1664376840039252, + "grad_norm": 0.5167317390441895, + "learning_rate": 4.670656555859364e-05, + "loss": 0.064, "step": 8280 }, { - "epoch": 0.5423617926071311, - "grad_norm": 0.9861153960227966, - "learning_rate": 9.21e-05, - "loss": 0.1134, + "epoch": 2.169054628720968, + "grad_norm": 0.42640289664268494, + "learning_rate": 4.6696304461770765e-05, + "loss": 0.0645, "step": 8290 }, { - "epoch": 0.5430160287863919, - "grad_norm": 0.8883926272392273, - "learning_rate": 9.221111111111112e-05, - "loss": 0.1148, + "epoch": 2.171671573438011, + "grad_norm": 0.41746556758880615, + "learning_rate": 4.668602853572838e-05, + "loss": 0.0609, "step": 8300 }, { - "epoch": 0.5436702649656526, - "grad_norm": 0.9315553903579712, - "learning_rate": 9.232222222222223e-05, - "loss": 0.1204, + "epoch": 2.1742885181550538, + "grad_norm": 0.7875781059265137, + "learning_rate": 4.667573778748997e-05, + "loss": 0.0604, "step": 8310 }, { - "epoch": 0.5443245011449133, - "grad_norm": 0.8906738758087158, - "learning_rate": 9.243333333333333e-05, - "loss": 0.1149, + "epoch": 2.1769054628720967, + "grad_norm": 0.9152212738990784, + "learning_rate": 4.6665432224089176e-05, + "loss": 0.063, "step": 8320 }, { - "epoch": 0.544978737324174, - "grad_norm": 1.268044114112854, - "learning_rate": 9.254444444444445e-05, - "loss": 0.1198, + "epoch": 2.1795224075891397, + "grad_norm": 0.7127177715301514, + "learning_rate": 4.6655111852569754e-05, + "loss": 0.0668, "step": 8330 }, { - "epoch": 0.5456329735034348, - "grad_norm": 0.9080297946929932, - "learning_rate": 9.265555555555557e-05, - "loss": 0.1209, + "epoch": 2.1821393523061827, + "grad_norm": 0.5886343717575073, + "learning_rate": 4.664477667998557e-05, + "loss": 0.0617, "step": 8340 }, { - "epoch": 0.5462872096826955, - "grad_norm": 0.8843944072723389, - "learning_rate": 9.276666666666667e-05, - "loss": 0.1144, + "epoch": 2.1847562970232253, + "grad_norm": 0.4000236988067627, + "learning_rate": 4.6634426713400625e-05, + "loss": 0.0554, "step": 8350 }, { - "epoch": 0.5469414458619561, - "grad_norm": 1.117506980895996, - "learning_rate": 9.287777777777778e-05, - "loss": 0.1256, + "epoch": 2.1873732417402683, + "grad_norm": 0.6061784625053406, + "learning_rate": 4.662406195988903e-05, + "loss": 0.0614, "step": 8360 }, { - "epoch": 0.5475956820412169, - "grad_norm": 0.9390103816986084, - "learning_rate": 9.29888888888889e-05, - "loss": 0.1206, + "epoch": 2.1899901864573112, + "grad_norm": 0.5097105503082275, + "learning_rate": 4.6613682426534975e-05, + "loss": 0.0573, "step": 8370 }, { - "epoch": 0.5482499182204776, - "grad_norm": 1.0785554647445679, - "learning_rate": 9.310000000000001e-05, - "loss": 0.1265, + "epoch": 2.192607131174354, + "grad_norm": 0.6182939410209656, + "learning_rate": 4.66032881204328e-05, + "loss": 0.0626, "step": 8380 }, { - "epoch": 0.5489041543997383, - "grad_norm": 1.078389048576355, - "learning_rate": 9.321111111111112e-05, - "loss": 0.1207, + "epoch": 2.195224075891397, + "grad_norm": 0.5252171158790588, + "learning_rate": 4.6592879048686886e-05, + "loss": 0.0637, "step": 8390 }, { - "epoch": 0.549558390578999, - "grad_norm": 0.811019241809845, - "learning_rate": 9.332222222222222e-05, - "loss": 0.1087, + "epoch": 2.1978410206084398, + "grad_norm": 0.6885454058647156, + "learning_rate": 4.6582455218411755e-05, + "loss": 0.064, "step": 8400 }, { - "epoch": 0.5502126267582598, - "grad_norm": 0.9671462774276733, - "learning_rate": 9.343333333333335e-05, - "loss": 0.123, + "epoch": 2.2004579653254823, + "grad_norm": 0.6499890685081482, + "learning_rate": 4.6572016636732e-05, + "loss": 0.064, "step": 8410 }, { - "epoch": 0.5508668629375204, - "grad_norm": 0.9795189499855042, - "learning_rate": 9.354444444444445e-05, - "loss": 0.1119, + "epoch": 2.2030749100425253, + "grad_norm": 0.3553354740142822, + "learning_rate": 4.656156331078229e-05, + "loss": 0.062, "step": 8420 }, { - "epoch": 0.5515210991167812, - "grad_norm": 1.1247087717056274, - "learning_rate": 9.365555555555556e-05, - "loss": 0.129, + "epoch": 2.2056918547595683, + "grad_norm": 0.711338460445404, + "learning_rate": 4.6551095247707354e-05, + "loss": 0.0612, "step": 8430 }, { - "epoch": 0.5521753352960419, - "grad_norm": 1.5826585292816162, - "learning_rate": 9.376666666666666e-05, - "loss": 0.1226, + "epoch": 2.208308799476611, + "grad_norm": 0.7364850044250488, + "learning_rate": 4.6540612454662044e-05, + "loss": 0.0631, "step": 8440 }, { - "epoch": 0.5528295714753025, - "grad_norm": 1.0107616186141968, - "learning_rate": 9.38777777777778e-05, - "loss": 0.1237, + "epoch": 2.210925744193654, + "grad_norm": 0.5707940459251404, + "learning_rate": 4.653011493881123e-05, + "loss": 0.0714, "step": 8450 }, { - "epoch": 0.5534838076545633, - "grad_norm": 0.9465190172195435, - "learning_rate": 9.39888888888889e-05, - "loss": 0.1169, + "epoch": 2.213542688910697, + "grad_norm": 0.8141592144966125, + "learning_rate": 4.651960270732987e-05, + "loss": 0.0649, "step": 8460 }, { - "epoch": 0.554138043833824, - "grad_norm": 1.0885943174362183, - "learning_rate": 9.41e-05, - "loss": 0.1235, + "epoch": 2.21615963362774, + "grad_norm": 0.6246472597122192, + "learning_rate": 4.650907576740299e-05, + "loss": 0.0592, "step": 8470 }, { - "epoch": 0.5547922800130847, - "grad_norm": 1.2744386196136475, - "learning_rate": 9.421111111111111e-05, - "loss": 0.1282, + "epoch": 2.2187765783447824, + "grad_norm": 0.5534509420394897, + "learning_rate": 4.649853412622563e-05, + "loss": 0.0614, "step": 8480 }, { - "epoch": 0.5554465161923454, - "grad_norm": 1.011501669883728, - "learning_rate": 9.432222222222223e-05, - "loss": 0.1157, + "epoch": 2.2213935230618254, + "grad_norm": 1.4337730407714844, + "learning_rate": 4.6487977791002914e-05, + "loss": 0.0613, "step": 8490 }, { - "epoch": 0.5561007523716062, - "grad_norm": 0.9409123063087463, - "learning_rate": 9.443333333333334e-05, - "loss": 0.1171, + "epoch": 2.2240104677788683, + "grad_norm": 0.5923467874526978, + "learning_rate": 4.647740676895001e-05, + "loss": 0.0657, "step": 8500 }, { - "epoch": 0.5567549885508669, - "grad_norm": 0.8566303253173828, - "learning_rate": 9.454444444444444e-05, - "loss": 0.1134, + "epoch": 2.226627412495911, + "grad_norm": 0.7094632983207703, + "learning_rate": 4.646682106729208e-05, + "loss": 0.067, "step": 8510 }, { - "epoch": 0.5574092247301276, - "grad_norm": 1.0621939897537231, - "learning_rate": 9.465555555555556e-05, - "loss": 0.1213, + "epoch": 2.229244357212954, + "grad_norm": 0.5530036091804504, + "learning_rate": 4.645622069326439e-05, + "loss": 0.0584, "step": 8520 }, { - "epoch": 0.5580634609093883, - "grad_norm": 0.7857542037963867, - "learning_rate": 9.476666666666668e-05, - "loss": 0.1287, + "epoch": 2.231861301929997, + "grad_norm": 0.5729954838752747, + "learning_rate": 4.6445605654112156e-05, + "loss": 0.064, "step": 8530 }, { - "epoch": 0.558717697088649, - "grad_norm": 1.0616250038146973, - "learning_rate": 9.487777777777779e-05, - "loss": 0.1235, + "epoch": 2.2344782466470394, + "grad_norm": 0.736095130443573, + "learning_rate": 4.6434975957090686e-05, + "loss": 0.0708, "step": 8540 }, { - "epoch": 0.5593719332679097, - "grad_norm": 1.1893281936645508, - "learning_rate": 9.498888888888889e-05, - "loss": 0.1181, + "epoch": 2.2370951913640824, + "grad_norm": 0.5560267567634583, + "learning_rate": 4.642433160946528e-05, + "loss": 0.0572, "step": 8550 }, { - "epoch": 0.5600261694471704, - "grad_norm": 1.0680195093154907, - "learning_rate": 9.51e-05, - "loss": 0.1144, + "epoch": 2.2397121360811254, + "grad_norm": 0.7600628733634949, + "learning_rate": 4.641367261851122e-05, + "loss": 0.0676, "step": 8560 }, { - "epoch": 0.5606804056264312, - "grad_norm": 1.200814962387085, - "learning_rate": 9.521111111111112e-05, - "loss": 0.1286, + "epoch": 2.242329080798168, + "grad_norm": 0.8181315660476685, + "learning_rate": 4.6402998991513855e-05, + "loss": 0.0745, "step": 8570 }, { - "epoch": 0.5613346418056918, - "grad_norm": 1.1205694675445557, - "learning_rate": 9.532222222222222e-05, - "loss": 0.1192, + "epoch": 2.244946025515211, + "grad_norm": 0.6807851791381836, + "learning_rate": 4.6392310735768495e-05, + "loss": 0.0576, "step": 8580 }, { - "epoch": 0.5619888779849526, - "grad_norm": 0.915143609046936, - "learning_rate": 9.543333333333334e-05, - "loss": 0.1227, + "epoch": 2.247562970232254, + "grad_norm": 0.6910809278488159, + "learning_rate": 4.638160785858047e-05, + "loss": 0.0641, "step": 8590 }, { - "epoch": 0.5626431141642133, - "grad_norm": 0.9382200241088867, - "learning_rate": 9.554444444444444e-05, - "loss": 0.1272, + "epoch": 2.250179914949297, + "grad_norm": 0.7639349102973938, + "learning_rate": 4.637089036726508e-05, + "loss": 0.0634, "step": 8600 }, { - "epoch": 0.563297350343474, - "grad_norm": 0.8184203505516052, - "learning_rate": 9.565555555555557e-05, - "loss": 0.1188, + "epoch": 2.2527968596663395, + "grad_norm": 0.6723827123641968, + "learning_rate": 4.636015826914765e-05, + "loss": 0.0658, "step": 8610 }, { - "epoch": 0.5639515865227347, - "grad_norm": 0.9451429843902588, - "learning_rate": 9.576666666666667e-05, - "loss": 0.1126, + "epoch": 2.2554138043833825, + "grad_norm": 0.5631839036941528, + "learning_rate": 4.634941157156345e-05, + "loss": 0.0585, "step": 8620 }, { - "epoch": 0.5646058227019954, - "grad_norm": 0.9278766512870789, - "learning_rate": 9.587777777777777e-05, - "loss": 0.113, + "epoch": 2.258030749100425, + "grad_norm": 0.5604969263076782, + "learning_rate": 4.6338650281857756e-05, + "loss": 0.0636, "step": 8630 }, { - "epoch": 0.5652600588812562, - "grad_norm": 0.9145293831825256, - "learning_rate": 9.598888888888889e-05, - "loss": 0.1229, + "epoch": 2.260647693817468, + "grad_norm": 0.6197567582130432, + "learning_rate": 4.6327874407385805e-05, + "loss": 0.0615, "step": 8640 }, { - "epoch": 0.5659142950605168, - "grad_norm": 0.8851084113121033, - "learning_rate": 9.61e-05, - "loss": 0.1117, + "epoch": 2.263264638534511, + "grad_norm": 0.5681231617927551, + "learning_rate": 4.631708395551281e-05, + "loss": 0.0673, "step": 8650 }, { - "epoch": 0.5665685312397776, - "grad_norm": 0.9469485878944397, - "learning_rate": 9.621111111111112e-05, - "loss": 0.1198, + "epoch": 2.265881583251554, + "grad_norm": 0.5107356905937195, + "learning_rate": 4.630627893361393e-05, + "loss": 0.0592, "step": 8660 }, { - "epoch": 0.5672227674190383, - "grad_norm": 0.9359365105628967, - "learning_rate": 9.632222222222222e-05, - "loss": 0.1169, + "epoch": 2.2684985279685965, + "grad_norm": 0.7415857315063477, + "learning_rate": 4.629545934907432e-05, + "loss": 0.066, "step": 8670 }, { - "epoch": 0.567877003598299, - "grad_norm": 1.2340917587280273, - "learning_rate": 9.643333333333334e-05, - "loss": 0.1279, + "epoch": 2.2711154726856395, + "grad_norm": 0.527061402797699, + "learning_rate": 4.6284625209289037e-05, + "loss": 0.0675, "step": 8680 }, { - "epoch": 0.5685312397775597, - "grad_norm": 0.9072956442832947, - "learning_rate": 9.654444444444445e-05, - "loss": 0.1193, + "epoch": 2.2737324174026825, + "grad_norm": 0.626235842704773, + "learning_rate": 4.627377652166313e-05, + "loss": 0.0561, "step": 8690 }, { - "epoch": 0.5691854759568205, - "grad_norm": 0.9480854868888855, - "learning_rate": 9.665555555555555e-05, - "loss": 0.1143, + "epoch": 2.276349362119725, + "grad_norm": 0.5250177979469299, + "learning_rate": 4.6262913293611567e-05, + "loss": 0.0543, "step": 8700 }, { - "epoch": 0.5698397121360811, - "grad_norm": 0.9658315181732178, - "learning_rate": 9.676666666666667e-05, - "loss": 0.125, + "epoch": 2.278966306836768, + "grad_norm": 0.6569236516952515, + "learning_rate": 4.6252035532559266e-05, + "loss": 0.065, "step": 8710 }, { - "epoch": 0.5704939483153418, - "grad_norm": 1.0352250337600708, - "learning_rate": 9.687777777777778e-05, - "loss": 0.1239, + "epoch": 2.281583251553811, + "grad_norm": 0.8803039193153381, + "learning_rate": 4.6241143245941076e-05, + "loss": 0.065, "step": 8720 }, { - "epoch": 0.5711481844946026, - "grad_norm": 1.0940804481506348, - "learning_rate": 9.69888888888889e-05, - "loss": 0.1283, + "epoch": 2.284200196270854, + "grad_norm": 0.7301013469696045, + "learning_rate": 4.623023644120177e-05, + "loss": 0.0571, "step": 8730 }, { - "epoch": 0.5718024206738632, - "grad_norm": 0.9814597964286804, - "learning_rate": 9.71e-05, - "loss": 0.1155, + "epoch": 2.2868171409878966, + "grad_norm": 0.44893473386764526, + "learning_rate": 4.621931512579604e-05, + "loss": 0.0664, "step": 8740 }, { - "epoch": 0.572456656853124, - "grad_norm": 1.007461667060852, - "learning_rate": 9.721111111111112e-05, - "loss": 0.122, + "epoch": 2.2894340857049396, + "grad_norm": 0.6927379965782166, + "learning_rate": 4.620837930718852e-05, + "loss": 0.0591, "step": 8750 }, { - "epoch": 0.5731108930323847, - "grad_norm": 0.9953462481498718, - "learning_rate": 9.732222222222222e-05, - "loss": 0.1322, + "epoch": 2.292051030421982, + "grad_norm": 0.4672520160675049, + "learning_rate": 4.619742899285371e-05, + "loss": 0.0572, "step": 8760 }, { - "epoch": 0.5737651292116454, - "grad_norm": 0.9330369830131531, - "learning_rate": 9.743333333333335e-05, - "loss": 0.1129, + "epoch": 2.294667975139025, + "grad_norm": 0.9525076150894165, + "learning_rate": 4.6186464190276076e-05, + "loss": 0.0697, "step": 8770 }, { - "epoch": 0.5744193653909061, - "grad_norm": 0.9782475233078003, - "learning_rate": 9.754444444444445e-05, - "loss": 0.142, + "epoch": 2.297284919856068, + "grad_norm": 0.5259708166122437, + "learning_rate": 4.617548490694994e-05, + "loss": 0.0638, "step": 8780 }, { - "epoch": 0.5750736015701668, - "grad_norm": 1.0962268114089966, - "learning_rate": 9.765555555555555e-05, - "loss": 0.1143, + "epoch": 2.299901864573111, + "grad_norm": 0.53282231092453, + "learning_rate": 4.616449115037954e-05, + "loss": 0.0666, "step": 8790 }, { - "epoch": 0.5757278377494276, - "grad_norm": 1.130919337272644, - "learning_rate": 9.776666666666667e-05, - "loss": 0.1282, + "epoch": 2.3025188092901536, + "grad_norm": 0.6173849105834961, + "learning_rate": 4.6153482928079006e-05, + "loss": 0.0585, "step": 8800 }, { - "epoch": 0.5763820739286882, - "grad_norm": 0.9477802515029907, - "learning_rate": 9.787777777777778e-05, - "loss": 0.12, + "epoch": 2.3051357540071966, + "grad_norm": 0.6748325228691101, + "learning_rate": 4.614246024757237e-05, + "loss": 0.0545, "step": 8810 }, { - "epoch": 0.577036310107949, - "grad_norm": 1.0506799221038818, - "learning_rate": 9.79888888888889e-05, - "loss": 0.1187, + "epoch": 2.3077526987242396, + "grad_norm": 0.8505156636238098, + "learning_rate": 4.61314231163935e-05, + "loss": 0.065, "step": 8820 }, { - "epoch": 0.5776905462872097, - "grad_norm": 0.8452275991439819, - "learning_rate": 9.81e-05, - "loss": 0.1176, + "epoch": 2.310369643441282, + "grad_norm": 0.6660551428794861, + "learning_rate": 4.612037154208619e-05, + "loss": 0.0623, "step": 8830 }, { - "epoch": 0.5783447824664704, - "grad_norm": 1.2657310962677002, - "learning_rate": 9.821111111111111e-05, - "loss": 0.1319, + "epoch": 2.312986588158325, + "grad_norm": 0.43942147493362427, + "learning_rate": 4.610930553220409e-05, + "loss": 0.0636, "step": 8840 }, { - "epoch": 0.5789990186457311, - "grad_norm": 0.8468209505081177, - "learning_rate": 9.832222222222223e-05, - "loss": 0.1193, + "epoch": 2.315603532875368, + "grad_norm": 0.7198824286460876, + "learning_rate": 4.609822509431071e-05, + "loss": 0.0681, "step": 8850 }, { - "epoch": 0.5796532548249919, - "grad_norm": 1.0563205480575562, - "learning_rate": 9.843333333333333e-05, - "loss": 0.127, + "epoch": 2.3182204775924107, + "grad_norm": 1.167868971824646, + "learning_rate": 4.608713023597941e-05, + "loss": 0.0617, "step": 8860 }, { - "epoch": 0.5803074910042525, - "grad_norm": 0.9747239351272583, - "learning_rate": 9.854444444444445e-05, - "loss": 0.1217, + "epoch": 2.3208374223094537, + "grad_norm": 0.40410086512565613, + "learning_rate": 4.607602096479345e-05, + "loss": 0.0652, "step": 8870 }, { - "epoch": 0.5809617271835132, - "grad_norm": 1.1098183393478394, - "learning_rate": 9.865555555555556e-05, - "loss": 0.1313, + "epoch": 2.3234543670264967, + "grad_norm": 0.8580119013786316, + "learning_rate": 4.606489728834589e-05, + "loss": 0.0601, "step": 8880 }, { - "epoch": 0.581615963362774, - "grad_norm": 1.0163518190383911, - "learning_rate": 9.876666666666668e-05, - "loss": 0.1259, + "epoch": 2.326071311743539, + "grad_norm": 0.5383431315422058, + "learning_rate": 4.6053759214239654e-05, + "loss": 0.0653, "step": 8890 }, { - "epoch": 0.5822701995420346, - "grad_norm": 1.0436562299728394, - "learning_rate": 9.887777777777778e-05, - "loss": 0.1302, + "epoch": 2.328688256460582, + "grad_norm": 0.8677040934562683, + "learning_rate": 4.604260675008753e-05, + "loss": 0.0604, "step": 8900 }, { - "epoch": 0.5829244357212954, - "grad_norm": 0.9520934820175171, - "learning_rate": 9.89888888888889e-05, - "loss": 0.1343, + "epoch": 2.331305201177625, + "grad_norm": 0.5791143178939819, + "learning_rate": 4.603143990351211e-05, + "loss": 0.0634, "step": 8910 }, { - "epoch": 0.5835786719005561, - "grad_norm": 1.1144458055496216, - "learning_rate": 9.910000000000001e-05, - "loss": 0.1188, + "epoch": 2.333922145894668, + "grad_norm": 1.2247254848480225, + "learning_rate": 4.602025868214583e-05, + "loss": 0.0683, "step": 8920 }, { - "epoch": 0.5842329080798168, - "grad_norm": 0.8635526299476624, - "learning_rate": 9.921111111111113e-05, - "loss": 0.1263, + "epoch": 2.3365390906117107, + "grad_norm": 0.6355758905410767, + "learning_rate": 4.600906309363095e-05, + "loss": 0.0747, "step": 8930 }, { - "epoch": 0.5848871442590775, - "grad_norm": 0.9793311357498169, - "learning_rate": 9.932222222222223e-05, - "loss": 0.1183, + "epoch": 2.3391560353287537, + "grad_norm": 0.5947376489639282, + "learning_rate": 4.599785314561955e-05, + "loss": 0.0615, "step": 8940 }, { - "epoch": 0.5855413804383383, - "grad_norm": 1.0002834796905518, - "learning_rate": 9.943333333333333e-05, - "loss": 0.1212, + "epoch": 2.3417729800457967, + "grad_norm": 0.5728808045387268, + "learning_rate": 4.598662884577352e-05, + "loss": 0.0636, "step": 8950 }, { - "epoch": 0.586195616617599, - "grad_norm": 1.0948991775512695, - "learning_rate": 9.954444444444446e-05, - "loss": 0.1257, + "epoch": 2.3443899247628393, + "grad_norm": 0.4856645464897156, + "learning_rate": 4.597539020176457e-05, + "loss": 0.0632, "step": 8960 }, { - "epoch": 0.5868498527968596, - "grad_norm": 1.0232096910476685, - "learning_rate": 9.965555555555556e-05, - "loss": 0.1238, + "epoch": 2.3470068694798822, + "grad_norm": 1.0914742946624756, + "learning_rate": 4.5964137221274195e-05, + "loss": 0.0693, "step": 8970 }, { - "epoch": 0.5875040889761204, - "grad_norm": 0.8366304039955139, - "learning_rate": 9.976666666666667e-05, - "loss": 0.1203, + "epoch": 2.3496238141969252, + "grad_norm": 0.5342974066734314, + "learning_rate": 4.595286991199372e-05, + "loss": 0.0624, "step": 8980 }, { - "epoch": 0.5881583251553811, - "grad_norm": 1.152336835861206, - "learning_rate": 9.987777777777778e-05, - "loss": 0.1273, + "epoch": 2.3522407589139678, + "grad_norm": 0.7614314556121826, + "learning_rate": 4.5941588281624226e-05, + "loss": 0.0677, "step": 8990 }, { - "epoch": 0.5888125613346418, - "grad_norm": 0.9613927006721497, - "learning_rate": 9.998888888888889e-05, - "loss": 0.1119, + "epoch": 2.3548577036310108, + "grad_norm": 0.4510788023471832, + "learning_rate": 4.593029233787661e-05, + "loss": 0.0646, + "step": 9000 + }, + { + "epoch": 2.3548577036310108, + "eval_loss": 0.06636923542647824, + "eval_runtime": 8.6164, + "eval_samples_per_second": 118.843, + "eval_steps_per_second": 1.857, "step": 9000 }, { - "epoch": 0.5894667975139025, - "grad_norm": 0.9294191598892212, - "learning_rate": 9.99999993165094e-05, - "loss": 0.1197, + "epoch": 2.3574746483480538, + "grad_norm": 0.5777572989463806, + "learning_rate": 4.5918982088471544e-05, + "loss": 0.0708, "step": 9010 }, { - "epoch": 0.5901210336931633, - "grad_norm": 1.0190068483352661, - "learning_rate": 9.999999695382584e-05, - "loss": 0.119, + "epoch": 2.3600915930650963, + "grad_norm": 0.6823755502700806, + "learning_rate": 4.5907657541139484e-05, + "loss": 0.0645, "step": 9020 }, { - "epoch": 0.5907752698724239, - "grad_norm": 0.9870291948318481, - "learning_rate": 9.999999290351126e-05, - "loss": 0.1316, + "epoch": 2.3627085377821393, + "grad_norm": 0.5664687156677246, + "learning_rate": 4.5896318703620626e-05, + "loss": 0.0686, "step": 9030 }, { - "epoch": 0.5914295060516847, - "grad_norm": 1.160510778427124, - "learning_rate": 9.999998716556578e-05, - "loss": 0.1278, + "epoch": 2.3653254824991823, + "grad_norm": 0.5852131247520447, + "learning_rate": 4.588496558366498e-05, + "loss": 0.0567, "step": 9040 }, { - "epoch": 0.5920837422309454, - "grad_norm": 0.9017443656921387, - "learning_rate": 9.99999797399896e-05, - "loss": 0.1154, + "epoch": 2.3679424272162253, + "grad_norm": 0.5248322486877441, + "learning_rate": 4.58735981890323e-05, + "loss": 0.058, "step": 9050 }, { - "epoch": 0.592737978410206, - "grad_norm": 1.122379183769226, - "learning_rate": 9.999997062678298e-05, - "loss": 0.1227, + "epoch": 2.370559371933268, + "grad_norm": 0.46559926867485046, + "learning_rate": 4.586221652749207e-05, + "loss": 0.0648, "step": 9060 }, { - "epoch": 0.5933922145894668, - "grad_norm": 0.9526849985122681, - "learning_rate": 9.999995982594624e-05, - "loss": 0.1196, + "epoch": 2.373176316650311, + "grad_norm": 0.6136702299118042, + "learning_rate": 4.585082060682357e-05, + "loss": 0.0611, "step": 9070 }, { - "epoch": 0.5940464507687275, - "grad_norm": 1.065383791923523, - "learning_rate": 9.999994733747969e-05, - "loss": 0.1252, + "epoch": 2.375793261367354, + "grad_norm": 0.729214072227478, + "learning_rate": 4.583941043481579e-05, + "loss": 0.0652, "step": 9080 }, { - "epoch": 0.5947006869479883, - "grad_norm": 1.0271326303482056, - "learning_rate": 9.99999331613838e-05, - "loss": 0.1256, + "epoch": 2.3784102060843963, + "grad_norm": 0.5812839269638062, + "learning_rate": 4.5827986019267496e-05, + "loss": 0.0608, "step": 9090 }, { - "epoch": 0.5953549231272489, - "grad_norm": 0.8906413316726685, - "learning_rate": 9.999991729765906e-05, - "loss": 0.1383, + "epoch": 2.3810271508014393, + "grad_norm": 0.6233426928520203, + "learning_rate": 4.581654736798714e-05, + "loss": 0.0654, "step": 9100 }, { - "epoch": 0.5960091593065097, - "grad_norm": 1.0225988626480103, - "learning_rate": 9.999989974630596e-05, - "loss": 0.1247, + "epoch": 2.3836440955184823, + "grad_norm": 0.7243742942810059, + "learning_rate": 4.5805094488792956e-05, + "loss": 0.0563, "step": 9110 }, { - "epoch": 0.5966633954857704, - "grad_norm": 0.9085574746131897, - "learning_rate": 9.999988050732512e-05, - "loss": 0.1344, + "epoch": 2.386261040235525, + "grad_norm": 0.7165740728378296, + "learning_rate": 4.579362738951286e-05, + "loss": 0.0666, "step": 9120 }, { - "epoch": 0.597317631665031, - "grad_norm": 0.9044840931892395, - "learning_rate": 9.999985958071718e-05, - "loss": 0.1308, + "epoch": 2.388877984952568, + "grad_norm": 0.6373894810676575, + "learning_rate": 4.5782146077984523e-05, + "loss": 0.0639, "step": 9130 }, { - "epoch": 0.5979718678442918, - "grad_norm": 0.9104439616203308, - "learning_rate": 9.999983696648286e-05, - "loss": 0.1253, + "epoch": 2.391494929669611, + "grad_norm": 0.4645223319530487, + "learning_rate": 4.577065056205531e-05, + "loss": 0.0592, "step": 9140 }, { - "epoch": 0.5986261040235525, - "grad_norm": 0.998406708240509, - "learning_rate": 9.99998126646229e-05, - "loss": 0.1134, + "epoch": 2.3941118743866534, + "grad_norm": 0.6769869327545166, + "learning_rate": 4.5759140849582276e-05, + "loss": 0.0659, "step": 9150 }, { - "epoch": 0.5992803402028132, - "grad_norm": 1.0714468955993652, - "learning_rate": 9.999978667513815e-05, - "loss": 0.1238, + "epoch": 2.3967288191036964, + "grad_norm": 0.6146237850189209, + "learning_rate": 4.574761694843222e-05, + "loss": 0.0603, "step": 9160 }, { - "epoch": 0.5999345763820739, - "grad_norm": 0.9690518975257874, - "learning_rate": 9.999975899802944e-05, - "loss": 0.1192, + "epoch": 2.3993457638207394, + "grad_norm": 0.7659674882888794, + "learning_rate": 4.5736078866481634e-05, + "loss": 0.0601, "step": 9170 }, { - "epoch": 0.6005888125613347, - "grad_norm": 0.9514327645301819, - "learning_rate": 9.999972963329775e-05, - "loss": 0.1211, + "epoch": 2.4019627085377824, + "grad_norm": 0.8440955281257629, + "learning_rate": 4.572452661161667e-05, + "loss": 0.059, "step": 9180 }, { - "epoch": 0.6012430487405953, - "grad_norm": 0.9661425948143005, - "learning_rate": 9.999969858094407e-05, - "loss": 0.1241, + "epoch": 2.404579653254825, + "grad_norm": 0.6863587498664856, + "learning_rate": 4.571296019173318e-05, + "loss": 0.0648, "step": 9190 }, { - "epoch": 0.6018972849198561, - "grad_norm": 1.0832891464233398, - "learning_rate": 9.999966584096941e-05, - "loss": 0.1271, + "epoch": 2.407196597971868, + "grad_norm": 0.41940245032310486, + "learning_rate": 4.5701379614736715e-05, + "loss": 0.0717, "step": 9200 }, { - "epoch": 0.6025515210991168, - "grad_norm": 1.053916573524475, - "learning_rate": 9.999963141337492e-05, - "loss": 0.1342, + "epoch": 2.4098135426889105, + "grad_norm": 0.9120506048202515, + "learning_rate": 4.568978488854248e-05, + "loss": 0.0664, "step": 9210 }, { - "epoch": 0.6032057572783774, - "grad_norm": 0.9167242646217346, - "learning_rate": 9.999959529816173e-05, - "loss": 0.1221, + "epoch": 2.4124304874059534, + "grad_norm": 0.38293400406837463, + "learning_rate": 4.567817602107537e-05, + "loss": 0.0611, "step": 9220 }, { - "epoch": 0.6038599934576382, - "grad_norm": 0.8978278636932373, - "learning_rate": 9.999955749533107e-05, - "loss": 0.1217, + "epoch": 2.4150474321229964, + "grad_norm": 0.8800689578056335, + "learning_rate": 4.566655302026993e-05, + "loss": 0.0677, "step": 9230 }, { - "epoch": 0.6045142296368989, - "grad_norm": 0.9800412654876709, - "learning_rate": 9.999951800488422e-05, - "loss": 0.1178, + "epoch": 2.4176643768400394, + "grad_norm": 0.5821717381477356, + "learning_rate": 4.5654915894070384e-05, + "loss": 0.0567, "step": 9240 }, { - "epoch": 0.6051684658161597, - "grad_norm": 0.9557292461395264, - "learning_rate": 9.999947682682251e-05, - "loss": 0.1342, + "epoch": 2.420281321557082, + "grad_norm": 0.6287429928779602, + "learning_rate": 4.564326465043058e-05, + "loss": 0.0649, "step": 9250 }, { - "epoch": 0.6058227019954203, - "grad_norm": 0.9261611104011536, - "learning_rate": 9.999943396114732e-05, - "loss": 0.1208, + "epoch": 2.422898266274125, + "grad_norm": 0.3861464560031891, + "learning_rate": 4.563159929731404e-05, + "loss": 0.0621, "step": 9260 }, { - "epoch": 0.6064769381746811, - "grad_norm": 0.8848803639411926, - "learning_rate": 9.999938940786011e-05, - "loss": 0.1287, + "epoch": 2.425515210991168, + "grad_norm": 0.9000930190086365, + "learning_rate": 4.5619919842693935e-05, + "loss": 0.0663, "step": 9270 }, { - "epoch": 0.6071311743539418, - "grad_norm": 0.818735659122467, - "learning_rate": 9.999934316696238e-05, - "loss": 0.1135, + "epoch": 2.4281321557082105, + "grad_norm": 0.7210332155227661, + "learning_rate": 4.5608226294553044e-05, + "loss": 0.0635, "step": 9280 }, { - "epoch": 0.6077854105332025, - "grad_norm": 1.051778793334961, - "learning_rate": 9.99992952384557e-05, - "loss": 0.1175, + "epoch": 2.4307491004252535, + "grad_norm": 1.0638445615768433, + "learning_rate": 4.559651866088381e-05, + "loss": 0.0595, "step": 9290 }, { - "epoch": 0.6084396467124632, - "grad_norm": 0.9308475255966187, - "learning_rate": 9.999924562234167e-05, - "loss": 0.1114, + "epoch": 2.4333660451422965, + "grad_norm": 0.6221959590911865, + "learning_rate": 4.558479694968828e-05, + "loss": 0.0697, "step": 9300 }, { - "epoch": 0.609093882891724, - "grad_norm": 1.1484242677688599, - "learning_rate": 9.999919431862197e-05, - "loss": 0.1254, + "epoch": 2.435982989859339, + "grad_norm": 0.46246451139450073, + "learning_rate": 4.557306116897814e-05, + "loss": 0.0624, "step": 9310 }, { - "epoch": 0.6097481190709846, - "grad_norm": 0.9219973683357239, - "learning_rate": 9.999914132729832e-05, - "loss": 0.1187, + "epoch": 2.438599934576382, + "grad_norm": 0.43070271611213684, + "learning_rate": 4.556131132677468e-05, + "loss": 0.0614, "step": 9320 }, { - "epoch": 0.6104023552502453, - "grad_norm": 0.9428659677505493, - "learning_rate": 9.999908664837255e-05, - "loss": 0.1182, + "epoch": 2.441216879293425, + "grad_norm": 0.44614386558532715, + "learning_rate": 4.554954743110881e-05, + "loss": 0.0582, "step": 9330 }, { - "epoch": 0.6110565914295061, - "grad_norm": 0.8808643221855164, - "learning_rate": 9.999903028184646e-05, - "loss": 0.1195, + "epoch": 2.4438338240104676, + "grad_norm": 0.6269716024398804, + "learning_rate": 4.553776949002104e-05, + "loss": 0.0694, "step": 9340 }, { - "epoch": 0.6117108276087667, - "grad_norm": 1.1182430982589722, - "learning_rate": 9.999897222772198e-05, - "loss": 0.1329, + "epoch": 2.4464507687275105, + "grad_norm": 0.7049176692962646, + "learning_rate": 4.552597751156149e-05, + "loss": 0.0636, "step": 9350 }, { - "epoch": 0.6123650637880275, - "grad_norm": 1.0213252305984497, - "learning_rate": 9.999891248600107e-05, - "loss": 0.1229, + "epoch": 2.4490677134445535, + "grad_norm": 0.6448350548744202, + "learning_rate": 4.551417150378986e-05, + "loss": 0.0635, "step": 9360 }, { - "epoch": 0.6130192999672882, - "grad_norm": 1.048762321472168, - "learning_rate": 9.999885105668571e-05, - "loss": 0.1213, + "epoch": 2.4516846581615965, + "grad_norm": 0.5977907180786133, + "learning_rate": 4.550235147477544e-05, + "loss": 0.0566, "step": 9370 }, { - "epoch": 0.613673536146549, - "grad_norm": 0.9131613969802856, - "learning_rate": 9.999878793977801e-05, - "loss": 0.1153, + "epoch": 2.454301602878639, + "grad_norm": 0.5693519115447998, + "learning_rate": 4.5490517432597115e-05, + "loss": 0.0568, "step": 9380 }, { - "epoch": 0.6143277723258096, - "grad_norm": 1.397139310836792, - "learning_rate": 9.999872313528009e-05, - "loss": 0.1203, + "epoch": 2.456918547595682, + "grad_norm": 0.6188995838165283, + "learning_rate": 4.547866938534333e-05, + "loss": 0.06, "step": 9390 }, { - "epoch": 0.6149820085050703, - "grad_norm": 1.120139479637146, - "learning_rate": 9.999865664319414e-05, - "loss": 0.1216, + "epoch": 2.459535492312725, + "grad_norm": 0.5704085230827332, + "learning_rate": 4.546680734111213e-05, + "loss": 0.0623, "step": 9400 }, { - "epoch": 0.6156362446843311, - "grad_norm": 1.1770591735839844, - "learning_rate": 9.999858846352242e-05, - "loss": 0.1135, + "epoch": 2.4621524370297676, + "grad_norm": 0.5484632849693298, + "learning_rate": 4.5454931308011106e-05, + "loss": 0.0634, "step": 9410 }, { - "epoch": 0.6162904808635917, - "grad_norm": 1.0336472988128662, - "learning_rate": 9.99985185962672e-05, - "loss": 0.1197, + "epoch": 2.4647693817468106, + "grad_norm": 0.5368353724479675, + "learning_rate": 4.544304129415741e-05, + "loss": 0.0601, "step": 9420 }, { - "epoch": 0.6169447170428525, - "grad_norm": 1.1690295934677124, - "learning_rate": 9.999844704143084e-05, - "loss": 0.1329, + "epoch": 2.4673863264638536, + "grad_norm": 0.5148277878761292, + "learning_rate": 4.543113730767775e-05, + "loss": 0.0569, "step": 9430 }, { - "epoch": 0.6175989532221132, - "grad_norm": 0.9790360331535339, - "learning_rate": 9.999837379901578e-05, - "loss": 0.114, + "epoch": 2.470003271180896, + "grad_norm": 0.5295160412788391, + "learning_rate": 4.5419219356708396e-05, + "loss": 0.0606, "step": 9440 }, { - "epoch": 0.6182531894013739, - "grad_norm": 0.918538510799408, - "learning_rate": 9.999829886902446e-05, - "loss": 0.1204, + "epoch": 2.472620215897939, + "grad_norm": 0.5751885175704956, + "learning_rate": 4.540728744939515e-05, + "loss": 0.0599, "step": 9450 }, { - "epoch": 0.6189074255806346, - "grad_norm": 0.986890435218811, - "learning_rate": 9.999822225145945e-05, - "loss": 0.1291, + "epoch": 2.475237160614982, + "grad_norm": 0.5783774256706238, + "learning_rate": 4.539534159389337e-05, + "loss": 0.0637, "step": 9460 }, { - "epoch": 0.6195616617598954, - "grad_norm": 0.8274875283241272, - "learning_rate": 9.99981439463233e-05, - "loss": 0.126, + "epoch": 2.4778541053320247, + "grad_norm": 0.582295298576355, + "learning_rate": 4.538338179836793e-05, + "loss": 0.0646, "step": 9470 }, { - "epoch": 0.620215897939156, - "grad_norm": 0.8591904640197754, - "learning_rate": 9.999806395361867e-05, - "loss": 0.1184, + "epoch": 2.4804710500490676, + "grad_norm": 0.7552310824394226, + "learning_rate": 4.5371408070993225e-05, + "loss": 0.0597, "step": 9480 }, { - "epoch": 0.6208701341184167, - "grad_norm": 0.9025493860244751, - "learning_rate": 9.999798227334827e-05, - "loss": 0.1169, + "epoch": 2.4830879947661106, + "grad_norm": 0.5856291651725769, + "learning_rate": 4.535942041995319e-05, + "loss": 0.0605, "step": 9490 }, { - "epoch": 0.6215243702976775, - "grad_norm": 0.9297366142272949, - "learning_rate": 9.999789890551483e-05, - "loss": 0.1313, + "epoch": 2.4857049394831536, + "grad_norm": 0.7458827495574951, + "learning_rate": 4.5347418853441295e-05, + "loss": 0.0526, "step": 9500 }, { - "epoch": 0.6221786064769381, - "grad_norm": 0.843330979347229, - "learning_rate": 9.999781385012116e-05, - "loss": 0.1178, + "epoch": 2.488321884200196, + "grad_norm": 0.40024372935295105, + "learning_rate": 4.533540337966046e-05, + "loss": 0.0575, "step": 9510 }, { - "epoch": 0.6228328426561989, - "grad_norm": 0.8769571781158447, - "learning_rate": 9.999772710717018e-05, - "loss": 0.1191, + "epoch": 2.490938828917239, + "grad_norm": 0.5775712132453918, + "learning_rate": 4.532337400682317e-05, + "loss": 0.0626, "step": 9520 }, { - "epoch": 0.6234870788354596, - "grad_norm": 0.996464192867279, - "learning_rate": 9.999763867666479e-05, - "loss": 0.1184, + "epoch": 2.493555773634282, + "grad_norm": 0.5049796104431152, + "learning_rate": 4.531133074315139e-05, + "loss": 0.0545, "step": 9530 }, { - "epoch": 0.6241413150147204, - "grad_norm": 0.9058352112770081, - "learning_rate": 9.999754855860795e-05, - "loss": 0.1246, + "epoch": 2.4961727183513247, + "grad_norm": 0.46351227164268494, + "learning_rate": 4.529927359687657e-05, + "loss": 0.0591, "step": 9540 }, { - "epoch": 0.624795551193981, - "grad_norm": 0.8626279830932617, - "learning_rate": 9.999745675300271e-05, - "loss": 0.1175, + "epoch": 2.4987896630683677, + "grad_norm": 0.37615618109703064, + "learning_rate": 4.528720257623966e-05, + "loss": 0.056, "step": 9550 }, { - "epoch": 0.6254497873732418, - "grad_norm": 0.8973715901374817, - "learning_rate": 9.999736325985221e-05, - "loss": 0.1085, + "epoch": 2.5014066077854107, + "grad_norm": 0.37017613649368286, + "learning_rate": 4.5275117689491076e-05, + "loss": 0.0597, "step": 9560 }, { - "epoch": 0.6261040235525025, - "grad_norm": 0.9920212030410767, - "learning_rate": 9.999726807915956e-05, - "loss": 0.1203, + "epoch": 2.5040235525024532, + "grad_norm": 0.3615386486053467, + "learning_rate": 4.5263018944890744e-05, + "loss": 0.0575, "step": 9570 }, { - "epoch": 0.6267582597317631, - "grad_norm": 0.8262133598327637, - "learning_rate": 9.999717121092802e-05, - "loss": 0.1214, + "epoch": 2.506640497219496, + "grad_norm": 0.48490145802497864, + "learning_rate": 4.525090635070803e-05, + "loss": 0.0646, "step": 9580 }, { - "epoch": 0.6274124959110239, - "grad_norm": 0.7645416259765625, - "learning_rate": 9.999707265516079e-05, - "loss": 0.1211, + "epoch": 2.509257441936539, + "grad_norm": 0.5488083958625793, + "learning_rate": 4.523877991522178e-05, + "loss": 0.0597, "step": 9590 }, { - "epoch": 0.6280667320902846, - "grad_norm": 1.0003942251205444, - "learning_rate": 9.999697241186126e-05, - "loss": 0.1167, + "epoch": 2.5118743866535818, + "grad_norm": 1.2636692523956299, + "learning_rate": 4.522663964672029e-05, + "loss": 0.0613, "step": 9600 }, { - "epoch": 0.6287209682695453, - "grad_norm": 1.0682168006896973, - "learning_rate": 9.999687048103278e-05, - "loss": 0.1182, + "epoch": 2.5144913313706247, + "grad_norm": 0.6536663174629211, + "learning_rate": 4.521448555350134e-05, + "loss": 0.0558, "step": 9610 }, { - "epoch": 0.629375204448806, - "grad_norm": 0.7786453366279602, - "learning_rate": 9.999676686267881e-05, - "loss": 0.1241, + "epoch": 2.5171082760876677, + "grad_norm": 0.36221152544021606, + "learning_rate": 4.5202317643872114e-05, + "loss": 0.0604, "step": 9620 }, { - "epoch": 0.6300294406280668, - "grad_norm": 0.9212088584899902, - "learning_rate": 9.999666155680281e-05, - "loss": 0.1219, + "epoch": 2.5197252208047107, + "grad_norm": 0.465022474527359, + "learning_rate": 4.519013592614928e-05, + "loss": 0.0659, "step": 9630 }, { - "epoch": 0.6306836768073274, - "grad_norm": 0.8398849964141846, - "learning_rate": 9.999655456340839e-05, - "loss": 0.1217, + "epoch": 2.5223421655217533, + "grad_norm": 0.6671952605247498, + "learning_rate": 4.517794040865892e-05, + "loss": 0.0611, "step": 9640 }, { - "epoch": 0.6313379129865881, - "grad_norm": 0.7940874099731445, - "learning_rate": 9.999644588249912e-05, - "loss": 0.1156, + "epoch": 2.5249591102387963, + "grad_norm": 0.35949820280075073, + "learning_rate": 4.516573109973656e-05, + "loss": 0.0654, "step": 9650 }, { - "epoch": 0.6319921491658489, - "grad_norm": 0.9876879453659058, - "learning_rate": 9.999633551407867e-05, - "loss": 0.1307, + "epoch": 2.527576054955839, + "grad_norm": 0.5386552214622498, + "learning_rate": 4.5153508007727145e-05, + "loss": 0.0571, "step": 9660 }, { - "epoch": 0.6326463853451095, - "grad_norm": 0.9693999886512756, - "learning_rate": 9.999622345815081e-05, - "loss": 0.1232, + "epoch": 2.530192999672882, + "grad_norm": 0.6097580790519714, + "learning_rate": 4.5141271140985044e-05, + "loss": 0.0578, "step": 9670 }, { - "epoch": 0.6333006215243703, - "grad_norm": 0.8915924429893494, - "learning_rate": 9.999610971471925e-05, - "loss": 0.1255, + "epoch": 2.532809944389925, + "grad_norm": 0.48718369007110596, + "learning_rate": 4.512902050787404e-05, + "loss": 0.058, "step": 9680 }, { - "epoch": 0.633954857703631, - "grad_norm": 0.9316652417182922, - "learning_rate": 9.999599428378789e-05, - "loss": 0.1258, + "epoch": 2.535426889106968, + "grad_norm": 0.49066928029060364, + "learning_rate": 4.5116756116767315e-05, + "loss": 0.059, "step": 9690 }, { - "epoch": 0.6346090938828918, - "grad_norm": 0.8917170763015747, - "learning_rate": 9.99958771653606e-05, - "loss": 0.1246, + "epoch": 2.5380438338240103, + "grad_norm": 0.5355244874954224, + "learning_rate": 4.510447797604749e-05, + "loss": 0.0644, "step": 9700 }, { - "epoch": 0.6352633300621524, - "grad_norm": 1.0505290031433105, - "learning_rate": 9.999575835944133e-05, - "loss": 0.1109, + "epoch": 2.5406607785410533, + "grad_norm": 0.3920019268989563, + "learning_rate": 4.509218609410652e-05, + "loss": 0.0604, "step": 9710 }, { - "epoch": 0.6359175662414132, - "grad_norm": 1.0103509426116943, - "learning_rate": 9.999563786603412e-05, - "loss": 0.125, + "epoch": 2.5432777232580963, + "grad_norm": 0.5329289436340332, + "learning_rate": 4.507988047934583e-05, + "loss": 0.0576, "step": 9720 }, { - "epoch": 0.6365718024206739, - "grad_norm": 0.8918554186820984, - "learning_rate": 9.999551568514298e-05, - "loss": 0.1281, + "epoch": 2.545894667975139, + "grad_norm": 0.5136492252349854, + "learning_rate": 4.5067561140176176e-05, + "loss": 0.0588, "step": 9730 }, { - "epoch": 0.6372260385999345, - "grad_norm": 0.8537850975990295, - "learning_rate": 9.999539181677208e-05, - "loss": 0.1301, + "epoch": 2.548511612692182, + "grad_norm": 0.6208662390708923, + "learning_rate": 4.50552280850177e-05, + "loss": 0.0664, "step": 9740 }, { - "epoch": 0.6378802747791953, - "grad_norm": 1.0393998622894287, - "learning_rate": 9.99952662609256e-05, - "loss": 0.1183, + "epoch": 2.551128557409225, + "grad_norm": 0.695162832736969, + "learning_rate": 4.5042881322299936e-05, + "loss": 0.0621, "step": 9750 }, { - "epoch": 0.638534510958456, - "grad_norm": 0.9914324283599854, - "learning_rate": 9.999513901760775e-05, - "loss": 0.1348, + "epoch": 2.553745502126268, + "grad_norm": 0.9550657868385315, + "learning_rate": 4.5030520860461784e-05, + "loss": 0.0622, "step": 9760 }, { - "epoch": 0.6391887471377167, - "grad_norm": 1.1659291982650757, - "learning_rate": 9.999501008682286e-05, - "loss": 0.124, + "epoch": 2.5563624468433104, + "grad_norm": 0.6386085152626038, + "learning_rate": 4.50181467079515e-05, + "loss": 0.0595, "step": 9770 }, { - "epoch": 0.6398429833169774, - "grad_norm": 0.9629908800125122, - "learning_rate": 9.999487946857525e-05, - "loss": 0.1177, + "epoch": 2.5589793915603534, + "grad_norm": 0.4597232937812805, + "learning_rate": 4.50057588732267e-05, + "loss": 0.0606, "step": 9780 }, { - "epoch": 0.6404972194962382, - "grad_norm": 0.8898019194602966, - "learning_rate": 9.999474716286934e-05, - "loss": 0.1233, + "epoch": 2.561596336277396, + "grad_norm": 0.6518101692199707, + "learning_rate": 4.499335736475436e-05, + "loss": 0.065, "step": 9790 }, { - "epoch": 0.6411514556754988, - "grad_norm": 0.9645368456840515, - "learning_rate": 9.99946131697096e-05, - "loss": 0.1172, + "epoch": 2.564213280994439, + "grad_norm": 0.5159201622009277, + "learning_rate": 4.498094219101078e-05, + "loss": 0.0539, "step": 9800 }, { - "epoch": 0.6418056918547596, - "grad_norm": 1.1151118278503418, - "learning_rate": 9.999447748910056e-05, - "loss": 0.1154, + "epoch": 2.566830225711482, + "grad_norm": 0.4551909565925598, + "learning_rate": 4.496851336048162e-05, + "loss": 0.0544, "step": 9810 }, { - "epoch": 0.6424599280340203, - "grad_norm": 1.1241856813430786, - "learning_rate": 9.999434012104678e-05, - "loss": 0.1182, + "epoch": 2.569447170428525, + "grad_norm": 0.6590986847877502, + "learning_rate": 4.495607088166188e-05, + "loss": 0.0581, "step": 9820 }, { - "epoch": 0.643114164213281, - "grad_norm": 0.883420467376709, - "learning_rate": 9.999420106555291e-05, - "loss": 0.126, + "epoch": 2.5720641151455674, + "grad_norm": 0.616459310054779, + "learning_rate": 4.494361476305586e-05, + "loss": 0.056, "step": 9830 }, { - "epoch": 0.6437684003925417, - "grad_norm": 1.023460865020752, - "learning_rate": 9.999406032262362e-05, - "loss": 0.1242, + "epoch": 2.5746810598626104, + "grad_norm": 0.4650562107563019, + "learning_rate": 4.493114501317721e-05, + "loss": 0.0632, "step": 9840 }, { - "epoch": 0.6444226365718024, - "grad_norm": 1.0635135173797607, - "learning_rate": 9.99939178922637e-05, - "loss": 0.1383, + "epoch": 2.5772980045796534, + "grad_norm": 0.44526204466819763, + "learning_rate": 4.4918661640548874e-05, + "loss": 0.0559, "step": 9850 }, { - "epoch": 0.6450768727510632, - "grad_norm": 0.738158643245697, - "learning_rate": 9.999377377447794e-05, - "loss": 0.1199, + "epoch": 2.579914949296696, + "grad_norm": 0.5463991165161133, + "learning_rate": 4.4906164653703134e-05, + "loss": 0.0629, "step": 9860 }, { - "epoch": 0.6457311089303238, - "grad_norm": 1.1745080947875977, - "learning_rate": 9.999362796927119e-05, - "loss": 0.1239, + "epoch": 2.582531894013739, + "grad_norm": 0.8113076686859131, + "learning_rate": 4.4893654061181563e-05, + "loss": 0.0574, "step": 9870 }, { - "epoch": 0.6463853451095846, - "grad_norm": 1.2103548049926758, - "learning_rate": 9.999348047664838e-05, - "loss": 0.1281, + "epoch": 2.585148838730782, + "grad_norm": 0.7694985866546631, + "learning_rate": 4.488112987153502e-05, + "loss": 0.0662, "step": 9880 }, { - "epoch": 0.6470395812888453, - "grad_norm": 0.9273943901062012, - "learning_rate": 9.999333129661451e-05, - "loss": 0.1274, + "epoch": 2.587765783447825, + "grad_norm": 0.490842342376709, + "learning_rate": 4.486859209332368e-05, + "loss": 0.0676, "step": 9890 }, { - "epoch": 0.647693817468106, - "grad_norm": 0.9063096642494202, - "learning_rate": 9.999318042917459e-05, - "loss": 0.1161, + "epoch": 2.5903827281648675, + "grad_norm": 0.6306349039077759, + "learning_rate": 4.4856040735116986e-05, + "loss": 0.0585, "step": 9900 }, { - "epoch": 0.6483480536473667, - "grad_norm": 1.0374435186386108, - "learning_rate": 9.999302787433372e-05, - "loss": 0.1253, + "epoch": 2.5929996728819105, + "grad_norm": 0.8197053670883179, + "learning_rate": 4.4843475805493696e-05, + "loss": 0.0598, "step": 9910 }, { - "epoch": 0.6490022898266274, - "grad_norm": 0.8295223116874695, - "learning_rate": 9.999287363209703e-05, - "loss": 0.1196, + "epoch": 2.595616617598953, + "grad_norm": 0.6163144707679749, + "learning_rate": 4.48308973130418e-05, + "loss": 0.0636, "step": 9920 }, { - "epoch": 0.6496565260058881, - "grad_norm": 0.9288245439529419, - "learning_rate": 9.999271770246975e-05, - "loss": 0.1272, + "epoch": 2.598233562315996, + "grad_norm": 0.48077651858329773, + "learning_rate": 4.481830526635858e-05, + "loss": 0.0681, "step": 9930 }, { - "epoch": 0.6503107621851488, - "grad_norm": 1.091799259185791, - "learning_rate": 9.999256008545714e-05, - "loss": 0.1181, + "epoch": 2.600850507033039, + "grad_norm": 0.580900251865387, + "learning_rate": 4.4805699674050585e-05, + "loss": 0.0636, "step": 9940 }, { - "epoch": 0.6509649983644096, - "grad_norm": 0.8960475325584412, - "learning_rate": 9.999240078106452e-05, - "loss": 0.1307, + "epoch": 2.603467451750082, + "grad_norm": 0.7182990312576294, + "learning_rate": 4.4793080544733626e-05, + "loss": 0.0622, "step": 9950 }, { - "epoch": 0.6516192345436702, - "grad_norm": 0.9426436424255371, - "learning_rate": 9.999223978929727e-05, - "loss": 0.116, + "epoch": 2.6060843964671245, + "grad_norm": 0.6701587438583374, + "learning_rate": 4.478044788703275e-05, + "loss": 0.056, "step": 9960 }, { - "epoch": 0.652273470722931, - "grad_norm": 0.9542483687400818, - "learning_rate": 9.999207711016081e-05, - "loss": 0.1197, + "epoch": 2.6087013411841675, + "grad_norm": 0.5928230881690979, + "learning_rate": 4.476780170958226e-05, + "loss": 0.0538, "step": 9970 }, { - "epoch": 0.6529277069021917, - "grad_norm": 0.8808081746101379, - "learning_rate": 9.999191274366064e-05, - "loss": 0.1261, + "epoch": 2.61131828590121, + "grad_norm": 0.6294201016426086, + "learning_rate": 4.47551420210257e-05, + "loss": 0.0613, "step": 9980 }, { - "epoch": 0.6535819430814525, - "grad_norm": 0.8422145247459412, - "learning_rate": 9.99917466898023e-05, - "loss": 0.112, + "epoch": 2.613935230618253, + "grad_norm": 0.5394532680511475, + "learning_rate": 4.474246883001585e-05, + "loss": 0.0596, "step": 9990 }, { - "epoch": 0.6542361792607131, - "grad_norm": 0.956287682056427, - "learning_rate": 9.999157894859142e-05, - "loss": 0.1356, + "epoch": 2.616552175335296, + "grad_norm": 0.7629094123840332, + "learning_rate": 4.4729782145214716e-05, + "loss": 0.0631, + "step": 10000 + }, + { + "epoch": 2.616552175335296, + "eval_loss": 0.06285145155637001, + "eval_runtime": 8.6089, + "eval_samples_per_second": 118.947, + "eval_steps_per_second": 1.859, "step": 10000 }, { - "epoch": 0.6548904154399738, - "grad_norm": 0.9550445079803467, - "learning_rate": 9.99914095200336e-05, - "loss": 0.1277, + "epoch": 2.619169120052339, + "grad_norm": 0.4276679456233978, + "learning_rate": 4.471708197529352e-05, + "loss": 0.0603, "step": 10010 }, { - "epoch": 0.6555446516192346, - "grad_norm": 0.9725729823112488, - "learning_rate": 9.999123840413465e-05, - "loss": 0.1257, + "epoch": 2.6217860647693816, + "grad_norm": 0.862521231174469, + "learning_rate": 4.470436832893272e-05, + "loss": 0.0584, "step": 10020 }, { - "epoch": 0.6561988877984952, - "grad_norm": 0.8097350001335144, - "learning_rate": 9.999106560090028e-05, - "loss": 0.1199, + "epoch": 2.6244030094864246, + "grad_norm": 0.8724349141120911, + "learning_rate": 4.469164121482197e-05, + "loss": 0.0618, "step": 10030 }, { - "epoch": 0.656853123977756, - "grad_norm": 0.9223546981811523, - "learning_rate": 9.999089111033633e-05, - "loss": 0.1223, + "epoch": 2.6270199542034676, + "grad_norm": 0.4335499107837677, + "learning_rate": 4.467890064166013e-05, + "loss": 0.0577, "step": 10040 }, { - "epoch": 0.6575073601570167, - "grad_norm": 0.908068060874939, - "learning_rate": 9.99907149324487e-05, - "loss": 0.1124, + "epoch": 2.62963689892051, + "grad_norm": 0.7104797959327698, + "learning_rate": 4.466614661815526e-05, + "loss": 0.0599, "step": 10050 }, { - "epoch": 0.6581615963362774, - "grad_norm": 0.9873263835906982, - "learning_rate": 9.999053706724335e-05, - "loss": 0.1229, + "epoch": 2.632253843637553, + "grad_norm": 0.6808757781982422, + "learning_rate": 4.4653379153024624e-05, + "loss": 0.0546, "step": 10060 }, { - "epoch": 0.6588158325155381, - "grad_norm": 0.870583713054657, - "learning_rate": 9.999035751472625e-05, - "loss": 0.1113, + "epoch": 2.634870788354596, + "grad_norm": 0.3997107744216919, + "learning_rate": 4.464059825499465e-05, + "loss": 0.0579, "step": 10070 }, { - "epoch": 0.6594700686947988, - "grad_norm": 0.9837010502815247, - "learning_rate": 9.99901762749035e-05, - "loss": 0.1281, + "epoch": 2.637487733071639, + "grad_norm": 0.6047708988189697, + "learning_rate": 4.462780393280097e-05, + "loss": 0.0648, "step": 10080 }, { - "epoch": 0.6601243048740595, - "grad_norm": 1.0344374179840088, - "learning_rate": 9.998999334778118e-05, - "loss": 0.1355, + "epoch": 2.6401046777886816, + "grad_norm": 0.5319182276725769, + "learning_rate": 4.461499619518838e-05, + "loss": 0.0609, "step": 10090 }, { - "epoch": 0.6607785410533202, - "grad_norm": 1.1033861637115479, - "learning_rate": 9.99898087333655e-05, - "loss": 0.1213, + "epoch": 2.6427216225057246, + "grad_norm": 0.6906712651252747, + "learning_rate": 4.460217505091086e-05, + "loss": 0.0565, "step": 10100 }, { - "epoch": 0.661432777232581, - "grad_norm": 0.9438520073890686, - "learning_rate": 9.998962243166266e-05, - "loss": 0.1358, + "epoch": 2.645338567222767, + "grad_norm": 0.6555960178375244, + "learning_rate": 4.458934050873151e-05, + "loss": 0.0641, "step": 10110 }, { - "epoch": 0.6620870134118417, - "grad_norm": 0.9945731163024902, - "learning_rate": 9.998943444267896e-05, - "loss": 0.1261, + "epoch": 2.64795551193981, + "grad_norm": 0.41678696870803833, + "learning_rate": 4.457649257742265e-05, + "loss": 0.0603, "step": 10120 }, { - "epoch": 0.6627412495911024, - "grad_norm": 0.9855345487594604, - "learning_rate": 9.998924476642074e-05, - "loss": 0.1285, + "epoch": 2.650572456656853, + "grad_norm": 0.843837320804596, + "learning_rate": 4.456363126576571e-05, + "loss": 0.0594, "step": 10130 }, { - "epoch": 0.6633954857703631, - "grad_norm": 1.0155123472213745, - "learning_rate": 9.998905340289442e-05, - "loss": 0.1156, + "epoch": 2.653189401373896, + "grad_norm": 0.5070556402206421, + "learning_rate": 4.4550756582551273e-05, + "loss": 0.0552, "step": 10140 }, { - "epoch": 0.6640497219496239, - "grad_norm": 0.953682541847229, - "learning_rate": 9.998886035210643e-05, - "loss": 0.1144, + "epoch": 2.6558063460909387, + "grad_norm": 0.6374284029006958, + "learning_rate": 4.453786853657907e-05, + "loss": 0.0566, "step": 10150 }, { - "epoch": 0.6647039581288845, - "grad_norm": 1.0266921520233154, - "learning_rate": 9.99886656140633e-05, - "loss": 0.1202, + "epoch": 2.6584232908079817, + "grad_norm": 0.6199434399604797, + "learning_rate": 4.452496713665794e-05, + "loss": 0.0529, "step": 10160 }, { - "epoch": 0.6653581943081452, - "grad_norm": 0.8357493281364441, - "learning_rate": 9.998846918877162e-05, - "loss": 0.1136, + "epoch": 2.6610402355250247, + "grad_norm": 0.8124728202819824, + "learning_rate": 4.451205239160588e-05, + "loss": 0.0639, "step": 10170 }, { - "epoch": 0.666012430487406, - "grad_norm": 0.9822357892990112, - "learning_rate": 9.9988271076238e-05, - "loss": 0.1327, + "epoch": 2.663657180242067, + "grad_norm": 0.5821810960769653, + "learning_rate": 4.449912431025001e-05, + "loss": 0.0637, "step": 10180 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.8407633304595947, - "learning_rate": 9.998807127646915e-05, - "loss": 0.1299, + "epoch": 2.66627412495911, + "grad_norm": 0.4941767454147339, + "learning_rate": 4.448618290142654e-05, + "loss": 0.0539, "step": 10190 }, { - "epoch": 0.6673209028459274, - "grad_norm": 0.8804751038551331, - "learning_rate": 9.998786978947177e-05, - "loss": 0.1204, + "epoch": 2.668891069676153, + "grad_norm": 0.7200390100479126, + "learning_rate": 4.4473228173980794e-05, + "loss": 0.0573, "step": 10200 }, { - "epoch": 0.6679751390251881, - "grad_norm": 1.0064154863357544, - "learning_rate": 9.99876666152527e-05, - "loss": 0.1181, + "epoch": 2.671508014393196, + "grad_norm": 0.4795537292957306, + "learning_rate": 4.446026013676722e-05, + "loss": 0.0549, "step": 10210 }, { - "epoch": 0.6686293752044488, - "grad_norm": 1.158360481262207, - "learning_rate": 9.998746175381879e-05, - "loss": 0.1163, + "epoch": 2.6741249591102387, + "grad_norm": 0.5811921954154968, + "learning_rate": 4.444727879864933e-05, + "loss": 0.0588, "step": 10220 }, { - "epoch": 0.6692836113837095, - "grad_norm": 1.060184359550476, - "learning_rate": 9.998725520517693e-05, - "loss": 0.1213, + "epoch": 2.6767419038272817, + "grad_norm": 0.48299723863601685, + "learning_rate": 4.4434284168499775e-05, + "loss": 0.0538, "step": 10230 }, { - "epoch": 0.6699378475629703, - "grad_norm": 1.123124122619629, - "learning_rate": 9.998704696933413e-05, - "loss": 0.1184, + "epoch": 2.6793588485443243, + "grad_norm": 0.4856926500797272, + "learning_rate": 4.442127625520023e-05, + "loss": 0.0551, "step": 10240 }, { - "epoch": 0.6705920837422309, - "grad_norm": 0.8490195870399475, - "learning_rate": 9.998683704629739e-05, - "loss": 0.1236, + "epoch": 2.6819757932613673, + "grad_norm": 0.5260676741600037, + "learning_rate": 4.44082550676415e-05, + "loss": 0.0543, "step": 10250 }, { - "epoch": 0.6712463199214916, - "grad_norm": 0.8913207650184631, - "learning_rate": 9.99866254360738e-05, - "loss": 0.1195, + "epoch": 2.6845927379784102, + "grad_norm": 0.623572826385498, + "learning_rate": 4.439522061472344e-05, + "loss": 0.0596, "step": 10260 }, { - "epoch": 0.6719005561007524, - "grad_norm": 0.9286946654319763, - "learning_rate": 9.998641213867051e-05, - "loss": 0.1168, + "epoch": 2.6872096826954532, + "grad_norm": 0.5810160636901855, + "learning_rate": 4.438217290535498e-05, + "loss": 0.06, "step": 10270 }, { - "epoch": 0.6725547922800131, - "grad_norm": 0.9744524359703064, - "learning_rate": 9.998619715409471e-05, - "loss": 0.1134, + "epoch": 2.689826627412496, + "grad_norm": 0.6521799564361572, + "learning_rate": 4.43691119484541e-05, + "loss": 0.06, "step": 10280 }, { - "epoch": 0.6732090284592738, - "grad_norm": 0.911208987236023, - "learning_rate": 9.998598048235369e-05, - "loss": 0.1115, + "epoch": 2.6924435721295388, + "grad_norm": 0.5883389115333557, + "learning_rate": 4.435603775294784e-05, + "loss": 0.0642, "step": 10290 }, { - "epoch": 0.6738632646385345, - "grad_norm": 1.0315773487091064, - "learning_rate": 9.99857621234547e-05, - "loss": 0.1189, + "epoch": 2.6950605168465818, + "grad_norm": 0.5264142751693726, + "learning_rate": 4.434295032777229e-05, + "loss": 0.0598, "step": 10300 }, { - "epoch": 0.6745175008177953, - "grad_norm": 1.1607714891433716, - "learning_rate": 9.998554207740517e-05, - "loss": 0.1238, + "epoch": 2.6976774615636243, + "grad_norm": 0.728809118270874, + "learning_rate": 4.432984968187259e-05, + "loss": 0.0642, "step": 10310 }, { - "epoch": 0.6751717369970559, - "grad_norm": 0.7840601801872253, - "learning_rate": 9.99853203442125e-05, - "loss": 0.1196, + "epoch": 2.7002944062806673, + "grad_norm": 0.4668984115123749, + "learning_rate": 4.431673582420291e-05, + "loss": 0.0506, "step": 10320 }, { - "epoch": 0.6758259731763167, - "grad_norm": 1.062690019607544, - "learning_rate": 9.998509692388416e-05, - "loss": 0.1227, + "epoch": 2.7029113509977103, + "grad_norm": 0.6090394854545593, + "learning_rate": 4.4303608763726426e-05, + "loss": 0.059, "step": 10330 }, { - "epoch": 0.6764802093555774, - "grad_norm": 1.104194164276123, - "learning_rate": 9.998487181642772e-05, - "loss": 0.1231, + "epoch": 2.7055282957147533, + "grad_norm": 0.399859756231308, + "learning_rate": 4.4290468509415384e-05, + "loss": 0.0584, "step": 10340 }, { - "epoch": 0.677134445534838, - "grad_norm": 1.0216797590255737, - "learning_rate": 9.998464502185076e-05, - "loss": 0.118, + "epoch": 2.708145240431796, + "grad_norm": 0.4588572680950165, + "learning_rate": 4.4277315070251e-05, + "loss": 0.0568, "step": 10350 }, { - "epoch": 0.6777886817140988, - "grad_norm": 0.9924758076667786, - "learning_rate": 9.998441654016095e-05, - "loss": 0.1245, + "epoch": 2.710762185148839, + "grad_norm": 0.5472003221511841, + "learning_rate": 4.426414845522355e-05, + "loss": 0.0596, "step": 10360 }, { - "epoch": 0.6784429178933595, - "grad_norm": 0.9383712410926819, - "learning_rate": 9.9984186371366e-05, - "loss": 0.116, + "epoch": 2.7133791298658814, + "grad_norm": 0.6493082642555237, + "learning_rate": 4.425096867333228e-05, + "loss": 0.0611, "step": 10370 }, { - "epoch": 0.6790971540726202, - "grad_norm": 0.8061214089393616, - "learning_rate": 9.998395451547367e-05, - "loss": 0.1172, + "epoch": 2.7159960745829244, + "grad_norm": 0.7279231548309326, + "learning_rate": 4.423777573358545e-05, + "loss": 0.0505, "step": 10380 }, { - "epoch": 0.6797513902518809, - "grad_norm": 0.9712622761726379, - "learning_rate": 9.998372097249177e-05, - "loss": 0.1206, + "epoch": 2.7186130192999673, + "grad_norm": 0.665616512298584, + "learning_rate": 4.42245696450003e-05, + "loss": 0.0652, "step": 10390 }, { - "epoch": 0.6804056264311417, - "grad_norm": 0.8583802580833435, - "learning_rate": 9.998348574242821e-05, - "loss": 0.121, + "epoch": 2.7212299640170103, + "grad_norm": 0.5311397314071655, + "learning_rate": 4.4211350416603084e-05, + "loss": 0.0617, "step": 10400 }, { - "epoch": 0.6810598626104023, - "grad_norm": 0.8582515120506287, - "learning_rate": 9.99832488252909e-05, - "loss": 0.1189, + "epoch": 2.723846908734053, + "grad_norm": 0.6540460586547852, + "learning_rate": 4.4198118057429005e-05, + "loss": 0.0576, "step": 10410 }, { - "epoch": 0.681714098789663, - "grad_norm": 0.9043430685997009, - "learning_rate": 9.998301022108789e-05, - "loss": 0.1108, + "epoch": 2.726463853451096, + "grad_norm": 0.6157358288764954, + "learning_rate": 4.4184872576522263e-05, + "loss": 0.0589, "step": 10420 }, { - "epoch": 0.6823683349689238, - "grad_norm": 0.9330876469612122, - "learning_rate": 9.998276992982717e-05, - "loss": 0.1178, + "epoch": 2.7290807981681384, + "grad_norm": 0.44291484355926514, + "learning_rate": 4.417161398293602e-05, + "loss": 0.0584, "step": 10430 }, { - "epoch": 0.6830225711481845, - "grad_norm": 0.9934483170509338, - "learning_rate": 9.99825279515169e-05, - "loss": 0.1206, + "epoch": 2.7316977428851814, + "grad_norm": 0.6388154625892639, + "learning_rate": 4.415834228573239e-05, + "loss": 0.053, "step": 10440 }, { - "epoch": 0.6836768073274452, - "grad_norm": 0.8056890964508057, - "learning_rate": 9.998228428616523e-05, - "loss": 0.1283, + "epoch": 2.7343146876022244, + "grad_norm": 0.4673249125480652, + "learning_rate": 4.414505749398247e-05, + "loss": 0.0585, "step": 10450 }, { - "epoch": 0.6843310435067059, - "grad_norm": 0.9143165349960327, - "learning_rate": 9.998203893378037e-05, - "loss": 0.1322, + "epoch": 2.7369316323192674, + "grad_norm": 0.6346825957298279, + "learning_rate": 4.4131759616766266e-05, + "loss": 0.0576, "step": 10460 }, { - "epoch": 0.6849852796859667, - "grad_norm": 0.9717006683349609, - "learning_rate": 9.998179189437062e-05, - "loss": 0.1274, + "epoch": 2.7395485770363104, + "grad_norm": 0.41277366876602173, + "learning_rate": 4.4118448663172776e-05, + "loss": 0.0588, "step": 10470 }, { - "epoch": 0.6856395158652273, - "grad_norm": 1.0283546447753906, - "learning_rate": 9.99815431679443e-05, - "loss": 0.1193, + "epoch": 2.742165521753353, + "grad_norm": 0.7207258343696594, + "learning_rate": 4.41051246422999e-05, + "loss": 0.0572, "step": 10480 }, { - "epoch": 0.6862937520444881, - "grad_norm": 0.9242172241210938, - "learning_rate": 9.998129275450983e-05, - "loss": 0.1284, + "epoch": 2.744782466470396, + "grad_norm": 0.6065157651901245, + "learning_rate": 4.409178756325448e-05, + "loss": 0.0641, "step": 10490 }, { - "epoch": 0.6869479882237488, - "grad_norm": 0.8820136785507202, - "learning_rate": 9.998104065407565e-05, - "loss": 0.124, + "epoch": 2.7473994111874385, + "grad_norm": 0.6917155981063843, + "learning_rate": 4.407843743515229e-05, + "loss": 0.0648, "step": 10500 }, { - "epoch": 0.6876022244030094, - "grad_norm": 0.8272663354873657, - "learning_rate": 9.998078686665026e-05, - "loss": 0.116, + "epoch": 2.7500163559044815, + "grad_norm": 0.5036665201187134, + "learning_rate": 4.4065074267118e-05, + "loss": 0.054, "step": 10510 }, { - "epoch": 0.6882564605822702, - "grad_norm": 0.9149497151374817, - "learning_rate": 9.998053139224224e-05, - "loss": 0.1145, + "epoch": 2.7526333006215244, + "grad_norm": 0.6457655429840088, + "learning_rate": 4.405169806828523e-05, + "loss": 0.0515, "step": 10520 }, { - "epoch": 0.6889106967615309, - "grad_norm": 0.8796846866607666, - "learning_rate": 9.99802742308602e-05, - "loss": 0.1186, + "epoch": 2.7552502453385674, + "grad_norm": 0.4316859245300293, + "learning_rate": 4.403830884779647e-05, + "loss": 0.0609, "step": 10530 }, { - "epoch": 0.6895649329407916, - "grad_norm": 1.0413655042648315, - "learning_rate": 9.998001538251282e-05, - "loss": 0.1149, + "epoch": 2.75786719005561, + "grad_norm": 0.519899308681488, + "learning_rate": 4.402490661480314e-05, + "loss": 0.061, "step": 10540 }, { - "epoch": 0.6902191691200523, - "grad_norm": 0.9910515546798706, - "learning_rate": 9.997975484720887e-05, - "loss": 0.1258, + "epoch": 2.760484134772653, + "grad_norm": 0.815555989742279, + "learning_rate": 4.401149137846553e-05, + "loss": 0.06, "step": 10550 }, { - "epoch": 0.6908734052993131, - "grad_norm": 0.9942857623100281, - "learning_rate": 9.997949262495709e-05, - "loss": 0.1236, + "epoch": 2.7631010794896955, + "grad_norm": 0.5091633796691895, + "learning_rate": 4.399806314795284e-05, + "loss": 0.0583, "step": 10560 }, { - "epoch": 0.6915276414785738, - "grad_norm": 0.8903499841690063, - "learning_rate": 9.997922871576638e-05, - "loss": 0.113, + "epoch": 2.7657180242067385, + "grad_norm": 0.4408508241176605, + "learning_rate": 4.398462193244312e-05, + "loss": 0.0581, "step": 10570 }, { - "epoch": 0.6921818776578345, - "grad_norm": 0.9105699062347412, - "learning_rate": 9.997896311964561e-05, - "loss": 0.1276, + "epoch": 2.7683349689237815, + "grad_norm": 0.5117101669311523, + "learning_rate": 4.397116774112333e-05, + "loss": 0.0588, "step": 10580 }, { - "epoch": 0.6928361138370952, - "grad_norm": 1.2962309122085571, - "learning_rate": 9.997869583660375e-05, - "loss": 0.1285, + "epoch": 2.7709519136408245, + "grad_norm": 0.5849635601043701, + "learning_rate": 4.3957700583189266e-05, + "loss": 0.0553, "step": 10590 }, { - "epoch": 0.693490350016356, - "grad_norm": 1.1062681674957275, - "learning_rate": 9.997842686664985e-05, - "loss": 0.117, + "epoch": 2.773568858357867, + "grad_norm": 0.6975322961807251, + "learning_rate": 4.394422046784562e-05, + "loss": 0.0537, "step": 10600 }, { - "epoch": 0.6941445861956166, - "grad_norm": 0.9840047955513, - "learning_rate": 9.997815620979297e-05, - "loss": 0.1329, + "epoch": 2.77618580307491, + "grad_norm": 0.3537542223930359, + "learning_rate": 4.393072740430592e-05, + "loss": 0.0565, "step": 10610 }, { - "epoch": 0.6947988223748773, - "grad_norm": 0.7675462961196899, - "learning_rate": 9.997788386604224e-05, - "loss": 0.1115, + "epoch": 2.778802747791953, + "grad_norm": 0.5385840535163879, + "learning_rate": 4.3917221401792536e-05, + "loss": 0.0598, "step": 10620 }, { - "epoch": 0.6954530585541381, - "grad_norm": 0.9856576323509216, - "learning_rate": 9.997760983540686e-05, - "loss": 0.108, + "epoch": 2.7814196925089956, + "grad_norm": 0.4339829981327057, + "learning_rate": 4.390370246953671e-05, + "loss": 0.0601, "step": 10630 }, { - "epoch": 0.6961072947333987, - "grad_norm": 1.1004915237426758, - "learning_rate": 9.997733411789607e-05, - "loss": 0.1158, + "epoch": 2.7840366372260386, + "grad_norm": 0.37706896662712097, + "learning_rate": 4.389017061677849e-05, + "loss": 0.0656, "step": 10640 }, { - "epoch": 0.6967615309126595, - "grad_norm": 0.9109641313552856, - "learning_rate": 9.99770567135192e-05, - "loss": 0.1198, + "epoch": 2.7866535819430815, + "grad_norm": 0.6400270462036133, + "learning_rate": 4.3876625852766785e-05, + "loss": 0.062, "step": 10650 }, { - "epoch": 0.6974157670919202, - "grad_norm": 0.8749265074729919, - "learning_rate": 9.997677762228558e-05, - "loss": 0.1166, + "epoch": 2.7892705266601245, + "grad_norm": 0.3444439172744751, + "learning_rate": 4.38630681867593e-05, + "loss": 0.0529, "step": 10660 }, { - "epoch": 0.6980700032711809, - "grad_norm": 0.9863060712814331, - "learning_rate": 9.997649684420465e-05, - "loss": 0.1151, + "epoch": 2.791887471377167, + "grad_norm": 0.5224131345748901, + "learning_rate": 4.384949762802258e-05, + "loss": 0.056, "step": 10670 }, { - "epoch": 0.6987242394504416, - "grad_norm": 0.9862884879112244, - "learning_rate": 9.997621437928588e-05, - "loss": 0.1199, + "epoch": 2.79450441609421, + "grad_norm": 0.5811564922332764, + "learning_rate": 4.3835914185831985e-05, + "loss": 0.0634, "step": 10680 }, { - "epoch": 0.6993784756297023, - "grad_norm": 0.8637502193450928, - "learning_rate": 9.997593022753881e-05, - "loss": 0.118, + "epoch": 2.7971213608112526, + "grad_norm": 0.5171063542366028, + "learning_rate": 4.382231786947164e-05, + "loss": 0.0592, "step": 10690 }, { - "epoch": 0.700032711808963, - "grad_norm": 0.8936694860458374, - "learning_rate": 9.997564438897304e-05, - "loss": 0.1226, + "epoch": 2.7997383055282956, + "grad_norm": 0.5354034900665283, + "learning_rate": 4.380870868823451e-05, + "loss": 0.0662, "step": 10700 }, { - "epoch": 0.7006869479882237, - "grad_norm": 0.8638026714324951, - "learning_rate": 9.997535686359819e-05, - "loss": 0.1084, + "epoch": 2.8023552502453386, + "grad_norm": 0.3287551999092102, + "learning_rate": 4.3795086651422355e-05, + "loss": 0.0565, "step": 10710 }, { - "epoch": 0.7013411841674845, - "grad_norm": 0.8587387800216675, - "learning_rate": 9.997506765142399e-05, - "loss": 0.1229, + "epoch": 2.8049721949623816, + "grad_norm": 0.6036696434020996, + "learning_rate": 4.378145176834571e-05, + "loss": 0.061, "step": 10720 }, { - "epoch": 0.7019954203467452, - "grad_norm": 0.894414484500885, - "learning_rate": 9.997477675246019e-05, - "loss": 0.1046, + "epoch": 2.807589139679424, + "grad_norm": 0.5706713199615479, + "learning_rate": 4.376780404832387e-05, + "loss": 0.0627, "step": 10730 }, { - "epoch": 0.7026496565260059, - "grad_norm": 0.8560695648193359, - "learning_rate": 9.997448416671661e-05, - "loss": 0.1083, + "epoch": 2.810206084396467, + "grad_norm": 0.6217731833457947, + "learning_rate": 4.375414350068493e-05, + "loss": 0.0586, "step": 10740 }, { - "epoch": 0.7033038927052666, - "grad_norm": 0.9901031851768494, - "learning_rate": 9.997418989420313e-05, - "loss": 0.116, + "epoch": 2.81282302911351, + "grad_norm": 0.4816734790802002, + "learning_rate": 4.374047013476575e-05, + "loss": 0.0606, "step": 10750 }, { - "epoch": 0.7039581288845274, - "grad_norm": 0.96174556016922, - "learning_rate": 9.997389393492966e-05, - "loss": 0.1185, + "epoch": 2.8154399738305527, + "grad_norm": 0.4744681417942047, + "learning_rate": 4.3726783959911956e-05, + "loss": 0.0568, "step": 10760 }, { - "epoch": 0.704612365063788, - "grad_norm": 0.802387535572052, - "learning_rate": 9.997359628890623e-05, - "loss": 0.1202, + "epoch": 2.8180569185475957, + "grad_norm": 0.49870118498802185, + "learning_rate": 4.371308498547789e-05, + "loss": 0.0543, "step": 10770 }, { - "epoch": 0.7052666012430487, - "grad_norm": 0.7916472554206848, - "learning_rate": 9.997329695614286e-05, - "loss": 0.1103, + "epoch": 2.8206738632646386, + "grad_norm": 0.5821275115013123, + "learning_rate": 4.3699373220826704e-05, + "loss": 0.0542, "step": 10780 }, { - "epoch": 0.7059208374223095, - "grad_norm": 1.1118965148925781, - "learning_rate": 9.997299593664966e-05, - "loss": 0.1263, + "epoch": 2.8232908079816816, + "grad_norm": 0.6296197772026062, + "learning_rate": 4.368564867533024e-05, + "loss": 0.0544, "step": 10790 }, { - "epoch": 0.7065750736015701, - "grad_norm": 1.2766551971435547, - "learning_rate": 9.997269323043678e-05, - "loss": 0.1283, + "epoch": 2.825907752698724, + "grad_norm": 0.5080370903015137, + "learning_rate": 4.3671911358369104e-05, + "loss": 0.0633, "step": 10800 }, { - "epoch": 0.7072293097808309, - "grad_norm": 0.9842929244041443, - "learning_rate": 9.997238883751446e-05, - "loss": 0.1167, + "epoch": 2.828524697415767, + "grad_norm": 0.46273985505104065, + "learning_rate": 4.365816127933262e-05, + "loss": 0.0594, "step": 10810 }, { - "epoch": 0.7078835459600916, - "grad_norm": 0.9710572957992554, - "learning_rate": 9.997208275789294e-05, - "loss": 0.1074, + "epoch": 2.8311416421328097, + "grad_norm": 0.5995994806289673, + "learning_rate": 4.3644398447618836e-05, + "loss": 0.0541, "step": 10820 }, { - "epoch": 0.7085377821393523, - "grad_norm": 1.014204978942871, - "learning_rate": 9.997177499158258e-05, - "loss": 0.1155, + "epoch": 2.8337585868498527, + "grad_norm": 0.4902913272380829, + "learning_rate": 4.363062287263453e-05, + "loss": 0.055, "step": 10830 }, { - "epoch": 0.709192018318613, - "grad_norm": 0.851264238357544, - "learning_rate": 9.997146553859375e-05, - "loss": 0.1154, + "epoch": 2.8363755315668957, + "grad_norm": 0.5302367210388184, + "learning_rate": 4.361683456379515e-05, + "loss": 0.0585, "step": 10840 }, { - "epoch": 0.7098462544978738, - "grad_norm": 0.9606887102127075, - "learning_rate": 9.997115439893692e-05, - "loss": 0.1306, + "epoch": 2.8389924762839387, + "grad_norm": 0.49508044123649597, + "learning_rate": 4.3603033530524896e-05, + "loss": 0.0583, "step": 10850 }, { - "epoch": 0.7105004906771345, - "grad_norm": 0.9389902949333191, - "learning_rate": 9.997084157262256e-05, - "loss": 0.1104, + "epoch": 2.8416094210009812, + "grad_norm": 0.4201105535030365, + "learning_rate": 4.358921978225665e-05, + "loss": 0.0611, "step": 10860 }, { - "epoch": 0.7111547268563951, - "grad_norm": 0.78615403175354, - "learning_rate": 9.997052705966126e-05, - "loss": 0.1112, + "epoch": 2.8442263657180242, + "grad_norm": 0.5711117386817932, + "learning_rate": 4.357539332843196e-05, + "loss": 0.0549, "step": 10870 }, { - "epoch": 0.7118089630356559, - "grad_norm": 0.884412407875061, - "learning_rate": 9.99702108600636e-05, - "loss": 0.1246, + "epoch": 2.846843310435067, + "grad_norm": 0.6540241837501526, + "learning_rate": 4.356155417850109e-05, + "loss": 0.0569, "step": 10880 }, { - "epoch": 0.7124631992149166, - "grad_norm": 0.8281691074371338, - "learning_rate": 9.996989297384029e-05, - "loss": 0.1097, + "epoch": 2.8494602551521098, + "grad_norm": 0.5406968593597412, + "learning_rate": 4.354770234192296e-05, + "loss": 0.0527, "step": 10890 }, { - "epoch": 0.7131174353941773, - "grad_norm": 1.054056167602539, - "learning_rate": 9.996957340100203e-05, - "loss": 0.1206, + "epoch": 2.8520771998691528, + "grad_norm": 0.4803619384765625, + "learning_rate": 4.353383782816517e-05, + "loss": 0.056, "step": 10900 }, { - "epoch": 0.713771671573438, - "grad_norm": 0.8140998482704163, - "learning_rate": 9.996925214155962e-05, - "loss": 0.1207, + "epoch": 2.8546941445861957, + "grad_norm": 0.530622124671936, + "learning_rate": 4.3519960646704e-05, + "loss": 0.0558, "step": 10910 }, { - "epoch": 0.7144259077526988, - "grad_norm": 0.8529103398323059, - "learning_rate": 9.99689291955239e-05, - "loss": 0.1118, + "epoch": 2.8573110893032387, + "grad_norm": 0.40140339732170105, + "learning_rate": 4.350607080702435e-05, + "loss": 0.0552, "step": 10920 }, { - "epoch": 0.7150801439319594, - "grad_norm": 1.020617127418518, - "learning_rate": 9.996860456290576e-05, - "loss": 0.1215, + "epoch": 2.8599280340202813, + "grad_norm": 0.45789000391960144, + "learning_rate": 4.349216831861981e-05, + "loss": 0.0607, "step": 10930 }, { - "epoch": 0.7157343801112201, - "grad_norm": 0.8196330070495605, - "learning_rate": 9.996827824371618e-05, - "loss": 0.1094, + "epoch": 2.8625449787373243, + "grad_norm": 0.6772827506065369, + "learning_rate": 4.347825319099259e-05, + "loss": 0.0627, "step": 10940 }, { - "epoch": 0.7163886162904809, - "grad_norm": 0.9389585256576538, - "learning_rate": 9.996795023796617e-05, - "loss": 0.1175, + "epoch": 2.865161923454367, + "grad_norm": 0.5811320543289185, + "learning_rate": 4.3464325433653566e-05, + "loss": 0.0573, "step": 10950 }, { - "epoch": 0.7170428524697415, - "grad_norm": 0.9360471367835999, - "learning_rate": 9.996762054566679e-05, - "loss": 0.1138, + "epoch": 2.86777886817141, + "grad_norm": 0.5410882830619812, + "learning_rate": 4.34503850561222e-05, + "loss": 0.0563, "step": 10960 }, { - "epoch": 0.7176970886490023, - "grad_norm": 0.9107570052146912, - "learning_rate": 9.996728916682915e-05, - "loss": 0.1124, + "epoch": 2.870395812888453, + "grad_norm": 0.7019317746162415, + "learning_rate": 4.343643206792664e-05, + "loss": 0.0677, "step": 10970 }, { - "epoch": 0.718351324828263, - "grad_norm": 0.8106933236122131, - "learning_rate": 9.996695610146449e-05, - "loss": 0.124, + "epoch": 2.873012757605496, + "grad_norm": 0.5208338499069214, + "learning_rate": 4.3422466478603593e-05, + "loss": 0.0569, "step": 10980 }, { - "epoch": 0.7190055610075237, - "grad_norm": 1.0682817697525024, - "learning_rate": 9.9966621349584e-05, - "loss": 0.1277, + "epoch": 2.8756297023225383, + "grad_norm": 0.49888139963150024, + "learning_rate": 4.340848829769843e-05, + "loss": 0.0571, "step": 10990 }, { - "epoch": 0.7196597971867844, - "grad_norm": 1.0235021114349365, - "learning_rate": 9.9966284911199e-05, - "loss": 0.1229, + "epoch": 2.8782466470395813, + "grad_norm": 0.5914691090583801, + "learning_rate": 4.3394497534765094e-05, + "loss": 0.0604, + "step": 11000 + }, + { + "epoch": 2.8782466470395813, + "eval_loss": 0.06439468686764342, + "eval_runtime": 8.5706, + "eval_samples_per_second": 119.478, + "eval_steps_per_second": 1.867, "step": 11000 }, { - "epoch": 0.7203140333660452, - "grad_norm": 1.0049972534179688, - "learning_rate": 9.996594678632085e-05, - "loss": 0.1182, + "epoch": 2.880863591756624, + "grad_norm": 0.5112311840057373, + "learning_rate": 4.338049419936614e-05, + "loss": 0.0585, "step": 11010 }, { - "epoch": 0.7209682695453059, - "grad_norm": 0.8805012106895447, - "learning_rate": 9.996560697496094e-05, - "loss": 0.116, + "epoch": 2.883480536473667, + "grad_norm": 0.47918373346328735, + "learning_rate": 4.3366478301072723e-05, + "loss": 0.0566, "step": 11020 }, { - "epoch": 0.7216225057245665, - "grad_norm": 0.8662134408950806, - "learning_rate": 9.996526547713077e-05, - "loss": 0.117, + "epoch": 2.88609748119071, + "grad_norm": 0.5077507495880127, + "learning_rate": 4.335244984946457e-05, + "loss": 0.0613, "step": 11030 }, { - "epoch": 0.7222767419038273, - "grad_norm": 1.0079435110092163, - "learning_rate": 9.996492229284185e-05, - "loss": 0.1213, + "epoch": 2.888714425907753, + "grad_norm": 0.5855154991149902, + "learning_rate": 4.333840885413e-05, + "loss": 0.0545, "step": 11040 }, { - "epoch": 0.722930978083088, - "grad_norm": 1.034039855003357, - "learning_rate": 9.996457742210576e-05, - "loss": 0.1116, + "epoch": 2.8913313706247954, + "grad_norm": 0.5740836262702942, + "learning_rate": 4.33243553246659e-05, + "loss": 0.0597, "step": 11050 }, { - "epoch": 0.7235852142623487, - "grad_norm": 0.9001430869102478, - "learning_rate": 9.996423086493414e-05, - "loss": 0.1155, + "epoch": 2.8939483153418384, + "grad_norm": 0.6112914681434631, + "learning_rate": 4.331028927067772e-05, + "loss": 0.0578, "step": 11060 }, { - "epoch": 0.7242394504416094, - "grad_norm": 0.8581667542457581, - "learning_rate": 9.996388262133869e-05, - "loss": 0.1147, + "epoch": 2.8965652600588814, + "grad_norm": 0.48574313521385193, + "learning_rate": 4.329621070177948e-05, + "loss": 0.0555, "step": 11070 }, { - "epoch": 0.7248936866208702, - "grad_norm": 0.9072811007499695, - "learning_rate": 9.996353269133118e-05, - "loss": 0.1164, + "epoch": 2.899182204775924, + "grad_norm": 0.8778485059738159, + "learning_rate": 4.328211962759375e-05, + "loss": 0.0598, "step": 11080 }, { - "epoch": 0.7255479228001308, - "grad_norm": 0.7761887311935425, - "learning_rate": 9.99631810749234e-05, - "loss": 0.1107, + "epoch": 2.901799149492967, + "grad_norm": 0.6654403805732727, + "learning_rate": 4.326801605775165e-05, + "loss": 0.0551, "step": 11090 }, { - "epoch": 0.7262021589793916, - "grad_norm": 1.0262295007705688, - "learning_rate": 9.996282777212723e-05, - "loss": 0.1209, + "epoch": 2.90441609421001, + "grad_norm": 0.4592023491859436, + "learning_rate": 4.325390000189283e-05, + "loss": 0.0535, "step": 11100 }, { - "epoch": 0.7268563951586523, - "grad_norm": 0.9890956282615662, - "learning_rate": 9.996247278295458e-05, - "loss": 0.1215, + "epoch": 2.907033038927053, + "grad_norm": 0.7717892527580261, + "learning_rate": 4.323977146966548e-05, + "loss": 0.0584, "step": 11110 }, { - "epoch": 0.7275106313379129, - "grad_norm": 0.8441636562347412, - "learning_rate": 9.996211610741745e-05, - "loss": 0.1076, + "epoch": 2.9096499836440954, + "grad_norm": 0.7081303596496582, + "learning_rate": 4.322563047072632e-05, + "loss": 0.0559, "step": 11120 }, { - "epoch": 0.7281648675171737, - "grad_norm": 0.9701690673828125, - "learning_rate": 9.996175774552788e-05, - "loss": 0.1305, + "epoch": 2.9122669283611384, + "grad_norm": 0.6562286019325256, + "learning_rate": 4.3211477014740584e-05, + "loss": 0.0654, "step": 11130 }, { - "epoch": 0.7288191036964344, - "grad_norm": 0.8142655491828918, - "learning_rate": 9.996139769729795e-05, - "loss": 0.1164, + "epoch": 2.914883873078181, + "grad_norm": 0.5132052898406982, + "learning_rate": 4.3197311111382045e-05, + "loss": 0.0564, "step": 11140 }, { - "epoch": 0.7294733398756952, - "grad_norm": 1.1642175912857056, - "learning_rate": 9.99610359627398e-05, - "loss": 0.1196, + "epoch": 2.917500817795224, + "grad_norm": 0.522114634513855, + "learning_rate": 4.3183132770332946e-05, + "loss": 0.0488, "step": 11150 }, { - "epoch": 0.7301275760549558, - "grad_norm": 0.864878237247467, - "learning_rate": 9.996067254186568e-05, - "loss": 0.1156, + "epoch": 2.920117762512267, + "grad_norm": 0.3157167434692383, + "learning_rate": 4.3168942001284055e-05, + "loss": 0.0593, "step": 11160 }, { - "epoch": 0.7307818122342166, - "grad_norm": 1.0916894674301147, - "learning_rate": 9.996030743468783e-05, - "loss": 0.1235, + "epoch": 2.92273470722931, + "grad_norm": 0.49267107248306274, + "learning_rate": 4.315473881393463e-05, + "loss": 0.0635, "step": 11170 }, { - "epoch": 0.7314360484134773, - "grad_norm": 0.9421660304069519, - "learning_rate": 9.995994064121859e-05, - "loss": 0.116, + "epoch": 2.9253516519463525, + "grad_norm": 0.5157380104064941, + "learning_rate": 4.3140523217992414e-05, + "loss": 0.0551, "step": 11180 }, { - "epoch": 0.732090284592738, - "grad_norm": 0.8942557573318481, - "learning_rate": 9.995957216147031e-05, - "loss": 0.1136, + "epoch": 2.9279685966633955, + "grad_norm": 0.3554172217845917, + "learning_rate": 4.312629522317363e-05, + "loss": 0.066, "step": 11190 }, { - "epoch": 0.7327445207719987, - "grad_norm": 0.8067488670349121, - "learning_rate": 9.995920199545546e-05, - "loss": 0.112, + "epoch": 2.9305855413804385, + "grad_norm": 0.4762328267097473, + "learning_rate": 4.3112054839202986e-05, + "loss": 0.0577, "step": 11200 }, { - "epoch": 0.7333987569512594, - "grad_norm": 0.9836673140525818, - "learning_rate": 9.99588301431865e-05, - "loss": 0.1128, + "epoch": 2.933202486097481, + "grad_norm": 0.7763202786445618, + "learning_rate": 4.3097802075813655e-05, + "loss": 0.0635, "step": 11210 }, { - "epoch": 0.7340529931305201, - "grad_norm": 0.7709094285964966, - "learning_rate": 9.995845660467602e-05, - "loss": 0.1205, + "epoch": 2.935819430814524, + "grad_norm": 0.6447554230690002, + "learning_rate": 4.308353694274724e-05, + "loss": 0.0549, "step": 11220 }, { - "epoch": 0.7347072293097808, - "grad_norm": 0.7916396856307983, - "learning_rate": 9.99580813799366e-05, - "loss": 0.1201, + "epoch": 2.938436375531567, + "grad_norm": 0.4910326600074768, + "learning_rate": 4.3069259449753853e-05, + "loss": 0.0599, "step": 11230 }, { - "epoch": 0.7353614654890416, - "grad_norm": 1.0031894445419312, - "learning_rate": 9.995770446898092e-05, - "loss": 0.1181, + "epoch": 2.94105332024861, + "grad_norm": 0.8897386193275452, + "learning_rate": 4.305496960659201e-05, + "loss": 0.0593, "step": 11240 }, { - "epoch": 0.7360157016683022, - "grad_norm": 1.0334042310714722, - "learning_rate": 9.995732587182168e-05, - "loss": 0.1191, + "epoch": 2.9436702649656525, + "grad_norm": 0.8142090439796448, + "learning_rate": 4.304066742302869e-05, + "loss": 0.0615, "step": 11250 }, { - "epoch": 0.736669937847563, - "grad_norm": 0.9869486689567566, - "learning_rate": 9.995694558847169e-05, - "loss": 0.1129, + "epoch": 2.9462872096826955, + "grad_norm": 0.49429208040237427, + "learning_rate": 4.3026352908839295e-05, + "loss": 0.0567, "step": 11260 }, { - "epoch": 0.7373241740268237, - "grad_norm": 0.7451587915420532, - "learning_rate": 9.995656361894377e-05, - "loss": 0.1216, + "epoch": 2.948904154399738, + "grad_norm": 0.39475226402282715, + "learning_rate": 4.301202607380768e-05, + "loss": 0.0585, "step": 11270 }, { - "epoch": 0.7379784102060843, - "grad_norm": 0.9288752675056458, - "learning_rate": 9.99561799632508e-05, - "loss": 0.1257, + "epoch": 2.951521099116781, + "grad_norm": 0.7416360378265381, + "learning_rate": 4.2997686927726075e-05, + "loss": 0.0539, "step": 11280 }, { - "epoch": 0.7386326463853451, - "grad_norm": 0.972141683101654, - "learning_rate": 9.995579462140574e-05, - "loss": 0.1124, + "epoch": 2.954138043833824, + "grad_norm": 0.6084833145141602, + "learning_rate": 4.298333548039516e-05, + "loss": 0.0535, "step": 11290 }, { - "epoch": 0.7392868825646058, - "grad_norm": 1.0879883766174316, - "learning_rate": 9.995540759342161e-05, - "loss": 0.1216, + "epoch": 2.956754988550867, + "grad_norm": 0.484015554189682, + "learning_rate": 4.296897174162403e-05, + "loss": 0.0526, "step": 11300 }, { - "epoch": 0.7399411187438666, - "grad_norm": 0.9992396235466003, - "learning_rate": 9.995501887931146e-05, - "loss": 0.117, + "epoch": 2.9593719332679096, + "grad_norm": 0.5022487640380859, + "learning_rate": 4.295459572123014e-05, + "loss": 0.0573, "step": 11310 }, { - "epoch": 0.7405953549231272, - "grad_norm": 0.951146125793457, - "learning_rate": 9.99546284790884e-05, - "loss": 0.106, + "epoch": 2.9619888779849526, + "grad_norm": 0.638661801815033, + "learning_rate": 4.294020742903938e-05, + "loss": 0.0573, "step": 11320 }, { - "epoch": 0.741249591102388, - "grad_norm": 0.9258811473846436, - "learning_rate": 9.995423639276562e-05, - "loss": 0.1253, + "epoch": 2.9646058227019956, + "grad_norm": 0.3423210084438324, + "learning_rate": 4.292580687488601e-05, + "loss": 0.0507, "step": 11330 }, { - "epoch": 0.7419038272816487, - "grad_norm": 0.8994291424751282, - "learning_rate": 9.995384262035637e-05, - "loss": 0.1141, + "epoch": 2.967222767419038, + "grad_norm": 0.6734946370124817, + "learning_rate": 4.2911394068612665e-05, + "loss": 0.0567, "step": 11340 }, { - "epoch": 0.7425580634609094, - "grad_norm": 1.0479780435562134, - "learning_rate": 9.99534471618739e-05, - "loss": 0.1228, + "epoch": 2.969839712136081, + "grad_norm": 0.5600636005401611, + "learning_rate": 4.289696902007038e-05, + "loss": 0.0618, "step": 11350 }, { - "epoch": 0.7432122996401701, - "grad_norm": 0.8784400820732117, - "learning_rate": 9.99530500173316e-05, - "loss": 0.1111, + "epoch": 2.972456656853124, + "grad_norm": 0.6957277059555054, + "learning_rate": 4.288253173911852e-05, + "loss": 0.0641, "step": 11360 }, { - "epoch": 0.7438665358194309, - "grad_norm": 0.9299617409706116, - "learning_rate": 9.995265118674284e-05, - "loss": 0.1127, + "epoch": 2.975073601570167, + "grad_norm": 0.4874526262283325, + "learning_rate": 4.286808223562484e-05, + "loss": 0.0617, "step": 11370 }, { - "epoch": 0.7445207719986915, - "grad_norm": 0.9754354357719421, - "learning_rate": 9.99522506701211e-05, - "loss": 0.1186, + "epoch": 2.9776905462872096, + "grad_norm": 0.6502581834793091, + "learning_rate": 4.285362051946543e-05, + "loss": 0.0573, "step": 11380 }, { - "epoch": 0.7451750081779522, - "grad_norm": 1.0314409732818604, - "learning_rate": 9.99518484674799e-05, - "loss": 0.1145, + "epoch": 2.9803074910042526, + "grad_norm": 0.5507333278656006, + "learning_rate": 4.283914660052476e-05, + "loss": 0.0575, "step": 11390 }, { - "epoch": 0.745829244357213, - "grad_norm": 0.7628955841064453, - "learning_rate": 9.99514445788328e-05, - "loss": 0.1124, + "epoch": 2.982924435721295, + "grad_norm": 0.6331663131713867, + "learning_rate": 4.282466048869559e-05, + "loss": 0.0629, "step": 11400 }, { - "epoch": 0.7464834805364736, - "grad_norm": 0.9075015783309937, - "learning_rate": 9.995103900419348e-05, - "loss": 0.1159, + "epoch": 2.985541380438338, + "grad_norm": 0.4715561866760254, + "learning_rate": 4.2810162193879053e-05, + "loss": 0.059, "step": 11410 }, { - "epoch": 0.7471377167157344, - "grad_norm": 1.0489757061004639, - "learning_rate": 9.995063174357555e-05, - "loss": 0.1226, + "epoch": 2.988158325155381, + "grad_norm": 0.664364755153656, + "learning_rate": 4.279565172598461e-05, + "loss": 0.0556, "step": 11420 }, { - "epoch": 0.7477919528949951, - "grad_norm": 0.9378239512443542, - "learning_rate": 9.995022279699281e-05, - "loss": 0.1209, + "epoch": 2.990775269872424, + "grad_norm": 0.7428573966026306, + "learning_rate": 4.278112909493e-05, + "loss": 0.0562, "step": 11430 }, { - "epoch": 0.7484461890742558, - "grad_norm": 0.8588927388191223, - "learning_rate": 9.994981216445905e-05, - "loss": 0.1204, + "epoch": 2.9933922145894667, + "grad_norm": 0.630793035030365, + "learning_rate": 4.2766594310641326e-05, + "loss": 0.0554, "step": 11440 }, { - "epoch": 0.7491004252535165, - "grad_norm": 0.760901689529419, - "learning_rate": 9.994939984598813e-05, - "loss": 0.111, + "epoch": 2.9960091593065097, + "grad_norm": 0.6804364323616028, + "learning_rate": 4.2752047383052966e-05, + "loss": 0.056, "step": 11450 }, { - "epoch": 0.7497546614327772, - "grad_norm": 0.78853440284729, - "learning_rate": 9.994898584159397e-05, - "loss": 0.114, + "epoch": 2.9986261040235522, + "grad_norm": 0.5913823843002319, + "learning_rate": 4.273748832210761e-05, + "loss": 0.0594, "step": 11460 }, { - "epoch": 0.750408897612038, - "grad_norm": 0.9857150316238403, - "learning_rate": 9.994857015129056e-05, - "loss": 0.1131, + "epoch": 3.0010467778868173, + "grad_norm": 0.6175287961959839, + "learning_rate": 4.2722917137756245e-05, + "loss": 0.0534, "step": 11470 }, { - "epoch": 0.7510631337912986, - "grad_norm": 0.8129025101661682, - "learning_rate": 9.994815277509188e-05, - "loss": 0.1038, + "epoch": 3.00366372260386, + "grad_norm": 0.5993890762329102, + "learning_rate": 4.270833383995814e-05, + "loss": 0.0568, "step": 11480 }, { - "epoch": 0.7517173699705594, - "grad_norm": 0.8603448867797852, - "learning_rate": 9.994773371301207e-05, - "loss": 0.119, + "epoch": 3.006280667320903, + "grad_norm": 0.5144372582435608, + "learning_rate": 4.269373843868083e-05, + "loss": 0.0542, "step": 11490 }, { - "epoch": 0.7523716061498201, - "grad_norm": 0.797516942024231, - "learning_rate": 9.994731296506525e-05, - "loss": 0.1257, + "epoch": 3.008897612037946, + "grad_norm": 0.5794318914413452, + "learning_rate": 4.267913094390013e-05, + "loss": 0.0562, "step": 11500 }, { - "epoch": 0.7530258423290808, - "grad_norm": 0.7435304522514343, - "learning_rate": 9.994689053126564e-05, - "loss": 0.1089, + "epoch": 3.0115145567549884, + "grad_norm": 0.7506561875343323, + "learning_rate": 4.266451136560014e-05, + "loss": 0.0594, "step": 11510 }, { - "epoch": 0.7536800785083415, - "grad_norm": 0.8212321996688843, - "learning_rate": 9.994646641162745e-05, - "loss": 0.1115, + "epoch": 3.0141315014720313, + "grad_norm": 0.5357182621955872, + "learning_rate": 4.26498797137732e-05, + "loss": 0.0607, "step": 11520 }, { - "epoch": 0.7543343146876023, - "grad_norm": 0.8718414306640625, - "learning_rate": 9.994604060616506e-05, - "loss": 0.1215, + "epoch": 3.0167484461890743, + "grad_norm": 0.669437050819397, + "learning_rate": 4.26352359984199e-05, + "loss": 0.0658, "step": 11530 }, { - "epoch": 0.7549885508668629, - "grad_norm": 0.8560036420822144, - "learning_rate": 9.99456131148928e-05, - "loss": 0.1098, + "epoch": 3.019365390906117, + "grad_norm": 0.8581656813621521, + "learning_rate": 4.262058022954909e-05, + "loss": 0.0651, "step": 11540 }, { - "epoch": 0.7556427870461236, - "grad_norm": 1.0013700723648071, - "learning_rate": 9.994518393782513e-05, - "loss": 0.1199, + "epoch": 3.02198233562316, + "grad_norm": 0.49288684129714966, + "learning_rate": 4.2605912417177846e-05, + "loss": 0.0598, "step": 11550 }, { - "epoch": 0.7562970232253844, - "grad_norm": 0.8606430292129517, - "learning_rate": 9.994475307497649e-05, - "loss": 0.1022, + "epoch": 3.024599280340203, + "grad_norm": 0.5953378081321716, + "learning_rate": 4.2591232571331476e-05, + "loss": 0.0667, "step": 11560 }, { - "epoch": 0.756951259404645, - "grad_norm": 0.9482471942901611, - "learning_rate": 9.994432052636145e-05, - "loss": 0.1203, + "epoch": 3.027216225057246, + "grad_norm": 0.6922196745872498, + "learning_rate": 4.2576540702043516e-05, + "loss": 0.0542, "step": 11570 }, { - "epoch": 0.7576054955839058, - "grad_norm": 1.0482877492904663, - "learning_rate": 9.994388629199463e-05, - "loss": 0.1214, + "epoch": 3.0298331697742884, + "grad_norm": 0.6142755150794983, + "learning_rate": 4.256183681935573e-05, + "loss": 0.0606, "step": 11580 }, { - "epoch": 0.7582597317631665, - "grad_norm": 0.8290185928344727, - "learning_rate": 9.994345037189063e-05, - "loss": 0.1192, + "epoch": 3.0324501144913314, + "grad_norm": 0.6295666694641113, + "learning_rate": 4.254712093331807e-05, + "loss": 0.0582, "step": 11590 }, { - "epoch": 0.7589139679424273, - "grad_norm": 0.8499178886413574, - "learning_rate": 9.994301276606424e-05, - "loss": 0.1183, + "epoch": 3.0350670592083744, + "grad_norm": 0.6455519199371338, + "learning_rate": 4.2532393053988715e-05, + "loss": 0.0546, "step": 11600 }, { - "epoch": 0.7595682041216879, - "grad_norm": 0.9083896279335022, - "learning_rate": 9.994257347453015e-05, - "loss": 0.1098, + "epoch": 3.037684003925417, + "grad_norm": 0.5940701365470886, + "learning_rate": 4.2517653191434026e-05, + "loss": 0.0621, "step": 11610 }, { - "epoch": 0.7602224403009487, - "grad_norm": 0.9857131242752075, - "learning_rate": 9.994213249730325e-05, - "loss": 0.1212, + "epoch": 3.04030094864246, + "grad_norm": 0.5957797765731812, + "learning_rate": 4.250290135572856e-05, + "loss": 0.0633, "step": 11620 }, { - "epoch": 0.7608766764802094, - "grad_norm": 0.930366575717926, - "learning_rate": 9.99416898343984e-05, - "loss": 0.1173, + "epoch": 3.042917893359503, + "grad_norm": 0.794637143611908, + "learning_rate": 4.248813755695507e-05, + "loss": 0.0607, "step": 11630 }, { - "epoch": 0.76153091265947, - "grad_norm": 0.9344758987426758, - "learning_rate": 9.994124548583053e-05, - "loss": 0.1246, + "epoch": 3.0455348380765455, + "grad_norm": 0.8148157596588135, + "learning_rate": 4.2473361805204453e-05, + "loss": 0.0671, "step": 11640 }, { - "epoch": 0.7621851488387308, - "grad_norm": 0.8181567192077637, - "learning_rate": 9.994079945161466e-05, - "loss": 0.1207, + "epoch": 3.0481517827935884, + "grad_norm": 0.6912362575531006, + "learning_rate": 4.245857411057581e-05, + "loss": 0.0609, "step": 11650 }, { - "epoch": 0.7628393850179915, - "grad_norm": 0.7763569951057434, - "learning_rate": 9.994035173176582e-05, - "loss": 0.1251, + "epoch": 3.0507687275106314, + "grad_norm": 0.5286144018173218, + "learning_rate": 4.244377448317638e-05, + "loss": 0.058, "step": 11660 }, { - "epoch": 0.7634936211972522, - "grad_norm": 1.0689425468444824, - "learning_rate": 9.993990232629915e-05, - "loss": 0.136, + "epoch": 3.053385672227674, + "grad_norm": 0.5043061375617981, + "learning_rate": 4.242896293312159e-05, + "loss": 0.0573, "step": 11670 }, { - "epoch": 0.7641478573765129, - "grad_norm": 0.9611626267433167, - "learning_rate": 9.993945123522978e-05, - "loss": 0.1179, + "epoch": 3.056002616944717, + "grad_norm": 0.49938568472862244, + "learning_rate": 4.2414139470534965e-05, + "loss": 0.0585, "step": 11680 }, { - "epoch": 0.7648020935557737, - "grad_norm": 0.9553081393241882, - "learning_rate": 9.9938998458573e-05, - "loss": 0.1153, + "epoch": 3.05861956166176, + "grad_norm": 0.5215581655502319, + "learning_rate": 4.239930410554823e-05, + "loss": 0.0549, "step": 11690 }, { - "epoch": 0.7654563297350343, - "grad_norm": 0.9695801138877869, - "learning_rate": 9.993854399634402e-05, - "loss": 0.1305, + "epoch": 3.061236506378803, + "grad_norm": 0.8619711995124817, + "learning_rate": 4.238445684830119e-05, + "loss": 0.0628, "step": 11700 }, { - "epoch": 0.766110565914295, - "grad_norm": 1.1359174251556396, - "learning_rate": 9.993808784855823e-05, - "loss": 0.1135, + "epoch": 3.0638534510958455, + "grad_norm": 0.4214211106300354, + "learning_rate": 4.236959770894183e-05, + "loss": 0.0566, "step": 11710 }, { - "epoch": 0.7667648020935558, - "grad_norm": 0.8524360060691833, - "learning_rate": 9.9937630015231e-05, - "loss": 0.1204, + "epoch": 3.0664703958128885, + "grad_norm": 0.7512931823730469, + "learning_rate": 4.235472669762622e-05, + "loss": 0.0522, "step": 11720 }, { - "epoch": 0.7674190382728164, - "grad_norm": 1.0540283918380737, - "learning_rate": 9.993717049637779e-05, - "loss": 0.1255, + "epoch": 3.0690873405299315, + "grad_norm": 0.6468842029571533, + "learning_rate": 4.233984382451856e-05, + "loss": 0.0599, "step": 11730 }, { - "epoch": 0.7680732744520772, - "grad_norm": 0.8269158601760864, - "learning_rate": 9.99367092920141e-05, - "loss": 0.1145, + "epoch": 3.071704285246974, + "grad_norm": 0.719258725643158, + "learning_rate": 4.232494909979115e-05, + "loss": 0.0584, "step": 11740 }, { - "epoch": 0.7687275106313379, - "grad_norm": 0.9928538203239441, - "learning_rate": 9.993624640215552e-05, - "loss": 0.1245, + "epoch": 3.074321229964017, + "grad_norm": 0.9053614139556885, + "learning_rate": 4.2310042533624395e-05, + "loss": 0.0646, "step": 11750 }, { - "epoch": 0.7693817468105987, - "grad_norm": 0.9222886562347412, - "learning_rate": 9.993578182681767e-05, - "loss": 0.1098, + "epoch": 3.07693817468106, + "grad_norm": 0.8233187794685364, + "learning_rate": 4.2295124136206794e-05, + "loss": 0.0572, "step": 11760 }, { - "epoch": 0.7700359829898593, - "grad_norm": 1.1276291608810425, - "learning_rate": 9.993531556601621e-05, - "loss": 0.1114, + "epoch": 3.0795551193981026, + "grad_norm": 0.6152809262275696, + "learning_rate": 4.2280193917734926e-05, + "loss": 0.058, "step": 11770 }, { - "epoch": 0.7706902191691201, - "grad_norm": 0.9698314070701599, - "learning_rate": 9.993484761976688e-05, - "loss": 0.1103, + "epoch": 3.0821720641151455, + "grad_norm": 0.5993651747703552, + "learning_rate": 4.226525188841346e-05, + "loss": 0.0512, "step": 11780 }, { - "epoch": 0.7713444553483808, - "grad_norm": 1.0448344945907593, - "learning_rate": 9.993437798808549e-05, - "loss": 0.1034, + "epoch": 3.0847890088321885, + "grad_norm": 0.4626370370388031, + "learning_rate": 4.225029805845513e-05, + "loss": 0.0548, "step": 11790 }, { - "epoch": 0.7719986915276414, - "grad_norm": 0.7689130902290344, - "learning_rate": 9.99339066709879e-05, - "loss": 0.112, + "epoch": 3.087405953549231, + "grad_norm": 0.6150082945823669, + "learning_rate": 4.223533243808073e-05, + "loss": 0.064, "step": 11800 }, { - "epoch": 0.7726529277069022, - "grad_norm": 0.8706762790679932, - "learning_rate": 9.993343366849e-05, - "loss": 0.1238, + "epoch": 3.090022898266274, + "grad_norm": 0.3250552713871002, + "learning_rate": 4.222035503751913e-05, + "loss": 0.0595, "step": 11810 }, { - "epoch": 0.7733071638861629, - "grad_norm": 0.9579262733459473, - "learning_rate": 9.993295898060775e-05, - "loss": 0.1092, + "epoch": 3.092639842983317, + "grad_norm": 0.6475739479064941, + "learning_rate": 4.220536586700724e-05, + "loss": 0.0565, "step": 11820 }, { - "epoch": 0.7739614000654236, - "grad_norm": 0.9516409039497375, - "learning_rate": 9.993248260735717e-05, - "loss": 0.1148, + "epoch": 3.09525678770036, + "grad_norm": 0.5960320234298706, + "learning_rate": 4.219036493679003e-05, + "loss": 0.0668, "step": 11830 }, { - "epoch": 0.7746156362446843, - "grad_norm": 0.7520780563354492, - "learning_rate": 9.993200454875436e-05, - "loss": 0.1155, + "epoch": 3.0978737324174026, + "grad_norm": 0.8178985118865967, + "learning_rate": 4.217535225712047e-05, + "loss": 0.0617, "step": 11840 }, { - "epoch": 0.7752698724239451, - "grad_norm": 0.8823644518852234, - "learning_rate": 9.993152480481545e-05, - "loss": 0.1093, + "epoch": 3.1004906771344456, + "grad_norm": 0.4452805519104004, + "learning_rate": 4.2160327838259594e-05, + "loss": 0.0649, "step": 11850 }, { - "epoch": 0.7759241086032057, - "grad_norm": 1.0025229454040527, - "learning_rate": 9.993104337555663e-05, - "loss": 0.1207, + "epoch": 3.1031076218514886, + "grad_norm": 0.5689908266067505, + "learning_rate": 4.214529169047646e-05, + "loss": 0.0508, "step": 11860 }, { - "epoch": 0.7765783447824665, - "grad_norm": 0.9853441715240479, - "learning_rate": 9.993056026099415e-05, - "loss": 0.1264, + "epoch": 3.105724566568531, + "grad_norm": 0.4972374737262726, + "learning_rate": 4.213024382404812e-05, + "loss": 0.0585, "step": 11870 }, { - "epoch": 0.7772325809617272, - "grad_norm": 0.8879414200782776, - "learning_rate": 9.99300754611443e-05, - "loss": 0.1252, + "epoch": 3.108341511285574, + "grad_norm": 0.3974771201610565, + "learning_rate": 4.211518424925966e-05, + "loss": 0.0584, "step": 11880 }, { - "epoch": 0.777886817140988, - "grad_norm": 0.8338192105293274, - "learning_rate": 9.992958897602344e-05, - "loss": 0.1187, + "epoch": 3.110958456002617, + "grad_norm": 0.6321877241134644, + "learning_rate": 4.210011297640415e-05, + "loss": 0.0571, "step": 11890 }, { - "epoch": 0.7785410533202486, - "grad_norm": 0.872117280960083, - "learning_rate": 9.992910080564803e-05, - "loss": 0.1105, + "epoch": 3.1135754007196597, + "grad_norm": 0.7349003553390503, + "learning_rate": 4.208503001578266e-05, + "loss": 0.0628, "step": 11900 }, { - "epoch": 0.7791952894995093, - "grad_norm": 0.9396425485610962, - "learning_rate": 9.992861095003454e-05, - "loss": 0.1131, + "epoch": 3.1161923454367026, + "grad_norm": 0.6462802886962891, + "learning_rate": 4.206993537770426e-05, + "loss": 0.0591, "step": 11910 }, { - "epoch": 0.7798495256787701, - "grad_norm": 1.1532872915267944, - "learning_rate": 9.992811940919946e-05, - "loss": 0.1247, + "epoch": 3.1188092901537456, + "grad_norm": 0.6877278685569763, + "learning_rate": 4.2054829072486e-05, + "loss": 0.0595, "step": 11920 }, { - "epoch": 0.7805037618580307, - "grad_norm": 1.065436601638794, - "learning_rate": 9.992762618315942e-05, - "loss": 0.127, + "epoch": 3.121426234870788, + "grad_norm": 0.6100813746452332, + "learning_rate": 4.2039711110452866e-05, + "loss": 0.0656, "step": 11930 }, { - "epoch": 0.7811579980372915, - "grad_norm": 0.9578902721405029, - "learning_rate": 9.992713127193106e-05, - "loss": 0.1106, + "epoch": 3.124043179587831, + "grad_norm": 0.4432187080383301, + "learning_rate": 4.202458150193788e-05, + "loss": 0.065, "step": 11940 }, { - "epoch": 0.7818122342165522, - "grad_norm": 0.8533861637115479, - "learning_rate": 9.992663467553108e-05, - "loss": 0.1244, + "epoch": 3.126660124304874, + "grad_norm": 0.42920616269111633, + "learning_rate": 4.2009440257281956e-05, + "loss": 0.062, "step": 11950 }, { - "epoch": 0.7824664703958129, - "grad_norm": 0.9745835661888123, - "learning_rate": 9.992613639397624e-05, - "loss": 0.1104, + "epoch": 3.1292770690219167, + "grad_norm": 0.5105937123298645, + "learning_rate": 4.1994287386834014e-05, + "loss": 0.0533, "step": 11960 }, { - "epoch": 0.7831207065750736, - "grad_norm": 0.8613539338111877, - "learning_rate": 9.992563642728335e-05, - "loss": 0.1044, + "epoch": 3.1318940137389597, + "grad_norm": 0.42014339566230774, + "learning_rate": 4.197912290095089e-05, + "loss": 0.0605, "step": 11970 }, { - "epoch": 0.7837749427543343, - "grad_norm": 1.0119600296020508, - "learning_rate": 9.992513477546931e-05, - "loss": 0.1181, + "epoch": 3.1345109584560027, + "grad_norm": 0.45812082290649414, + "learning_rate": 4.1963946809997366e-05, + "loss": 0.0551, "step": 11980 }, { - "epoch": 0.784429178933595, - "grad_norm": 0.856393039226532, - "learning_rate": 9.992463143855102e-05, - "loss": 0.1172, + "epoch": 3.1371279031730452, + "grad_norm": 0.423980712890625, + "learning_rate": 4.194875912434615e-05, + "loss": 0.054, "step": 11990 }, { - "epoch": 0.7850834151128557, - "grad_norm": 0.8078387379646301, - "learning_rate": 9.992412641654551e-05, - "loss": 0.1332, + "epoch": 3.1397448478900882, + "grad_norm": 0.47686734795570374, + "learning_rate": 4.1933559854377904e-05, + "loss": 0.0623, + "step": 12000 + }, + { + "epoch": 3.1397448478900882, + "eval_loss": 0.06390022339072847, + "eval_runtime": 8.8627, + "eval_samples_per_second": 115.541, + "eval_steps_per_second": 1.805, "step": 12000 }, { - "epoch": 0.7857376512921165, - "grad_norm": 0.818047046661377, - "learning_rate": 9.99236197094698e-05, - "loss": 0.1156, + "epoch": 3.142361792607131, + "grad_norm": 0.8884722590446472, + "learning_rate": 4.191834901048116e-05, + "loss": 0.0578, "step": 12010 }, { - "epoch": 0.7863918874713771, - "grad_norm": 0.8972421288490295, - "learning_rate": 9.992311131734098e-05, - "loss": 0.1162, + "epoch": 3.144978737324174, + "grad_norm": 0.4640854299068451, + "learning_rate": 4.19031266030524e-05, + "loss": 0.0563, "step": 12020 }, { - "epoch": 0.7870461236506379, - "grad_norm": 0.9026692509651184, - "learning_rate": 9.992260124017623e-05, - "loss": 0.1239, + "epoch": 3.1475956820412168, + "grad_norm": 0.5811957120895386, + "learning_rate": 4.1887892642496e-05, + "loss": 0.0567, "step": 12030 }, { - "epoch": 0.7877003598298986, - "grad_norm": 0.8012493252754211, - "learning_rate": 9.992208947799276e-05, - "loss": 0.1224, + "epoch": 3.1502126267582597, + "grad_norm": 0.6208037734031677, + "learning_rate": 4.1872647139224215e-05, + "loss": 0.0544, "step": 12040 }, { - "epoch": 0.7883545960091594, - "grad_norm": 0.7508116960525513, - "learning_rate": 9.992157603080785e-05, - "loss": 0.1145, + "epoch": 3.1528295714753027, + "grad_norm": 0.6058433055877686, + "learning_rate": 4.185739010365721e-05, + "loss": 0.0579, "step": 12050 }, { - "epoch": 0.78900883218842, - "grad_norm": 0.9431008100509644, - "learning_rate": 9.992106089863883e-05, - "loss": 0.1209, + "epoch": 3.1554465161923453, + "grad_norm": 0.7506754994392395, + "learning_rate": 4.1842121546223034e-05, + "loss": 0.0568, "step": 12060 }, { - "epoch": 0.7896630683676807, - "grad_norm": 0.9398839473724365, - "learning_rate": 9.992054408150307e-05, - "loss": 0.1195, + "epoch": 3.1580634609093883, + "grad_norm": 0.5643253326416016, + "learning_rate": 4.1826841477357584e-05, + "loss": 0.0547, "step": 12070 }, { - "epoch": 0.7903173045469415, - "grad_norm": 0.8729955554008484, - "learning_rate": 9.992002557941804e-05, - "loss": 0.1151, + "epoch": 3.1606804056264313, + "grad_norm": 0.5418350696563721, + "learning_rate": 4.1811549907504654e-05, + "loss": 0.0557, "step": 12080 }, { - "epoch": 0.7909715407262021, - "grad_norm": 0.8589168787002563, - "learning_rate": 9.991950539240122e-05, - "loss": 0.1228, + "epoch": 3.163297350343474, + "grad_norm": 0.8077098727226257, + "learning_rate": 4.1796246847115886e-05, + "loss": 0.0558, "step": 12090 }, { - "epoch": 0.7916257769054629, - "grad_norm": 0.7919473052024841, - "learning_rate": 9.991898352047016e-05, - "loss": 0.1113, + "epoch": 3.165914295060517, + "grad_norm": 0.42577677965164185, + "learning_rate": 4.1780932306650775e-05, + "loss": 0.0592, "step": 12100 }, { - "epoch": 0.7922800130847236, - "grad_norm": 0.8259612321853638, - "learning_rate": 9.99184599636425e-05, - "loss": 0.1143, + "epoch": 3.16853123977756, + "grad_norm": 0.7265817523002625, + "learning_rate": 4.176560629657667e-05, + "loss": 0.0602, "step": 12110 }, { - "epoch": 0.7929342492639843, - "grad_norm": 0.9801688194274902, - "learning_rate": 9.99179347219359e-05, - "loss": 0.1148, + "epoch": 3.1711481844946023, + "grad_norm": 0.7095639705657959, + "learning_rate": 4.175026882736876e-05, + "loss": 0.0566, "step": 12120 }, { - "epoch": 0.793588485443245, - "grad_norm": 0.9544298648834229, - "learning_rate": 9.991740779536808e-05, - "loss": 0.1276, + "epoch": 3.1737651292116453, + "grad_norm": 0.4474140703678131, + "learning_rate": 4.173491990951003e-05, + "loss": 0.0559, "step": 12130 }, { - "epoch": 0.7942427216225058, - "grad_norm": 0.946922242641449, - "learning_rate": 9.991687918395686e-05, - "loss": 0.1174, + "epoch": 3.1763820739286883, + "grad_norm": 0.4180569648742676, + "learning_rate": 4.1719559553491356e-05, + "loss": 0.0519, "step": 12140 }, { - "epoch": 0.7948969578017664, - "grad_norm": 1.27000892162323, - "learning_rate": 9.991634888772003e-05, - "loss": 0.1163, + "epoch": 3.1789990186457313, + "grad_norm": 0.5319985151290894, + "learning_rate": 4.170418776981139e-05, + "loss": 0.0552, "step": 12150 }, { - "epoch": 0.7955511939810271, - "grad_norm": 0.7371228933334351, - "learning_rate": 9.991581690667553e-05, - "loss": 0.117, + "epoch": 3.181615963362774, + "grad_norm": 0.4087623953819275, + "learning_rate": 4.168880456897658e-05, + "loss": 0.057, "step": 12160 }, { - "epoch": 0.7962054301602879, - "grad_norm": 0.8912835717201233, - "learning_rate": 9.99152832408413e-05, - "loss": 0.1134, + "epoch": 3.184232908079817, + "grad_norm": 0.7610423564910889, + "learning_rate": 4.167340996150122e-05, + "loss": 0.0563, "step": 12170 }, { - "epoch": 0.7968596663395485, - "grad_norm": 0.8078799843788147, - "learning_rate": 9.991474789023534e-05, - "loss": 0.1217, + "epoch": 3.18684985279686, + "grad_norm": 0.40939995646476746, + "learning_rate": 4.165800395790737e-05, + "loss": 0.0565, "step": 12180 }, { - "epoch": 0.7975139025188093, - "grad_norm": 0.8640681505203247, - "learning_rate": 9.991421085487573e-05, - "loss": 0.1029, + "epoch": 3.1894667975139024, + "grad_norm": 0.6325905919075012, + "learning_rate": 4.164258656872489e-05, + "loss": 0.0521, "step": 12190 }, { - "epoch": 0.79816813869807, - "grad_norm": 1.075920820236206, - "learning_rate": 9.991367213478062e-05, - "loss": 0.1155, + "epoch": 3.1920837422309454, + "grad_norm": 0.5581347942352295, + "learning_rate": 4.162715780449143e-05, + "loss": 0.0606, "step": 12200 }, { - "epoch": 0.7988223748773308, - "grad_norm": 1.051973581314087, - "learning_rate": 9.991313172996815e-05, - "loss": 0.1233, + "epoch": 3.1947006869479884, + "grad_norm": 0.5394080281257629, + "learning_rate": 4.161171767575239e-05, + "loss": 0.0567, "step": 12210 }, { - "epoch": 0.7994766110565914, - "grad_norm": 0.9649173617362976, - "learning_rate": 9.991258964045659e-05, - "loss": 0.1247, + "epoch": 3.197317631665031, + "grad_norm": 0.47330865263938904, + "learning_rate": 4.1596266193060954e-05, + "loss": 0.0548, "step": 12220 }, { - "epoch": 0.8001308472358521, - "grad_norm": 0.9157887697219849, - "learning_rate": 9.991204586626424e-05, - "loss": 0.1288, + "epoch": 3.199934576382074, + "grad_norm": 0.5119001269340515, + "learning_rate": 4.158080336697807e-05, + "loss": 0.0584, "step": 12230 }, { - "epoch": 0.8007850834151129, - "grad_norm": 0.8497806787490845, - "learning_rate": 9.991150040740944e-05, - "loss": 0.1091, + "epoch": 3.202551521099117, + "grad_norm": 0.4136107265949249, + "learning_rate": 4.1565329208072437e-05, + "loss": 0.0589, "step": 12240 }, { - "epoch": 0.8014393195943735, - "grad_norm": 1.007943034172058, - "learning_rate": 9.99109532639106e-05, - "loss": 0.1104, + "epoch": 3.2051684658161594, + "grad_norm": 0.8875516057014465, + "learning_rate": 4.154984372692048e-05, + "loss": 0.0514, "step": 12250 }, { - "epoch": 0.8020935557736343, - "grad_norm": 0.8589887022972107, - "learning_rate": 9.991040443578618e-05, - "loss": 0.1219, + "epoch": 3.2077854105332024, + "grad_norm": 0.559062659740448, + "learning_rate": 4.153434693410641e-05, + "loss": 0.0556, "step": 12260 }, { - "epoch": 0.802747791952895, - "grad_norm": 1.0197535753250122, - "learning_rate": 9.990985392305473e-05, - "loss": 0.1156, + "epoch": 3.2104023552502454, + "grad_norm": 0.6495611667633057, + "learning_rate": 4.15188388402221e-05, + "loss": 0.0607, "step": 12270 }, { - "epoch": 0.8034020281321557, - "grad_norm": 0.8951653242111206, - "learning_rate": 9.99093017257348e-05, - "loss": 0.111, + "epoch": 3.2130192999672884, + "grad_norm": 0.6440935730934143, + "learning_rate": 4.1503319455867215e-05, + "loss": 0.0556, "step": 12280 }, { - "epoch": 0.8040562643114164, - "grad_norm": 1.0193089246749878, - "learning_rate": 9.990874784384506e-05, - "loss": 0.1139, + "epoch": 3.215636244684331, + "grad_norm": 0.4232413172721863, + "learning_rate": 4.148778879164911e-05, + "loss": 0.0505, "step": 12290 }, { - "epoch": 0.8047105004906772, - "grad_norm": 0.8544113636016846, - "learning_rate": 9.990819227740418e-05, - "loss": 0.1092, + "epoch": 3.218253189401374, + "grad_norm": 0.7083742022514343, + "learning_rate": 4.147224685818282e-05, + "loss": 0.0592, "step": 12300 }, { - "epoch": 0.8053647366699378, - "grad_norm": 0.9851740598678589, - "learning_rate": 9.990763502643094e-05, - "loss": 0.1163, + "epoch": 3.220870134118417, + "grad_norm": 0.591503381729126, + "learning_rate": 4.145669366609113e-05, + "loss": 0.0577, "step": 12310 }, { - "epoch": 0.8060189728491985, - "grad_norm": 1.2490851879119873, - "learning_rate": 9.990707609094412e-05, - "loss": 0.1177, + "epoch": 3.2234870788354595, + "grad_norm": 0.5156564116477966, + "learning_rate": 4.14411292260045e-05, + "loss": 0.057, "step": 12320 }, { - "epoch": 0.8066732090284593, - "grad_norm": 0.8917890787124634, - "learning_rate": 9.990651547096259e-05, - "loss": 0.1218, + "epoch": 3.2261040235525025, + "grad_norm": 0.5001283884048462, + "learning_rate": 4.142555354856107e-05, + "loss": 0.0503, "step": 12330 }, { - "epoch": 0.80732744520772, - "grad_norm": 1.0979591608047485, - "learning_rate": 9.990595316650528e-05, - "loss": 0.114, + "epoch": 3.2287209682695455, + "grad_norm": 0.5152615308761597, + "learning_rate": 4.1409966644406686e-05, + "loss": 0.0617, "step": 12340 }, { - "epoch": 0.8079816813869807, - "grad_norm": 0.89876788854599, - "learning_rate": 9.990538917759117e-05, - "loss": 0.1308, + "epoch": 3.231337912986588, + "grad_norm": 0.7098157405853271, + "learning_rate": 4.139436852419482e-05, + "loss": 0.0605, "step": 12350 }, { - "epoch": 0.8086359175662414, - "grad_norm": 0.9439699053764343, - "learning_rate": 9.990482350423929e-05, - "loss": 0.1131, + "epoch": 3.233954857703631, + "grad_norm": 0.7927548289299011, + "learning_rate": 4.137875919858667e-05, + "loss": 0.0512, "step": 12360 }, { - "epoch": 0.8092901537455022, - "grad_norm": 0.9236170053482056, - "learning_rate": 9.990425614646874e-05, - "loss": 0.114, + "epoch": 3.236571802420674, + "grad_norm": 0.5227700471878052, + "learning_rate": 4.136313867825104e-05, + "loss": 0.0524, "step": 12370 }, { - "epoch": 0.8099443899247628, - "grad_norm": 1.0061261653900146, - "learning_rate": 9.990368710429866e-05, - "loss": 0.1216, + "epoch": 3.2391887471377165, + "grad_norm": 0.8455647826194763, + "learning_rate": 4.134750697386442e-05, + "loss": 0.0515, "step": 12380 }, { - "epoch": 0.8105986261040236, - "grad_norm": 0.9143770933151245, - "learning_rate": 9.990311637774827e-05, - "loss": 0.1159, + "epoch": 3.2418056918547595, + "grad_norm": 0.597399890422821, + "learning_rate": 4.133186409611094e-05, + "loss": 0.0503, "step": 12390 }, { - "epoch": 0.8112528622832843, - "grad_norm": 0.8878925442695618, - "learning_rate": 9.990254396683683e-05, - "loss": 0.1078, + "epoch": 3.2444226365718025, + "grad_norm": 0.7307800650596619, + "learning_rate": 4.131621005568235e-05, + "loss": 0.0522, "step": 12400 }, { - "epoch": 0.8119070984625449, - "grad_norm": 1.0045684576034546, - "learning_rate": 9.990196987158364e-05, - "loss": 0.1112, + "epoch": 3.247039581288845, + "grad_norm": 0.49984949827194214, + "learning_rate": 4.130054486327803e-05, + "loss": 0.0514, "step": 12410 }, { - "epoch": 0.8125613346418057, - "grad_norm": 0.7909841537475586, - "learning_rate": 9.990139409200812e-05, - "loss": 0.1122, + "epoch": 3.249656526005888, + "grad_norm": 0.5808200836181641, + "learning_rate": 4.1284868529605e-05, + "loss": 0.0512, "step": 12420 }, { - "epoch": 0.8132155708210664, - "grad_norm": 0.8620671629905701, - "learning_rate": 9.990081662812966e-05, - "loss": 0.121, + "epoch": 3.252273470722931, + "grad_norm": 0.739686131477356, + "learning_rate": 4.12691810653779e-05, + "loss": 0.0586, "step": 12430 }, { - "epoch": 0.8138698070003271, - "grad_norm": 0.8675190210342407, - "learning_rate": 9.990023747996777e-05, - "loss": 0.1225, + "epoch": 3.2548904154399736, + "grad_norm": 0.5442174077033997, + "learning_rate": 4.125348248131895e-05, + "loss": 0.0519, "step": 12440 }, { - "epoch": 0.8145240431795878, - "grad_norm": 1.0223960876464844, - "learning_rate": 9.9899656647542e-05, - "loss": 0.1217, + "epoch": 3.2575073601570166, + "grad_norm": 0.6988082528114319, + "learning_rate": 4.123777278815798e-05, + "loss": 0.0534, "step": 12450 }, { - "epoch": 0.8151782793588486, - "grad_norm": 0.850107729434967, - "learning_rate": 9.989907413087196e-05, - "loss": 0.1081, + "epoch": 3.2601243048740596, + "grad_norm": 0.7346671223640442, + "learning_rate": 4.1222051996632415e-05, + "loss": 0.0493, "step": 12460 }, { - "epoch": 0.8158325155381092, - "grad_norm": 0.8945133090019226, - "learning_rate": 9.98984899299773e-05, - "loss": 0.1133, + "epoch": 3.2627412495911026, + "grad_norm": 0.6555301547050476, + "learning_rate": 4.1206320117487285e-05, + "loss": 0.0589, "step": 12470 }, { - "epoch": 0.81648675171737, - "grad_norm": 0.8991668820381165, - "learning_rate": 9.989790404487773e-05, - "loss": 0.1104, + "epoch": 3.265358194308145, + "grad_norm": 0.6752069592475891, + "learning_rate": 4.119057716147517e-05, + "loss": 0.0582, "step": 12480 }, { - "epoch": 0.8171409878966307, - "grad_norm": 0.7645007967948914, - "learning_rate": 9.989731647559304e-05, - "loss": 0.1179, + "epoch": 3.267975139025188, + "grad_norm": 0.5889864563941956, + "learning_rate": 4.117482313935623e-05, + "loss": 0.0545, "step": 12490 }, { - "epoch": 0.8177952240758914, - "grad_norm": 0.7892476916313171, - "learning_rate": 9.989672722214307e-05, - "loss": 0.1097, + "epoch": 3.270592083742231, + "grad_norm": 0.506980836391449, + "learning_rate": 4.1159058061898195e-05, + "loss": 0.0536, "step": 12500 }, { - "epoch": 0.8184494602551521, - "grad_norm": 0.882279098033905, - "learning_rate": 9.989613628454769e-05, - "loss": 0.1278, + "epoch": 3.2732090284592736, + "grad_norm": 0.49957287311553955, + "learning_rate": 4.114328193987634e-05, + "loss": 0.0541, "step": 12510 }, { - "epoch": 0.8191036964344128, - "grad_norm": 1.140187382698059, - "learning_rate": 9.989554366282684e-05, - "loss": 0.1163, + "epoch": 3.2758259731763166, + "grad_norm": 0.39040929079055786, + "learning_rate": 4.112749478407351e-05, + "loss": 0.0503, "step": 12520 }, { - "epoch": 0.8197579326136736, - "grad_norm": 0.8200512528419495, - "learning_rate": 9.989494935700054e-05, - "loss": 0.1209, + "epoch": 3.2784429178933596, + "grad_norm": 0.5606443285942078, + "learning_rate": 4.111169660528007e-05, + "loss": 0.0527, "step": 12530 }, { - "epoch": 0.8204121687929342, - "grad_norm": 0.9112234711647034, - "learning_rate": 9.989435336708886e-05, - "loss": 0.1156, + "epoch": 3.281059862610402, + "grad_norm": 0.3037455677986145, + "learning_rate": 4.109588741429392e-05, + "loss": 0.054, "step": 12540 }, { - "epoch": 0.821066404972195, - "grad_norm": 0.7766229510307312, - "learning_rate": 9.98937556931119e-05, - "loss": 0.1101, + "epoch": 3.283676807327445, + "grad_norm": 0.5493094325065613, + "learning_rate": 4.1080067221920494e-05, + "loss": 0.0492, "step": 12550 }, { - "epoch": 0.8217206411514557, - "grad_norm": 1.0036638975143433, - "learning_rate": 9.989315633508983e-05, - "loss": 0.1233, + "epoch": 3.286293752044488, + "grad_norm": 0.9509132504463196, + "learning_rate": 4.106423603897275e-05, + "loss": 0.0608, "step": 12560 }, { - "epoch": 0.8223748773307163, - "grad_norm": 0.9545724391937256, - "learning_rate": 9.989255529304287e-05, - "loss": 0.1198, + "epoch": 3.2889106967615307, + "grad_norm": 0.6977026462554932, + "learning_rate": 4.1048393876271155e-05, + "loss": 0.0492, "step": 12570 }, { - "epoch": 0.8230291135099771, - "grad_norm": 0.8219319581985474, - "learning_rate": 9.989195256699133e-05, - "loss": 0.1183, + "epoch": 3.2915276414785737, + "grad_norm": 0.5020390748977661, + "learning_rate": 4.1032540744643666e-05, + "loss": 0.0599, "step": 12580 }, { - "epoch": 0.8236833496892378, - "grad_norm": 0.7603002190589905, - "learning_rate": 9.989134815695556e-05, - "loss": 0.1088, + "epoch": 3.2941445861956167, + "grad_norm": 0.8102987408638, + "learning_rate": 4.101667665492576e-05, + "loss": 0.0535, "step": 12590 }, { - "epoch": 0.8243375858684985, - "grad_norm": 0.7731127738952637, - "learning_rate": 9.989074206295592e-05, - "loss": 0.1181, + "epoch": 3.2967615309126597, + "grad_norm": 0.4310251474380493, + "learning_rate": 4.100080161796038e-05, + "loss": 0.0499, "step": 12600 }, { - "epoch": 0.8249918220477592, - "grad_norm": 1.0608632564544678, - "learning_rate": 9.98901342850129e-05, - "loss": 0.1248, + "epoch": 3.299378475629702, + "grad_norm": 0.5380797982215881, + "learning_rate": 4.098491564459799e-05, + "loss": 0.047, "step": 12610 }, { - "epoch": 0.82564605822702, - "grad_norm": 0.9366574287414551, - "learning_rate": 9.988952482314702e-05, - "loss": 0.1197, + "epoch": 3.301995420346745, + "grad_norm": 0.4607020318508148, + "learning_rate": 4.0969018745696476e-05, + "loss": 0.0536, "step": 12620 }, { - "epoch": 0.8263002944062807, - "grad_norm": 0.8143541812896729, - "learning_rate": 9.988891367737882e-05, - "loss": 0.1033, + "epoch": 3.304612365063788, + "grad_norm": 0.6233174204826355, + "learning_rate": 4.095311093212122e-05, + "loss": 0.0569, "step": 12630 }, { - "epoch": 0.8269545305855414, - "grad_norm": 1.157392144203186, - "learning_rate": 9.988830084772896e-05, - "loss": 0.1086, + "epoch": 3.3072293097808307, + "grad_norm": 0.5668174028396606, + "learning_rate": 4.093719221474508e-05, + "loss": 0.0555, "step": 12640 }, { - "epoch": 0.8276087667648021, - "grad_norm": 0.9264316558837891, - "learning_rate": 9.98876863342181e-05, - "loss": 0.11, + "epoch": 3.3098462544978737, + "grad_norm": 0.32410597801208496, + "learning_rate": 4.092126260444834e-05, + "loss": 0.05, "step": 12650 }, { - "epoch": 0.8282630029440629, - "grad_norm": 0.9766997694969177, - "learning_rate": 9.988707013686698e-05, - "loss": 0.11, + "epoch": 3.3124631992149167, + "grad_norm": 0.5236939787864685, + "learning_rate": 4.090532211211874e-05, + "loss": 0.0583, "step": 12660 }, { - "epoch": 0.8289172391233235, - "grad_norm": 0.9637208580970764, - "learning_rate": 9.988645225569643e-05, - "loss": 0.1112, + "epoch": 3.3150801439319593, + "grad_norm": 0.6198887825012207, + "learning_rate": 4.0889370748651446e-05, + "loss": 0.0521, "step": 12670 }, { - "epoch": 0.8295714753025842, - "grad_norm": 0.9603047370910645, - "learning_rate": 9.988583269072727e-05, - "loss": 0.1092, + "epoch": 3.3176970886490023, + "grad_norm": 0.5270038843154907, + "learning_rate": 4.087340852494908e-05, + "loss": 0.0537, "step": 12680 }, { - "epoch": 0.830225711481845, - "grad_norm": 0.8850484490394592, - "learning_rate": 9.988521144198043e-05, - "loss": 0.1286, + "epoch": 3.3203140333660452, + "grad_norm": 0.42099273204803467, + "learning_rate": 4.0857435451921664e-05, + "loss": 0.0558, "step": 12690 }, { - "epoch": 0.8308799476611056, - "grad_norm": 1.1651179790496826, - "learning_rate": 9.988458850947689e-05, - "loss": 0.1156, + "epoch": 3.322930978083088, + "grad_norm": 0.36083897948265076, + "learning_rate": 4.084145154048664e-05, + "loss": 0.0515, "step": 12700 }, { - "epoch": 0.8315341838403664, - "grad_norm": 1.13690185546875, - "learning_rate": 9.988396389323764e-05, - "loss": 0.1165, + "epoch": 3.325547922800131, + "grad_norm": 0.445707231760025, + "learning_rate": 4.082545680156887e-05, + "loss": 0.0537, "step": 12710 }, { - "epoch": 0.8321884200196271, - "grad_norm": 0.8526743054389954, - "learning_rate": 9.988333759328379e-05, - "loss": 0.1144, + "epoch": 3.3281648675171738, + "grad_norm": 0.5661662220954895, + "learning_rate": 4.0809451246100594e-05, + "loss": 0.0615, "step": 12720 }, { - "epoch": 0.8328426561988878, - "grad_norm": 0.8829309344291687, - "learning_rate": 9.988270960963648e-05, - "loss": 0.1118, + "epoch": 3.3307818122342168, + "grad_norm": 0.8061146140098572, + "learning_rate": 4.0793434885021475e-05, + "loss": 0.0618, "step": 12730 }, { - "epoch": 0.8334968923781485, - "grad_norm": 0.8683731555938721, - "learning_rate": 9.988207994231689e-05, - "loss": 0.1123, + "epoch": 3.3333987569512593, + "grad_norm": 0.666907012462616, + "learning_rate": 4.077740772927853e-05, + "loss": 0.0509, "step": 12740 }, { - "epoch": 0.8341511285574092, - "grad_norm": 0.9223302602767944, - "learning_rate": 9.988144859134627e-05, - "loss": 0.1257, + "epoch": 3.3360157016683023, + "grad_norm": 0.46689221262931824, + "learning_rate": 4.0761369789826166e-05, + "loss": 0.0581, "step": 12750 }, { - "epoch": 0.8348053647366699, - "grad_norm": 1.061397910118103, - "learning_rate": 9.988081555674596e-05, - "loss": 0.125, + "epoch": 3.3386326463853453, + "grad_norm": 0.6941690444946289, + "learning_rate": 4.074532107762619e-05, + "loss": 0.0637, "step": 12760 }, { - "epoch": 0.8354596009159306, - "grad_norm": 1.0193485021591187, - "learning_rate": 9.98801808385373e-05, - "loss": 0.1168, + "epoch": 3.341249591102388, + "grad_norm": 0.5134106874465942, + "learning_rate": 4.07292616036477e-05, + "loss": 0.0579, "step": 12770 }, { - "epoch": 0.8361138370951914, - "grad_norm": 0.9661241769790649, - "learning_rate": 9.987954443674173e-05, - "loss": 0.1085, + "epoch": 3.343866535819431, + "grad_norm": 0.43664735555648804, + "learning_rate": 4.071319137886724e-05, + "loss": 0.054, "step": 12780 }, { - "epoch": 0.8367680732744521, - "grad_norm": 0.9959526658058167, - "learning_rate": 9.98789063513807e-05, - "loss": 0.1082, + "epoch": 3.346483480536474, + "grad_norm": 0.5271178483963013, + "learning_rate": 4.0697110414268644e-05, + "loss": 0.0622, "step": 12790 }, { - "epoch": 0.8374223094537128, - "grad_norm": 0.7342641353607178, - "learning_rate": 9.987826658247579e-05, - "loss": 0.1187, + "epoch": 3.3491004252535164, + "grad_norm": 0.536749541759491, + "learning_rate": 4.068101872084309e-05, + "loss": 0.0568, "step": 12800 }, { - "epoch": 0.8380765456329735, - "grad_norm": 0.9340495467185974, - "learning_rate": 9.987762513004856e-05, - "loss": 0.1022, + "epoch": 3.3517173699705594, + "grad_norm": 0.4899667799472809, + "learning_rate": 4.0664916309589093e-05, + "loss": 0.0501, "step": 12810 }, { - "epoch": 0.8387307818122343, - "grad_norm": 0.9463670253753662, - "learning_rate": 9.98769819941207e-05, - "loss": 0.1246, + "epoch": 3.3543343146876023, + "grad_norm": 0.4232407212257385, + "learning_rate": 4.064880319151252e-05, + "loss": 0.0592, "step": 12820 }, { - "epoch": 0.8393850179914949, - "grad_norm": 0.8426292538642883, - "learning_rate": 9.987633717471385e-05, - "loss": 0.1077, + "epoch": 3.356951259404645, + "grad_norm": 0.47795727849006653, + "learning_rate": 4.063267937762652e-05, + "loss": 0.049, "step": 12830 }, { - "epoch": 0.8400392541707556, - "grad_norm": 1.0582283735275269, - "learning_rate": 9.987569067184983e-05, - "loss": 0.1215, + "epoch": 3.359568204121688, + "grad_norm": 0.5101355314254761, + "learning_rate": 4.061654487895158e-05, + "loss": 0.0506, "step": 12840 }, { - "epoch": 0.8406934903500164, - "grad_norm": 0.9140653610229492, - "learning_rate": 9.987504248555047e-05, - "loss": 0.1193, + "epoch": 3.362185148838731, + "grad_norm": 0.4006783068180084, + "learning_rate": 4.060039970651547e-05, + "loss": 0.0562, "step": 12850 }, { - "epoch": 0.841347726529277, - "grad_norm": 0.843436598777771, - "learning_rate": 9.98743926158376e-05, - "loss": 0.1243, + "epoch": 3.364802093555774, + "grad_norm": 0.5601989030838013, + "learning_rate": 4.0584243871353257e-05, + "loss": 0.0571, "step": 12860 }, { - "epoch": 0.8420019627085378, - "grad_norm": 0.8201360702514648, - "learning_rate": 9.987374106273318e-05, - "loss": 0.1106, + "epoch": 3.3674190382728164, + "grad_norm": 0.478302001953125, + "learning_rate": 4.0568077384507306e-05, + "loss": 0.0587, "step": 12870 }, { - "epoch": 0.8426561988877985, - "grad_norm": 1.0431125164031982, - "learning_rate": 9.987308782625919e-05, - "loss": 0.1182, + "epoch": 3.3700359829898594, + "grad_norm": 0.5545663833618164, + "learning_rate": 4.055190025702727e-05, + "loss": 0.0489, "step": 12880 }, { - "epoch": 0.8433104350670592, - "grad_norm": 0.8388211727142334, - "learning_rate": 9.98724329064377e-05, - "loss": 0.1093, + "epoch": 3.372652927706902, + "grad_norm": 0.5649152398109436, + "learning_rate": 4.0535712499970045e-05, + "loss": 0.0519, "step": 12890 }, { - "epoch": 0.8439646712463199, - "grad_norm": 0.8879048228263855, - "learning_rate": 9.987177630329081e-05, - "loss": 0.1004, + "epoch": 3.375269872423945, + "grad_norm": 0.4853672981262207, + "learning_rate": 4.051951412439983e-05, + "loss": 0.0618, "step": 12900 }, { - "epoch": 0.8446189074255807, - "grad_norm": 0.857945442199707, - "learning_rate": 9.987111801684068e-05, - "loss": 0.1107, + "epoch": 3.377886817140988, + "grad_norm": 0.30773958563804626, + "learning_rate": 4.050330514138805e-05, + "loss": 0.0473, "step": 12910 }, { - "epoch": 0.8452731436048413, - "grad_norm": 0.7848942875862122, - "learning_rate": 9.987045804710951e-05, - "loss": 0.1112, + "epoch": 3.380503761858031, + "grad_norm": 0.44944265484809875, + "learning_rate": 4.048708556201338e-05, + "loss": 0.0561, "step": 12920 }, { - "epoch": 0.845927379784102, - "grad_norm": 0.8898484706878662, - "learning_rate": 9.98697963941196e-05, - "loss": 0.114, + "epoch": 3.3831207065750735, + "grad_norm": 0.5097119212150574, + "learning_rate": 4.047085539736177e-05, + "loss": 0.0609, "step": 12930 }, { - "epoch": 0.8465816159633628, - "grad_norm": 1.1552084684371948, - "learning_rate": 9.986913305789328e-05, - "loss": 0.1143, + "epoch": 3.3857376512921165, + "grad_norm": 0.4472697675228119, + "learning_rate": 4.0454614658526384e-05, + "loss": 0.0542, "step": 12940 }, { - "epoch": 0.8472358521426235, - "grad_norm": 0.8362104296684265, - "learning_rate": 9.986846803845291e-05, - "loss": 0.1142, + "epoch": 3.3883545960091594, + "grad_norm": 0.45250511169433594, + "learning_rate": 4.04383633566076e-05, + "loss": 0.0567, "step": 12950 }, { - "epoch": 0.8478900883218842, - "grad_norm": 1.0865123271942139, - "learning_rate": 9.9867801335821e-05, - "loss": 0.1147, + "epoch": 3.390971540726202, + "grad_norm": 0.5949851870536804, + "learning_rate": 4.042210150271304e-05, + "loss": 0.055, "step": 12960 }, { - "epoch": 0.8485443245011449, - "grad_norm": 0.8024462461471558, - "learning_rate": 9.986713295001997e-05, - "loss": 0.1101, + "epoch": 3.393588485443245, + "grad_norm": 0.5202013254165649, + "learning_rate": 4.0405829107957525e-05, + "loss": 0.0502, "step": 12970 }, { - "epoch": 0.8491985606804057, - "grad_norm": 0.960501492023468, - "learning_rate": 9.986646288107243e-05, - "loss": 0.1215, + "epoch": 3.396205430160288, + "grad_norm": 0.3496212065219879, + "learning_rate": 4.038954618346308e-05, + "loss": 0.0538, "step": 12980 }, { - "epoch": 0.8498527968596663, - "grad_norm": 0.7977132797241211, - "learning_rate": 9.9865791129001e-05, - "loss": 0.1175, + "epoch": 3.398822374877331, + "grad_norm": 0.47482752799987793, + "learning_rate": 4.0373252740358936e-05, + "loss": 0.0571, "step": 12990 }, { - "epoch": 0.850507033038927, - "grad_norm": 0.913974404335022, - "learning_rate": 9.986511769382834e-05, - "loss": 0.1115, + "epoch": 3.4014393195943735, + "grad_norm": 0.6373538970947266, + "learning_rate": 4.035694878978151e-05, + "loss": 0.0558, + "step": 13000 + }, + { + "epoch": 3.4014393195943735, + "eval_loss": 0.06179543161421813, + "eval_runtime": 8.9033, + "eval_samples_per_second": 115.014, + "eval_steps_per_second": 1.797, "step": 13000 }, { - "epoch": 0.8511612692181878, - "grad_norm": 0.86736661195755, - "learning_rate": 9.986444257557717e-05, - "loss": 0.1113, + "epoch": 3.4040562643114165, + "grad_norm": 0.40769073367118835, + "learning_rate": 4.034063434287438e-05, + "loss": 0.0486, "step": 13010 }, { - "epoch": 0.8518155053974484, - "grad_norm": 0.8728740811347961, - "learning_rate": 9.98637657742703e-05, - "loss": 0.1113, + "epoch": 3.406673209028459, + "grad_norm": 0.43151283264160156, + "learning_rate": 4.032430941078834e-05, + "loss": 0.0584, "step": 13020 }, { - "epoch": 0.8524697415767092, - "grad_norm": 0.8732495903968811, - "learning_rate": 9.986308728993056e-05, - "loss": 0.1129, + "epoch": 3.409290153745502, + "grad_norm": 0.601318359375, + "learning_rate": 4.030797400468132e-05, + "loss": 0.0467, "step": 13030 }, { - "epoch": 0.8531239777559699, - "grad_norm": 0.9430647492408752, - "learning_rate": 9.986240712258085e-05, - "loss": 0.1065, + "epoch": 3.411907098462545, + "grad_norm": 0.5108250379562378, + "learning_rate": 4.0291628135718404e-05, + "loss": 0.0528, "step": 13040 }, { - "epoch": 0.8537782139352306, - "grad_norm": 0.849603533744812, - "learning_rate": 9.986172527224413e-05, - "loss": 0.1198, + "epoch": 3.414524043179588, + "grad_norm": 0.5723781585693359, + "learning_rate": 4.027527181507186e-05, + "loss": 0.055, "step": 13050 }, { - "epoch": 0.8544324501144913, - "grad_norm": 0.75860595703125, - "learning_rate": 9.986104173894342e-05, - "loss": 0.1063, + "epoch": 3.4171409878966306, + "grad_norm": 0.5672504305839539, + "learning_rate": 4.0258905053921056e-05, + "loss": 0.0553, "step": 13060 }, { - "epoch": 0.8550866862937521, - "grad_norm": 0.916583240032196, - "learning_rate": 9.986035652270178e-05, - "loss": 0.1179, + "epoch": 3.4197579326136736, + "grad_norm": 0.4763956367969513, + "learning_rate": 4.024252786345253e-05, + "loss": 0.0492, "step": 13070 }, { - "epoch": 0.8557409224730128, - "grad_norm": 0.7773154377937317, - "learning_rate": 9.985966962354235e-05, - "loss": 0.1102, + "epoch": 3.4223748773307165, + "grad_norm": 0.5045111775398254, + "learning_rate": 4.022614025485994e-05, + "loss": 0.0606, "step": 13080 }, { - "epoch": 0.8563951586522734, - "grad_norm": 0.7246621251106262, - "learning_rate": 9.985898104148831e-05, - "loss": 0.1108, + "epoch": 3.424991822047759, + "grad_norm": 0.529390811920166, + "learning_rate": 4.020974223934407e-05, + "loss": 0.0565, "step": 13090 }, { - "epoch": 0.8570493948315342, - "grad_norm": 0.7535557746887207, - "learning_rate": 9.985829077656291e-05, - "loss": 0.1099, + "epoch": 3.427608766764802, + "grad_norm": 0.7006625533103943, + "learning_rate": 4.019333382811279e-05, + "loss": 0.0521, "step": 13100 }, { - "epoch": 0.8577036310107949, - "grad_norm": 0.8509471416473389, - "learning_rate": 9.98575988287894e-05, - "loss": 0.0995, + "epoch": 3.430225711481845, + "grad_norm": 0.35650935769081116, + "learning_rate": 4.01769150323811e-05, + "loss": 0.0599, "step": 13110 }, { - "epoch": 0.8583578671900556, - "grad_norm": 0.7079533934593201, - "learning_rate": 9.98569051981912e-05, - "loss": 0.1189, + "epoch": 3.4328426561988876, + "grad_norm": 0.7201257348060608, + "learning_rate": 4.01604858633711e-05, + "loss": 0.0532, "step": 13120 }, { - "epoch": 0.8590121033693163, - "grad_norm": 0.9748969674110413, - "learning_rate": 9.985620988479169e-05, - "loss": 0.1168, + "epoch": 3.4354596009159306, + "grad_norm": 0.5479598641395569, + "learning_rate": 4.014404633231198e-05, + "loss": 0.0567, "step": 13130 }, { - "epoch": 0.8596663395485771, - "grad_norm": 0.9905074834823608, - "learning_rate": 9.985551288861435e-05, - "loss": 0.1155, + "epoch": 3.4380765456329736, + "grad_norm": 0.4715985655784607, + "learning_rate": 4.012759645043997e-05, + "loss": 0.0504, "step": 13140 }, { - "epoch": 0.8603205757278377, - "grad_norm": 0.7859392762184143, - "learning_rate": 9.98548142096827e-05, - "loss": 0.1111, + "epoch": 3.440693490350016, + "grad_norm": 0.4740786552429199, + "learning_rate": 4.011113622899844e-05, + "loss": 0.0545, "step": 13150 }, { - "epoch": 0.8609748119070985, - "grad_norm": 0.7785911560058594, - "learning_rate": 9.985411384802031e-05, - "loss": 0.1159, + "epoch": 3.443310435067059, + "grad_norm": 0.3725115656852722, + "learning_rate": 4.0094665679237786e-05, + "loss": 0.0513, "step": 13160 }, { - "epoch": 0.8616290480863592, - "grad_norm": 1.0877043008804321, - "learning_rate": 9.985341180365084e-05, - "loss": 0.1197, + "epoch": 3.445927379784102, + "grad_norm": 0.5373284220695496, + "learning_rate": 4.007818481241548e-05, + "loss": 0.0552, "step": 13170 }, { - "epoch": 0.8622832842656198, - "grad_norm": 0.8900846838951111, - "learning_rate": 9.985270807659798e-05, - "loss": 0.1181, + "epoch": 3.448544324501145, + "grad_norm": 0.8126721382141113, + "learning_rate": 4.006169363979603e-05, + "loss": 0.059, "step": 13180 }, { - "epoch": 0.8629375204448806, - "grad_norm": 0.7488992810249329, - "learning_rate": 9.985200266688546e-05, - "loss": 0.1058, + "epoch": 3.4511612692181877, + "grad_norm": 0.5969352126121521, + "learning_rate": 4.004519217265099e-05, + "loss": 0.0569, "step": 13190 }, { - "epoch": 0.8635917566241413, - "grad_norm": 0.7488667964935303, - "learning_rate": 9.985129557453713e-05, - "loss": 0.1136, + "epoch": 3.4537782139352307, + "grad_norm": 0.5900636911392212, + "learning_rate": 4.002868042225898e-05, + "loss": 0.051, "step": 13200 }, { - "epoch": 0.864245992803402, - "grad_norm": 1.060916543006897, - "learning_rate": 9.985058679957681e-05, - "loss": 0.1169, + "epoch": 3.4563951586522736, + "grad_norm": 0.663048505783081, + "learning_rate": 4.001215839990561e-05, + "loss": 0.0556, "step": 13210 }, { - "epoch": 0.8649002289826627, - "grad_norm": 1.009759783744812, - "learning_rate": 9.984987634202847e-05, - "loss": 0.1154, + "epoch": 3.459012103369316, + "grad_norm": 1.137934923171997, + "learning_rate": 3.999562611688353e-05, + "loss": 0.0507, "step": 13220 }, { - "epoch": 0.8655544651619235, - "grad_norm": 1.0468559265136719, - "learning_rate": 9.984916420191607e-05, - "loss": 0.1205, + "epoch": 3.461629048086359, + "grad_norm": 0.4088614881038666, + "learning_rate": 3.99790835844924e-05, + "loss": 0.0567, "step": 13230 }, { - "epoch": 0.8662087013411842, - "grad_norm": 0.9158945083618164, - "learning_rate": 9.984845037926362e-05, - "loss": 0.1076, + "epoch": 3.464245992803402, + "grad_norm": 0.42933085560798645, + "learning_rate": 3.996253081403888e-05, + "loss": 0.0488, "step": 13240 }, { - "epoch": 0.8668629375204449, - "grad_norm": 0.893125057220459, - "learning_rate": 9.984773487409527e-05, - "loss": 0.1189, + "epoch": 3.4668629375204447, + "grad_norm": 0.5075600743293762, + "learning_rate": 3.994596781683664e-05, + "loss": 0.0534, "step": 13250 }, { - "epoch": 0.8675171736997056, - "grad_norm": 0.9089499711990356, - "learning_rate": 9.984701768643512e-05, - "loss": 0.1033, + "epoch": 3.4694798822374877, + "grad_norm": 0.6735544800758362, + "learning_rate": 3.992939460420633e-05, + "loss": 0.0558, "step": 13260 }, { - "epoch": 0.8681714098789663, - "grad_norm": 0.9404613375663757, - "learning_rate": 9.984629881630738e-05, - "loss": 0.1225, + "epoch": 3.4720968269545307, + "grad_norm": 0.6506981253623962, + "learning_rate": 3.991281118747558e-05, + "loss": 0.0522, "step": 13270 }, { - "epoch": 0.868825646058227, - "grad_norm": 0.8523469567298889, - "learning_rate": 9.984557826373635e-05, - "loss": 0.1146, + "epoch": 3.4747137716715732, + "grad_norm": 0.3499435484409332, + "learning_rate": 3.989621757797901e-05, + "loss": 0.0532, "step": 13280 }, { - "epoch": 0.8694798822374877, - "grad_norm": 0.9582688212394714, - "learning_rate": 9.984485602874632e-05, - "loss": 0.1278, + "epoch": 3.4773307163886162, + "grad_norm": 0.5630961060523987, + "learning_rate": 3.987961378705818e-05, + "loss": 0.0556, "step": 13290 }, { - "epoch": 0.8701341184167485, - "grad_norm": 0.8448380827903748, - "learning_rate": 9.984413211136167e-05, - "loss": 0.1224, + "epoch": 3.4799476611056592, + "grad_norm": 0.7153360247612, + "learning_rate": 3.986299982606164e-05, + "loss": 0.0507, "step": 13300 }, { - "epoch": 0.8707883545960091, - "grad_norm": 0.8751958608627319, - "learning_rate": 9.984340651160685e-05, - "loss": 0.1088, + "epoch": 3.482564605822702, + "grad_norm": 0.5627347230911255, + "learning_rate": 3.9846375706344864e-05, + "loss": 0.0551, "step": 13310 }, { - "epoch": 0.8714425907752699, - "grad_norm": 0.7682243585586548, - "learning_rate": 9.984267922950634e-05, - "loss": 0.1048, + "epoch": 3.4851815505397448, + "grad_norm": 0.5859262347221375, + "learning_rate": 3.98297414392703e-05, + "loss": 0.0557, "step": 13320 }, { - "epoch": 0.8720968269545306, - "grad_norm": 0.9508857727050781, - "learning_rate": 9.984195026508469e-05, - "loss": 0.1124, + "epoch": 3.4877984952567878, + "grad_norm": 0.6154468655586243, + "learning_rate": 3.981309703620728e-05, + "loss": 0.0603, "step": 13330 }, { - "epoch": 0.8727510631337912, - "grad_norm": 1.0506585836410522, - "learning_rate": 9.98412196183665e-05, - "loss": 0.1266, + "epoch": 3.4904154399738307, + "grad_norm": 0.4030719995498657, + "learning_rate": 3.979644250853212e-05, + "loss": 0.0455, "step": 13340 }, { - "epoch": 0.873405299313052, - "grad_norm": 1.0263482332229614, - "learning_rate": 9.984048728937643e-05, - "loss": 0.1313, + "epoch": 3.4930323846908733, + "grad_norm": 0.42505738139152527, + "learning_rate": 3.9779777867628023e-05, + "loss": 0.0554, "step": 13350 }, { - "epoch": 0.8740595354923127, - "grad_norm": 0.8746235370635986, - "learning_rate": 9.98397532781392e-05, - "loss": 0.1066, + "epoch": 3.4956493294079163, + "grad_norm": 0.6388312578201294, + "learning_rate": 3.976310312488513e-05, + "loss": 0.0597, "step": 13360 }, { - "epoch": 0.8747137716715735, - "grad_norm": 0.797562301158905, - "learning_rate": 9.98390175846796e-05, - "loss": 0.1052, + "epoch": 3.4982662741249593, + "grad_norm": 0.5594176650047302, + "learning_rate": 3.9746418291700446e-05, + "loss": 0.0511, "step": 13370 }, { - "epoch": 0.8753680078508341, - "grad_norm": 1.0147572755813599, - "learning_rate": 9.983828020902244e-05, - "loss": 0.1073, + "epoch": 3.500883218842002, + "grad_norm": 0.4737611413002014, + "learning_rate": 3.9729723379477926e-05, + "loss": 0.0524, "step": 13380 }, { - "epoch": 0.8760222440300949, - "grad_norm": 1.0348169803619385, - "learning_rate": 9.983754115119261e-05, - "loss": 0.1168, + "epoch": 3.503500163559045, + "grad_norm": 0.6079171895980835, + "learning_rate": 3.9713018399628356e-05, + "loss": 0.0522, "step": 13390 }, { - "epoch": 0.8766764802093556, - "grad_norm": 0.8402387499809265, - "learning_rate": 9.983680041121509e-05, - "loss": 0.1213, + "epoch": 3.506117108276088, + "grad_norm": 0.4987918436527252, + "learning_rate": 3.969630336356945e-05, + "loss": 0.0555, "step": 13400 }, { - "epoch": 0.8773307163886163, - "grad_norm": 0.8802344799041748, - "learning_rate": 9.983605798911484e-05, - "loss": 0.1121, + "epoch": 3.5087340529931303, + "grad_norm": 0.4886915683746338, + "learning_rate": 3.967957828272577e-05, + "loss": 0.0552, "step": 13410 }, { - "epoch": 0.877984952567877, - "grad_norm": 0.7464808821678162, - "learning_rate": 9.983531388491691e-05, - "loss": 0.1184, + "epoch": 3.5113509977101733, + "grad_norm": 0.48263683915138245, + "learning_rate": 3.966284316852876e-05, + "loss": 0.0615, "step": 13420 }, { - "epoch": 0.8786391887471378, - "grad_norm": 0.8094585537910461, - "learning_rate": 9.983456809864646e-05, - "loss": 0.1138, + "epoch": 3.5139679424272163, + "grad_norm": 0.6995543837547302, + "learning_rate": 3.9646098032416704e-05, + "loss": 0.0564, "step": 13430 }, { - "epoch": 0.8792934249263984, - "grad_norm": 0.7302922010421753, - "learning_rate": 9.983382063032864e-05, - "loss": 0.1124, + "epoch": 3.5165848871442593, + "grad_norm": 0.7016103267669678, + "learning_rate": 3.962934288583474e-05, + "loss": 0.0597, "step": 13440 }, { - "epoch": 0.8799476611056591, - "grad_norm": 0.8307011723518372, - "learning_rate": 9.983307147998868e-05, - "loss": 0.1155, + "epoch": 3.519201831861302, + "grad_norm": 0.4052731692790985, + "learning_rate": 3.961257774023487e-05, + "loss": 0.0504, "step": 13450 }, { - "epoch": 0.8806018972849199, - "grad_norm": 0.8752557635307312, - "learning_rate": 9.983232064765187e-05, - "loss": 0.1097, + "epoch": 3.521818776578345, + "grad_norm": 0.42247992753982544, + "learning_rate": 3.9595802607075896e-05, + "loss": 0.0539, "step": 13460 }, { - "epoch": 0.8812561334641805, - "grad_norm": 0.9539555907249451, - "learning_rate": 9.983156813334354e-05, - "loss": 0.1194, + "epoch": 3.5244357212953874, + "grad_norm": 0.626800537109375, + "learning_rate": 3.957901749782347e-05, + "loss": 0.051, "step": 13470 }, { - "epoch": 0.8819103696434413, - "grad_norm": 0.9456961750984192, - "learning_rate": 9.983081393708911e-05, - "loss": 0.1129, + "epoch": 3.5270526660124304, + "grad_norm": 0.6743230819702148, + "learning_rate": 3.9562222423950065e-05, + "loss": 0.0625, "step": 13480 }, { - "epoch": 0.882564605822702, - "grad_norm": 1.089903473854065, - "learning_rate": 9.983005805891401e-05, - "loss": 0.1114, + "epoch": 3.5296696107294734, + "grad_norm": 0.8379527926445007, + "learning_rate": 3.9545417396934936e-05, + "loss": 0.0522, "step": 13490 }, { - "epoch": 0.8832188420019627, - "grad_norm": 0.9316903948783875, - "learning_rate": 9.982930049884377e-05, - "loss": 0.1132, + "epoch": 3.5322865554465164, + "grad_norm": 0.5695173144340515, + "learning_rate": 3.952860242826418e-05, + "loss": 0.0549, "step": 13500 }, { - "epoch": 0.8838730781812234, - "grad_norm": 0.8528069853782654, - "learning_rate": 9.982854125690395e-05, - "loss": 0.1188, + "epoch": 3.534903500163559, + "grad_norm": 0.5186528563499451, + "learning_rate": 3.951177752943066e-05, + "loss": 0.0581, "step": 13510 }, { - "epoch": 0.8845273143604842, - "grad_norm": 0.8563151955604553, - "learning_rate": 9.982778033312019e-05, - "loss": 0.1122, + "epoch": 3.537520444880602, + "grad_norm": 0.5912035703659058, + "learning_rate": 3.9494942711934026e-05, + "loss": 0.0478, "step": 13520 }, { - "epoch": 0.8851815505397449, - "grad_norm": 0.7810546159744263, - "learning_rate": 9.982701772751816e-05, - "loss": 0.1076, + "epoch": 3.540137389597645, + "grad_norm": 0.33838945627212524, + "learning_rate": 3.9478097987280735e-05, + "loss": 0.0512, "step": 13530 }, { - "epoch": 0.8858357867190055, - "grad_norm": 0.7881696820259094, - "learning_rate": 9.982625344012361e-05, - "loss": 0.1293, + "epoch": 3.5427543343146874, + "grad_norm": 0.3433888554573059, + "learning_rate": 3.946124336698399e-05, + "loss": 0.0522, "step": 13540 }, { - "epoch": 0.8864900228982663, - "grad_norm": 0.7264038324356079, - "learning_rate": 9.982548747096235e-05, - "loss": 0.1156, + "epoch": 3.5453712790317304, + "grad_norm": 0.45829489827156067, + "learning_rate": 3.944437886256377e-05, + "loss": 0.0579, "step": 13550 }, { - "epoch": 0.887144259077527, - "grad_norm": 1.067252516746521, - "learning_rate": 9.982471982006019e-05, - "loss": 0.1142, + "epoch": 3.5479882237487734, + "grad_norm": 0.7708325982093811, + "learning_rate": 3.9427504485546796e-05, + "loss": 0.0546, "step": 13560 }, { - "epoch": 0.8877984952567877, - "grad_norm": 0.9019017815589905, - "learning_rate": 9.982395048744307e-05, - "loss": 0.1066, + "epoch": 3.5506051684658164, + "grad_norm": 0.5576505064964294, + "learning_rate": 3.9410620247466544e-05, + "loss": 0.0572, "step": 13570 }, { - "epoch": 0.8884527314360484, - "grad_norm": 1.020104169845581, - "learning_rate": 9.982317947313695e-05, - "loss": 0.1193, + "epoch": 3.553222113182859, + "grad_norm": 0.5772538185119629, + "learning_rate": 3.9393726159863245e-05, + "loss": 0.0572, "step": 13580 }, { - "epoch": 0.8891069676153092, - "grad_norm": 0.8307511806488037, - "learning_rate": 9.982240677716788e-05, - "loss": 0.1249, + "epoch": 3.555839057899902, + "grad_norm": 0.6166955828666687, + "learning_rate": 3.937682223428383e-05, + "loss": 0.0548, "step": 13590 }, { - "epoch": 0.8897612037945698, - "grad_norm": 0.9394966959953308, - "learning_rate": 9.98216323995619e-05, - "loss": 0.1158, + "epoch": 3.5584560026169445, + "grad_norm": 0.535210907459259, + "learning_rate": 3.935990848228199e-05, + "loss": 0.0594, "step": 13600 }, { - "epoch": 0.8904154399738305, - "grad_norm": 0.9355982542037964, - "learning_rate": 9.982085634034515e-05, - "loss": 0.1243, + "epoch": 3.5610729473339875, + "grad_norm": 0.6279690861701965, + "learning_rate": 3.9342984915418114e-05, + "loss": 0.0516, "step": 13610 }, { - "epoch": 0.8910696761530913, - "grad_norm": 0.9257426857948303, - "learning_rate": 9.982007859954386e-05, - "loss": 0.1079, + "epoch": 3.5636898920510305, + "grad_norm": 0.5341526865959167, + "learning_rate": 3.932605154525929e-05, + "loss": 0.0489, "step": 13620 }, { - "epoch": 0.8917239123323519, - "grad_norm": 1.074717402458191, - "learning_rate": 9.981929917718426e-05, - "loss": 0.1144, + "epoch": 3.5663068367680735, + "grad_norm": 0.5120111703872681, + "learning_rate": 3.930910838337932e-05, + "loss": 0.0529, "step": 13630 }, { - "epoch": 0.8923781485116127, - "grad_norm": 0.8805826902389526, - "learning_rate": 9.981851807329264e-05, - "loss": 0.117, + "epoch": 3.568923781485116, + "grad_norm": 0.4864059090614319, + "learning_rate": 3.9292155441358694e-05, + "loss": 0.0521, "step": 13640 }, { - "epoch": 0.8930323846908734, - "grad_norm": 1.0279000997543335, - "learning_rate": 9.98177352878954e-05, - "loss": 0.1132, + "epoch": 3.571540726202159, + "grad_norm": 0.5746162533760071, + "learning_rate": 3.927519273078459e-05, + "loss": 0.0583, "step": 13650 }, { - "epoch": 0.8936866208701341, - "grad_norm": 1.029099941253662, - "learning_rate": 9.981695082101893e-05, - "loss": 0.1125, + "epoch": 3.5741576709192016, + "grad_norm": 0.7206608653068542, + "learning_rate": 3.9258220263250865e-05, + "loss": 0.0519, "step": 13660 }, { - "epoch": 0.8943408570493948, - "grad_norm": 1.044093132019043, - "learning_rate": 9.981616467268973e-05, - "loss": 0.1163, + "epoch": 3.5767746156362445, + "grad_norm": 0.6371074318885803, + "learning_rate": 3.9241238050358044e-05, + "loss": 0.0539, "step": 13670 }, { - "epoch": 0.8949950932286556, - "grad_norm": 0.9173268675804138, - "learning_rate": 9.98153768429343e-05, - "loss": 0.1133, + "epoch": 3.5793915603532875, + "grad_norm": 0.5628343820571899, + "learning_rate": 3.922424610371329e-05, + "loss": 0.0626, "step": 13680 }, { - "epoch": 0.8956493294079163, - "grad_norm": 0.917991578578949, - "learning_rate": 9.981458733177928e-05, - "loss": 0.1043, + "epoch": 3.5820085050703305, + "grad_norm": 0.4270327389240265, + "learning_rate": 3.920724443493046e-05, + "loss": 0.0588, "step": 13690 }, { - "epoch": 0.8963035655871769, - "grad_norm": 0.9324220418930054, - "learning_rate": 9.981379613925129e-05, - "loss": 0.1145, + "epoch": 3.5846254497873735, + "grad_norm": 0.4563710689544678, + "learning_rate": 3.919023305563002e-05, + "loss": 0.0561, "step": 13700 }, { - "epoch": 0.8969578017664377, - "grad_norm": 0.9457996487617493, - "learning_rate": 9.981300326537704e-05, - "loss": 0.1145, + "epoch": 3.587242394504416, + "grad_norm": 0.9741207361221313, + "learning_rate": 3.9173211977439094e-05, + "loss": 0.0555, "step": 13710 }, { - "epoch": 0.8976120379456984, - "grad_norm": 0.9472202062606812, - "learning_rate": 9.981220871018329e-05, - "loss": 0.1176, + "epoch": 3.589859339221459, + "grad_norm": 0.7190433144569397, + "learning_rate": 3.9156181211991426e-05, + "loss": 0.0527, "step": 13720 }, { - "epoch": 0.8982662741249591, - "grad_norm": 0.8660651445388794, - "learning_rate": 9.981141247369685e-05, - "loss": 0.1161, + "epoch": 3.5924762839385016, + "grad_norm": 0.697902262210846, + "learning_rate": 3.9139140770927385e-05, + "loss": 0.0495, "step": 13730 }, { - "epoch": 0.8989205103042198, - "grad_norm": 0.9130904674530029, - "learning_rate": 9.981061455594461e-05, - "loss": 0.1095, + "epoch": 3.5950932286555446, + "grad_norm": 0.5454297661781311, + "learning_rate": 3.912209066589395e-05, + "loss": 0.0533, "step": 13740 }, { - "epoch": 0.8995747464834806, - "grad_norm": 1.0127148628234863, - "learning_rate": 9.980981495695349e-05, - "loss": 0.1115, + "epoch": 3.5977101733725876, + "grad_norm": 0.5934323668479919, + "learning_rate": 3.910503090854472e-05, + "loss": 0.0597, "step": 13750 }, { - "epoch": 0.9002289826627412, - "grad_norm": 0.8180867433547974, - "learning_rate": 9.980901367675048e-05, - "loss": 0.1051, + "epoch": 3.6003271180896306, + "grad_norm": 0.44584307074546814, + "learning_rate": 3.908796151053985e-05, + "loss": 0.0568, "step": 13760 }, { - "epoch": 0.900883218842002, - "grad_norm": 0.8807599544525146, - "learning_rate": 9.980821071536264e-05, - "loss": 0.1317, + "epoch": 3.602944062806673, + "grad_norm": 0.5419604778289795, + "learning_rate": 3.9070882483546135e-05, + "loss": 0.051, "step": 13770 }, { - "epoch": 0.9015374550212627, - "grad_norm": 0.8612924218177795, - "learning_rate": 9.980740607281707e-05, - "loss": 0.1147, + "epoch": 3.605561007523716, + "grad_norm": 0.3984220623970032, + "learning_rate": 3.905379383923693e-05, + "loss": 0.0512, "step": 13780 }, { - "epoch": 0.9021916912005233, - "grad_norm": 0.9025987386703491, - "learning_rate": 9.980659974914091e-05, - "loss": 0.1103, + "epoch": 3.6081779522407587, + "grad_norm": 0.4912005364894867, + "learning_rate": 3.9036695589292136e-05, + "loss": 0.0518, "step": 13790 }, { - "epoch": 0.9028459273797841, - "grad_norm": 0.8930261731147766, - "learning_rate": 9.980579174436138e-05, - "loss": 0.1101, + "epoch": 3.6107948969578016, + "grad_norm": 0.39835718274116516, + "learning_rate": 3.9019587745398276e-05, + "loss": 0.0499, "step": 13800 }, { - "epoch": 0.9035001635590448, - "grad_norm": 0.8868409395217896, - "learning_rate": 9.980498205850577e-05, - "loss": 0.1085, + "epoch": 3.6134118416748446, + "grad_norm": 0.6596834659576416, + "learning_rate": 3.9002470319248394e-05, + "loss": 0.0543, "step": 13810 }, { - "epoch": 0.9041543997383056, - "grad_norm": 0.7958750128746033, - "learning_rate": 9.980417069160139e-05, - "loss": 0.1128, + "epoch": 3.6160287863918876, + "grad_norm": 0.7677016854286194, + "learning_rate": 3.898534332254208e-05, + "loss": 0.0503, "step": 13820 }, { - "epoch": 0.9048086359175662, - "grad_norm": 1.0320643186569214, - "learning_rate": 9.980335764367563e-05, - "loss": 0.1179, + "epoch": 3.61864573110893, + "grad_norm": 0.4970107674598694, + "learning_rate": 3.896820676698548e-05, + "loss": 0.0598, "step": 13830 }, { - "epoch": 0.905462872096827, - "grad_norm": 0.9353842735290527, - "learning_rate": 9.980254291475595e-05, - "loss": 0.1012, + "epoch": 3.621262675825973, + "grad_norm": 0.709835946559906, + "learning_rate": 3.8951060664291265e-05, + "loss": 0.0587, "step": 13840 }, { - "epoch": 0.9061171082760877, - "grad_norm": 0.8081122636795044, - "learning_rate": 9.980172650486983e-05, - "loss": 0.1129, + "epoch": 3.623879620543016, + "grad_norm": 0.610241711139679, + "learning_rate": 3.893390502617864e-05, + "loss": 0.0533, "step": 13850 }, { - "epoch": 0.9067713444553483, - "grad_norm": 0.7655879259109497, - "learning_rate": 9.980090841404482e-05, - "loss": 0.1043, + "epoch": 3.6264965652600587, + "grad_norm": 0.7120442986488342, + "learning_rate": 3.891673986437331e-05, + "loss": 0.0496, "step": 13860 }, { - "epoch": 0.9074255806346091, - "grad_norm": 0.886113703250885, - "learning_rate": 9.980008864230854e-05, - "loss": 0.1072, + "epoch": 3.6291135099771017, + "grad_norm": 0.6501908302307129, + "learning_rate": 3.889956519060752e-05, + "loss": 0.0548, "step": 13870 }, { - "epoch": 0.9080798168138698, - "grad_norm": 0.9072959423065186, - "learning_rate": 9.979926718968868e-05, - "loss": 0.1132, + "epoch": 3.6317304546941447, + "grad_norm": 0.4381459653377533, + "learning_rate": 3.8882381016619986e-05, + "loss": 0.0565, "step": 13880 }, { - "epoch": 0.9087340529931305, - "grad_norm": 1.0498781204223633, - "learning_rate": 9.979844405621295e-05, - "loss": 0.1179, + "epoch": 3.6343473994111877, + "grad_norm": 0.49421507120132446, + "learning_rate": 3.886518735415593e-05, + "loss": 0.0547, "step": 13890 }, { - "epoch": 0.9093882891723912, - "grad_norm": 0.9316992163658142, - "learning_rate": 9.979761924190911e-05, - "loss": 0.1127, + "epoch": 3.63696434412823, + "grad_norm": 0.5255186557769775, + "learning_rate": 3.884798421496705e-05, + "loss": 0.0537, "step": 13900 }, { - "epoch": 0.910042525351652, - "grad_norm": 0.9296565055847168, - "learning_rate": 9.979679274680504e-05, - "loss": 0.11, + "epoch": 3.639581288845273, + "grad_norm": 0.8173955082893372, + "learning_rate": 3.883077161081155e-05, + "loss": 0.0552, "step": 13910 }, { - "epoch": 0.9106967615309126, - "grad_norm": 1.0889675617218018, - "learning_rate": 9.979596457092861e-05, - "loss": 0.1027, + "epoch": 3.6421982335623158, + "grad_norm": 0.5856022834777832, + "learning_rate": 3.8813549553454056e-05, + "loss": 0.0532, "step": 13920 }, { - "epoch": 0.9113509977101734, - "grad_norm": 0.742987871170044, - "learning_rate": 9.979513471430779e-05, - "loss": 0.1047, + "epoch": 3.6448151782793587, + "grad_norm": 0.4360755980014801, + "learning_rate": 3.8796318054665706e-05, + "loss": 0.0504, "step": 13930 }, { - "epoch": 0.9120052338894341, - "grad_norm": 0.8663235902786255, - "learning_rate": 9.979430317697056e-05, - "loss": 0.1226, + "epoch": 3.6474321229964017, + "grad_norm": 0.35014355182647705, + "learning_rate": 3.877907712622406e-05, + "loss": 0.0464, "step": 13940 }, { - "epoch": 0.9126594700686947, - "grad_norm": 1.0001239776611328, - "learning_rate": 9.979346995894504e-05, - "loss": 0.1174, + "epoch": 3.6500490677134447, + "grad_norm": 0.5001941919326782, + "learning_rate": 3.876182677991312e-05, + "loss": 0.0575, "step": 13950 }, { - "epoch": 0.9133137062479555, - "grad_norm": 1.0531165599822998, - "learning_rate": 9.979263506025929e-05, - "loss": 0.1148, + "epoch": 3.6526660124304873, + "grad_norm": 0.29256579279899597, + "learning_rate": 3.874456702752334e-05, + "loss": 0.0547, "step": 13960 }, { - "epoch": 0.9139679424272162, - "grad_norm": 0.8810727000236511, - "learning_rate": 9.979179848094153e-05, - "loss": 0.1153, + "epoch": 3.6552829571475303, + "grad_norm": 0.4693641662597656, + "learning_rate": 3.872729788085161e-05, + "loss": 0.0558, "step": 13970 }, { - "epoch": 0.914622178606477, - "grad_norm": 0.8835552930831909, - "learning_rate": 9.979096022102e-05, - "loss": 0.1089, + "epoch": 3.6578999018645733, + "grad_norm": 0.40088728070259094, + "learning_rate": 3.871001935170121e-05, + "loss": 0.0521, "step": 13980 }, { - "epoch": 0.9152764147857376, - "grad_norm": 0.9073536396026611, - "learning_rate": 9.979012028052297e-05, - "loss": 0.1254, + "epoch": 3.660516846581616, + "grad_norm": 0.4256943464279175, + "learning_rate": 3.869273145188187e-05, + "loss": 0.0523, "step": 13990 }, { - "epoch": 0.9159306509649984, - "grad_norm": 0.8263078331947327, - "learning_rate": 9.97892786594788e-05, - "loss": 0.1104, + "epoch": 3.663133791298659, + "grad_norm": 0.445076584815979, + "learning_rate": 3.8675434193209684e-05, + "loss": 0.0507, "step": 14000 }, { - "epoch": 0.9165848871442591, - "grad_norm": 0.9409777522087097, - "learning_rate": 9.978843535791588e-05, - "loss": 0.1119, + "epoch": 3.663133791298659, + "eval_loss": 0.05693136046282009, + "eval_runtime": 9.1176, + "eval_samples_per_second": 112.31, + "eval_steps_per_second": 1.755, + "step": 14000 + }, + { + "epoch": 3.665750736015702, + "grad_norm": 0.37255793809890747, + "learning_rate": 3.8658127587507184e-05, + "loss": 0.0531, "step": 14010 }, { - "epoch": 0.9172391233235198, - "grad_norm": 0.8402562737464905, - "learning_rate": 9.978759037586272e-05, - "loss": 0.1134, + "epoch": 3.6683676807327448, + "grad_norm": 0.5347610116004944, + "learning_rate": 3.8640811646603276e-05, + "loss": 0.0559, "step": 14020 }, { - "epoch": 0.9178933595027805, - "grad_norm": 0.8777570724487305, - "learning_rate": 9.978674371334782e-05, - "loss": 0.1077, + "epoch": 3.6709846254497873, + "grad_norm": 0.2988486886024475, + "learning_rate": 3.8623486382333226e-05, + "loss": 0.0518, "step": 14030 }, { - "epoch": 0.9185475956820413, - "grad_norm": 1.0120798349380493, - "learning_rate": 9.978589537039972e-05, - "loss": 0.1179, + "epoch": 3.6736015701668303, + "grad_norm": 0.5495784282684326, + "learning_rate": 3.860615180653869e-05, + "loss": 0.0499, "step": 14040 }, { - "epoch": 0.9192018318613019, - "grad_norm": 0.7693895101547241, - "learning_rate": 9.97850453470471e-05, - "loss": 0.1248, + "epoch": 3.676218514883873, + "grad_norm": 0.5568211674690247, + "learning_rate": 3.85888079310677e-05, + "loss": 0.0556, "step": 14050 }, { - "epoch": 0.9198560680405626, - "grad_norm": 0.8541615009307861, - "learning_rate": 9.978419364331863e-05, - "loss": 0.1226, + "epoch": 3.678835459600916, + "grad_norm": 0.672922670841217, + "learning_rate": 3.857145476777463e-05, + "loss": 0.0541, "step": 14060 }, { - "epoch": 0.9205103042198234, - "grad_norm": 1.0362035036087036, - "learning_rate": 9.978334025924307e-05, - "loss": 0.1274, + "epoch": 3.681452404317959, + "grad_norm": 0.6420120596885681, + "learning_rate": 3.8554092328520186e-05, + "loss": 0.0474, "step": 14070 }, { - "epoch": 0.921164540399084, - "grad_norm": 0.9056493639945984, - "learning_rate": 9.97824851948492e-05, - "loss": 0.1163, + "epoch": 3.684069349035002, + "grad_norm": 0.508482813835144, + "learning_rate": 3.853672062517144e-05, + "loss": 0.0455, "step": 14080 }, { - "epoch": 0.9218187765783448, - "grad_norm": 0.9539541602134705, - "learning_rate": 9.97816284501659e-05, - "loss": 0.1117, + "epoch": 3.6866862937520444, + "grad_norm": 0.36084631085395813, + "learning_rate": 3.8519339669601794e-05, + "loss": 0.0492, "step": 14090 }, { - "epoch": 0.9224730127576055, - "grad_norm": 0.8763441443443298, - "learning_rate": 9.978077002522208e-05, - "loss": 0.1187, + "epoch": 3.6893032384690874, + "grad_norm": 0.45002564787864685, + "learning_rate": 3.850194947369097e-05, + "loss": 0.0512, "step": 14100 }, { - "epoch": 0.9231272489368663, - "grad_norm": 1.0391874313354492, - "learning_rate": 9.977990992004672e-05, - "loss": 0.1131, + "epoch": 3.6919201831861304, + "grad_norm": 0.36685696244239807, + "learning_rate": 3.8484550049324996e-05, + "loss": 0.0531, "step": 14110 }, { - "epoch": 0.9237814851161269, - "grad_norm": 0.8817654252052307, - "learning_rate": 9.977904813466885e-05, - "loss": 0.1106, + "epoch": 3.694537127903173, + "grad_norm": 0.48534664511680603, + "learning_rate": 3.8467141408396206e-05, + "loss": 0.0583, "step": 14120 }, { - "epoch": 0.9244357212953876, - "grad_norm": 0.7447488903999329, - "learning_rate": 9.977818466911754e-05, - "loss": 0.1042, + "epoch": 3.697154072620216, + "grad_norm": 0.8050063848495483, + "learning_rate": 3.844972356280326e-05, + "loss": 0.0559, "step": 14130 }, { - "epoch": 0.9250899574746484, - "grad_norm": 0.9143378734588623, - "learning_rate": 9.977731952342198e-05, - "loss": 0.1299, + "epoch": 3.699771017337259, + "grad_norm": 0.6816076040267944, + "learning_rate": 3.843229652445107e-05, + "loss": 0.0509, "step": 14140 }, { - "epoch": 0.925744193653909, - "grad_norm": 0.9226405024528503, - "learning_rate": 9.97764526976113e-05, - "loss": 0.1058, + "epoch": 3.702387962054302, + "grad_norm": 0.5003925561904907, + "learning_rate": 3.8414860305250875e-05, + "loss": 0.0472, "step": 14150 }, { - "epoch": 0.9263984298331698, - "grad_norm": 0.7661489844322205, - "learning_rate": 9.977558419171485e-05, - "loss": 0.1109, + "epoch": 3.7050049067713444, + "grad_norm": 0.5868620872497559, + "learning_rate": 3.839741491712016e-05, + "loss": 0.0518, "step": 14160 }, { - "epoch": 0.9270526660124305, - "grad_norm": 0.8209551572799683, - "learning_rate": 9.977471400576185e-05, - "loss": 0.1158, + "epoch": 3.7076218514883874, + "grad_norm": 0.6365765929222107, + "learning_rate": 3.837996037198267e-05, + "loss": 0.0515, "step": 14170 }, { - "epoch": 0.9277069021916912, - "grad_norm": 0.9053903818130493, - "learning_rate": 9.977384213978173e-05, - "loss": 0.1041, + "epoch": 3.71023879620543, + "grad_norm": 0.40195780992507935, + "learning_rate": 3.836249668176844e-05, + "loss": 0.0498, "step": 14180 }, { - "epoch": 0.9283611383709519, - "grad_norm": 0.9400367140769958, - "learning_rate": 9.97729685938039e-05, - "loss": 0.1077, + "epoch": 3.712855740922473, + "grad_norm": 0.4129416346549988, + "learning_rate": 3.834502385841372e-05, + "loss": 0.0529, "step": 14190 }, { - "epoch": 0.9290153745502127, - "grad_norm": 0.9870936274528503, - "learning_rate": 9.977209336785783e-05, - "loss": 0.1053, + "epoch": 3.715472685639516, + "grad_norm": 0.41690176725387573, + "learning_rate": 3.832754191386103e-05, + "loss": 0.0527, "step": 14200 }, { - "epoch": 0.9296696107294733, - "grad_norm": 0.9150540828704834, - "learning_rate": 9.977121646197309e-05, - "loss": 0.1115, + "epoch": 3.718089630356559, + "grad_norm": 0.6315004825592041, + "learning_rate": 3.831005086005912e-05, + "loss": 0.0545, "step": 14210 }, { - "epoch": 0.930323846908734, - "grad_norm": 1.1972026824951172, - "learning_rate": 9.977033787617927e-05, - "loss": 0.1049, + "epoch": 3.7207065750736015, + "grad_norm": 0.49347928166389465, + "learning_rate": 3.829255070896294e-05, + "loss": 0.0504, "step": 14220 }, { - "epoch": 0.9309780830879948, - "grad_norm": 0.9222517013549805, - "learning_rate": 9.9769457610506e-05, - "loss": 0.1094, + "epoch": 3.7233235197906445, + "grad_norm": 0.625518798828125, + "learning_rate": 3.82750414725337e-05, + "loss": 0.0556, "step": 14230 }, { - "epoch": 0.9316323192672554, - "grad_norm": 0.763886034488678, - "learning_rate": 9.976857566498303e-05, - "loss": 0.1083, + "epoch": 3.725940464507687, + "grad_norm": 0.6610779166221619, + "learning_rate": 3.8257523162738794e-05, + "loss": 0.0531, "step": 14240 }, { - "epoch": 0.9322865554465162, - "grad_norm": 0.8632875084877014, - "learning_rate": 9.976769203964011e-05, - "loss": 0.1082, + "epoch": 3.72855740922473, + "grad_norm": 0.5094367265701294, + "learning_rate": 3.823999579155182e-05, + "loss": 0.0568, "step": 14250 }, { - "epoch": 0.9329407916257769, - "grad_norm": 1.0797144174575806, - "learning_rate": 9.976680673450704e-05, - "loss": 0.1119, + "epoch": 3.731174353941773, + "grad_norm": 0.7669066786766052, + "learning_rate": 3.822245937095256e-05, + "loss": 0.0522, "step": 14260 }, { - "epoch": 0.9335950278050377, - "grad_norm": 0.7785837054252625, - "learning_rate": 9.976591974961376e-05, - "loss": 0.1061, + "epoch": 3.733791298658816, + "grad_norm": 0.7019723057746887, + "learning_rate": 3.8204913912927e-05, + "loss": 0.0549, "step": 14270 }, { - "epoch": 0.9342492639842983, - "grad_norm": 0.9709704518318176, - "learning_rate": 9.976503108499014e-05, - "loss": 0.1245, + "epoch": 3.7364082433758585, + "grad_norm": 0.5560470223426819, + "learning_rate": 3.8187359429467294e-05, + "loss": 0.0525, "step": 14280 }, { - "epoch": 0.934903500163559, - "grad_norm": 0.8645631670951843, - "learning_rate": 9.976414074066622e-05, - "loss": 0.1041, + "epoch": 3.7390251880929015, + "grad_norm": 0.6931577324867249, + "learning_rate": 3.816979593257177e-05, + "loss": 0.0481, "step": 14290 }, { - "epoch": 0.9355577363428198, - "grad_norm": 0.8737794756889343, - "learning_rate": 9.976324871667204e-05, - "loss": 0.1068, + "epoch": 3.7416421328099445, + "grad_norm": 0.6408355236053467, + "learning_rate": 3.815222343424492e-05, + "loss": 0.0502, "step": 14300 }, { - "epoch": 0.9362119725220804, - "grad_norm": 0.9471864104270935, - "learning_rate": 9.97623550130377e-05, - "loss": 0.1132, + "epoch": 3.744259077526987, + "grad_norm": 0.6335650086402893, + "learning_rate": 3.8134641946497354e-05, + "loss": 0.0547, "step": 14310 }, { - "epoch": 0.9368662087013412, - "grad_norm": 0.9491965770721436, - "learning_rate": 9.976145962979337e-05, - "loss": 0.1086, + "epoch": 3.74687602224403, + "grad_norm": 0.6570053100585938, + "learning_rate": 3.811705148134587e-05, + "loss": 0.054, "step": 14320 }, { - "epoch": 0.9375204448806019, - "grad_norm": 1.008070468902588, - "learning_rate": 9.976056256696928e-05, - "loss": 0.1133, + "epoch": 3.749492966961073, + "grad_norm": 0.4380761981010437, + "learning_rate": 3.80994520508134e-05, + "loss": 0.0495, "step": 14330 }, { - "epoch": 0.9381746810598626, - "grad_norm": 0.8952319025993347, - "learning_rate": 9.975966382459572e-05, - "loss": 0.1053, + "epoch": 3.752109911678116, + "grad_norm": 0.4016902446746826, + "learning_rate": 3.8081843666928965e-05, + "loss": 0.0581, "step": 14340 }, { - "epoch": 0.9388289172391233, - "grad_norm": 0.8078186511993408, - "learning_rate": 9.975876340270298e-05, - "loss": 0.1079, + "epoch": 3.7547268563951586, + "grad_norm": 0.5980265736579895, + "learning_rate": 3.8064226341727736e-05, + "loss": 0.0507, "step": 14350 }, { - "epoch": 0.9394831534183841, - "grad_norm": 1.273657202720642, - "learning_rate": 9.975786130132148e-05, - "loss": 0.1111, + "epoch": 3.7573438011122016, + "grad_norm": 0.44559043645858765, + "learning_rate": 3.8046600087250996e-05, + "loss": 0.0506, "step": 14360 }, { - "epoch": 0.9401373895976447, - "grad_norm": 0.9750173091888428, - "learning_rate": 9.975695752048168e-05, - "loss": 0.1207, + "epoch": 3.759960745829244, + "grad_norm": 0.45184126496315, + "learning_rate": 3.802896491554611e-05, + "loss": 0.0508, "step": 14370 }, { - "epoch": 0.9407916257769054, - "grad_norm": 0.7701619267463684, - "learning_rate": 9.975605206021406e-05, - "loss": 0.1098, + "epoch": 3.762577690546287, + "grad_norm": 0.7190099954605103, + "learning_rate": 3.801132083866657e-05, + "loss": 0.0505, "step": 14380 }, { - "epoch": 0.9414458619561662, - "grad_norm": 0.8776541352272034, - "learning_rate": 9.97551449205492e-05, - "loss": 0.1126, + "epoch": 3.76519463526333, + "grad_norm": 0.5304298996925354, + "learning_rate": 3.799366786867192e-05, + "loss": 0.0589, "step": 14390 }, { - "epoch": 0.9421000981354269, - "grad_norm": 0.8586710691452026, - "learning_rate": 9.975423610151771e-05, - "loss": 0.1052, + "epoch": 3.767811579980373, + "grad_norm": 0.4039107859134674, + "learning_rate": 3.7976006017627806e-05, + "loss": 0.0546, "step": 14400 }, { - "epoch": 0.9427543343146876, - "grad_norm": 0.9918507933616638, - "learning_rate": 9.975332560315026e-05, - "loss": 0.1105, + "epoch": 3.7704285246974156, + "grad_norm": 0.972225546836853, + "learning_rate": 3.7958335297605935e-05, + "loss": 0.0505, "step": 14410 }, { - "epoch": 0.9434085704939483, - "grad_norm": 0.8050119280815125, - "learning_rate": 9.97524134254776e-05, - "loss": 0.1009, + "epoch": 3.7730454694144586, + "grad_norm": 0.6542357206344604, + "learning_rate": 3.7940655720684076e-05, + "loss": 0.0467, "step": 14420 }, { - "epoch": 0.9440628066732091, - "grad_norm": 0.8901785612106323, - "learning_rate": 9.975149956853049e-05, - "loss": 0.1104, + "epoch": 3.7756624141315016, + "grad_norm": 0.7045339941978455, + "learning_rate": 3.792296729894606e-05, + "loss": 0.051, "step": 14430 }, { - "epoch": 0.9447170428524697, - "grad_norm": 1.129379391670227, - "learning_rate": 9.975058403233981e-05, - "loss": 0.122, + "epoch": 3.778279358848544, + "grad_norm": 0.348215252161026, + "learning_rate": 3.790527004448175e-05, + "loss": 0.0567, "step": 14440 }, { - "epoch": 0.9453712790317305, - "grad_norm": 0.946667492389679, - "learning_rate": 9.974966681693642e-05, - "loss": 0.1093, + "epoch": 3.780896303565587, + "grad_norm": 0.7830608487129211, + "learning_rate": 3.788756396938705e-05, + "loss": 0.0511, "step": 14450 }, { - "epoch": 0.9460255152109912, - "grad_norm": 1.037529468536377, - "learning_rate": 9.974874792235131e-05, - "loss": 0.12, + "epoch": 3.78351324828263, + "grad_norm": 0.4746474623680115, + "learning_rate": 3.786984908576391e-05, + "loss": 0.0481, "step": 14460 }, { - "epoch": 0.9466797513902518, - "grad_norm": 0.9789792895317078, - "learning_rate": 9.97478273486155e-05, - "loss": 0.1238, + "epoch": 3.786130192999673, + "grad_norm": 0.5564001798629761, + "learning_rate": 3.785212540572026e-05, + "loss": 0.0547, "step": 14470 }, { - "epoch": 0.9473339875695126, - "grad_norm": 0.9181520938873291, - "learning_rate": 9.974690509576005e-05, - "loss": 0.1214, + "epoch": 3.7887471377167157, + "grad_norm": 0.4673481583595276, + "learning_rate": 3.78343929413701e-05, + "loss": 0.0441, "step": 14480 }, { - "epoch": 0.9479882237487733, - "grad_norm": 0.8259325623512268, - "learning_rate": 9.974598116381608e-05, - "loss": 0.114, + "epoch": 3.7913640824337587, + "grad_norm": 0.43412885069847107, + "learning_rate": 3.7816651704833374e-05, + "loss": 0.0521, "step": 14490 }, { - "epoch": 0.948642459928034, - "grad_norm": 0.8979782462120056, - "learning_rate": 9.974505555281476e-05, - "loss": 0.117, + "epoch": 3.793981027150801, + "grad_norm": 0.4434505105018616, + "learning_rate": 3.779890170823606e-05, + "loss": 0.0576, "step": 14500 }, { - "epoch": 0.9492966961072947, - "grad_norm": 0.9124277234077454, - "learning_rate": 9.974412826278738e-05, - "loss": 0.1052, + "epoch": 3.796597971867844, + "grad_norm": 0.6474137306213379, + "learning_rate": 3.778114296371013e-05, + "loss": 0.0571, "step": 14510 }, { - "epoch": 0.9499509322865555, - "grad_norm": 1.0142040252685547, - "learning_rate": 9.974319929376522e-05, - "loss": 0.1166, + "epoch": 3.799214916584887, + "grad_norm": 0.66424560546875, + "learning_rate": 3.776337548339348e-05, + "loss": 0.0469, "step": 14520 }, { - "epoch": 0.9506051684658161, - "grad_norm": 0.973535418510437, - "learning_rate": 9.974226864577961e-05, - "loss": 0.1059, + "epoch": 3.80183186130193, + "grad_norm": 0.4851696193218231, + "learning_rate": 3.774559927943006e-05, + "loss": 0.0532, "step": 14530 }, { - "epoch": 0.9512594046450769, - "grad_norm": 0.8721091151237488, - "learning_rate": 9.974133631886198e-05, - "loss": 0.1119, + "epoch": 3.8044488060189727, + "grad_norm": 0.3956303894519806, + "learning_rate": 3.7727814363969705e-05, + "loss": 0.0463, "step": 14540 }, { - "epoch": 0.9519136408243376, - "grad_norm": 1.0169627666473389, - "learning_rate": 9.97404023130438e-05, - "loss": 0.1231, + "epoch": 3.8070657507360157, + "grad_norm": 0.5008952021598816, + "learning_rate": 3.771002074916824e-05, + "loss": 0.06, "step": 14550 }, { - "epoch": 0.9525678770035984, - "grad_norm": 0.861275315284729, - "learning_rate": 9.973946662835658e-05, - "loss": 0.1046, + "epoch": 3.8096826954530587, + "grad_norm": 0.4984845221042633, + "learning_rate": 3.769221844718746e-05, + "loss": 0.0543, "step": 14560 }, { - "epoch": 0.953222113182859, - "grad_norm": 1.012182593345642, - "learning_rate": 9.973852926483194e-05, - "loss": 0.112, + "epoch": 3.8122996401701013, + "grad_norm": 0.5751848220825195, + "learning_rate": 3.767440747019505e-05, + "loss": 0.0501, "step": 14570 }, { - "epoch": 0.9538763493621197, - "grad_norm": 0.804041862487793, - "learning_rate": 9.973759022250147e-05, - "loss": 0.1076, + "epoch": 3.8149165848871442, + "grad_norm": 0.42314207553863525, + "learning_rate": 3.7656587830364646e-05, + "loss": 0.0484, "step": 14580 }, { - "epoch": 0.9545305855413805, - "grad_norm": 0.799505889415741, - "learning_rate": 9.97366495013969e-05, - "loss": 0.1137, + "epoch": 3.8175335296041872, + "grad_norm": 0.5318662524223328, + "learning_rate": 3.763875953987579e-05, + "loss": 0.0504, "step": 14590 }, { - "epoch": 0.9551848217206411, - "grad_norm": 0.8227983713150024, - "learning_rate": 9.973570710154998e-05, - "loss": 0.0982, + "epoch": 3.8201504743212302, + "grad_norm": 0.43563997745513916, + "learning_rate": 3.7620922610913966e-05, + "loss": 0.0517, "step": 14600 }, { - "epoch": 0.9558390578999019, - "grad_norm": 0.9317682385444641, - "learning_rate": 9.973476302299249e-05, - "loss": 0.1103, + "epoch": 3.8227674190382728, + "grad_norm": 0.5153934359550476, + "learning_rate": 3.760307705567056e-05, + "loss": 0.0511, "step": 14610 }, { - "epoch": 0.9564932940791626, - "grad_norm": 0.8401086926460266, - "learning_rate": 9.973381726575632e-05, - "loss": 0.1188, + "epoch": 3.8253843637553158, + "grad_norm": 0.49038711190223694, + "learning_rate": 3.758522288634282e-05, + "loss": 0.0518, "step": 14620 }, { - "epoch": 0.9571475302584233, - "grad_norm": 0.9187564849853516, - "learning_rate": 9.97328698298734e-05, - "loss": 0.1242, + "epoch": 3.8280013084723583, + "grad_norm": 0.6636834740638733, + "learning_rate": 3.756736011513391e-05, + "loss": 0.0497, "step": 14630 }, { - "epoch": 0.957801766437684, - "grad_norm": 1.0764808654785156, - "learning_rate": 9.973192071537567e-05, - "loss": 0.112, + "epoch": 3.8306182531894013, + "grad_norm": 0.8136001825332642, + "learning_rate": 3.754948875425286e-05, + "loss": 0.0536, "step": 14640 }, { - "epoch": 0.9584560026169447, - "grad_norm": 0.947796642780304, - "learning_rate": 9.97309699222952e-05, - "loss": 0.1048, + "epoch": 3.8332351979064443, + "grad_norm": 0.4622998535633087, + "learning_rate": 3.753160881591459e-05, + "loss": 0.0484, "step": 14650 }, { - "epoch": 0.9591102387962054, - "grad_norm": 0.8203085660934448, - "learning_rate": 9.973001745066408e-05, - "loss": 0.1156, + "epoch": 3.8358521426234873, + "grad_norm": 0.4776436388492584, + "learning_rate": 3.751372031233985e-05, + "loss": 0.0497, "step": 14660 }, { - "epoch": 0.9597644749754661, - "grad_norm": 0.7854580879211426, - "learning_rate": 9.972906330051444e-05, - "loss": 0.1097, + "epoch": 3.83846908734053, + "grad_norm": 0.3973548114299774, + "learning_rate": 3.749582325575528e-05, + "loss": 0.0537, "step": 14670 }, { - "epoch": 0.9604187111547269, - "grad_norm": 0.904597818851471, - "learning_rate": 9.97281074718785e-05, - "loss": 0.1062, + "epoch": 3.841086032057573, + "grad_norm": 0.4959774613380432, + "learning_rate": 3.7477917658393345e-05, + "loss": 0.0514, "step": 14680 }, { - "epoch": 0.9610729473339875, - "grad_norm": 0.8809137940406799, - "learning_rate": 9.972714996478851e-05, - "loss": 0.1027, + "epoch": 3.8437029767746154, + "grad_norm": 0.28917068243026733, + "learning_rate": 3.746000353249234e-05, + "loss": 0.0452, "step": 14690 }, { - "epoch": 0.9617271835132483, - "grad_norm": 0.8244183659553528, - "learning_rate": 9.972619077927679e-05, - "loss": 0.1032, + "epoch": 3.8463199214916584, + "grad_norm": 0.44032183289527893, + "learning_rate": 3.744208089029642e-05, + "loss": 0.0564, "step": 14700 }, { - "epoch": 0.962381419692509, - "grad_norm": 0.7864908576011658, - "learning_rate": 9.972522991537573e-05, - "loss": 0.0975, + "epoch": 3.8489368662087013, + "grad_norm": 0.5193437933921814, + "learning_rate": 3.7424149744055534e-05, + "loss": 0.0511, "step": 14710 }, { - "epoch": 0.9630356558717698, - "grad_norm": 0.8401934504508972, - "learning_rate": 9.972426737311774e-05, - "loss": 0.1166, + "epoch": 3.8515538109257443, + "grad_norm": 0.46186214685440063, + "learning_rate": 3.740621010602545e-05, + "loss": 0.054, "step": 14720 }, { - "epoch": 0.9636898920510304, - "grad_norm": 0.8961406350135803, - "learning_rate": 9.972330315253534e-05, - "loss": 0.1121, + "epoch": 3.854170755642787, + "grad_norm": 0.5270763635635376, + "learning_rate": 3.7388261988467747e-05, + "loss": 0.0504, "step": 14730 }, { - "epoch": 0.9643441282302911, - "grad_norm": 0.9105174541473389, - "learning_rate": 9.972233725366102e-05, - "loss": 0.1047, + "epoch": 3.85678770035983, + "grad_norm": 0.4619028866291046, + "learning_rate": 3.7370305403649774e-05, + "loss": 0.0518, "step": 14740 }, { - "epoch": 0.9649983644095519, - "grad_norm": 0.9146085977554321, - "learning_rate": 9.972136967652746e-05, - "loss": 0.1086, + "epoch": 3.859404645076873, + "grad_norm": 0.4640951156616211, + "learning_rate": 3.7352340363844704e-05, + "loss": 0.0498, "step": 14750 }, { - "epoch": 0.9656526005888125, - "grad_norm": 0.8616661429405212, - "learning_rate": 9.972040042116724e-05, - "loss": 0.1113, + "epoch": 3.8620215897939154, + "grad_norm": 0.3832848370075226, + "learning_rate": 3.7334366881331486e-05, + "loss": 0.052, "step": 14760 }, { - "epoch": 0.9663068367680733, - "grad_norm": 0.9188449382781982, - "learning_rate": 9.971942948761313e-05, - "loss": 0.1171, + "epoch": 3.8646385345109584, + "grad_norm": 0.5284420847892761, + "learning_rate": 3.73163849683948e-05, + "loss": 0.0509, "step": 14770 }, { - "epoch": 0.966961072947334, - "grad_norm": 0.9797625541687012, - "learning_rate": 9.971845687589786e-05, - "loss": 0.1142, + "epoch": 3.8672554792280014, + "grad_norm": 0.5752606987953186, + "learning_rate": 3.729839463732513e-05, + "loss": 0.0513, "step": 14780 }, { - "epoch": 0.9676153091265947, - "grad_norm": 0.9933854341506958, - "learning_rate": 9.97174825860543e-05, - "loss": 0.1118, + "epoch": 3.8698724239450444, + "grad_norm": 0.5546534657478333, + "learning_rate": 3.7280395900418685e-05, + "loss": 0.0489, "step": 14790 }, { - "epoch": 0.9682695453058554, - "grad_norm": 0.9022259712219238, - "learning_rate": 9.971650661811529e-05, - "loss": 0.1073, + "epoch": 3.872489368662087, + "grad_norm": 0.5715247392654419, + "learning_rate": 3.726238876997744e-05, + "loss": 0.0533, "step": 14800 }, { - "epoch": 0.9689237814851162, - "grad_norm": 0.824038028717041, - "learning_rate": 9.971552897211381e-05, - "loss": 0.1161, + "epoch": 3.87510631337913, + "grad_norm": 0.3901219666004181, + "learning_rate": 3.724437325830911e-05, + "loss": 0.0513, "step": 14810 }, { - "epoch": 0.9695780176643768, - "grad_norm": 0.7811712622642517, - "learning_rate": 9.971454964808284e-05, - "loss": 0.1043, + "epoch": 3.8777232580961725, + "grad_norm": 0.4849676489830017, + "learning_rate": 3.722634937772711e-05, + "loss": 0.0457, "step": 14820 }, { - "epoch": 0.9702322538436375, - "grad_norm": 1.1899689435958862, - "learning_rate": 9.971356864605544e-05, - "loss": 0.1191, + "epoch": 3.8803402028132155, + "grad_norm": 0.3245883285999298, + "learning_rate": 3.72083171405506e-05, + "loss": 0.0547, "step": 14830 }, { - "epoch": 0.9708864900228983, - "grad_norm": 0.9993957281112671, - "learning_rate": 9.971258596606472e-05, - "loss": 0.1146, + "epoch": 3.8829571475302584, + "grad_norm": 0.5932340621948242, + "learning_rate": 3.719027655910443e-05, + "loss": 0.0508, "step": 14840 }, { - "epoch": 0.971540726202159, - "grad_norm": 0.8771776556968689, - "learning_rate": 9.971160160814386e-05, - "loss": 0.1151, + "epoch": 3.8855740922473014, + "grad_norm": 0.36464598774909973, + "learning_rate": 3.7172227645719186e-05, + "loss": 0.0499, "step": 14850 }, { - "epoch": 0.9721949623814197, - "grad_norm": 0.9334481358528137, - "learning_rate": 9.971061557232606e-05, - "loss": 0.1068, + "epoch": 3.888191036964344, + "grad_norm": 0.4754194915294647, + "learning_rate": 3.7154170412731124e-05, + "loss": 0.0532, "step": 14860 }, { - "epoch": 0.9728491985606804, - "grad_norm": 0.9948738813400269, - "learning_rate": 9.970962785864461e-05, - "loss": 0.1052, + "epoch": 3.890807981681387, + "grad_norm": 0.4943714439868927, + "learning_rate": 3.713610487248219e-05, + "loss": 0.0511, "step": 14870 }, { - "epoch": 0.9735034347399412, - "grad_norm": 0.8575040102005005, - "learning_rate": 9.970863846713286e-05, - "loss": 0.1025, + "epoch": 3.89342492639843, + "grad_norm": 0.7925359606742859, + "learning_rate": 3.7118031037320025e-05, + "loss": 0.0566, "step": 14880 }, { - "epoch": 0.9741576709192018, - "grad_norm": 1.019413948059082, - "learning_rate": 9.970764739782419e-05, - "loss": 0.1091, + "epoch": 3.8960418711154725, + "grad_norm": 0.43726474046707153, + "learning_rate": 3.709994891959789e-05, + "loss": 0.0457, "step": 14890 }, { - "epoch": 0.9748119070984625, - "grad_norm": 0.8023679852485657, - "learning_rate": 9.970665465075205e-05, - "loss": 0.1044, + "epoch": 3.8986588158325155, + "grad_norm": 0.4101634621620178, + "learning_rate": 3.708185853167478e-05, + "loss": 0.0555, "step": 14900 }, { - "epoch": 0.9754661432777233, - "grad_norm": 0.8333026766777039, - "learning_rate": 9.970566022594996e-05, - "loss": 0.1055, + "epoch": 3.9012757605495585, + "grad_norm": 0.7545664310455322, + "learning_rate": 3.706375988591528e-05, + "loss": 0.0596, "step": 14910 }, { - "epoch": 0.9761203794569839, - "grad_norm": 0.9163589477539062, - "learning_rate": 9.97046641234515e-05, - "loss": 0.0995, + "epoch": 3.9038927052666015, + "grad_norm": 0.5561701059341431, + "learning_rate": 3.704565299468966e-05, + "loss": 0.0468, "step": 14920 }, { - "epoch": 0.9767746156362447, - "grad_norm": 0.90887850522995, - "learning_rate": 9.970366634329024e-05, - "loss": 0.1066, + "epoch": 3.906509649983644, + "grad_norm": 0.5525078177452087, + "learning_rate": 3.70275378703738e-05, + "loss": 0.0542, "step": 14930 }, { - "epoch": 0.9774288518155054, - "grad_norm": 1.0364896059036255, - "learning_rate": 9.970266688549991e-05, - "loss": 0.1136, + "epoch": 3.909126594700687, + "grad_norm": 0.45929309725761414, + "learning_rate": 3.700941452534922e-05, + "loss": 0.0586, "step": 14940 }, { - "epoch": 0.9780830879947661, - "grad_norm": 1.5073118209838867, - "learning_rate": 9.970166575011422e-05, - "loss": 0.1048, + "epoch": 3.9117435394177296, + "grad_norm": 0.6954630017280579, + "learning_rate": 3.699128297200305e-05, + "loss": 0.0459, "step": 14950 }, { - "epoch": 0.9787373241740268, - "grad_norm": 0.8328016400337219, - "learning_rate": 9.970066293716695e-05, - "loss": 0.1149, + "epoch": 3.9143604841347726, + "grad_norm": 0.4128349721431732, + "learning_rate": 3.697314322272804e-05, + "loss": 0.0478, "step": 14960 }, { - "epoch": 0.9793915603532876, - "grad_norm": 0.8772666454315186, - "learning_rate": 9.969965844669197e-05, - "loss": 0.1238, + "epoch": 3.9169774288518155, + "grad_norm": 0.576116681098938, + "learning_rate": 3.695499528992253e-05, + "loss": 0.0528, "step": 14970 }, { - "epoch": 0.9800457965325482, - "grad_norm": 1.083786964416504, - "learning_rate": 9.969865227872317e-05, - "loss": 0.1016, + "epoch": 3.9195943735688585, + "grad_norm": 0.7343944907188416, + "learning_rate": 3.693683918599049e-05, + "loss": 0.0554, "step": 14980 }, { - "epoch": 0.9807000327118089, - "grad_norm": 0.9459251165390015, - "learning_rate": 9.969764443329452e-05, - "loss": 0.1047, + "epoch": 3.922211318285901, + "grad_norm": 0.4868308901786804, + "learning_rate": 3.6918674923341405e-05, + "loss": 0.0498, "step": 14990 }, { - "epoch": 0.9813542688910697, - "grad_norm": 0.8524743914604187, - "learning_rate": 9.969663491044003e-05, - "loss": 0.1097, + "epoch": 3.924828263002944, + "grad_norm": 0.6821767091751099, + "learning_rate": 3.69005025143904e-05, + "loss": 0.0512, "step": 15000 }, { - "epoch": 0.9820085050703304, - "grad_norm": 0.9077591896057129, - "learning_rate": 9.969562371019379e-05, - "loss": 0.1087, + "epoch": 3.924828263002944, + "eval_loss": 0.05896398778761369, + "eval_runtime": 8.8799, + "eval_samples_per_second": 115.317, + "eval_steps_per_second": 1.802, + "step": 15000 + }, + { + "epoch": 3.927445207719987, + "grad_norm": 0.43927785754203796, + "learning_rate": 3.688232197155814e-05, + "loss": 0.0546, "step": 15010 }, { - "epoch": 0.9826627412495911, - "grad_norm": 0.9452090263366699, - "learning_rate": 9.969461083258991e-05, - "loss": 0.11, + "epoch": 3.9300621524370296, + "grad_norm": 0.48438242077827454, + "learning_rate": 3.686413330727086e-05, + "loss": 0.0515, "step": 15020 }, { - "epoch": 0.9833169774288518, - "grad_norm": 0.9411560297012329, - "learning_rate": 9.969359627766258e-05, - "loss": 0.1065, + "epoch": 3.9326790971540726, + "grad_norm": 0.43656083941459656, + "learning_rate": 3.684593653396034e-05, + "loss": 0.0491, "step": 15030 }, { - "epoch": 0.9839712136081126, - "grad_norm": 0.8572501540184021, - "learning_rate": 9.969258004544606e-05, - "loss": 0.1103, + "epoch": 3.9352960418711156, + "grad_norm": 0.5122218728065491, + "learning_rate": 3.6827731664063895e-05, + "loss": 0.0475, "step": 15040 }, { - "epoch": 0.9846254497873732, - "grad_norm": 1.1027193069458008, - "learning_rate": 9.969156213597464e-05, - "loss": 0.1198, + "epoch": 3.9379129865881586, + "grad_norm": 0.6488099694252014, + "learning_rate": 3.680951871002438e-05, + "loss": 0.052, "step": 15050 }, { - "epoch": 0.985279685966634, - "grad_norm": 0.780197024345398, - "learning_rate": 9.969054254928267e-05, - "loss": 0.1102, + "epoch": 3.940529931305201, + "grad_norm": 0.7116113305091858, + "learning_rate": 3.6791297684290196e-05, + "loss": 0.0554, "step": 15060 }, { - "epoch": 0.9859339221458947, - "grad_norm": 0.8345792889595032, - "learning_rate": 9.968952128540456e-05, - "loss": 0.1166, + "epoch": 3.943146876022244, + "grad_norm": 0.5660650134086609, + "learning_rate": 3.677306859931522e-05, + "loss": 0.0529, "step": 15070 }, { - "epoch": 0.9865881583251553, - "grad_norm": 0.8777825236320496, - "learning_rate": 9.968849834437481e-05, - "loss": 0.1066, + "epoch": 3.9457638207392867, + "grad_norm": 0.5480781197547913, + "learning_rate": 3.675483146755888e-05, + "loss": 0.0591, "step": 15080 }, { - "epoch": 0.9872423945044161, - "grad_norm": 0.933706521987915, - "learning_rate": 9.968747372622793e-05, - "loss": 0.1241, + "epoch": 3.9483807654563297, + "grad_norm": 0.6030336618423462, + "learning_rate": 3.673658630148606e-05, + "loss": 0.0571, "step": 15090 }, { - "epoch": 0.9878966306836768, - "grad_norm": 0.7910727858543396, - "learning_rate": 9.968644743099848e-05, - "loss": 0.1132, + "epoch": 3.9509977101733726, + "grad_norm": 0.6756324768066406, + "learning_rate": 3.671833311356718e-05, + "loss": 0.0587, "step": 15100 }, { - "epoch": 0.9885508668629375, - "grad_norm": 0.7466284036636353, - "learning_rate": 9.968541945872114e-05, - "loss": 0.1079, + "epoch": 3.9536146548904156, + "grad_norm": 0.565447986125946, + "learning_rate": 3.670007191627812e-05, + "loss": 0.049, "step": 15110 }, { - "epoch": 0.9892051030421982, - "grad_norm": 0.8247948884963989, - "learning_rate": 9.968438980943057e-05, - "loss": 0.1025, + "epoch": 3.956231599607458, + "grad_norm": 0.8143734335899353, + "learning_rate": 3.668180272210022e-05, + "loss": 0.0555, "step": 15120 }, { - "epoch": 0.989859339221459, - "grad_norm": 0.8794564008712769, - "learning_rate": 9.968335848316157e-05, - "loss": 0.1155, + "epoch": 3.958848544324501, + "grad_norm": 0.5056126117706299, + "learning_rate": 3.666352554352032e-05, + "loss": 0.0488, "step": 15130 }, { - "epoch": 0.9905135754007197, - "grad_norm": 0.794762909412384, - "learning_rate": 9.968232547994891e-05, - "loss": 0.1075, + "epoch": 3.961465489041544, + "grad_norm": 0.6878169178962708, + "learning_rate": 3.664524039303069e-05, + "loss": 0.0514, "step": 15140 }, { - "epoch": 0.9911678115799804, - "grad_norm": 0.8396781086921692, - "learning_rate": 9.968129079982747e-05, - "loss": 0.1137, + "epoch": 3.9640824337585867, + "grad_norm": 0.5267339944839478, + "learning_rate": 3.662694728312905e-05, + "loss": 0.0513, "step": 15150 }, { - "epoch": 0.9918220477592411, - "grad_norm": 0.7761490941047668, - "learning_rate": 9.968025444283215e-05, - "loss": 0.1065, + "epoch": 3.9666993784756297, + "grad_norm": 0.36863964796066284, + "learning_rate": 3.660864622631859e-05, + "loss": 0.0463, "step": 15160 }, { - "epoch": 0.9924762839385018, - "grad_norm": 1.025817632675171, - "learning_rate": 9.967921640899797e-05, - "loss": 0.1023, + "epoch": 3.9693163231926727, + "grad_norm": 0.4979141652584076, + "learning_rate": 3.659033723510789e-05, + "loss": 0.052, "step": 15170 }, { - "epoch": 0.9931305201177625, - "grad_norm": 0.8451741337776184, - "learning_rate": 9.967817669835995e-05, - "loss": 0.1135, + "epoch": 3.9719332679097157, + "grad_norm": 0.34969276189804077, + "learning_rate": 3.657202032201099e-05, + "loss": 0.0461, "step": 15180 }, { - "epoch": 0.9937847562970232, - "grad_norm": 0.7876104712486267, - "learning_rate": 9.967713531095317e-05, - "loss": 0.1145, + "epoch": 3.9745502126267582, + "grad_norm": 0.31592005491256714, + "learning_rate": 3.6553695499547305e-05, + "loss": 0.0529, "step": 15190 }, { - "epoch": 0.994438992476284, - "grad_norm": 1.0193179845809937, - "learning_rate": 9.967609224681281e-05, - "loss": 0.1023, + "epoch": 3.977167157343801, + "grad_norm": 0.5293835997581482, + "learning_rate": 3.6535362780241694e-05, + "loss": 0.0541, "step": 15200 }, { - "epoch": 0.9950932286555446, - "grad_norm": 0.8599629998207092, - "learning_rate": 9.967504750597405e-05, - "loss": 0.1134, + "epoch": 3.9797841020608438, + "grad_norm": 0.6030070781707764, + "learning_rate": 3.65170221766244e-05, + "loss": 0.0514, "step": 15210 }, { - "epoch": 0.9957474648348054, - "grad_norm": 0.9450410604476929, - "learning_rate": 9.967400108847213e-05, - "loss": 0.1128, + "epoch": 3.9824010467778868, + "grad_norm": 0.53548264503479, + "learning_rate": 3.649867370123104e-05, + "loss": 0.0524, "step": 15220 }, { - "epoch": 0.9964017010140661, - "grad_norm": 0.841467022895813, - "learning_rate": 9.967295299434243e-05, - "loss": 0.1072, + "epoch": 3.9850179914949297, + "grad_norm": 0.585054337978363, + "learning_rate": 3.648031736660264e-05, + "loss": 0.0482, "step": 15230 }, { - "epoch": 0.9970559371933267, - "grad_norm": 0.7316820621490479, - "learning_rate": 9.967190322362029e-05, - "loss": 0.1059, + "epoch": 3.9876349362119727, + "grad_norm": 0.6449889540672302, + "learning_rate": 3.6461953185285566e-05, + "loss": 0.0509, "step": 15240 }, { - "epoch": 0.9977101733725875, - "grad_norm": 0.8536782264709473, - "learning_rate": 9.967085177634115e-05, - "loss": 0.1026, + "epoch": 3.9902518809290153, + "grad_norm": 0.5677632689476013, + "learning_rate": 3.644358116983157e-05, + "loss": 0.0516, "step": 15250 }, { - "epoch": 0.9983644095518482, - "grad_norm": 0.8215930461883545, - "learning_rate": 9.966979865254047e-05, - "loss": 0.1131, + "epoch": 3.9928688256460583, + "grad_norm": 0.49215826392173767, + "learning_rate": 3.6425201332797755e-05, + "loss": 0.0505, "step": 15260 }, { - "epoch": 0.9990186457311089, - "grad_norm": 1.0290292501449585, - "learning_rate": 9.966874385225385e-05, - "loss": 0.1151, + "epoch": 3.995485770363101, + "grad_norm": 0.6127063632011414, + "learning_rate": 3.640681368674656e-05, + "loss": 0.0531, "step": 15270 }, { - "epoch": 0.9996728819103696, - "grad_norm": 1.0046546459197998, - "learning_rate": 9.966768737551685e-05, - "loss": 0.1078, + "epoch": 3.998102715080144, + "grad_norm": 0.6169252395629883, + "learning_rate": 3.638841824424577e-05, + "loss": 0.0519, "step": 15280 }, { - "epoch": 1.0003271180896303, - "grad_norm": 1.0391733646392822, - "learning_rate": 9.966662922236515e-05, - "loss": 0.1175, + "epoch": 4.000523388943408, + "grad_norm": 0.491829514503479, + "learning_rate": 3.63700150178685e-05, + "loss": 0.0493, "step": 15290 }, { - "epoch": 1.000981354268891, - "grad_norm": 0.8953946828842163, - "learning_rate": 9.966556939283445e-05, - "loss": 0.1119, + "epoch": 4.003140333660451, + "grad_norm": 0.5655731558799744, + "learning_rate": 3.635160402019317e-05, + "loss": 0.048, "step": 15300 }, { - "epoch": 1.0016355904481518, - "grad_norm": 0.8209472298622131, - "learning_rate": 9.966450788696053e-05, - "loss": 0.1263, + "epoch": 4.005757278377494, + "grad_norm": 0.6087900400161743, + "learning_rate": 3.633318526380354e-05, + "loss": 0.0535, "step": 15310 }, { - "epoch": 1.0022898266274125, - "grad_norm": 0.9228883981704712, - "learning_rate": 9.966344470477922e-05, - "loss": 0.1051, + "epoch": 4.008374223094537, + "grad_norm": 0.5320557355880737, + "learning_rate": 3.631475876128864e-05, + "loss": 0.0396, "step": 15320 }, { - "epoch": 1.0029440628066733, - "grad_norm": 0.8755295276641846, - "learning_rate": 9.966237984632641e-05, - "loss": 0.105, + "epoch": 4.01099116781158, + "grad_norm": 0.44770991802215576, + "learning_rate": 3.629632452524282e-05, + "loss": 0.0434, "step": 15330 }, { - "epoch": 1.003598298985934, - "grad_norm": 0.727872371673584, - "learning_rate": 9.966131331163803e-05, - "loss": 0.1011, + "epoch": 4.0136081125286225, + "grad_norm": 0.5166875123977661, + "learning_rate": 3.627788256826571e-05, + "loss": 0.0506, "step": 15340 }, { - "epoch": 1.0042525351651945, - "grad_norm": 0.8583409190177917, - "learning_rate": 9.96602451007501e-05, - "loss": 0.1096, + "epoch": 4.0162250572456655, + "grad_norm": 0.6423705816268921, + "learning_rate": 3.6259432902962195e-05, + "loss": 0.0537, "step": 15350 }, { - "epoch": 1.0049067713444553, - "grad_norm": 0.971300482749939, - "learning_rate": 9.965917521369865e-05, - "loss": 0.1031, + "epoch": 4.0188420019627085, + "grad_norm": 0.62786465883255, + "learning_rate": 3.624097554194248e-05, + "loss": 0.0527, "step": 15360 }, { - "epoch": 1.005561007523716, - "grad_norm": 0.8533384799957275, - "learning_rate": 9.96581036505198e-05, - "loss": 0.1199, + "epoch": 4.0214589466797515, + "grad_norm": 0.6154875755310059, + "learning_rate": 3.622251049782197e-05, + "loss": 0.0542, "step": 15370 }, { - "epoch": 1.0062152437029768, - "grad_norm": 0.8822680711746216, - "learning_rate": 9.96570304112497e-05, - "loss": 0.1087, + "epoch": 4.0240758913967944, + "grad_norm": 0.5390315651893616, + "learning_rate": 3.6204037783221356e-05, + "loss": 0.0504, "step": 15380 }, { - "epoch": 1.0068694798822375, - "grad_norm": 0.714950442314148, - "learning_rate": 9.965595549592462e-05, - "loss": 0.112, + "epoch": 4.026692836113837, + "grad_norm": 0.5157982110977173, + "learning_rate": 3.618555741076657e-05, + "loss": 0.0549, "step": 15390 }, { - "epoch": 1.0075237160614983, - "grad_norm": 0.9944178462028503, - "learning_rate": 9.96548789045808e-05, - "loss": 0.1072, + "epoch": 4.0293097808308795, + "grad_norm": 0.5032212138175964, + "learning_rate": 3.6167069393088756e-05, + "loss": 0.0515, "step": 15400 }, { - "epoch": 1.008177952240759, - "grad_norm": 0.8967353105545044, - "learning_rate": 9.96538006372546e-05, - "loss": 0.1119, + "epoch": 4.0319267255479225, + "grad_norm": 0.5536220073699951, + "learning_rate": 3.614857374282432e-05, + "loss": 0.0449, "step": 15410 }, { - "epoch": 1.0088321884200195, - "grad_norm": 0.7786975502967834, - "learning_rate": 9.96527206939824e-05, - "loss": 0.1114, + "epoch": 4.0345436702649655, + "grad_norm": 0.41552475094795227, + "learning_rate": 3.613007047261485e-05, + "loss": 0.0522, "step": 15420 }, { - "epoch": 1.0094864245992803, - "grad_norm": 0.8993393182754517, - "learning_rate": 9.965163907480066e-05, - "loss": 0.1047, + "epoch": 4.0371606149820085, + "grad_norm": 0.45113885402679443, + "learning_rate": 3.6111559595107164e-05, + "loss": 0.0508, "step": 15430 }, { - "epoch": 1.010140660778541, - "grad_norm": 0.8233774304389954, - "learning_rate": 9.965055577974588e-05, - "loss": 0.1033, + "epoch": 4.0397775596990515, + "grad_norm": 0.6253324151039124, + "learning_rate": 3.609304112295328e-05, + "loss": 0.0547, "step": 15440 }, { - "epoch": 1.0107948969578018, - "grad_norm": 0.7405421733856201, - "learning_rate": 9.964947080885464e-05, - "loss": 0.1214, + "epoch": 4.0423945044160945, + "grad_norm": 0.5622521042823792, + "learning_rate": 3.607451506881037e-05, + "loss": 0.0471, "step": 15450 }, { - "epoch": 1.0114491331370625, - "grad_norm": 0.7713890075683594, - "learning_rate": 9.964838416216354e-05, - "loss": 0.1069, + "epoch": 4.0450114491331375, + "grad_norm": 0.346451997756958, + "learning_rate": 3.605598144534085e-05, + "loss": 0.051, "step": 15460 }, { - "epoch": 1.0121033693163233, - "grad_norm": 0.7736397981643677, - "learning_rate": 9.964729583970927e-05, - "loss": 0.1251, + "epoch": 4.04762839385018, + "grad_norm": 0.5430278182029724, + "learning_rate": 3.603744026521227e-05, + "loss": 0.0432, "step": 15470 }, { - "epoch": 1.0127576054955838, - "grad_norm": 0.7739368081092834, - "learning_rate": 9.964620584152857e-05, - "loss": 0.105, + "epoch": 4.050245338567223, + "grad_norm": 0.5486944317817688, + "learning_rate": 3.6018891541097336e-05, + "loss": 0.0515, "step": 15480 }, { - "epoch": 1.0134118416748445, - "grad_norm": 0.8225721120834351, - "learning_rate": 9.964511416765821e-05, - "loss": 0.1196, + "epoch": 4.052862283284266, + "grad_norm": 0.4349093735218048, + "learning_rate": 3.6000335285673934e-05, + "loss": 0.054, "step": 15490 }, { - "epoch": 1.0140660778541053, - "grad_norm": 0.7423897981643677, - "learning_rate": 9.964402081813504e-05, - "loss": 0.1159, + "epoch": 4.055479228001309, + "grad_norm": 0.6662614941596985, + "learning_rate": 3.5981771511625094e-05, + "loss": 0.0527, "step": 15500 }, { - "epoch": 1.014720314033366, - "grad_norm": 0.8205366730690002, - "learning_rate": 9.9642925792996e-05, - "loss": 0.1092, + "epoch": 4.0580961727183515, + "grad_norm": 0.5280335545539856, + "learning_rate": 3.5963200231638976e-05, + "loss": 0.0522, "step": 15510 }, { - "epoch": 1.0153745502126268, - "grad_norm": 0.9027040600776672, - "learning_rate": 9.964182909227799e-05, - "loss": 0.1122, + "epoch": 4.0607131174353945, + "grad_norm": 0.46068698167800903, + "learning_rate": 3.5944621458408883e-05, + "loss": 0.0523, "step": 15520 }, { - "epoch": 1.0160287863918875, - "grad_norm": 0.7824887037277222, - "learning_rate": 9.964073071601808e-05, - "loss": 0.1046, + "epoch": 4.063330062152437, + "grad_norm": 0.4579375386238098, + "learning_rate": 3.5926035204633216e-05, + "loss": 0.0476, "step": 15530 }, { - "epoch": 1.0166830225711483, - "grad_norm": 0.9918166995048523, - "learning_rate": 9.963963066425331e-05, - "loss": 0.1141, + "epoch": 4.06594700686948, + "grad_norm": 0.6499910354614258, + "learning_rate": 3.590744148301552e-05, + "loss": 0.0523, "step": 15540 }, { - "epoch": 1.0173372587504088, - "grad_norm": 0.9670895934104919, - "learning_rate": 9.963852893702081e-05, - "loss": 0.1055, + "epoch": 4.068563951586523, + "grad_norm": 0.5918278694152832, + "learning_rate": 3.5888840306264424e-05, + "loss": 0.0553, "step": 15550 }, { - "epoch": 1.0179914949296696, - "grad_norm": 0.8161757588386536, - "learning_rate": 9.96374255343578e-05, - "loss": 0.109, + "epoch": 4.071180896303566, + "grad_norm": 0.5414620041847229, + "learning_rate": 3.5870231687093644e-05, + "loss": 0.0538, "step": 15560 }, { - "epoch": 1.0186457311089303, - "grad_norm": 0.9068933129310608, - "learning_rate": 9.963632045630147e-05, - "loss": 0.1047, + "epoch": 4.073797841020609, + "grad_norm": 0.6221410632133484, + "learning_rate": 3.5851615638222014e-05, + "loss": 0.0503, "step": 15570 }, { - "epoch": 1.019299967288191, - "grad_norm": 0.902734100818634, - "learning_rate": 9.963521370288917e-05, - "loss": 0.1019, + "epoch": 4.076414785737652, + "grad_norm": 0.5084514617919922, + "learning_rate": 3.583299217237341e-05, + "loss": 0.0543, "step": 15580 }, { - "epoch": 1.0199542034674518, - "grad_norm": 0.9012474417686462, - "learning_rate": 9.963410527415823e-05, - "loss": 0.1052, + "epoch": 4.079031730454695, + "grad_norm": 0.44841188192367554, + "learning_rate": 3.581436130227682e-05, + "loss": 0.045, "step": 15590 }, { - "epoch": 1.0206084396467126, - "grad_norm": 0.7533280253410339, - "learning_rate": 9.963299517014608e-05, - "loss": 0.1101, + "epoch": 4.081648675171737, + "grad_norm": 0.43916547298431396, + "learning_rate": 3.579572304066624e-05, + "loss": 0.0475, "step": 15600 }, { - "epoch": 1.021262675825973, - "grad_norm": 0.8915255069732666, - "learning_rate": 9.963188339089015e-05, - "loss": 0.0986, + "epoch": 4.08426561988878, + "grad_norm": 0.4475759267807007, + "learning_rate": 3.5777077400280765e-05, + "loss": 0.0462, "step": 15610 }, { - "epoch": 1.0219169120052338, - "grad_norm": 0.8015055060386658, - "learning_rate": 9.963076993642802e-05, - "loss": 0.1187, + "epoch": 4.086882564605823, + "grad_norm": 0.6395149827003479, + "learning_rate": 3.575842439386451e-05, + "loss": 0.0496, "step": 15620 }, { - "epoch": 1.0225711481844946, - "grad_norm": 0.9138506054878235, - "learning_rate": 9.962965480679721e-05, - "loss": 0.106, + "epoch": 4.089499509322866, + "grad_norm": 0.5050554275512695, + "learning_rate": 3.573976403416662e-05, + "loss": 0.0456, "step": 15630 }, { - "epoch": 1.0232253843637553, - "grad_norm": 0.8131011724472046, - "learning_rate": 9.962853800203541e-05, - "loss": 0.1278, + "epoch": 4.092116454039909, + "grad_norm": 0.5360559821128845, + "learning_rate": 3.57210963339413e-05, + "loss": 0.0504, "step": 15640 }, { - "epoch": 1.023879620543016, - "grad_norm": 0.8767716288566589, - "learning_rate": 9.96274195221803e-05, - "loss": 0.1054, + "epoch": 4.094733398756952, + "grad_norm": 0.5057238936424255, + "learning_rate": 3.5702421305947714e-05, + "loss": 0.0528, "step": 15650 }, { - "epoch": 1.0245338567222768, - "grad_norm": 0.8304327726364136, - "learning_rate": 9.962629936726962e-05, - "loss": 0.0943, + "epoch": 4.097350343473994, + "grad_norm": 0.4850975573062897, + "learning_rate": 3.5683738962950086e-05, + "loss": 0.0424, "step": 15660 }, { - "epoch": 1.0251880929015376, - "grad_norm": 0.8598764538764954, - "learning_rate": 9.96251775373412e-05, - "loss": 0.108, + "epoch": 4.099967288191037, + "grad_norm": 0.6539948582649231, + "learning_rate": 3.566504931771762e-05, + "loss": 0.0508, "step": 15670 }, { - "epoch": 1.025842329080798, - "grad_norm": 0.8592802286148071, - "learning_rate": 9.962405403243287e-05, - "loss": 0.1186, + "epoch": 4.10258423290808, + "grad_norm": 0.4550129473209381, + "learning_rate": 3.5646352383024504e-05, + "loss": 0.043, "step": 15680 }, { - "epoch": 1.0264965652600588, - "grad_norm": 0.9450241327285767, - "learning_rate": 9.962292885258259e-05, - "loss": 0.1196, + "epoch": 4.105201177625123, + "grad_norm": 0.5807818174362183, + "learning_rate": 3.562764817164994e-05, + "loss": 0.0471, "step": 15690 }, { - "epoch": 1.0271508014393196, - "grad_norm": 0.8318309187889099, - "learning_rate": 9.962180199782831e-05, - "loss": 0.1173, + "epoch": 4.107818122342166, + "grad_norm": 0.604076087474823, + "learning_rate": 3.560893669637805e-05, + "loss": 0.0472, "step": 15700 }, { - "epoch": 1.0278050376185803, - "grad_norm": 0.8451339602470398, - "learning_rate": 9.962067346820808e-05, - "loss": 0.119, + "epoch": 4.110435067059209, + "grad_norm": 0.4531025290489197, + "learning_rate": 3.5590217969997964e-05, + "loss": 0.0495, "step": 15710 }, { - "epoch": 1.028459273797841, - "grad_norm": 0.8618002533912659, - "learning_rate": 9.961954326375998e-05, - "loss": 0.1103, + "epoch": 4.113052011776251, + "grad_norm": 0.3466964364051819, + "learning_rate": 3.557149200530376e-05, + "loss": 0.0438, "step": 15720 }, { - "epoch": 1.0291135099771018, - "grad_norm": 0.7853378653526306, - "learning_rate": 9.961841138452217e-05, - "loss": 0.1144, + "epoch": 4.115668956493294, + "grad_norm": 0.4820724427700043, + "learning_rate": 3.555275881509445e-05, + "loss": 0.0461, "step": 15730 }, { - "epoch": 1.0297677461563624, - "grad_norm": 0.8953865766525269, - "learning_rate": 9.961727783053285e-05, - "loss": 0.1148, + "epoch": 4.118285901210337, + "grad_norm": 0.586309552192688, + "learning_rate": 3.5534018412174e-05, + "loss": 0.0472, "step": 15740 }, { - "epoch": 1.030421982335623, - "grad_norm": 0.9555432200431824, - "learning_rate": 9.961614260183028e-05, - "loss": 0.1188, + "epoch": 4.12090284592738, + "grad_norm": 0.46379247307777405, + "learning_rate": 3.55152708093513e-05, + "loss": 0.0522, "step": 15750 }, { - "epoch": 1.0310762185148838, - "grad_norm": 0.9691207408905029, - "learning_rate": 9.961500569845275e-05, - "loss": 0.1037, + "epoch": 4.123519790644423, + "grad_norm": 0.438968688249588, + "learning_rate": 3.549651601944014e-05, + "loss": 0.0503, "step": 15760 }, { - "epoch": 1.0317304546941446, - "grad_norm": 0.9222931265830994, - "learning_rate": 9.961386712043868e-05, - "loss": 0.1007, + "epoch": 4.126136735361466, + "grad_norm": 0.5727360844612122, + "learning_rate": 3.547775405525927e-05, + "loss": 0.0491, "step": 15770 }, { - "epoch": 1.0323846908734053, - "grad_norm": 0.838117241859436, - "learning_rate": 9.961272686782646e-05, - "loss": 0.1034, + "epoch": 4.128753680078509, + "grad_norm": 0.5099532604217529, + "learning_rate": 3.54589849296323e-05, + "loss": 0.0523, "step": 15780 }, { - "epoch": 1.033038927052666, - "grad_norm": 1.437838077545166, - "learning_rate": 9.961158494065461e-05, - "loss": 0.1285, + "epoch": 4.131370624795551, + "grad_norm": 0.7122123837471008, + "learning_rate": 3.5440208655387754e-05, + "loss": 0.0575, "step": 15790 }, { - "epoch": 1.0336931632319266, - "grad_norm": 0.8757314085960388, - "learning_rate": 9.961044133896166e-05, - "loss": 0.1098, + "epoch": 4.133987569512594, + "grad_norm": 0.437671422958374, + "learning_rate": 3.542142524535903e-05, + "loss": 0.0481, "step": 15800 }, { - "epoch": 1.0343473994111874, - "grad_norm": 0.8826759457588196, - "learning_rate": 9.96092960627862e-05, - "loss": 0.1102, + "epoch": 4.136604514229637, + "grad_norm": 0.4661213159561157, + "learning_rate": 3.540263471238443e-05, + "loss": 0.0571, "step": 15810 }, { - "epoch": 1.035001635590448, - "grad_norm": 0.820978581905365, - "learning_rate": 9.96081491121669e-05, - "loss": 0.1053, + "epoch": 4.13922145894668, + "grad_norm": 0.6541229486465454, + "learning_rate": 3.538383706930709e-05, + "loss": 0.0499, "step": 15820 }, { - "epoch": 1.0356558717697089, - "grad_norm": 0.8541067242622375, - "learning_rate": 9.960700048714244e-05, - "loss": 0.1115, + "epoch": 4.141838403663723, + "grad_norm": 0.3765426278114319, + "learning_rate": 3.5365032328975025e-05, + "loss": 0.0494, "step": 15830 }, { - "epoch": 1.0363101079489696, - "grad_norm": 0.7648676633834839, - "learning_rate": 9.960585018775164e-05, - "loss": 0.1015, + "epoch": 4.144455348380766, + "grad_norm": 1.0478640794754028, + "learning_rate": 3.53462205042411e-05, + "loss": 0.052, "step": 15840 }, { - "epoch": 1.0369643441282304, - "grad_norm": 0.8757931590080261, - "learning_rate": 9.960469821403329e-05, - "loss": 0.1006, + "epoch": 4.147072293097808, + "grad_norm": 0.5365044474601746, + "learning_rate": 3.532740160796302e-05, + "loss": 0.0524, "step": 15850 }, { - "epoch": 1.037618580307491, - "grad_norm": 0.9553504586219788, - "learning_rate": 9.960354456602628e-05, - "loss": 0.1076, + "epoch": 4.149689237814851, + "grad_norm": 0.6117434501647949, + "learning_rate": 3.5308575653003314e-05, + "loss": 0.055, "step": 15860 }, { - "epoch": 1.0382728164867516, - "grad_norm": 1.0022895336151123, - "learning_rate": 9.960238924376954e-05, - "loss": 0.1076, + "epoch": 4.152306182531894, + "grad_norm": 0.6938223838806152, + "learning_rate": 3.5289742652229366e-05, + "loss": 0.0508, "step": 15870 }, { - "epoch": 1.0389270526660124, - "grad_norm": 0.8987383842468262, - "learning_rate": 9.96012322473021e-05, - "loss": 0.112, + "epoch": 4.154923127248937, + "grad_norm": 0.5880016684532166, + "learning_rate": 3.527090261851334e-05, + "loss": 0.0494, "step": 15880 }, { - "epoch": 1.0395812888452731, - "grad_norm": 0.8711419105529785, - "learning_rate": 9.960007357666297e-05, - "loss": 0.1046, + "epoch": 4.15754007196598, + "grad_norm": 0.4967406094074249, + "learning_rate": 3.525205556473221e-05, + "loss": 0.0539, "step": 15890 }, { - "epoch": 1.0402355250245339, - "grad_norm": 0.6669361591339111, - "learning_rate": 9.95989132318913e-05, - "loss": 0.1021, + "epoch": 4.160157016683023, + "grad_norm": 0.6478481888771057, + "learning_rate": 3.5233201503767786e-05, + "loss": 0.0573, "step": 15900 }, { - "epoch": 1.0408897612037946, - "grad_norm": 0.7434557676315308, - "learning_rate": 9.959775121302621e-05, - "loss": 0.1104, + "epoch": 4.162773961400066, + "grad_norm": 0.5297810435295105, + "learning_rate": 3.5214340448506624e-05, + "loss": 0.0485, "step": 15910 }, { - "epoch": 1.0415439973830554, - "grad_norm": 0.8554294109344482, - "learning_rate": 9.959658752010695e-05, - "loss": 0.104, + "epoch": 4.165390906117108, + "grad_norm": 0.4374099671840668, + "learning_rate": 3.519547241184008e-05, + "loss": 0.0537, "step": 15920 }, { - "epoch": 1.042198233562316, - "grad_norm": 0.9452110528945923, - "learning_rate": 9.959542215317278e-05, - "loss": 0.1057, + "epoch": 4.168007850834151, + "grad_norm": 0.5356457829475403, + "learning_rate": 3.517659740666429e-05, + "loss": 0.0482, "step": 15930 }, { - "epoch": 1.0428524697415766, - "grad_norm": 0.9260740876197815, - "learning_rate": 9.959425511226304e-05, - "loss": 0.1065, + "epoch": 4.170624795551194, + "grad_norm": 0.36456596851348877, + "learning_rate": 3.5157715445880114e-05, + "loss": 0.0465, "step": 15940 }, { - "epoch": 1.0435067059208374, - "grad_norm": 0.939108669757843, - "learning_rate": 9.959308639741714e-05, - "loss": 0.1098, + "epoch": 4.173241740268237, + "grad_norm": 0.5124794244766235, + "learning_rate": 3.513882654239322e-05, + "loss": 0.0489, "step": 15950 }, { - "epoch": 1.0441609421000981, - "grad_norm": 0.7533574104309082, - "learning_rate": 9.95919160086745e-05, - "loss": 0.11, + "epoch": 4.17585868498528, + "grad_norm": 0.6307095289230347, + "learning_rate": 3.511993070911399e-05, + "loss": 0.0493, "step": 15960 }, { - "epoch": 1.0448151782793589, - "grad_norm": 0.7909335494041443, - "learning_rate": 9.959074394607464e-05, - "loss": 0.1032, + "epoch": 4.178475629702323, + "grad_norm": 0.6663428544998169, + "learning_rate": 3.510102795895755e-05, + "loss": 0.0511, "step": 15970 }, { - "epoch": 1.0454694144586196, - "grad_norm": 0.6894845366477966, - "learning_rate": 9.958957020965712e-05, - "loss": 0.0998, + "epoch": 4.181092574419365, + "grad_norm": 0.684826135635376, + "learning_rate": 3.508211830484374e-05, + "loss": 0.0533, "step": 15980 }, { - "epoch": 1.0461236506378804, - "grad_norm": 0.8080494403839111, - "learning_rate": 9.958839479946154e-05, - "loss": 0.1073, + "epoch": 4.183709519136408, + "grad_norm": 0.3565167188644409, + "learning_rate": 3.506320175969714e-05, + "loss": 0.0504, "step": 15990 }, { - "epoch": 1.046777886817141, - "grad_norm": 0.8611446022987366, - "learning_rate": 9.958721771552759e-05, - "loss": 0.1055, + "epoch": 4.186326463853451, + "grad_norm": 0.544464111328125, + "learning_rate": 3.504427833644702e-05, + "loss": 0.0454, "step": 16000 }, { - "epoch": 1.0474321229964016, - "grad_norm": 0.9711137413978577, - "learning_rate": 9.958603895789501e-05, - "loss": 0.1015, + "epoch": 4.186326463853451, + "eval_loss": 0.05761846851942311, + "eval_runtime": 8.7972, + "eval_samples_per_second": 116.4, + "eval_steps_per_second": 1.819, + "step": 16000 + }, + { + "epoch": 4.188943408570494, + "grad_norm": 0.40490591526031494, + "learning_rate": 3.502534804802738e-05, + "loss": 0.0448, "step": 16010 }, { - "epoch": 1.0480863591756624, - "grad_norm": 0.9332404732704163, - "learning_rate": 9.958485852660356e-05, - "loss": 0.1092, + "epoch": 4.191560353287537, + "grad_norm": 0.4140538275241852, + "learning_rate": 3.500641090737689e-05, + "loss": 0.0494, "step": 16020 }, { - "epoch": 1.0487405953549231, - "grad_norm": 0.7202958464622498, - "learning_rate": 9.958367642169308e-05, - "loss": 0.0976, + "epoch": 4.19417729800458, + "grad_norm": 0.4455014765262604, + "learning_rate": 3.4987466927438875e-05, + "loss": 0.0481, "step": 16030 }, { - "epoch": 1.049394831534184, - "grad_norm": 0.9648202657699585, - "learning_rate": 9.958249264320349e-05, - "loss": 0.1103, + "epoch": 4.196794242721623, + "grad_norm": 0.4680039584636688, + "learning_rate": 3.49685161211614e-05, + "loss": 0.0458, "step": 16040 }, { - "epoch": 1.0500490677134446, - "grad_norm": 0.8752630949020386, - "learning_rate": 9.958130719117475e-05, - "loss": 0.1095, + "epoch": 4.199411187438665, + "grad_norm": 0.45873773097991943, + "learning_rate": 3.4949558501497166e-05, + "loss": 0.0508, "step": 16050 }, { - "epoch": 1.0507033038927052, - "grad_norm": 1.0610864162445068, - "learning_rate": 9.958012006564686e-05, - "loss": 0.1026, + "epoch": 4.202028132155708, + "grad_norm": 0.5634317994117737, + "learning_rate": 3.49305940814035e-05, + "loss": 0.0506, "step": 16060 }, { - "epoch": 1.051357540071966, - "grad_norm": 0.9164272546768188, - "learning_rate": 9.957893126665987e-05, - "loss": 0.112, + "epoch": 4.204645076872751, + "grad_norm": 0.5611090660095215, + "learning_rate": 3.4911622873842434e-05, + "loss": 0.0518, "step": 16070 }, { - "epoch": 1.0520117762512267, - "grad_norm": 0.8324456214904785, - "learning_rate": 9.957774079425395e-05, - "loss": 0.1134, + "epoch": 4.207262021589794, + "grad_norm": 0.4446581304073334, + "learning_rate": 3.4892644891780586e-05, + "loss": 0.0516, "step": 16080 }, { - "epoch": 1.0526660124304874, - "grad_norm": 1.0304049253463745, - "learning_rate": 9.957654864846924e-05, - "loss": 0.101, + "epoch": 4.209878966306837, + "grad_norm": 0.6510340571403503, + "learning_rate": 3.487366014818923e-05, + "loss": 0.0548, "step": 16090 }, { - "epoch": 1.0533202486097482, - "grad_norm": 0.9455142021179199, - "learning_rate": 9.9575354829346e-05, - "loss": 0.1024, + "epoch": 4.21249591102388, + "grad_norm": 0.6392655372619629, + "learning_rate": 3.485466865604427e-05, + "loss": 0.042, "step": 16100 }, { - "epoch": 1.053974484789009, - "grad_norm": 1.337161660194397, - "learning_rate": 9.95741593369245e-05, - "loss": 0.1067, + "epoch": 4.215112855740922, + "grad_norm": 0.6639431118965149, + "learning_rate": 3.483567042832622e-05, + "loss": 0.0478, "step": 16110 }, { - "epoch": 1.0546287209682697, - "grad_norm": 0.9535005688667297, - "learning_rate": 9.957296217124513e-05, - "loss": 0.1078, + "epoch": 4.217729800457965, + "grad_norm": 0.5047462582588196, + "learning_rate": 3.481666547802017e-05, + "loss": 0.0476, "step": 16120 }, { - "epoch": 1.0552829571475302, - "grad_norm": 0.849690854549408, - "learning_rate": 9.957176333234828e-05, - "loss": 0.1083, + "epoch": 4.220346745175008, + "grad_norm": 0.35533663630485535, + "learning_rate": 3.479765381811583e-05, + "loss": 0.0443, "step": 16130 }, { - "epoch": 1.055937193326791, - "grad_norm": 0.8453637361526489, - "learning_rate": 9.957056282027439e-05, - "loss": 0.105, + "epoch": 4.222963689892051, + "grad_norm": 0.36323872208595276, + "learning_rate": 3.4778635461607486e-05, + "loss": 0.0472, "step": 16140 }, { - "epoch": 1.0565914295060517, - "grad_norm": 0.9851493835449219, - "learning_rate": 9.956936063506402e-05, - "loss": 0.1029, + "epoch": 4.225580634609094, + "grad_norm": 0.44074222445487976, + "learning_rate": 3.4759610421494016e-05, + "loss": 0.0487, "step": 16150 }, { - "epoch": 1.0572456656853124, - "grad_norm": 0.8924856781959534, - "learning_rate": 9.956815677675772e-05, - "loss": 0.1031, + "epoch": 4.228197579326137, + "grad_norm": 1.4504610300064087, + "learning_rate": 3.4740578710778845e-05, + "loss": 0.0494, "step": 16160 }, { - "epoch": 1.0578999018645732, - "grad_norm": 1.1166882514953613, - "learning_rate": 9.956695124539613e-05, - "loss": 0.1058, + "epoch": 4.230814524043179, + "grad_norm": 0.7710520029067993, + "learning_rate": 3.472154034246998e-05, + "loss": 0.0486, "step": 16170 }, { - "epoch": 1.058554138043834, - "grad_norm": 0.7543265223503113, - "learning_rate": 9.956574404101994e-05, - "loss": 0.0973, + "epoch": 4.233431468760222, + "grad_norm": 0.4526599943637848, + "learning_rate": 3.470249532957996e-05, + "loss": 0.0493, "step": 16180 }, { - "epoch": 1.0592083742230944, - "grad_norm": 0.7494108080863953, - "learning_rate": 9.95645351636699e-05, - "loss": 0.104, + "epoch": 4.236048413477265, + "grad_norm": 0.4356165826320648, + "learning_rate": 3.4683443685125864e-05, + "loss": 0.0557, "step": 16190 }, { - "epoch": 1.0598626104023552, - "grad_norm": 0.9787102937698364, - "learning_rate": 9.956332461338683e-05, - "loss": 0.1027, + "epoch": 4.238665358194308, + "grad_norm": 0.8690013885498047, + "learning_rate": 3.466438542212934e-05, + "loss": 0.0577, "step": 16200 }, { - "epoch": 1.060516846581616, - "grad_norm": 0.957390546798706, - "learning_rate": 9.956211239021154e-05, - "loss": 0.1086, + "epoch": 4.241282302911351, + "grad_norm": 0.4734126627445221, + "learning_rate": 3.4645320553616485e-05, + "loss": 0.0509, "step": 16210 }, { - "epoch": 1.0611710827608767, - "grad_norm": 1.1353367567062378, - "learning_rate": 9.9560898494185e-05, - "loss": 0.1035, + "epoch": 4.243899247628394, + "grad_norm": 0.5786865949630737, + "learning_rate": 3.462624909261799e-05, + "loss": 0.0556, "step": 16220 }, { - "epoch": 1.0618253189401374, - "grad_norm": 1.0974490642547607, - "learning_rate": 9.955968292534814e-05, - "loss": 0.1087, + "epoch": 4.246516192345437, + "grad_norm": 0.5717179179191589, + "learning_rate": 3.460717105216901e-05, + "loss": 0.0524, "step": 16230 }, { - "epoch": 1.0624795551193982, - "grad_norm": 1.101291537284851, - "learning_rate": 9.955846568374201e-05, - "loss": 0.1154, + "epoch": 4.249133137062479, + "grad_norm": 0.3806012272834778, + "learning_rate": 3.4588086445309205e-05, + "loss": 0.0433, "step": 16240 }, { - "epoch": 1.0631337912986587, - "grad_norm": 0.8823967576026917, - "learning_rate": 9.955724676940769e-05, - "loss": 0.1043, + "epoch": 4.251750081779522, + "grad_norm": 0.6166380643844604, + "learning_rate": 3.4568995285082735e-05, + "loss": 0.0511, "step": 16250 }, { - "epoch": 1.0637880274779195, - "grad_norm": 0.7924269437789917, - "learning_rate": 9.955602618238633e-05, - "loss": 0.1018, + "epoch": 4.254367026496565, + "grad_norm": 0.4418392479419708, + "learning_rate": 3.454989758453821e-05, + "loss": 0.0541, "step": 16260 }, { - "epoch": 1.0644422636571802, - "grad_norm": 1.0062264204025269, - "learning_rate": 9.955480392271911e-05, - "loss": 0.1101, + "epoch": 4.256983971213608, + "grad_norm": 0.8362012505531311, + "learning_rate": 3.453079335672873e-05, + "loss": 0.0483, "step": 16270 }, { - "epoch": 1.065096499836441, - "grad_norm": 0.8188719749450684, - "learning_rate": 9.95535799904473e-05, - "loss": 0.0963, + "epoch": 4.259600915930651, + "grad_norm": 0.49030205607414246, + "learning_rate": 3.451168261471187e-05, + "loss": 0.0529, "step": 16280 }, { - "epoch": 1.0657507360157017, - "grad_norm": 1.1326643228530884, - "learning_rate": 9.955235438561222e-05, - "loss": 0.1052, + "epoch": 4.262217860647694, + "grad_norm": 0.8152894973754883, + "learning_rate": 3.449256537154962e-05, + "loss": 0.0556, "step": 16290 }, { - "epoch": 1.0664049721949624, - "grad_norm": 0.7720603942871094, - "learning_rate": 9.95511271082552e-05, - "loss": 0.1034, + "epoch": 4.264834805364737, + "grad_norm": 0.5284848213195801, + "learning_rate": 3.4473441640308464e-05, + "loss": 0.0564, "step": 16300 }, { - "epoch": 1.0670592083742232, - "grad_norm": 0.8074955344200134, - "learning_rate": 9.954989815841771e-05, - "loss": 0.1159, + "epoch": 4.267451750081779, + "grad_norm": 0.6003028750419617, + "learning_rate": 3.4454311434059266e-05, + "loss": 0.046, "step": 16310 }, { - "epoch": 1.0677134445534837, - "grad_norm": 0.7520632147789001, - "learning_rate": 9.954866753614118e-05, - "loss": 0.1035, + "epoch": 4.270068694798822, + "grad_norm": 0.5223946571350098, + "learning_rate": 3.443517476587735e-05, + "loss": 0.0446, "step": 16320 }, { - "epoch": 1.0683676807327445, - "grad_norm": 0.8165426254272461, - "learning_rate": 9.95474352414672e-05, - "loss": 0.1079, + "epoch": 4.272685639515865, + "grad_norm": 0.5389282703399658, + "learning_rate": 3.441603164884246e-05, + "loss": 0.0554, "step": 16330 }, { - "epoch": 1.0690219169120052, - "grad_norm": 0.8366930484771729, - "learning_rate": 9.954620127443733e-05, - "loss": 0.1116, + "epoch": 4.275302584232908, + "grad_norm": 0.591784656047821, + "learning_rate": 3.4396882096038717e-05, + "loss": 0.0473, "step": 16340 }, { - "epoch": 1.069676153091266, - "grad_norm": 0.7152531147003174, - "learning_rate": 9.954496563509323e-05, - "loss": 0.0996, + "epoch": 4.277919528949951, + "grad_norm": 0.762988805770874, + "learning_rate": 3.4377726120554675e-05, + "loss": 0.0479, "step": 16350 }, { - "epoch": 1.0703303892705267, - "grad_norm": 0.9950698614120483, - "learning_rate": 9.954372832347661e-05, - "loss": 0.1074, + "epoch": 4.280536473666994, + "grad_norm": 0.5459496378898621, + "learning_rate": 3.4358563735483254e-05, + "loss": 0.0529, "step": 16360 }, { - "epoch": 1.0709846254497875, - "grad_norm": 0.9378706812858582, - "learning_rate": 9.954248933962919e-05, - "loss": 0.1058, + "epoch": 4.283153418384036, + "grad_norm": 0.3209206163883209, + "learning_rate": 3.4339394953921765e-05, + "loss": 0.0447, "step": 16370 }, { - "epoch": 1.0716388616290482, - "grad_norm": 0.8824446201324463, - "learning_rate": 9.954124868359287e-05, - "loss": 0.1022, + "epoch": 4.285770363101079, + "grad_norm": 0.39667803049087524, + "learning_rate": 3.4320219788971884e-05, + "loss": 0.0493, "step": 16380 }, { - "epoch": 1.0722930978083087, - "grad_norm": 0.8328707218170166, - "learning_rate": 9.954000635540946e-05, - "loss": 0.1081, + "epoch": 4.288387307818122, + "grad_norm": 0.3870331048965454, + "learning_rate": 3.430103825373967e-05, + "loss": 0.0482, "step": 16390 }, { - "epoch": 1.0729473339875695, - "grad_norm": 0.9733028411865234, - "learning_rate": 9.953876235512091e-05, - "loss": 0.1012, + "epoch": 4.291004252535165, + "grad_norm": 0.4382264316082001, + "learning_rate": 3.428185036133552e-05, + "loss": 0.0465, "step": 16400 }, { - "epoch": 1.0736015701668302, - "grad_norm": 0.8833745718002319, - "learning_rate": 9.953751668276921e-05, - "loss": 0.1173, + "epoch": 4.293621197252208, + "grad_norm": 0.6332788467407227, + "learning_rate": 3.426265612487416e-05, + "loss": 0.0526, "step": 16410 }, { - "epoch": 1.074255806346091, - "grad_norm": 0.9558156728744507, - "learning_rate": 9.953626933839641e-05, - "loss": 0.1014, + "epoch": 4.296238141969251, + "grad_norm": 0.4075234532356262, + "learning_rate": 3.424345555747468e-05, + "loss": 0.0447, "step": 16420 }, { - "epoch": 1.0749100425253517, - "grad_norm": 0.9810806512832642, - "learning_rate": 9.95350203220446e-05, - "loss": 0.1074, + "epoch": 4.298855086686293, + "grad_norm": 0.451867938041687, + "learning_rate": 3.422424867226049e-05, + "loss": 0.0503, "step": 16430 }, { - "epoch": 1.0755642787046125, - "grad_norm": 0.810119092464447, - "learning_rate": 9.953376963375596e-05, - "loss": 0.1046, + "epoch": 4.301472031403336, + "grad_norm": 0.5655390620231628, + "learning_rate": 3.420503548235931e-05, + "loss": 0.0455, "step": 16440 }, { - "epoch": 1.076218514883873, - "grad_norm": 0.9398297071456909, - "learning_rate": 9.953251727357267e-05, - "loss": 0.1014, + "epoch": 4.304088976120379, + "grad_norm": 0.47900524735450745, + "learning_rate": 3.418581600090318e-05, + "loss": 0.0493, "step": 16450 }, { - "epoch": 1.0768727510631337, - "grad_norm": 0.7809659838676453, - "learning_rate": 9.953126324153701e-05, - "loss": 0.1171, + "epoch": 4.306705920837422, + "grad_norm": 0.46969595551490784, + "learning_rate": 3.416659024102842e-05, + "loss": 0.0459, "step": 16460 }, { - "epoch": 1.0775269872423945, - "grad_norm": 1.0472731590270996, - "learning_rate": 9.953000753769135e-05, - "loss": 0.1062, + "epoch": 4.309322865554465, + "grad_norm": 0.42236611247062683, + "learning_rate": 3.414735821587568e-05, + "loss": 0.0419, "step": 16470 }, { - "epoch": 1.0781812234216552, - "grad_norm": 1.0119749307632446, - "learning_rate": 9.9528750162078e-05, - "loss": 0.0979, + "epoch": 4.311939810271508, + "grad_norm": 0.6245198249816895, + "learning_rate": 3.4128119938589844e-05, + "loss": 0.0541, "step": 16480 }, { - "epoch": 1.078835459600916, - "grad_norm": 0.8965128064155579, - "learning_rate": 9.952749111473946e-05, - "loss": 0.1093, + "epoch": 4.314556754988551, + "grad_norm": 0.46810778975486755, + "learning_rate": 3.410887542232011e-05, + "loss": 0.0515, "step": 16490 }, { - "epoch": 1.0794896957801767, - "grad_norm": 0.9507614970207214, - "learning_rate": 9.95262303957182e-05, - "loss": 0.0978, + "epoch": 4.317173699705593, + "grad_norm": 0.4198428690433502, + "learning_rate": 3.408962468021991e-05, + "loss": 0.0449, "step": 16500 }, { - "epoch": 1.0801439319594373, - "grad_norm": 0.728331446647644, - "learning_rate": 9.952496800505679e-05, - "loss": 0.1016, + "epoch": 4.319790644422636, + "grad_norm": 0.8210209012031555, + "learning_rate": 3.407036772544695e-05, + "loss": 0.0478, "step": 16510 }, { - "epoch": 1.080798168138698, - "grad_norm": 0.896195113658905, - "learning_rate": 9.952370394279781e-05, - "loss": 0.1106, + "epoch": 4.322407589139679, + "grad_norm": 0.4958540201187134, + "learning_rate": 3.405110457116318e-05, + "loss": 0.0493, "step": 16520 }, { - "epoch": 1.0814524043179587, - "grad_norm": 0.9140453338623047, - "learning_rate": 9.952243820898395e-05, - "loss": 0.1173, + "epoch": 4.325024533856722, + "grad_norm": 0.5449163913726807, + "learning_rate": 3.403183523053479e-05, + "loss": 0.0473, "step": 16530 }, { - "epoch": 1.0821066404972195, - "grad_norm": 0.798197329044342, - "learning_rate": 9.95211708036579e-05, - "loss": 0.095, + "epoch": 4.327641478573765, + "grad_norm": 0.7345625758171082, + "learning_rate": 3.4012559716732176e-05, + "loss": 0.0481, "step": 16540 }, { - "epoch": 1.0827608766764802, - "grad_norm": 0.7951470613479614, - "learning_rate": 9.951990172686248e-05, - "loss": 0.1061, + "epoch": 4.330258423290808, + "grad_norm": 0.4123026430606842, + "learning_rate": 3.3993278042929986e-05, + "loss": 0.0459, "step": 16550 }, { - "epoch": 1.083415112855741, - "grad_norm": 0.809076189994812, - "learning_rate": 9.951863097864052e-05, - "loss": 0.0965, + "epoch": 4.3328753680078504, + "grad_norm": 0.5084407925605774, + "learning_rate": 3.397399022230705e-05, + "loss": 0.0447, "step": 16560 }, { - "epoch": 1.0840693490350017, - "grad_norm": 0.8439357876777649, - "learning_rate": 9.951735855903488e-05, - "loss": 0.1074, + "epoch": 4.335492312724893, + "grad_norm": 0.45943090319633484, + "learning_rate": 3.395469626804642e-05, + "loss": 0.0447, "step": 16570 }, { - "epoch": 1.0847235852142623, - "grad_norm": 1.051507830619812, - "learning_rate": 9.951608446808852e-05, - "loss": 0.1089, + "epoch": 4.338109257441936, + "grad_norm": 0.4610060751438141, + "learning_rate": 3.393539619333533e-05, + "loss": 0.0543, "step": 16580 }, { - "epoch": 1.085377821393523, - "grad_norm": 0.9256449937820435, - "learning_rate": 9.951480870584445e-05, - "loss": 0.1052, + "epoch": 4.340726202158979, + "grad_norm": 0.5210998058319092, + "learning_rate": 3.3916090011365195e-05, + "loss": 0.0479, "step": 16590 }, { - "epoch": 1.0860320575727838, - "grad_norm": 0.7667525410652161, - "learning_rate": 9.951353127234574e-05, - "loss": 0.1128, + "epoch": 4.343343146876022, + "grad_norm": 0.4095534384250641, + "learning_rate": 3.389677773533161e-05, + "loss": 0.0548, "step": 16600 }, { - "epoch": 1.0866862937520445, - "grad_norm": 0.821887731552124, - "learning_rate": 9.951225216763549e-05, - "loss": 0.1031, + "epoch": 4.345960091593065, + "grad_norm": 0.5919458270072937, + "learning_rate": 3.387745937843433e-05, + "loss": 0.0484, "step": 16610 }, { - "epoch": 1.0873405299313053, - "grad_norm": 0.8057878017425537, - "learning_rate": 9.951097139175687e-05, - "loss": 0.1135, + "epoch": 4.3485770363101075, + "grad_norm": 0.41136297583580017, + "learning_rate": 3.385813495387728e-05, + "loss": 0.0457, "step": 16620 }, { - "epoch": 1.087994766110566, - "grad_norm": 0.9914917349815369, - "learning_rate": 9.950968894475313e-05, - "loss": 0.0971, + "epoch": 4.3511939810271505, + "grad_norm": 0.5454357266426086, + "learning_rate": 3.383880447486852e-05, + "loss": 0.0517, "step": 16630 }, { - "epoch": 1.0886490022898265, - "grad_norm": 0.8095641136169434, - "learning_rate": 9.950840482666755e-05, - "loss": 0.1118, + "epoch": 4.3538109257441935, + "grad_norm": 0.7451463937759399, + "learning_rate": 3.381946795462024e-05, + "loss": 0.046, "step": 16640 }, { - "epoch": 1.0893032384690873, - "grad_norm": 0.8370475172996521, - "learning_rate": 9.950711903754345e-05, - "loss": 0.1098, + "epoch": 4.3564278704612365, + "grad_norm": 0.5025386214256287, + "learning_rate": 3.380012540634878e-05, + "loss": 0.0459, "step": 16650 }, { - "epoch": 1.089957474648348, - "grad_norm": 0.8619472980499268, - "learning_rate": 9.950583157742426e-05, - "loss": 0.1036, + "epoch": 4.3590448151782795, + "grad_norm": 0.6015411019325256, + "learning_rate": 3.3780776843274575e-05, + "loss": 0.0471, "step": 16660 }, { - "epoch": 1.0906117108276088, - "grad_norm": 0.9595767855644226, - "learning_rate": 9.950454244635341e-05, - "loss": 0.1139, + "epoch": 4.3616617598953225, + "grad_norm": 0.6264476776123047, + "learning_rate": 3.376142227862221e-05, + "loss": 0.0534, "step": 16670 }, { - "epoch": 1.0912659470068695, - "grad_norm": 1.2058511972427368, - "learning_rate": 9.950325164437442e-05, - "loss": 0.1025, + "epoch": 4.3642787046123654, + "grad_norm": 0.44910645484924316, + "learning_rate": 3.3742061725620325e-05, + "loss": 0.0444, "step": 16680 }, { - "epoch": 1.0919201831861303, - "grad_norm": 0.9937551617622375, - "learning_rate": 9.950195917153086e-05, - "loss": 0.1148, + "epoch": 4.3668956493294075, + "grad_norm": 0.5679300427436829, + "learning_rate": 3.372269519750168e-05, + "loss": 0.0553, "step": 16690 }, { - "epoch": 1.0925744193653908, - "grad_norm": 0.8375882506370544, - "learning_rate": 9.950066502786637e-05, - "loss": 0.0937, + "epoch": 4.3695125940464505, + "grad_norm": 0.5327921509742737, + "learning_rate": 3.370332270750313e-05, + "loss": 0.0464, "step": 16700 }, { - "epoch": 1.0932286555446515, - "grad_norm": 0.9841185212135315, - "learning_rate": 9.94993692134246e-05, - "loss": 0.1113, + "epoch": 4.3721295387634935, + "grad_norm": 0.4204285442829132, + "learning_rate": 3.368394426886556e-05, + "loss": 0.0429, "step": 16710 }, { - "epoch": 1.0938828917239123, - "grad_norm": 1.1531639099121094, - "learning_rate": 9.949807172824929e-05, - "loss": 0.1074, + "epoch": 4.3747464834805365, + "grad_norm": 0.5075750946998596, + "learning_rate": 3.366455989483398e-05, + "loss": 0.0438, "step": 16720 }, { - "epoch": 1.094537127903173, - "grad_norm": 0.8746184706687927, - "learning_rate": 9.949677257238428e-05, - "loss": 0.1053, + "epoch": 4.3773634281975795, + "grad_norm": 0.4906587302684784, + "learning_rate": 3.364516959865741e-05, + "loss": 0.0462, "step": 16730 }, { - "epoch": 1.0951913640824338, - "grad_norm": 1.0337003469467163, - "learning_rate": 9.949547174587337e-05, - "loss": 0.1127, + "epoch": 4.3799803729146225, + "grad_norm": 0.531676709651947, + "learning_rate": 3.3625773393588935e-05, + "loss": 0.0462, "step": 16740 }, { - "epoch": 1.0958456002616945, - "grad_norm": 0.9409461617469788, - "learning_rate": 9.949416924876047e-05, - "loss": 0.1085, + "epoch": 4.3825973176316655, + "grad_norm": 0.47353696823120117, + "learning_rate": 3.360637129288569e-05, + "loss": 0.0486, "step": 16750 }, { - "epoch": 1.0964998364409553, - "grad_norm": 0.7191810011863708, - "learning_rate": 9.949286508108957e-05, - "loss": 0.096, + "epoch": 4.385214262348708, + "grad_norm": 0.5241471529006958, + "learning_rate": 3.358696330980881e-05, + "loss": 0.0494, "step": 16760 }, { - "epoch": 1.0971540726202158, - "grad_norm": 0.7731035351753235, - "learning_rate": 9.949155924290466e-05, - "loss": 0.102, + "epoch": 4.387831207065751, + "grad_norm": 0.3669177293777466, + "learning_rate": 3.356754945762348e-05, + "loss": 0.0436, "step": 16770 }, { - "epoch": 1.0978083087994766, - "grad_norm": 0.9170941710472107, - "learning_rate": 9.949025173424984e-05, - "loss": 0.1042, + "epoch": 4.390448151782794, + "grad_norm": 0.41063836216926575, + "learning_rate": 3.354812974959889e-05, + "loss": 0.0492, "step": 16780 }, { - "epoch": 1.0984625449787373, - "grad_norm": 0.8782985210418701, - "learning_rate": 9.948894255516923e-05, - "loss": 0.099, + "epoch": 4.393065096499837, + "grad_norm": 0.4066520631313324, + "learning_rate": 3.352870419900821e-05, + "loss": 0.0438, "step": 16790 }, { - "epoch": 1.099116781157998, - "grad_norm": 0.9650111794471741, - "learning_rate": 9.948763170570702e-05, - "loss": 0.1106, + "epoch": 4.3956820412168796, + "grad_norm": 0.46243414282798767, + "learning_rate": 3.350927281912864e-05, + "loss": 0.0508, "step": 16800 }, { - "epoch": 1.0997710173372588, - "grad_norm": 1.0858937501907349, - "learning_rate": 9.948631918590746e-05, - "loss": 0.1097, + "epoch": 4.3982989859339225, + "grad_norm": 0.42753133177757263, + "learning_rate": 3.348983562324133e-05, + "loss": 0.0474, "step": 16810 }, { - "epoch": 1.1004252535165195, - "grad_norm": 0.8748927712440491, - "learning_rate": 9.948500499581484e-05, - "loss": 0.1124, + "epoch": 4.400915930650965, + "grad_norm": 0.3843385577201843, + "learning_rate": 3.3470392624631425e-05, + "loss": 0.0496, "step": 16820 }, { - "epoch": 1.1010794896957803, - "grad_norm": 0.9151651859283447, - "learning_rate": 9.94836891354735e-05, - "loss": 0.1082, + "epoch": 4.403532875368008, + "grad_norm": 0.453375905752182, + "learning_rate": 3.3450943836588034e-05, + "loss": 0.0501, "step": 16830 }, { - "epoch": 1.1017337258750408, - "grad_norm": 0.9073200225830078, - "learning_rate": 9.948237160492791e-05, - "loss": 0.1106, + "epoch": 4.406149820085051, + "grad_norm": 0.46848195791244507, + "learning_rate": 3.3431489272404213e-05, + "loss": 0.0484, "step": 16840 }, { - "epoch": 1.1023879620543016, - "grad_norm": 0.8182057738304138, - "learning_rate": 9.948105240422247e-05, - "loss": 0.1098, + "epoch": 4.408766764802094, + "grad_norm": 0.40131011605262756, + "learning_rate": 3.341202894537699e-05, + "loss": 0.0477, "step": 16850 }, { - "epoch": 1.1030421982335623, - "grad_norm": 0.7684637904167175, - "learning_rate": 9.947973153340178e-05, - "loss": 0.1042, + "epoch": 4.411383709519137, + "grad_norm": 0.4040091037750244, + "learning_rate": 3.33925628688073e-05, + "loss": 0.045, "step": 16860 }, { - "epoch": 1.103696434412823, - "grad_norm": 0.8614751100540161, - "learning_rate": 9.947840899251036e-05, - "loss": 0.0928, + "epoch": 4.41400065423618, + "grad_norm": 0.42682337760925293, + "learning_rate": 3.337309105600002e-05, + "loss": 0.0456, "step": 16870 }, { - "epoch": 1.1043506705920838, - "grad_norm": 1.0662237405776978, - "learning_rate": 9.947708478159288e-05, - "loss": 0.1048, + "epoch": 4.416617598953222, + "grad_norm": 0.5235479474067688, + "learning_rate": 3.335361352026396e-05, + "loss": 0.0452, "step": 16880 }, { - "epoch": 1.1050049067713446, - "grad_norm": 0.8045850396156311, - "learning_rate": 9.947575890069404e-05, - "loss": 0.1133, + "epoch": 4.419234543670265, + "grad_norm": 0.49944835901260376, + "learning_rate": 3.3334130274911826e-05, + "loss": 0.0501, "step": 16890 }, { - "epoch": 1.105659142950605, - "grad_norm": 0.9639503955841064, - "learning_rate": 9.947443134985857e-05, - "loss": 0.1188, + "epoch": 4.421851488387308, + "grad_norm": 0.5998557806015015, + "learning_rate": 3.331464133326024e-05, + "loss": 0.0439, "step": 16900 }, { - "epoch": 1.1063133791298658, - "grad_norm": 0.8630536794662476, - "learning_rate": 9.94731021291313e-05, - "loss": 0.0985, + "epoch": 4.424468433104351, + "grad_norm": 0.506262481212616, + "learning_rate": 3.329514670862971e-05, + "loss": 0.0496, "step": 16910 }, { - "epoch": 1.1069676153091266, - "grad_norm": 0.8716172575950623, - "learning_rate": 9.947177123855708e-05, - "loss": 0.1138, + "epoch": 4.427085377821394, + "grad_norm": 0.7459492087364197, + "learning_rate": 3.3275646414344614e-05, + "loss": 0.0416, "step": 16920 }, { - "epoch": 1.1076218514883873, - "grad_norm": 0.8983464241027832, - "learning_rate": 9.947043867818084e-05, - "loss": 0.1056, + "epoch": 4.429702322538437, + "grad_norm": 0.4433119297027588, + "learning_rate": 3.325614046373323e-05, + "loss": 0.0473, "step": 16930 }, { - "epoch": 1.108276087667648, - "grad_norm": 0.889821469783783, - "learning_rate": 9.946910444804755e-05, - "loss": 0.1008, + "epoch": 4.43231926725548, + "grad_norm": 0.3905789256095886, + "learning_rate": 3.3236628870127696e-05, + "loss": 0.0398, "step": 16940 }, { - "epoch": 1.1089303238469088, - "grad_norm": 0.8643634915351868, - "learning_rate": 9.946776854820224e-05, - "loss": 0.1147, + "epoch": 4.434936211972522, + "grad_norm": 0.48078566789627075, + "learning_rate": 3.321711164686399e-05, + "loss": 0.0433, "step": 16950 }, { - "epoch": 1.1095845600261693, - "grad_norm": 0.8867340683937073, - "learning_rate": 9.946643097869002e-05, - "loss": 0.096, + "epoch": 4.437553156689565, + "grad_norm": 0.4129769802093506, + "learning_rate": 3.319758880728196e-05, + "loss": 0.0494, "step": 16960 }, { - "epoch": 1.11023879620543, - "grad_norm": 0.859140932559967, - "learning_rate": 9.946509173955603e-05, - "loss": 0.1037, + "epoch": 4.440170101406608, + "grad_norm": 0.6216508746147156, + "learning_rate": 3.317806036472527e-05, + "loss": 0.0431, "step": 16970 }, { - "epoch": 1.1108930323846908, - "grad_norm": 1.0756253004074097, - "learning_rate": 9.946375083084545e-05, - "loss": 0.11, + "epoch": 4.442787046123651, + "grad_norm": 0.5008850693702698, + "learning_rate": 3.3158526332541444e-05, + "loss": 0.0447, "step": 16980 }, { - "epoch": 1.1115472685639516, - "grad_norm": 0.7562744617462158, - "learning_rate": 9.946240825260356e-05, - "loss": 0.1174, + "epoch": 4.445403990840694, + "grad_norm": 0.30528682470321655, + "learning_rate": 3.31389867240818e-05, + "loss": 0.0482, "step": 16990 }, { - "epoch": 1.1122015047432123, - "grad_norm": 0.8000214099884033, - "learning_rate": 9.946106400487567e-05, - "loss": 0.1084, + "epoch": 4.448020935557737, + "grad_norm": 0.5385652184486389, + "learning_rate": 3.311944155270147e-05, + "loss": 0.0476, + "step": 17000 + }, + { + "epoch": 4.448020935557737, + "eval_loss": 0.05004162187291969, + "eval_runtime": 9.0744, + "eval_samples_per_second": 112.844, + "eval_steps_per_second": 1.763, "step": 17000 }, { - "epoch": 1.112855740922473, - "grad_norm": 0.9482758641242981, - "learning_rate": 9.945971808770716e-05, - "loss": 0.1032, + "epoch": 4.450637880274779, + "grad_norm": 0.5225152969360352, + "learning_rate": 3.309989083175941e-05, + "loss": 0.0464, "step": 17010 }, { - "epoch": 1.1135099771017338, - "grad_norm": 0.8377864360809326, - "learning_rate": 9.945837050114345e-05, - "loss": 0.0981, + "epoch": 4.453254824991822, + "grad_norm": 0.41770097613334656, + "learning_rate": 3.308033457461833e-05, + "loss": 0.0511, "step": 17020 }, { - "epoch": 1.1141642132809944, - "grad_norm": 0.9127662181854248, - "learning_rate": 9.945702124523002e-05, - "loss": 0.1082, + "epoch": 4.455871769708865, + "grad_norm": 0.4127200245857239, + "learning_rate": 3.3060772794644776e-05, + "loss": 0.0447, "step": 17030 }, { - "epoch": 1.114818449460255, - "grad_norm": 1.0239959955215454, - "learning_rate": 9.945567032001243e-05, - "loss": 0.1188, + "epoch": 4.458488714425908, + "grad_norm": 0.32362911105155945, + "learning_rate": 3.304120550520902e-05, + "loss": 0.0485, "step": 17040 }, { - "epoch": 1.1154726856395158, - "grad_norm": 1.2458797693252563, - "learning_rate": 9.945431772553626e-05, - "loss": 0.122, + "epoch": 4.461105659142951, + "grad_norm": 0.42366665601730347, + "learning_rate": 3.3021632719685125e-05, + "loss": 0.0416, "step": 17050 }, { - "epoch": 1.1161269218187766, - "grad_norm": 1.0552146434783936, - "learning_rate": 9.945296346184716e-05, - "loss": 0.1069, + "epoch": 4.463722603859994, + "grad_norm": 0.5108731389045715, + "learning_rate": 3.30020544514509e-05, + "loss": 0.0502, "step": 17060 }, { - "epoch": 1.1167811579980373, - "grad_norm": 0.7637792825698853, - "learning_rate": 9.945160752899085e-05, - "loss": 0.0978, + "epoch": 4.466339548577036, + "grad_norm": 0.26923635601997375, + "learning_rate": 3.2982470713887916e-05, + "loss": 0.0397, "step": 17070 }, { - "epoch": 1.117435394177298, - "grad_norm": 1.0093464851379395, - "learning_rate": 9.94502499270131e-05, - "loss": 0.121, + "epoch": 4.468956493294079, + "grad_norm": 0.36798936128616333, + "learning_rate": 3.296288152038147e-05, + "loss": 0.0506, "step": 17080 }, { - "epoch": 1.1180896303565586, - "grad_norm": 0.7700073719024658, - "learning_rate": 9.944889065595972e-05, - "loss": 0.1018, + "epoch": 4.471573438011122, + "grad_norm": 0.5822156667709351, + "learning_rate": 3.294328688432059e-05, + "loss": 0.0494, "step": 17090 }, { - "epoch": 1.1187438665358194, - "grad_norm": 0.8178618550300598, - "learning_rate": 9.94475297158766e-05, - "loss": 0.096, + "epoch": 4.474190382728165, + "grad_norm": 0.5101978778839111, + "learning_rate": 3.2923686819098024e-05, + "loss": 0.0453, "step": 17100 }, { - "epoch": 1.1193981027150801, - "grad_norm": 0.8623060584068298, - "learning_rate": 9.944616710680967e-05, - "loss": 0.1043, + "epoch": 4.476807327445208, + "grad_norm": 0.6352287530899048, + "learning_rate": 3.290408133811024e-05, + "loss": 0.0554, "step": 17110 }, { - "epoch": 1.1200523388943409, - "grad_norm": 0.7678028345108032, - "learning_rate": 9.944480282880493e-05, - "loss": 0.1027, + "epoch": 4.479424272162251, + "grad_norm": 0.37760841846466064, + "learning_rate": 3.288447045475739e-05, + "loss": 0.0424, "step": 17120 }, { - "epoch": 1.1207065750736016, - "grad_norm": 0.92671138048172, - "learning_rate": 9.944343688190842e-05, - "loss": 0.0993, + "epoch": 4.482041216879294, + "grad_norm": 0.5223073363304138, + "learning_rate": 3.2864854182443326e-05, + "loss": 0.0471, "step": 17130 }, { - "epoch": 1.1213608112528624, - "grad_norm": 0.9388012290000916, - "learning_rate": 9.944206926616624e-05, - "loss": 0.1094, + "epoch": 4.484658161596336, + "grad_norm": 0.650477945804596, + "learning_rate": 3.2845232534575594e-05, + "loss": 0.0515, "step": 17140 }, { - "epoch": 1.1220150474321229, - "grad_norm": 0.7402985095977783, - "learning_rate": 9.944069998162455e-05, - "loss": 0.1004, + "epoch": 4.487275106313379, + "grad_norm": 0.6777361035346985, + "learning_rate": 3.28256055245654e-05, + "loss": 0.0453, "step": 17150 }, { - "epoch": 1.1226692836113836, - "grad_norm": 0.8221508860588074, - "learning_rate": 9.943932902832959e-05, - "loss": 0.1123, + "epoch": 4.489892051030422, + "grad_norm": 0.693477213382721, + "learning_rate": 3.2805973165827614e-05, + "loss": 0.0445, "step": 17160 }, { - "epoch": 1.1233235197906444, - "grad_norm": 0.8503665924072266, - "learning_rate": 9.94379564063276e-05, - "loss": 0.1153, + "epoch": 4.492508995747465, + "grad_norm": 0.5595983266830444, + "learning_rate": 3.2786335471780774e-05, + "loss": 0.0462, "step": 17170 }, { - "epoch": 1.1239777559699051, - "grad_norm": 0.7901976704597473, - "learning_rate": 9.943658211566493e-05, - "loss": 0.0984, + "epoch": 4.495125940464508, + "grad_norm": 0.46131110191345215, + "learning_rate": 3.276669245584707e-05, + "loss": 0.0496, "step": 17180 }, { - "epoch": 1.1246319921491659, - "grad_norm": 0.8279184103012085, - "learning_rate": 9.943520615638797e-05, - "loss": 0.116, + "epoch": 4.497742885181551, + "grad_norm": 0.5225169658660889, + "learning_rate": 3.27470441314523e-05, + "loss": 0.045, "step": 17190 }, { - "epoch": 1.1252862283284266, - "grad_norm": 0.9072709679603577, - "learning_rate": 9.943382852854313e-05, - "loss": 0.1076, + "epoch": 4.500359829898594, + "grad_norm": 0.5240589380264282, + "learning_rate": 3.272739051202592e-05, + "loss": 0.0467, "step": 17200 }, { - "epoch": 1.1259404645076874, - "grad_norm": 0.9476553797721863, - "learning_rate": 9.943244923217695e-05, - "loss": 0.102, + "epoch": 4.502976774615636, + "grad_norm": 0.6839200258255005, + "learning_rate": 3.270773161100099e-05, + "loss": 0.0506, "step": 17210 }, { - "epoch": 1.126594700686948, - "grad_norm": 0.8165683746337891, - "learning_rate": 9.943106826733597e-05, - "loss": 0.1099, + "epoch": 4.505593719332679, + "grad_norm": 0.3838503658771515, + "learning_rate": 3.268806744181419e-05, + "loss": 0.0483, "step": 17220 }, { - "epoch": 1.1272489368662086, - "grad_norm": 0.9307758212089539, - "learning_rate": 9.942968563406679e-05, - "loss": 0.0967, + "epoch": 4.508210664049722, + "grad_norm": 0.45105141401290894, + "learning_rate": 3.266839801790578e-05, + "loss": 0.0546, "step": 17230 }, { - "epoch": 1.1279031730454694, - "grad_norm": 0.8825275897979736, - "learning_rate": 9.942830133241609e-05, - "loss": 0.1106, + "epoch": 4.510827608766765, + "grad_norm": 0.34984609484672546, + "learning_rate": 3.264872335271963e-05, + "loss": 0.0441, "step": 17240 }, { - "epoch": 1.1285574092247301, - "grad_norm": 0.6875970959663391, - "learning_rate": 9.942691536243058e-05, - "loss": 0.1083, + "epoch": 4.513444553483808, + "grad_norm": 0.35363954305648804, + "learning_rate": 3.26290434597032e-05, + "loss": 0.0513, "step": 17250 }, { - "epoch": 1.1292116454039909, - "grad_norm": 0.8216149806976318, - "learning_rate": 9.942552772415706e-05, - "loss": 0.1088, + "epoch": 4.51606149820085, + "grad_norm": 0.5348709225654602, + "learning_rate": 3.2609358352307496e-05, + "loss": 0.0456, "step": 17260 }, { - "epoch": 1.1298658815832516, - "grad_norm": 0.7268308997154236, - "learning_rate": 9.942413841764235e-05, - "loss": 0.0968, + "epoch": 4.518678442917893, + "grad_norm": 0.3599573075771332, + "learning_rate": 3.258966804398711e-05, + "loss": 0.0445, "step": 17270 }, { - "epoch": 1.1305201177625124, - "grad_norm": 0.8889786601066589, - "learning_rate": 9.942274744293336e-05, - "loss": 0.0991, + "epoch": 4.521295387634936, + "grad_norm": 0.5039171576499939, + "learning_rate": 3.256997254820019e-05, + "loss": 0.0458, "step": 17280 }, { - "epoch": 1.131174353941773, - "grad_norm": 0.8577876091003418, - "learning_rate": 9.942135480007701e-05, - "loss": 0.1016, + "epoch": 4.523912332351979, + "grad_norm": 0.45528119802474976, + "learning_rate": 3.255027187840841e-05, + "loss": 0.0458, "step": 17290 }, { - "epoch": 1.1318285901210337, - "grad_norm": 0.9688170552253723, - "learning_rate": 9.941996048912035e-05, - "loss": 0.106, + "epoch": 4.526529277069022, + "grad_norm": 0.6549640893936157, + "learning_rate": 3.253056604807699e-05, + "loss": 0.051, "step": 17300 }, { - "epoch": 1.1324828263002944, - "grad_norm": 0.9323129653930664, - "learning_rate": 9.941856451011039e-05, - "loss": 0.1081, + "epoch": 4.529146221786065, + "grad_norm": 0.5074205994606018, + "learning_rate": 3.251085507067469e-05, + "loss": 0.0428, "step": 17310 }, { - "epoch": 1.1331370624795551, - "grad_norm": 0.8865740895271301, - "learning_rate": 9.941716686309428e-05, - "loss": 0.1051, + "epoch": 4.531763166503108, + "grad_norm": 0.3896365761756897, + "learning_rate": 3.2491138959673776e-05, + "loss": 0.049, "step": 17320 }, { - "epoch": 1.133791298658816, - "grad_norm": 0.912921667098999, - "learning_rate": 9.941576754811919e-05, - "loss": 0.1075, + "epoch": 4.53438011122015, + "grad_norm": 0.5313991904258728, + "learning_rate": 3.2471417728550015e-05, + "loss": 0.0463, "step": 17330 }, { - "epoch": 1.1344455348380766, - "grad_norm": 0.7206995487213135, - "learning_rate": 9.941436656523236e-05, - "loss": 0.107, + "epoch": 4.536997055937193, + "grad_norm": 0.2488325983285904, + "learning_rate": 3.245169139078269e-05, + "loss": 0.0416, "step": 17340 }, { - "epoch": 1.1350997710173372, - "grad_norm": 0.8259555697441101, - "learning_rate": 9.941296391448103e-05, - "loss": 0.1092, + "epoch": 4.539614000654236, + "grad_norm": 0.5214577913284302, + "learning_rate": 3.243195995985456e-05, + "loss": 0.0453, "step": 17350 }, { - "epoch": 1.135754007196598, - "grad_norm": 0.7500777244567871, - "learning_rate": 9.94115595959126e-05, - "loss": 0.1002, + "epoch": 4.542230945371279, + "grad_norm": 0.49365049600601196, + "learning_rate": 3.2412223449251887e-05, + "loss": 0.0429, "step": 17360 }, { - "epoch": 1.1364082433758587, - "grad_norm": 1.0846604108810425, - "learning_rate": 9.941015360957445e-05, - "loss": 0.1082, + "epoch": 4.544847890088322, + "grad_norm": 0.5702369809150696, + "learning_rate": 3.239248187246437e-05, + "loss": 0.0493, "step": 17370 }, { - "epoch": 1.1370624795551194, - "grad_norm": 0.8614764213562012, - "learning_rate": 9.940874595551404e-05, - "loss": 0.1099, + "epoch": 4.547464834805365, + "grad_norm": 0.5522635579109192, + "learning_rate": 3.237273524298521e-05, + "loss": 0.0446, "step": 17380 }, { - "epoch": 1.1377167157343802, - "grad_norm": 1.0448957681655884, - "learning_rate": 9.940733663377885e-05, - "loss": 0.1172, + "epoch": 4.550081779522408, + "grad_norm": 0.4819454550743103, + "learning_rate": 3.2352983574311025e-05, + "loss": 0.047, "step": 17390 }, { - "epoch": 1.138370951913641, - "grad_norm": 0.8377976417541504, - "learning_rate": 9.940592564441649e-05, - "loss": 0.1061, + "epoch": 4.55269872423945, + "grad_norm": 0.38399413228034973, + "learning_rate": 3.23332268799419e-05, + "loss": 0.0512, "step": 17400 }, { - "epoch": 1.1390251880929014, - "grad_norm": 1.1926016807556152, - "learning_rate": 9.940451298747456e-05, - "loss": 0.1077, + "epoch": 4.555315668956493, + "grad_norm": 0.5839704871177673, + "learning_rate": 3.2313465173381355e-05, + "loss": 0.0471, "step": 17410 }, { - "epoch": 1.1396794242721622, - "grad_norm": 0.9103050231933594, - "learning_rate": 9.940309866300075e-05, - "loss": 0.1148, + "epoch": 4.557932613673536, + "grad_norm": 0.5217947959899902, + "learning_rate": 3.2293698468136326e-05, + "loss": 0.0443, "step": 17420 }, { - "epoch": 1.140333660451423, - "grad_norm": 0.8122014999389648, - "learning_rate": 9.940168267104279e-05, - "loss": 0.0984, + "epoch": 4.560549558390579, + "grad_norm": 0.4841725528240204, + "learning_rate": 3.227392677771716e-05, + "loss": 0.0478, "step": 17430 }, { - "epoch": 1.1409878966306837, - "grad_norm": 0.8694342374801636, - "learning_rate": 9.94002650116485e-05, - "loss": 0.0997, + "epoch": 4.563166503107622, + "grad_norm": 0.5399268865585327, + "learning_rate": 3.225415011563764e-05, + "loss": 0.0392, "step": 17440 }, { - "epoch": 1.1416421328099444, - "grad_norm": 0.8882526159286499, - "learning_rate": 9.939884568486571e-05, - "loss": 0.1021, + "epoch": 4.565783447824665, + "grad_norm": 0.3765436112880707, + "learning_rate": 3.223436849541491e-05, + "loss": 0.0431, "step": 17450 }, { - "epoch": 1.1422963689892052, - "grad_norm": 0.85523521900177, - "learning_rate": 9.939742469074229e-05, - "loss": 0.12, + "epoch": 4.568400392541708, + "grad_norm": 0.4995643198490143, + "learning_rate": 3.221458193056955e-05, + "loss": 0.0477, "step": 17460 }, { - "epoch": 1.142950605168466, - "grad_norm": 0.9386106729507446, - "learning_rate": 9.939600202932626e-05, - "loss": 0.11, + "epoch": 4.57101733725875, + "grad_norm": 0.37771889567375183, + "learning_rate": 3.219479043462545e-05, + "loss": 0.0504, "step": 17470 }, { - "epoch": 1.1436048413477264, - "grad_norm": 1.0228506326675415, - "learning_rate": 9.939457770066563e-05, - "loss": 0.0956, + "epoch": 4.573634281975793, + "grad_norm": 0.4961005747318268, + "learning_rate": 3.217499402110993e-05, + "loss": 0.0456, "step": 17480 }, { - "epoch": 1.1442590775269872, - "grad_norm": 0.8513170480728149, - "learning_rate": 9.939315170480843e-05, - "loss": 0.1032, + "epoch": 4.576251226692836, + "grad_norm": 0.6124542951583862, + "learning_rate": 3.215519270355366e-05, + "loss": 0.0458, "step": 17490 }, { - "epoch": 1.144913313706248, - "grad_norm": 0.8414477109909058, - "learning_rate": 9.939172404180284e-05, - "loss": 0.1, + "epoch": 4.578868171409879, + "grad_norm": 0.41645899415016174, + "learning_rate": 3.2135386495490644e-05, + "loss": 0.0417, "step": 17500 }, { - "epoch": 1.1455675498855087, - "grad_norm": 0.9836424589157104, - "learning_rate": 9.939029471169703e-05, - "loss": 0.0972, + "epoch": 4.581485116126922, + "grad_norm": 0.39719998836517334, + "learning_rate": 3.2115575410458254e-05, + "loss": 0.0423, "step": 17510 }, { - "epoch": 1.1462217860647694, - "grad_norm": 0.6796431541442871, - "learning_rate": 9.938886371453924e-05, - "loss": 0.103, + "epoch": 4.584102060843964, + "grad_norm": 0.39007094502449036, + "learning_rate": 3.2095759461997146e-05, + "loss": 0.0511, "step": 17520 }, { - "epoch": 1.1468760222440302, - "grad_norm": 0.7944294810295105, - "learning_rate": 9.938743105037777e-05, - "loss": 0.1015, + "epoch": 4.586719005561007, + "grad_norm": 0.6247320771217346, + "learning_rate": 3.2075938663651364e-05, + "loss": 0.0527, "step": 17530 }, { - "epoch": 1.147530258423291, - "grad_norm": 0.9507654309272766, - "learning_rate": 9.938599671926097e-05, - "loss": 0.1041, + "epoch": 4.58933595027805, + "grad_norm": 0.5982612371444702, + "learning_rate": 3.2056113028968224e-05, + "loss": 0.0484, "step": 17540 }, { - "epoch": 1.1481844946025515, - "grad_norm": 0.8638071417808533, - "learning_rate": 9.938456072123727e-05, - "loss": 0.1065, + "epoch": 4.591952894995093, + "grad_norm": 0.44412627816200256, + "learning_rate": 3.203628257149837e-05, + "loss": 0.0464, "step": 17550 }, { - "epoch": 1.1488387307818122, - "grad_norm": 0.8793585896492004, - "learning_rate": 9.938312305635514e-05, - "loss": 0.0983, + "epoch": 4.594569839712136, + "grad_norm": 0.6911810040473938, + "learning_rate": 3.2016447304795735e-05, + "loss": 0.0468, "step": 17560 }, { - "epoch": 1.149492966961073, - "grad_norm": 0.8482651114463806, - "learning_rate": 9.938168372466307e-05, - "loss": 0.1042, + "epoch": 4.597186784429179, + "grad_norm": 0.5549867153167725, + "learning_rate": 3.1996607242417506e-05, + "loss": 0.0519, "step": 17570 }, { - "epoch": 1.1501472031403337, - "grad_norm": 0.889570415019989, - "learning_rate": 9.93802427262097e-05, - "loss": 0.1028, + "epoch": 4.599803729146222, + "grad_norm": 0.7803630828857422, + "learning_rate": 3.197676239792422e-05, + "loss": 0.0503, "step": 17580 }, { - "epoch": 1.1508014393195944, - "grad_norm": 1.002482295036316, - "learning_rate": 9.93788000610436e-05, - "loss": 0.1081, + "epoch": 4.602420673863264, + "grad_norm": 0.4282356798648834, + "learning_rate": 3.195691278487961e-05, + "loss": 0.0524, "step": 17590 }, { - "epoch": 1.151455675498855, - "grad_norm": 0.8683956265449524, - "learning_rate": 9.937735572921352e-05, - "loss": 0.1026, + "epoch": 4.605037618580307, + "grad_norm": 0.5289554595947266, + "learning_rate": 3.193705841685072e-05, + "loss": 0.0461, "step": 17600 }, { - "epoch": 1.1521099116781157, - "grad_norm": 0.8563847541809082, - "learning_rate": 9.937590973076818e-05, - "loss": 0.1024, + "epoch": 4.60765456329735, + "grad_norm": 0.7792943120002747, + "learning_rate": 3.191719930740781e-05, + "loss": 0.0498, "step": 17610 }, { - "epoch": 1.1527641478573765, - "grad_norm": 1.01363205909729, - "learning_rate": 9.937446206575639e-05, - "loss": 0.1075, + "epoch": 4.610271508014393, + "grad_norm": 0.31162726879119873, + "learning_rate": 3.189733547012439e-05, + "loss": 0.0392, "step": 17620 }, { - "epoch": 1.1534183840366372, - "grad_norm": 0.998652458190918, - "learning_rate": 9.937301273422703e-05, - "loss": 0.1069, + "epoch": 4.612888452731436, + "grad_norm": 0.4618692696094513, + "learning_rate": 3.187746691857723e-05, + "loss": 0.0518, "step": 17630 }, { - "epoch": 1.154072620215898, - "grad_norm": 0.9310935139656067, - "learning_rate": 9.937156173622899e-05, - "loss": 0.1146, + "epoch": 4.615505397448479, + "grad_norm": 0.7038789987564087, + "learning_rate": 3.185759366634627e-05, + "loss": 0.051, "step": 17640 }, { - "epoch": 1.1547268563951587, - "grad_norm": 0.9854075312614441, - "learning_rate": 9.937010907181125e-05, - "loss": 0.1147, + "epoch": 4.618122342165522, + "grad_norm": 0.6297800540924072, + "learning_rate": 3.183771572701471e-05, + "loss": 0.0509, "step": 17650 }, { - "epoch": 1.1553810925744195, - "grad_norm": 1.0847511291503906, - "learning_rate": 9.936865474102289e-05, - "loss": 0.1066, + "epoch": 4.620739286882564, + "grad_norm": 0.5747802257537842, + "learning_rate": 3.1817833114168924e-05, + "loss": 0.0535, "step": 17660 }, { - "epoch": 1.15603532875368, - "grad_norm": 1.0146312713623047, - "learning_rate": 9.936719874391291e-05, - "loss": 0.1077, + "epoch": 4.623356231599607, + "grad_norm": 0.6170051097869873, + "learning_rate": 3.179794584139849e-05, + "loss": 0.0402, "step": 17670 }, { - "epoch": 1.1566895649329407, - "grad_norm": 1.1995619535446167, - "learning_rate": 9.936574108053054e-05, - "loss": 0.1099, + "epoch": 4.62597317631665, + "grad_norm": 0.5372450351715088, + "learning_rate": 3.177805392229617e-05, + "loss": 0.0447, "step": 17680 }, { - "epoch": 1.1573438011122015, - "grad_norm": 0.8147319555282593, - "learning_rate": 9.936428175092491e-05, - "loss": 0.1107, + "epoch": 4.628590121033693, + "grad_norm": 0.5293178558349609, + "learning_rate": 3.175815737045792e-05, + "loss": 0.0533, "step": 17690 }, { - "epoch": 1.1579980372914622, - "grad_norm": 1.276146650314331, - "learning_rate": 9.936282075514534e-05, - "loss": 0.1206, + "epoch": 4.631207065750736, + "grad_norm": 0.6131226420402527, + "learning_rate": 3.173825619948283e-05, + "loss": 0.0467, "step": 17700 }, { - "epoch": 1.158652273470723, - "grad_norm": 0.8899917602539062, - "learning_rate": 9.93613580932411e-05, - "loss": 0.1001, + "epoch": 4.633824010467779, + "grad_norm": 0.37538790702819824, + "learning_rate": 3.171835042297317e-05, + "loss": 0.0432, "step": 17710 }, { - "epoch": 1.1593065096499837, - "grad_norm": 1.1056793928146362, - "learning_rate": 9.935989376526156e-05, - "loss": 0.1106, + "epoch": 4.636440955184821, + "grad_norm": 0.3324880301952362, + "learning_rate": 3.169844005453433e-05, + "loss": 0.0433, "step": 17720 }, { - "epoch": 1.1599607458292445, - "grad_norm": 0.9014807343482971, - "learning_rate": 9.935842777125615e-05, - "loss": 0.1002, + "epoch": 4.639057899901864, + "grad_norm": 0.592828094959259, + "learning_rate": 3.167852510777487e-05, + "loss": 0.0471, "step": 17730 }, { - "epoch": 1.160614982008505, - "grad_norm": 0.9449447393417358, - "learning_rate": 9.935696011127438e-05, - "loss": 0.1039, + "epoch": 4.641674844618907, + "grad_norm": 0.4177514612674713, + "learning_rate": 3.16586055963065e-05, + "loss": 0.0417, "step": 17740 }, { - "epoch": 1.1612692181877657, - "grad_norm": 0.8191094994544983, - "learning_rate": 9.935549078536574e-05, - "loss": 0.1099, + "epoch": 4.64429178933595, + "grad_norm": 0.3623928129673004, + "learning_rate": 3.1638681533743975e-05, + "loss": 0.0436, "step": 17750 }, { - "epoch": 1.1619234543670265, - "grad_norm": 0.7725281715393066, - "learning_rate": 9.935401979357986e-05, - "loss": 0.096, + "epoch": 4.646908734052993, + "grad_norm": 0.346414178609848, + "learning_rate": 3.161875293370523e-05, + "loss": 0.0426, "step": 17760 }, { - "epoch": 1.1625776905462872, - "grad_norm": 0.8660596609115601, - "learning_rate": 9.935254713596637e-05, - "loss": 0.1046, + "epoch": 4.649525678770036, + "grad_norm": 0.3698805272579193, + "learning_rate": 3.159881980981126e-05, + "loss": 0.0412, "step": 17770 }, { - "epoch": 1.163231926725548, - "grad_norm": 1.0237655639648438, - "learning_rate": 9.935107281257498e-05, - "loss": 0.1121, + "epoch": 4.652142623487078, + "grad_norm": 0.4758628010749817, + "learning_rate": 3.157888217568617e-05, + "loss": 0.046, "step": 17780 }, { - "epoch": 1.1638861629048087, - "grad_norm": 1.306152582168579, - "learning_rate": 9.934959682345546e-05, - "loss": 0.1007, + "epoch": 4.654759568204121, + "grad_norm": 0.4483131170272827, + "learning_rate": 3.155894004495716e-05, + "loss": 0.0474, "step": 17790 }, { - "epoch": 1.1645403990840693, - "grad_norm": 0.8070068359375, - "learning_rate": 9.934811916865763e-05, - "loss": 0.1075, + "epoch": 4.657376512921164, + "grad_norm": 0.5199103951454163, + "learning_rate": 3.153899343125446e-05, + "loss": 0.0447, "step": 17800 }, { - "epoch": 1.16519463526333, - "grad_norm": 0.9843347668647766, - "learning_rate": 9.934663984823133e-05, - "loss": 0.1066, + "epoch": 4.659993457638207, + "grad_norm": 0.47452351450920105, + "learning_rate": 3.151904234821142e-05, + "loss": 0.0476, "step": 17810 }, { - "epoch": 1.1658488714425908, - "grad_norm": 0.8091020584106445, - "learning_rate": 9.934515886222655e-05, - "loss": 0.0987, + "epoch": 4.66261040235525, + "grad_norm": 0.36638322472572327, + "learning_rate": 3.14990868094644e-05, + "loss": 0.0444, "step": 17820 }, { - "epoch": 1.1665031076218515, - "grad_norm": 0.9831781983375549, - "learning_rate": 9.934367621069322e-05, - "loss": 0.1053, + "epoch": 4.665227347072293, + "grad_norm": 0.43507784605026245, + "learning_rate": 3.147912682865283e-05, + "loss": 0.0512, "step": 17830 }, { - "epoch": 1.1671573438011122, - "grad_norm": 0.9934841394424438, - "learning_rate": 9.934219189368143e-05, - "loss": 0.1143, + "epoch": 4.667844291789336, + "grad_norm": 0.4801563322544098, + "learning_rate": 3.145916241941917e-05, + "loss": 0.0475, "step": 17840 }, { - "epoch": 1.167811579980373, - "grad_norm": 0.9186519980430603, - "learning_rate": 9.934070591124127e-05, - "loss": 0.108, + "epoch": 4.6704612365063785, + "grad_norm": 0.5879807472229004, + "learning_rate": 3.14391935954089e-05, + "loss": 0.0471, "step": 17850 }, { - "epoch": 1.1684658161596335, - "grad_norm": 0.9979751706123352, - "learning_rate": 9.933921826342286e-05, - "loss": 0.1146, + "epoch": 4.6730781812234214, + "grad_norm": 0.49333956837654114, + "learning_rate": 3.141922037027053e-05, + "loss": 0.0471, "step": 17860 }, { - "epoch": 1.1691200523388943, - "grad_norm": 0.8182001709938049, - "learning_rate": 9.933772895027644e-05, - "loss": 0.0996, + "epoch": 4.675695125940464, + "grad_norm": 0.42784637212753296, + "learning_rate": 3.139924275765556e-05, + "loss": 0.0495, "step": 17870 }, { - "epoch": 1.169774288518155, - "grad_norm": 0.9773245453834534, - "learning_rate": 9.933623797185228e-05, - "loss": 0.1056, + "epoch": 4.678312070657507, + "grad_norm": 0.428956538438797, + "learning_rate": 3.137926077121851e-05, + "loss": 0.0391, "step": 17880 }, { - "epoch": 1.1704285246974158, - "grad_norm": 0.8775014281272888, - "learning_rate": 9.933474532820071e-05, - "loss": 0.1121, + "epoch": 4.68092901537455, + "grad_norm": 0.46614134311676025, + "learning_rate": 3.135927442461688e-05, + "loss": 0.0468, "step": 17890 }, { - "epoch": 1.1710827608766765, - "grad_norm": 0.8223180174827576, - "learning_rate": 9.933325101937207e-05, - "loss": 0.0976, + "epoch": 4.683545960091593, + "grad_norm": 0.5212433934211731, + "learning_rate": 3.133928373151114e-05, + "loss": 0.0401, "step": 17900 }, { - "epoch": 1.1717369970559373, - "grad_norm": 0.8393526673316956, - "learning_rate": 9.933175504541686e-05, - "loss": 0.1034, + "epoch": 4.686162904808636, + "grad_norm": 0.43236270546913147, + "learning_rate": 3.131928870556474e-05, + "loss": 0.0479, "step": 17910 }, { - "epoch": 1.172391233235198, - "grad_norm": 0.8224334716796875, - "learning_rate": 9.933025740638551e-05, - "loss": 0.1118, + "epoch": 4.6887798495256785, + "grad_norm": 0.41202881932258606, + "learning_rate": 3.129928936044411e-05, + "loss": 0.0494, "step": 17920 }, { - "epoch": 1.1730454694144585, - "grad_norm": 0.7815266847610474, - "learning_rate": 9.932875810232863e-05, - "loss": 0.1002, + "epoch": 4.6913967942427215, + "grad_norm": 0.45419037342071533, + "learning_rate": 3.127928570981859e-05, + "loss": 0.0455, "step": 17930 }, { - "epoch": 1.1736997055937193, - "grad_norm": 0.9696399569511414, - "learning_rate": 9.932725713329678e-05, - "loss": 0.1071, + "epoch": 4.6940137389597645, + "grad_norm": 0.48706430196762085, + "learning_rate": 3.1259277767360504e-05, + "loss": 0.0427, "step": 17940 }, { - "epoch": 1.17435394177298, - "grad_norm": 0.9797826409339905, - "learning_rate": 9.932575449934062e-05, - "loss": 0.1058, + "epoch": 4.6966306836768075, + "grad_norm": 0.40641650557518005, + "learning_rate": 3.123926554674508e-05, + "loss": 0.0426, "step": 17950 }, { - "epoch": 1.1750081779522408, - "grad_norm": 0.90691077709198, - "learning_rate": 9.93242502005109e-05, - "loss": 0.1041, + "epoch": 4.6992476283938505, + "grad_norm": 0.5983295440673828, + "learning_rate": 3.121924906165049e-05, + "loss": 0.0453, "step": 17960 }, { - "epoch": 1.1756624141315015, - "grad_norm": 0.8655257821083069, - "learning_rate": 9.932274423685839e-05, - "loss": 0.1105, + "epoch": 4.701864573110893, + "grad_norm": 0.5954302549362183, + "learning_rate": 3.1199228325757814e-05, + "loss": 0.0501, "step": 17970 }, { - "epoch": 1.1763166503107623, - "grad_norm": 0.9878515005111694, - "learning_rate": 9.932123660843389e-05, - "loss": 0.119, + "epoch": 4.7044815178279356, + "grad_norm": 0.3794120252132416, + "learning_rate": 3.117920335275102e-05, + "loss": 0.0392, "step": 17980 }, { - "epoch": 1.176970886490023, - "grad_norm": 1.0736156702041626, - "learning_rate": 9.931972731528831e-05, - "loss": 0.1112, + "epoch": 4.7070984625449785, + "grad_norm": 0.45704203844070435, + "learning_rate": 3.115917415631702e-05, + "loss": 0.0414, "step": 17990 }, { - "epoch": 1.1776251226692835, - "grad_norm": 0.9092381596565247, - "learning_rate": 9.931821635747259e-05, - "loss": 0.1154, + "epoch": 4.7097154072620215, + "grad_norm": 0.44982025027275085, + "learning_rate": 3.113914075014555e-05, + "loss": 0.0438, "step": 18000 }, { - "epoch": 1.1782793588485443, - "grad_norm": 0.9012355208396912, - "learning_rate": 9.931670373503771e-05, - "loss": 0.1123, + "epoch": 4.7097154072620215, + "eval_loss": 0.05080666894939734, + "eval_runtime": 9.0538, + "eval_samples_per_second": 113.102, + "eval_steps_per_second": 1.767, + "step": 18000 + }, + { + "epoch": 4.7123323519790645, + "grad_norm": 0.36351096630096436, + "learning_rate": 3.111910314792926e-05, + "loss": 0.0458, "step": 18010 }, { - "epoch": 1.178933595027805, - "grad_norm": 0.8360759019851685, - "learning_rate": 9.931518944803477e-05, - "loss": 0.1086, + "epoch": 4.7149492966961075, + "grad_norm": 0.513700008392334, + "learning_rate": 3.1099061363363685e-05, + "loss": 0.0462, "step": 18020 }, { - "epoch": 1.1795878312070658, - "grad_norm": 1.0164974927902222, - "learning_rate": 9.931367349651484e-05, - "loss": 0.1137, + "epoch": 4.7175662414131505, + "grad_norm": 0.34745171666145325, + "learning_rate": 3.107901541014717e-05, + "loss": 0.0424, "step": 18030 }, { - "epoch": 1.1802420673863265, - "grad_norm": 0.7645466327667236, - "learning_rate": 9.93121558805291e-05, - "loss": 0.0971, + "epoch": 4.720183186130193, + "grad_norm": 0.44875892996788025, + "learning_rate": 3.105896530198094e-05, + "loss": 0.0499, "step": 18040 }, { - "epoch": 1.180896303565587, - "grad_norm": 0.8536872863769531, - "learning_rate": 9.931063660012875e-05, - "loss": 0.1113, + "epoch": 4.722800130847236, + "grad_norm": 0.3810875117778778, + "learning_rate": 3.1038911052569055e-05, + "loss": 0.0504, "step": 18050 }, { - "epoch": 1.1815505397448478, - "grad_norm": 0.6680562496185303, - "learning_rate": 9.930911565536513e-05, - "loss": 0.0963, + "epoch": 4.725417075564279, + "grad_norm": 0.48473119735717773, + "learning_rate": 3.101885267561841e-05, + "loss": 0.0415, "step": 18060 }, { - "epoch": 1.1822047759241086, - "grad_norm": 0.9181721210479736, - "learning_rate": 9.930759304628952e-05, - "loss": 0.1102, + "epoch": 4.728034020281322, + "grad_norm": 0.5180307626724243, + "learning_rate": 3.0998790184838735e-05, + "loss": 0.0459, "step": 18070 }, { - "epoch": 1.1828590121033693, - "grad_norm": 0.9609587788581848, - "learning_rate": 9.930606877295334e-05, - "loss": 0.0972, + "epoch": 4.730650964998365, + "grad_norm": 0.3327622413635254, + "learning_rate": 3.0978723593942516e-05, + "loss": 0.0426, "step": 18080 }, { - "epoch": 1.18351324828263, - "grad_norm": 0.8478082418441772, - "learning_rate": 9.930454283540802e-05, - "loss": 0.0914, + "epoch": 4.733267909715408, + "grad_norm": 0.48294293880462646, + "learning_rate": 3.0958652916645104e-05, + "loss": 0.0463, "step": 18090 }, { - "epoch": 1.1841674844618908, - "grad_norm": 0.7738621830940247, - "learning_rate": 9.930301523370507e-05, - "loss": 0.11, + "epoch": 4.7358848544324506, + "grad_norm": 0.4532416760921478, + "learning_rate": 3.0938578166664604e-05, + "loss": 0.0471, "step": 18100 }, { - "epoch": 1.1848217206411515, - "grad_norm": 0.8491454720497131, - "learning_rate": 9.930148596789605e-05, - "loss": 0.1067, + "epoch": 4.738501799149493, + "grad_norm": 0.3738783895969391, + "learning_rate": 3.091849935772193e-05, + "loss": 0.0391, "step": 18110 }, { - "epoch": 1.185475956820412, - "grad_norm": 0.9434249401092529, - "learning_rate": 9.92999550380326e-05, - "loss": 0.1046, + "epoch": 4.741118743866536, + "grad_norm": 0.5960399508476257, + "learning_rate": 3.089841650354076e-05, + "loss": 0.0442, "step": 18120 }, { - "epoch": 1.1861301929996728, - "grad_norm": 1.3511039018630981, - "learning_rate": 9.929842244416636e-05, - "loss": 0.1087, + "epoch": 4.743735688583579, + "grad_norm": 0.7043212652206421, + "learning_rate": 3.0878329617847514e-05, + "loss": 0.0393, "step": 18130 }, { - "epoch": 1.1867844291789336, - "grad_norm": 0.9221027493476868, - "learning_rate": 9.929688818634907e-05, - "loss": 0.1059, + "epoch": 4.746352633300622, + "grad_norm": 0.42389801144599915, + "learning_rate": 3.0858238714371405e-05, + "loss": 0.052, "step": 18140 }, { - "epoch": 1.1874386653581943, - "grad_norm": 0.8466002941131592, - "learning_rate": 9.929535226463253e-05, - "loss": 0.1008, + "epoch": 4.748969578017665, + "grad_norm": 0.4757963716983795, + "learning_rate": 3.0838143806844374e-05, + "loss": 0.0474, "step": 18150 }, { - "epoch": 1.188092901537455, - "grad_norm": 0.9640662670135498, - "learning_rate": 9.929381467906858e-05, - "loss": 0.1137, + "epoch": 4.751586522734708, + "grad_norm": 0.5247064232826233, + "learning_rate": 3.081804490900111e-05, + "loss": 0.0402, "step": 18160 }, { - "epoch": 1.1887471377167158, - "grad_norm": 0.9665100574493408, - "learning_rate": 9.929227542970908e-05, - "loss": 0.0976, + "epoch": 4.75420346745175, + "grad_norm": 0.6219999194145203, + "learning_rate": 3.0797942034579016e-05, + "loss": 0.0512, "step": 18170 }, { - "epoch": 1.1894013738959766, - "grad_norm": 1.027797818183899, - "learning_rate": 9.929073451660602e-05, - "loss": 0.101, + "epoch": 4.756820412168793, + "grad_norm": 0.5547354221343994, + "learning_rate": 3.077783519731819e-05, + "loss": 0.0448, "step": 18180 }, { - "epoch": 1.190055610075237, - "grad_norm": 0.7733204364776611, - "learning_rate": 9.928919193981141e-05, - "loss": 0.105, + "epoch": 4.759437356885836, + "grad_norm": 0.25582262873649597, + "learning_rate": 3.075772441096151e-05, + "loss": 0.0425, "step": 18190 }, { - "epoch": 1.1907098462544978, - "grad_norm": 0.874467670917511, - "learning_rate": 9.928764769937729e-05, - "loss": 0.105, + "epoch": 4.762054301602879, + "grad_norm": 0.3228617310523987, + "learning_rate": 3.0737609689254473e-05, + "loss": 0.041, "step": 18200 }, { - "epoch": 1.1913640824337586, - "grad_norm": 0.8329676985740662, - "learning_rate": 9.92861017953558e-05, - "loss": 0.094, + "epoch": 4.764671246319922, + "grad_norm": 0.35130971670150757, + "learning_rate": 3.071749104594533e-05, + "loss": 0.0433, "step": 18210 }, { - "epoch": 1.1920183186130193, - "grad_norm": 0.9815132021903992, - "learning_rate": 9.928455422779913e-05, - "loss": 0.1056, + "epoch": 4.767288191036965, + "grad_norm": 0.5520963668823242, + "learning_rate": 3.0697368494784966e-05, + "loss": 0.0513, "step": 18220 }, { - "epoch": 1.19267255479228, - "grad_norm": 0.8479218482971191, - "learning_rate": 9.92830049967595e-05, - "loss": 0.1049, + "epoch": 4.769905135754007, + "grad_norm": 0.4022964835166931, + "learning_rate": 3.067724204952695e-05, + "loss": 0.0438, "step": 18230 }, { - "epoch": 1.1933267909715408, - "grad_norm": 1.3466743230819702, - "learning_rate": 9.928145410228919e-05, - "loss": 0.1031, + "epoch": 4.77252208047105, + "grad_norm": 0.5612972378730774, + "learning_rate": 3.0657111723927535e-05, + "loss": 0.0433, "step": 18240 }, { - "epoch": 1.1939810271508013, - "grad_norm": 0.9225918650627136, - "learning_rate": 9.927990154444056e-05, - "loss": 0.1104, + "epoch": 4.775139025188093, + "grad_norm": 0.5245701670646667, + "learning_rate": 3.0636977531745595e-05, + "loss": 0.0453, "step": 18250 }, { - "epoch": 1.194635263330062, - "grad_norm": 1.143968939781189, - "learning_rate": 9.927834732326602e-05, - "loss": 0.1122, + "epoch": 4.777755969905136, + "grad_norm": 0.8202319741249084, + "learning_rate": 3.0616839486742667e-05, + "loss": 0.0482, "step": 18260 }, { - "epoch": 1.1952894995093228, - "grad_norm": 0.9355697631835938, - "learning_rate": 9.927679143881802e-05, - "loss": 0.0969, + "epoch": 4.780372914622179, + "grad_norm": 0.45992013812065125, + "learning_rate": 3.059669760268292e-05, + "loss": 0.0444, "step": 18270 }, { - "epoch": 1.1959437356885836, - "grad_norm": 0.9907440543174744, - "learning_rate": 9.927523389114908e-05, - "loss": 0.1212, + "epoch": 4.782989859339222, + "grad_norm": 0.5470813512802124, + "learning_rate": 3.0576551893333124e-05, + "loss": 0.0432, "step": 18280 }, { - "epoch": 1.1965979718678443, - "grad_norm": 0.8513557314872742, - "learning_rate": 9.927367468031175e-05, - "loss": 0.0925, + "epoch": 4.785606804056265, + "grad_norm": 0.37620311975479126, + "learning_rate": 3.05564023724627e-05, + "loss": 0.0451, "step": 18290 }, { - "epoch": 1.197252208047105, - "grad_norm": 0.790827751159668, - "learning_rate": 9.927211380635869e-05, - "loss": 0.0964, + "epoch": 4.788223748773307, + "grad_norm": 0.4782375395298004, + "learning_rate": 3.053624905384364e-05, + "loss": 0.0507, "step": 18300 }, { - "epoch": 1.1979064442263656, - "grad_norm": 1.2124277353286743, - "learning_rate": 9.927055126934256e-05, - "loss": 0.0996, + "epoch": 4.79084069349035, + "grad_norm": 0.31268492341041565, + "learning_rate": 3.0516091951250563e-05, + "loss": 0.0431, "step": 18310 }, { - "epoch": 1.1985606804056264, - "grad_norm": 1.0649062395095825, - "learning_rate": 9.92689870693161e-05, - "loss": 0.0976, + "epoch": 4.793457638207393, + "grad_norm": 0.37621551752090454, + "learning_rate": 3.0495931078460654e-05, + "loss": 0.0456, "step": 18320 }, { - "epoch": 1.199214916584887, - "grad_norm": 1.0294406414031982, - "learning_rate": 9.926742120633214e-05, - "loss": 0.1023, + "epoch": 4.796074582924436, + "grad_norm": 0.42091211676597595, + "learning_rate": 3.047576644925367e-05, + "loss": 0.0486, "step": 18330 }, { - "epoch": 1.1998691527641479, - "grad_norm": 0.8996325135231018, - "learning_rate": 9.92658536804435e-05, - "loss": 0.0976, + "epoch": 4.798691527641479, + "grad_norm": 0.5487850904464722, + "learning_rate": 3.0455598077411952e-05, + "loss": 0.0493, "step": 18340 }, { - "epoch": 1.2005233889434086, - "grad_norm": 0.8989105820655823, - "learning_rate": 9.926428449170308e-05, - "loss": 0.1096, + "epoch": 4.801308472358522, + "grad_norm": 0.6485093832015991, + "learning_rate": 3.0435425976720395e-05, + "loss": 0.0437, "step": 18350 }, { - "epoch": 1.2011776251226693, - "grad_norm": 0.7884949445724487, - "learning_rate": 9.926271364016386e-05, - "loss": 0.1116, + "epoch": 4.803925417075565, + "grad_norm": 0.5202974081039429, + "learning_rate": 3.041525016096643e-05, + "loss": 0.044, "step": 18360 }, { - "epoch": 1.20183186130193, - "grad_norm": 0.8274450302124023, - "learning_rate": 9.926114112587886e-05, - "loss": 0.1015, + "epoch": 4.806542361792607, + "grad_norm": 0.3426308035850525, + "learning_rate": 3.0395070643940048e-05, + "loss": 0.0466, "step": 18370 }, { - "epoch": 1.2024860974811906, - "grad_norm": 1.003177285194397, - "learning_rate": 9.925956694890115e-05, - "loss": 0.1233, + "epoch": 4.80915930650965, + "grad_norm": 0.647731602191925, + "learning_rate": 3.0374887439433748e-05, + "loss": 0.0547, "step": 18380 }, { - "epoch": 1.2031403336604514, - "grad_norm": 0.9113197326660156, - "learning_rate": 9.925799110928388e-05, - "loss": 0.1148, + "epoch": 4.811776251226693, + "grad_norm": 0.8727405071258545, + "learning_rate": 3.0354700561242573e-05, + "loss": 0.0429, "step": 18390 }, { - "epoch": 1.2037945698397121, - "grad_norm": 0.8547518849372864, - "learning_rate": 9.925641360708021e-05, - "loss": 0.1136, + "epoch": 4.814393195943736, + "grad_norm": 0.7195037007331848, + "learning_rate": 3.0334510023164054e-05, + "loss": 0.0481, "step": 18400 }, { - "epoch": 1.2044488060189729, - "grad_norm": 0.7965877056121826, - "learning_rate": 9.925483444234341e-05, - "loss": 0.0907, + "epoch": 4.817010140660779, + "grad_norm": 0.4770624041557312, + "learning_rate": 3.031431583899823e-05, + "loss": 0.0526, "step": 18410 }, { - "epoch": 1.2051030421982336, - "grad_norm": 0.9823962450027466, - "learning_rate": 9.925325361512679e-05, - "loss": 0.0995, + "epoch": 4.819627085377821, + "grad_norm": 0.5087447166442871, + "learning_rate": 3.0294118022547645e-05, + "loss": 0.0431, "step": 18420 }, { - "epoch": 1.2057572783774944, - "grad_norm": 0.8240478038787842, - "learning_rate": 9.925167112548365e-05, - "loss": 0.1053, + "epoch": 4.822244030094864, + "grad_norm": 0.4283955693244934, + "learning_rate": 3.027391658761731e-05, + "loss": 0.0485, "step": 18430 }, { - "epoch": 1.206411514556755, - "grad_norm": 0.8721191883087158, - "learning_rate": 9.925008697346748e-05, - "loss": 0.1069, + "epoch": 4.824860974811907, + "grad_norm": 0.34631991386413574, + "learning_rate": 3.025371154801472e-05, + "loss": 0.0416, "step": 18440 }, { - "epoch": 1.2070657507360156, - "grad_norm": 0.9270473718643188, - "learning_rate": 9.924850115913169e-05, - "loss": 0.1148, + "epoch": 4.82747791952895, + "grad_norm": 0.2821793258190155, + "learning_rate": 3.0233502917549826e-05, + "loss": 0.0442, "step": 18450 }, { - "epoch": 1.2077199869152764, - "grad_norm": 0.7241986989974976, - "learning_rate": 9.924691368252983e-05, - "loss": 0.1004, + "epoch": 4.830094864245993, + "grad_norm": 0.631080687046051, + "learning_rate": 3.0213290710035035e-05, + "loss": 0.0473, "step": 18460 }, { - "epoch": 1.2083742230945371, - "grad_norm": 0.8268777132034302, - "learning_rate": 9.924532454371549e-05, - "loss": 0.0935, + "epoch": 4.832711808963036, + "grad_norm": 0.4035530090332031, + "learning_rate": 3.0193074939285206e-05, + "loss": 0.0453, "step": 18470 }, { - "epoch": 1.2090284592737979, - "grad_norm": 0.8865098357200623, - "learning_rate": 9.924373374274228e-05, - "loss": 0.1058, + "epoch": 4.835328753680079, + "grad_norm": 0.40414726734161377, + "learning_rate": 3.0172855619117612e-05, + "loss": 0.0399, "step": 18480 }, { - "epoch": 1.2096826954530586, - "grad_norm": 0.8308011293411255, - "learning_rate": 9.924214127966391e-05, - "loss": 0.1022, + "epoch": 4.837945698397121, + "grad_norm": 0.3670026957988739, + "learning_rate": 3.0152632763351995e-05, + "loss": 0.0447, "step": 18490 }, { - "epoch": 1.2103369316323191, - "grad_norm": 0.8910857439041138, - "learning_rate": 9.924054715453414e-05, - "loss": 0.0998, + "epoch": 4.840562643114164, + "grad_norm": 0.4316958487033844, + "learning_rate": 3.0132406385810463e-05, + "loss": 0.0453, "step": 18500 }, { - "epoch": 1.21099116781158, - "grad_norm": 0.8274952173233032, - "learning_rate": 9.923895136740676e-05, - "loss": 0.1031, + "epoch": 4.843179587831207, + "grad_norm": 0.3881956934928894, + "learning_rate": 3.011217650031756e-05, + "loss": 0.0478, "step": 18510 }, { - "epoch": 1.2116454039908406, - "grad_norm": 1.0838192701339722, - "learning_rate": 9.923735391833564e-05, - "loss": 0.1024, + "epoch": 4.84579653254825, + "grad_norm": 0.513076901435852, + "learning_rate": 3.0091943120700233e-05, + "loss": 0.0454, "step": 18520 }, { - "epoch": 1.2122996401701014, - "grad_norm": 0.8322728276252747, - "learning_rate": 9.92357548073747e-05, - "loss": 0.1234, + "epoch": 4.848413477265293, + "grad_norm": 0.5722450613975525, + "learning_rate": 3.0071706260787792e-05, + "loss": 0.0418, "step": 18530 }, { - "epoch": 1.2129538763493621, - "grad_norm": 0.8210560083389282, - "learning_rate": 9.923415403457789e-05, - "loss": 0.1051, + "epoch": 4.851030421982336, + "grad_norm": 0.4285776913166046, + "learning_rate": 3.0051465934411944e-05, + "loss": 0.0439, "step": 18540 }, { - "epoch": 1.2136081125286229, - "grad_norm": 0.8708292245864868, - "learning_rate": 9.92325515999993e-05, - "loss": 0.0952, + "epoch": 4.853647366699379, + "grad_norm": 0.4009416401386261, + "learning_rate": 3.0031222155406763e-05, + "loss": 0.047, "step": 18550 }, { - "epoch": 1.2142623487078836, - "grad_norm": 0.8271515965461731, - "learning_rate": 9.923094750369293e-05, - "loss": 0.1003, + "epoch": 4.856264311416421, + "grad_norm": 0.35369279980659485, + "learning_rate": 3.0010974937608677e-05, + "loss": 0.0452, "step": 18560 }, { - "epoch": 1.2149165848871442, - "grad_norm": 0.7365519404411316, - "learning_rate": 9.9229341745713e-05, - "loss": 0.1061, + "epoch": 4.858881256133464, + "grad_norm": 0.443196564912796, + "learning_rate": 2.9990724294856475e-05, + "loss": 0.0441, "step": 18570 }, { - "epoch": 1.215570821066405, - "grad_norm": 1.1527734994888306, - "learning_rate": 9.922773432611366e-05, - "loss": 0.1127, + "epoch": 4.861498200850507, + "grad_norm": 0.5534233450889587, + "learning_rate": 2.9970470240991284e-05, + "loss": 0.0468, "step": 18580 }, { - "epoch": 1.2162250572456657, - "grad_norm": 0.7652072310447693, - "learning_rate": 9.922612524494919e-05, - "loss": 0.1048, + "epoch": 4.86411514556755, + "grad_norm": 0.39392098784446716, + "learning_rate": 2.9950212789856535e-05, + "loss": 0.0414, "step": 18590 }, { - "epoch": 1.2168792934249264, - "grad_norm": 0.8632088899612427, - "learning_rate": 9.92245145022739e-05, - "loss": 0.101, + "epoch": 4.866732090284593, + "grad_norm": 0.3481430411338806, + "learning_rate": 2.9929951955298035e-05, + "loss": 0.0465, "step": 18600 }, { - "epoch": 1.2175335296041871, - "grad_norm": 0.8901262283325195, - "learning_rate": 9.922290209814214e-05, - "loss": 0.0977, + "epoch": 4.869349035001636, + "grad_norm": 0.4503512680530548, + "learning_rate": 2.9909687751163855e-05, + "loss": 0.0438, "step": 18610 }, { - "epoch": 1.218187765783448, - "grad_norm": 0.8626736402511597, - "learning_rate": 9.922128803260833e-05, - "loss": 0.0963, + "epoch": 4.871965979718678, + "grad_norm": 0.4893753230571747, + "learning_rate": 2.9889420191304397e-05, + "loss": 0.0432, "step": 18620 }, { - "epoch": 1.2188420019627086, - "grad_norm": 0.8524816632270813, - "learning_rate": 9.921967230572699e-05, - "loss": 0.1004, + "epoch": 4.874582924435721, + "grad_norm": 0.4819509983062744, + "learning_rate": 2.9869149289572347e-05, + "loss": 0.0426, "step": 18630 }, { - "epoch": 1.2194962381419692, - "grad_norm": 0.6984155774116516, - "learning_rate": 9.92180549175526e-05, - "loss": 0.1046, + "epoch": 4.877199869152764, + "grad_norm": 0.47873371839523315, + "learning_rate": 2.9848875059822657e-05, + "loss": 0.0437, "step": 18640 }, { - "epoch": 1.22015047432123, - "grad_norm": 0.9021031856536865, - "learning_rate": 9.92164358681398e-05, - "loss": 0.096, + "epoch": 4.879816813869807, + "grad_norm": 0.49186429381370544, + "learning_rate": 2.98285975159126e-05, + "loss": 0.0362, "step": 18650 }, { - "epoch": 1.2208047105004907, - "grad_norm": 0.8669722676277161, - "learning_rate": 9.921481515754321e-05, - "loss": 0.1071, + "epoch": 4.88243375858685, + "grad_norm": 1.002996802330017, + "learning_rate": 2.9808316671701658e-05, + "loss": 0.0444, "step": 18660 }, { - "epoch": 1.2214589466797514, - "grad_norm": 0.8371648788452148, - "learning_rate": 9.921319278581752e-05, - "loss": 0.1051, + "epoch": 4.885050703303893, + "grad_norm": 0.48731791973114014, + "learning_rate": 2.978803254105162e-05, + "loss": 0.0382, "step": 18670 }, { - "epoch": 1.2221131828590122, - "grad_norm": 0.8591510653495789, - "learning_rate": 9.921156875301751e-05, - "loss": 0.0984, + "epoch": 4.887667648020935, + "grad_norm": 0.45587220788002014, + "learning_rate": 2.9767745137826487e-05, + "loss": 0.0473, "step": 18680 }, { - "epoch": 1.222767419038273, - "grad_norm": 0.7590886950492859, - "learning_rate": 9.920994305919801e-05, - "loss": 0.1065, + "epoch": 4.890284592737978, + "grad_norm": 0.5397971868515015, + "learning_rate": 2.9747454475892505e-05, + "loss": 0.0456, "step": 18690 }, { - "epoch": 1.2234216552175334, - "grad_norm": 0.9001472592353821, - "learning_rate": 9.920831570441387e-05, - "loss": 0.0941, + "epoch": 4.892901537455021, + "grad_norm": 0.6106773018836975, + "learning_rate": 2.972716056911816e-05, + "loss": 0.0475, "step": 18700 }, { - "epoch": 1.2240758913967942, - "grad_norm": 1.0100491046905518, - "learning_rate": 9.920668668872002e-05, - "loss": 0.0984, + "epoch": 4.895518482172064, + "grad_norm": 0.9047291874885559, + "learning_rate": 2.9706863431374138e-05, + "loss": 0.0441, "step": 18710 }, { - "epoch": 1.224730127576055, - "grad_norm": 0.9002352952957153, - "learning_rate": 9.920505601217143e-05, - "loss": 0.1018, + "epoch": 4.898135426889107, + "grad_norm": 0.6912985444068909, + "learning_rate": 2.9686563076533347e-05, + "loss": 0.0469, "step": 18720 }, { - "epoch": 1.2253843637553157, - "grad_norm": 0.8079519867897034, - "learning_rate": 9.920342367482318e-05, - "loss": 0.1088, + "epoch": 4.90075237160615, + "grad_norm": 0.40746009349823, + "learning_rate": 2.9666259518470885e-05, + "loss": 0.042, "step": 18730 }, { - "epoch": 1.2260385999345764, - "grad_norm": 0.9378504157066345, - "learning_rate": 9.920178967673031e-05, - "loss": 0.1105, + "epoch": 4.903369316323193, + "grad_norm": 0.4547899067401886, + "learning_rate": 2.9645952771064035e-05, + "loss": 0.0439, "step": 18740 }, { - "epoch": 1.2266928361138372, - "grad_norm": 0.9143635034561157, - "learning_rate": 9.920015401794803e-05, - "loss": 0.0919, + "epoch": 4.905986261040235, + "grad_norm": 0.651161789894104, + "learning_rate": 2.9625642848192283e-05, + "loss": 0.0454, "step": 18750 }, { - "epoch": 1.2273470722930977, - "grad_norm": 0.8441551923751831, - "learning_rate": 9.91985166985315e-05, - "loss": 0.097, + "epoch": 4.908603205757278, + "grad_norm": 0.47897329926490784, + "learning_rate": 2.9605329763737254e-05, + "loss": 0.0445, "step": 18760 }, { - "epoch": 1.2280013084723584, - "grad_norm": 0.8805584907531738, - "learning_rate": 9.919687771853601e-05, - "loss": 0.1003, + "epoch": 4.911220150474321, + "grad_norm": 0.5273807644844055, + "learning_rate": 2.958501353158276e-05, + "loss": 0.0433, "step": 18770 }, { - "epoch": 1.2286555446516192, - "grad_norm": 1.0052324533462524, - "learning_rate": 9.919523707801687e-05, - "loss": 0.1009, + "epoch": 4.913837095191364, + "grad_norm": 0.7630659937858582, + "learning_rate": 2.956469416561476e-05, + "loss": 0.0457, "step": 18780 }, { - "epoch": 1.22930978083088, - "grad_norm": 0.9868443608283997, - "learning_rate": 9.919359477702948e-05, - "loss": 0.1163, + "epoch": 4.916454039908407, + "grad_norm": 0.6563160419464111, + "learning_rate": 2.9544371679721326e-05, + "loss": 0.0414, "step": 18790 }, { - "epoch": 1.2299640170101407, - "grad_norm": 1.144353985786438, - "learning_rate": 9.919195081562924e-05, - "loss": 0.1081, + "epoch": 4.91907098462545, + "grad_norm": 0.45346808433532715, + "learning_rate": 2.952404608779271e-05, + "loss": 0.0418, "step": 18800 }, { - "epoch": 1.2306182531894014, - "grad_norm": 1.0622575283050537, - "learning_rate": 9.919030519387164e-05, - "loss": 0.0981, + "epoch": 4.921687929342493, + "grad_norm": 0.3976815938949585, + "learning_rate": 2.950371740372125e-05, + "loss": 0.0449, "step": 18810 }, { - "epoch": 1.2312724893686622, - "grad_norm": 0.8485022783279419, - "learning_rate": 9.918865791181224e-05, - "loss": 0.1029, + "epoch": 4.924304874059535, + "grad_norm": 0.3786884546279907, + "learning_rate": 2.9483385641401407e-05, + "loss": 0.0416, "step": 18820 }, { - "epoch": 1.2319267255479227, - "grad_norm": 1.022147536277771, - "learning_rate": 9.918700896950664e-05, - "loss": 0.1053, + "epoch": 4.926921818776578, + "grad_norm": 0.4601690173149109, + "learning_rate": 2.946305081472976e-05, + "loss": 0.0477, "step": 18830 }, { - "epoch": 1.2325809617271835, - "grad_norm": 0.8381478190422058, - "learning_rate": 9.918535836701047e-05, - "loss": 0.1024, + "epoch": 4.929538763493621, + "grad_norm": 0.5352820754051208, + "learning_rate": 2.9442712937604962e-05, + "loss": 0.0455, "step": 18840 }, { - "epoch": 1.2332351979064442, - "grad_norm": 0.7995995879173279, - "learning_rate": 9.918370610437948e-05, - "loss": 0.0961, + "epoch": 4.932155708210664, + "grad_norm": 0.3937074840068817, + "learning_rate": 2.9422372023927764e-05, + "loss": 0.0439, "step": 18850 }, { - "epoch": 1.233889434085705, - "grad_norm": 1.0238806009292603, - "learning_rate": 9.91820521816694e-05, - "loss": 0.1049, + "epoch": 4.934772652927707, + "grad_norm": 0.5810989737510681, + "learning_rate": 2.9402028087600992e-05, + "loss": 0.0462, "step": 18860 }, { - "epoch": 1.2345436702649657, - "grad_norm": 1.0174421072006226, - "learning_rate": 9.91803965989361e-05, - "loss": 0.0893, + "epoch": 4.937389597644749, + "grad_norm": 0.5032179951667786, + "learning_rate": 2.938168114252952e-05, + "loss": 0.0386, "step": 18870 }, { - "epoch": 1.2351979064442264, - "grad_norm": 1.1415332555770874, - "learning_rate": 9.917873935623542e-05, - "loss": 0.1049, + "epoch": 4.940006542361792, + "grad_norm": 0.39301612973213196, + "learning_rate": 2.936133120262031e-05, + "loss": 0.0412, "step": 18880 }, { - "epoch": 1.2358521426234872, - "grad_norm": 0.7645314931869507, - "learning_rate": 9.91770804536233e-05, - "loss": 0.1065, + "epoch": 4.942623487078835, + "grad_norm": 0.39664873480796814, + "learning_rate": 2.934097828178235e-05, + "loss": 0.0418, "step": 18890 }, { - "epoch": 1.2365063788027477, - "grad_norm": 0.9801765084266663, - "learning_rate": 9.917541989115578e-05, - "loss": 0.1038, + "epoch": 4.945240431795878, + "grad_norm": 0.39426127076148987, + "learning_rate": 2.9320622393926667e-05, + "loss": 0.0425, "step": 18900 }, { - "epoch": 1.2371606149820085, - "grad_norm": 0.9121055603027344, - "learning_rate": 9.917375766888883e-05, - "loss": 0.1095, + "epoch": 4.947857376512921, + "grad_norm": 0.27260127663612366, + "learning_rate": 2.9300263552966324e-05, + "loss": 0.0369, "step": 18910 }, { - "epoch": 1.2378148511612692, - "grad_norm": 0.7746009230613708, - "learning_rate": 9.917209378687862e-05, - "loss": 0.0968, + "epoch": 4.950474321229964, + "grad_norm": 0.4988190233707428, + "learning_rate": 2.927990177281638e-05, + "loss": 0.05, "step": 18920 }, { - "epoch": 1.23846908734053, - "grad_norm": 0.8752132058143616, - "learning_rate": 9.917042824518127e-05, - "loss": 0.1014, + "epoch": 4.953091265947007, + "grad_norm": 0.4185965359210968, + "learning_rate": 2.9259537067393937e-05, + "loss": 0.0483, "step": 18930 }, { - "epoch": 1.2391233235197907, - "grad_norm": 0.8438636660575867, - "learning_rate": 9.916876104385303e-05, - "loss": 0.1035, + "epoch": 4.955708210664049, + "grad_norm": 0.36746054887771606, + "learning_rate": 2.923916945061807e-05, + "loss": 0.0406, "step": 18940 }, { - "epoch": 1.2397775596990512, - "grad_norm": 1.1780970096588135, - "learning_rate": 9.916709218295015e-05, - "loss": 0.1077, + "epoch": 4.958325155381092, + "grad_norm": 0.4041653871536255, + "learning_rate": 2.9218798936409868e-05, + "loss": 0.0375, "step": 18950 }, { - "epoch": 1.240431795878312, - "grad_norm": 0.9799401164054871, - "learning_rate": 9.916542166252895e-05, - "loss": 0.1137, + "epoch": 4.960942100098135, + "grad_norm": 0.40771132707595825, + "learning_rate": 2.9198425538692365e-05, + "loss": 0.0432, "step": 18960 }, { - "epoch": 1.2410860320575727, - "grad_norm": 0.6996291875839233, - "learning_rate": 9.916374948264584e-05, - "loss": 0.1007, + "epoch": 4.963559044815178, + "grad_norm": 0.5572474598884583, + "learning_rate": 2.917804927139059e-05, + "loss": 0.0475, "step": 18970 }, { - "epoch": 1.2417402682368335, - "grad_norm": 0.7507869601249695, - "learning_rate": 9.916207564335726e-05, - "loss": 0.0997, + "epoch": 4.966175989532221, + "grad_norm": 0.48255524039268494, + "learning_rate": 2.915767014843154e-05, + "loss": 0.0455, "step": 18980 }, { - "epoch": 1.2423945044160942, - "grad_norm": 0.8153190016746521, - "learning_rate": 9.916040014471968e-05, - "loss": 0.0974, + "epoch": 4.968792934249264, + "grad_norm": 0.5636023283004761, + "learning_rate": 2.913728818374415e-05, + "loss": 0.0423, "step": 18990 }, { - "epoch": 1.243048740595355, - "grad_norm": 0.7431339621543884, - "learning_rate": 9.915872298678966e-05, - "loss": 0.092, + "epoch": 4.971409878966307, + "grad_norm": 0.48205071687698364, + "learning_rate": 2.9116903391259305e-05, + "loss": 0.043, + "step": 19000 + }, + { + "epoch": 4.971409878966307, + "eval_loss": 0.04874729268246368, + "eval_runtime": 9.2011, + "eval_samples_per_second": 111.291, + "eval_steps_per_second": 1.739, "step": 19000 }, { - "epoch": 1.2437029767746157, - "grad_norm": 1.1183733940124512, - "learning_rate": 9.915704416962382e-05, - "loss": 0.1045, + "epoch": 4.974026823683349, + "grad_norm": 0.4441853165626526, + "learning_rate": 2.90965157849098e-05, + "loss": 0.0416, "step": 19010 }, { - "epoch": 1.2443572129538762, - "grad_norm": 1.058430790901184, - "learning_rate": 9.915536369327883e-05, - "loss": 0.1204, + "epoch": 4.976643768400392, + "grad_norm": 0.439701110124588, + "learning_rate": 2.907612537863038e-05, + "loss": 0.0378, "step": 19020 }, { - "epoch": 1.245011449133137, - "grad_norm": 0.9192705750465393, - "learning_rate": 9.91536815578114e-05, - "loss": 0.113, + "epoch": 4.979260713117435, + "grad_norm": 0.44340062141418457, + "learning_rate": 2.9055732186357716e-05, + "loss": 0.0444, "step": 19030 }, { - "epoch": 1.2456656853123977, - "grad_norm": 0.899825394153595, - "learning_rate": 9.91519977632783e-05, - "loss": 0.0965, + "epoch": 4.981877657834478, + "grad_norm": 0.439158171415329, + "learning_rate": 2.903533622203033e-05, + "loss": 0.0488, "step": 19040 }, { - "epoch": 1.2463199214916585, - "grad_norm": 0.8516882061958313, - "learning_rate": 9.915031230973637e-05, - "loss": 0.1042, + "epoch": 4.984494602551521, + "grad_norm": 0.4943934977054596, + "learning_rate": 2.9014937499588703e-05, + "loss": 0.0457, "step": 19050 }, { - "epoch": 1.2469741576709192, - "grad_norm": 0.8238911032676697, - "learning_rate": 9.914862519724251e-05, - "loss": 0.1051, + "epoch": 4.987111547268564, + "grad_norm": 0.47403454780578613, + "learning_rate": 2.8994536032975145e-05, + "loss": 0.0458, "step": 19060 }, { - "epoch": 1.24762839385018, - "grad_norm": 0.8177777528762817, - "learning_rate": 9.914693642585364e-05, - "loss": 0.1074, + "epoch": 4.989728491985606, + "grad_norm": 0.37442466616630554, + "learning_rate": 2.8974131836133865e-05, + "loss": 0.0419, "step": 19070 }, { - "epoch": 1.2482826300294407, - "grad_norm": 1.2330362796783447, - "learning_rate": 9.914524599562677e-05, - "loss": 0.1078, + "epoch": 4.992345436702649, + "grad_norm": 0.49841246008872986, + "learning_rate": 2.8953724923010965e-05, + "loss": 0.0449, "step": 19080 }, { - "epoch": 1.2489368662087013, - "grad_norm": 1.126373529434204, - "learning_rate": 9.914355390661896e-05, - "loss": 0.1013, + "epoch": 4.994962381419692, + "grad_norm": 0.33906620740890503, + "learning_rate": 2.893331530755436e-05, + "loss": 0.0412, "step": 19090 }, { - "epoch": 1.249591102387962, - "grad_norm": 0.8344969153404236, - "learning_rate": 9.914186015888733e-05, - "loss": 0.1009, + "epoch": 4.997579326136735, + "grad_norm": 0.5222365260124207, + "learning_rate": 2.8912903003713827e-05, + "loss": 0.0493, "step": 19100 }, { - "epoch": 1.2502453385672228, - "grad_norm": 0.8182376027107239, - "learning_rate": 9.914016475248904e-05, - "loss": 0.0968, + "epoch": 5.0, + "grad_norm": 0.2986067533493042, + "learning_rate": 2.8892488025440982e-05, + "loss": 0.0475, "step": 19110 }, { - "epoch": 1.2508995747464835, - "grad_norm": 0.9014137387275696, - "learning_rate": 9.913846768748132e-05, - "loss": 0.1089, + "epoch": 5.002616944717043, + "grad_norm": 0.4610765874385834, + "learning_rate": 2.8872070386689276e-05, + "loss": 0.0459, "step": 19120 }, { - "epoch": 1.2515538109257442, - "grad_norm": 0.9868068695068359, - "learning_rate": 9.913676896392144e-05, - "loss": 0.1141, + "epoch": 5.005233889434086, + "grad_norm": 0.47639599442481995, + "learning_rate": 2.885165010141398e-05, + "loss": 0.043, "step": 19130 }, { - "epoch": 1.2522080471050048, - "grad_norm": 1.0246328115463257, - "learning_rate": 9.913506858186673e-05, - "loss": 0.105, + "epoch": 5.007850834151129, + "grad_norm": 0.5547071695327759, + "learning_rate": 2.8831227183572158e-05, + "loss": 0.0444, "step": 19140 }, { - "epoch": 1.2528622832842657, - "grad_norm": 0.783169686794281, - "learning_rate": 9.913336654137459e-05, - "loss": 0.0996, + "epoch": 5.010467778868171, + "grad_norm": 0.6149731278419495, + "learning_rate": 2.881080164712268e-05, + "loss": 0.0496, "step": 19150 }, { - "epoch": 1.2535165194635263, - "grad_norm": 0.8389064073562622, - "learning_rate": 9.913166284250251e-05, - "loss": 0.089, + "epoch": 5.013084723585214, + "grad_norm": 0.3835916221141815, + "learning_rate": 2.8790373506026208e-05, + "loss": 0.0455, "step": 19160 }, { - "epoch": 1.254170755642787, - "grad_norm": 0.8309929966926575, - "learning_rate": 9.912995748530792e-05, - "loss": 0.0968, + "epoch": 5.015701668302257, + "grad_norm": 0.46660664677619934, + "learning_rate": 2.8769942774245186e-05, + "loss": 0.0374, "step": 19170 }, { - "epoch": 1.2548249918220478, - "grad_norm": 0.9536809325218201, - "learning_rate": 9.912825046984842e-05, - "loss": 0.1027, + "epoch": 5.0183186130193, + "grad_norm": 0.5138661861419678, + "learning_rate": 2.874950946574383e-05, + "loss": 0.0461, "step": 19180 }, { - "epoch": 1.2554792280013085, - "grad_norm": 1.0172020196914673, - "learning_rate": 9.912654179618164e-05, - "loss": 0.1111, + "epoch": 5.020935557736343, + "grad_norm": 0.5261203050613403, + "learning_rate": 2.8729073594488104e-05, + "loss": 0.0429, "step": 19190 }, { - "epoch": 1.2561334641805693, - "grad_norm": 0.8524767160415649, - "learning_rate": 9.912483146436522e-05, - "loss": 0.0941, + "epoch": 5.023552502453386, + "grad_norm": 0.5713613629341125, + "learning_rate": 2.870863517444575e-05, + "loss": 0.0496, "step": 19200 }, { - "epoch": 1.2567877003598298, - "grad_norm": 0.9486467242240906, - "learning_rate": 9.912311947445692e-05, - "loss": 0.1148, + "epoch": 5.026169447170428, + "grad_norm": 0.3695147931575775, + "learning_rate": 2.868819421958621e-05, + "loss": 0.0424, "step": 19210 }, { - "epoch": 1.2574419365390905, - "grad_norm": 0.9282219409942627, - "learning_rate": 9.91214058265145e-05, - "loss": 0.1087, + "epoch": 5.028786391887471, + "grad_norm": 0.3375604450702667, + "learning_rate": 2.86677507438807e-05, + "loss": 0.0444, "step": 19220 }, { - "epoch": 1.2580961727183513, - "grad_norm": 0.9173193573951721, - "learning_rate": 9.911969052059579e-05, - "loss": 0.1129, + "epoch": 5.031403336604514, + "grad_norm": 0.3388157784938812, + "learning_rate": 2.8647304761302158e-05, + "loss": 0.0431, "step": 19230 }, { - "epoch": 1.258750408897612, - "grad_norm": 0.8629287481307983, - "learning_rate": 9.911797355675874e-05, - "loss": 0.109, + "epoch": 5.034020281321557, + "grad_norm": 0.3800851106643677, + "learning_rate": 2.8626856285825188e-05, + "loss": 0.0406, "step": 19240 }, { - "epoch": 1.2594046450768728, - "grad_norm": 0.8385807871818542, - "learning_rate": 9.911625493506124e-05, - "loss": 0.1052, + "epoch": 5.0366372260386, + "grad_norm": 0.5340576171875, + "learning_rate": 2.8606405331426145e-05, + "loss": 0.0456, "step": 19250 }, { - "epoch": 1.2600588812561335, - "grad_norm": 0.7391451597213745, - "learning_rate": 9.911453465556133e-05, - "loss": 0.1045, + "epoch": 5.039254170755643, + "grad_norm": 0.5981658101081848, + "learning_rate": 2.8585951912083075e-05, + "loss": 0.0478, "step": 19260 }, { - "epoch": 1.2607131174353943, - "grad_norm": 0.8053876757621765, - "learning_rate": 9.911281271831707e-05, - "loss": 0.1016, + "epoch": 5.041871115472686, + "grad_norm": 0.40278035402297974, + "learning_rate": 2.8565496041775674e-05, + "loss": 0.0375, "step": 19270 }, { - "epoch": 1.2613673536146548, - "grad_norm": 0.8911042809486389, - "learning_rate": 9.911108912338657e-05, - "loss": 0.1077, + "epoch": 5.044488060189728, + "grad_norm": 0.473084419965744, + "learning_rate": 2.854503773448537e-05, + "loss": 0.0424, "step": 19280 }, { - "epoch": 1.2620215897939155, - "grad_norm": 0.8589571118354797, - "learning_rate": 9.910936387082802e-05, - "loss": 0.1093, + "epoch": 5.047105004906771, + "grad_norm": 0.726222813129425, + "learning_rate": 2.8524577004195187e-05, + "loss": 0.0509, "step": 19290 }, { - "epoch": 1.2626758259731763, - "grad_norm": 0.8094229102134705, - "learning_rate": 9.910763696069965e-05, - "loss": 0.1096, + "epoch": 5.049721949623814, + "grad_norm": 0.45083606243133545, + "learning_rate": 2.8504113864889855e-05, + "loss": 0.0458, "step": 19300 }, { - "epoch": 1.263330062152437, - "grad_norm": 0.8509666323661804, - "learning_rate": 9.910590839305973e-05, - "loss": 0.1072, + "epoch": 5.052338894340857, + "grad_norm": 0.48179444670677185, + "learning_rate": 2.848364833055574e-05, + "loss": 0.0448, "step": 19310 }, { - "epoch": 1.2639842983316978, - "grad_norm": 0.9664101600646973, - "learning_rate": 9.910417816796662e-05, - "loss": 0.1019, + "epoch": 5.0549558390579, + "grad_norm": 0.39184945821762085, + "learning_rate": 2.846318041518084e-05, + "loss": 0.0458, "step": 19320 }, { - "epoch": 1.2646385345109585, - "grad_norm": 0.8401138782501221, - "learning_rate": 9.910244628547872e-05, - "loss": 0.1015, + "epoch": 5.057572783774943, + "grad_norm": 0.5279496908187866, + "learning_rate": 2.844271013275479e-05, + "loss": 0.0459, "step": 19330 }, { - "epoch": 1.2652927706902193, - "grad_norm": 0.7719675898551941, - "learning_rate": 9.910071274565449e-05, - "loss": 0.1116, + "epoch": 5.060189728491985, + "grad_norm": 0.5475168824195862, + "learning_rate": 2.8422237497268816e-05, + "loss": 0.042, "step": 19340 }, { - "epoch": 1.2659470068694798, - "grad_norm": 1.0611908435821533, - "learning_rate": 9.909897754855242e-05, - "loss": 0.0963, + "epoch": 5.062806673209028, + "grad_norm": 0.6053759455680847, + "learning_rate": 2.840176252271578e-05, + "loss": 0.0444, "step": 19350 }, { - "epoch": 1.2666012430487406, - "grad_norm": 0.9017995595932007, - "learning_rate": 9.90972406942311e-05, - "loss": 0.1046, + "epoch": 5.065423617926071, + "grad_norm": 0.44237303733825684, + "learning_rate": 2.838128522309015e-05, + "loss": 0.04, "step": 19360 }, { - "epoch": 1.2672554792280013, - "grad_norm": 0.7480419278144836, - "learning_rate": 9.909550218274915e-05, - "loss": 0.0987, + "epoch": 5.068040562643114, + "grad_norm": 0.5803554058074951, + "learning_rate": 2.8360805612387946e-05, + "loss": 0.0403, "step": 19370 }, { - "epoch": 1.267909715407262, - "grad_norm": 0.949092447757721, - "learning_rate": 9.909376201416522e-05, - "loss": 0.1129, + "epoch": 5.070657507360157, + "grad_norm": 0.47988569736480713, + "learning_rate": 2.8340323704606797e-05, + "loss": 0.0489, "step": 19380 }, { - "epoch": 1.2685639515865228, - "grad_norm": 0.8834240436553955, - "learning_rate": 9.909202018853809e-05, - "loss": 0.1007, + "epoch": 5.0732744520772, + "grad_norm": 0.6084237098693848, + "learning_rate": 2.8319839513745895e-05, + "loss": 0.0451, "step": 19390 }, { - "epoch": 1.2692181877657833, - "grad_norm": 0.8566089272499084, - "learning_rate": 9.909027670592652e-05, - "loss": 0.1074, + "epoch": 5.075891396794242, + "grad_norm": 0.4827035367488861, + "learning_rate": 2.8299353053805983e-05, + "loss": 0.0464, "step": 19400 }, { - "epoch": 1.2698724239450443, - "grad_norm": 0.8526421785354614, - "learning_rate": 9.908853156638937e-05, - "loss": 0.116, + "epoch": 5.078508341511285, + "grad_norm": 0.5168353319168091, + "learning_rate": 2.827886433878938e-05, + "loss": 0.0414, "step": 19410 }, { - "epoch": 1.2705266601243048, - "grad_norm": 0.8203496932983398, - "learning_rate": 9.908678476998555e-05, - "loss": 0.098, + "epoch": 5.081125286228328, + "grad_norm": 0.44367021322250366, + "learning_rate": 2.825837338269991e-05, + "loss": 0.042, "step": 19420 }, { - "epoch": 1.2711808963035656, - "grad_norm": 0.8378643989562988, - "learning_rate": 9.908503631677399e-05, - "loss": 0.1081, + "epoch": 5.083742230945371, + "grad_norm": 0.5074538588523865, + "learning_rate": 2.8237880199542966e-05, + "loss": 0.041, "step": 19430 }, { - "epoch": 1.2718351324828263, - "grad_norm": 1.0013712644577026, - "learning_rate": 9.908328620681373e-05, - "loss": 0.1006, + "epoch": 5.086359175662414, + "grad_norm": 0.6296460628509521, + "learning_rate": 2.8217384803325432e-05, + "loss": 0.0481, "step": 19440 }, { - "epoch": 1.272489368662087, - "grad_norm": 0.9886232018470764, - "learning_rate": 9.908153444016385e-05, - "loss": 0.118, + "epoch": 5.088976120379457, + "grad_norm": 0.34975865483283997, + "learning_rate": 2.8196887208055716e-05, + "loss": 0.0408, "step": 19450 }, { - "epoch": 1.2731436048413478, - "grad_norm": 0.9862875938415527, - "learning_rate": 9.907978101688344e-05, - "loss": 0.1166, + "epoch": 5.0915930650965, + "grad_norm": 0.34852832555770874, + "learning_rate": 2.8176387427743755e-05, + "loss": 0.0407, "step": 19460 }, { - "epoch": 1.2737978410206083, - "grad_norm": 0.935799241065979, - "learning_rate": 9.907802593703173e-05, - "loss": 0.0968, + "epoch": 5.094210009813542, + "grad_norm": 0.365998238325119, + "learning_rate": 2.815588547640093e-05, + "loss": 0.0411, "step": 19470 }, { - "epoch": 1.274452077199869, - "grad_norm": 0.8719044923782349, - "learning_rate": 9.90762692006679e-05, - "loss": 0.0947, + "epoch": 5.096826954530585, + "grad_norm": 0.463615357875824, + "learning_rate": 2.8135381368040135e-05, + "loss": 0.0406, "step": 19480 }, { - "epoch": 1.2751063133791298, - "grad_norm": 0.7218495607376099, - "learning_rate": 9.90745108078513e-05, - "loss": 0.0911, + "epoch": 5.099443899247628, + "grad_norm": 0.4203396737575531, + "learning_rate": 2.811487511667574e-05, + "loss": 0.0433, "step": 19490 }, { - "epoch": 1.2757605495583906, - "grad_norm": 0.8420484066009521, - "learning_rate": 9.907275075864127e-05, - "loss": 0.1092, + "epoch": 5.102060843964671, + "grad_norm": 0.36783847212791443, + "learning_rate": 2.809436673632358e-05, + "loss": 0.0397, "step": 19500 }, { - "epoch": 1.2764147857376513, - "grad_norm": 0.808154821395874, - "learning_rate": 9.907098905309718e-05, - "loss": 0.0971, + "epoch": 5.104677788681714, + "grad_norm": 0.500868558883667, + "learning_rate": 2.807385624100094e-05, + "loss": 0.0498, "step": 19510 }, { - "epoch": 1.277069021916912, - "grad_norm": 0.7025526165962219, - "learning_rate": 9.906922569127853e-05, - "loss": 0.108, + "epoch": 5.107294733398757, + "grad_norm": 0.34654349088668823, + "learning_rate": 2.8053343644726533e-05, + "loss": 0.0444, "step": 19520 }, { - "epoch": 1.2777232580961728, - "grad_norm": 0.9783757925033569, - "learning_rate": 9.906746067324481e-05, - "loss": 0.1065, + "epoch": 5.109911678115799, + "grad_norm": 0.45738351345062256, + "learning_rate": 2.803282896152054e-05, + "loss": 0.0461, "step": 19530 }, { - "epoch": 1.2783774942754333, - "grad_norm": 0.8030864000320435, - "learning_rate": 9.906569399905561e-05, - "loss": 0.1054, + "epoch": 5.112528622832842, + "grad_norm": 0.41508185863494873, + "learning_rate": 2.8012312205404543e-05, + "loss": 0.0419, "step": 19540 }, { - "epoch": 1.279031730454694, - "grad_norm": 0.8669215440750122, - "learning_rate": 9.906392566877057e-05, - "loss": 0.0978, + "epoch": 5.115145567549885, + "grad_norm": 0.36502814292907715, + "learning_rate": 2.7991793390401567e-05, + "loss": 0.0402, "step": 19550 }, { - "epoch": 1.2796859666339548, - "grad_norm": 0.8483967781066895, - "learning_rate": 9.906215568244935e-05, - "loss": 0.1098, + "epoch": 5.117762512266928, + "grad_norm": 0.45576921105384827, + "learning_rate": 2.7971272530536025e-05, + "loss": 0.0447, "step": 19560 }, { - "epoch": 1.2803402028132156, - "grad_norm": 0.7597243785858154, - "learning_rate": 9.906038404015171e-05, - "loss": 0.106, + "epoch": 5.120379456983971, + "grad_norm": 0.47386133670806885, + "learning_rate": 2.7950749639833713e-05, + "loss": 0.0392, "step": 19570 }, { - "epoch": 1.2809944389924763, - "grad_norm": 0.9336119294166565, - "learning_rate": 9.905861074193745e-05, - "loss": 0.0995, + "epoch": 5.122996401701014, + "grad_norm": 0.3990080952644348, + "learning_rate": 2.793022473232185e-05, + "loss": 0.0435, "step": 19580 }, { - "epoch": 1.2816486751717369, - "grad_norm": 0.8763530254364014, - "learning_rate": 9.905683578786641e-05, - "loss": 0.0976, + "epoch": 5.125613346418057, + "grad_norm": 0.3933347165584564, + "learning_rate": 2.7909697822029012e-05, + "loss": 0.0406, "step": 19590 }, { - "epoch": 1.2823029113509978, - "grad_norm": 0.9109959602355957, - "learning_rate": 9.905505917799851e-05, - "loss": 0.1021, + "epoch": 5.128230291135099, + "grad_norm": 0.34663620591163635, + "learning_rate": 2.7889168922985155e-05, + "loss": 0.0405, "step": 19600 }, { - "epoch": 1.2829571475302584, - "grad_norm": 1.0181946754455566, - "learning_rate": 9.90532809123937e-05, - "loss": 0.1142, + "epoch": 5.130847235852142, + "grad_norm": 0.5836282968521118, + "learning_rate": 2.786863804922158e-05, + "loss": 0.0423, "step": 19610 }, { - "epoch": 1.283611383709519, - "grad_norm": 1.0679359436035156, - "learning_rate": 9.905150099111202e-05, - "loss": 0.1152, + "epoch": 5.133464180569185, + "grad_norm": 0.49704045057296753, + "learning_rate": 2.7848105214770942e-05, + "loss": 0.0431, "step": 19620 }, { - "epoch": 1.2842656198887799, - "grad_norm": 0.8650368452072144, - "learning_rate": 9.904971941421355e-05, - "loss": 0.0966, + "epoch": 5.136081125286228, + "grad_norm": 0.477346807718277, + "learning_rate": 2.7827570433667254e-05, + "loss": 0.0441, "step": 19630 }, { - "epoch": 1.2849198560680406, - "grad_norm": 1.0523502826690674, - "learning_rate": 9.90479361817584e-05, - "loss": 0.104, + "epoch": 5.138698070003271, + "grad_norm": 0.44346386194229126, + "learning_rate": 2.7807033719945828e-05, + "loss": 0.0383, "step": 19640 }, { - "epoch": 1.2855740922473013, - "grad_norm": 0.976112961769104, - "learning_rate": 9.904615129380676e-05, - "loss": 0.0967, + "epoch": 5.141315014720314, + "grad_norm": 0.33207669854164124, + "learning_rate": 2.778649508764333e-05, + "loss": 0.0432, "step": 19650 }, { - "epoch": 1.2862283284265619, - "grad_norm": 0.8600479364395142, - "learning_rate": 9.904436475041891e-05, - "loss": 0.105, + "epoch": 5.1439319594373565, + "grad_norm": 0.5000958442687988, + "learning_rate": 2.7765954550797718e-05, + "loss": 0.0418, "step": 19660 }, { - "epoch": 1.2868825646058226, - "grad_norm": 1.066514015197754, - "learning_rate": 9.904257655165512e-05, - "loss": 0.1022, + "epoch": 5.1465489041543995, + "grad_norm": 0.8191132545471191, + "learning_rate": 2.7745412123448245e-05, + "loss": 0.0474, "step": 19670 }, { - "epoch": 1.2875368007850834, - "grad_norm": 0.9571858644485474, - "learning_rate": 9.904078669757575e-05, - "loss": 0.1039, + "epoch": 5.1491658488714425, + "grad_norm": 0.4225716292858124, + "learning_rate": 2.7724867819635476e-05, + "loss": 0.0465, "step": 19680 }, { - "epoch": 1.2881910369643441, - "grad_norm": 0.8673845529556274, - "learning_rate": 9.90389951882412e-05, - "loss": 0.0921, + "epoch": 5.1517827935884855, + "grad_norm": 0.30504903197288513, + "learning_rate": 2.7704321653401245e-05, + "loss": 0.0372, "step": 19690 }, { - "epoch": 1.2888452731436049, - "grad_norm": 0.995274007320404, - "learning_rate": 9.903720202371198e-05, - "loss": 0.1015, + "epoch": 5.1543997383055284, + "grad_norm": 0.47394004464149475, + "learning_rate": 2.7683773638788664e-05, + "loss": 0.0409, "step": 19700 }, { - "epoch": 1.2894995093228656, - "grad_norm": 0.9427167773246765, - "learning_rate": 9.903540720404856e-05, - "loss": 0.0934, + "epoch": 5.157016683022571, + "grad_norm": 0.4267828166484833, + "learning_rate": 2.766322378984211e-05, + "loss": 0.0377, "step": 19710 }, { - "epoch": 1.2901537455021264, - "grad_norm": 0.8194208145141602, - "learning_rate": 9.903361072931156e-05, - "loss": 0.0943, + "epoch": 5.159633627739614, + "grad_norm": 0.48577219247817993, + "learning_rate": 2.7642672120607204e-05, + "loss": 0.0382, "step": 19720 }, { - "epoch": 1.2908079816813869, - "grad_norm": 0.8158044815063477, - "learning_rate": 9.903181259956161e-05, - "loss": 0.1123, + "epoch": 5.1622505724566565, + "grad_norm": 0.45582279562950134, + "learning_rate": 2.7622118645130823e-05, + "loss": 0.0421, "step": 19730 }, { - "epoch": 1.2914622178606476, - "grad_norm": 0.7874143123626709, - "learning_rate": 9.903001281485937e-05, - "loss": 0.0954, + "epoch": 5.1648675171736995, + "grad_norm": 0.3249519169330597, + "learning_rate": 2.7601563377461082e-05, + "loss": 0.0429, "step": 19740 }, { - "epoch": 1.2921164540399084, - "grad_norm": 1.0694737434387207, - "learning_rate": 9.902821137526564e-05, - "loss": 0.1028, + "epoch": 5.1674844618907425, + "grad_norm": 0.4997425079345703, + "learning_rate": 2.7581006331647292e-05, + "loss": 0.0452, "step": 19750 }, { - "epoch": 1.2927706902191691, - "grad_norm": 0.9189092516899109, - "learning_rate": 9.902640828084118e-05, - "loss": 0.1016, + "epoch": 5.1701014066077855, + "grad_norm": 0.594337522983551, + "learning_rate": 2.7560447521740017e-05, + "loss": 0.0417, "step": 19760 }, { - "epoch": 1.2934249263984299, - "grad_norm": 0.7503056526184082, - "learning_rate": 9.902460353164687e-05, - "loss": 0.0958, + "epoch": 5.1727183513248285, + "grad_norm": 0.49194571375846863, + "learning_rate": 2.7539886961791e-05, + "loss": 0.042, "step": 19770 }, { - "epoch": 1.2940791625776906, - "grad_norm": 1.0281898975372314, - "learning_rate": 9.90227971277436e-05, - "loss": 0.0997, + "epoch": 5.1753352960418715, + "grad_norm": 0.40315955877304077, + "learning_rate": 2.75193246658532e-05, + "loss": 0.0442, "step": 19780 }, { - "epoch": 1.2947333987569514, - "grad_norm": 0.817094624042511, - "learning_rate": 9.902098906919239e-05, - "loss": 0.1076, + "epoch": 5.177952240758914, + "grad_norm": 0.5103853940963745, + "learning_rate": 2.749876064798075e-05, + "loss": 0.0455, "step": 19790 }, { - "epoch": 1.295387634936212, - "grad_norm": 0.7925615906715393, - "learning_rate": 9.901917935605423e-05, - "loss": 0.0986, + "epoch": 5.180569185475957, + "grad_norm": 0.40776246786117554, + "learning_rate": 2.7478194922228952e-05, + "loss": 0.0432, "step": 19800 }, { - "epoch": 1.2960418711154726, - "grad_norm": 0.8204017877578735, - "learning_rate": 9.901736798839018e-05, - "loss": 0.0976, + "epoch": 5.183186130193, + "grad_norm": 0.4205770194530487, + "learning_rate": 2.7457627502654294e-05, + "loss": 0.0435, "step": 19810 }, { - "epoch": 1.2966961072947334, - "grad_norm": 0.7727102637290955, - "learning_rate": 9.901555496626145e-05, - "loss": 0.0922, + "epoch": 5.1858030749100426, + "grad_norm": 0.3220876157283783, + "learning_rate": 2.743705840331441e-05, + "loss": 0.0417, "step": 19820 }, { - "epoch": 1.2973503434739941, - "grad_norm": 0.9235280156135559, - "learning_rate": 9.901374028972916e-05, - "loss": 0.1103, + "epoch": 5.1884200196270855, + "grad_norm": 0.495869904756546, + "learning_rate": 2.741648763826809e-05, + "loss": 0.0426, "step": 19830 }, { - "epoch": 1.2980045796532549, - "grad_norm": 0.8759993314743042, - "learning_rate": 9.901192395885461e-05, - "loss": 0.0985, + "epoch": 5.1910369643441285, + "grad_norm": 0.45162034034729004, + "learning_rate": 2.7395915221575258e-05, + "loss": 0.0414, "step": 19840 }, { - "epoch": 1.2986588158325154, - "grad_norm": 0.7741413116455078, - "learning_rate": 9.901010597369907e-05, - "loss": 0.0985, + "epoch": 5.193653909061171, + "grad_norm": 0.34791964292526245, + "learning_rate": 2.737534116729696e-05, + "loss": 0.0408, "step": 19850 }, { - "epoch": 1.2993130520117764, - "grad_norm": 0.8373982906341553, - "learning_rate": 9.900828633432393e-05, - "loss": 0.1054, + "epoch": 5.196270853778214, + "grad_norm": 0.4020437002182007, + "learning_rate": 2.7354765489495375e-05, + "loss": 0.0442, "step": 19860 }, { - "epoch": 1.299967288191037, - "grad_norm": 0.708233654499054, - "learning_rate": 9.90064650407906e-05, - "loss": 0.1, + "epoch": 5.198887798495257, + "grad_norm": 0.4706566035747528, + "learning_rate": 2.733418820223378e-05, + "loss": 0.0433, "step": 19870 }, { - "epoch": 1.3006215243702977, - "grad_norm": 0.8219735026359558, - "learning_rate": 9.900464209316054e-05, - "loss": 0.1052, + "epoch": 5.2015047432123, + "grad_norm": 0.3812898099422455, + "learning_rate": 2.731360931957656e-05, + "loss": 0.0418, "step": 19880 }, { - "epoch": 1.3012757605495584, - "grad_norm": 0.7485021352767944, - "learning_rate": 9.900281749149531e-05, - "loss": 0.0986, + "epoch": 5.204121687929343, + "grad_norm": 0.48587170243263245, + "learning_rate": 2.7293028855589187e-05, + "loss": 0.0423, "step": 19890 }, { - "epoch": 1.3019299967288191, - "grad_norm": 0.8410215973854065, - "learning_rate": 9.900099123585646e-05, - "loss": 0.1012, + "epoch": 5.206738632646386, + "grad_norm": 0.5795092582702637, + "learning_rate": 2.727244682433821e-05, + "loss": 0.0441, "step": 19900 }, { - "epoch": 1.30258423290808, - "grad_norm": 0.9873031377792358, - "learning_rate": 9.899916332630565e-05, - "loss": 0.1031, + "epoch": 5.209355577363429, + "grad_norm": 0.41178473830223083, + "learning_rate": 2.7251863239891262e-05, + "loss": 0.0452, "step": 19910 }, { - "epoch": 1.3032384690873404, - "grad_norm": 0.7580830454826355, - "learning_rate": 9.899733376290458e-05, - "loss": 0.1084, + "epoch": 5.211972522080471, + "grad_norm": 0.47530198097229004, + "learning_rate": 2.7231278116317015e-05, + "loss": 0.0406, "step": 19920 }, { - "epoch": 1.3038927052666012, - "grad_norm": 0.8297830820083618, - "learning_rate": 9.899550254571499e-05, - "loss": 0.1034, + "epoch": 5.214589466797514, + "grad_norm": 0.3284004330635071, + "learning_rate": 2.7210691467685222e-05, + "loss": 0.0453, "step": 19930 }, { - "epoch": 1.304546941445862, - "grad_norm": 0.888883113861084, - "learning_rate": 9.899366967479868e-05, - "loss": 0.0951, + "epoch": 5.217206411514557, + "grad_norm": 0.5120458602905273, + "learning_rate": 2.7190103308066656e-05, + "loss": 0.0456, "step": 19940 }, { - "epoch": 1.3052011776251227, - "grad_norm": 0.7784373760223389, - "learning_rate": 9.899183515021755e-05, - "loss": 0.107, + "epoch": 5.2198233562316, + "grad_norm": 0.500260055065155, + "learning_rate": 2.7169513651533125e-05, + "loss": 0.0352, "step": 19950 }, { - "epoch": 1.3058554138043834, - "grad_norm": 0.7872931361198425, - "learning_rate": 9.898999897203347e-05, - "loss": 0.0941, + "epoch": 5.222440300948643, + "grad_norm": 0.5112075209617615, + "learning_rate": 2.7148922512157476e-05, + "loss": 0.0463, "step": 19960 }, { - "epoch": 1.3065096499836442, - "grad_norm": 0.8830298781394958, - "learning_rate": 9.898816114030846e-05, - "loss": 0.1035, + "epoch": 5.225057245665686, + "grad_norm": 0.516282320022583, + "learning_rate": 2.712832990401355e-05, + "loss": 0.0425, "step": 19970 }, { - "epoch": 1.307163886162905, - "grad_norm": 0.8231805562973022, - "learning_rate": 9.898632165510455e-05, - "loss": 0.1061, + "epoch": 5.227674190382728, + "grad_norm": 0.48279473185539246, + "learning_rate": 2.7107735841176206e-05, + "loss": 0.0413, "step": 19980 }, { - "epoch": 1.3078181223421654, - "grad_norm": 0.8644198179244995, - "learning_rate": 9.89844805164838e-05, - "loss": 0.1068, + "epoch": 5.230291135099771, + "grad_norm": 0.5420926809310913, + "learning_rate": 2.708714033772129e-05, + "loss": 0.0386, "step": 19990 }, { - "epoch": 1.3084723585214262, - "grad_norm": 0.7423153519630432, - "learning_rate": 9.898263772450836e-05, - "loss": 0.102, + "epoch": 5.232908079816814, + "grad_norm": 0.4328511357307434, + "learning_rate": 2.706654340772563e-05, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 5.232908079816814, + "eval_loss": 0.04762020907485075, + "eval_runtime": 8.9323, + "eval_samples_per_second": 114.64, + "eval_steps_per_second": 1.791, "step": 20000 }, { - "epoch": 1.309126594700687, - "grad_norm": 0.9147112965583801, - "learning_rate": 9.898079327924044e-05, - "loss": 0.1079, + "epoch": 5.235525024533857, + "grad_norm": 0.6541851162910461, + "learning_rate": 2.704594506526704e-05, + "loss": 0.0422, "step": 20010 }, { - "epoch": 1.3097808308799477, - "grad_norm": 0.8351669907569885, - "learning_rate": 9.897894718074229e-05, - "loss": 0.1015, + "epoch": 5.2381419692509, + "grad_norm": 0.474886953830719, + "learning_rate": 2.7025345324424288e-05, + "loss": 0.0378, "step": 20020 }, { - "epoch": 1.3104350670592084, - "grad_norm": 0.9719964861869812, - "learning_rate": 9.897709942907623e-05, - "loss": 0.096, + "epoch": 5.240758913967943, + "grad_norm": 0.35807138681411743, + "learning_rate": 2.7004744199277104e-05, + "loss": 0.0415, "step": 20030 }, { - "epoch": 1.311089303238469, - "grad_norm": 1.0228232145309448, - "learning_rate": 9.897525002430458e-05, - "loss": 0.1035, + "epoch": 5.243375858684986, + "grad_norm": 0.5502864718437195, + "learning_rate": 2.698414170390617e-05, + "loss": 0.0424, "step": 20040 }, { - "epoch": 1.31174353941773, - "grad_norm": 0.8931130766868591, - "learning_rate": 9.897339896648985e-05, - "loss": 0.1063, + "epoch": 5.245992803402028, + "grad_norm": 0.36918938159942627, + "learning_rate": 2.6963537852393085e-05, + "loss": 0.0387, "step": 20050 }, { - "epoch": 1.3123977755969904, - "grad_norm": 0.992167592048645, - "learning_rate": 9.897154625569443e-05, - "loss": 0.0942, + "epoch": 5.248609748119071, + "grad_norm": 0.4112201929092407, + "learning_rate": 2.694293265882039e-05, + "loss": 0.0343, "step": 20060 }, { - "epoch": 1.3130520117762512, - "grad_norm": 0.7140622735023499, - "learning_rate": 9.89696918919809e-05, - "loss": 0.0967, + "epoch": 5.251226692836114, + "grad_norm": 0.5169610977172852, + "learning_rate": 2.6922326137271555e-05, + "loss": 0.0415, "step": 20070 }, { - "epoch": 1.313706247955512, - "grad_norm": 0.7795881032943726, - "learning_rate": 9.896783587541186e-05, - "loss": 0.105, + "epoch": 5.253843637553157, + "grad_norm": 0.3329308032989502, + "learning_rate": 2.690171830183092e-05, + "loss": 0.041, "step": 20080 }, { - "epoch": 1.3143604841347727, - "grad_norm": 0.8552742004394531, - "learning_rate": 9.896597820604992e-05, - "loss": 0.0931, + "epoch": 5.2564605822702, + "grad_norm": 0.5257441401481628, + "learning_rate": 2.688110916658376e-05, + "loss": 0.0442, "step": 20090 }, { - "epoch": 1.3150147203140334, - "grad_norm": 0.7586219310760498, - "learning_rate": 9.89641188839578e-05, - "loss": 0.1137, + "epoch": 5.259077526987243, + "grad_norm": 0.3802300691604614, + "learning_rate": 2.6860498745616218e-05, + "loss": 0.035, "step": 20100 }, { - "epoch": 1.315668956493294, - "grad_norm": 0.9405565857887268, - "learning_rate": 9.896225790919826e-05, - "loss": 0.1114, + "epoch": 5.261694471704285, + "grad_norm": 0.4359605610370636, + "learning_rate": 2.683988705301534e-05, + "loss": 0.0405, "step": 20110 }, { - "epoch": 1.3163231926725547, - "grad_norm": 0.929673969745636, - "learning_rate": 9.89603952818341e-05, - "loss": 0.0999, + "epoch": 5.264311416421328, + "grad_norm": 0.42070209980010986, + "learning_rate": 2.6819274102869002e-05, + "loss": 0.0402, "step": 20120 }, { - "epoch": 1.3169774288518155, - "grad_norm": 0.8156664371490479, - "learning_rate": 9.89585310019282e-05, - "loss": 0.1036, + "epoch": 5.266928361138371, + "grad_norm": 0.4507392644882202, + "learning_rate": 2.679865990926597e-05, + "loss": 0.0426, "step": 20130 }, { - "epoch": 1.3176316650310762, - "grad_norm": 0.9651734232902527, - "learning_rate": 9.895666506954347e-05, - "loss": 0.104, + "epoch": 5.269545305855414, + "grad_norm": 0.6350913643836975, + "learning_rate": 2.677804448629585e-05, + "loss": 0.0427, "step": 20140 }, { - "epoch": 1.318285901210337, - "grad_norm": 1.05132257938385, - "learning_rate": 9.895479748474292e-05, - "loss": 0.1016, + "epoch": 5.272162250572457, + "grad_norm": 0.7911574840545654, + "learning_rate": 2.6757427848049088e-05, + "loss": 0.0391, "step": 20150 }, { - "epoch": 1.3189401373895977, - "grad_norm": 0.8636537790298462, - "learning_rate": 9.895292824758955e-05, - "loss": 0.1022, + "epoch": 5.2747791952895, + "grad_norm": 0.5109353065490723, + "learning_rate": 2.673681000861697e-05, + "loss": 0.0464, "step": 20160 }, { - "epoch": 1.3195943735688584, - "grad_norm": 0.7908622026443481, - "learning_rate": 9.895105735814647e-05, - "loss": 0.1048, + "epoch": 5.277396140006543, + "grad_norm": 0.5190655589103699, + "learning_rate": 2.6716190982091588e-05, + "loss": 0.042, "step": 20170 }, { - "epoch": 1.320248609748119, - "grad_norm": 0.8223440051078796, - "learning_rate": 9.894918481647684e-05, - "loss": 0.0958, + "epoch": 5.280013084723585, + "grad_norm": 0.3754146099090576, + "learning_rate": 2.6695570782565843e-05, + "loss": 0.0466, "step": 20180 }, { - "epoch": 1.3209028459273797, - "grad_norm": 0.9097592234611511, - "learning_rate": 9.894731062264383e-05, - "loss": 0.1036, + "epoch": 5.282630029440628, + "grad_norm": 0.4404628872871399, + "learning_rate": 2.6674949424133468e-05, + "loss": 0.042, "step": 20190 }, { - "epoch": 1.3215570821066405, - "grad_norm": 0.8000107407569885, - "learning_rate": 9.894543477671072e-05, - "loss": 0.0993, + "epoch": 5.285246974157671, + "grad_norm": 0.30781108140945435, + "learning_rate": 2.6654326920888946e-05, + "loss": 0.0377, "step": 20200 }, { - "epoch": 1.3222113182859012, - "grad_norm": 0.8129693865776062, - "learning_rate": 9.894355727874083e-05, - "loss": 0.113, + "epoch": 5.287863918874714, + "grad_norm": 0.4061555862426758, + "learning_rate": 2.6633703286927576e-05, + "loss": 0.0426, "step": 20210 }, { - "epoch": 1.322865554465162, - "grad_norm": 0.7431675791740417, - "learning_rate": 9.894167812879751e-05, - "loss": 0.1008, + "epoch": 5.290480863591757, + "grad_norm": 0.4692903757095337, + "learning_rate": 2.6613078536345414e-05, + "loss": 0.0476, "step": 20220 }, { - "epoch": 1.3235197906444227, - "grad_norm": 1.0079487562179565, - "learning_rate": 9.893979732694421e-05, - "loss": 0.1075, + "epoch": 5.2930978083088, + "grad_norm": 0.46763700246810913, + "learning_rate": 2.659245268323928e-05, + "loss": 0.0456, "step": 20230 }, { - "epoch": 1.3241740268236835, - "grad_norm": 0.7943064570426941, - "learning_rate": 9.89379148732444e-05, - "loss": 0.1154, + "epoch": 5.295714753025842, + "grad_norm": 0.6062338948249817, + "learning_rate": 2.6571825741706762e-05, + "loss": 0.04, "step": 20240 }, { - "epoch": 1.324828263002944, - "grad_norm": 0.7072292566299438, - "learning_rate": 9.893603076776162e-05, - "loss": 0.113, + "epoch": 5.298331697742885, + "grad_norm": 0.4335156977176666, + "learning_rate": 2.655119772584616e-05, + "loss": 0.0408, "step": 20250 }, { - "epoch": 1.3254824991822047, - "grad_norm": 0.8471561074256897, - "learning_rate": 9.893414501055947e-05, - "loss": 0.1068, + "epoch": 5.300948642459928, + "grad_norm": 0.4853779077529907, + "learning_rate": 2.653056864975655e-05, + "loss": 0.0441, "step": 20260 }, { - "epoch": 1.3261367353614655, - "grad_norm": 0.842134416103363, - "learning_rate": 9.893225760170157e-05, - "loss": 0.1066, + "epoch": 5.303565587176971, + "grad_norm": 0.17079606652259827, + "learning_rate": 2.65099385275377e-05, + "loss": 0.0404, "step": 20270 }, { - "epoch": 1.3267909715407262, - "grad_norm": 0.836654543876648, - "learning_rate": 9.893036854125166e-05, - "loss": 0.099, + "epoch": 5.306182531894014, + "grad_norm": 0.32185569405555725, + "learning_rate": 2.6489307373290096e-05, + "loss": 0.039, "step": 20280 }, { - "epoch": 1.327445207719987, - "grad_norm": 0.8249868750572205, - "learning_rate": 9.892847782927348e-05, - "loss": 0.0977, + "epoch": 5.308799476611057, + "grad_norm": 0.3419385254383087, + "learning_rate": 2.646867520111495e-05, + "loss": 0.0381, "step": 20290 }, { - "epoch": 1.3280994438992475, - "grad_norm": 0.7410039305686951, - "learning_rate": 9.892658546583087e-05, - "loss": 0.114, + "epoch": 5.311416421328099, + "grad_norm": 0.5202774405479431, + "learning_rate": 2.644804202511415e-05, + "loss": 0.0473, "step": 20300 }, { - "epoch": 1.3287536800785085, - "grad_norm": 0.7557688355445862, - "learning_rate": 9.892469145098766e-05, - "loss": 0.0889, + "epoch": 5.314033366045142, + "grad_norm": 0.416843056678772, + "learning_rate": 2.642740785939028e-05, + "loss": 0.04, "step": 20310 }, { - "epoch": 1.329407916257769, - "grad_norm": 1.1154882907867432, - "learning_rate": 9.892279578480783e-05, - "loss": 0.0977, + "epoch": 5.316650310762185, + "grad_norm": 0.47133147716522217, + "learning_rate": 2.6406772718046603e-05, + "loss": 0.0404, "step": 20320 }, { - "epoch": 1.3300621524370297, - "grad_norm": 1.0034886598587036, - "learning_rate": 9.892089846735533e-05, - "loss": 0.0942, + "epoch": 5.319267255479228, + "grad_norm": 0.37643080949783325, + "learning_rate": 2.638613661518703e-05, + "loss": 0.038, "step": 20330 }, { - "epoch": 1.3307163886162905, - "grad_norm": 0.837894082069397, - "learning_rate": 9.891899949869418e-05, - "loss": 0.1187, + "epoch": 5.321884200196271, + "grad_norm": 0.5968944430351257, + "learning_rate": 2.6365499564916163e-05, + "loss": 0.04, "step": 20340 }, { - "epoch": 1.3313706247955512, - "grad_norm": 0.8454033732414246, - "learning_rate": 9.891709887888853e-05, - "loss": 0.1037, + "epoch": 5.324501144913314, + "grad_norm": 0.4017389416694641, + "learning_rate": 2.6344861581339216e-05, + "loss": 0.0424, "step": 20350 }, { - "epoch": 1.332024860974812, - "grad_norm": 0.8291163444519043, - "learning_rate": 9.89151966080025e-05, - "loss": 0.1154, + "epoch": 5.327118089630357, + "grad_norm": 0.5358704924583435, + "learning_rate": 2.632422267856205e-05, + "loss": 0.039, "step": 20360 }, { - "epoch": 1.3326790971540725, - "grad_norm": 0.9798288941383362, - "learning_rate": 9.89132926861003e-05, - "loss": 0.1007, + "epoch": 5.329735034347399, + "grad_norm": 0.39304521679878235, + "learning_rate": 2.6303582870691175e-05, + "loss": 0.0434, "step": 20370 }, { - "epoch": 1.3333333333333333, - "grad_norm": 1.067610740661621, - "learning_rate": 9.891138711324619e-05, - "loss": 0.0952, + "epoch": 5.332351979064442, + "grad_norm": 0.3788582384586334, + "learning_rate": 2.6282942171833695e-05, + "loss": 0.0467, "step": 20380 }, { - "epoch": 1.333987569512594, - "grad_norm": 0.8580688834190369, - "learning_rate": 9.89094798895045e-05, - "loss": 0.0965, + "epoch": 5.334968923781485, + "grad_norm": 0.6729533672332764, + "learning_rate": 2.626230059609735e-05, + "loss": 0.0419, "step": 20390 }, { - "epoch": 1.3346418056918548, - "grad_norm": 0.9280464053153992, - "learning_rate": 9.890757101493958e-05, - "loss": 0.1037, + "epoch": 5.337585868498528, + "grad_norm": 0.5997945070266724, + "learning_rate": 2.6241658157590444e-05, + "loss": 0.047, "step": 20400 }, { - "epoch": 1.3352960418711155, - "grad_norm": 0.8157536387443542, - "learning_rate": 9.890566048961587e-05, - "loss": 0.097, + "epoch": 5.340202813215571, + "grad_norm": 0.5033451318740845, + "learning_rate": 2.6221014870421895e-05, + "loss": 0.0393, "step": 20410 }, { - "epoch": 1.3359502780503762, - "grad_norm": 0.8738055229187012, - "learning_rate": 9.890374831359787e-05, - "loss": 0.0991, + "epoch": 5.342819757932614, + "grad_norm": 0.5576150417327881, + "learning_rate": 2.6200370748701196e-05, + "loss": 0.034, "step": 20420 }, { - "epoch": 1.336604514229637, - "grad_norm": 0.9500564932823181, - "learning_rate": 9.89018344869501e-05, - "loss": 0.1093, + "epoch": 5.345436702649657, + "grad_norm": 0.6364089846611023, + "learning_rate": 2.6179725806538407e-05, + "loss": 0.0383, "step": 20430 }, { - "epoch": 1.3372587504088975, - "grad_norm": 0.8507956266403198, - "learning_rate": 9.889991900973717e-05, - "loss": 0.1148, + "epoch": 5.348053647366699, + "grad_norm": 0.5496673583984375, + "learning_rate": 2.615908005804416e-05, + "loss": 0.0428, "step": 20440 }, { - "epoch": 1.3379129865881583, - "grad_norm": 0.8578815460205078, - "learning_rate": 9.889800188202374e-05, - "loss": 0.1032, + "epoch": 5.350670592083742, + "grad_norm": 0.3670984208583832, + "learning_rate": 2.613843351732962e-05, + "loss": 0.0398, "step": 20450 }, { - "epoch": 1.338567222767419, - "grad_norm": 0.8666653633117676, - "learning_rate": 9.889608310387449e-05, - "loss": 0.1048, + "epoch": 5.353287536800785, + "grad_norm": 0.5018607974052429, + "learning_rate": 2.61177861985065e-05, + "loss": 0.0393, "step": 20460 }, { - "epoch": 1.3392214589466798, - "grad_norm": 0.7998878359794617, - "learning_rate": 9.88941626753542e-05, - "loss": 0.103, + "epoch": 5.355904481517828, + "grad_norm": 0.40461036562919617, + "learning_rate": 2.6097138115687057e-05, + "loss": 0.0376, "step": 20470 }, { - "epoch": 1.3398756951259405, - "grad_norm": 0.8280777931213379, - "learning_rate": 9.889224059652771e-05, - "loss": 0.107, + "epoch": 5.358521426234871, + "grad_norm": 0.5538284778594971, + "learning_rate": 2.607648928298405e-05, + "loss": 0.0434, "step": 20480 }, { - "epoch": 1.340529931305201, - "grad_norm": 0.9405080676078796, - "learning_rate": 9.889031686745987e-05, - "loss": 0.1129, + "epoch": 5.361138370951914, + "grad_norm": 0.40360862016677856, + "learning_rate": 2.6055839714510782e-05, + "loss": 0.0433, "step": 20490 }, { - "epoch": 1.341184167484462, - "grad_norm": 1.0205219984054565, - "learning_rate": 9.88883914882156e-05, - "loss": 0.101, + "epoch": 5.363755315668956, + "grad_norm": 0.419687956571579, + "learning_rate": 2.6035189424381024e-05, + "loss": 0.0416, "step": 20500 }, { - "epoch": 1.3418384036637225, - "grad_norm": 0.8715194463729858, - "learning_rate": 9.888646445885991e-05, - "loss": 0.1148, + "epoch": 5.366372260385999, + "grad_norm": 0.4900745451450348, + "learning_rate": 2.6014538426709046e-05, + "loss": 0.0361, "step": 20510 }, { - "epoch": 1.3424926398429833, - "grad_norm": 0.9229245185852051, - "learning_rate": 9.888453577945784e-05, - "loss": 0.0971, + "epoch": 5.368989205103042, + "grad_norm": 0.48524007201194763, + "learning_rate": 2.599388673560963e-05, + "loss": 0.0398, "step": 20520 }, { - "epoch": 1.343146876022244, - "grad_norm": 0.9182971119880676, - "learning_rate": 9.888260545007448e-05, - "loss": 0.0908, + "epoch": 5.371606149820085, + "grad_norm": 0.5027137994766235, + "learning_rate": 2.597323436519799e-05, + "loss": 0.0391, "step": 20530 }, { - "epoch": 1.3438011122015048, - "grad_norm": 0.9435850977897644, - "learning_rate": 9.888067347077499e-05, - "loss": 0.1113, + "epoch": 5.374223094537128, + "grad_norm": 0.31066951155662537, + "learning_rate": 2.5952581329589848e-05, + "loss": 0.0432, "step": 20540 }, { - "epoch": 1.3444553483807655, - "grad_norm": 0.9361865520477295, - "learning_rate": 9.887873984162457e-05, - "loss": 0.1119, + "epoch": 5.376840039254171, + "grad_norm": 0.5412424206733704, + "learning_rate": 2.593192764290135e-05, + "loss": 0.0409, "step": 20550 }, { - "epoch": 1.345109584560026, - "grad_norm": 0.9154676198959351, - "learning_rate": 9.887680456268848e-05, - "loss": 0.1064, + "epoch": 5.379456983971213, + "grad_norm": 0.3911846876144409, + "learning_rate": 2.591127331924909e-05, + "loss": 0.0382, "step": 20560 }, { - "epoch": 1.345763820739287, - "grad_norm": 1.0098940134048462, - "learning_rate": 9.887486763403207e-05, - "loss": 0.0951, + "epoch": 5.382073928688256, + "grad_norm": 0.31931638717651367, + "learning_rate": 2.5890618372750115e-05, + "loss": 0.0428, "step": 20570 }, { - "epoch": 1.3464180569185475, - "grad_norm": 0.7353399991989136, - "learning_rate": 9.88729290557207e-05, - "loss": 0.0975, + "epoch": 5.384690873405299, + "grad_norm": 0.5206882953643799, + "learning_rate": 2.5869962817521876e-05, + "loss": 0.0442, "step": 20580 }, { - "epoch": 1.3470722930978083, - "grad_norm": 0.7690742611885071, - "learning_rate": 9.88709888278198e-05, - "loss": 0.0931, + "epoch": 5.387307818122342, + "grad_norm": 0.5111570954322815, + "learning_rate": 2.5849306667682255e-05, + "loss": 0.041, "step": 20590 }, { - "epoch": 1.347726529277069, - "grad_norm": 0.780418872833252, - "learning_rate": 9.886904695039484e-05, - "loss": 0.0957, + "epoch": 5.389924762839385, + "grad_norm": 0.42905184626579285, + "learning_rate": 2.5828649937349535e-05, + "loss": 0.0406, "step": 20600 }, { - "epoch": 1.3483807654563298, - "grad_norm": 0.817748486995697, - "learning_rate": 9.88671034235114e-05, - "loss": 0.1011, + "epoch": 5.392541707556428, + "grad_norm": 0.5467479228973389, + "learning_rate": 2.580799264064239e-05, + "loss": 0.0422, "step": 20610 }, { - "epoch": 1.3490350016355905, - "grad_norm": 0.9198586940765381, - "learning_rate": 9.886515824723505e-05, - "loss": 0.0976, + "epoch": 5.395158652273471, + "grad_norm": 0.5880939960479736, + "learning_rate": 2.5787334791679906e-05, + "loss": 0.0384, "step": 20620 }, { - "epoch": 1.349689237814851, - "grad_norm": 0.8738722205162048, - "learning_rate": 9.886321142163147e-05, - "loss": 0.1132, + "epoch": 5.397775596990513, + "grad_norm": 0.7603851556777954, + "learning_rate": 2.5766676404581512e-05, + "loss": 0.0398, "step": 20630 }, { - "epoch": 1.3503434739941118, - "grad_norm": 0.7624223828315735, - "learning_rate": 9.886126294676634e-05, - "loss": 0.1022, + "epoch": 5.400392541707556, + "grad_norm": 0.5429633855819702, + "learning_rate": 2.5746017493467023e-05, + "loss": 0.0418, "step": 20640 }, { - "epoch": 1.3509977101733726, - "grad_norm": 0.9689080715179443, - "learning_rate": 9.885931282270545e-05, - "loss": 0.1014, + "epoch": 5.403009486424599, + "grad_norm": 0.3459802269935608, + "learning_rate": 2.5725358072456612e-05, + "loss": 0.0374, "step": 20650 }, { - "epoch": 1.3516519463526333, - "grad_norm": 0.7586194276809692, - "learning_rate": 9.885736104951462e-05, - "loss": 0.0962, + "epoch": 5.405626431141642, + "grad_norm": 0.5180562138557434, + "learning_rate": 2.5704698155670797e-05, + "loss": 0.042, "step": 20660 }, { - "epoch": 1.352306182531894, - "grad_norm": 0.8439247012138367, - "learning_rate": 9.88554076272597e-05, - "loss": 0.0948, + "epoch": 5.408243375858685, + "grad_norm": 0.4653855264186859, + "learning_rate": 2.5684037757230444e-05, + "loss": 0.0393, "step": 20670 }, { - "epoch": 1.3529604187111548, - "grad_norm": 1.0176705121994019, - "learning_rate": 9.885345255600666e-05, - "loss": 0.1043, + "epoch": 5.410860320575728, + "grad_norm": 0.34826746582984924, + "learning_rate": 2.566337689125673e-05, + "loss": 0.0391, "step": 20680 }, { - "epoch": 1.3536146548904155, - "grad_norm": 0.753044843673706, - "learning_rate": 9.885149583582148e-05, - "loss": 0.0954, + "epoch": 5.41347726529277, + "grad_norm": 0.43290433287620544, + "learning_rate": 2.5642715571871162e-05, + "loss": 0.0443, "step": 20690 }, { - "epoch": 1.354268891069676, - "grad_norm": 0.9865116477012634, - "learning_rate": 9.884953746677019e-05, - "loss": 0.1019, + "epoch": 5.416094210009813, + "grad_norm": 0.3276048004627228, + "learning_rate": 2.5622053813195568e-05, + "loss": 0.0412, "step": 20700 }, { - "epoch": 1.3549231272489368, - "grad_norm": 0.8925600051879883, - "learning_rate": 9.88475774489189e-05, - "loss": 0.1057, + "epoch": 5.418711154726856, + "grad_norm": 0.3624131381511688, + "learning_rate": 2.560139162935205e-05, + "loss": 0.0422, "step": 20710 }, { - "epoch": 1.3555773634281976, - "grad_norm": 0.8571544289588928, - "learning_rate": 9.884561578233375e-05, - "loss": 0.1052, + "epoch": 5.421328099443899, + "grad_norm": 0.4569075107574463, + "learning_rate": 2.5580729034463036e-05, + "loss": 0.0422, "step": 20720 }, { - "epoch": 1.3562315996074583, - "grad_norm": 0.7800583839416504, - "learning_rate": 9.884365246708098e-05, - "loss": 0.0942, + "epoch": 5.423945044160942, + "grad_norm": 0.35856783390045166, + "learning_rate": 2.5560066042651192e-05, + "loss": 0.0433, "step": 20730 }, { - "epoch": 1.356885835786719, - "grad_norm": 0.9630329608917236, - "learning_rate": 9.884168750322684e-05, - "loss": 0.1151, + "epoch": 5.426561988877985, + "grad_norm": 0.5207006335258484, + "learning_rate": 2.553940266803949e-05, + "loss": 0.0404, "step": 20740 }, { - "epoch": 1.3575400719659796, - "grad_norm": 1.2140957117080688, - "learning_rate": 9.883972089083766e-05, - "loss": 0.1038, + "epoch": 5.429178933595027, + "grad_norm": 0.5183231234550476, + "learning_rate": 2.5518738924751155e-05, + "loss": 0.0358, "step": 20750 }, { - "epoch": 1.3581943081452406, - "grad_norm": 0.7877089977264404, - "learning_rate": 9.883775262997981e-05, - "loss": 0.0968, + "epoch": 5.43179587831207, + "grad_norm": 0.450226753950119, + "learning_rate": 2.549807482690965e-05, + "loss": 0.0376, "step": 20760 }, { - "epoch": 1.358848544324501, - "grad_norm": 0.9409202933311462, - "learning_rate": 9.883578272071971e-05, - "loss": 0.1003, + "epoch": 5.434412823029113, + "grad_norm": 0.42255696654319763, + "learning_rate": 2.547741038863871e-05, + "loss": 0.0359, "step": 20770 }, { - "epoch": 1.3595027805037618, - "grad_norm": 1.0525962114334106, - "learning_rate": 9.883381116312389e-05, - "loss": 0.0987, + "epoch": 5.437029767746156, + "grad_norm": 0.6406026482582092, + "learning_rate": 2.545674562406226e-05, + "loss": 0.0421, "step": 20780 }, { - "epoch": 1.3601570166830226, - "grad_norm": 0.8905682563781738, - "learning_rate": 9.883183795725885e-05, - "loss": 0.0976, + "epoch": 5.439646712463199, + "grad_norm": 0.3330078721046448, + "learning_rate": 2.5436080547304485e-05, + "loss": 0.0397, "step": 20790 }, { - "epoch": 1.3608112528622833, - "grad_norm": 0.860859215259552, - "learning_rate": 9.882986310319124e-05, - "loss": 0.1083, + "epoch": 5.442263657180242, + "grad_norm": 0.32984280586242676, + "learning_rate": 2.541541517248977e-05, + "loss": 0.0357, "step": 20800 }, { - "epoch": 1.361465489041544, - "grad_norm": 1.0243405103683472, - "learning_rate": 9.882788660098768e-05, - "loss": 0.0915, + "epoch": 5.444880601897285, + "grad_norm": 0.2796110510826111, + "learning_rate": 2.53947495137427e-05, + "loss": 0.0369, "step": 20810 }, { - "epoch": 1.3621197252208046, - "grad_norm": 1.0279419422149658, - "learning_rate": 9.882590845071487e-05, - "loss": 0.1119, + "epoch": 5.447497546614327, + "grad_norm": 0.3588564991950989, + "learning_rate": 2.537408358518807e-05, + "loss": 0.0405, "step": 20820 }, { - "epoch": 1.3627739614000653, - "grad_norm": 1.0260818004608154, - "learning_rate": 9.882392865243961e-05, - "loss": 0.105, + "epoch": 5.45011449133137, + "grad_norm": 0.41805946826934814, + "learning_rate": 2.5353417400950825e-05, + "loss": 0.0419, "step": 20830 }, { - "epoch": 1.363428197579326, - "grad_norm": 0.9079784750938416, - "learning_rate": 9.882194720622873e-05, - "loss": 0.0935, + "epoch": 5.452731436048413, + "grad_norm": 0.25932222604751587, + "learning_rate": 2.5332750975156115e-05, + "loss": 0.0359, "step": 20840 }, { - "epoch": 1.3640824337585868, - "grad_norm": 0.8685601949691772, - "learning_rate": 9.881996411214906e-05, - "loss": 0.0976, + "epoch": 5.455348380765456, + "grad_norm": 0.37364596128463745, + "learning_rate": 2.531208432192926e-05, + "loss": 0.041, "step": 20850 }, { - "epoch": 1.3647366699378476, - "grad_norm": 0.7832821011543274, - "learning_rate": 9.88179793702676e-05, - "loss": 0.1072, + "epoch": 5.457965325482499, + "grad_norm": 0.47812214493751526, + "learning_rate": 2.529141745539571e-05, + "loss": 0.0375, "step": 20860 }, { - "epoch": 1.3653909061171083, - "grad_norm": 0.6875680685043335, - "learning_rate": 9.88159929806513e-05, - "loss": 0.0963, + "epoch": 5.460582270199542, + "grad_norm": 0.6243589520454407, + "learning_rate": 2.527075038968108e-05, + "loss": 0.0466, "step": 20870 }, { - "epoch": 1.366045142296369, - "grad_norm": 0.8509207367897034, - "learning_rate": 9.881400494336719e-05, - "loss": 0.1061, + "epoch": 5.463199214916585, + "grad_norm": 0.45652157068252563, + "learning_rate": 2.5250083138911107e-05, + "loss": 0.0359, "step": 20880 }, { - "epoch": 1.3666993784756296, - "grad_norm": 0.7752100229263306, - "learning_rate": 9.88120152584824e-05, - "loss": 0.1041, + "epoch": 5.465816159633627, + "grad_norm": 0.4943414330482483, + "learning_rate": 2.5229415717211667e-05, + "loss": 0.0402, "step": 20890 }, { - "epoch": 1.3673536146548904, - "grad_norm": 0.7963639497756958, - "learning_rate": 9.88100239260641e-05, - "loss": 0.1042, + "epoch": 5.46843310435067, + "grad_norm": 0.6210593581199646, + "learning_rate": 2.5208748138708753e-05, + "loss": 0.0427, "step": 20900 }, { - "epoch": 1.368007850834151, - "grad_norm": 0.8341352939605713, - "learning_rate": 9.880803094617948e-05, - "loss": 0.0957, + "epoch": 5.471050049067713, + "grad_norm": 0.391830712556839, + "learning_rate": 2.5188080417528454e-05, + "loss": 0.0438, "step": 20910 }, { - "epoch": 1.3686620870134119, - "grad_norm": 0.7604305744171143, - "learning_rate": 9.88060363188958e-05, - "loss": 0.1033, + "epoch": 5.473666993784756, + "grad_norm": 0.4893028736114502, + "learning_rate": 2.5167412567796968e-05, + "loss": 0.0433, "step": 20920 }, { - "epoch": 1.3693163231926726, - "grad_norm": 0.8400859236717224, - "learning_rate": 9.880404004428039e-05, - "loss": 0.0955, + "epoch": 5.476283938501799, + "grad_norm": 0.5769691467285156, + "learning_rate": 2.5146744603640555e-05, + "loss": 0.0374, "step": 20930 }, { - "epoch": 1.3699705593719333, - "grad_norm": 0.8343890905380249, - "learning_rate": 9.880204212240065e-05, - "loss": 0.0925, + "epoch": 5.478900883218842, + "grad_norm": 0.4640278220176697, + "learning_rate": 2.5126076539185593e-05, + "loss": 0.0423, "step": 20940 }, { - "epoch": 1.370624795551194, - "grad_norm": 0.9283037781715393, - "learning_rate": 9.880004255332399e-05, - "loss": 0.1094, + "epoch": 5.4815178279358845, + "grad_norm": 0.535298228263855, + "learning_rate": 2.510540838855852e-05, + "loss": 0.0405, "step": 20950 }, { - "epoch": 1.3712790317304546, - "grad_norm": 1.0284000635147095, - "learning_rate": 9.879804133711792e-05, - "loss": 0.0951, + "epoch": 5.4841347726529275, + "grad_norm": 0.5258188247680664, + "learning_rate": 2.5084740165885795e-05, + "loss": 0.0381, "step": 20960 }, { - "epoch": 1.3719332679097154, - "grad_norm": 1.0388516187667847, - "learning_rate": 9.879603847384997e-05, - "loss": 0.106, + "epoch": 5.4867517173699705, + "grad_norm": 0.3611849546432495, + "learning_rate": 2.5064071885293964e-05, + "loss": 0.0448, "step": 20970 }, { - "epoch": 1.3725875040889761, - "grad_norm": 0.9273998141288757, - "learning_rate": 9.879403396358775e-05, - "loss": 0.0964, + "epoch": 5.4893686620870135, + "grad_norm": 0.693965494632721, + "learning_rate": 2.5043403560909605e-05, + "loss": 0.0463, "step": 20980 }, { - "epoch": 1.3732417402682369, - "grad_norm": 0.8812925815582275, - "learning_rate": 9.879202780639892e-05, - "loss": 0.1022, + "epoch": 5.4919856068040565, + "grad_norm": 0.4176058769226074, + "learning_rate": 2.5022735206859323e-05, + "loss": 0.0416, "step": 20990 }, { - "epoch": 1.3738959764474976, - "grad_norm": 0.8167784214019775, - "learning_rate": 9.87900200023512e-05, - "loss": 0.113, + "epoch": 5.494602551521099, + "grad_norm": 0.3145519495010376, + "learning_rate": 2.500206683726975e-05, + "loss": 0.0377, + "step": 21000 + }, + { + "epoch": 5.494602551521099, + "eval_loss": 0.04668252001432974, + "eval_runtime": 9.0331, + "eval_samples_per_second": 113.36, + "eval_steps_per_second": 1.771, "step": 21000 }, { - "epoch": 1.3745502126267581, - "grad_norm": 0.961060106754303, - "learning_rate": 9.878801055151232e-05, - "loss": 0.0984, + "epoch": 5.4972194962381415, + "grad_norm": 0.4744648337364197, + "learning_rate": 2.4981398466267496e-05, + "loss": 0.0433, "step": 21010 }, { - "epoch": 1.375204448806019, - "grad_norm": 1.0047707557678223, - "learning_rate": 9.878599945395015e-05, - "loss": 0.0912, + "epoch": 5.4998364409551845, + "grad_norm": 0.48282021284103394, + "learning_rate": 2.4960730107979233e-05, + "loss": 0.0415, "step": 21020 }, { - "epoch": 1.3758586849852796, - "grad_norm": 1.0364177227020264, - "learning_rate": 9.878398670973256e-05, - "loss": 0.1161, + "epoch": 5.5024533856722275, + "grad_norm": 0.34886685013771057, + "learning_rate": 2.4940061776531565e-05, + "loss": 0.0409, "step": 21030 }, { - "epoch": 1.3765129211645404, - "grad_norm": 0.7439346313476562, - "learning_rate": 9.878197231892747e-05, - "loss": 0.1047, + "epoch": 5.5050703303892705, + "grad_norm": 0.5502273440361023, + "learning_rate": 2.49193934860511e-05, + "loss": 0.0451, "step": 21040 }, { - "epoch": 1.3771671573438011, - "grad_norm": 0.9560453295707703, - "learning_rate": 9.877995628160288e-05, - "loss": 0.0911, + "epoch": 5.5076872751063135, + "grad_norm": 0.33859339356422424, + "learning_rate": 2.4898725250664433e-05, + "loss": 0.0389, "step": 21050 }, { - "epoch": 1.3778213935230619, - "grad_norm": 0.8289201855659485, - "learning_rate": 9.877793859782683e-05, - "loss": 0.092, + "epoch": 5.5103042198233565, + "grad_norm": 0.25911879539489746, + "learning_rate": 2.48780570844981e-05, + "loss": 0.036, "step": 21060 }, { - "epoch": 1.3784756297023226, - "grad_norm": 0.9151121377944946, - "learning_rate": 9.877591926766743e-05, - "loss": 0.0996, + "epoch": 5.5129211645403995, + "grad_norm": 0.2707931697368622, + "learning_rate": 2.4857389001678606e-05, + "loss": 0.0385, "step": 21070 }, { - "epoch": 1.3791298658815832, - "grad_norm": 0.8370741605758667, - "learning_rate": 9.877389829119284e-05, - "loss": 0.0936, + "epoch": 5.515538109257442, + "grad_norm": 0.3240604102611542, + "learning_rate": 2.4836721016332374e-05, + "loss": 0.0399, "step": 21080 }, { - "epoch": 1.379784102060844, - "grad_norm": 0.9695648550987244, - "learning_rate": 9.877187566847125e-05, - "loss": 0.0963, + "epoch": 5.518155053974485, + "grad_norm": 0.551508903503418, + "learning_rate": 2.4816053142585792e-05, + "loss": 0.0388, "step": 21090 }, { - "epoch": 1.3804383382401046, - "grad_norm": 0.9655565619468689, - "learning_rate": 9.876985139957098e-05, - "loss": 0.1032, + "epoch": 5.520771998691528, + "grad_norm": 0.43382567167282104, + "learning_rate": 2.4795385394565154e-05, + "loss": 0.0383, "step": 21100 }, { - "epoch": 1.3810925744193654, - "grad_norm": 0.9897701144218445, - "learning_rate": 9.876782548456029e-05, - "loss": 0.1066, + "epoch": 5.523388943408571, + "grad_norm": 0.4543988108634949, + "learning_rate": 2.4774717786396666e-05, + "loss": 0.039, "step": 21110 }, { - "epoch": 1.3817468105986261, - "grad_norm": 1.0255415439605713, - "learning_rate": 9.876579792350762e-05, - "loss": 0.1065, + "epoch": 5.5260058881256136, + "grad_norm": 0.45710262656211853, + "learning_rate": 2.4754050332206458e-05, + "loss": 0.0348, "step": 21120 }, { - "epoch": 1.3824010467778869, - "grad_norm": 0.9819886684417725, - "learning_rate": 9.876376871648137e-05, - "loss": 0.0994, + "epoch": 5.5286228328426565, + "grad_norm": 0.3298129439353943, + "learning_rate": 2.4733383046120523e-05, + "loss": 0.0423, "step": 21130 }, { - "epoch": 1.3830552829571476, - "grad_norm": 0.9188646078109741, - "learning_rate": 9.876173786355003e-05, - "loss": 0.107, + "epoch": 5.5312397775596995, + "grad_norm": 0.5027996897697449, + "learning_rate": 2.471271594226476e-05, + "loss": 0.0369, "step": 21140 }, { - "epoch": 1.3837095191364082, - "grad_norm": 0.8057947158813477, - "learning_rate": 9.875970536478213e-05, - "loss": 0.0964, + "epoch": 5.533856722276742, + "grad_norm": 0.49993273615837097, + "learning_rate": 2.469204903476494e-05, + "loss": 0.0426, "step": 21150 }, { - "epoch": 1.384363755315669, - "grad_norm": 0.8052737712860107, - "learning_rate": 9.875767122024634e-05, - "loss": 0.1005, + "epoch": 5.536473666993785, + "grad_norm": 0.5039677619934082, + "learning_rate": 2.46713823377467e-05, + "loss": 0.0372, "step": 21160 }, { - "epoch": 1.3850179914949297, - "grad_norm": 0.8417797684669495, - "learning_rate": 9.875563543001125e-05, - "loss": 0.0942, + "epoch": 5.539090611710828, + "grad_norm": 0.5910190939903259, + "learning_rate": 2.465071586533554e-05, + "loss": 0.0405, "step": 21170 }, { - "epoch": 1.3856722276741904, - "grad_norm": 1.070704698562622, - "learning_rate": 9.875359799414561e-05, - "loss": 0.099, + "epoch": 5.541707556427871, + "grad_norm": 0.5953308939933777, + "learning_rate": 2.4630049631656782e-05, + "loss": 0.0379, "step": 21180 }, { - "epoch": 1.3863264638534512, - "grad_norm": 0.871180534362793, - "learning_rate": 9.875155891271817e-05, - "loss": 0.0993, + "epoch": 5.544324501144914, + "grad_norm": 0.31143149733543396, + "learning_rate": 2.4609383650835616e-05, + "loss": 0.0414, "step": 21190 }, { - "epoch": 1.3869807000327117, - "grad_norm": 0.9643855690956116, - "learning_rate": 9.874951818579776e-05, - "loss": 0.1023, + "epoch": 5.546941445861956, + "grad_norm": 0.3030470609664917, + "learning_rate": 2.4588717936997045e-05, + "loss": 0.0434, "step": 21200 }, { - "epoch": 1.3876349362119726, - "grad_norm": 0.8685986399650574, - "learning_rate": 9.874747581345328e-05, - "loss": 0.0985, + "epoch": 5.549558390578999, + "grad_norm": 0.2863839566707611, + "learning_rate": 2.456805250426589e-05, + "loss": 0.0352, "step": 21210 }, { - "epoch": 1.3882891723912332, - "grad_norm": 0.7610760927200317, - "learning_rate": 9.874543179575362e-05, - "loss": 0.0978, + "epoch": 5.552175335296042, + "grad_norm": 0.37792912125587463, + "learning_rate": 2.454738736676677e-05, + "loss": 0.038, "step": 21220 }, { - "epoch": 1.388943408570494, - "grad_norm": 0.7402287721633911, - "learning_rate": 9.874338613276781e-05, - "loss": 0.1047, + "epoch": 5.554792280013085, + "grad_norm": 0.3757082521915436, + "learning_rate": 2.4526722538624118e-05, + "loss": 0.0414, "step": 21230 }, { - "epoch": 1.3895976447497547, - "grad_norm": 0.8294671773910522, - "learning_rate": 9.874133882456489e-05, - "loss": 0.0976, + "epoch": 5.557409224730128, + "grad_norm": 0.3112926185131073, + "learning_rate": 2.4506058033962146e-05, + "loss": 0.0403, "step": 21240 }, { - "epoch": 1.3902518809290154, - "grad_norm": 1.0377382040023804, - "learning_rate": 9.873928987121394e-05, - "loss": 0.1077, + "epoch": 5.560026169447171, + "grad_norm": 0.30436331033706665, + "learning_rate": 2.448539386690485e-05, + "loss": 0.0412, "step": 21250 }, { - "epoch": 1.3909061171082762, - "grad_norm": 0.8378622531890869, - "learning_rate": 9.873723927278414e-05, - "loss": 0.0988, + "epoch": 5.562643114164214, + "grad_norm": 0.30948469042778015, + "learning_rate": 2.4464730051575994e-05, + "loss": 0.0351, "step": 21260 }, { - "epoch": 1.3915603532875367, - "grad_norm": 0.7670599222183228, - "learning_rate": 9.87351870293447e-05, - "loss": 0.1022, + "epoch": 5.565260058881256, + "grad_norm": 0.3362123668193817, + "learning_rate": 2.4444066602099102e-05, + "loss": 0.0367, "step": 21270 }, { - "epoch": 1.3922145894667974, - "grad_norm": 0.7315048575401306, - "learning_rate": 9.873313314096488e-05, - "loss": 0.0977, + "epoch": 5.567877003598299, + "grad_norm": 0.49534282088279724, + "learning_rate": 2.4423403532597443e-05, + "loss": 0.0352, "step": 21280 }, { - "epoch": 1.3928688256460582, - "grad_norm": 0.960702121257782, - "learning_rate": 9.873107760771401e-05, - "loss": 0.0966, + "epoch": 5.570493948315342, + "grad_norm": 0.4199749529361725, + "learning_rate": 2.440274085719403e-05, + "loss": 0.0385, "step": 21290 }, { - "epoch": 1.393523061825319, - "grad_norm": 0.7483635544776917, - "learning_rate": 9.872902042966147e-05, - "loss": 0.0883, + "epoch": 5.573110893032385, + "grad_norm": 0.4081372618675232, + "learning_rate": 2.4382078590011622e-05, + "loss": 0.0405, "step": 21300 }, { - "epoch": 1.3941772980045797, - "grad_norm": 0.8449085354804993, - "learning_rate": 9.872696160687669e-05, - "loss": 0.1002, + "epoch": 5.575727837749428, + "grad_norm": 0.26627975702285767, + "learning_rate": 2.4361416745172665e-05, + "loss": 0.0434, "step": 21310 }, { - "epoch": 1.3948315341838404, - "grad_norm": 0.8629240393638611, - "learning_rate": 9.872490113942918e-05, - "loss": 0.0977, + "epoch": 5.578344782466471, + "grad_norm": 0.43419206142425537, + "learning_rate": 2.4340755336799337e-05, + "loss": 0.0365, "step": 21320 }, { - "epoch": 1.3954857703631012, - "grad_norm": 0.8286533951759338, - "learning_rate": 9.872283902738845e-05, - "loss": 0.0968, + "epoch": 5.580961727183514, + "grad_norm": 0.6178485155105591, + "learning_rate": 2.4320094379013523e-05, + "loss": 0.0397, "step": 21330 }, { - "epoch": 1.3961400065423617, - "grad_norm": 0.8153654932975769, - "learning_rate": 9.872077527082413e-05, - "loss": 0.1113, + "epoch": 5.583578671900556, + "grad_norm": 0.3559263050556183, + "learning_rate": 2.4299433885936784e-05, + "loss": 0.0401, "step": 21340 }, { - "epoch": 1.3967942427216224, - "grad_norm": 0.7683613896369934, - "learning_rate": 9.871870986980587e-05, - "loss": 0.0887, + "epoch": 5.586195616617599, + "grad_norm": 0.5083751082420349, + "learning_rate": 2.4278773871690386e-05, + "loss": 0.0426, "step": 21350 }, { - "epoch": 1.3974484789008832, - "grad_norm": 1.0355619192123413, - "learning_rate": 9.871664282440339e-05, - "loss": 0.1074, + "epoch": 5.588812561334642, + "grad_norm": 0.5449803471565247, + "learning_rate": 2.425811435039524e-05, + "loss": 0.0408, "step": 21360 }, { - "epoch": 1.398102715080144, - "grad_norm": 0.8770645260810852, - "learning_rate": 9.871457413468644e-05, - "loss": 0.0993, + "epoch": 5.591429506051685, + "grad_norm": 0.5429864525794983, + "learning_rate": 2.4237455336171944e-05, + "loss": 0.0424, "step": 21370 }, { - "epoch": 1.3987569512594047, - "grad_norm": 0.709586501121521, - "learning_rate": 9.871250380072487e-05, - "loss": 0.094, + "epoch": 5.594046450768728, + "grad_norm": 0.32913637161254883, + "learning_rate": 2.421679684314073e-05, + "loss": 0.0377, "step": 21380 }, { - "epoch": 1.3994111874386654, - "grad_norm": 0.7333911061286926, - "learning_rate": 9.871043182258852e-05, - "loss": 0.1037, + "epoch": 5.596663395485771, + "grad_norm": 0.2501143217086792, + "learning_rate": 2.4196138885421488e-05, + "loss": 0.0342, "step": 21390 }, { - "epoch": 1.4000654236179262, - "grad_norm": 0.7974872589111328, - "learning_rate": 9.870835820034736e-05, - "loss": 0.0953, + "epoch": 5.599280340202813, + "grad_norm": 0.3179362714290619, + "learning_rate": 2.417548147713375e-05, + "loss": 0.041, "step": 21400 }, { - "epoch": 1.4007196597971867, - "grad_norm": 0.8069256544113159, - "learning_rate": 9.870628293407138e-05, - "loss": 0.0922, + "epoch": 5.601897284919856, + "grad_norm": 0.4324061870574951, + "learning_rate": 2.4154824632396645e-05, + "loss": 0.0394, "step": 21410 }, { - "epoch": 1.4013738959764475, - "grad_norm": 0.8490813374519348, - "learning_rate": 9.870420602383059e-05, - "loss": 0.1018, + "epoch": 5.604514229636899, + "grad_norm": 0.49988454580307007, + "learning_rate": 2.4134168365328925e-05, + "loss": 0.0433, "step": 21420 }, { - "epoch": 1.4020281321557082, - "grad_norm": 0.9025737643241882, - "learning_rate": 9.870212746969514e-05, - "loss": 0.0992, + "epoch": 5.607131174353942, + "grad_norm": 0.5067137479782104, + "learning_rate": 2.411351269004897e-05, + "loss": 0.0398, "step": 21430 }, { - "epoch": 1.402682368334969, - "grad_norm": 0.7554858922958374, - "learning_rate": 9.870004727173514e-05, - "loss": 0.1055, + "epoch": 5.609748119070985, + "grad_norm": 0.5921526551246643, + "learning_rate": 2.4092857620674725e-05, + "loss": 0.0411, "step": 21440 }, { - "epoch": 1.4033366045142297, - "grad_norm": 0.9373718500137329, - "learning_rate": 9.869796543002083e-05, - "loss": 0.1052, + "epoch": 5.612365063788028, + "grad_norm": 0.3700776994228363, + "learning_rate": 2.4072203171323748e-05, + "loss": 0.0352, "step": 21450 }, { - "epoch": 1.4039908406934902, - "grad_norm": 0.916695237159729, - "learning_rate": 9.869588194462249e-05, - "loss": 0.103, + "epoch": 5.61498200850507, + "grad_norm": 0.3668251633644104, + "learning_rate": 2.405154935611315e-05, + "loss": 0.0357, "step": 21460 }, { - "epoch": 1.4046450768727512, - "grad_norm": 1.047790288925171, - "learning_rate": 9.869379681561041e-05, - "loss": 0.0934, + "epoch": 5.617598953222113, + "grad_norm": 0.7859848141670227, + "learning_rate": 2.403089618915963e-05, + "loss": 0.0422, "step": 21470 }, { - "epoch": 1.4052993130520117, - "grad_norm": 0.8690340518951416, - "learning_rate": 9.869171004305497e-05, - "loss": 0.0933, + "epoch": 5.620215897939156, + "grad_norm": 0.4612215459346771, + "learning_rate": 2.401024368457942e-05, + "loss": 0.0441, "step": 21480 }, { - "epoch": 1.4059535492312725, - "grad_norm": 0.8632329106330872, - "learning_rate": 9.868962162702664e-05, - "loss": 0.1004, + "epoch": 5.622832842656199, + "grad_norm": 0.46146160364151, + "learning_rate": 2.398959185648833e-05, + "loss": 0.0383, "step": 21490 }, { - "epoch": 1.4066077854105332, - "grad_norm": 0.8703600168228149, - "learning_rate": 9.868753156759587e-05, - "loss": 0.0961, + "epoch": 5.625449787373242, + "grad_norm": 0.5035638213157654, + "learning_rate": 2.396894071900167e-05, + "loss": 0.0337, "step": 21500 }, { - "epoch": 1.407262021589794, - "grad_norm": 0.9060193300247192, - "learning_rate": 9.868543986483325e-05, - "loss": 0.1021, + "epoch": 5.628066732090285, + "grad_norm": 0.37995830178260803, + "learning_rate": 2.394829028623431e-05, + "loss": 0.04, "step": 21510 }, { - "epoch": 1.4079162577690547, - "grad_norm": 1.0863100290298462, - "learning_rate": 9.868334651880932e-05, - "loss": 0.1095, + "epoch": 5.630683676807328, + "grad_norm": 0.5449048280715942, + "learning_rate": 2.3927640572300613e-05, + "loss": 0.0452, "step": 21520 }, { - "epoch": 1.4085704939483152, - "grad_norm": 0.837095320224762, - "learning_rate": 9.868125152959477e-05, - "loss": 0.0981, + "epoch": 5.63330062152437, + "grad_norm": 0.4563062787055969, + "learning_rate": 2.3906991591314485e-05, + "loss": 0.0381, "step": 21530 }, { - "epoch": 1.409224730127576, - "grad_norm": 1.0157208442687988, - "learning_rate": 9.867915489726034e-05, - "loss": 0.102, + "epoch": 5.635917566241413, + "grad_norm": 0.4141029417514801, + "learning_rate": 2.388634335738929e-05, + "loss": 0.0419, "step": 21540 }, { - "epoch": 1.4098789663068367, - "grad_norm": 0.9771285057067871, - "learning_rate": 9.867705662187673e-05, - "loss": 0.1063, + "epoch": 5.638534510958456, + "grad_norm": 0.4503481984138489, + "learning_rate": 2.386569588463791e-05, + "loss": 0.0389, "step": 21550 }, { - "epoch": 1.4105332024860975, - "grad_norm": 0.9457305669784546, - "learning_rate": 9.867495670351483e-05, - "loss": 0.094, + "epoch": 5.641151455675499, + "grad_norm": 0.5653615593910217, + "learning_rate": 2.3845049187172696e-05, + "loss": 0.0381, "step": 21560 }, { - "epoch": 1.4111874386653582, - "grad_norm": 0.90764981508255, - "learning_rate": 9.867285514224547e-05, - "loss": 0.0918, + "epoch": 5.643768400392542, + "grad_norm": 0.36770060658454895, + "learning_rate": 2.3824403279105474e-05, + "loss": 0.0375, "step": 21570 }, { - "epoch": 1.411841674844619, - "grad_norm": 0.8110638856887817, - "learning_rate": 9.867075193813959e-05, - "loss": 0.1003, + "epoch": 5.646385345109585, + "grad_norm": 0.42274710536003113, + "learning_rate": 2.380375817454754e-05, + "loss": 0.0384, "step": 21580 }, { - "epoch": 1.4124959110238797, - "grad_norm": 1.1416128873825073, - "learning_rate": 9.866864709126821e-05, - "loss": 0.1112, + "epoch": 5.649002289826628, + "grad_norm": 0.443455308675766, + "learning_rate": 2.3783113887609595e-05, + "loss": 0.0414, "step": 21590 }, { - "epoch": 1.4131501472031402, - "grad_norm": 1.088036060333252, - "learning_rate": 9.866654060170234e-05, - "loss": 0.1099, + "epoch": 5.65161923454367, + "grad_norm": 0.3747701644897461, + "learning_rate": 2.376247043240184e-05, + "loss": 0.0345, "step": 21600 }, { - "epoch": 1.413804383382401, - "grad_norm": 0.9464057087898254, - "learning_rate": 9.866443246951308e-05, - "loss": 0.0975, + "epoch": 5.654236179260713, + "grad_norm": 0.6288304924964905, + "learning_rate": 2.3741827823033872e-05, + "loss": 0.0398, "step": 21610 }, { - "epoch": 1.4144586195616617, - "grad_norm": 0.8685123324394226, - "learning_rate": 9.866232269477162e-05, - "loss": 0.0942, + "epoch": 5.656853123977756, + "grad_norm": 0.29721206426620483, + "learning_rate": 2.372118607361472e-05, + "loss": 0.0399, "step": 21620 }, { - "epoch": 1.4151128557409225, - "grad_norm": 0.7753238677978516, - "learning_rate": 9.866021127754915e-05, - "loss": 0.0973, + "epoch": 5.659470068694799, + "grad_norm": 0.4514649510383606, + "learning_rate": 2.3700545198252836e-05, + "loss": 0.0396, "step": 21630 }, { - "epoch": 1.4157670919201832, - "grad_norm": 0.8483602404594421, - "learning_rate": 9.865809821791692e-05, - "loss": 0.1016, + "epoch": 5.662087013411842, + "grad_norm": 0.5440108180046082, + "learning_rate": 2.367990521105605e-05, + "loss": 0.0339, "step": 21640 }, { - "epoch": 1.4164213280994438, - "grad_norm": 0.712035596370697, - "learning_rate": 9.865598351594627e-05, - "loss": 0.0985, + "epoch": 5.664703958128884, + "grad_norm": 0.4818888008594513, + "learning_rate": 2.365926612613161e-05, + "loss": 0.0411, "step": 21650 }, { - "epoch": 1.4170755642787047, - "grad_norm": 0.87371426820755, - "learning_rate": 9.865386717170856e-05, - "loss": 0.0952, + "epoch": 5.667320902845927, + "grad_norm": 0.3079739511013031, + "learning_rate": 2.3638627957586124e-05, + "loss": 0.0367, "step": 21660 }, { - "epoch": 1.4177298004579653, - "grad_norm": 1.058411717414856, - "learning_rate": 9.865174918527525e-05, - "loss": 0.0956, + "epoch": 5.66993784756297, + "grad_norm": 0.4219807982444763, + "learning_rate": 2.3617990719525594e-05, + "loss": 0.0349, "step": 21670 }, { - "epoch": 1.418384036637226, - "grad_norm": 0.7664424777030945, - "learning_rate": 9.864962955671779e-05, - "loss": 0.0889, + "epoch": 5.672554792280013, + "grad_norm": 0.3461092710494995, + "learning_rate": 2.3597354426055383e-05, + "loss": 0.0423, "step": 21680 }, { - "epoch": 1.4190382728164868, - "grad_norm": 0.9568914771080017, - "learning_rate": 9.864750828610776e-05, - "loss": 0.102, + "epoch": 5.675171736997056, + "grad_norm": 0.3110405206680298, + "learning_rate": 2.3576719091280193e-05, + "loss": 0.033, "step": 21690 }, { - "epoch": 1.4196925089957475, - "grad_norm": 0.8429945707321167, - "learning_rate": 9.864538537351675e-05, - "loss": 0.1018, + "epoch": 5.677788681714099, + "grad_norm": 0.6025128364562988, + "learning_rate": 2.3556084729304074e-05, + "loss": 0.0418, "step": 21700 }, { - "epoch": 1.4203467451750083, - "grad_norm": 1.3158056735992432, - "learning_rate": 9.864326081901639e-05, - "loss": 0.0934, + "epoch": 5.680405626431142, + "grad_norm": 0.46335169672966003, + "learning_rate": 2.353545135423044e-05, + "loss": 0.0355, "step": 21710 }, { - "epoch": 1.4210009813542688, - "grad_norm": 0.8673258423805237, - "learning_rate": 9.864113462267841e-05, - "loss": 0.0917, + "epoch": 5.683022571148184, + "grad_norm": 0.4924328625202179, + "learning_rate": 2.3514818980161986e-05, + "loss": 0.0411, "step": 21720 }, { - "epoch": 1.4216552175335295, - "grad_norm": 1.0829505920410156, - "learning_rate": 9.863900678457457e-05, - "loss": 0.1176, + "epoch": 5.685639515865227, + "grad_norm": 0.3370501399040222, + "learning_rate": 2.3494187621200757e-05, + "loss": 0.0396, "step": 21730 }, { - "epoch": 1.4223094537127903, - "grad_norm": 0.9344543218612671, - "learning_rate": 9.86368773047767e-05, - "loss": 0.0962, + "epoch": 5.68825646058227, + "grad_norm": 0.44423383474349976, + "learning_rate": 2.347355729144809e-05, + "loss": 0.0405, "step": 21740 }, { - "epoch": 1.422963689892051, - "grad_norm": 0.9904654026031494, - "learning_rate": 9.863474618335666e-05, - "loss": 0.1118, + "epoch": 5.690873405299313, + "grad_norm": 0.3318318724632263, + "learning_rate": 2.3452928005004623e-05, + "loss": 0.0389, "step": 21750 }, { - "epoch": 1.4236179260713118, - "grad_norm": 0.864806592464447, - "learning_rate": 9.863261342038639e-05, - "loss": 0.0914, + "epoch": 5.693490350016356, + "grad_norm": 0.5180429220199585, + "learning_rate": 2.3432299775970274e-05, + "loss": 0.0389, "step": 21760 }, { - "epoch": 1.4242721622505725, - "grad_norm": 0.8543987274169922, - "learning_rate": 9.863047901593786e-05, - "loss": 0.1041, + "epoch": 5.696107294733399, + "grad_norm": 0.3516453504562378, + "learning_rate": 2.3411672618444252e-05, + "loss": 0.0374, "step": 21770 }, { - "epoch": 1.4249263984298333, - "grad_norm": 0.8456289172172546, - "learning_rate": 9.862834297008314e-05, - "loss": 0.1148, + "epoch": 5.698724239450442, + "grad_norm": 0.4864489734172821, + "learning_rate": 2.339104654652501e-05, + "loss": 0.0367, "step": 21780 }, { - "epoch": 1.4255806346090938, - "grad_norm": 0.9336904287338257, - "learning_rate": 9.862620528289431e-05, - "loss": 0.1032, + "epoch": 5.701341184167484, + "grad_norm": 0.3349211812019348, + "learning_rate": 2.3370421574310286e-05, + "loss": 0.0342, "step": 21790 }, { - "epoch": 1.4262348707883545, - "grad_norm": 0.9453654885292053, - "learning_rate": 9.862406595444351e-05, - "loss": 0.1059, + "epoch": 5.703958128884527, + "grad_norm": 0.546351969242096, + "learning_rate": 2.3349797715897044e-05, + "loss": 0.0379, "step": 21800 }, { - "epoch": 1.4268891069676153, - "grad_norm": 1.0138676166534424, - "learning_rate": 9.862192498480299e-05, - "loss": 0.0979, + "epoch": 5.70657507360157, + "grad_norm": 0.468004435300827, + "learning_rate": 2.3329174985381514e-05, + "loss": 0.0402, "step": 21810 }, { - "epoch": 1.427543343146876, - "grad_norm": 1.177456259727478, - "learning_rate": 9.861978237404496e-05, - "loss": 0.0932, + "epoch": 5.709192018318613, + "grad_norm": 0.3514151871204376, + "learning_rate": 2.3308553396859114e-05, + "loss": 0.0371, "step": 21820 }, { - "epoch": 1.4281975793261368, - "grad_norm": 0.792131781578064, - "learning_rate": 9.861763812224177e-05, - "loss": 0.0958, + "epoch": 5.711808963035656, + "grad_norm": 0.28557074069976807, + "learning_rate": 2.3287932964424526e-05, + "loss": 0.0402, "step": 21830 }, { - "epoch": 1.4288518155053975, - "grad_norm": 0.7728269696235657, - "learning_rate": 9.86154922294658e-05, - "loss": 0.0975, + "epoch": 5.714425907752699, + "grad_norm": 0.2928674519062042, + "learning_rate": 2.326731370217161e-05, + "loss": 0.0351, "step": 21840 }, { - "epoch": 1.4295060516846583, - "grad_norm": 0.7983576059341431, - "learning_rate": 9.861334469578946e-05, - "loss": 0.0987, + "epoch": 5.717042852469741, + "grad_norm": 0.3764609396457672, + "learning_rate": 2.3246695624193444e-05, + "loss": 0.0391, "step": 21850 }, { - "epoch": 1.4301602878639188, - "grad_norm": 0.8768244385719299, - "learning_rate": 9.861119552128523e-05, - "loss": 0.0961, + "epoch": 5.719659797186784, + "grad_norm": 0.37425652146339417, + "learning_rate": 2.3226078744582287e-05, + "loss": 0.039, "step": 21860 }, { - "epoch": 1.4308145240431795, - "grad_norm": 1.0018515586853027, - "learning_rate": 9.86090447060257e-05, - "loss": 0.0906, + "epoch": 5.722276741903827, + "grad_norm": 0.5584398508071899, + "learning_rate": 2.3205463077429578e-05, + "loss": 0.0435, "step": 21870 }, { - "epoch": 1.4314687602224403, - "grad_norm": 0.9329746961593628, - "learning_rate": 9.86068922500834e-05, - "loss": 0.0925, + "epoch": 5.72489368662087, + "grad_norm": 0.39990872144699097, + "learning_rate": 2.318484863682593e-05, + "loss": 0.0415, "step": 21880 }, { - "epoch": 1.432122996401701, - "grad_norm": 0.8282088041305542, - "learning_rate": 9.860473815353102e-05, - "loss": 0.1014, + "epoch": 5.727510631337913, + "grad_norm": 0.5545628070831299, + "learning_rate": 2.316423543686113e-05, + "loss": 0.0353, "step": 21890 }, { - "epoch": 1.4327772325809618, - "grad_norm": 0.7623408436775208, - "learning_rate": 9.860258241644126e-05, - "loss": 0.1107, + "epoch": 5.730127576054956, + "grad_norm": 0.6198563575744629, + "learning_rate": 2.314362349162409e-05, + "loss": 0.0382, "step": 21900 }, { - "epoch": 1.4334314687602223, - "grad_norm": 0.8091941475868225, - "learning_rate": 9.860042503888687e-05, - "loss": 0.0936, + "epoch": 5.732744520771998, + "grad_norm": 0.4582308530807495, + "learning_rate": 2.3123012815202897e-05, + "loss": 0.035, "step": 21910 }, { - "epoch": 1.4340857049394833, - "grad_norm": 0.7656387686729431, - "learning_rate": 9.859826602094068e-05, - "loss": 0.091, + "epoch": 5.735361465489041, + "grad_norm": 0.24980854988098145, + "learning_rate": 2.3102403421684737e-05, + "loss": 0.0358, "step": 21920 }, { - "epoch": 1.4347399411187438, - "grad_norm": 1.1164404153823853, - "learning_rate": 9.859610536267556e-05, - "loss": 0.1016, + "epoch": 5.737978410206084, + "grad_norm": 0.32388317584991455, + "learning_rate": 2.3081795325155955e-05, + "loss": 0.0429, "step": 21930 }, { - "epoch": 1.4353941772980046, - "grad_norm": 0.9509071707725525, - "learning_rate": 9.859394306416444e-05, - "loss": 0.0934, + "epoch": 5.740595354923127, + "grad_norm": 0.417156845331192, + "learning_rate": 2.3061188539701973e-05, + "loss": 0.0413, "step": 21940 }, { - "epoch": 1.4360484134772653, - "grad_norm": 0.9692185521125793, - "learning_rate": 9.859177912548028e-05, - "loss": 0.1056, + "epoch": 5.74321229964017, + "grad_norm": 0.4360509216785431, + "learning_rate": 2.3040583079407348e-05, + "loss": 0.0358, "step": 21950 }, { - "epoch": 1.436702649656526, - "grad_norm": 0.9519806504249573, - "learning_rate": 9.858961354669616e-05, - "loss": 0.1038, + "epoch": 5.745829244357213, + "grad_norm": 0.26789259910583496, + "learning_rate": 2.301997895835572e-05, + "loss": 0.033, "step": 21960 }, { - "epoch": 1.4373568858357868, - "grad_norm": 1.0304690599441528, - "learning_rate": 9.858744632788514e-05, - "loss": 0.1002, + "epoch": 5.748446189074256, + "grad_norm": 0.43363526463508606, + "learning_rate": 2.2999376190629786e-05, + "loss": 0.0419, "step": 21970 }, { - "epoch": 1.4380111220150473, - "grad_norm": 0.7147766947746277, - "learning_rate": 9.858527746912039e-05, - "loss": 0.1132, + "epoch": 5.751063133791298, + "grad_norm": 0.46473145484924316, + "learning_rate": 2.2978774790311365e-05, + "loss": 0.0409, "step": 21980 }, { - "epoch": 1.438665358194308, - "grad_norm": 0.9351019263267517, - "learning_rate": 9.85831069704751e-05, - "loss": 0.0982, + "epoch": 5.753680078508341, + "grad_norm": 0.472726434469223, + "learning_rate": 2.2958174771481324e-05, + "loss": 0.038, "step": 21990 }, { - "epoch": 1.4393195943735688, - "grad_norm": 1.303455114364624, - "learning_rate": 9.858093483202254e-05, - "loss": 0.112, + "epoch": 5.756297023225384, + "grad_norm": 0.35688552260398865, + "learning_rate": 2.2937576148219564e-05, + "loss": 0.0423, + "step": 22000 + }, + { + "epoch": 5.756297023225384, + "eval_loss": 0.05000167867024996, + "eval_runtime": 8.9268, + "eval_samples_per_second": 114.711, + "eval_steps_per_second": 1.792, "step": 22000 }, { - "epoch": 1.4399738305528296, - "grad_norm": 0.778627872467041, - "learning_rate": 9.857876105383602e-05, - "loss": 0.1028, + "epoch": 5.758913967942427, + "grad_norm": 0.4734850525856018, + "learning_rate": 2.2916978934605065e-05, + "loss": 0.0343, "step": 22010 }, { - "epoch": 1.4406280667320903, - "grad_norm": 1.0590648651123047, - "learning_rate": 9.85765856359889e-05, - "loss": 0.1024, + "epoch": 5.76153091265947, + "grad_norm": 0.5695239901542664, + "learning_rate": 2.289638314471582e-05, + "loss": 0.0397, "step": 22020 }, { - "epoch": 1.441282302911351, - "grad_norm": 0.9106878042221069, - "learning_rate": 9.857440857855462e-05, - "loss": 0.0975, + "epoch": 5.764147857376513, + "grad_norm": 0.47718679904937744, + "learning_rate": 2.287578879262886e-05, + "loss": 0.0419, "step": 22030 }, { - "epoch": 1.4419365390906118, - "grad_norm": 0.8523755669593811, - "learning_rate": 9.857222988160667e-05, - "loss": 0.105, + "epoch": 5.766764802093556, + "grad_norm": 0.49803072214126587, + "learning_rate": 2.285519589242023e-05, + "loss": 0.0369, "step": 22040 }, { - "epoch": 1.4425907752698723, - "grad_norm": 0.8732689619064331, - "learning_rate": 9.857004954521858e-05, - "loss": 0.0981, + "epoch": 5.769381746810598, + "grad_norm": 0.564346194267273, + "learning_rate": 2.283460445816499e-05, + "loss": 0.0372, "step": 22050 }, { - "epoch": 1.443245011449133, - "grad_norm": 0.9251449704170227, - "learning_rate": 9.856786756946392e-05, - "loss": 0.0962, + "epoch": 5.771998691527641, + "grad_norm": 0.2933105230331421, + "learning_rate": 2.281401450393718e-05, + "loss": 0.033, "step": 22060 }, { - "epoch": 1.4438992476283938, - "grad_norm": 0.7228383421897888, - "learning_rate": 9.856568395441637e-05, - "loss": 0.0949, + "epoch": 5.774615636244684, + "grad_norm": 0.3057403862476349, + "learning_rate": 2.279342604380984e-05, + "loss": 0.0352, "step": 22070 }, { - "epoch": 1.4445534838076546, - "grad_norm": 0.7796227931976318, - "learning_rate": 9.856349870014961e-05, - "loss": 0.0964, + "epoch": 5.777232580961727, + "grad_norm": 0.6289494037628174, + "learning_rate": 2.277283909185499e-05, + "loss": 0.0401, "step": 22080 }, { - "epoch": 1.4452077199869153, - "grad_norm": 0.8933011889457703, - "learning_rate": 9.856131180673742e-05, - "loss": 0.0917, + "epoch": 5.77984952567877, + "grad_norm": 0.3429940640926361, + "learning_rate": 2.275225366214363e-05, + "loss": 0.0358, "step": 22090 }, { - "epoch": 1.4458619561661759, - "grad_norm": 0.6613444685935974, - "learning_rate": 9.855912327425359e-05, - "loss": 0.0944, + "epoch": 5.782466470395812, + "grad_norm": 0.47569212317466736, + "learning_rate": 2.2731669768745686e-05, + "loss": 0.0364, "step": 22100 }, { - "epoch": 1.4465161923454368, - "grad_norm": 0.8646618723869324, - "learning_rate": 9.8556933102772e-05, - "loss": 0.0989, + "epoch": 5.785083415112855, + "grad_norm": 0.5378376245498657, + "learning_rate": 2.2711087425730077e-05, + "loss": 0.0405, "step": 22110 }, { - "epoch": 1.4471704285246973, - "grad_norm": 1.0366922616958618, - "learning_rate": 9.855474129236657e-05, - "loss": 0.0964, + "epoch": 5.787700359829898, + "grad_norm": 0.583967924118042, + "learning_rate": 2.269050664716462e-05, + "loss": 0.0357, "step": 22120 }, { - "epoch": 1.447824664703958, - "grad_norm": 0.78702712059021, - "learning_rate": 9.855254784311129e-05, - "loss": 0.1023, + "epoch": 5.790317304546941, + "grad_norm": 0.374970406293869, + "learning_rate": 2.2669927447116097e-05, + "loss": 0.0401, "step": 22130 }, { - "epoch": 1.4484789008832188, - "grad_norm": 0.685092568397522, - "learning_rate": 9.855035275508017e-05, - "loss": 0.0978, + "epoch": 5.792934249263984, + "grad_norm": 0.4154004752635956, + "learning_rate": 2.26493498396502e-05, + "loss": 0.0381, "step": 22140 }, { - "epoch": 1.4491331370624796, - "grad_norm": 0.9662759900093079, - "learning_rate": 9.854815602834733e-05, - "loss": 0.101, + "epoch": 5.795551193981027, + "grad_norm": 0.29512980580329895, + "learning_rate": 2.2628773838831512e-05, + "loss": 0.038, "step": 22150 }, { - "epoch": 1.4497873732417403, - "grad_norm": 0.8414245843887329, - "learning_rate": 9.854595766298692e-05, - "loss": 0.0887, + "epoch": 5.79816813869807, + "grad_norm": 0.5380761623382568, + "learning_rate": 2.260819945872355e-05, + "loss": 0.046, "step": 22160 }, { - "epoch": 1.4504416094210009, - "grad_norm": 0.9167496562004089, - "learning_rate": 9.854375765907309e-05, - "loss": 0.097, + "epoch": 5.8007850834151125, + "grad_norm": 0.36293846368789673, + "learning_rate": 2.25876267133887e-05, + "loss": 0.0348, "step": 22170 }, { - "epoch": 1.4510958456002616, - "grad_norm": 0.8719131946563721, - "learning_rate": 9.854155601668013e-05, - "loss": 0.0972, + "epoch": 5.803402028132155, + "grad_norm": 0.3755585551261902, + "learning_rate": 2.2567055616888244e-05, + "loss": 0.0403, "step": 22180 }, { - "epoch": 1.4517500817795224, - "grad_norm": 1.0566436052322388, - "learning_rate": 9.853935273588236e-05, - "loss": 0.1119, + "epoch": 5.806018972849198, + "grad_norm": 0.2739109694957733, + "learning_rate": 2.2546486183282338e-05, + "loss": 0.0334, "step": 22190 }, { - "epoch": 1.452404317958783, - "grad_norm": 0.8643470406532288, - "learning_rate": 9.853714781675414e-05, - "loss": 0.1097, + "epoch": 5.808635917566241, + "grad_norm": 0.6384417414665222, + "learning_rate": 2.2525918426629984e-05, + "loss": 0.0372, "step": 22200 }, { - "epoch": 1.4530585541380439, - "grad_norm": 0.8405054211616516, - "learning_rate": 9.853494125936989e-05, - "loss": 0.1023, + "epoch": 5.811252862283284, + "grad_norm": 0.49684906005859375, + "learning_rate": 2.2505352360989062e-05, + "loss": 0.0418, "step": 22210 }, { - "epoch": 1.4537127903173046, - "grad_norm": 0.8226882219314575, - "learning_rate": 9.853273306380407e-05, - "loss": 0.1037, + "epoch": 5.813869807000327, + "grad_norm": 0.24710462987422943, + "learning_rate": 2.2484788000416275e-05, + "loss": 0.0421, "step": 22220 }, { - "epoch": 1.4543670264965654, - "grad_norm": 0.9032091498374939, - "learning_rate": 9.853052323013124e-05, - "loss": 0.09, + "epoch": 5.81648675171737, + "grad_norm": 0.7345560789108276, + "learning_rate": 2.2464225358967172e-05, + "loss": 0.0359, "step": 22230 }, { - "epoch": 1.4550212626758259, - "grad_norm": 0.7893581986427307, - "learning_rate": 9.852831175842596e-05, - "loss": 0.1065, + "epoch": 5.8191036964344125, + "grad_norm": 0.559100329875946, + "learning_rate": 2.2443664450696136e-05, + "loss": 0.0382, "step": 22240 }, { - "epoch": 1.4556754988550866, - "grad_norm": 0.8432521820068359, - "learning_rate": 9.85260986487629e-05, - "loss": 0.0988, + "epoch": 5.8217206411514555, + "grad_norm": 0.42378759384155273, + "learning_rate": 2.2423105289656332e-05, + "loss": 0.034, "step": 22250 }, { - "epoch": 1.4563297350343474, - "grad_norm": 0.8194817304611206, - "learning_rate": 9.852388390121675e-05, - "loss": 0.1014, + "epoch": 5.8243375858684985, + "grad_norm": 0.588720440864563, + "learning_rate": 2.2402547889899766e-05, + "loss": 0.0375, "step": 22260 }, { - "epoch": 1.4569839712136081, - "grad_norm": 0.7900116443634033, - "learning_rate": 9.852166751586225e-05, - "loss": 0.1056, + "epoch": 5.8269545305855415, + "grad_norm": 0.518203616142273, + "learning_rate": 2.2381992265477224e-05, + "loss": 0.0349, "step": 22270 }, { - "epoch": 1.4576382073928689, - "grad_norm": 0.7171338200569153, - "learning_rate": 9.851944949277423e-05, - "loss": 0.0906, + "epoch": 5.8295714753025845, + "grad_norm": 0.38674628734588623, + "learning_rate": 2.236143843043828e-05, + "loss": 0.0404, "step": 22280 }, { - "epoch": 1.4582924435721296, - "grad_norm": 0.7563906311988831, - "learning_rate": 9.851722983202753e-05, - "loss": 0.0959, + "epoch": 5.8321884200196275, + "grad_norm": 0.4627288579940796, + "learning_rate": 2.2340886398831294e-05, + "loss": 0.0351, "step": 22290 }, { - "epoch": 1.4589466797513904, - "grad_norm": 0.7728096842765808, - "learning_rate": 9.851500853369709e-05, - "loss": 0.1029, + "epoch": 5.8348053647366696, + "grad_norm": 0.32829442620277405, + "learning_rate": 2.2320336184703373e-05, + "loss": 0.044, "step": 22300 }, { - "epoch": 1.4596009159306509, - "grad_norm": 0.9391055107116699, - "learning_rate": 9.851278559785788e-05, - "loss": 0.0979, + "epoch": 5.8374223094537125, + "grad_norm": 0.6168282628059387, + "learning_rate": 2.229978780210041e-05, + "loss": 0.0354, "step": 22310 }, { - "epoch": 1.4602551521099116, - "grad_norm": 0.8876420855522156, - "learning_rate": 9.851056102458492e-05, - "loss": 0.0963, + "epoch": 5.8400392541707555, + "grad_norm": 0.3957426846027374, + "learning_rate": 2.2279241265067015e-05, + "loss": 0.0338, "step": 22320 }, { - "epoch": 1.4609093882891724, - "grad_norm": 1.0009467601776123, - "learning_rate": 9.85083348139533e-05, - "loss": 0.098, + "epoch": 5.8426561988877985, + "grad_norm": 0.4070344865322113, + "learning_rate": 2.2258696587646573e-05, + "loss": 0.0464, "step": 22330 }, { - "epoch": 1.4615636244684331, - "grad_norm": 0.9100044369697571, - "learning_rate": 9.850610696603817e-05, - "loss": 0.0935, + "epoch": 5.8452731436048415, + "grad_norm": 0.43342939019203186, + "learning_rate": 2.223815378388116e-05, + "loss": 0.0362, "step": 22340 }, { - "epoch": 1.4622178606476939, - "grad_norm": 1.0415489673614502, - "learning_rate": 9.850387748091471e-05, - "loss": 0.104, + "epoch": 5.8478900883218845, + "grad_norm": 0.36159420013427734, + "learning_rate": 2.221761286781159e-05, + "loss": 0.0391, "step": 22350 }, { - "epoch": 1.4628720968269544, - "grad_norm": 0.9339014887809753, - "learning_rate": 9.850164635865819e-05, - "loss": 0.1056, + "epoch": 5.850507033038927, + "grad_norm": 0.45029348134994507, + "learning_rate": 2.2197073853477388e-05, + "loss": 0.0376, "step": 22360 }, { - "epoch": 1.4635263330062154, - "grad_norm": 0.9061875343322754, - "learning_rate": 9.84994135993439e-05, - "loss": 0.1065, + "epoch": 5.85312397775597, + "grad_norm": 0.39053642749786377, + "learning_rate": 2.2176536754916775e-05, + "loss": 0.0344, "step": 22370 }, { - "epoch": 1.464180569185476, - "grad_norm": 0.7294244766235352, - "learning_rate": 9.849717920304719e-05, - "loss": 0.1058, + "epoch": 5.855740922473013, + "grad_norm": 0.4565856456756592, + "learning_rate": 2.2156001586166663e-05, + "loss": 0.0412, "step": 22380 }, { - "epoch": 1.4648348053647366, - "grad_norm": 0.9181326031684875, - "learning_rate": 9.849494316984352e-05, - "loss": 0.0996, + "epoch": 5.858357867190056, + "grad_norm": 0.49078816175460815, + "learning_rate": 2.2135468361262656e-05, + "loss": 0.0385, "step": 22390 }, { - "epoch": 1.4654890415439974, - "grad_norm": 0.7590892910957336, - "learning_rate": 9.849270549980832e-05, - "loss": 0.1013, + "epoch": 5.860974811907099, + "grad_norm": 0.33995869755744934, + "learning_rate": 2.211493709423901e-05, + "loss": 0.0413, "step": 22400 }, { - "epoch": 1.4661432777232581, - "grad_norm": 0.7892376780509949, - "learning_rate": 9.849046619301713e-05, - "loss": 0.0953, + "epoch": 5.863591756624142, + "grad_norm": 0.4810589849948883, + "learning_rate": 2.2094407799128662e-05, + "loss": 0.0372, "step": 22410 }, { - "epoch": 1.466797513902519, - "grad_norm": 0.7668907642364502, - "learning_rate": 9.848822524954553e-05, - "loss": 0.1001, + "epoch": 5.8662087013411846, + "grad_norm": 0.4321928918361664, + "learning_rate": 2.207388048996319e-05, + "loss": 0.045, "step": 22420 }, { - "epoch": 1.4674517500817794, - "grad_norm": 0.6292420625686646, - "learning_rate": 9.848598266946918e-05, - "loss": 0.0882, + "epoch": 5.868825646058227, + "grad_norm": 0.3577482998371124, + "learning_rate": 2.2053355180772797e-05, + "loss": 0.0385, "step": 22430 }, { - "epoch": 1.4681059862610402, - "grad_norm": 0.8127971887588501, - "learning_rate": 9.848373845286376e-05, - "loss": 0.1029, + "epoch": 5.87144259077527, + "grad_norm": 0.3033449947834015, + "learning_rate": 2.203283188558636e-05, + "loss": 0.0379, "step": 22440 }, { - "epoch": 1.468760222440301, - "grad_norm": 0.9663169980049133, - "learning_rate": 9.848149259980499e-05, - "loss": 0.1066, + "epoch": 5.874059535492313, + "grad_norm": 0.5386301279067993, + "learning_rate": 2.201231061843135e-05, + "loss": 0.0424, "step": 22450 }, { - "epoch": 1.4694144586195617, - "grad_norm": 0.8952876925468445, - "learning_rate": 9.847924511036872e-05, - "loss": 0.1101, + "epoch": 5.876676480209356, + "grad_norm": 0.2471160590648651, + "learning_rate": 2.1991791393333858e-05, + "loss": 0.0344, "step": 22460 }, { - "epoch": 1.4700686947988224, - "grad_norm": 0.9786347150802612, - "learning_rate": 9.847699598463079e-05, - "loss": 0.0865, + "epoch": 5.879293424926399, + "grad_norm": 0.2815709114074707, + "learning_rate": 2.197127422431858e-05, + "loss": 0.0374, "step": 22470 }, { - "epoch": 1.4707229309780832, - "grad_norm": 1.0925096273422241, - "learning_rate": 9.847474522266708e-05, - "loss": 0.1013, + "epoch": 5.881910369643442, + "grad_norm": 0.38048800826072693, + "learning_rate": 2.195075912540881e-05, + "loss": 0.0351, "step": 22480 }, { - "epoch": 1.471377167157344, - "grad_norm": 0.7084100842475891, - "learning_rate": 9.84724928245536e-05, - "loss": 0.0975, + "epoch": 5.884527314360485, + "grad_norm": 0.2458164393901825, + "learning_rate": 2.193024611062643e-05, + "loss": 0.0333, "step": 22490 }, { - "epoch": 1.4720314033366044, - "grad_norm": 0.8973484039306641, - "learning_rate": 9.847023879036637e-05, - "loss": 0.0956, + "epoch": 5.887144259077527, + "grad_norm": 0.4755036234855652, + "learning_rate": 2.1909735193991887e-05, + "loss": 0.0376, "step": 22500 }, { - "epoch": 1.4726856395158652, - "grad_norm": 1.1839032173156738, - "learning_rate": 9.846798312018146e-05, - "loss": 0.0958, + "epoch": 5.88976120379457, + "grad_norm": 0.4101316034793854, + "learning_rate": 2.1889226389524206e-05, + "loss": 0.0351, "step": 22510 }, { - "epoch": 1.473339875695126, - "grad_norm": 1.004971981048584, - "learning_rate": 9.846572581407502e-05, - "loss": 0.1028, + "epoch": 5.892378148511613, + "grad_norm": 0.4491457939147949, + "learning_rate": 2.186871971124095e-05, + "loss": 0.0396, "step": 22520 }, { - "epoch": 1.4739941118743867, - "grad_norm": 0.854030966758728, - "learning_rate": 9.846346687212322e-05, - "loss": 0.1093, + "epoch": 5.894995093228656, + "grad_norm": 0.39417290687561035, + "learning_rate": 2.184821517315824e-05, + "loss": 0.0368, "step": 22530 }, { - "epoch": 1.4746483480536474, - "grad_norm": 0.9426572918891907, - "learning_rate": 9.846120629440231e-05, - "loss": 0.1084, + "epoch": 5.897612037945699, + "grad_norm": 0.3355071246623993, + "learning_rate": 2.1827712789290746e-05, + "loss": 0.0386, "step": 22540 }, { - "epoch": 1.475302584232908, - "grad_norm": 0.7341740131378174, - "learning_rate": 9.84589440809886e-05, - "loss": 0.0945, + "epoch": 5.900228982662741, + "grad_norm": 0.3364246189594269, + "learning_rate": 2.1807212573651644e-05, + "loss": 0.0394, "step": 22550 }, { - "epoch": 1.475956820412169, - "grad_norm": 0.7816892862319946, - "learning_rate": 9.845668023195841e-05, - "loss": 0.0945, + "epoch": 5.902845927379784, + "grad_norm": 0.41326799988746643, + "learning_rate": 2.178671454025264e-05, + "loss": 0.0345, "step": 22560 }, { - "epoch": 1.4766110565914294, - "grad_norm": 1.015047311782837, - "learning_rate": 9.845441474738821e-05, - "loss": 0.0912, + "epoch": 5.905462872096827, + "grad_norm": 0.41661393642425537, + "learning_rate": 2.1766218703103948e-05, + "loss": 0.0376, "step": 22570 }, { - "epoch": 1.4772652927706902, - "grad_norm": 1.0794068574905396, - "learning_rate": 9.845214762735444e-05, - "loss": 0.092, + "epoch": 5.90807981681387, + "grad_norm": 0.3999796509742737, + "learning_rate": 2.174572507621428e-05, + "loss": 0.0359, "step": 22580 }, { - "epoch": 1.477919528949951, - "grad_norm": 0.8350719809532166, - "learning_rate": 9.84498788719336e-05, - "loss": 0.1005, + "epoch": 5.910696761530913, + "grad_norm": 0.5220701694488525, + "learning_rate": 2.172523367359084e-05, + "loss": 0.0398, "step": 22590 }, { - "epoch": 1.4785737651292117, - "grad_norm": 0.7004404664039612, - "learning_rate": 9.844760848120229e-05, - "loss": 0.0958, + "epoch": 5.913313706247956, + "grad_norm": 0.4379059374332428, + "learning_rate": 2.17047445092393e-05, + "loss": 0.0364, "step": 22600 }, { - "epoch": 1.4792280013084724, - "grad_norm": 0.9362242221832275, - "learning_rate": 9.844533645523714e-05, - "loss": 0.0953, + "epoch": 5.915930650964999, + "grad_norm": 0.3713061213493347, + "learning_rate": 2.1684257597163826e-05, + "loss": 0.0397, "step": 22610 }, { - "epoch": 1.479882237487733, - "grad_norm": 1.0264415740966797, - "learning_rate": 9.844306279411482e-05, - "loss": 0.0954, + "epoch": 5.918547595682041, + "grad_norm": 0.3561409115791321, + "learning_rate": 2.1663772951367014e-05, + "loss": 0.0353, "step": 22620 }, { - "epoch": 1.4805364736669937, - "grad_norm": 0.9986675977706909, - "learning_rate": 9.84407874979121e-05, - "loss": 0.1014, + "epoch": 5.921164540399084, + "grad_norm": 0.47510769963264465, + "learning_rate": 2.1643290585849927e-05, + "loss": 0.0423, "step": 22630 }, { - "epoch": 1.4811907098462544, - "grad_norm": 0.8744516372680664, - "learning_rate": 9.843851056670574e-05, - "loss": 0.0955, + "epoch": 5.923781485116127, + "grad_norm": 0.5692808032035828, + "learning_rate": 2.162281051461208e-05, + "loss": 0.0362, "step": 22640 }, { - "epoch": 1.4818449460255152, - "grad_norm": 0.8320896029472351, - "learning_rate": 9.843623200057263e-05, - "loss": 0.0927, + "epoch": 5.92639842983317, + "grad_norm": 0.5427606105804443, + "learning_rate": 2.160233275165139e-05, + "loss": 0.038, "step": 22650 }, { - "epoch": 1.482499182204776, - "grad_norm": 0.9211897850036621, - "learning_rate": 9.843395179958965e-05, - "loss": 0.1046, + "epoch": 5.929015374550213, + "grad_norm": 0.3027240037918091, + "learning_rate": 2.1581857310964233e-05, + "loss": 0.039, "step": 22660 }, { - "epoch": 1.4831534183840367, - "grad_norm": 0.9335985779762268, - "learning_rate": 9.84316699638338e-05, - "loss": 0.0968, + "epoch": 5.931632319267256, + "grad_norm": 0.2940119504928589, + "learning_rate": 2.156138420654537e-05, + "loss": 0.0384, "step": 22670 }, { - "epoch": 1.4838076545632974, - "grad_norm": 0.9013245105743408, - "learning_rate": 9.842938649338205e-05, - "loss": 0.0968, + "epoch": 5.934249263984299, + "grad_norm": 0.521510660648346, + "learning_rate": 2.1540913452387972e-05, + "loss": 0.0343, "step": 22680 }, { - "epoch": 1.484461890742558, - "grad_norm": 1.0684399604797363, - "learning_rate": 9.842710138831148e-05, - "loss": 0.0904, + "epoch": 5.936866208701341, + "grad_norm": 0.2505738139152527, + "learning_rate": 2.1520445062483623e-05, + "loss": 0.0379, "step": 22690 }, { - "epoch": 1.4851161269218187, - "grad_norm": 1.023200273513794, - "learning_rate": 9.842481464869927e-05, - "loss": 0.099, + "epoch": 5.939483153418384, + "grad_norm": 0.4490431845188141, + "learning_rate": 2.1499979050822268e-05, + "loss": 0.0388, "step": 22700 }, { - "epoch": 1.4857703631010795, - "grad_norm": 0.9011964797973633, - "learning_rate": 9.842252627462254e-05, - "loss": 0.0982, + "epoch": 5.942100098135427, + "grad_norm": 0.3115292191505432, + "learning_rate": 2.1479515431392217e-05, + "loss": 0.0399, "step": 22710 }, { - "epoch": 1.4864245992803402, - "grad_norm": 0.9003528356552124, - "learning_rate": 9.842023626615857e-05, - "loss": 0.0916, + "epoch": 5.94471704285247, + "grad_norm": 0.3649538457393646, + "learning_rate": 2.145905421818018e-05, + "loss": 0.0374, "step": 22720 }, { - "epoch": 1.487078835459601, - "grad_norm": 0.8671954274177551, - "learning_rate": 9.841794462338463e-05, - "loss": 0.0844, + "epoch": 5.947333987569513, + "grad_norm": 0.2825697362422943, + "learning_rate": 2.1438595425171188e-05, + "loss": 0.0352, "step": 22730 }, { - "epoch": 1.4877330716388617, - "grad_norm": 1.0183930397033691, - "learning_rate": 9.841565134637808e-05, - "loss": 0.1009, + "epoch": 5.949950932286556, + "grad_norm": 0.32974621653556824, + "learning_rate": 2.1418139066348647e-05, + "loss": 0.0351, "step": 22740 }, { - "epoch": 1.4883873078181225, - "grad_norm": 0.77434903383255, - "learning_rate": 9.841335643521632e-05, - "loss": 0.0979, + "epoch": 5.952567877003599, + "grad_norm": 0.3849112391471863, + "learning_rate": 2.1397685155694274e-05, + "loss": 0.0397, "step": 22750 }, { - "epoch": 1.489041543997383, - "grad_norm": 0.9142956733703613, - "learning_rate": 9.841105988997682e-05, - "loss": 0.0911, + "epoch": 5.955184821720641, + "grad_norm": 0.5038356781005859, + "learning_rate": 2.1377233707188126e-05, + "loss": 0.0408, "step": 22760 }, { - "epoch": 1.4896957801766437, - "grad_norm": 0.9437901973724365, - "learning_rate": 9.840876171073707e-05, - "loss": 0.101, + "epoch": 5.957801766437684, + "grad_norm": 0.3917370140552521, + "learning_rate": 2.1356784734808588e-05, + "loss": 0.037, "step": 22770 }, { - "epoch": 1.4903500163559045, - "grad_norm": 0.8872728943824768, - "learning_rate": 9.840646189757468e-05, - "loss": 0.0982, + "epoch": 5.960418711154727, + "grad_norm": 0.34260237216949463, + "learning_rate": 2.1336338252532324e-05, + "loss": 0.0362, "step": 22780 }, { - "epoch": 1.4910042525351652, - "grad_norm": 0.7070728540420532, - "learning_rate": 9.840416045056724e-05, - "loss": 0.0918, + "epoch": 5.96303565587177, + "grad_norm": 0.3475625813007355, + "learning_rate": 2.131589427433433e-05, + "loss": 0.0364, "step": 22790 }, { - "epoch": 1.491658488714426, - "grad_norm": 0.9747138619422913, - "learning_rate": 9.840185736979244e-05, - "loss": 0.0953, + "epoch": 5.965652600588813, + "grad_norm": 0.3870244324207306, + "learning_rate": 2.1295452814187854e-05, + "loss": 0.0357, "step": 22800 }, { - "epoch": 1.4923127248936865, - "grad_norm": 0.9739376902580261, - "learning_rate": 9.839955265532801e-05, - "loss": 0.1069, + "epoch": 5.968269545305855, + "grad_norm": 0.3478778004646301, + "learning_rate": 2.127501388606444e-05, + "loss": 0.0324, "step": 22810 }, { - "epoch": 1.4929669610729475, - "grad_norm": 0.8057883977890015, - "learning_rate": 9.839724630725175e-05, - "loss": 0.0882, + "epoch": 5.970886490022898, + "grad_norm": 0.4857855439186096, + "learning_rate": 2.1254577503933916e-05, + "loss": 0.0384, "step": 22820 }, { - "epoch": 1.493621197252208, - "grad_norm": 0.9417016506195068, - "learning_rate": 9.839493832564149e-05, - "loss": 0.1076, + "epoch": 5.973503434739941, + "grad_norm": 0.4679870009422302, + "learning_rate": 2.123414368176435e-05, + "loss": 0.028, "step": 22830 }, { - "epoch": 1.4942754334314687, - "grad_norm": 0.7495242953300476, - "learning_rate": 9.839262871057515e-05, - "loss": 0.091, + "epoch": 5.976120379456984, + "grad_norm": 0.32966911792755127, + "learning_rate": 2.121371243352207e-05, + "loss": 0.0338, "step": 22840 }, { - "epoch": 1.4949296696107295, - "grad_norm": 0.818233847618103, - "learning_rate": 9.839031746213068e-05, - "loss": 0.1056, + "epoch": 5.978737324174027, + "grad_norm": 0.4067830741405487, + "learning_rate": 2.1193283773171636e-05, + "loss": 0.0365, "step": 22850 }, { - "epoch": 1.4955839057899902, - "grad_norm": 0.8192006945610046, - "learning_rate": 9.838800458038609e-05, - "loss": 0.0928, + "epoch": 5.98135426889107, + "grad_norm": 0.26921984553337097, + "learning_rate": 2.117285771467584e-05, + "loss": 0.0366, "step": 22860 }, { - "epoch": 1.496238141969251, - "grad_norm": 0.9962643980979919, - "learning_rate": 9.838569006541944e-05, - "loss": 0.097, + "epoch": 5.983971213608113, + "grad_norm": 0.4890765845775604, + "learning_rate": 2.115243427199572e-05, + "loss": 0.0364, "step": 22870 }, { - "epoch": 1.4968923781485115, - "grad_norm": 0.8884673714637756, - "learning_rate": 9.838337391730886e-05, - "loss": 0.0924, + "epoch": 5.986588158325155, + "grad_norm": 0.44049689173698425, + "learning_rate": 2.113201345909049e-05, + "loss": 0.0404, "step": 22880 }, { - "epoch": 1.4975466143277723, - "grad_norm": 0.7692938446998596, - "learning_rate": 9.83810561361325e-05, - "loss": 0.1011, + "epoch": 5.989205103042198, + "grad_norm": 0.3951586186885834, + "learning_rate": 2.1111595289917598e-05, + "loss": 0.0369, "step": 22890 }, { - "epoch": 1.498200850507033, - "grad_norm": 1.0276055335998535, - "learning_rate": 9.837873672196863e-05, - "loss": 0.097, + "epoch": 5.991822047759241, + "grad_norm": 0.40105995535850525, + "learning_rate": 2.1091179778432655e-05, + "loss": 0.0353, "step": 22900 }, { - "epoch": 1.4988550866862937, - "grad_norm": 0.9219088554382324, - "learning_rate": 9.83764156748955e-05, - "loss": 0.0843, + "epoch": 5.994438992476284, + "grad_norm": 0.38641372323036194, + "learning_rate": 2.107076693858947e-05, + "loss": 0.0374, "step": 22910 }, { - "epoch": 1.4995093228655545, - "grad_norm": 0.7710665464401245, - "learning_rate": 9.837409299499149e-05, - "loss": 0.0997, + "epoch": 5.997055937193327, + "grad_norm": 0.36292564868927, + "learning_rate": 2.1050356784340035e-05, + "loss": 0.0344, "step": 22920 }, { - "epoch": 1.500163559044815, - "grad_norm": 0.7890987396240234, - "learning_rate": 9.837176868233496e-05, - "loss": 0.1039, + "epoch": 5.99967288191037, + "grad_norm": 0.3365738093852997, + "learning_rate": 2.102994932963449e-05, + "loss": 0.0375, "step": 22930 }, { - "epoch": 1.500817795224076, - "grad_norm": 0.7325422167778015, - "learning_rate": 9.836944273700439e-05, - "loss": 0.0917, + "epoch": 6.002093555773635, + "grad_norm": 0.47521135210990906, + "learning_rate": 2.1009544588421147e-05, + "loss": 0.0386, "step": 22940 }, { - "epoch": 1.5014720314033365, - "grad_norm": 0.8291808366775513, - "learning_rate": 9.836711515907827e-05, - "loss": 0.1017, + "epoch": 6.004710500490678, + "grad_norm": 0.5351961255073547, + "learning_rate": 2.0989142574646447e-05, + "loss": 0.0331, "step": 22950 }, { - "epoch": 1.5021262675825973, - "grad_norm": 0.7755422592163086, - "learning_rate": 9.836478594863516e-05, - "loss": 0.0887, + "epoch": 6.00732744520772, + "grad_norm": 0.3139323890209198, + "learning_rate": 2.096874330225498e-05, + "loss": 0.0357, "step": 22960 }, { - "epoch": 1.502780503761858, - "grad_norm": 0.8843225836753845, - "learning_rate": 9.836245510575368e-05, - "loss": 0.1055, + "epoch": 6.009944389924763, + "grad_norm": 0.42696279287338257, + "learning_rate": 2.0948346785189455e-05, + "loss": 0.0342, "step": 22970 }, { - "epoch": 1.5034347399411188, - "grad_norm": 1.0079516172409058, - "learning_rate": 9.836012263051252e-05, - "loss": 0.1042, + "epoch": 6.012561334641806, + "grad_norm": 0.44340917468070984, + "learning_rate": 2.0927953037390702e-05, + "loss": 0.0366, "step": 22980 }, { - "epoch": 1.5040889761203795, - "grad_norm": 0.6518259048461914, - "learning_rate": 9.835778852299039e-05, - "loss": 0.1012, + "epoch": 6.015178279358849, + "grad_norm": 0.5112841725349426, + "learning_rate": 2.0907562072797642e-05, + "loss": 0.0391, "step": 22990 }, { - "epoch": 1.50474321229964, - "grad_norm": 0.7768208980560303, - "learning_rate": 9.835545278326606e-05, - "loss": 0.0852, + "epoch": 6.017795224075892, + "grad_norm": 0.5477259159088135, + "learning_rate": 2.0887173905347322e-05, + "loss": 0.0361, + "step": 23000 + }, + { + "epoch": 6.017795224075892, + "eval_loss": 0.04335802614650672, + "eval_runtime": 9.204, + "eval_samples_per_second": 111.256, + "eval_steps_per_second": 1.738, "step": 23000 }, { - "epoch": 1.505397448478901, - "grad_norm": 0.6538995504379272, - "learning_rate": 9.835311541141839e-05, - "loss": 0.0882, + "epoch": 6.020412168792935, + "grad_norm": 0.43827566504478455, + "learning_rate": 2.086678854897485e-05, + "loss": 0.0352, "step": 23010 }, { - "epoch": 1.5060516846581615, - "grad_norm": 0.8296500444412231, - "learning_rate": 9.835077640752626e-05, - "loss": 0.0938, + "epoch": 6.023029113509977, + "grad_norm": 0.4269797205924988, + "learning_rate": 2.0846406017613434e-05, + "loss": 0.0371, "step": 23020 }, { - "epoch": 1.5067059208374223, - "grad_norm": 0.9206341505050659, - "learning_rate": 9.834843577166863e-05, - "loss": 0.0974, + "epoch": 6.02564605822702, + "grad_norm": 0.43934446573257446, + "learning_rate": 2.0826026325194337e-05, + "loss": 0.0359, "step": 23030 }, { - "epoch": 1.507360157016683, - "grad_norm": 0.779178261756897, - "learning_rate": 9.83460935039245e-05, - "loss": 0.1044, + "epoch": 6.028263002944063, + "grad_norm": 0.3212452232837677, + "learning_rate": 2.0805649485646893e-05, + "loss": 0.0385, "step": 23040 }, { - "epoch": 1.5080143931959438, - "grad_norm": 0.9146134853363037, - "learning_rate": 9.834374960437291e-05, - "loss": 0.0909, + "epoch": 6.030879947661106, + "grad_norm": 0.3714478611946106, + "learning_rate": 2.0785275512898467e-05, + "loss": 0.0417, "step": 23050 }, { - "epoch": 1.5086686293752045, - "grad_norm": 0.9325788617134094, - "learning_rate": 9.834140407309298e-05, - "loss": 0.0969, + "epoch": 6.033496892378149, + "grad_norm": 0.32258906960487366, + "learning_rate": 2.0764904420874486e-05, + "loss": 0.0378, "step": 23060 }, { - "epoch": 1.509322865554465, - "grad_norm": 0.9949310421943665, - "learning_rate": 9.833905691016389e-05, - "loss": 0.0969, + "epoch": 6.036113837095192, + "grad_norm": 0.3787921965122223, + "learning_rate": 2.074453622349841e-05, + "loss": 0.0318, "step": 23070 }, { - "epoch": 1.509977101733726, - "grad_norm": 0.8266165256500244, - "learning_rate": 9.833670811566485e-05, - "loss": 0.0924, + "epoch": 6.038730781812234, + "grad_norm": 0.6138239502906799, + "learning_rate": 2.0724170934691698e-05, + "loss": 0.0325, "step": 23080 }, { - "epoch": 1.5106313379129865, - "grad_norm": 0.8005866408348083, - "learning_rate": 9.833435768967514e-05, - "loss": 0.0931, + "epoch": 6.041347726529277, + "grad_norm": 0.42927268147468567, + "learning_rate": 2.0703808568373824e-05, + "loss": 0.0408, "step": 23090 }, { - "epoch": 1.5112855740922473, - "grad_norm": 0.9937982559204102, - "learning_rate": 9.833200563227411e-05, - "loss": 0.0897, + "epoch": 6.04396467124632, + "grad_norm": 0.6134794354438782, + "learning_rate": 2.0683449138462287e-05, + "loss": 0.0332, "step": 23100 }, { - "epoch": 1.511939810271508, - "grad_norm": 0.8967451453208923, - "learning_rate": 9.832965194354113e-05, - "loss": 0.1015, + "epoch": 6.046581615963363, + "grad_norm": 0.5098093748092651, + "learning_rate": 2.066309265887256e-05, + "loss": 0.0428, "step": 23110 }, { - "epoch": 1.5125940464507688, - "grad_norm": 0.7568728923797607, - "learning_rate": 9.832729662355566e-05, - "loss": 0.0945, + "epoch": 6.049198560680406, + "grad_norm": 0.3562641143798828, + "learning_rate": 2.064273914351811e-05, + "loss": 0.0375, "step": 23120 }, { - "epoch": 1.5132482826300295, - "grad_norm": 0.8389810919761658, - "learning_rate": 9.832493967239716e-05, - "loss": 0.0958, + "epoch": 6.051815505397449, + "grad_norm": 0.31301990151405334, + "learning_rate": 2.0622388606310363e-05, + "loss": 0.0354, "step": 23130 }, { - "epoch": 1.51390251880929, - "grad_norm": 0.7844974398612976, - "learning_rate": 9.832258109014522e-05, - "loss": 0.0963, + "epoch": 6.054432450114492, + "grad_norm": 0.35312601923942566, + "learning_rate": 2.060204106115873e-05, + "loss": 0.034, "step": 23140 }, { - "epoch": 1.514556754988551, - "grad_norm": 0.7761784791946411, - "learning_rate": 9.832022087687944e-05, - "loss": 0.0913, + "epoch": 6.057049394831534, + "grad_norm": 0.6812915802001953, + "learning_rate": 2.0581696521970554e-05, + "loss": 0.0374, "step": 23150 }, { - "epoch": 1.5152109911678115, - "grad_norm": 1.0485121011734009, - "learning_rate": 9.831785903267949e-05, - "loss": 0.1062, + "epoch": 6.059666339548577, + "grad_norm": 0.6285141706466675, + "learning_rate": 2.0561355002651145e-05, + "loss": 0.0448, "step": 23160 }, { - "epoch": 1.5158652273470723, - "grad_norm": 0.8118041157722473, - "learning_rate": 9.831549555762507e-05, - "loss": 0.0926, + "epoch": 6.06228328426562, + "grad_norm": 0.5887035727500916, + "learning_rate": 2.054101651710375e-05, + "loss": 0.0339, "step": 23170 }, { - "epoch": 1.516519463526333, - "grad_norm": 1.069348931312561, - "learning_rate": 9.831313045179595e-05, - "loss": 0.0977, + "epoch": 6.064900228982663, + "grad_norm": 0.8369037508964539, + "learning_rate": 2.0520681079229513e-05, + "loss": 0.0394, "step": 23180 }, { - "epoch": 1.5171736997055936, - "grad_norm": 0.8095777034759521, - "learning_rate": 9.8310763715272e-05, - "loss": 0.0975, + "epoch": 6.067517173699706, + "grad_norm": 0.3949189782142639, + "learning_rate": 2.0500348702927512e-05, + "loss": 0.0341, "step": 23190 }, { - "epoch": 1.5178279358848545, - "grad_norm": 0.6841381788253784, - "learning_rate": 9.830839534813305e-05, - "loss": 0.0924, + "epoch": 6.070134118416749, + "grad_norm": 0.2599748969078064, + "learning_rate": 2.0480019402094755e-05, + "loss": 0.0379, "step": 23200 }, { - "epoch": 1.518482172064115, - "grad_norm": 0.7451679706573486, - "learning_rate": 9.830602535045908e-05, - "loss": 0.0989, + "epoch": 6.072751063133791, + "grad_norm": 0.3696807026863098, + "learning_rate": 2.0459693190626107e-05, + "loss": 0.0312, "step": 23210 }, { - "epoch": 1.5191364082433758, - "grad_norm": 0.7688360214233398, - "learning_rate": 9.830365372233006e-05, - "loss": 0.1033, + "epoch": 6.075368007850834, + "grad_norm": 0.3513152599334717, + "learning_rate": 2.043937008241436e-05, + "loss": 0.0329, "step": 23220 }, { - "epoch": 1.5197906444226366, - "grad_norm": 0.7664195895195007, - "learning_rate": 9.830128046382605e-05, - "loss": 0.0857, + "epoch": 6.077984952567877, + "grad_norm": 0.28093549609184265, + "learning_rate": 2.0419050091350148e-05, + "loss": 0.0325, "step": 23230 }, { - "epoch": 1.5204448806018973, - "grad_norm": 0.7574120163917542, - "learning_rate": 9.829890557502714e-05, - "loss": 0.0912, + "epoch": 6.08060189728492, + "grad_norm": 0.37431958317756653, + "learning_rate": 2.039873323132201e-05, + "loss": 0.0351, "step": 23240 }, { - "epoch": 1.521099116781158, - "grad_norm": 0.8044725656509399, - "learning_rate": 9.82965290560135e-05, - "loss": 0.093, + "epoch": 6.083218842001963, + "grad_norm": 0.44764524698257446, + "learning_rate": 2.037841951621631e-05, + "loss": 0.0374, "step": 23250 }, { - "epoch": 1.5217533529604186, - "grad_norm": 0.9325090050697327, - "learning_rate": 9.829415090686535e-05, - "loss": 0.0893, + "epoch": 6.085835786719006, + "grad_norm": 0.39713025093078613, + "learning_rate": 2.035810895991731e-05, + "loss": 0.0345, "step": 23260 }, { - "epoch": 1.5224075891396796, - "grad_norm": 0.9916201829910278, - "learning_rate": 9.829177112766294e-05, - "loss": 0.0998, + "epoch": 6.088452731436049, + "grad_norm": 0.35608890652656555, + "learning_rate": 2.033780157630705e-05, + "loss": 0.0398, "step": 23270 }, { - "epoch": 1.52306182531894, - "grad_norm": 0.8349946141242981, - "learning_rate": 9.828938971848663e-05, - "loss": 0.0991, + "epoch": 6.091069676153091, + "grad_norm": 0.18921782076358795, + "learning_rate": 2.031749737926546e-05, + "loss": 0.0311, "step": 23280 }, { - "epoch": 1.5237160614982008, - "grad_norm": 0.8166770339012146, - "learning_rate": 9.828700667941675e-05, - "loss": 0.0983, + "epoch": 6.093686620870134, + "grad_norm": 0.3688729703426361, + "learning_rate": 2.0297196382670253e-05, + "loss": 0.0361, "step": 23290 }, { - "epoch": 1.5243702976774616, - "grad_norm": 0.8615389466285706, - "learning_rate": 9.828462201053376e-05, - "loss": 0.0957, + "epoch": 6.096303565587177, + "grad_norm": 0.3176236152648926, + "learning_rate": 2.0276898600396977e-05, + "loss": 0.0325, "step": 23300 }, { - "epoch": 1.5250245338567223, - "grad_norm": 0.8796236515045166, - "learning_rate": 9.828223571191814e-05, - "loss": 0.0974, + "epoch": 6.09892051030422, + "grad_norm": 0.4786258935928345, + "learning_rate": 2.0256604046318963e-05, + "loss": 0.0385, "step": 23310 }, { - "epoch": 1.525678770035983, - "grad_norm": 0.8417108654975891, - "learning_rate": 9.827984778365045e-05, - "loss": 0.1001, + "epoch": 6.101537455021263, + "grad_norm": 0.2971251308917999, + "learning_rate": 2.0236312734307367e-05, + "loss": 0.031, "step": 23320 }, { - "epoch": 1.5263330062152436, - "grad_norm": 0.8357135653495789, - "learning_rate": 9.827745822581128e-05, - "loss": 0.1045, + "epoch": 6.104154399738306, + "grad_norm": 0.44764214754104614, + "learning_rate": 2.021602467823109e-05, + "loss": 0.0342, "step": 23330 }, { - "epoch": 1.5269872423945046, - "grad_norm": 0.7510766386985779, - "learning_rate": 9.827506703848128e-05, - "loss": 0.1021, + "epoch": 6.106771344455348, + "grad_norm": 0.3997710049152374, + "learning_rate": 2.0195739891956838e-05, + "loss": 0.0359, "step": 23340 }, { - "epoch": 1.527641478573765, - "grad_norm": 1.0681124925613403, - "learning_rate": 9.827267422174115e-05, - "loss": 0.0953, + "epoch": 6.109388289172391, + "grad_norm": 0.5370054244995117, + "learning_rate": 2.0175458389349077e-05, + "loss": 0.0353, "step": 23350 }, { - "epoch": 1.5282957147530258, - "grad_norm": 0.8562158942222595, - "learning_rate": 9.82702797756717e-05, - "loss": 0.0992, + "epoch": 6.112005233889434, + "grad_norm": 0.5448628067970276, + "learning_rate": 2.0155180184270003e-05, + "loss": 0.0347, "step": 23360 }, { - "epoch": 1.5289499509322866, - "grad_norm": 0.9547275900840759, - "learning_rate": 9.826788370035368e-05, - "loss": 0.1004, + "epoch": 6.114622178606477, + "grad_norm": 0.5665889978408813, + "learning_rate": 2.013490529057959e-05, + "loss": 0.0358, "step": 23370 }, { - "epoch": 1.529604187111547, - "grad_norm": 0.8317331075668335, - "learning_rate": 9.826548599586802e-05, - "loss": 0.1016, + "epoch": 6.11723912332352, + "grad_norm": 0.3886018693447113, + "learning_rate": 2.011463372213554e-05, + "loss": 0.0355, "step": 23380 }, { - "epoch": 1.530258423290808, - "grad_norm": 0.9036648273468018, - "learning_rate": 9.82630866622956e-05, - "loss": 0.1068, + "epoch": 6.119856068040563, + "grad_norm": 0.3535195589065552, + "learning_rate": 2.009436549279327e-05, + "loss": 0.0353, "step": 23390 }, { - "epoch": 1.5309126594700686, - "grad_norm": 0.9889810085296631, - "learning_rate": 9.826068569971745e-05, - "loss": 0.0998, + "epoch": 6.122473012757606, + "grad_norm": 0.26383087038993835, + "learning_rate": 2.007410061640593e-05, + "loss": 0.0333, "step": 23400 }, { - "epoch": 1.5315668956493296, - "grad_norm": 0.8592228293418884, - "learning_rate": 9.825828310821459e-05, - "loss": 0.0989, + "epoch": 6.125089957474648, + "grad_norm": 0.46765440702438354, + "learning_rate": 2.0053839106824368e-05, + "loss": 0.0379, "step": 23410 }, { - "epoch": 1.53222113182859, - "grad_norm": 0.8532178401947021, - "learning_rate": 9.82558788878681e-05, - "loss": 0.097, + "epoch": 6.127706902191691, + "grad_norm": 0.7617926597595215, + "learning_rate": 2.003358097789714e-05, + "loss": 0.0375, "step": 23420 }, { - "epoch": 1.5328753680078508, - "grad_norm": 0.9547865390777588, - "learning_rate": 9.825347303875916e-05, - "loss": 0.0851, + "epoch": 6.130323846908734, + "grad_norm": 0.4366847276687622, + "learning_rate": 2.001332624347048e-05, + "loss": 0.0352, "step": 23430 }, { - "epoch": 1.5335296041871116, - "grad_norm": 0.833165168762207, - "learning_rate": 9.825106556096894e-05, - "loss": 0.1017, + "epoch": 6.132940791625777, + "grad_norm": 0.4177016317844391, + "learning_rate": 1.999307491738832e-05, + "loss": 0.0326, "step": 23440 }, { - "epoch": 1.5341838403663721, - "grad_norm": 1.1163369417190552, - "learning_rate": 9.824865645457872e-05, - "loss": 0.0899, + "epoch": 6.13555773634282, + "grad_norm": 0.4015594720840454, + "learning_rate": 1.997282701349224e-05, + "loss": 0.0351, "step": 23450 }, { - "epoch": 1.534838076545633, - "grad_norm": 0.8560509085655212, - "learning_rate": 9.824624571966981e-05, - "loss": 0.0965, + "epoch": 6.138174681059863, + "grad_norm": 0.24507103860378265, + "learning_rate": 1.9952582545621487e-05, + "loss": 0.0334, "step": 23460 }, { - "epoch": 1.5354923127248936, - "grad_norm": 0.8199770450592041, - "learning_rate": 9.824383335632357e-05, - "loss": 0.0981, + "epoch": 6.140791625776905, + "grad_norm": 0.2932671010494232, + "learning_rate": 1.9932341527612968e-05, + "loss": 0.0396, "step": 23470 }, { - "epoch": 1.5361465489041544, - "grad_norm": 0.8420577645301819, - "learning_rate": 9.824141936462144e-05, - "loss": 0.0892, + "epoch": 6.143408570493948, + "grad_norm": 0.4256709814071655, + "learning_rate": 1.9912103973301236e-05, + "loss": 0.0403, "step": 23480 }, { - "epoch": 1.536800785083415, - "grad_norm": 0.8969340324401855, - "learning_rate": 9.823900374464487e-05, - "loss": 0.1015, + "epoch": 6.146025515210991, + "grad_norm": 0.4459935128688812, + "learning_rate": 1.9891869896518455e-05, + "loss": 0.0316, "step": 23490 }, { - "epoch": 1.5374550212626759, - "grad_norm": 1.0403921604156494, - "learning_rate": 9.823658649647544e-05, - "loss": 0.0966, + "epoch": 6.148642459928034, + "grad_norm": 0.3941150903701782, + "learning_rate": 1.987163931109444e-05, + "loss": 0.0355, "step": 23500 }, { - "epoch": 1.5381092574419366, - "grad_norm": 0.866157591342926, - "learning_rate": 9.82341676201947e-05, - "loss": 0.098, + "epoch": 6.151259404645077, + "grad_norm": 0.3339027166366577, + "learning_rate": 1.985141223085659e-05, + "loss": 0.0358, "step": 23510 }, { - "epoch": 1.5387634936211971, - "grad_norm": 0.8696637153625488, - "learning_rate": 9.82317471158843e-05, - "loss": 0.0963, + "epoch": 6.15387634936212, + "grad_norm": 0.27572101354599, + "learning_rate": 1.983118866962994e-05, + "loss": 0.0369, "step": 23520 }, { - "epoch": 1.539417729800458, - "grad_norm": 0.7463034987449646, - "learning_rate": 9.822932498362593e-05, - "loss": 0.1031, + "epoch": 6.156493294079162, + "grad_norm": 0.5284891128540039, + "learning_rate": 1.981096864123709e-05, + "loss": 0.0408, "step": 23530 }, { - "epoch": 1.5400719659797186, - "grad_norm": 0.8361960053443909, - "learning_rate": 9.822690122350138e-05, - "loss": 0.0929, + "epoch": 6.159110238796205, + "grad_norm": 0.4090319275856018, + "learning_rate": 1.9790752159498255e-05, + "loss": 0.0364, "step": 23540 }, { - "epoch": 1.5407262021589794, - "grad_norm": 0.8186551928520203, - "learning_rate": 9.822447583559242e-05, - "loss": 0.1008, + "epoch": 6.161727183513248, + "grad_norm": 0.39981624484062195, + "learning_rate": 1.977053923823119e-05, + "loss": 0.0425, "step": 23550 }, { - "epoch": 1.5413804383382401, - "grad_norm": 0.8494019508361816, - "learning_rate": 9.822204881998093e-05, - "loss": 0.0876, + "epoch": 6.164344128230291, + "grad_norm": 0.48455047607421875, + "learning_rate": 1.9750329891251244e-05, + "loss": 0.0376, "step": 23560 }, { - "epoch": 1.5420346745175009, - "grad_norm": 0.9531555771827698, - "learning_rate": 9.821962017674881e-05, - "loss": 0.094, + "epoch": 6.166961072947334, + "grad_norm": 0.4755387008190155, + "learning_rate": 1.9730124132371312e-05, + "loss": 0.0379, "step": 23570 }, { - "epoch": 1.5426889106967616, - "grad_norm": 0.6968259215354919, - "learning_rate": 9.821718990597808e-05, - "loss": 0.0877, + "epoch": 6.169578017664377, + "grad_norm": 0.35988909006118774, + "learning_rate": 1.9709921975401854e-05, + "loss": 0.0317, "step": 23580 }, { - "epoch": 1.5433431468760221, - "grad_norm": 0.864067018032074, - "learning_rate": 9.821475800775072e-05, - "loss": 0.0921, + "epoch": 6.17219496238142, + "grad_norm": 0.38816285133361816, + "learning_rate": 1.9689723434150835e-05, + "loss": 0.0365, "step": 23590 }, { - "epoch": 1.5439973830552831, - "grad_norm": 0.6976091861724854, - "learning_rate": 9.821232448214883e-05, - "loss": 0.0921, + "epoch": 6.174811907098462, + "grad_norm": 0.40134334564208984, + "learning_rate": 1.966952852242378e-05, + "loss": 0.0367, "step": 23600 }, { - "epoch": 1.5446516192345436, - "grad_norm": 0.83482426404953, - "learning_rate": 9.820988932925455e-05, - "loss": 0.1032, + "epoch": 6.177428851815505, + "grad_norm": 0.32368433475494385, + "learning_rate": 1.9649337254023713e-05, + "loss": 0.0328, "step": 23610 }, { - "epoch": 1.5453058554138044, - "grad_norm": 0.8034452199935913, - "learning_rate": 9.820745254915005e-05, - "loss": 0.0894, + "epoch": 6.180045796532548, + "grad_norm": 0.4156053066253662, + "learning_rate": 1.9629149642751185e-05, + "loss": 0.0357, "step": 23620 }, { - "epoch": 1.5459600915930651, - "grad_norm": 0.7795824408531189, - "learning_rate": 9.820501414191763e-05, - "loss": 0.0911, + "epoch": 6.182662741249591, + "grad_norm": 0.4975856840610504, + "learning_rate": 1.9608965702404236e-05, + "loss": 0.0358, "step": 23630 }, { - "epoch": 1.5466143277723257, - "grad_norm": 0.7786241769790649, - "learning_rate": 9.820257410763953e-05, - "loss": 0.0885, + "epoch": 6.185279685966634, + "grad_norm": 0.4767300486564636, + "learning_rate": 1.9588785446778384e-05, + "loss": 0.033, "step": 23640 }, { - "epoch": 1.5472685639515866, - "grad_norm": 0.8625045418739319, - "learning_rate": 9.820013244639816e-05, - "loss": 0.0974, + "epoch": 6.187896630683677, + "grad_norm": 0.3882130980491638, + "learning_rate": 1.9568608889666663e-05, + "loss": 0.0347, "step": 23650 }, { - "epoch": 1.5479228001308472, - "grad_norm": 0.7645002603530884, - "learning_rate": 9.81976891582759e-05, - "loss": 0.097, + "epoch": 6.19051357540072, + "grad_norm": 0.2682572603225708, + "learning_rate": 1.9548436044859542e-05, + "loss": 0.04, "step": 23660 }, { - "epoch": 1.548577036310108, - "grad_norm": 0.978847861289978, - "learning_rate": 9.819524424335524e-05, - "loss": 0.0958, + "epoch": 6.193130520117762, + "grad_norm": 0.4771081209182739, + "learning_rate": 1.952826692614498e-05, + "loss": 0.042, "step": 23670 }, { - "epoch": 1.5492312724893686, - "grad_norm": 0.9067314267158508, - "learning_rate": 9.819279770171867e-05, - "loss": 0.0973, + "epoch": 6.195747464834805, + "grad_norm": 0.3116135001182556, + "learning_rate": 1.9508101547308384e-05, + "loss": 0.0311, "step": 23680 }, { - "epoch": 1.5498855086686294, - "grad_norm": 0.8743839263916016, - "learning_rate": 9.819034953344881e-05, - "loss": 0.1002, + "epoch": 6.198364409551848, + "grad_norm": 0.24202032387256622, + "learning_rate": 1.948793992213259e-05, + "loss": 0.031, "step": 23690 }, { - "epoch": 1.5505397448478901, - "grad_norm": 0.7451719641685486, - "learning_rate": 9.818789973862825e-05, - "loss": 0.0889, + "epoch": 6.200981354268891, + "grad_norm": 0.4628809988498688, + "learning_rate": 1.9467782064397886e-05, + "loss": 0.0355, "step": 23700 }, { - "epoch": 1.5511939810271507, - "grad_norm": 0.833907961845398, - "learning_rate": 9.818544831733971e-05, - "loss": 0.092, + "epoch": 6.203598298985934, + "grad_norm": 0.2606217563152313, + "learning_rate": 1.9447627987881974e-05, + "loss": 0.0342, "step": 23710 }, { - "epoch": 1.5518482172064116, - "grad_norm": 0.8655661344528198, - "learning_rate": 9.81829952696659e-05, - "loss": 0.0924, + "epoch": 6.206215243702977, + "grad_norm": 0.4413122832775116, + "learning_rate": 1.9427477706359982e-05, + "loss": 0.0395, "step": 23720 }, { - "epoch": 1.5525024533856722, - "grad_norm": 0.8889705538749695, - "learning_rate": 9.818054059568966e-05, - "loss": 0.0926, + "epoch": 6.208832188420019, + "grad_norm": 0.26698562502861023, + "learning_rate": 1.9407331233604434e-05, + "loss": 0.0294, "step": 23730 }, { - "epoch": 1.553156689564933, - "grad_norm": 0.9665441513061523, - "learning_rate": 9.81780842954938e-05, - "loss": 0.105, + "epoch": 6.211449133137062, + "grad_norm": 0.3271200358867645, + "learning_rate": 1.9387188583385242e-05, + "loss": 0.0362, "step": 23740 }, { - "epoch": 1.5538109257441937, - "grad_norm": 0.8035663962364197, - "learning_rate": 9.817562636916125e-05, - "loss": 0.1007, + "epoch": 6.214066077854105, + "grad_norm": 0.37548309564590454, + "learning_rate": 1.9367049769469737e-05, + "loss": 0.0383, "step": 23750 }, { - "epoch": 1.5544651619234544, - "grad_norm": 0.8797777891159058, - "learning_rate": 9.817316681677496e-05, - "loss": 0.1106, + "epoch": 6.216683022571148, + "grad_norm": 0.34529152512550354, + "learning_rate": 1.934691480562259e-05, + "loss": 0.0312, "step": 23760 }, { - "epoch": 1.5551193981027152, - "grad_norm": 0.799263060092926, - "learning_rate": 9.817070563841795e-05, - "loss": 0.0978, + "epoch": 6.219299967288191, + "grad_norm": 0.24557435512542725, + "learning_rate": 1.9326783705605868e-05, + "loss": 0.034, "step": 23770 }, { - "epoch": 1.5557736342819757, - "grad_norm": 0.8599784970283508, - "learning_rate": 9.81682428341733e-05, - "loss": 0.1008, + "epoch": 6.221916912005234, + "grad_norm": 0.3468078672885895, + "learning_rate": 1.9306656483178993e-05, + "loss": 0.0382, "step": 23780 }, { - "epoch": 1.5564278704612367, - "grad_norm": 0.8856527805328369, - "learning_rate": 9.816577840412414e-05, - "loss": 0.0869, + "epoch": 6.224533856722276, + "grad_norm": 0.419986754655838, + "learning_rate": 1.9286533152098724e-05, + "loss": 0.0343, "step": 23790 }, { - "epoch": 1.5570821066404972, - "grad_norm": 0.6909691691398621, - "learning_rate": 9.81633123483536e-05, - "loss": 0.0982, + "epoch": 6.227150801439319, + "grad_norm": 0.40486636757850647, + "learning_rate": 1.926641372611917e-05, + "loss": 0.0303, "step": 23800 }, { - "epoch": 1.557736342819758, - "grad_norm": 0.8058643937110901, - "learning_rate": 9.8160844666945e-05, - "loss": 0.0944, + "epoch": 6.229767746156362, + "grad_norm": 0.30131152272224426, + "learning_rate": 1.9246298218991773e-05, + "loss": 0.0373, "step": 23810 }, { - "epoch": 1.5583905789990187, - "grad_norm": 1.1985560655593872, - "learning_rate": 9.815837535998156e-05, - "loss": 0.1016, + "epoch": 6.232384690873405, + "grad_norm": 0.3338201642036438, + "learning_rate": 1.9226186644465293e-05, + "loss": 0.0383, "step": 23820 }, { - "epoch": 1.5590448151782792, - "grad_norm": 0.8352839946746826, - "learning_rate": 9.815590442754666e-05, - "loss": 0.1072, + "epoch": 6.235001635590448, + "grad_norm": 0.33441248536109924, + "learning_rate": 1.9206079016285796e-05, + "loss": 0.0371, "step": 23830 }, { - "epoch": 1.5596990513575402, - "grad_norm": 0.9324560761451721, - "learning_rate": 9.815343186972369e-05, - "loss": 0.1034, + "epoch": 6.237618580307491, + "grad_norm": 0.365160197019577, + "learning_rate": 1.918597534819665e-05, + "loss": 0.0372, "step": 23840 }, { - "epoch": 1.5603532875368007, - "grad_norm": 0.8299842476844788, - "learning_rate": 9.81509576865961e-05, - "loss": 0.0935, + "epoch": 6.240235525024534, + "grad_norm": 0.4978759288787842, + "learning_rate": 1.9165875653938543e-05, + "loss": 0.0372, "step": 23850 }, { - "epoch": 1.5610075237160617, - "grad_norm": 0.9215813875198364, - "learning_rate": 9.814848187824742e-05, - "loss": 0.0986, + "epoch": 6.242852469741576, + "grad_norm": 0.37960320711135864, + "learning_rate": 1.91457799472494e-05, + "loss": 0.0372, "step": 23860 }, { - "epoch": 1.5616617598953222, - "grad_norm": 0.8809934854507446, - "learning_rate": 9.81460044447612e-05, - "loss": 0.0994, + "epoch": 6.245469414458619, + "grad_norm": 0.4017830193042755, + "learning_rate": 1.9125688241864464e-05, + "loss": 0.0326, "step": 23870 }, { - "epoch": 1.562315996074583, - "grad_norm": 0.8337189555168152, - "learning_rate": 9.814352538622106e-05, - "loss": 0.09, + "epoch": 6.248086359175662, + "grad_norm": 0.36038631200790405, + "learning_rate": 1.9105600551516232e-05, + "loss": 0.0338, "step": 23880 }, { - "epoch": 1.5629702322538437, - "grad_norm": 0.8376109004020691, - "learning_rate": 9.814104470271068e-05, - "loss": 0.103, + "epoch": 6.250703303892705, + "grad_norm": 0.5656988024711609, + "learning_rate": 1.9085516889934433e-05, + "loss": 0.0334, "step": 23890 }, { - "epoch": 1.5636244684331042, - "grad_norm": 0.7608706951141357, - "learning_rate": 9.813856239431378e-05, - "loss": 0.1002, + "epoch": 6.253320248609748, + "grad_norm": 0.6395511627197266, + "learning_rate": 1.9065437270846076e-05, + "loss": 0.038, "step": 23900 }, { - "epoch": 1.5642787046123652, - "grad_norm": 0.7375048995018005, - "learning_rate": 9.813607846111416e-05, - "loss": 0.0901, + "epoch": 6.255937193326791, + "grad_norm": 0.30840107798576355, + "learning_rate": 1.904536170797539e-05, + "loss": 0.0354, "step": 23910 }, { - "epoch": 1.5649329407916257, - "grad_norm": 0.7545016407966614, - "learning_rate": 9.813359290319563e-05, - "loss": 0.1055, + "epoch": 6.258554138043833, + "grad_norm": 0.29775843024253845, + "learning_rate": 1.9025290215043818e-05, + "loss": 0.0313, "step": 23920 }, { - "epoch": 1.5655871769708865, - "grad_norm": 0.7844988703727722, - "learning_rate": 9.813110572064212e-05, - "loss": 0.0992, + "epoch": 6.261171082760876, + "grad_norm": 0.28461727499961853, + "learning_rate": 1.9005222805770048e-05, + "loss": 0.0315, "step": 23930 }, { - "epoch": 1.5662414131501472, - "grad_norm": 0.7850669622421265, - "learning_rate": 9.812861691353757e-05, - "loss": 0.0912, + "epoch": 6.263788027477919, + "grad_norm": 0.2945394814014435, + "learning_rate": 1.898515949386996e-05, + "loss": 0.0353, "step": 23940 }, { - "epoch": 1.566895649329408, - "grad_norm": 1.1129896640777588, - "learning_rate": 9.812612648196598e-05, - "loss": 0.0932, + "epoch": 6.266404972194962, + "grad_norm": 0.41512030363082886, + "learning_rate": 1.8965100293056644e-05, + "loss": 0.0352, "step": 23950 }, { - "epoch": 1.5675498855086687, - "grad_norm": 0.9184969663619995, - "learning_rate": 9.81236344260114e-05, - "loss": 0.1044, + "epoch": 6.269021916912005, + "grad_norm": 0.4127851724624634, + "learning_rate": 1.894504521704037e-05, + "loss": 0.0386, "step": 23960 }, { - "epoch": 1.5682041216879292, - "grad_norm": 0.790431559085846, - "learning_rate": 9.812114074575793e-05, - "loss": 0.1014, + "epoch": 6.271638861629048, + "grad_norm": 0.5242753624916077, + "learning_rate": 1.8924994279528597e-05, + "loss": 0.0391, "step": 23970 }, { - "epoch": 1.5688583578671902, - "grad_norm": 0.7852354645729065, - "learning_rate": 9.811864544128978e-05, - "loss": 0.0822, + "epoch": 6.2742558063460905, + "grad_norm": 0.41487598419189453, + "learning_rate": 1.890494749422595e-05, + "loss": 0.0358, "step": 23980 }, { - "epoch": 1.5695125940464507, - "grad_norm": 0.7458804249763489, - "learning_rate": 9.811614851269114e-05, - "loss": 0.0996, + "epoch": 6.2768727510631335, + "grad_norm": 0.3981493413448334, + "learning_rate": 1.8884904874834216e-05, + "loss": 0.035, "step": 23990 }, { - "epoch": 1.5701668302257115, - "grad_norm": 0.7905547022819519, - "learning_rate": 9.81136499600463e-05, - "loss": 0.0905, + "epoch": 6.2794896957801765, + "grad_norm": 0.315698504447937, + "learning_rate": 1.886486643505234e-05, + "loss": 0.0308, "step": 24000 }, { - "epoch": 1.5708210664049722, - "grad_norm": 0.8950849771499634, - "learning_rate": 9.811114978343961e-05, - "loss": 0.0945, + "epoch": 6.2794896957801765, + "eval_loss": 0.043558861109126004, + "eval_runtime": 9.1848, + "eval_samples_per_second": 111.489, + "eval_steps_per_second": 1.742, + "step": 24000 + }, + { + "epoch": 6.2821066404972195, + "grad_norm": 0.4844658374786377, + "learning_rate": 1.8844832188576416e-05, + "loss": 0.0406, "step": 24010 }, { - "epoch": 1.571475302584233, - "grad_norm": 0.9239217638969421, - "learning_rate": 9.810864798295541e-05, - "loss": 0.0896, + "epoch": 6.284723585214262, + "grad_norm": 0.3282155990600586, + "learning_rate": 1.8824802149099637e-05, + "loss": 0.0365, "step": 24020 }, { - "epoch": 1.5721295387634937, - "grad_norm": 0.8260421752929688, - "learning_rate": 9.810614455867818e-05, - "loss": 0.0826, + "epoch": 6.287340529931305, + "grad_norm": 0.3532108962535858, + "learning_rate": 1.8804776330312364e-05, + "loss": 0.0312, "step": 24030 }, { - "epoch": 1.5727837749427542, - "grad_norm": 0.8214983940124512, - "learning_rate": 9.810363951069241e-05, - "loss": 0.0918, + "epoch": 6.289957474648348, + "grad_norm": 0.39708057045936584, + "learning_rate": 1.878475474590205e-05, + "loss": 0.032, "step": 24040 }, { - "epoch": 1.5734380111220152, - "grad_norm": 0.7926087975502014, - "learning_rate": 9.810113283908266e-05, - "loss": 0.0858, + "epoch": 6.2925744193653905, + "grad_norm": 0.4074369966983795, + "learning_rate": 1.876473740955326e-05, + "loss": 0.0335, "step": 24050 }, { - "epoch": 1.5740922473012757, - "grad_norm": 0.8653861284255981, - "learning_rate": 9.809862454393352e-05, - "loss": 0.1005, + "epoch": 6.2951913640824335, + "grad_norm": 0.42009133100509644, + "learning_rate": 1.8744724334947662e-05, + "loss": 0.0336, "step": 24060 }, { - "epoch": 1.5747464834805365, - "grad_norm": 0.846734881401062, - "learning_rate": 9.809611462532964e-05, - "loss": 0.0908, + "epoch": 6.2978083087994765, + "grad_norm": 0.48954713344573975, + "learning_rate": 1.872471553576399e-05, + "loss": 0.0358, "step": 24070 }, { - "epoch": 1.5754007196597972, - "grad_norm": 1.0754064321517944, - "learning_rate": 9.809360308335578e-05, - "loss": 0.1067, + "epoch": 6.3004252535165195, + "grad_norm": 0.391671746969223, + "learning_rate": 1.8704711025678082e-05, + "loss": 0.0335, "step": 24080 }, { - "epoch": 1.5760549558390577, - "grad_norm": 0.7325643301010132, - "learning_rate": 9.809108991809668e-05, - "loss": 0.0927, + "epoch": 6.3030421982335625, + "grad_norm": 0.6001835465431213, + "learning_rate": 1.868471081836282e-05, + "loss": 0.0343, "step": 24090 }, { - "epoch": 1.5767091920183187, - "grad_norm": 0.8486270904541016, - "learning_rate": 9.808857512963717e-05, - "loss": 0.1034, + "epoch": 6.3056591429506055, + "grad_norm": 0.40074849128723145, + "learning_rate": 1.866471492748818e-05, + "loss": 0.0352, "step": 24100 }, { - "epoch": 1.5773634281975792, - "grad_norm": 0.8908068537712097, - "learning_rate": 9.808605871806213e-05, - "loss": 0.1011, + "epoch": 6.3082760876676485, + "grad_norm": 0.3913089632987976, + "learning_rate": 1.864472336672114e-05, + "loss": 0.0326, "step": 24110 }, { - "epoch": 1.57801766437684, - "grad_norm": 0.9737763404846191, - "learning_rate": 9.808354068345649e-05, - "loss": 0.1027, + "epoch": 6.310893032384691, + "grad_norm": 0.26605844497680664, + "learning_rate": 1.862473614972575e-05, + "loss": 0.0331, "step": 24120 }, { - "epoch": 1.5786719005561007, - "grad_norm": 0.9432488083839417, - "learning_rate": 9.808102102590526e-05, - "loss": 0.1, + "epoch": 6.313509977101734, + "grad_norm": 0.3286307156085968, + "learning_rate": 1.8604753290163086e-05, + "loss": 0.0325, "step": 24130 }, { - "epoch": 1.5793261367353615, - "grad_norm": 0.8621255159378052, - "learning_rate": 9.807849974549347e-05, - "loss": 0.0851, + "epoch": 6.3161269218187766, + "grad_norm": 0.30002516508102417, + "learning_rate": 1.8584774801691244e-05, + "loss": 0.0358, "step": 24140 }, { - "epoch": 1.5799803729146222, - "grad_norm": 0.8411361575126648, - "learning_rate": 9.807597684230623e-05, - "loss": 0.106, + "epoch": 6.3187438665358195, + "grad_norm": 0.39178580045700073, + "learning_rate": 1.856480069796533e-05, + "loss": 0.0296, "step": 24150 }, { - "epoch": 1.5806346090938828, - "grad_norm": 0.8948150873184204, - "learning_rate": 9.807345231642868e-05, - "loss": 0.1006, + "epoch": 6.3213608112528625, + "grad_norm": 0.2837730050086975, + "learning_rate": 1.8544830992637465e-05, + "loss": 0.0333, "step": 24160 }, { - "epoch": 1.5812888452731437, - "grad_norm": 1.042441725730896, - "learning_rate": 9.807092616794605e-05, - "loss": 0.1131, + "epoch": 6.3239777559699055, + "grad_norm": 0.342427521944046, + "learning_rate": 1.8524865699356745e-05, + "loss": 0.0344, "step": 24170 }, { - "epoch": 1.5819430814524043, - "grad_norm": 1.0746121406555176, - "learning_rate": 9.806839839694358e-05, - "loss": 0.0919, + "epoch": 6.326594700686948, + "grad_norm": 0.35260245203971863, + "learning_rate": 1.8504904831769265e-05, + "loss": 0.0341, "step": 24180 }, { - "epoch": 1.582597317631665, - "grad_norm": 0.8730924725532532, - "learning_rate": 9.806586900350658e-05, - "loss": 0.0961, + "epoch": 6.329211645403991, + "grad_norm": 0.37071171402931213, + "learning_rate": 1.8484948403518095e-05, + "loss": 0.034, "step": 24190 }, { - "epoch": 1.5832515538109257, - "grad_norm": 0.7895834445953369, - "learning_rate": 9.806333798772047e-05, - "loss": 0.0892, + "epoch": 6.331828590121034, + "grad_norm": 0.32197433710098267, + "learning_rate": 1.846499642824325e-05, + "loss": 0.0374, "step": 24200 }, { - "epoch": 1.5839057899901865, - "grad_norm": 0.786361038684845, - "learning_rate": 9.806080534967065e-05, - "loss": 0.0953, + "epoch": 6.334445534838077, + "grad_norm": 0.31927716732025146, + "learning_rate": 1.8445048919581724e-05, + "loss": 0.0317, "step": 24210 }, { - "epoch": 1.5845600261694472, - "grad_norm": 0.8377049565315247, - "learning_rate": 9.80582710894426e-05, - "loss": 0.1049, + "epoch": 6.33706247955512, + "grad_norm": 0.3790701925754547, + "learning_rate": 1.8425105891167448e-05, + "loss": 0.0365, "step": 24220 }, { - "epoch": 1.5852142623487078, - "grad_norm": 0.7862370014190674, - "learning_rate": 9.805573520712186e-05, - "loss": 0.1034, + "epoch": 6.339679424272163, + "grad_norm": 0.47591888904571533, + "learning_rate": 1.8405167356631304e-05, + "loss": 0.0363, "step": 24230 }, { - "epoch": 1.5858684985279687, - "grad_norm": 0.8017224073410034, - "learning_rate": 9.805319770279404e-05, - "loss": 0.0996, + "epoch": 6.342296368989205, + "grad_norm": 0.27775129675865173, + "learning_rate": 1.838523332960108e-05, + "loss": 0.0331, "step": 24240 }, { - "epoch": 1.5865227347072293, - "grad_norm": 0.8779584765434265, - "learning_rate": 9.805065857654476e-05, - "loss": 0.1031, + "epoch": 6.344913313706248, + "grad_norm": 0.4258742928504944, + "learning_rate": 1.8365303823701503e-05, + "loss": 0.0286, "step": 24250 }, { - "epoch": 1.58717697088649, - "grad_norm": 0.8980815410614014, - "learning_rate": 9.804811782845974e-05, - "loss": 0.0956, + "epoch": 6.347530258423291, + "grad_norm": 0.2823595404624939, + "learning_rate": 1.8345378852554208e-05, + "loss": 0.0351, "step": 24260 }, { - "epoch": 1.5878312070657508, - "grad_norm": 1.0139700174331665, - "learning_rate": 9.804557545862474e-05, - "loss": 0.0996, + "epoch": 6.350147203140334, + "grad_norm": 0.3638388216495514, + "learning_rate": 1.832545842977771e-05, + "loss": 0.0333, "step": 24270 }, { - "epoch": 1.5884854432450113, - "grad_norm": 0.8047105073928833, - "learning_rate": 9.804303146712555e-05, - "loss": 0.1042, + "epoch": 6.352764147857377, + "grad_norm": 0.3300917148590088, + "learning_rate": 1.8305542568987448e-05, + "loss": 0.0355, "step": 24280 }, { - "epoch": 1.5891396794242723, - "grad_norm": 0.7328765988349915, - "learning_rate": 9.804048585404806e-05, - "loss": 0.0914, + "epoch": 6.35538109257442, + "grad_norm": 0.3087752163410187, + "learning_rate": 1.8285631283795714e-05, + "loss": 0.032, "step": 24290 }, { - "epoch": 1.5897939156035328, - "grad_norm": 0.866847813129425, - "learning_rate": 9.803793861947816e-05, - "loss": 0.0952, + "epoch": 6.357998037291463, + "grad_norm": 0.32038813829421997, + "learning_rate": 1.8265724587811676e-05, + "loss": 0.0358, "step": 24300 }, { - "epoch": 1.5904481517827938, - "grad_norm": 0.8152327537536621, - "learning_rate": 9.803538976350189e-05, - "loss": 0.0915, + "epoch": 6.360614982008505, + "grad_norm": 0.45256373286247253, + "learning_rate": 1.8245822494641384e-05, + "loss": 0.0428, "step": 24310 }, { - "epoch": 1.5911023879620543, - "grad_norm": 0.7884990572929382, - "learning_rate": 9.80328392862052e-05, - "loss": 0.0914, + "epoch": 6.363231926725548, + "grad_norm": 0.519258975982666, + "learning_rate": 1.822592501788773e-05, + "loss": 0.0378, "step": 24320 }, { - "epoch": 1.591756624141315, - "grad_norm": 1.0479592084884644, - "learning_rate": 9.803028718767423e-05, - "loss": 0.1004, + "epoch": 6.365848871442591, + "grad_norm": 0.5432097315788269, + "learning_rate": 1.8206032171150453e-05, + "loss": 0.0311, "step": 24330 }, { - "epoch": 1.5924108603205758, - "grad_norm": 0.8851988911628723, - "learning_rate": 9.80277334679951e-05, - "loss": 0.0979, + "epoch": 6.368465816159634, + "grad_norm": 0.3527891933917999, + "learning_rate": 1.818614396802612e-05, + "loss": 0.0329, "step": 24340 }, { - "epoch": 1.5930650964998363, - "grad_norm": 0.8954311609268188, - "learning_rate": 9.8025178127254e-05, - "loss": 0.0914, + "epoch": 6.371082760876677, + "grad_norm": 0.6945310235023499, + "learning_rate": 1.8166260422108132e-05, + "loss": 0.0359, "step": 24350 }, { - "epoch": 1.5937193326790973, - "grad_norm": 0.8873519897460938, - "learning_rate": 9.80226211655372e-05, - "loss": 0.0978, + "epoch": 6.37369970559372, + "grad_norm": 0.528319776058197, + "learning_rate": 1.8146381546986712e-05, + "loss": 0.0378, "step": 24360 }, { - "epoch": 1.5943735688583578, - "grad_norm": 0.8193051815032959, - "learning_rate": 9.8020062582931e-05, - "loss": 0.1028, + "epoch": 6.376316650310762, + "grad_norm": 0.4661575257778168, + "learning_rate": 1.8126507356248877e-05, + "loss": 0.0368, "step": 24370 }, { - "epoch": 1.5950278050376185, - "grad_norm": 0.8333913683891296, - "learning_rate": 9.801750237952172e-05, - "loss": 0.0951, + "epoch": 6.378933595027805, + "grad_norm": 0.3477420508861542, + "learning_rate": 1.810663786347846e-05, + "loss": 0.0323, "step": 24380 }, { - "epoch": 1.5956820412168793, - "grad_norm": 0.8090634942054749, - "learning_rate": 9.801494055539584e-05, - "loss": 0.1018, + "epoch": 6.381550539744848, + "grad_norm": 0.38132211565971375, + "learning_rate": 1.8086773082256054e-05, + "loss": 0.0319, "step": 24390 }, { - "epoch": 1.59633627739614, - "grad_norm": 0.8224121332168579, - "learning_rate": 9.801237711063978e-05, - "loss": 0.099, + "epoch": 6.384167484461891, + "grad_norm": 0.41567564010620117, + "learning_rate": 1.8066913026159058e-05, + "loss": 0.0346, "step": 24400 }, { - "epoch": 1.5969905135754008, - "grad_norm": 1.034574031829834, - "learning_rate": 9.800981204534006e-05, - "loss": 0.1089, + "epoch": 6.386784429178934, + "grad_norm": 0.38132917881011963, + "learning_rate": 1.8047057708761637e-05, + "loss": 0.0444, "step": 24410 }, { - "epoch": 1.5976447497546613, - "grad_norm": 0.8645883202552795, - "learning_rate": 9.800724535958328e-05, - "loss": 0.094, + "epoch": 6.389401373895977, + "grad_norm": 0.31965717673301697, + "learning_rate": 1.8027207143634702e-05, + "loss": 0.029, "step": 24420 }, { - "epoch": 1.5982989859339223, - "grad_norm": 0.8233097195625305, - "learning_rate": 9.800467705345607e-05, - "loss": 0.097, + "epoch": 6.392018318613019, + "grad_norm": 0.1878511756658554, + "learning_rate": 1.800736134434594e-05, + "loss": 0.0331, "step": 24430 }, { - "epoch": 1.5989532221131828, - "grad_norm": 0.9189668893814087, - "learning_rate": 9.800210712704512e-05, - "loss": 0.1041, + "epoch": 6.394635263330062, + "grad_norm": 0.32016149163246155, + "learning_rate": 1.798752032445976e-05, + "loss": 0.0393, "step": 24440 }, { - "epoch": 1.5996074582924436, - "grad_norm": 0.8542932271957397, - "learning_rate": 9.799953558043715e-05, - "loss": 0.0997, + "epoch": 6.397252208047105, + "grad_norm": 0.6267549395561218, + "learning_rate": 1.7967684097537318e-05, + "loss": 0.0399, "step": 24450 }, { - "epoch": 1.6002616944717043, - "grad_norm": 0.7710226774215698, - "learning_rate": 9.799696241371898e-05, - "loss": 0.1006, + "epoch": 6.399869152764148, + "grad_norm": 0.5221057534217834, + "learning_rate": 1.7947852677136485e-05, + "loss": 0.0327, "step": 24460 }, { - "epoch": 1.600915930650965, - "grad_norm": 0.937626302242279, - "learning_rate": 9.799438762697744e-05, - "loss": 0.1013, + "epoch": 6.402486097481191, + "grad_norm": 0.3823908865451813, + "learning_rate": 1.7928026076811854e-05, + "loss": 0.0277, "step": 24470 }, { - "epoch": 1.6015701668302258, - "grad_norm": 0.8287016153335571, - "learning_rate": 9.799181122029946e-05, - "loss": 0.0906, + "epoch": 6.405103042198234, + "grad_norm": 0.31546902656555176, + "learning_rate": 1.7908204310114707e-05, + "loss": 0.0381, "step": 24480 }, { - "epoch": 1.6022244030094863, - "grad_norm": 0.8471201062202454, - "learning_rate": 9.798923319377199e-05, - "loss": 0.0886, + "epoch": 6.407719986915277, + "grad_norm": 0.3193889260292053, + "learning_rate": 1.788838739059305e-05, + "loss": 0.0283, "step": 24490 }, { - "epoch": 1.6028786391887473, - "grad_norm": 0.8005667924880981, - "learning_rate": 9.798665354748205e-05, - "loss": 0.0949, + "epoch": 6.410336931632319, + "grad_norm": 0.3339197337627411, + "learning_rate": 1.786857533179154e-05, + "loss": 0.0349, "step": 24500 }, { - "epoch": 1.6035328753680078, - "grad_norm": 0.7234973311424255, - "learning_rate": 9.798407228151667e-05, - "loss": 0.0955, + "epoch": 6.412953876349362, + "grad_norm": 0.3514746427536011, + "learning_rate": 1.7848768147251555e-05, + "loss": 0.034, "step": 24510 }, { - "epoch": 1.6041871115472686, - "grad_norm": 0.8595584630966187, - "learning_rate": 9.798148939596303e-05, - "loss": 0.0885, + "epoch": 6.415570821066405, + "grad_norm": 0.29451289772987366, + "learning_rate": 1.7828965850511104e-05, + "loss": 0.032, "step": 24520 }, { - "epoch": 1.6048413477265293, - "grad_norm": 0.8628395199775696, - "learning_rate": 9.797890489090829e-05, - "loss": 0.0917, + "epoch": 6.418187765783448, + "grad_norm": 0.4204190671443939, + "learning_rate": 1.780916845510488e-05, + "loss": 0.0345, "step": 24530 }, { - "epoch": 1.6054955839057898, - "grad_norm": 0.698773980140686, - "learning_rate": 9.797631876643967e-05, - "loss": 0.0888, + "epoch": 6.420804710500491, + "grad_norm": 0.42853644490242004, + "learning_rate": 1.7789375974564208e-05, + "loss": 0.0308, "step": 24540 }, { - "epoch": 1.6061498200850508, - "grad_norm": 0.7893082499504089, - "learning_rate": 9.797373102264448e-05, - "loss": 0.108, + "epoch": 6.423421655217534, + "grad_norm": 0.35799336433410645, + "learning_rate": 1.7769588422417063e-05, + "loss": 0.0332, "step": 24550 }, { - "epoch": 1.6068040562643113, - "grad_norm": 0.9154890775680542, - "learning_rate": 9.797114165961006e-05, - "loss": 0.1043, + "epoch": 6.426038599934577, + "grad_norm": 0.3346477746963501, + "learning_rate": 1.7749805812188063e-05, + "loss": 0.0308, "step": 24560 }, { - "epoch": 1.607458292443572, - "grad_norm": 0.9815983176231384, - "learning_rate": 9.796855067742378e-05, - "loss": 0.1072, + "epoch": 6.428655544651619, + "grad_norm": 0.3403894901275635, + "learning_rate": 1.773002815739842e-05, + "loss": 0.0353, "step": 24570 }, { - "epoch": 1.6081125286228328, - "grad_norm": 0.8936287760734558, - "learning_rate": 9.796595807617313e-05, - "loss": 0.1025, + "epoch": 6.431272489368662, + "grad_norm": 0.3275148272514343, + "learning_rate": 1.771025547156598e-05, + "loss": 0.0341, "step": 24580 }, { - "epoch": 1.6087667648020936, - "grad_norm": 0.953900933265686, - "learning_rate": 9.796336385594557e-05, - "loss": 0.0932, + "epoch": 6.433889434085705, + "grad_norm": 0.4482429623603821, + "learning_rate": 1.7690487768205182e-05, + "loss": 0.0345, "step": 24590 }, { - "epoch": 1.6094210009813543, - "grad_norm": 0.8990422487258911, - "learning_rate": 9.796076801682871e-05, - "loss": 0.0958, + "epoch": 6.436506378802748, + "grad_norm": 0.27314290404319763, + "learning_rate": 1.7670725060827074e-05, + "loss": 0.0307, "step": 24600 }, { - "epoch": 1.6100752371606148, - "grad_norm": 0.7732602953910828, - "learning_rate": 9.795817055891016e-05, - "loss": 0.0885, + "epoch": 6.439123323519791, + "grad_norm": 0.4241769015789032, + "learning_rate": 1.7650967362939273e-05, + "loss": 0.0321, "step": 24610 }, { - "epoch": 1.6107294733398758, - "grad_norm": 0.913487434387207, - "learning_rate": 9.795557148227756e-05, - "loss": 0.1008, + "epoch": 6.441740268236834, + "grad_norm": 0.3373008668422699, + "learning_rate": 1.7631214688045984e-05, + "loss": 0.0304, "step": 24620 }, { - "epoch": 1.6113837095191363, - "grad_norm": 0.9833926558494568, - "learning_rate": 9.795297078701867e-05, - "loss": 0.0969, + "epoch": 6.444357212953876, + "grad_norm": 0.4360610842704773, + "learning_rate": 1.7611467049647974e-05, + "loss": 0.036, "step": 24630 }, { - "epoch": 1.612037945698397, - "grad_norm": 0.9888587594032288, - "learning_rate": 9.795036847322124e-05, - "loss": 0.1004, + "epoch": 6.446974157670919, + "grad_norm": 0.39723801612854004, + "learning_rate": 1.7591724461242564e-05, + "loss": 0.0305, "step": 24640 }, { - "epoch": 1.6126921818776578, - "grad_norm": 0.7544565796852112, - "learning_rate": 9.794776454097314e-05, - "loss": 0.0883, + "epoch": 6.449591102387962, + "grad_norm": 0.3017699718475342, + "learning_rate": 1.7571986936323626e-05, + "loss": 0.0283, "step": 24650 }, { - "epoch": 1.6133464180569186, - "grad_norm": 0.8678017258644104, - "learning_rate": 9.794515899036222e-05, - "loss": 0.0974, + "epoch": 6.452208047105005, + "grad_norm": 0.2911607623100281, + "learning_rate": 1.7552254488381588e-05, + "loss": 0.0337, "step": 24660 }, { - "epoch": 1.6140006542361793, - "grad_norm": 1.1111491918563843, - "learning_rate": 9.794255182147644e-05, - "loss": 0.0926, + "epoch": 6.454824991822048, + "grad_norm": 0.39926955103874207, + "learning_rate": 1.753252713090337e-05, + "loss": 0.0337, "step": 24670 }, { - "epoch": 1.6146548904154399, - "grad_norm": 0.8813068270683289, - "learning_rate": 9.793994303440382e-05, - "loss": 0.0952, + "epoch": 6.457441936539091, + "grad_norm": 0.36836519837379456, + "learning_rate": 1.751280487737244e-05, + "loss": 0.0306, "step": 24680 }, { - "epoch": 1.6153091265947008, - "grad_norm": 0.8787200450897217, - "learning_rate": 9.793733262923238e-05, - "loss": 0.1085, + "epoch": 6.460058881256133, + "grad_norm": 0.3723924160003662, + "learning_rate": 1.7493087741268783e-05, + "loss": 0.0306, "step": 24690 }, { - "epoch": 1.6159633627739614, - "grad_norm": 0.7039420008659363, - "learning_rate": 9.793472060605024e-05, - "loss": 0.099, + "epoch": 6.462675825973176, + "grad_norm": 0.27300289273262024, + "learning_rate": 1.7473375736068862e-05, + "loss": 0.033, "step": 24700 }, { - "epoch": 1.616617598953222, - "grad_norm": 0.7518815994262695, - "learning_rate": 9.793210696494559e-05, - "loss": 0.0981, + "epoch": 6.465292770690219, + "grad_norm": 0.4438270628452301, + "learning_rate": 1.745366887524566e-05, + "loss": 0.0359, "step": 24710 }, { - "epoch": 1.6172718351324828, - "grad_norm": 0.8269907832145691, - "learning_rate": 9.79294917060066e-05, - "loss": 0.089, + "epoch": 6.467909715407262, + "grad_norm": 0.4966143071651459, + "learning_rate": 1.7433967172268618e-05, + "loss": 0.0328, "step": 24720 }, { - "epoch": 1.6179260713117434, - "grad_norm": 0.8977356553077698, - "learning_rate": 9.792687482932158e-05, - "loss": 0.0868, + "epoch": 6.470526660124305, + "grad_norm": 0.3728248178958893, + "learning_rate": 1.7414270640603674e-05, + "loss": 0.0261, "step": 24730 }, { - "epoch": 1.6185803074910043, - "grad_norm": 0.7739750742912292, - "learning_rate": 9.792425633497883e-05, - "loss": 0.0873, + "epoch": 6.473143604841348, + "grad_norm": 0.530876874923706, + "learning_rate": 1.7394579293713215e-05, + "loss": 0.0329, "step": 24740 }, { - "epoch": 1.6192345436702649, - "grad_norm": 0.9570137858390808, - "learning_rate": 9.792163622306676e-05, - "loss": 0.091, + "epoch": 6.475760549558391, + "grad_norm": 0.46062782406806946, + "learning_rate": 1.7374893145056103e-05, + "loss": 0.0331, "step": 24750 }, { - "epoch": 1.6198887798495258, - "grad_norm": 0.817414402961731, - "learning_rate": 9.791901449367378e-05, - "loss": 0.0953, + "epoch": 6.478377494275433, + "grad_norm": 0.35258668661117554, + "learning_rate": 1.735521220808762e-05, + "loss": 0.0312, "step": 24760 }, { - "epoch": 1.6205430160287864, - "grad_norm": 0.7275998592376709, - "learning_rate": 9.791639114688837e-05, - "loss": 0.1121, + "epoch": 6.480994438992476, + "grad_norm": 0.2958410084247589, + "learning_rate": 1.733553649625951e-05, + "loss": 0.0345, "step": 24770 }, { - "epoch": 1.6211972522080471, - "grad_norm": 0.828654944896698, - "learning_rate": 9.791376618279913e-05, - "loss": 0.0982, + "epoch": 6.483611383709519, + "grad_norm": 0.40343642234802246, + "learning_rate": 1.731586602301992e-05, + "loss": 0.0372, "step": 24780 }, { - "epoch": 1.6218514883873079, - "grad_norm": 1.096555471420288, - "learning_rate": 9.791113960149458e-05, - "loss": 0.0948, + "epoch": 6.486228328426562, + "grad_norm": 0.29527971148490906, + "learning_rate": 1.7296200801813462e-05, + "loss": 0.0329, "step": 24790 }, { - "epoch": 1.6225057245665684, - "grad_norm": 0.7662184834480286, - "learning_rate": 9.790851140306345e-05, - "loss": 0.0966, + "epoch": 6.488845273143605, + "grad_norm": 0.36426976323127747, + "learning_rate": 1.72765408460811e-05, + "loss": 0.0335, "step": 24800 }, { - "epoch": 1.6231599607458294, - "grad_norm": 0.7542035579681396, - "learning_rate": 9.790588158759441e-05, - "loss": 0.1002, + "epoch": 6.491462217860648, + "grad_norm": 0.3566276729106903, + "learning_rate": 1.7256886169260255e-05, + "loss": 0.0318, "step": 24810 }, { - "epoch": 1.6238141969250899, - "grad_norm": 0.8306244015693665, - "learning_rate": 9.790325015517622e-05, - "loss": 0.0923, + "epoch": 6.49407916257769, + "grad_norm": 0.4963986277580261, + "learning_rate": 1.7237236784784693e-05, + "loss": 0.0363, "step": 24820 }, { - "epoch": 1.6244684331043506, - "grad_norm": 0.8431245684623718, - "learning_rate": 9.790061710589771e-05, - "loss": 0.092, + "epoch": 6.496696107294733, + "grad_norm": 0.7406085133552551, + "learning_rate": 1.721759270608459e-05, + "loss": 0.0375, "step": 24830 }, { - "epoch": 1.6251226692836114, - "grad_norm": 1.4137808084487915, - "learning_rate": 9.789798243984775e-05, - "loss": 0.0905, + "epoch": 6.499313052011776, + "grad_norm": 0.4087287485599518, + "learning_rate": 1.7197953946586497e-05, + "loss": 0.0301, "step": 24840 }, { - "epoch": 1.6257769054628721, - "grad_norm": 0.9978521466255188, - "learning_rate": 9.789534615711527e-05, - "loss": 0.0955, + "epoch": 6.501929996728819, + "grad_norm": 0.3173830211162567, + "learning_rate": 1.7178320519713303e-05, + "loss": 0.0323, "step": 24850 }, { - "epoch": 1.6264311416421329, - "grad_norm": 0.766831636428833, - "learning_rate": 9.789270825778923e-05, - "loss": 0.0896, + "epoch": 6.504546941445862, + "grad_norm": 0.5561368465423584, + "learning_rate": 1.7158692438884284e-05, + "loss": 0.04, "step": 24860 }, { - "epoch": 1.6270853778213934, - "grad_norm": 0.8945890665054321, - "learning_rate": 9.78900687419587e-05, - "loss": 0.0953, + "epoch": 6.507163886162905, + "grad_norm": 0.41783007979393005, + "learning_rate": 1.7139069717515042e-05, + "loss": 0.0353, "step": 24870 }, { - "epoch": 1.6277396140006544, - "grad_norm": 0.9509278535842896, - "learning_rate": 9.788742760971274e-05, - "loss": 0.0944, + "epoch": 6.509780830879947, + "grad_norm": 0.6484546661376953, + "learning_rate": 1.711945236901752e-05, + "loss": 0.0332, "step": 24880 }, { - "epoch": 1.628393850179915, - "grad_norm": 0.752797544002533, - "learning_rate": 9.788478486114052e-05, - "loss": 0.0847, + "epoch": 6.51239777559699, + "grad_norm": 0.5079058408737183, + "learning_rate": 1.70998404068e-05, + "loss": 0.0353, "step": 24890 }, { - "epoch": 1.6290480863591756, - "grad_norm": 0.8730521202087402, - "learning_rate": 9.788214049633123e-05, - "loss": 0.0901, + "epoch": 6.515014720314033, + "grad_norm": 0.392955482006073, + "learning_rate": 1.7080233844267066e-05, + "loss": 0.0303, "step": 24900 }, { - "epoch": 1.6297023225384364, - "grad_norm": 0.9180911183357239, - "learning_rate": 9.78794945153741e-05, - "loss": 0.0958, + "epoch": 6.517631665031076, + "grad_norm": 0.5023982524871826, + "learning_rate": 1.7060632694819624e-05, + "loss": 0.0392, "step": 24910 }, { - "epoch": 1.6303565587176971, - "grad_norm": 0.9941650629043579, - "learning_rate": 9.787684691835849e-05, - "loss": 0.0961, + "epoch": 6.520248609748119, + "grad_norm": 0.27257394790649414, + "learning_rate": 1.7041036971854863e-05, + "loss": 0.0336, "step": 24920 }, { - "epoch": 1.6310107948969579, - "grad_norm": 0.9088178873062134, - "learning_rate": 9.787419770537371e-05, - "loss": 0.1026, + "epoch": 6.522865554465162, + "grad_norm": 0.3066208064556122, + "learning_rate": 1.702144668876629e-05, + "loss": 0.0377, "step": 24930 }, { - "epoch": 1.6316650310762184, - "grad_norm": 0.8117355108261108, - "learning_rate": 9.787154687650923e-05, - "loss": 0.0913, + "epoch": 6.525482499182205, + "grad_norm": 0.45781561732292175, + "learning_rate": 1.700186185894368e-05, + "loss": 0.0349, "step": 24940 }, { - "epoch": 1.6323192672554794, - "grad_norm": 0.7052477598190308, - "learning_rate": 9.786889443185449e-05, - "loss": 0.0911, + "epoch": 6.528099443899247, + "grad_norm": 0.3222908079624176, + "learning_rate": 1.6982282495773062e-05, + "loss": 0.0337, "step": 24950 }, { - "epoch": 1.63297350343474, - "grad_norm": 0.8779917359352112, - "learning_rate": 9.7866240371499e-05, - "loss": 0.0997, + "epoch": 6.53071638861629, + "grad_norm": 0.3585456609725952, + "learning_rate": 1.6962708612636753e-05, + "loss": 0.0342, "step": 24960 }, { - "epoch": 1.6336277396140007, - "grad_norm": 0.7817448377609253, - "learning_rate": 9.786358469553238e-05, - "loss": 0.0967, + "epoch": 6.533333333333333, + "grad_norm": 0.292620450258255, + "learning_rate": 1.6943140222913322e-05, + "loss": 0.0353, "step": 24970 }, { - "epoch": 1.6342819757932614, - "grad_norm": 0.8375545144081116, - "learning_rate": 9.786092740404424e-05, - "loss": 0.1043, + "epoch": 6.535950278050376, + "grad_norm": 0.40674445033073425, + "learning_rate": 1.6923577339977577e-05, + "loss": 0.0335, "step": 24980 }, { - "epoch": 1.634936211972522, - "grad_norm": 1.144739031791687, - "learning_rate": 9.78582684971243e-05, - "loss": 0.1022, + "epoch": 6.538567222767419, + "grad_norm": 0.32033807039260864, + "learning_rate": 1.6904019977200564e-05, + "loss": 0.034, "step": 24990 }, { - "epoch": 1.635590448151783, - "grad_norm": 1.2085695266723633, - "learning_rate": 9.785560797486227e-05, - "loss": 0.101, + "epoch": 6.541184167484462, + "grad_norm": 0.29049545526504517, + "learning_rate": 1.6884468147949557e-05, + "loss": 0.0306, + "step": 25000 + }, + { + "epoch": 6.541184167484462, + "eval_loss": 0.03778619730581603, + "eval_runtime": 9.3515, + "eval_samples_per_second": 109.501, + "eval_steps_per_second": 1.711, "step": 25000 }, { - "epoch": 1.6362446843310434, - "grad_norm": 0.6930639147758484, - "learning_rate": 9.785294583734796e-05, - "loss": 0.0947, + "epoch": 6.543801112201505, + "grad_norm": 0.37644854187965393, + "learning_rate": 1.6864921865588047e-05, + "loss": 0.0351, "step": 25010 }, { - "epoch": 1.6368989205103042, - "grad_norm": 0.921447217464447, - "learning_rate": 9.785028208467123e-05, - "loss": 0.0978, + "epoch": 6.546418056918547, + "grad_norm": 0.3338150978088379, + "learning_rate": 1.684538114347573e-05, + "loss": 0.03, "step": 25020 }, { - "epoch": 1.637553156689565, - "grad_norm": 1.0130419731140137, - "learning_rate": 9.784761671692202e-05, - "loss": 0.0903, + "epoch": 6.54903500163559, + "grad_norm": 0.34916117787361145, + "learning_rate": 1.6825845994968516e-05, + "loss": 0.0352, "step": 25030 }, { - "epoch": 1.6382073928688257, - "grad_norm": 0.7007921934127808, - "learning_rate": 9.784494973419022e-05, - "loss": 0.0962, + "epoch": 6.551651946352633, + "grad_norm": 0.29273179173469543, + "learning_rate": 1.6806316433418484e-05, + "loss": 0.0348, "step": 25040 }, { - "epoch": 1.6388616290480864, - "grad_norm": 0.7728266716003418, - "learning_rate": 9.784228113656591e-05, - "loss": 0.0994, + "epoch": 6.554268891069676, + "grad_norm": 0.3352114260196686, + "learning_rate": 1.67867924721739e-05, + "loss": 0.0343, "step": 25050 }, { - "epoch": 1.639515865227347, - "grad_norm": 0.8407819867134094, - "learning_rate": 9.783961092413914e-05, - "loss": 0.089, + "epoch": 6.556885835786719, + "grad_norm": 0.461609423160553, + "learning_rate": 1.6767274124579217e-05, + "loss": 0.0334, "step": 25060 }, { - "epoch": 1.640170101406608, - "grad_norm": 0.9866244792938232, - "learning_rate": 9.783693909700002e-05, - "loss": 0.0996, + "epoch": 6.559502780503762, + "grad_norm": 0.5588709712028503, + "learning_rate": 1.6747761403975043e-05, + "loss": 0.0318, "step": 25070 }, { - "epoch": 1.6408243375858684, - "grad_norm": 0.7183135151863098, - "learning_rate": 9.783426565523877e-05, - "loss": 0.0873, + "epoch": 6.562119725220804, + "grad_norm": 0.35401543974876404, + "learning_rate": 1.6728254323698135e-05, + "loss": 0.035, "step": 25080 }, { - "epoch": 1.6414785737651292, - "grad_norm": 0.886570394039154, - "learning_rate": 9.78315905989456e-05, - "loss": 0.1016, + "epoch": 6.564736669937847, + "grad_norm": 0.3450101315975189, + "learning_rate": 1.6708752897081397e-05, + "loss": 0.0342, "step": 25090 }, { - "epoch": 1.64213280994439, - "grad_norm": 0.887607753276825, - "learning_rate": 9.782891392821078e-05, - "loss": 0.0904, + "epoch": 6.56735361465489, + "grad_norm": 0.22737601399421692, + "learning_rate": 1.6689257137453873e-05, + "loss": 0.0298, "step": 25100 }, { - "epoch": 1.6427870461236507, - "grad_norm": 0.9125217795372009, - "learning_rate": 9.78262356431247e-05, - "loss": 0.0913, + "epoch": 6.569970559371933, + "grad_norm": 0.34681960940361023, + "learning_rate": 1.6669767058140735e-05, + "loss": 0.0382, "step": 25110 }, { - "epoch": 1.6434412823029114, - "grad_norm": 0.9861580729484558, - "learning_rate": 9.782355574377775e-05, - "loss": 0.0996, + "epoch": 6.572587504088976, + "grad_norm": 0.308298796415329, + "learning_rate": 1.6650282672463265e-05, + "loss": 0.0318, "step": 25120 }, { - "epoch": 1.644095518482172, - "grad_norm": 1.0295411348342896, - "learning_rate": 9.782087423026036e-05, - "loss": 0.0995, + "epoch": 6.575204448806019, + "grad_norm": 0.34104683995246887, + "learning_rate": 1.663080399373885e-05, + "loss": 0.032, "step": 25130 }, { - "epoch": 1.644749754661433, - "grad_norm": 0.7599952816963196, - "learning_rate": 9.781819110266304e-05, - "loss": 0.0966, + "epoch": 6.577821393523061, + "grad_norm": 0.4612085819244385, + "learning_rate": 1.6611331035280985e-05, + "loss": 0.0368, "step": 25140 }, { - "epoch": 1.6454039908406934, - "grad_norm": 0.7582678198814392, - "learning_rate": 9.781550636107637e-05, - "loss": 0.0931, + "epoch": 6.580438338240104, + "grad_norm": 0.4950728416442871, + "learning_rate": 1.659186381039926e-05, + "loss": 0.0372, "step": 25150 }, { - "epoch": 1.6460582270199542, - "grad_norm": 0.9556160569190979, - "learning_rate": 9.781282000559095e-05, - "loss": 0.1033, + "epoch": 6.583055282957147, + "grad_norm": 0.3914588391780853, + "learning_rate": 1.6572402332399333e-05, + "loss": 0.0295, "step": 25160 }, { - "epoch": 1.646712463199215, - "grad_norm": 0.8655904531478882, - "learning_rate": 9.781013203629748e-05, - "loss": 0.1, + "epoch": 6.58567222767419, + "grad_norm": 0.33319687843322754, + "learning_rate": 1.6552946614582947e-05, + "loss": 0.0322, "step": 25170 }, { - "epoch": 1.6473666993784757, - "grad_norm": 0.7407214045524597, - "learning_rate": 9.780744245328666e-05, - "loss": 0.0947, + "epoch": 6.588289172391233, + "grad_norm": 0.29961922764778137, + "learning_rate": 1.65334966702479e-05, + "loss": 0.0314, "step": 25180 }, { - "epoch": 1.6480209355577364, - "grad_norm": 0.6862826943397522, - "learning_rate": 9.780475125664927e-05, - "loss": 0.0812, + "epoch": 6.590906117108276, + "grad_norm": 0.27232062816619873, + "learning_rate": 1.6514052512688044e-05, + "loss": 0.0379, "step": 25190 }, { - "epoch": 1.648675171736997, - "grad_norm": 0.862034022808075, - "learning_rate": 9.780205844647616e-05, - "loss": 0.0928, + "epoch": 6.593523061825319, + "grad_norm": 0.3368346393108368, + "learning_rate": 1.6494614155193276e-05, + "loss": 0.0336, "step": 25200 }, { - "epoch": 1.649329407916258, - "grad_norm": 0.8481500744819641, - "learning_rate": 9.77993640228582e-05, - "loss": 0.0957, + "epoch": 6.596140006542361, + "grad_norm": 0.44280457496643066, + "learning_rate": 1.6475181611049537e-05, + "loss": 0.0304, "step": 25210 }, { - "epoch": 1.6499836440955185, - "grad_norm": 1.0111854076385498, - "learning_rate": 9.779666798588637e-05, - "loss": 0.0879, + "epoch": 6.598756951259404, + "grad_norm": 0.35077938437461853, + "learning_rate": 1.645575489353879e-05, + "loss": 0.0332, "step": 25220 }, { - "epoch": 1.6506378802747792, - "grad_norm": 0.6928382515907288, - "learning_rate": 9.779397033565164e-05, - "loss": 0.0917, + "epoch": 6.601373895976447, + "grad_norm": 0.48652276396751404, + "learning_rate": 1.643633401593899e-05, + "loss": 0.0347, "step": 25230 }, { - "epoch": 1.65129211645404, - "grad_norm": 0.8151880502700806, - "learning_rate": 9.779127107224505e-05, - "loss": 0.1028, + "epoch": 6.60399084069349, + "grad_norm": 0.24766449630260468, + "learning_rate": 1.6416918991524145e-05, + "loss": 0.0345, "step": 25240 }, { - "epoch": 1.6519463526333005, - "grad_norm": 0.8269695043563843, - "learning_rate": 9.778857019575774e-05, - "loss": 0.1064, + "epoch": 6.606607785410533, + "grad_norm": 0.285058856010437, + "learning_rate": 1.6397509833564234e-05, + "loss": 0.0341, "step": 25250 }, { - "epoch": 1.6526005888125614, - "grad_norm": 0.8382137417793274, - "learning_rate": 9.778586770628084e-05, - "loss": 0.0939, + "epoch": 6.609224730127576, + "grad_norm": 0.2656119465827942, + "learning_rate": 1.6378106555325234e-05, + "loss": 0.0331, "step": 25260 }, { - "epoch": 1.653254824991822, - "grad_norm": 0.8290400505065918, - "learning_rate": 9.778316360390558e-05, - "loss": 0.0899, + "epoch": 6.611841674844619, + "grad_norm": 0.36539924144744873, + "learning_rate": 1.635870917006911e-05, + "loss": 0.0347, "step": 25270 }, { - "epoch": 1.6539090611710827, - "grad_norm": 0.8005163073539734, - "learning_rate": 9.778045788872324e-05, - "loss": 0.0892, + "epoch": 6.6144586195616615, + "grad_norm": 0.35211730003356934, + "learning_rate": 1.633931769105378e-05, + "loss": 0.0284, "step": 25280 }, { - "epoch": 1.6545632973503435, - "grad_norm": 0.9442570209503174, - "learning_rate": 9.777775056082514e-05, - "loss": 0.1007, + "epoch": 6.6170755642787045, + "grad_norm": 0.3001176416873932, + "learning_rate": 1.6319932131533148e-05, + "loss": 0.0326, "step": 25290 }, { - "epoch": 1.6552175335296042, - "grad_norm": 0.7411683797836304, - "learning_rate": 9.777504162030267e-05, - "loss": 0.1008, + "epoch": 6.6196925089957475, + "grad_norm": 0.32967913150787354, + "learning_rate": 1.630055250475705e-05, + "loss": 0.0314, "step": 25300 }, { - "epoch": 1.655871769708865, - "grad_norm": 0.8269376158714294, - "learning_rate": 9.777233106724722e-05, - "loss": 0.0943, + "epoch": 6.6223094537127905, + "grad_norm": 0.26268577575683594, + "learning_rate": 1.6281178823971295e-05, + "loss": 0.0298, "step": 25310 }, { - "epoch": 1.6565260058881255, - "grad_norm": 0.6999377608299255, - "learning_rate": 9.776961890175034e-05, - "loss": 0.106, + "epoch": 6.624926398429833, + "grad_norm": 0.5058441162109375, + "learning_rate": 1.6261811102417597e-05, + "loss": 0.0342, "step": 25320 }, { - "epoch": 1.6571802420673865, - "grad_norm": 0.8918529152870178, - "learning_rate": 9.776690512390352e-05, - "loss": 0.0957, + "epoch": 6.6275433431468755, + "grad_norm": 0.5294371843338013, + "learning_rate": 1.6242449353333607e-05, + "loss": 0.0332, "step": 25330 }, { - "epoch": 1.657834478246647, - "grad_norm": 0.7899738550186157, - "learning_rate": 9.776418973379838e-05, - "loss": 0.0943, + "epoch": 6.6301602878639185, + "grad_norm": 0.411773681640625, + "learning_rate": 1.6223093589952903e-05, + "loss": 0.0344, "step": 25340 }, { - "epoch": 1.6584887144259077, - "grad_norm": 0.7438578009605408, - "learning_rate": 9.776147273152659e-05, - "loss": 0.0977, + "epoch": 6.6327772325809615, + "grad_norm": 0.44051438570022583, + "learning_rate": 1.6203743825504974e-05, + "loss": 0.0341, "step": 25350 }, { - "epoch": 1.6591429506051685, - "grad_norm": 0.7523559331893921, - "learning_rate": 9.77587541171798e-05, - "loss": 0.0979, + "epoch": 6.6353941772980045, + "grad_norm": 0.22843913733959198, + "learning_rate": 1.6184400073215194e-05, + "loss": 0.0324, "step": 25360 }, { - "epoch": 1.6597971867844292, - "grad_norm": 0.8759346008300781, - "learning_rate": 9.775603389084985e-05, - "loss": 0.1003, + "epoch": 6.6380111220150475, + "grad_norm": 0.25327709317207336, + "learning_rate": 1.6165062346304845e-05, + "loss": 0.0287, "step": 25370 }, { - "epoch": 1.66045142296369, - "grad_norm": 0.7944019436836243, - "learning_rate": 9.775331205262847e-05, - "loss": 0.0903, + "epoch": 6.6406280667320905, + "grad_norm": 0.38789933919906616, + "learning_rate": 1.6145730657991066e-05, + "loss": 0.0363, "step": 25380 }, { - "epoch": 1.6611056591429505, - "grad_norm": 0.809095561504364, - "learning_rate": 9.775058860260759e-05, - "loss": 0.1002, + "epoch": 6.6432450114491335, + "grad_norm": 0.28665658831596375, + "learning_rate": 1.6126405021486897e-05, + "loss": 0.0292, "step": 25390 }, { - "epoch": 1.6617598953222115, - "grad_norm": 0.8237670660018921, - "learning_rate": 9.774786354087913e-05, - "loss": 0.0944, + "epoch": 6.645861956166176, + "grad_norm": 0.3016824424266815, + "learning_rate": 1.6107085450001228e-05, + "loss": 0.031, "step": 25400 }, { - "epoch": 1.662414131501472, - "grad_norm": 0.8633749485015869, - "learning_rate": 9.774513686753504e-05, - "loss": 0.1029, + "epoch": 6.648478900883219, + "grad_norm": 0.3922291696071625, + "learning_rate": 1.608777195673879e-05, + "loss": 0.0341, "step": 25410 }, { - "epoch": 1.6630683676807327, - "grad_norm": 0.7542330622673035, - "learning_rate": 9.774240858266735e-05, - "loss": 0.0886, + "epoch": 6.651095845600262, + "grad_norm": 0.29873526096343994, + "learning_rate": 1.6068464554900186e-05, + "loss": 0.0357, "step": 25420 }, { - "epoch": 1.6637226038599935, - "grad_norm": 0.8325693607330322, - "learning_rate": 9.773967868636818e-05, - "loss": 0.0929, + "epoch": 6.653712790317305, + "grad_norm": 0.3894430696964264, + "learning_rate": 1.6049163257681833e-05, + "loss": 0.0363, "step": 25430 }, { - "epoch": 1.664376840039254, - "grad_norm": 0.9114111065864563, - "learning_rate": 9.773694717872963e-05, - "loss": 0.0922, + "epoch": 6.6563297350343476, + "grad_norm": 0.20600323379039764, + "learning_rate": 1.6029868078275995e-05, + "loss": 0.0265, "step": 25440 }, { - "epoch": 1.665031076218515, - "grad_norm": 0.732631266117096, - "learning_rate": 9.773421405984394e-05, - "loss": 0.0999, + "epoch": 6.6589466797513905, + "grad_norm": 0.3169163763523102, + "learning_rate": 1.6010579029870744e-05, + "loss": 0.0356, "step": 25450 }, { - "epoch": 1.6656853123977755, - "grad_norm": 0.7207270860671997, - "learning_rate": 9.773147932980334e-05, - "loss": 0.0991, + "epoch": 6.6615636244684335, + "grad_norm": 0.3466549217700958, + "learning_rate": 1.5991296125649958e-05, + "loss": 0.032, "step": 25460 }, { - "epoch": 1.6663395485770363, - "grad_norm": 0.7336596250534058, - "learning_rate": 9.772874298870012e-05, - "loss": 0.096, + "epoch": 6.664180569185476, + "grad_norm": 0.32664406299591064, + "learning_rate": 1.5972019378793328e-05, + "loss": 0.0338, "step": 25470 }, { - "epoch": 1.666993784756297, - "grad_norm": 0.9117507338523865, - "learning_rate": 9.772600503662665e-05, - "loss": 0.0961, + "epoch": 6.666797513902519, + "grad_norm": 0.47851356863975525, + "learning_rate": 1.5952748802476316e-05, + "loss": 0.0343, "step": 25480 }, { - "epoch": 1.6676480209355578, - "grad_norm": 0.7073248028755188, - "learning_rate": 9.772326547367534e-05, - "loss": 0.0996, + "epoch": 6.669414458619562, + "grad_norm": 0.39265337586402893, + "learning_rate": 1.5933484409870196e-05, + "loss": 0.0363, "step": 25490 }, { - "epoch": 1.6683022571148185, - "grad_norm": 1.0052919387817383, - "learning_rate": 9.772052429993868e-05, - "loss": 0.1011, + "epoch": 6.672031403336605, + "grad_norm": 0.5032387971878052, + "learning_rate": 1.5914226214141993e-05, + "loss": 0.0365, "step": 25500 }, { - "epoch": 1.668956493294079, - "grad_norm": 0.8882834911346436, - "learning_rate": 9.771778151550917e-05, - "loss": 0.0876, + "epoch": 6.674648348053648, + "grad_norm": 0.24106982350349426, + "learning_rate": 1.589497422845449e-05, + "loss": 0.0306, "step": 25510 }, { - "epoch": 1.66961072947334, - "grad_norm": 0.8214176893234253, - "learning_rate": 9.771503712047937e-05, - "loss": 0.0875, + "epoch": 6.677265292770691, + "grad_norm": 0.5653230547904968, + "learning_rate": 1.587572846596625e-05, + "loss": 0.0372, "step": 25520 }, { - "epoch": 1.6702649656526005, - "grad_norm": 0.9225919842720032, - "learning_rate": 9.771229111494194e-05, - "loss": 0.0981, + "epoch": 6.679882237487733, + "grad_norm": 0.5159065127372742, + "learning_rate": 1.585648893983156e-05, + "loss": 0.0324, "step": 25530 }, { - "epoch": 1.6709192018318613, - "grad_norm": 0.8596377968788147, - "learning_rate": 9.770954349898956e-05, - "loss": 0.0932, + "epoch": 6.682499182204776, + "grad_norm": 0.36855408549308777, + "learning_rate": 1.5837255663200464e-05, + "loss": 0.0319, "step": 25540 }, { - "epoch": 1.671573438011122, - "grad_norm": 0.7050160765647888, - "learning_rate": 9.770679427271496e-05, - "loss": 0.1051, + "epoch": 6.685116126921819, + "grad_norm": 0.39614546298980713, + "learning_rate": 1.581802864921873e-05, + "loss": 0.0349, "step": 25550 }, { - "epoch": 1.6722276741903828, - "grad_norm": 1.0160956382751465, - "learning_rate": 9.770404343621094e-05, - "loss": 0.1, + "epoch": 6.687733071638862, + "grad_norm": 0.48540249466896057, + "learning_rate": 1.5798807911027826e-05, + "loss": 0.036, "step": 25560 }, { - "epoch": 1.6728819103696435, - "grad_norm": 0.7766900062561035, - "learning_rate": 9.770129098957035e-05, - "loss": 0.0909, + "epoch": 6.690350016355905, + "grad_norm": 0.3857311010360718, + "learning_rate": 1.577959346176496e-05, + "loss": 0.0347, "step": 25570 }, { - "epoch": 1.673536146548904, - "grad_norm": 0.9387828707695007, - "learning_rate": 9.769853693288608e-05, - "loss": 0.1045, + "epoch": 6.692966961072948, + "grad_norm": 0.3490583598613739, + "learning_rate": 1.576038531456301e-05, + "loss": 0.0298, "step": 25580 }, { - "epoch": 1.674190382728165, - "grad_norm": 1.0153820514678955, - "learning_rate": 9.76957812662511e-05, - "loss": 0.0992, + "epoch": 6.69558390578999, + "grad_norm": 0.6179901957511902, + "learning_rate": 1.5741183482550585e-05, + "loss": 0.0364, "step": 25590 }, { - "epoch": 1.6748446189074255, - "grad_norm": 0.7925774455070496, - "learning_rate": 9.769302398975841e-05, - "loss": 0.0978, + "epoch": 6.698200850507033, + "grad_norm": 0.35921305418014526, + "learning_rate": 1.572198797885194e-05, + "loss": 0.0344, "step": 25600 }, { - "epoch": 1.6754988550866863, - "grad_norm": 1.1735292673110962, - "learning_rate": 9.769026510350108e-05, - "loss": 0.1079, + "epoch": 6.700817795224076, + "grad_norm": 0.3145570158958435, + "learning_rate": 1.5702798816587018e-05, + "loss": 0.032, "step": 25610 }, { - "epoch": 1.676153091265947, - "grad_norm": 0.7813097834587097, - "learning_rate": 9.768750460757223e-05, - "loss": 0.0903, + "epoch": 6.703434739941119, + "grad_norm": 0.2709348201751709, + "learning_rate": 1.5683616008871444e-05, + "loss": 0.0311, "step": 25620 }, { - "epoch": 1.6768073274452078, - "grad_norm": 0.8915982842445374, - "learning_rate": 9.768474250206504e-05, - "loss": 0.0922, + "epoch": 6.706051684658162, + "grad_norm": 0.2840203046798706, + "learning_rate": 1.5664439568816474e-05, + "loss": 0.0297, "step": 25630 }, { - "epoch": 1.6774615636244685, - "grad_norm": 0.8092440366744995, - "learning_rate": 9.768197878707273e-05, - "loss": 0.0983, + "epoch": 6.708668629375205, + "grad_norm": 0.28185781836509705, + "learning_rate": 1.564526950952903e-05, + "loss": 0.0313, "step": 25640 }, { - "epoch": 1.678115799803729, - "grad_norm": 0.9614975452423096, - "learning_rate": 9.767921346268858e-05, - "loss": 0.103, + "epoch": 6.711285574092248, + "grad_norm": 0.2776778042316437, + "learning_rate": 1.5626105844111676e-05, + "loss": 0.0297, "step": 25650 }, { - "epoch": 1.67877003598299, - "grad_norm": 0.6998571753501892, - "learning_rate": 9.767644652900594e-05, - "loss": 0.0918, + "epoch": 6.71390251880929, + "grad_norm": 0.2556576132774353, + "learning_rate": 1.5606948585662577e-05, + "loss": 0.0309, "step": 25660 }, { - "epoch": 1.6794242721622505, - "grad_norm": 0.7789784073829651, - "learning_rate": 9.76736779861182e-05, - "loss": 0.0903, + "epoch": 6.716519463526333, + "grad_norm": 0.26259034872055054, + "learning_rate": 1.5587797747275558e-05, + "loss": 0.0306, "step": 25670 }, { - "epoch": 1.6800785083415113, - "grad_norm": 0.7573550939559937, - "learning_rate": 9.767090783411878e-05, - "loss": 0.0869, + "epoch": 6.719136408243376, + "grad_norm": 0.25451749563217163, + "learning_rate": 1.5568653342040022e-05, + "loss": 0.0305, "step": 25680 }, { - "epoch": 1.680732744520772, - "grad_norm": 0.8103246688842773, - "learning_rate": 9.766813607310122e-05, - "loss": 0.0959, + "epoch": 6.721753352960419, + "grad_norm": 0.3301449716091156, + "learning_rate": 1.5549515383040993e-05, + "loss": 0.037, "step": 25690 }, { - "epoch": 1.6813869807000326, - "grad_norm": 0.7668115496635437, - "learning_rate": 9.766536270315903e-05, - "loss": 0.1045, + "epoch": 6.724370297677462, + "grad_norm": 0.31745246052742004, + "learning_rate": 1.553038388335909e-05, + "loss": 0.0326, "step": 25700 }, { - "epoch": 1.6820412168792935, - "grad_norm": 0.7344299554824829, - "learning_rate": 9.766258772438586e-05, - "loss": 0.098, + "epoch": 6.726987242394505, + "grad_norm": 0.30288368463516235, + "learning_rate": 1.5511258856070504e-05, + "loss": 0.0323, "step": 25710 }, { - "epoch": 1.682695453058554, - "grad_norm": 0.9814144968986511, - "learning_rate": 9.765981113687534e-05, - "loss": 0.1032, + "epoch": 6.729604187111548, + "grad_norm": 0.35990169644355774, + "learning_rate": 1.549214031424702e-05, + "loss": 0.0345, "step": 25720 }, { - "epoch": 1.6833496892378148, - "grad_norm": 0.9255467653274536, - "learning_rate": 9.765703294072121e-05, - "loss": 0.0991, + "epoch": 6.73222113182859, + "grad_norm": 0.24277259409427643, + "learning_rate": 1.5473028270955976e-05, + "loss": 0.0309, "step": 25730 }, { - "epoch": 1.6840039254170756, - "grad_norm": 0.8468577861785889, - "learning_rate": 9.765425313601724e-05, - "loss": 0.0978, + "epoch": 6.734838076545633, + "grad_norm": 0.3610764443874359, + "learning_rate": 1.5453922739260275e-05, + "loss": 0.0301, "step": 25740 }, { - "epoch": 1.6846581615963363, - "grad_norm": 0.8852717876434326, - "learning_rate": 9.765147172285725e-05, - "loss": 0.1103, + "epoch": 6.737455021262676, + "grad_norm": 0.4325360357761383, + "learning_rate": 1.5434823732218373e-05, + "loss": 0.0341, "step": 25750 }, { - "epoch": 1.685312397775597, - "grad_norm": 0.6971980333328247, - "learning_rate": 9.764868870133511e-05, - "loss": 0.0939, + "epoch": 6.740071965979719, + "grad_norm": 0.23844900727272034, + "learning_rate": 1.5415731262884248e-05, + "loss": 0.0266, "step": 25760 }, { - "epoch": 1.6859666339548576, - "grad_norm": 0.9403474926948547, - "learning_rate": 9.764590407154476e-05, - "loss": 0.1042, + "epoch": 6.742688910696762, + "grad_norm": 0.3218114376068115, + "learning_rate": 1.539664534430744e-05, + "loss": 0.0324, "step": 25770 }, { - "epoch": 1.6866208701341185, - "grad_norm": 0.8653038740158081, - "learning_rate": 9.76431178335802e-05, - "loss": 0.097, + "epoch": 6.745305855413804, + "grad_norm": 0.31861692667007446, + "learning_rate": 1.5377565989532984e-05, + "loss": 0.0318, "step": 25780 }, { - "epoch": 1.687275106313379, - "grad_norm": 0.8561285138130188, - "learning_rate": 9.764032998753547e-05, - "loss": 0.0911, + "epoch": 6.747922800130847, + "grad_norm": 0.4393862783908844, + "learning_rate": 1.535849321160143e-05, + "loss": 0.0339, "step": 25790 }, { - "epoch": 1.6879293424926398, - "grad_norm": 0.7791575789451599, - "learning_rate": 9.763754053350465e-05, - "loss": 0.0927, + "epoch": 6.75053974484789, + "grad_norm": 0.3930702209472656, + "learning_rate": 1.533942702354886e-05, + "loss": 0.038, "step": 25800 }, { - "epoch": 1.6885835786719006, - "grad_norm": 0.7109609842300415, - "learning_rate": 9.76347494715819e-05, - "loss": 0.0879, + "epoch": 6.753156689564933, + "grad_norm": 0.2467467188835144, + "learning_rate": 1.5320367438406818e-05, + "loss": 0.0318, "step": 25810 }, { - "epoch": 1.6892378148511613, - "grad_norm": 0.7509888410568237, - "learning_rate": 9.763195680186143e-05, - "loss": 0.0846, + "epoch": 6.755773634281976, + "grad_norm": 0.3010796308517456, + "learning_rate": 1.5301314469202365e-05, + "loss": 0.0276, "step": 25820 }, { - "epoch": 1.689892051030422, - "grad_norm": 0.727888286113739, - "learning_rate": 9.762916252443751e-05, - "loss": 0.0913, + "epoch": 6.758390578999019, + "grad_norm": 0.29643282294273376, + "learning_rate": 1.5282268128958015e-05, + "loss": 0.0315, "step": 25830 }, { - "epoch": 1.6905462872096826, - "grad_norm": 0.8703082203865051, - "learning_rate": 9.762636663940443e-05, - "loss": 0.0902, + "epoch": 6.761007523716062, + "grad_norm": 0.2423584759235382, + "learning_rate": 1.5263228430691764e-05, + "loss": 0.0318, "step": 25840 }, { - "epoch": 1.6912005233889436, - "grad_norm": 0.8051765561103821, - "learning_rate": 9.762356914685658e-05, - "loss": 0.093, + "epoch": 6.763624468433104, + "grad_norm": 0.287006139755249, + "learning_rate": 1.5244195387417076e-05, + "loss": 0.0333, "step": 25850 }, { - "epoch": 1.691854759568204, - "grad_norm": 0.8503502607345581, - "learning_rate": 9.762077004688836e-05, - "loss": 0.0913, + "epoch": 6.766241413150147, + "grad_norm": 0.21658548712730408, + "learning_rate": 1.5225169012142842e-05, + "loss": 0.0295, "step": 25860 }, { - "epoch": 1.6925089957474648, - "grad_norm": 0.7812398672103882, - "learning_rate": 9.761796933959428e-05, - "loss": 0.0876, + "epoch": 6.76885835786719, + "grad_norm": 0.24003279209136963, + "learning_rate": 1.5206149317873427e-05, + "loss": 0.0308, "step": 25870 }, { - "epoch": 1.6931632319267256, - "grad_norm": 0.8080005049705505, - "learning_rate": 9.761516702506886e-05, - "loss": 0.0872, + "epoch": 6.771475302584233, + "grad_norm": 0.310829758644104, + "learning_rate": 1.51871363176086e-05, + "loss": 0.0327, "step": 25880 }, { - "epoch": 1.693817468105986, - "grad_norm": 0.7798691391944885, - "learning_rate": 9.761236310340665e-05, - "loss": 0.0895, + "epoch": 6.774092247301276, + "grad_norm": 0.5693554282188416, + "learning_rate": 1.5168130024343563e-05, + "loss": 0.0346, "step": 25890 }, { - "epoch": 1.694471704285247, - "grad_norm": 0.7314333915710449, - "learning_rate": 9.760955757470233e-05, - "loss": 0.0925, + "epoch": 6.776709192018319, + "grad_norm": 0.24450276792049408, + "learning_rate": 1.5149130451068948e-05, + "loss": 0.0308, "step": 25900 }, { - "epoch": 1.6951259404645076, - "grad_norm": 0.8456811904907227, - "learning_rate": 9.760675043905058e-05, - "loss": 0.1005, + "epoch": 6.779326136735362, + "grad_norm": 0.2665019929409027, + "learning_rate": 1.5130137610770783e-05, + "loss": 0.0335, "step": 25910 }, { - "epoch": 1.6957801766437686, - "grad_norm": 0.6989087462425232, - "learning_rate": 9.760394169654615e-05, - "loss": 0.106, + "epoch": 6.781943081452404, + "grad_norm": 0.3969002068042755, + "learning_rate": 1.5111151516430494e-05, + "loss": 0.0292, "step": 25920 }, { - "epoch": 1.696434412823029, - "grad_norm": 1.0372560024261475, - "learning_rate": 9.760113134728384e-05, - "loss": 0.0865, + "epoch": 6.784560026169447, + "grad_norm": 0.34969353675842285, + "learning_rate": 1.5092172181024894e-05, + "loss": 0.0306, "step": 25930 }, { - "epoch": 1.6970886490022898, - "grad_norm": 1.2391959428787231, - "learning_rate": 9.75983193913585e-05, - "loss": 0.1008, + "epoch": 6.78717697088649, + "grad_norm": 0.49384376406669617, + "learning_rate": 1.5073199617526184e-05, + "loss": 0.0314, "step": 25940 }, { - "epoch": 1.6977428851815506, - "grad_norm": 0.8071995973587036, - "learning_rate": 9.759550582886506e-05, - "loss": 0.094, + "epoch": 6.789793915603533, + "grad_norm": 0.36186710000038147, + "learning_rate": 1.5054233838901932e-05, + "loss": 0.0291, "step": 25950 }, { - "epoch": 1.6983971213608111, - "grad_norm": 0.8598625063896179, - "learning_rate": 9.759269065989848e-05, - "loss": 0.0938, + "epoch": 6.792410860320576, + "grad_norm": 0.3704721927642822, + "learning_rate": 1.5035274858115078e-05, + "loss": 0.029, "step": 25960 }, { - "epoch": 1.699051357540072, - "grad_norm": 0.853850245475769, - "learning_rate": 9.758987388455377e-05, - "loss": 0.087, + "epoch": 6.795027805037619, + "grad_norm": 0.5105478763580322, + "learning_rate": 1.5016322688123885e-05, + "loss": 0.0312, "step": 25970 }, { - "epoch": 1.6997055937193326, - "grad_norm": 0.8352051973342896, - "learning_rate": 9.7587055502926e-05, - "loss": 0.0959, + "epoch": 6.797644749754662, + "grad_norm": 0.3988720178604126, + "learning_rate": 1.4997377341882e-05, + "loss": 0.0338, "step": 25980 }, { - "epoch": 1.7003598298985934, - "grad_norm": 0.7932831048965454, - "learning_rate": 9.758423551511031e-05, - "loss": 0.0852, + "epoch": 6.800261694471704, + "grad_norm": 0.3886851668357849, + "learning_rate": 1.4978438832338377e-05, + "loss": 0.0365, "step": 25990 }, { - "epoch": 1.701014066077854, - "grad_norm": 0.8113059997558594, - "learning_rate": 9.758141392120188e-05, - "loss": 0.099, + "epoch": 6.802878639188747, + "grad_norm": 0.3836858868598938, + "learning_rate": 1.4959507172437318e-05, + "loss": 0.0285, + "step": 26000 + }, + { + "epoch": 6.802878639188747, + "eval_loss": 0.037352538310355815, + "eval_runtime": 9.4742, + "eval_samples_per_second": 108.083, + "eval_steps_per_second": 1.689, "step": 26000 }, { - "epoch": 1.7016683022571149, - "grad_norm": 0.9581121802330017, - "learning_rate": 9.757859072129594e-05, - "loss": 0.0961, + "epoch": 6.80549558390579, + "grad_norm": 0.33088138699531555, + "learning_rate": 1.4940582375118429e-05, + "loss": 0.0278, "step": 26010 }, { - "epoch": 1.7023225384363756, - "grad_norm": 0.897538423538208, - "learning_rate": 9.757576591548778e-05, - "loss": 0.0892, + "epoch": 6.808112528622833, + "grad_norm": 0.29455891251564026, + "learning_rate": 1.492166445331663e-05, + "loss": 0.0357, "step": 26020 }, { - "epoch": 1.7029767746156361, - "grad_norm": 1.0049359798431396, - "learning_rate": 9.757293950387275e-05, - "loss": 0.1075, + "epoch": 6.810729473339876, + "grad_norm": 0.3399808406829834, + "learning_rate": 1.4902753419962146e-05, + "loss": 0.0332, "step": 26030 }, { - "epoch": 1.703631010794897, - "grad_norm": 1.0087964534759521, - "learning_rate": 9.757011148654625e-05, - "loss": 0.1066, + "epoch": 6.813346418056918, + "grad_norm": 0.3951043486595154, + "learning_rate": 1.4883849287980484e-05, + "loss": 0.0316, "step": 26040 }, { - "epoch": 1.7042852469741576, - "grad_norm": 0.9875262379646301, - "learning_rate": 9.756728186360373e-05, - "loss": 0.0973, + "epoch": 6.815963362773961, + "grad_norm": 0.3731957674026489, + "learning_rate": 1.4864952070292457e-05, + "loss": 0.0305, "step": 26050 }, { - "epoch": 1.7049394831534184, - "grad_norm": 0.837603747844696, - "learning_rate": 9.75644506351407e-05, - "loss": 0.1014, + "epoch": 6.818580307491004, + "grad_norm": 0.3230712413787842, + "learning_rate": 1.4846061779814117e-05, + "loss": 0.0327, "step": 26060 }, { - "epoch": 1.7055937193326791, - "grad_norm": 0.7701033353805542, - "learning_rate": 9.756161780125271e-05, - "loss": 0.0923, + "epoch": 6.821197252208047, + "grad_norm": 0.33193638920783997, + "learning_rate": 1.4827178429456798e-05, + "loss": 0.0351, "step": 26070 }, { - "epoch": 1.7062479555119399, - "grad_norm": 0.8201101422309875, - "learning_rate": 9.755878336203539e-05, - "loss": 0.1013, + "epoch": 6.82381419692509, + "grad_norm": 0.34041762351989746, + "learning_rate": 1.4808302032127103e-05, + "loss": 0.0293, "step": 26080 }, { - "epoch": 1.7069021916912006, - "grad_norm": 0.8958601951599121, - "learning_rate": 9.755594731758441e-05, - "loss": 0.0874, + "epoch": 6.826431141642133, + "grad_norm": 0.4517126679420471, + "learning_rate": 1.4789432600726866e-05, + "loss": 0.0293, "step": 26090 }, { - "epoch": 1.7075564278704611, - "grad_norm": 0.8118382692337036, - "learning_rate": 9.755310966799546e-05, - "loss": 0.0847, + "epoch": 6.829048086359176, + "grad_norm": 0.49828705191612244, + "learning_rate": 1.4770570148153167e-05, + "loss": 0.0293, "step": 26100 }, { - "epoch": 1.708210664049722, - "grad_norm": 0.8663161993026733, - "learning_rate": 9.755027041336439e-05, - "loss": 0.0967, + "epoch": 6.831665031076218, + "grad_norm": 0.3268827497959137, + "learning_rate": 1.4751714687298313e-05, + "loss": 0.0327, "step": 26110 }, { - "epoch": 1.7088649002289826, - "grad_norm": 0.7478710412979126, - "learning_rate": 9.754742955378696e-05, - "loss": 0.0861, + "epoch": 6.834281975793261, + "grad_norm": 0.49581748247146606, + "learning_rate": 1.4732866231049835e-05, + "loss": 0.033, "step": 26120 }, { - "epoch": 1.7095191364082434, - "grad_norm": 0.9378357529640198, - "learning_rate": 9.75445870893591e-05, - "loss": 0.0999, + "epoch": 6.836898920510304, + "grad_norm": 0.4515392780303955, + "learning_rate": 1.471402479229047e-05, + "loss": 0.0324, "step": 26130 }, { - "epoch": 1.7101733725875041, - "grad_norm": 0.8898491263389587, - "learning_rate": 9.754174302017671e-05, - "loss": 0.096, + "epoch": 6.839515865227347, + "grad_norm": 0.37542739510536194, + "learning_rate": 1.4695190383898172e-05, + "loss": 0.0303, "step": 26140 }, { - "epoch": 1.7108276087667647, - "grad_norm": 0.9357932806015015, - "learning_rate": 9.753889734633583e-05, - "loss": 0.0884, + "epoch": 6.84213280994439, + "grad_norm": 0.39024776220321655, + "learning_rate": 1.4676363018746087e-05, + "loss": 0.0283, "step": 26150 }, { - "epoch": 1.7114818449460256, - "grad_norm": 0.7866037487983704, - "learning_rate": 9.753605006793249e-05, - "loss": 0.0864, + "epoch": 6.844749754661433, + "grad_norm": 0.3718578815460205, + "learning_rate": 1.4657542709702526e-05, + "loss": 0.0338, "step": 26160 }, { - "epoch": 1.7121360811252861, - "grad_norm": 0.938848614692688, - "learning_rate": 9.75332011850628e-05, - "loss": 0.0993, + "epoch": 6.847366699378476, + "grad_norm": 0.29767003655433655, + "learning_rate": 1.4638729469630996e-05, + "loss": 0.0317, "step": 26170 }, { - "epoch": 1.712790317304547, - "grad_norm": 0.7685279846191406, - "learning_rate": 9.753035069782288e-05, - "loss": 0.0905, + "epoch": 6.849983644095518, + "grad_norm": 0.2173587679862976, + "learning_rate": 1.4619923311390179e-05, + "loss": 0.0288, "step": 26180 }, { - "epoch": 1.7134445534838076, - "grad_norm": 0.7881633043289185, - "learning_rate": 9.7527498606309e-05, - "loss": 0.0956, + "epoch": 6.852600588812561, + "grad_norm": 0.24532240629196167, + "learning_rate": 1.4601124247833894e-05, + "loss": 0.0265, "step": 26190 }, { - "epoch": 1.7140987896630684, - "grad_norm": 0.8515254259109497, - "learning_rate": 9.752464491061738e-05, - "loss": 0.0971, + "epoch": 6.855217533529604, + "grad_norm": 0.29330992698669434, + "learning_rate": 1.4582332291811134e-05, + "loss": 0.0328, "step": 26200 }, { - "epoch": 1.7147530258423291, - "grad_norm": 1.0867912769317627, - "learning_rate": 9.752178961084438e-05, - "loss": 0.1, + "epoch": 6.857834478246647, + "grad_norm": 0.3279236853122711, + "learning_rate": 1.4563547456166017e-05, + "loss": 0.0289, "step": 26210 }, { - "epoch": 1.7154072620215897, - "grad_norm": 0.8622691631317139, - "learning_rate": 9.751893270708631e-05, - "loss": 0.0981, + "epoch": 6.86045142296369, + "grad_norm": 0.35393086075782776, + "learning_rate": 1.4544769753737803e-05, + "loss": 0.0311, "step": 26220 }, { - "epoch": 1.7160614982008506, - "grad_norm": 0.8179900050163269, - "learning_rate": 9.751607419943966e-05, - "loss": 0.0891, + "epoch": 6.863068367680732, + "grad_norm": 0.49016067385673523, + "learning_rate": 1.4525999197360874e-05, + "loss": 0.0342, "step": 26230 }, { - "epoch": 1.7167157343801112, - "grad_norm": 0.9989860653877258, - "learning_rate": 9.75132140880009e-05, - "loss": 0.0906, + "epoch": 6.865685312397775, + "grad_norm": 0.33892664313316345, + "learning_rate": 1.450723579986474e-05, + "loss": 0.0321, "step": 26240 }, { - "epoch": 1.717369970559372, - "grad_norm": 0.8796525597572327, - "learning_rate": 9.751035237286654e-05, - "loss": 0.0915, + "epoch": 6.868302257114818, + "grad_norm": 0.28874385356903076, + "learning_rate": 1.4488479574074e-05, + "loss": 0.0316, "step": 26250 }, { - "epoch": 1.7180242067386327, - "grad_norm": 0.7103739380836487, - "learning_rate": 9.750748905413321e-05, - "loss": 0.0898, + "epoch": 6.870919201831861, + "grad_norm": 0.35149651765823364, + "learning_rate": 1.4469730532808337e-05, + "loss": 0.0351, "step": 26260 }, { - "epoch": 1.7186784429178934, - "grad_norm": 0.8540217876434326, - "learning_rate": 9.75046241318975e-05, - "loss": 0.0971, + "epoch": 6.873536146548904, + "grad_norm": 0.3583768904209137, + "learning_rate": 1.4450988688882563e-05, + "loss": 0.0337, "step": 26270 }, { - "epoch": 1.7193326790971541, - "grad_norm": 0.8470193147659302, - "learning_rate": 9.750175760625616e-05, - "loss": 0.0885, + "epoch": 6.876153091265947, + "grad_norm": 0.23553206026554108, + "learning_rate": 1.4432254055106547e-05, + "loss": 0.0326, "step": 26280 }, { - "epoch": 1.7199869152764147, - "grad_norm": 0.8234091401100159, - "learning_rate": 9.749888947730592e-05, - "loss": 0.0924, + "epoch": 6.87877003598299, + "grad_norm": 0.4123249053955078, + "learning_rate": 1.4413526644285252e-05, + "loss": 0.0286, "step": 26290 }, { - "epoch": 1.7206411514556756, - "grad_norm": 0.7980528473854065, - "learning_rate": 9.749601974514358e-05, - "loss": 0.0907, + "epoch": 6.881386980700032, + "grad_norm": 0.3820636570453644, + "learning_rate": 1.4394806469218658e-05, + "loss": 0.0276, "step": 26300 }, { - "epoch": 1.7212953876349362, - "grad_norm": 0.8894267082214355, - "learning_rate": 9.749314840986603e-05, - "loss": 0.0888, + "epoch": 6.884003925417075, + "grad_norm": 0.38106122612953186, + "learning_rate": 1.4376093542701841e-05, + "loss": 0.0315, "step": 26310 }, { - "epoch": 1.721949623814197, - "grad_norm": 0.9575929641723633, - "learning_rate": 9.749027547157015e-05, - "loss": 0.098, + "epoch": 6.886620870134118, + "grad_norm": 0.3545074462890625, + "learning_rate": 1.4357387877524909e-05, + "loss": 0.0371, "step": 26320 }, { - "epoch": 1.7226038599934577, - "grad_norm": 0.9381182193756104, - "learning_rate": 9.748740093035293e-05, - "loss": 0.0851, + "epoch": 6.889237814851161, + "grad_norm": 0.39840003848075867, + "learning_rate": 1.433868948647302e-05, + "loss": 0.0358, "step": 26330 }, { - "epoch": 1.7232580961727182, - "grad_norm": 0.8991712927818298, - "learning_rate": 9.748452478631139e-05, - "loss": 0.1015, + "epoch": 6.891854759568204, + "grad_norm": 0.3619614839553833, + "learning_rate": 1.4319998382326327e-05, + "loss": 0.0345, "step": 26340 }, { - "epoch": 1.7239123323519792, - "grad_norm": 0.9150116443634033, - "learning_rate": 9.74816470395426e-05, - "loss": 0.095, + "epoch": 6.894471704285247, + "grad_norm": 0.2903418242931366, + "learning_rate": 1.4301314577860042e-05, + "loss": 0.0341, "step": 26350 }, { - "epoch": 1.7245665685312397, - "grad_norm": 0.9800270199775696, - "learning_rate": 9.74787676901437e-05, - "loss": 0.0929, + "epoch": 6.89708864900229, + "grad_norm": 0.27816247940063477, + "learning_rate": 1.4282638085844351e-05, + "loss": 0.029, "step": 26360 }, { - "epoch": 1.7252208047105007, - "grad_norm": 0.8880261182785034, - "learning_rate": 9.747588673821187e-05, - "loss": 0.0904, + "epoch": 6.899705593719332, + "grad_norm": 0.32325780391693115, + "learning_rate": 1.4263968919044472e-05, + "loss": 0.0295, "step": 26370 }, { - "epoch": 1.7258750408897612, - "grad_norm": 0.8214205503463745, - "learning_rate": 9.747300418384436e-05, - "loss": 0.0933, + "epoch": 6.902322538436375, + "grad_norm": 0.4054308235645294, + "learning_rate": 1.42453070902206e-05, + "loss": 0.0314, "step": 26380 }, { - "epoch": 1.726529277069022, - "grad_norm": 0.9119507074356079, - "learning_rate": 9.747012002713846e-05, - "loss": 0.0881, + "epoch": 6.904939483153418, + "grad_norm": 0.4908261001110077, + "learning_rate": 1.4226652612127933e-05, + "loss": 0.0371, "step": 26390 }, { - "epoch": 1.7271835132482827, - "grad_norm": 0.8520674109458923, - "learning_rate": 9.746723426819151e-05, - "loss": 0.0915, + "epoch": 6.907556427870461, + "grad_norm": 0.42528581619262695, + "learning_rate": 1.4208005497516608e-05, + "loss": 0.0309, "step": 26400 }, { - "epoch": 1.7278377494275432, - "grad_norm": 0.836610734462738, - "learning_rate": 9.74643469071009e-05, - "loss": 0.0977, + "epoch": 6.910173372587504, + "grad_norm": 0.36515799164772034, + "learning_rate": 1.4189365759131762e-05, + "loss": 0.0362, "step": 26410 }, { - "epoch": 1.7284919856068042, - "grad_norm": 1.0091043710708618, - "learning_rate": 9.746145794396412e-05, - "loss": 0.0978, + "epoch": 6.912790317304547, + "grad_norm": 0.35099223256111145, + "learning_rate": 1.417073340971348e-05, + "loss": 0.0289, "step": 26420 }, { - "epoch": 1.7291462217860647, - "grad_norm": 0.77135169506073, - "learning_rate": 9.745856737887866e-05, - "loss": 0.099, + "epoch": 6.91540726202159, + "grad_norm": 0.26511016488075256, + "learning_rate": 1.4152108461996811e-05, + "loss": 0.0303, "step": 26430 }, { - "epoch": 1.7298004579653254, - "grad_norm": 0.9269153475761414, - "learning_rate": 9.745567521194207e-05, - "loss": 0.0883, + "epoch": 6.918024206738632, + "grad_norm": 0.4376407265663147, + "learning_rate": 1.4133490928711706e-05, + "loss": 0.0319, "step": 26440 }, { - "epoch": 1.7304546941445862, - "grad_norm": 0.8424230813980103, - "learning_rate": 9.7452781443252e-05, - "loss": 0.0935, + "epoch": 6.920641151455675, + "grad_norm": 0.31073182821273804, + "learning_rate": 1.4114880822583099e-05, + "loss": 0.0296, "step": 26450 }, { - "epoch": 1.731108930323847, - "grad_norm": 0.7595318555831909, - "learning_rate": 9.744988607290611e-05, - "loss": 0.0949, + "epoch": 6.923258096172718, + "grad_norm": 0.2978487014770508, + "learning_rate": 1.40962781563308e-05, + "loss": 0.0307, "step": 26460 }, { - "epoch": 1.7317631665031077, - "grad_norm": 0.7194068431854248, - "learning_rate": 9.744698910100211e-05, - "loss": 0.099, + "epoch": 6.925875040889761, + "grad_norm": 0.37724074721336365, + "learning_rate": 1.4077682942669562e-05, + "loss": 0.0288, "step": 26470 }, { - "epoch": 1.7324174026823682, - "grad_norm": 0.8426297307014465, - "learning_rate": 9.74440905276378e-05, - "loss": 0.0878, + "epoch": 6.928491985606804, + "grad_norm": 0.31989142298698425, + "learning_rate": 1.4059095194309047e-05, + "loss": 0.0349, "step": 26480 }, { - "epoch": 1.7330716388616292, - "grad_norm": 0.8402136564254761, - "learning_rate": 9.744119035291101e-05, - "loss": 0.094, + "epoch": 6.9311089303238465, + "grad_norm": 0.2881931662559509, + "learning_rate": 1.4040514923953807e-05, + "loss": 0.0307, "step": 26490 }, { - "epoch": 1.7337258750408897, - "grad_norm": 0.9715792536735535, - "learning_rate": 9.743828857691963e-05, - "loss": 0.094, + "epoch": 6.933725875040889, + "grad_norm": 0.3236912488937378, + "learning_rate": 1.4021942144303262e-05, + "loss": 0.0312, "step": 26500 }, { - "epoch": 1.7343801112201505, - "grad_norm": 0.8805036544799805, - "learning_rate": 9.74353851997616e-05, - "loss": 0.1009, + "epoch": 6.936342819757932, + "grad_norm": 0.27450498938560486, + "learning_rate": 1.4003376868051748e-05, + "loss": 0.0338, "step": 26510 }, { - "epoch": 1.7350343473994112, - "grad_norm": 0.9216516017913818, - "learning_rate": 9.743248022153491e-05, - "loss": 0.0903, + "epoch": 6.938959764474975, + "grad_norm": 0.40014001727104187, + "learning_rate": 1.3984819107888459e-05, + "loss": 0.032, "step": 26520 }, { - "epoch": 1.735688583578672, - "grad_norm": 0.7972573041915894, - "learning_rate": 9.742957364233763e-05, - "loss": 0.089, + "epoch": 6.941576709192018, + "grad_norm": 0.3042270243167877, + "learning_rate": 1.3966268876497435e-05, + "loss": 0.0309, "step": 26530 }, { - "epoch": 1.7363428197579327, - "grad_norm": 0.8343783617019653, - "learning_rate": 9.742666546226784e-05, - "loss": 0.082, + "epoch": 6.944193653909061, + "grad_norm": 0.4195344150066376, + "learning_rate": 1.3947726186557592e-05, + "loss": 0.0258, "step": 26540 }, { - "epoch": 1.7369970559371932, - "grad_norm": 1.03281831741333, - "learning_rate": 9.74237556814237e-05, - "loss": 0.0882, + "epoch": 6.946810598626104, + "grad_norm": 0.43801119923591614, + "learning_rate": 1.3929191050742695e-05, + "loss": 0.0303, "step": 26550 }, { - "epoch": 1.7376512921164542, - "grad_norm": 0.9120453596115112, - "learning_rate": 9.742084429990344e-05, - "loss": 0.0942, + "epoch": 6.9494275433431465, + "grad_norm": 0.5042412281036377, + "learning_rate": 1.3910663481721314e-05, + "loss": 0.0346, "step": 26560 }, { - "epoch": 1.7383055282957147, - "grad_norm": 0.752260684967041, - "learning_rate": 9.741793131780532e-05, - "loss": 0.105, + "epoch": 6.9520444880601895, + "grad_norm": 0.5035489797592163, + "learning_rate": 1.3892143492156872e-05, + "loss": 0.0355, "step": 26570 }, { - "epoch": 1.7389597644749755, - "grad_norm": 0.8960766792297363, - "learning_rate": 9.741501673522767e-05, - "loss": 0.1, + "epoch": 6.9546614327772325, + "grad_norm": 0.39491507411003113, + "learning_rate": 1.3873631094707618e-05, + "loss": 0.0369, "step": 26580 }, { - "epoch": 1.7396140006542362, - "grad_norm": 0.9580662250518799, - "learning_rate": 9.741210055226883e-05, - "loss": 0.0972, + "epoch": 6.9572783774942755, + "grad_norm": 0.3853635787963867, + "learning_rate": 1.3855126302026602e-05, + "loss": 0.0326, "step": 26590 }, { - "epoch": 1.7402682368334967, - "grad_norm": 0.8709340691566467, - "learning_rate": 9.740918276902726e-05, - "loss": 0.0925, + "epoch": 6.9598953222113185, + "grad_norm": 0.4158419370651245, + "learning_rate": 1.383662912676166e-05, + "loss": 0.0329, "step": 26600 }, { - "epoch": 1.7409224730127577, - "grad_norm": 0.8603937029838562, - "learning_rate": 9.740626338560146e-05, - "loss": 0.099, + "epoch": 6.9625122669283614, + "grad_norm": 0.36834490299224854, + "learning_rate": 1.3818139581555456e-05, + "loss": 0.0354, "step": 26610 }, { - "epoch": 1.7415767091920182, - "grad_norm": 0.8637518286705017, - "learning_rate": 9.740334240208992e-05, - "loss": 0.1037, + "epoch": 6.965129211645404, + "grad_norm": 0.2967318296432495, + "learning_rate": 1.3799657679045397e-05, + "loss": 0.0271, "step": 26620 }, { - "epoch": 1.742230945371279, - "grad_norm": 0.954017698764801, - "learning_rate": 9.740041981859126e-05, - "loss": 0.0986, + "epoch": 6.9677461563624465, + "grad_norm": 0.3893934190273285, + "learning_rate": 1.3781183431863703e-05, + "loss": 0.0308, "step": 26630 }, { - "epoch": 1.7428851815505397, - "grad_norm": 0.9584331512451172, - "learning_rate": 9.739749563520413e-05, - "loss": 0.0905, + "epoch": 6.9703631010794895, + "grad_norm": 0.5401083827018738, + "learning_rate": 1.3762716852637348e-05, + "loss": 0.0384, "step": 26640 }, { - "epoch": 1.7435394177298005, - "grad_norm": 0.8306182026863098, - "learning_rate": 9.73945698520272e-05, - "loss": 0.0921, + "epoch": 6.9729800457965325, + "grad_norm": 0.5646620988845825, + "learning_rate": 1.374425795398807e-05, + "loss": 0.0312, "step": 26650 }, { - "epoch": 1.7441936539090612, - "grad_norm": 0.9190852642059326, - "learning_rate": 9.739164246915926e-05, - "loss": 0.0971, + "epoch": 6.9755969905135755, + "grad_norm": 0.31603124737739563, + "learning_rate": 1.3725806748532338e-05, + "loss": 0.0317, "step": 26660 }, { - "epoch": 1.7448478900883218, - "grad_norm": 0.8570190072059631, - "learning_rate": 9.738871348669907e-05, - "loss": 0.0914, + "epoch": 6.9782139352306185, + "grad_norm": 0.42641425132751465, + "learning_rate": 1.3707363248881383e-05, + "loss": 0.0321, "step": 26670 }, { - "epoch": 1.7455021262675827, - "grad_norm": 0.7864238023757935, - "learning_rate": 9.738578290474554e-05, - "loss": 0.0974, + "epoch": 6.9808308799476615, + "grad_norm": 0.5660805106163025, + "learning_rate": 1.368892746764116e-05, + "loss": 0.0321, "step": 26680 }, { - "epoch": 1.7461563624468432, - "grad_norm": 0.9038723707199097, - "learning_rate": 9.738285072339755e-05, - "loss": 0.0914, + "epoch": 6.983447824664704, + "grad_norm": 0.4165215492248535, + "learning_rate": 1.3670499417412373e-05, + "loss": 0.0285, "step": 26690 }, { - "epoch": 1.746810598626104, - "grad_norm": 1.1634936332702637, - "learning_rate": 9.73799169427541e-05, - "loss": 0.0965, + "epoch": 6.986064769381747, + "grad_norm": 0.4457850754261017, + "learning_rate": 1.3652079110790388e-05, + "loss": 0.029, "step": 26700 }, { - "epoch": 1.7474648348053647, - "grad_norm": 0.830987811088562, - "learning_rate": 9.737698156291418e-05, - "loss": 0.1034, + "epoch": 6.98868171409879, + "grad_norm": 0.32575851678848267, + "learning_rate": 1.3633666560365337e-05, + "loss": 0.0289, "step": 26710 }, { - "epoch": 1.7481190709846255, - "grad_norm": 0.7684716582298279, - "learning_rate": 9.737404458397688e-05, - "loss": 0.0894, + "epoch": 6.991298658815833, + "grad_norm": 0.24893328547477722, + "learning_rate": 1.3615261778722008e-05, + "loss": 0.0331, "step": 26720 }, { - "epoch": 1.7487733071638862, - "grad_norm": 1.0796003341674805, - "learning_rate": 9.737110600604135e-05, - "loss": 0.1027, + "epoch": 6.993915603532876, + "grad_norm": 0.2833508551120758, + "learning_rate": 1.3596864778439899e-05, + "loss": 0.03, "step": 26730 }, { - "epoch": 1.7494275433431468, - "grad_norm": 0.8072198629379272, - "learning_rate": 9.736816582920674e-05, - "loss": 0.0956, + "epoch": 6.9965325482499185, + "grad_norm": 0.4072076380252838, + "learning_rate": 1.3578475572093185e-05, + "loss": 0.0344, "step": 26740 }, { - "epoch": 1.7500817795224077, - "grad_norm": 0.9745908379554749, - "learning_rate": 9.736522405357231e-05, - "loss": 0.0923, + "epoch": 6.999149492966961, + "grad_norm": 0.30101343989372253, + "learning_rate": 1.3560094172250737e-05, + "loss": 0.0324, "step": 26750 }, { - "epoch": 1.7507360157016683, - "grad_norm": 0.9096083641052246, - "learning_rate": 9.736228067923735e-05, - "loss": 0.0863, + "epoch": 7.001570166830226, + "grad_norm": 0.3735860586166382, + "learning_rate": 1.3541720591476033e-05, + "loss": 0.0292, "step": 26760 }, { - "epoch": 1.751390251880929, - "grad_norm": 0.7566020488739014, - "learning_rate": 9.73593357063012e-05, - "loss": 0.0921, + "epoch": 7.004187111547268, + "grad_norm": 0.3974541425704956, + "learning_rate": 1.3523354842327263e-05, + "loss": 0.0309, "step": 26770 }, { - "epoch": 1.7520444880601898, - "grad_norm": 0.9019480347633362, - "learning_rate": 9.735638913486327e-05, - "loss": 0.0888, + "epoch": 7.006804056264311, + "grad_norm": 0.32980749011039734, + "learning_rate": 1.350499693735724e-05, + "loss": 0.0335, "step": 26780 }, { - "epoch": 1.7526987242394503, - "grad_norm": 0.8336036205291748, - "learning_rate": 9.735344096502302e-05, - "loss": 0.0952, + "epoch": 7.009421000981354, + "grad_norm": 0.3228886127471924, + "learning_rate": 1.3486646889113427e-05, + "loss": 0.032, "step": 26790 }, { - "epoch": 1.7533529604187112, - "grad_norm": 0.8614187240600586, - "learning_rate": 9.735049119687993e-05, - "loss": 0.0946, + "epoch": 7.012037945698397, + "grad_norm": 0.2811286449432373, + "learning_rate": 1.3468304710137902e-05, + "loss": 0.0305, "step": 26800 }, { - "epoch": 1.7540071965979718, - "grad_norm": 0.9070919752120972, - "learning_rate": 9.73475398305336e-05, - "loss": 0.0853, + "epoch": 7.01465489041544, + "grad_norm": 0.4297829270362854, + "learning_rate": 1.344997041296736e-05, + "loss": 0.031, "step": 26810 }, { - "epoch": 1.7546614327772327, - "grad_norm": 0.8516311049461365, - "learning_rate": 9.734458686608361e-05, - "loss": 0.0972, + "epoch": 7.017271835132483, + "grad_norm": 0.3122144937515259, + "learning_rate": 1.3431644010133132e-05, + "loss": 0.032, "step": 26820 }, { - "epoch": 1.7553156689564933, - "grad_norm": 1.2225861549377441, - "learning_rate": 9.734163230362965e-05, - "loss": 0.1026, + "epoch": 7.019888779849525, + "grad_norm": 0.4094059467315674, + "learning_rate": 1.341332551416114e-05, + "loss": 0.03, "step": 26830 }, { - "epoch": 1.755969905135754, - "grad_norm": 0.8379483222961426, - "learning_rate": 9.733867614327145e-05, - "loss": 0.0914, + "epoch": 7.022505724566568, + "grad_norm": 0.29214930534362793, + "learning_rate": 1.33950149375719e-05, + "loss": 0.0286, "step": 26840 }, { - "epoch": 1.7566241413150148, - "grad_norm": 1.0506983995437622, - "learning_rate": 9.733571838510878e-05, - "loss": 0.089, + "epoch": 7.025122669283611, + "grad_norm": 0.2737705111503601, + "learning_rate": 1.3376712292880533e-05, + "loss": 0.0274, "step": 26850 }, { - "epoch": 1.7572783774942753, - "grad_norm": 0.8693654537200928, - "learning_rate": 9.733275902924146e-05, - "loss": 0.1042, + "epoch": 7.027739614000654, + "grad_norm": 0.3263177275657654, + "learning_rate": 1.3358417592596705e-05, + "loss": 0.0364, "step": 26860 }, { - "epoch": 1.7579326136735363, - "grad_norm": 0.8979597091674805, - "learning_rate": 9.732979807576941e-05, - "loss": 0.0866, + "epoch": 7.030356558717697, + "grad_norm": 0.5150085091590881, + "learning_rate": 1.334013084922468e-05, + "loss": 0.0309, "step": 26870 }, { - "epoch": 1.7585868498527968, - "grad_norm": 1.0074806213378906, - "learning_rate": 9.732683552479252e-05, - "loss": 0.1029, + "epoch": 7.03297350343474, + "grad_norm": 0.35235485434532166, + "learning_rate": 1.3321852075263269e-05, + "loss": 0.0258, "step": 26880 }, { - "epoch": 1.7592410860320575, - "grad_norm": 0.7297909259796143, - "learning_rate": 9.732387137641084e-05, - "loss": 0.1038, + "epoch": 7.035590448151782, + "grad_norm": 0.26451367139816284, + "learning_rate": 1.3303581283205858e-05, + "loss": 0.0335, "step": 26890 }, { - "epoch": 1.7598953222113183, - "grad_norm": 0.7585257887840271, - "learning_rate": 9.732090563072437e-05, - "loss": 0.1036, + "epoch": 7.038207392868825, + "grad_norm": 0.20942290127277374, + "learning_rate": 1.3285318485540348e-05, + "loss": 0.0241, "step": 26900 }, { - "epoch": 1.760549558390579, - "grad_norm": 0.7857770919799805, - "learning_rate": 9.731793828783323e-05, - "loss": 0.0912, + "epoch": 7.040824337585868, + "grad_norm": 0.24867643415927887, + "learning_rate": 1.3267063694749182e-05, + "loss": 0.0305, "step": 26910 }, { - "epoch": 1.7612037945698398, - "grad_norm": 0.7956094741821289, - "learning_rate": 9.731496934783759e-05, - "loss": 0.0953, + "epoch": 7.043441282302911, + "grad_norm": 0.3994474411010742, + "learning_rate": 1.3248816923309348e-05, + "loss": 0.0256, "step": 26920 }, { - "epoch": 1.7618580307491003, - "grad_norm": 0.9437527656555176, - "learning_rate": 9.731199881083763e-05, - "loss": 0.0901, + "epoch": 7.046058227019954, + "grad_norm": 0.2341812402009964, + "learning_rate": 1.3230578183692339e-05, + "loss": 0.0266, "step": 26930 }, { - "epoch": 1.7625122669283613, - "grad_norm": 0.733113169670105, - "learning_rate": 9.730902667693365e-05, - "loss": 0.0945, + "epoch": 7.048675171736997, + "grad_norm": 0.3208339512348175, + "learning_rate": 1.3212347488364158e-05, + "loss": 0.0302, "step": 26940 }, { - "epoch": 1.7631665031076218, - "grad_norm": 0.9366680383682251, - "learning_rate": 9.730605294622593e-05, - "loss": 0.1043, + "epoch": 7.05129211645404, + "grad_norm": 0.28134050965309143, + "learning_rate": 1.3194124849785334e-05, + "loss": 0.0284, "step": 26950 }, { - "epoch": 1.7638207392868825, - "grad_norm": 0.963205873966217, - "learning_rate": 9.730307761881487e-05, - "loss": 0.1127, + "epoch": 7.053909061171082, + "grad_norm": 0.20203755795955658, + "learning_rate": 1.3175910280410836e-05, + "loss": 0.0292, "step": 26960 }, { - "epoch": 1.7644749754661433, - "grad_norm": 0.8156423568725586, - "learning_rate": 9.730010069480088e-05, - "loss": 0.088, + "epoch": 7.056526005888125, + "grad_norm": 0.2081657201051712, + "learning_rate": 1.315770379269017e-05, + "loss": 0.0287, "step": 26970 }, { - "epoch": 1.765129211645404, - "grad_norm": 1.1396329402923584, - "learning_rate": 9.729712217428444e-05, - "loss": 0.0834, + "epoch": 7.059142950605168, + "grad_norm": 0.37536320090293884, + "learning_rate": 1.3139505399067298e-05, + "loss": 0.0296, "step": 26980 }, { - "epoch": 1.7657834478246648, - "grad_norm": 0.7981417775154114, - "learning_rate": 9.72941420573661e-05, - "loss": 0.0932, + "epoch": 7.061759895322211, + "grad_norm": 0.351087749004364, + "learning_rate": 1.3121315111980653e-05, + "loss": 0.0306, "step": 26990 }, { - "epoch": 1.7664376840039253, - "grad_norm": 0.8251798748970032, - "learning_rate": 9.729116034414641e-05, - "loss": 0.085, + "epoch": 7.064376840039254, + "grad_norm": 0.5465127229690552, + "learning_rate": 1.3103132943863122e-05, + "loss": 0.0336, + "step": 27000 + }, + { + "epoch": 7.064376840039254, + "eval_loss": 0.03515085224030179, + "eval_runtime": 9.687, + "eval_samples_per_second": 105.709, + "eval_steps_per_second": 1.652, "step": 27000 }, { - "epoch": 1.7670919201831863, - "grad_norm": 0.8225140571594238, - "learning_rate": 9.728817703472604e-05, - "loss": 0.0906, + "epoch": 7.066993784756297, + "grad_norm": 0.19655290246009827, + "learning_rate": 1.3084958907142033e-05, + "loss": 0.0257, "step": 27010 }, { - "epoch": 1.7677461563624468, - "grad_norm": 0.8406652808189392, - "learning_rate": 9.728519212920568e-05, - "loss": 0.0907, + "epoch": 7.0696107294733395, + "grad_norm": 0.4432132840156555, + "learning_rate": 1.3066793014239182e-05, + "loss": 0.0298, "step": 27020 }, { - "epoch": 1.7684003925417076, - "grad_norm": 0.7829009294509888, - "learning_rate": 9.728220562768607e-05, - "loss": 0.0933, + "epoch": 7.0722276741903825, + "grad_norm": 0.2571084499359131, + "learning_rate": 1.3048635277570776e-05, + "loss": 0.0315, "step": 27030 }, { - "epoch": 1.7690546287209683, - "grad_norm": 0.6848630309104919, - "learning_rate": 9.727921753026802e-05, - "loss": 0.0981, + "epoch": 7.074844618907425, + "grad_norm": 0.2586584687232971, + "learning_rate": 1.303048570954747e-05, + "loss": 0.0316, "step": 27040 }, { - "epoch": 1.7697088649002288, - "grad_norm": 0.7441691756248474, - "learning_rate": 9.727622783705239e-05, - "loss": 0.0891, + "epoch": 7.077461563624468, + "grad_norm": 0.29508477449417114, + "learning_rate": 1.3012344322574322e-05, + "loss": 0.0291, "step": 27050 }, { - "epoch": 1.7703631010794898, - "grad_norm": 0.8913453221321106, - "learning_rate": 9.727323654814009e-05, - "loss": 0.0987, + "epoch": 7.080078508341511, + "grad_norm": 0.3852759599685669, + "learning_rate": 1.2994211129050782e-05, + "loss": 0.0302, "step": 27060 }, { - "epoch": 1.7710173372587503, - "grad_norm": 0.7993099093437195, - "learning_rate": 9.727024366363206e-05, - "loss": 0.0904, + "epoch": 7.082695453058554, + "grad_norm": 0.30965447425842285, + "learning_rate": 1.2976086141370727e-05, + "loss": 0.0326, "step": 27070 }, { - "epoch": 1.771671573438011, - "grad_norm": 0.821160078048706, - "learning_rate": 9.726724918362935e-05, - "loss": 0.0987, + "epoch": 7.085312397775597, + "grad_norm": 0.5237151384353638, + "learning_rate": 1.2957969371922427e-05, + "loss": 0.0346, "step": 27080 }, { - "epoch": 1.7723258096172718, - "grad_norm": 0.7259571552276611, - "learning_rate": 9.7264253108233e-05, - "loss": 0.0973, + "epoch": 7.0879293424926395, + "grad_norm": 0.27156862616539, + "learning_rate": 1.2939860833088501e-05, + "loss": 0.029, "step": 27090 }, { - "epoch": 1.7729800457965326, - "grad_norm": 0.8779374361038208, - "learning_rate": 9.726125543754417e-05, - "loss": 0.0934, + "epoch": 7.0905462872096825, + "grad_norm": 0.29252901673316956, + "learning_rate": 1.2921760537245986e-05, + "loss": 0.0294, "step": 27100 }, { - "epoch": 1.7736342819757933, - "grad_norm": 0.7990883588790894, - "learning_rate": 9.725825617166402e-05, - "loss": 0.0946, + "epoch": 7.0931632319267255, + "grad_norm": 0.35140424966812134, + "learning_rate": 1.2903668496766244e-05, + "loss": 0.0283, "step": 27110 }, { - "epoch": 1.7742885181550538, - "grad_norm": 0.7716463804244995, - "learning_rate": 9.725525531069377e-05, - "loss": 0.0956, + "epoch": 7.0957801766437685, + "grad_norm": 0.34542346000671387, + "learning_rate": 1.288558472401502e-05, + "loss": 0.0303, "step": 27120 }, { - "epoch": 1.7749427543343148, - "grad_norm": 1.0061755180358887, - "learning_rate": 9.725225285473473e-05, - "loss": 0.0804, + "epoch": 7.0983971213608115, + "grad_norm": 0.2882545590400696, + "learning_rate": 1.2867509231352409e-05, + "loss": 0.0302, "step": 27130 }, { - "epoch": 1.7755969905135753, - "grad_norm": 0.9990642070770264, - "learning_rate": 9.724924880388824e-05, - "loss": 0.1089, + "epoch": 7.1010140660778545, + "grad_norm": 0.28216639161109924, + "learning_rate": 1.2849442031132832e-05, + "loss": 0.0281, "step": 27140 }, { - "epoch": 1.776251226692836, - "grad_norm": 0.8501982688903809, - "learning_rate": 9.72462431582557e-05, - "loss": 0.0938, + "epoch": 7.103631010794897, + "grad_norm": 0.2185831218957901, + "learning_rate": 1.2831383135705067e-05, + "loss": 0.0257, "step": 27150 }, { - "epoch": 1.7769054628720968, - "grad_norm": 0.8450808525085449, - "learning_rate": 9.724323591793851e-05, - "loss": 0.0846, + "epoch": 7.1062479555119396, + "grad_norm": 0.2274857759475708, + "learning_rate": 1.2813332557412171e-05, + "loss": 0.0278, "step": 27160 }, { - "epoch": 1.7775596990513576, - "grad_norm": 0.6056027412414551, - "learning_rate": 9.724022708303824e-05, - "loss": 0.0928, + "epoch": 7.1088649002289825, + "grad_norm": 0.22652925550937653, + "learning_rate": 1.2795290308591574e-05, + "loss": 0.0253, "step": 27170 }, { - "epoch": 1.7782139352306183, - "grad_norm": 0.7293991446495056, - "learning_rate": 9.723721665365639e-05, - "loss": 0.0927, + "epoch": 7.1114818449460255, + "grad_norm": 0.3402152955532074, + "learning_rate": 1.2777256401574956e-05, + "loss": 0.0326, "step": 27180 }, { - "epoch": 1.7788681714098789, - "grad_norm": 0.8087413311004639, - "learning_rate": 9.723420462989461e-05, - "loss": 0.0866, + "epoch": 7.1140987896630685, + "grad_norm": 0.43154576420783997, + "learning_rate": 1.2759230848688331e-05, + "loss": 0.0322, "step": 27190 }, { - "epoch": 1.7795224075891398, - "grad_norm": 0.9952290654182434, - "learning_rate": 9.723119101185455e-05, - "loss": 0.0868, + "epoch": 7.1167157343801115, + "grad_norm": 0.4719350039958954, + "learning_rate": 1.274121366225201e-05, + "loss": 0.028, "step": 27200 }, { - "epoch": 1.7801766437684003, - "grad_norm": 0.6939850449562073, - "learning_rate": 9.722817579963789e-05, - "loss": 0.0953, + "epoch": 7.1193326790971545, + "grad_norm": 0.30630409717559814, + "learning_rate": 1.2723204854580548e-05, + "loss": 0.029, "step": 27210 }, { - "epoch": 1.780830879947661, - "grad_norm": 0.7775471806526184, - "learning_rate": 9.722515899334647e-05, - "loss": 0.0985, + "epoch": 7.121949623814197, + "grad_norm": 0.39883729815483093, + "learning_rate": 1.2705204437982805e-05, + "loss": 0.0306, "step": 27220 }, { - "epoch": 1.7814851161269218, - "grad_norm": 0.824198842048645, - "learning_rate": 9.722214059308208e-05, - "loss": 0.0992, + "epoch": 7.12456656853124, + "grad_norm": 0.2656046748161316, + "learning_rate": 1.268721242476189e-05, + "loss": 0.0295, "step": 27230 }, { - "epoch": 1.7821393523061824, - "grad_norm": 0.8829124569892883, - "learning_rate": 9.72191205989466e-05, - "loss": 0.095, + "epoch": 7.127183513248283, + "grad_norm": 0.32729870080947876, + "learning_rate": 1.2669228827215186e-05, + "loss": 0.0284, "step": 27240 }, { - "epoch": 1.7827935884854433, - "grad_norm": 0.8177944421768188, - "learning_rate": 9.721609901104194e-05, - "loss": 0.094, + "epoch": 7.129800457965326, + "grad_norm": 0.3247520923614502, + "learning_rate": 1.2651253657634315e-05, + "loss": 0.0282, "step": 27250 }, { - "epoch": 1.7834478246647039, - "grad_norm": 1.004304051399231, - "learning_rate": 9.721307582947014e-05, - "loss": 0.091, + "epoch": 7.132417402682369, + "grad_norm": 0.4259008467197418, + "learning_rate": 1.2633286928305127e-05, + "loss": 0.0338, "step": 27260 }, { - "epoch": 1.7841020608439648, - "grad_norm": 0.8551913499832153, - "learning_rate": 9.721005105433319e-05, - "loss": 0.0965, + "epoch": 7.135034347399412, + "grad_norm": 0.2542956471443176, + "learning_rate": 1.2615328651507701e-05, + "loss": 0.029, "step": 27270 }, { - "epoch": 1.7847562970232254, - "grad_norm": 0.7581825256347656, - "learning_rate": 9.720702468573321e-05, - "loss": 0.0934, + "epoch": 7.137651292116454, + "grad_norm": 0.26459887623786926, + "learning_rate": 1.2597378839516364e-05, + "loss": 0.0275, "step": 27280 }, { - "epoch": 1.785410533202486, - "grad_norm": 0.761549711227417, - "learning_rate": 9.720399672377234e-05, - "loss": 0.0978, + "epoch": 7.140268236833497, + "grad_norm": 0.3292798399925232, + "learning_rate": 1.2579437504599639e-05, + "loss": 0.0319, "step": 27290 }, { - "epoch": 1.7860647693817469, - "grad_norm": 0.8848084211349487, - "learning_rate": 9.72009671685528e-05, - "loss": 0.0811, + "epoch": 7.14288518155054, + "grad_norm": 0.35263168811798096, + "learning_rate": 1.2561504659020269e-05, + "loss": 0.0307, "step": 27300 }, { - "epoch": 1.7867190055610074, - "grad_norm": 0.8144980669021606, - "learning_rate": 9.719793602017681e-05, - "loss": 0.0844, + "epoch": 7.145502126267583, + "grad_norm": 0.2072511911392212, + "learning_rate": 1.254358031503517e-05, + "loss": 0.0252, "step": 27310 }, { - "epoch": 1.7873732417402683, - "grad_norm": 0.8796505928039551, - "learning_rate": 9.71949032787467e-05, - "loss": 0.0904, + "epoch": 7.148119070984626, + "grad_norm": 0.39903518557548523, + "learning_rate": 1.2525664484895467e-05, + "loss": 0.0312, "step": 27320 }, { - "epoch": 1.7880274779195289, - "grad_norm": 0.8658892512321472, - "learning_rate": 9.719186894436484e-05, - "loss": 0.0914, + "epoch": 7.150736015701669, + "grad_norm": 0.33573272824287415, + "learning_rate": 1.250775718084646e-05, + "loss": 0.029, "step": 27330 }, { - "epoch": 1.7886817140987896, - "grad_norm": 0.9171237349510193, - "learning_rate": 9.718883301713363e-05, - "loss": 0.0923, + "epoch": 7.153352960418712, + "grad_norm": 0.2959030568599701, + "learning_rate": 1.2489858415127628e-05, + "loss": 0.0279, "step": 27340 }, { - "epoch": 1.7893359502780504, - "grad_norm": 0.7952425479888916, - "learning_rate": 9.718579549715555e-05, - "loss": 0.1069, + "epoch": 7.155969905135754, + "grad_norm": 0.26307299733161926, + "learning_rate": 1.2471968199972616e-05, + "loss": 0.0314, "step": 27350 }, { - "epoch": 1.7899901864573111, - "grad_norm": 0.8469095230102539, - "learning_rate": 9.718275638453312e-05, - "loss": 0.0942, + "epoch": 7.158586849852797, + "grad_norm": 0.26677271723747253, + "learning_rate": 1.2454086547609206e-05, + "loss": 0.0266, "step": 27360 }, { - "epoch": 1.7906444226365719, - "grad_norm": 0.8001088500022888, - "learning_rate": 9.717971567936892e-05, - "loss": 0.0931, + "epoch": 7.16120379456984, + "grad_norm": 0.3760434687137604, + "learning_rate": 1.2436213470259331e-05, + "loss": 0.0316, "step": 27370 }, { - "epoch": 1.7912986588158324, - "grad_norm": 0.9924861192703247, - "learning_rate": 9.71766733817656e-05, - "loss": 0.0924, + "epoch": 7.163820739286883, + "grad_norm": 0.2940519452095032, + "learning_rate": 1.2418348980139078e-05, + "loss": 0.0253, "step": 27380 }, { - "epoch": 1.7919528949950934, - "grad_norm": 0.79388028383255, - "learning_rate": 9.71736294918258e-05, - "loss": 0.0949, + "epoch": 7.166437684003926, + "grad_norm": 0.33235180377960205, + "learning_rate": 1.240049308945866e-05, + "loss": 0.0274, "step": 27390 }, { - "epoch": 1.7926071311743539, - "grad_norm": 0.8403041362762451, - "learning_rate": 9.71705840096523e-05, - "loss": 0.084, + "epoch": 7.169054628720969, + "grad_norm": 0.31940221786499023, + "learning_rate": 1.2382645810422418e-05, + "loss": 0.0245, "step": 27400 }, { - "epoch": 1.7932613673536146, - "grad_norm": 0.9250532984733582, - "learning_rate": 9.716753693534791e-05, - "loss": 0.092, + "epoch": 7.171671573438011, + "grad_norm": 0.20587033033370972, + "learning_rate": 1.236480715522878e-05, + "loss": 0.0311, "step": 27410 }, { - "epoch": 1.7939156035328754, - "grad_norm": 0.8198657035827637, - "learning_rate": 9.716448826901541e-05, - "loss": 0.096, + "epoch": 7.174288518155054, + "grad_norm": 0.2504628896713257, + "learning_rate": 1.2346977136070311e-05, + "loss": 0.0296, "step": 27420 }, { - "epoch": 1.7945698397121361, - "grad_norm": 0.7525361180305481, - "learning_rate": 9.716143801075775e-05, - "loss": 0.0925, + "epoch": 7.176905462872097, + "grad_norm": 0.23364555835723877, + "learning_rate": 1.2329155765133658e-05, + "loss": 0.0282, "step": 27430 }, { - "epoch": 1.7952240758913969, - "grad_norm": 0.7295049428939819, - "learning_rate": 9.715838616067786e-05, - "loss": 0.0912, + "epoch": 7.17952240758914, + "grad_norm": 0.23926670849323273, + "learning_rate": 1.2311343054599562e-05, + "loss": 0.0315, "step": 27440 }, { - "epoch": 1.7958783120706574, - "grad_norm": 0.9026070833206177, - "learning_rate": 9.715533271887876e-05, - "loss": 0.0941, + "epoch": 7.182139352306183, + "grad_norm": 0.4970473349094391, + "learning_rate": 1.2293539016642847e-05, + "loss": 0.0359, "step": 27450 }, { - "epoch": 1.7965325482499184, - "grad_norm": 0.9286572933197021, - "learning_rate": 9.715227768546354e-05, - "loss": 0.0895, + "epoch": 7.184756297023226, + "grad_norm": 0.3282844126224518, + "learning_rate": 1.22757436634324e-05, + "loss": 0.0317, "step": 27460 }, { - "epoch": 1.797186784429179, - "grad_norm": 0.6571982502937317, - "learning_rate": 9.714922106053526e-05, - "loss": 0.0912, + "epoch": 7.187373241740268, + "grad_norm": 0.27741098403930664, + "learning_rate": 1.2257957007131168e-05, + "loss": 0.0339, "step": 27470 }, { - "epoch": 1.7978410206084396, - "grad_norm": 0.7618511915206909, - "learning_rate": 9.71461628441971e-05, - "loss": 0.0883, + "epoch": 7.189990186457311, + "grad_norm": 0.3691469728946686, + "learning_rate": 1.2240179059896172e-05, + "loss": 0.0267, "step": 27480 }, { - "epoch": 1.7984952567877004, - "grad_norm": 0.8936156630516052, - "learning_rate": 9.714310303655234e-05, - "loss": 0.0926, + "epoch": 7.192607131174354, + "grad_norm": 0.3223341107368469, + "learning_rate": 1.2222409833878471e-05, + "loss": 0.0323, "step": 27490 }, { - "epoch": 1.799149492966961, - "grad_norm": 0.7239774465560913, - "learning_rate": 9.71400416377042e-05, - "loss": 0.0846, + "epoch": 7.195224075891397, + "grad_norm": 0.37310001254081726, + "learning_rate": 1.2204649341223173e-05, + "loss": 0.0247, "step": 27500 }, { - "epoch": 1.7998037291462219, - "grad_norm": 0.7614884376525879, - "learning_rate": 9.713697864775601e-05, - "loss": 0.1061, + "epoch": 7.19784102060844, + "grad_norm": 0.3943912386894226, + "learning_rate": 1.2186897594069385e-05, + "loss": 0.0292, "step": 27510 }, { - "epoch": 1.8004579653254824, - "grad_norm": 0.8885960578918457, - "learning_rate": 9.713391406681118e-05, - "loss": 0.0838, + "epoch": 7.200457965325483, + "grad_norm": 0.4825931191444397, + "learning_rate": 1.2169154604550274e-05, + "loss": 0.0309, "step": 27520 }, { - "epoch": 1.8011122015047432, - "grad_norm": 0.7284700274467468, - "learning_rate": 9.713084789497315e-05, - "loss": 0.0917, + "epoch": 7.203074910042526, + "grad_norm": 0.4375806450843811, + "learning_rate": 1.2151420384793002e-05, + "loss": 0.0304, "step": 27530 }, { - "epoch": 1.801766437684004, - "grad_norm": 0.7352108955383301, - "learning_rate": 9.712778013234538e-05, - "loss": 0.0833, + "epoch": 7.205691854759568, + "grad_norm": 0.44367581605911255, + "learning_rate": 1.213369494691875e-05, + "loss": 0.0283, "step": 27540 }, { - "epoch": 1.8024206738632647, - "grad_norm": 0.7528089880943298, - "learning_rate": 9.712471077903144e-05, - "loss": 0.0889, + "epoch": 7.208308799476611, + "grad_norm": 0.28335586190223694, + "learning_rate": 1.2115978303042671e-05, + "loss": 0.0303, "step": 27550 }, { - "epoch": 1.8030749100425254, - "grad_norm": 0.895668625831604, - "learning_rate": 9.712163983513491e-05, - "loss": 0.0957, + "epoch": 7.210925744193654, + "grad_norm": 0.3478769361972809, + "learning_rate": 1.2098270465273945e-05, + "loss": 0.0269, "step": 27560 }, { - "epoch": 1.803729146221786, - "grad_norm": 0.7745876908302307, - "learning_rate": 9.711856730075948e-05, - "loss": 0.1034, + "epoch": 7.213542688910697, + "grad_norm": 0.1998811960220337, + "learning_rate": 1.2080571445715687e-05, + "loss": 0.0342, "step": 27570 }, { - "epoch": 1.804383382401047, - "grad_norm": 0.9422993659973145, - "learning_rate": 9.711549317600881e-05, - "loss": 0.0915, + "epoch": 7.21615963362774, + "grad_norm": 0.32851898670196533, + "learning_rate": 1.2062881256465024e-05, + "loss": 0.0259, "step": 27580 }, { - "epoch": 1.8050376185803074, - "grad_norm": 0.8201385140419006, - "learning_rate": 9.711241746098669e-05, - "loss": 0.092, + "epoch": 7.218776578344783, + "grad_norm": 0.24480850994586945, + "learning_rate": 1.2045199909613034e-05, + "loss": 0.027, "step": 27590 }, { - "epoch": 1.8056918547595682, - "grad_norm": 0.8114073872566223, - "learning_rate": 9.710934015579693e-05, - "loss": 0.0939, + "epoch": 7.221393523061825, + "grad_norm": 0.34246158599853516, + "learning_rate": 1.2027527417244757e-05, + "loss": 0.0294, "step": 27600 }, { - "epoch": 1.806346090938829, - "grad_norm": 0.742363452911377, - "learning_rate": 9.710626126054338e-05, - "loss": 0.0994, + "epoch": 7.224010467778868, + "grad_norm": 0.2054915428161621, + "learning_rate": 1.200986379143916e-05, + "loss": 0.0281, "step": 27610 }, { - "epoch": 1.8070003271180897, - "grad_norm": 0.9187309145927429, - "learning_rate": 9.710318077532998e-05, - "loss": 0.0937, + "epoch": 7.226627412495911, + "grad_norm": 0.38791176676750183, + "learning_rate": 1.199220904426917e-05, + "loss": 0.0301, "step": 27620 }, { - "epoch": 1.8076545632973504, - "grad_norm": 0.9056969881057739, - "learning_rate": 9.71000987002607e-05, - "loss": 0.0975, + "epoch": 7.229244357212954, + "grad_norm": 0.3540157973766327, + "learning_rate": 1.1974563187801644e-05, + "loss": 0.0309, "step": 27630 }, { - "epoch": 1.808308799476611, - "grad_norm": 0.7385011911392212, - "learning_rate": 9.709701503543954e-05, - "loss": 0.1049, + "epoch": 7.231861301929997, + "grad_norm": 0.3500435948371887, + "learning_rate": 1.1956926234097362e-05, + "loss": 0.0325, "step": 27640 }, { - "epoch": 1.808963035655872, - "grad_norm": 0.8550733327865601, - "learning_rate": 9.709392978097061e-05, - "loss": 0.0927, + "epoch": 7.23447824664704, + "grad_norm": 0.3816050887107849, + "learning_rate": 1.1939298195211005e-05, + "loss": 0.0315, "step": 27650 }, { - "epoch": 1.8096172718351324, - "grad_norm": 0.8086677193641663, - "learning_rate": 9.709084293695806e-05, - "loss": 0.0882, + "epoch": 7.237095191364083, + "grad_norm": 0.3657297194004059, + "learning_rate": 1.1921679083191184e-05, + "loss": 0.0279, "step": 27660 }, { - "epoch": 1.8102715080143932, - "grad_norm": 0.8741077184677124, - "learning_rate": 9.708775450350605e-05, - "loss": 0.0907, + "epoch": 7.239712136081125, + "grad_norm": 0.3235689401626587, + "learning_rate": 1.1904068910080379e-05, + "loss": 0.032, "step": 27670 }, { - "epoch": 1.810925744193654, - "grad_norm": 0.7944101095199585, - "learning_rate": 9.708466448071884e-05, - "loss": 0.084, + "epoch": 7.242329080798168, + "grad_norm": 0.3269582986831665, + "learning_rate": 1.1886467687914988e-05, + "loss": 0.0281, "step": 27680 }, { - "epoch": 1.8115799803729145, - "grad_norm": 0.9104844331741333, - "learning_rate": 9.708157286870072e-05, - "loss": 0.1128, + "epoch": 7.244946025515211, + "grad_norm": 0.3353860080242157, + "learning_rate": 1.1868875428725276e-05, + "loss": 0.033, "step": 27690 }, { - "epoch": 1.8122342165521754, - "grad_norm": 0.8403039574623108, - "learning_rate": 9.707847966755604e-05, - "loss": 0.0835, + "epoch": 7.247562970232254, + "grad_norm": 0.3661133348941803, + "learning_rate": 1.1851292144535403e-05, + "loss": 0.0267, "step": 27700 }, { - "epoch": 1.812888452731436, - "grad_norm": 0.8909661769866943, - "learning_rate": 9.707538487738918e-05, - "loss": 0.0925, + "epoch": 7.250179914949297, + "grad_norm": 0.37683379650115967, + "learning_rate": 1.183371784736335e-05, + "loss": 0.0265, "step": 27710 }, { - "epoch": 1.813542688910697, - "grad_norm": 0.8909949660301208, - "learning_rate": 9.707228849830465e-05, - "loss": 0.0944, + "epoch": 7.25279685966634, + "grad_norm": 0.2772020995616913, + "learning_rate": 1.1816152549221002e-05, + "loss": 0.036, "step": 27720 }, { - "epoch": 1.8141969250899574, - "grad_norm": 1.029722809791565, - "learning_rate": 9.706919053040692e-05, - "loss": 0.1, + "epoch": 7.255413804383382, + "grad_norm": 0.42098507285118103, + "learning_rate": 1.1798596262114078e-05, + "loss": 0.0357, "step": 27730 }, { - "epoch": 1.8148511612692182, - "grad_norm": 0.7977635264396667, - "learning_rate": 9.706609097380058e-05, - "loss": 0.0894, + "epoch": 7.258030749100425, + "grad_norm": 0.3395950198173523, + "learning_rate": 1.178104899804212e-05, + "loss": 0.0275, "step": 27740 }, { - "epoch": 1.815505397448479, - "grad_norm": 0.9570349454879761, - "learning_rate": 9.706298982859021e-05, - "loss": 0.0977, + "epoch": 7.260647693817468, + "grad_norm": 0.37505412101745605, + "learning_rate": 1.176351076899852e-05, + "loss": 0.0303, "step": 27750 }, { - "epoch": 1.8161596336277395, - "grad_norm": 0.8739515542984009, - "learning_rate": 9.705988709488052e-05, - "loss": 0.1012, + "epoch": 7.263264638534511, + "grad_norm": 0.25512251257896423, + "learning_rate": 1.1745981586970509e-05, + "loss": 0.0305, "step": 27760 }, { - "epoch": 1.8168138698070004, - "grad_norm": 0.8953158259391785, - "learning_rate": 9.705678277277622e-05, - "loss": 0.1025, + "epoch": 7.265881583251554, + "grad_norm": 0.3966490924358368, + "learning_rate": 1.1728461463939098e-05, + "loss": 0.0271, "step": 27770 }, { - "epoch": 1.817468105986261, - "grad_norm": 0.8194689154624939, - "learning_rate": 9.70536768623821e-05, - "loss": 0.0959, + "epoch": 7.268498527968597, + "grad_norm": 0.36627981066703796, + "learning_rate": 1.1710950411879129e-05, + "loss": 0.0277, "step": 27780 }, { - "epoch": 1.8181223421655217, - "grad_norm": 1.037415623664856, - "learning_rate": 9.705056936380296e-05, - "loss": 0.0908, + "epoch": 7.27111547268564, + "grad_norm": 0.3093130588531494, + "learning_rate": 1.1693448442759248e-05, + "loss": 0.0308, "step": 27790 }, { - "epoch": 1.8187765783447825, - "grad_norm": 1.1118489503860474, - "learning_rate": 9.704746027714372e-05, - "loss": 0.0972, + "epoch": 7.273732417402682, + "grad_norm": 0.36231744289398193, + "learning_rate": 1.167595556854189e-05, + "loss": 0.033, "step": 27800 }, { - "epoch": 1.8194308145240432, - "grad_norm": 0.847061276435852, - "learning_rate": 9.704434960250931e-05, - "loss": 0.0857, + "epoch": 7.276349362119725, + "grad_norm": 0.2014455944299698, + "learning_rate": 1.1658471801183255e-05, + "loss": 0.0274, "step": 27810 }, { - "epoch": 1.820085050703304, - "grad_norm": 1.020371437072754, - "learning_rate": 9.704123734000473e-05, - "loss": 0.1029, + "epoch": 7.278966306836768, + "grad_norm": 0.3836444020271301, + "learning_rate": 1.1640997152633351e-05, + "loss": 0.0309, "step": 27820 }, { - "epoch": 1.8207392868825645, - "grad_norm": 1.1115399599075317, - "learning_rate": 9.703812348973501e-05, - "loss": 0.0988, + "epoch": 7.281583251553811, + "grad_norm": 0.2450692355632782, + "learning_rate": 1.1623531634835913e-05, + "loss": 0.0341, "step": 27830 }, { - "epoch": 1.8213935230618254, - "grad_norm": 0.8873423337936401, - "learning_rate": 9.703500805180527e-05, - "loss": 0.0971, + "epoch": 7.284200196270854, + "grad_norm": 0.23850251734256744, + "learning_rate": 1.1606075259728474e-05, + "loss": 0.0237, "step": 27840 }, { - "epoch": 1.822047759241086, - "grad_norm": 0.7394191026687622, - "learning_rate": 9.703189102632064e-05, - "loss": 0.0968, + "epoch": 7.286817140987897, + "grad_norm": 0.3725256025791168, + "learning_rate": 1.1588628039242294e-05, + "loss": 0.0262, "step": 27850 }, { - "epoch": 1.8227019954203467, - "grad_norm": 0.965347945690155, - "learning_rate": 9.702877241338635e-05, - "loss": 0.0818, + "epoch": 7.289434085704939, + "grad_norm": 0.31703150272369385, + "learning_rate": 1.1571189985302399e-05, + "loss": 0.025, "step": 27860 }, { - "epoch": 1.8233562315996075, - "grad_norm": 0.8741346597671509, - "learning_rate": 9.702565221310766e-05, - "loss": 0.1045, + "epoch": 7.292051030421982, + "grad_norm": 0.3323570191860199, + "learning_rate": 1.1553761109827513e-05, + "loss": 0.029, "step": 27870 }, { - "epoch": 1.8240104677788682, - "grad_norm": 0.7647256255149841, - "learning_rate": 9.702253042558986e-05, - "loss": 0.0899, + "epoch": 7.294667975139025, + "grad_norm": 0.30076855421066284, + "learning_rate": 1.1536341424730118e-05, + "loss": 0.0299, "step": 27880 }, { - "epoch": 1.824664703958129, - "grad_norm": 0.8359330296516418, - "learning_rate": 9.701940705093835e-05, - "loss": 0.095, + "epoch": 7.297284919856068, + "grad_norm": 0.3726050853729248, + "learning_rate": 1.1518930941916405e-05, + "loss": 0.0264, "step": 27890 }, { - "epoch": 1.8253189401373895, - "grad_norm": 0.9208669066429138, - "learning_rate": 9.701628208925855e-05, - "loss": 0.101, + "epoch": 7.299901864573111, + "grad_norm": 0.30087316036224365, + "learning_rate": 1.1501529673286286e-05, + "loss": 0.0363, "step": 27900 }, { - "epoch": 1.8259731763166505, - "grad_norm": 0.9185352325439453, - "learning_rate": 9.70131555406559e-05, - "loss": 0.0914, + "epoch": 7.302518809290154, + "grad_norm": 0.2716040313243866, + "learning_rate": 1.1484137630733338e-05, + "loss": 0.0288, "step": 27910 }, { - "epoch": 1.826627412495911, - "grad_norm": 0.8828587532043457, - "learning_rate": 9.701002740523597e-05, - "loss": 0.0881, + "epoch": 7.305135754007196, + "grad_norm": 0.28562942147254944, + "learning_rate": 1.1466754826144885e-05, + "loss": 0.0342, "step": 27920 }, { - "epoch": 1.8272816486751717, - "grad_norm": 0.8667250275611877, - "learning_rate": 9.700689768310434e-05, - "loss": 0.0945, + "epoch": 7.307752698724239, + "grad_norm": 0.26102012395858765, + "learning_rate": 1.1449381271401888e-05, + "loss": 0.0295, "step": 27930 }, { - "epoch": 1.8279358848544325, - "grad_norm": 0.9793092012405396, - "learning_rate": 9.700376637436662e-05, - "loss": 0.1026, + "epoch": 7.310369643441282, + "grad_norm": 0.2839173972606659, + "learning_rate": 1.1432016978379015e-05, + "loss": 0.0266, "step": 27940 }, { - "epoch": 1.828590121033693, - "grad_norm": 0.8025292158126831, - "learning_rate": 9.70006334791285e-05, - "loss": 0.0846, + "epoch": 7.312986588158325, + "grad_norm": 0.2480892837047577, + "learning_rate": 1.14146619589446e-05, + "loss": 0.0292, "step": 27950 }, { - "epoch": 1.829244357212954, - "grad_norm": 0.9942945241928101, - "learning_rate": 9.699749899749576e-05, - "loss": 0.09, + "epoch": 7.315603532875368, + "grad_norm": 0.26429545879364014, + "learning_rate": 1.1397316224960643e-05, + "loss": 0.0292, "step": 27960 }, { - "epoch": 1.8298985933922145, - "grad_norm": 0.8808379769325256, - "learning_rate": 9.699436292957414e-05, - "loss": 0.0887, + "epoch": 7.318220477592411, + "grad_norm": 0.38657790422439575, + "learning_rate": 1.1379979788282775e-05, + "loss": 0.0299, "step": 27970 }, { - "epoch": 1.8305528295714752, - "grad_norm": 0.8378937840461731, - "learning_rate": 9.699122527546955e-05, - "loss": 0.0885, + "epoch": 7.320837422309454, + "grad_norm": 0.34630465507507324, + "learning_rate": 1.136265266076029e-05, + "loss": 0.0272, "step": 27980 }, { - "epoch": 1.831207065750736, - "grad_norm": 0.7837172150611877, - "learning_rate": 9.698808603528786e-05, - "loss": 0.0812, + "epoch": 7.323454367026496, + "grad_norm": 0.39583563804626465, + "learning_rate": 1.1345334854236116e-05, + "loss": 0.0269, "step": 27990 }, { - "epoch": 1.8318613019299967, - "grad_norm": 0.7706719636917114, - "learning_rate": 9.698494520913503e-05, - "loss": 0.0887, + "epoch": 7.326071311743539, + "grad_norm": 0.30565205216407776, + "learning_rate": 1.1328026380546828e-05, + "loss": 0.0312, "step": 28000 }, { - "epoch": 1.8325155381092575, - "grad_norm": 1.1025103330612183, - "learning_rate": 9.69818027971171e-05, - "loss": 0.0896, + "epoch": 7.326071311743539, + "eval_loss": 0.03350659035862492, + "eval_runtime": 9.5497, + "eval_samples_per_second": 107.228, + "eval_steps_per_second": 1.675, + "step": 28000 + }, + { + "epoch": 7.328688256460582, + "grad_norm": 0.4849902093410492, + "learning_rate": 1.1310727251522585e-05, + "loss": 0.0315, "step": 28010 }, { - "epoch": 1.833169774288518, - "grad_norm": 0.9141650795936584, - "learning_rate": 9.697865879934009e-05, - "loss": 0.1002, + "epoch": 7.331305201177625, + "grad_norm": 0.3135029375553131, + "learning_rate": 1.1293437478987176e-05, + "loss": 0.0295, "step": 28020 }, { - "epoch": 1.833824010467779, - "grad_norm": 0.7713013887405396, - "learning_rate": 9.697551321591014e-05, - "loss": 0.1056, + "epoch": 7.333922145894668, + "grad_norm": 0.25514817237854004, + "learning_rate": 1.1276157074758006e-05, + "loss": 0.0277, "step": 28030 }, { - "epoch": 1.8344782466470395, - "grad_norm": 0.8746879696846008, - "learning_rate": 9.697236604693343e-05, - "loss": 0.1024, + "epoch": 7.336539090611711, + "grad_norm": 0.31504568457603455, + "learning_rate": 1.1258886050646067e-05, + "loss": 0.0272, "step": 28040 }, { - "epoch": 1.8351324828263003, - "grad_norm": 0.8916308283805847, - "learning_rate": 9.696921729251617e-05, - "loss": 0.0951, + "epoch": 7.339156035328753, + "grad_norm": 0.2858773469924927, + "learning_rate": 1.124162441845594e-05, + "loss": 0.0293, "step": 28050 }, { - "epoch": 1.835786719005561, - "grad_norm": 0.7855492830276489, - "learning_rate": 9.696606695276464e-05, - "loss": 0.0857, + "epoch": 7.341772980045796, + "grad_norm": 0.29029589891433716, + "learning_rate": 1.12243721899858e-05, + "loss": 0.0356, "step": 28060 }, { - "epoch": 1.8364409551848218, - "grad_norm": 0.8872267603874207, - "learning_rate": 9.696291502778519e-05, - "loss": 0.0942, + "epoch": 7.344389924762839, + "grad_norm": 0.3983522951602936, + "learning_rate": 1.120712937702736e-05, + "loss": 0.0287, "step": 28070 }, { - "epoch": 1.8370951913640825, - "grad_norm": 0.8778836727142334, - "learning_rate": 9.695976151768419e-05, - "loss": 0.0933, + "epoch": 7.347006869479882, + "grad_norm": 0.430837482213974, + "learning_rate": 1.1189895991365934e-05, + "loss": 0.0291, "step": 28080 }, { - "epoch": 1.837749427543343, - "grad_norm": 0.8608387112617493, - "learning_rate": 9.695660642256807e-05, - "loss": 0.0811, + "epoch": 7.349623814196925, + "grad_norm": 0.37221601605415344, + "learning_rate": 1.1172672044780378e-05, + "loss": 0.0285, "step": 28090 }, { - "epoch": 1.838403663722604, - "grad_norm": 0.9035623073577881, - "learning_rate": 9.695344974254336e-05, - "loss": 0.1077, + "epoch": 7.352240758913968, + "grad_norm": 0.29085683822631836, + "learning_rate": 1.1155457549043103e-05, + "loss": 0.0345, "step": 28100 }, { - "epoch": 1.8390578999018645, - "grad_norm": 1.024272084236145, - "learning_rate": 9.695029147771655e-05, - "loss": 0.0941, + "epoch": 7.35485770363101, + "grad_norm": 0.3261438012123108, + "learning_rate": 1.1138252515920045e-05, + "loss": 0.0299, "step": 28110 }, { - "epoch": 1.8397121360811253, - "grad_norm": 0.9248595833778381, - "learning_rate": 9.69471316281943e-05, - "loss": 0.1043, + "epoch": 7.357474648348053, + "grad_norm": 0.2531701922416687, + "learning_rate": 1.1121056957170679e-05, + "loss": 0.0287, "step": 28120 }, { - "epoch": 1.840366372260386, - "grad_norm": 0.7749372720718384, - "learning_rate": 9.694397019408322e-05, - "loss": 0.0868, + "epoch": 7.360091593065096, + "grad_norm": 0.39535531401634216, + "learning_rate": 1.110387088454801e-05, + "loss": 0.0308, "step": 28130 }, { - "epoch": 1.8410206084396468, - "grad_norm": 0.88482666015625, - "learning_rate": 9.694080717549004e-05, - "loss": 0.101, + "epoch": 7.362708537782139, + "grad_norm": 0.18832387030124664, + "learning_rate": 1.1086694309798557e-05, + "loss": 0.0312, "step": 28140 }, { - "epoch": 1.8416748446189075, - "grad_norm": 0.8212443590164185, - "learning_rate": 9.693764257252149e-05, - "loss": 0.0838, + "epoch": 7.365325482499182, + "grad_norm": 0.2911202013492584, + "learning_rate": 1.106952724466235e-05, + "loss": 0.0259, "step": 28150 }, { - "epoch": 1.842329080798168, - "grad_norm": 0.7402942180633545, - "learning_rate": 9.693447638528443e-05, - "loss": 0.0853, + "epoch": 7.367942427216225, + "grad_norm": 0.4606091380119324, + "learning_rate": 1.1052369700872924e-05, + "loss": 0.0321, "step": 28160 }, { - "epoch": 1.842983316977429, - "grad_norm": 0.8029096126556396, - "learning_rate": 9.693130861388569e-05, - "loss": 0.081, + "epoch": 7.370559371933268, + "grad_norm": 0.3079313039779663, + "learning_rate": 1.103522169015728e-05, + "loss": 0.0283, "step": 28170 }, { - "epoch": 1.8436375531566895, - "grad_norm": 0.9067233204841614, - "learning_rate": 9.69281392584322e-05, - "loss": 0.0928, + "epoch": 7.37317631665031, + "grad_norm": 0.3586598336696625, + "learning_rate": 1.101808322423593e-05, + "loss": 0.0303, "step": 28180 }, { - "epoch": 1.8442917893359503, - "grad_norm": 0.959036111831665, - "learning_rate": 9.692496831903092e-05, - "loss": 0.0982, + "epoch": 7.375793261367353, + "grad_norm": 0.497734934091568, + "learning_rate": 1.1000954314822856e-05, + "loss": 0.0278, "step": 28190 }, { - "epoch": 1.844946025515211, - "grad_norm": 1.1850526332855225, - "learning_rate": 9.692179579578893e-05, - "loss": 0.0981, + "epoch": 7.378410206084396, + "grad_norm": 0.2749795615673065, + "learning_rate": 1.0983834973625512e-05, + "loss": 0.0274, "step": 28200 }, { - "epoch": 1.8456002616944716, - "grad_norm": 0.9170256853103638, - "learning_rate": 9.691862168881325e-05, - "loss": 0.0931, + "epoch": 7.381027150801439, + "grad_norm": 0.4311588704586029, + "learning_rate": 1.0966725212344791e-05, + "loss": 0.0235, "step": 28210 }, { - "epoch": 1.8462544978737325, - "grad_norm": 0.894639790058136, - "learning_rate": 9.691544599821105e-05, - "loss": 0.0802, + "epoch": 7.383644095518482, + "grad_norm": 0.23907941579818726, + "learning_rate": 1.0949625042675071e-05, + "loss": 0.0263, "step": 28220 }, { - "epoch": 1.846908734052993, - "grad_norm": 0.8013020157814026, - "learning_rate": 9.69122687240895e-05, - "loss": 0.0937, + "epoch": 7.386261040235525, + "grad_norm": 0.32718607783317566, + "learning_rate": 1.0932534476304138e-05, + "loss": 0.0264, "step": 28230 }, { - "epoch": 1.8475629702322538, - "grad_norm": 0.93445885181427, - "learning_rate": 9.690908986655586e-05, - "loss": 0.0911, + "epoch": 7.388877984952568, + "grad_norm": 0.260172963142395, + "learning_rate": 1.0915453524913243e-05, + "loss": 0.0257, "step": 28240 }, { - "epoch": 1.8482172064115145, - "grad_norm": 0.7632222771644592, - "learning_rate": 9.69059094257174e-05, - "loss": 0.0953, + "epoch": 7.39149492966961, + "grad_norm": 0.4206393361091614, + "learning_rate": 1.0898382200177055e-05, + "loss": 0.0328, "step": 28250 }, { - "epoch": 1.8488714425907753, - "grad_norm": 0.9063422083854675, - "learning_rate": 9.690272740168149e-05, - "loss": 0.0885, + "epoch": 7.394111874386653, + "grad_norm": 0.43060967326164246, + "learning_rate": 1.0881320513763668e-05, + "loss": 0.0303, "step": 28260 }, { - "epoch": 1.849525678770036, - "grad_norm": 0.652265191078186, - "learning_rate": 9.689954379455552e-05, - "loss": 0.0847, + "epoch": 7.396728819103696, + "grad_norm": 0.575630784034729, + "learning_rate": 1.0864268477334571e-05, + "loss": 0.0377, "step": 28270 }, { - "epoch": 1.8501799149492966, - "grad_norm": 0.8725261688232422, - "learning_rate": 9.689635860444696e-05, - "loss": 0.092, + "epoch": 7.399345763820739, + "grad_norm": 0.3237912356853485, + "learning_rate": 1.084722610254468e-05, + "loss": 0.0278, "step": 28280 }, { - "epoch": 1.8508341511285575, - "grad_norm": 0.7449600696563721, - "learning_rate": 9.689317183146329e-05, - "loss": 0.0878, + "epoch": 7.401962708537782, + "grad_norm": 0.25009626150131226, + "learning_rate": 1.0830193401042305e-05, + "loss": 0.028, "step": 28290 }, { - "epoch": 1.851488387307818, - "grad_norm": 0.8442542552947998, - "learning_rate": 9.68899834757121e-05, - "loss": 0.0938, + "epoch": 7.404579653254825, + "grad_norm": 0.2242855280637741, + "learning_rate": 1.0813170384469115e-05, + "loss": 0.0248, "step": 28300 }, { - "epoch": 1.8521426234870788, - "grad_norm": 0.9524549841880798, - "learning_rate": 9.6886793537301e-05, - "loss": 0.1059, + "epoch": 7.4071965979718675, + "grad_norm": 0.3656257390975952, + "learning_rate": 1.0796157064460194e-05, + "loss": 0.0301, "step": 28310 }, { - "epoch": 1.8527968596663396, - "grad_norm": 0.9726237654685974, - "learning_rate": 9.688360201633763e-05, - "loss": 0.0942, + "epoch": 7.4098135426889105, + "grad_norm": 0.4277788996696472, + "learning_rate": 1.0779153452643995e-05, + "loss": 0.0304, "step": 28320 }, { - "epoch": 1.8534510958456003, - "grad_norm": 0.991133987903595, - "learning_rate": 9.688040891292976e-05, - "loss": 0.1008, + "epoch": 7.4124304874059534, + "grad_norm": 0.31005632877349854, + "learning_rate": 1.0762159560642307e-05, + "loss": 0.0265, "step": 28330 }, { - "epoch": 1.854105332024861, - "grad_norm": 0.9573308229446411, - "learning_rate": 9.687721422718512e-05, - "loss": 0.0898, + "epoch": 7.415047432122996, + "grad_norm": 0.3337995409965515, + "learning_rate": 1.0745175400070303e-05, + "loss": 0.0306, "step": 28340 }, { - "epoch": 1.8547595682041216, - "grad_norm": 1.1355527639389038, - "learning_rate": 9.687401795921156e-05, - "loss": 0.095, + "epoch": 7.417664376840039, + "grad_norm": 0.47239336371421814, + "learning_rate": 1.0728200982536501e-05, + "loss": 0.0272, "step": 28350 }, { - "epoch": 1.8554138043833825, - "grad_norm": 0.7798793315887451, - "learning_rate": 9.687082010911698e-05, - "loss": 0.0883, + "epoch": 7.420281321557082, + "grad_norm": 0.3395785689353943, + "learning_rate": 1.0711236319642762e-05, + "loss": 0.0278, "step": 28360 }, { - "epoch": 1.856068040562643, - "grad_norm": 0.5980040431022644, - "learning_rate": 9.686762067700928e-05, - "loss": 0.0922, + "epoch": 7.4228982662741245, + "grad_norm": 0.4349222779273987, + "learning_rate": 1.069428142298425e-05, + "loss": 0.029, "step": 28370 }, { - "epoch": 1.8567222767419038, - "grad_norm": 0.9269242286682129, - "learning_rate": 9.686441966299649e-05, - "loss": 0.0893, + "epoch": 7.4255152109911675, + "grad_norm": 0.3717556297779083, + "learning_rate": 1.0677336304149505e-05, + "loss": 0.025, "step": 28380 }, { - "epoch": 1.8573765129211646, - "grad_norm": 0.6925560235977173, - "learning_rate": 9.68612170671866e-05, - "loss": 0.0957, + "epoch": 7.4281321557082105, + "grad_norm": 0.17544744908809662, + "learning_rate": 1.0660400974720327e-05, + "loss": 0.0312, "step": 28390 }, { - "epoch": 1.858030749100425, - "grad_norm": 0.8151103258132935, - "learning_rate": 9.685801288968777e-05, - "loss": 0.0925, + "epoch": 7.4307491004252535, + "grad_norm": 0.21056394279003143, + "learning_rate": 1.0643475446271872e-05, + "loss": 0.0272, "step": 28400 }, { - "epoch": 1.858684985279686, - "grad_norm": 0.9811034798622131, - "learning_rate": 9.685480713060808e-05, - "loss": 0.1071, + "epoch": 7.4333660451422965, + "grad_norm": 0.38550424575805664, + "learning_rate": 1.0626559730372575e-05, + "loss": 0.0261, "step": 28410 }, { - "epoch": 1.8593392214589466, - "grad_norm": 0.9472489953041077, - "learning_rate": 9.68515997900558e-05, - "loss": 0.089, + "epoch": 7.4359829898593395, + "grad_norm": 0.33460870385169983, + "learning_rate": 1.0609653838584177e-05, + "loss": 0.0241, "step": 28420 }, { - "epoch": 1.8599934576382076, - "grad_norm": 0.8960433602333069, - "learning_rate": 9.684839086813913e-05, - "loss": 0.1004, + "epoch": 7.4385999345763825, + "grad_norm": 0.2785506546497345, + "learning_rate": 1.059275778246168e-05, + "loss": 0.0254, "step": 28430 }, { - "epoch": 1.860647693817468, - "grad_norm": 0.9435994625091553, - "learning_rate": 9.684518036496641e-05, - "loss": 0.0928, + "epoch": 7.441216879293425, + "grad_norm": 0.3150511682033539, + "learning_rate": 1.0575871573553387e-05, + "loss": 0.0236, "step": 28440 }, { - "epoch": 1.8613019299967288, - "grad_norm": 0.8257802128791809, - "learning_rate": 9.6841968280646e-05, - "loss": 0.0994, + "epoch": 7.443833824010468, + "grad_norm": 0.21259582042694092, + "learning_rate": 1.055899522340086e-05, + "loss": 0.0271, "step": 28450 }, { - "epoch": 1.8619561661759896, - "grad_norm": 0.8301753997802734, - "learning_rate": 9.683875461528632e-05, - "loss": 0.086, + "epoch": 7.4464507687275105, + "grad_norm": 0.2286917269229889, + "learning_rate": 1.0542128743538937e-05, + "loss": 0.0268, "step": 28460 }, { - "epoch": 1.86261040235525, - "grad_norm": 0.8830812573432922, - "learning_rate": 9.683553936899583e-05, - "loss": 0.0972, + "epoch": 7.4490677134445535, + "grad_norm": 0.4267086684703827, + "learning_rate": 1.0525272145495679e-05, + "loss": 0.0294, "step": 28470 }, { - "epoch": 1.863264638534511, - "grad_norm": 0.7733142971992493, - "learning_rate": 9.683232254188305e-05, - "loss": 0.1017, + "epoch": 7.4516846581615965, + "grad_norm": 0.3371066749095917, + "learning_rate": 1.050842544079243e-05, + "loss": 0.0278, "step": 28480 }, { - "epoch": 1.8639188747137716, - "grad_norm": 0.7784489393234253, - "learning_rate": 9.682910413405657e-05, - "loss": 0.0959, + "epoch": 7.4543016028786395, + "grad_norm": 0.1786905974149704, + "learning_rate": 1.0491588640943736e-05, + "loss": 0.0283, "step": 28490 }, { - "epoch": 1.8645731108930323, - "grad_norm": 0.9781941771507263, - "learning_rate": 9.6825884145625e-05, - "loss": 0.0977, + "epoch": 7.4569185475956825, + "grad_norm": 0.35492804646492004, + "learning_rate": 1.0474761757457399e-05, + "loss": 0.028, "step": 28500 }, { - "epoch": 1.865227347072293, - "grad_norm": 0.9205982685089111, - "learning_rate": 9.682266257669703e-05, - "loss": 0.1062, + "epoch": 7.459535492312725, + "grad_norm": 0.3589496910572052, + "learning_rate": 1.0457944801834438e-05, + "loss": 0.0322, "step": 28510 }, { - "epoch": 1.8658815832515538, - "grad_norm": 1.1955499649047852, - "learning_rate": 9.681943942738141e-05, - "loss": 0.1022, + "epoch": 7.462152437029768, + "grad_norm": 0.22844111919403076, + "learning_rate": 1.0441137785569088e-05, + "loss": 0.0292, "step": 28520 }, { - "epoch": 1.8665358194308146, - "grad_norm": 0.8939968943595886, - "learning_rate": 9.681621469778692e-05, - "loss": 0.0958, + "epoch": 7.464769381746811, + "grad_norm": 0.26975587010383606, + "learning_rate": 1.0424340720148773e-05, + "loss": 0.0227, "step": 28530 }, { - "epoch": 1.8671900556100751, - "grad_norm": 0.7560677528381348, - "learning_rate": 9.681298838802242e-05, - "loss": 0.0943, + "epoch": 7.467386326463854, + "grad_norm": 0.8781476020812988, + "learning_rate": 1.0407553617054135e-05, + "loss": 0.0237, "step": 28540 }, { - "epoch": 1.867844291789336, - "grad_norm": 0.8495007753372192, - "learning_rate": 9.680976049819677e-05, - "loss": 0.0946, + "epoch": 7.470003271180897, + "grad_norm": 0.2750764787197113, + "learning_rate": 1.0390776487759001e-05, + "loss": 0.0287, "step": 28550 }, { - "epoch": 1.8684985279685966, - "grad_norm": 0.8184399008750916, - "learning_rate": 9.680653102841895e-05, - "loss": 0.0948, + "epoch": 7.47262021589794, + "grad_norm": 0.44443485140800476, + "learning_rate": 1.037400934373039e-05, + "loss": 0.025, "step": 28560 }, { - "epoch": 1.8691527641478574, - "grad_norm": 1.0168190002441406, - "learning_rate": 9.680329997879795e-05, - "loss": 0.0905, + "epoch": 7.475237160614982, + "grad_norm": 0.3391941785812378, + "learning_rate": 1.0357252196428477e-05, + "loss": 0.0294, "step": 28570 }, { - "epoch": 1.869807000327118, - "grad_norm": 0.8939940929412842, - "learning_rate": 9.680006734944283e-05, - "loss": 0.0849, + "epoch": 7.477854105332025, + "grad_norm": 0.3009742200374603, + "learning_rate": 1.0340505057306608e-05, + "loss": 0.0301, "step": 28580 }, { - "epoch": 1.8704612365063789, - "grad_norm": 0.9259584546089172, - "learning_rate": 9.679683314046269e-05, - "loss": 0.0986, + "epoch": 7.480471050049068, + "grad_norm": 0.36080223321914673, + "learning_rate": 1.0323767937811299e-05, + "loss": 0.0294, "step": 28590 }, { - "epoch": 1.8711154726856396, - "grad_norm": 0.8003655076026917, - "learning_rate": 9.679359735196671e-05, - "loss": 0.087, + "epoch": 7.483087994766111, + "grad_norm": 0.4406573474407196, + "learning_rate": 1.0307040849382216e-05, + "loss": 0.0301, "step": 28600 }, { - "epoch": 1.8717697088649001, - "grad_norm": 0.8092601895332336, - "learning_rate": 9.67903599840641e-05, - "loss": 0.0937, + "epoch": 7.485704939483154, + "grad_norm": 0.2862299978733063, + "learning_rate": 1.0290323803452167e-05, + "loss": 0.0248, "step": 28610 }, { - "epoch": 1.872423945044161, - "grad_norm": 0.757056713104248, - "learning_rate": 9.678712103686413e-05, - "loss": 0.0857, + "epoch": 7.488321884200197, + "grad_norm": 0.4238396883010864, + "learning_rate": 1.0273616811447104e-05, + "loss": 0.0256, "step": 28620 }, { - "epoch": 1.8730781812234216, - "grad_norm": 0.903866171836853, - "learning_rate": 9.678388051047611e-05, - "loss": 0.0948, + "epoch": 7.490938828917239, + "grad_norm": 0.3760383129119873, + "learning_rate": 1.0256919884786078e-05, + "loss": 0.0296, "step": 28630 }, { - "epoch": 1.8737324174026824, - "grad_norm": 0.8220004439353943, - "learning_rate": 9.678063840500944e-05, - "loss": 0.0914, + "epoch": 7.493555773634282, + "grad_norm": 0.26118260622024536, + "learning_rate": 1.0240233034881292e-05, + "loss": 0.027, "step": 28640 }, { - "epoch": 1.8743866535819431, - "grad_norm": 0.8988321423530579, - "learning_rate": 9.677739472057354e-05, - "loss": 0.0916, + "epoch": 7.496172718351325, + "grad_norm": 0.39909711480140686, + "learning_rate": 1.0223556273138052e-05, + "loss": 0.0288, "step": 28650 }, { - "epoch": 1.8750408897612036, - "grad_norm": 0.8908348679542542, - "learning_rate": 9.677414945727787e-05, - "loss": 0.0882, + "epoch": 7.498789663068368, + "grad_norm": 0.27352455258369446, + "learning_rate": 1.0206889610954774e-05, + "loss": 0.028, "step": 28660 }, { - "epoch": 1.8756951259404646, - "grad_norm": 0.7631081938743591, - "learning_rate": 9.677090261523201e-05, - "loss": 0.0929, + "epoch": 7.501406607785411, + "grad_norm": 0.2847869098186493, + "learning_rate": 1.0190233059722956e-05, + "loss": 0.0275, "step": 28670 }, { - "epoch": 1.8763493621197251, - "grad_norm": 0.7455977201461792, - "learning_rate": 9.676765419454552e-05, - "loss": 0.0902, + "epoch": 7.504023552502454, + "grad_norm": 0.37741467356681824, + "learning_rate": 1.0173586630827186e-05, + "loss": 0.0316, "step": 28680 }, { - "epoch": 1.8770035982989859, - "grad_norm": 0.9186832904815674, - "learning_rate": 9.676440419532804e-05, - "loss": 0.0948, + "epoch": 7.506640497219497, + "grad_norm": 0.33624228835105896, + "learning_rate": 1.015695033564515e-05, + "loss": 0.0277, "step": 28690 }, { - "epoch": 1.8776578344782466, - "grad_norm": 0.8906334042549133, - "learning_rate": 9.676115261768928e-05, - "loss": 0.0814, + "epoch": 7.509257441936539, + "grad_norm": 0.31950801610946655, + "learning_rate": 1.0140324185547594e-05, + "loss": 0.0275, "step": 28700 }, { - "epoch": 1.8783120706575074, - "grad_norm": 0.9550861716270447, - "learning_rate": 9.675789946173897e-05, - "loss": 0.0939, + "epoch": 7.511874386653582, + "grad_norm": 0.24474136531352997, + "learning_rate": 1.0123708191898343e-05, + "loss": 0.0288, "step": 28710 }, { - "epoch": 1.8789663068367681, - "grad_norm": 0.7002192735671997, - "learning_rate": 9.675464472758695e-05, - "loss": 0.0872, + "epoch": 7.514491331370625, + "grad_norm": 0.38690003752708435, + "learning_rate": 1.0107102366054274e-05, + "loss": 0.0259, "step": 28720 }, { - "epoch": 1.8796205430160287, - "grad_norm": 0.9470962882041931, - "learning_rate": 9.675138841534303e-05, - "loss": 0.0865, + "epoch": 7.517108276087668, + "grad_norm": 0.1933925300836563, + "learning_rate": 1.0090506719365298e-05, + "loss": 0.0284, "step": 28730 }, { - "epoch": 1.8802747791952896, - "grad_norm": 0.8719517588615417, - "learning_rate": 9.674813052511715e-05, - "loss": 0.0844, + "epoch": 7.519725220804711, + "grad_norm": 0.27855780720710754, + "learning_rate": 1.0073921263174391e-05, + "loss": 0.0263, "step": 28740 }, { - "epoch": 1.8809290153745502, - "grad_norm": 0.8722051978111267, - "learning_rate": 9.674487105701926e-05, - "loss": 0.0913, + "epoch": 7.522342165521754, + "grad_norm": 0.23865076899528503, + "learning_rate": 1.0057346008817556e-05, + "loss": 0.0266, "step": 28750 }, { - "epoch": 1.881583251553811, - "grad_norm": 0.8734009265899658, - "learning_rate": 9.674161001115938e-05, - "loss": 0.0878, + "epoch": 7.524959110238796, + "grad_norm": 0.34086933732032776, + "learning_rate": 1.0040780967623833e-05, + "loss": 0.0349, "step": 28760 }, { - "epoch": 1.8822374877330716, - "grad_norm": 0.8401185274124146, - "learning_rate": 9.673834738764759e-05, - "loss": 0.0889, + "epoch": 7.527576054955839, + "grad_norm": 0.29268211126327515, + "learning_rate": 1.0024226150915261e-05, + "loss": 0.0283, "step": 28770 }, { - "epoch": 1.8828917239123324, - "grad_norm": 0.8197746872901917, - "learning_rate": 9.6735083186594e-05, - "loss": 0.0991, + "epoch": 7.530192999672882, + "grad_norm": 0.24367311596870422, + "learning_rate": 1.0007681570006894e-05, + "loss": 0.0277, "step": 28780 }, { - "epoch": 1.8835459600915931, - "grad_norm": 0.8320461511611938, - "learning_rate": 9.673181740810876e-05, - "loss": 0.0823, + "epoch": 7.532809944389925, + "grad_norm": 0.36311009526252747, + "learning_rate": 9.991147236206803e-06, + "loss": 0.03, "step": 28790 }, { - "epoch": 1.8842001962708537, - "grad_norm": 0.9948931336402893, - "learning_rate": 9.672855005230214e-05, - "loss": 0.0862, + "epoch": 7.535426889106968, + "grad_norm": 0.3049049973487854, + "learning_rate": 9.974623160816051e-06, + "loss": 0.0272, "step": 28800 }, { - "epoch": 1.8848544324501146, - "grad_norm": 0.8380158543586731, - "learning_rate": 9.67252811192844e-05, - "loss": 0.0835, + "epoch": 7.538043833824011, + "grad_norm": 0.4554224908351898, + "learning_rate": 9.958109355128689e-06, + "loss": 0.0286, "step": 28810 }, { - "epoch": 1.8855086686293752, - "grad_norm": 0.8905105590820312, - "learning_rate": 9.672201060916589e-05, - "loss": 0.1009, + "epoch": 7.540660778541053, + "grad_norm": 0.24151623249053955, + "learning_rate": 9.941605830431756e-06, + "loss": 0.0295, "step": 28820 }, { - "epoch": 1.886162904808636, - "grad_norm": 0.8555837869644165, - "learning_rate": 9.671873852205699e-05, - "loss": 0.0983, + "epoch": 7.543277723258096, + "grad_norm": 0.3120863139629364, + "learning_rate": 9.925112598005234e-06, + "loss": 0.031, "step": 28830 }, { - "epoch": 1.8868171409878967, - "grad_norm": 0.7890472412109375, - "learning_rate": 9.671546485806813e-05, - "loss": 0.102, + "epoch": 7.545894667975139, + "grad_norm": 0.29075419902801514, + "learning_rate": 9.908629669122104e-06, + "loss": 0.0255, "step": 28840 }, { - "epoch": 1.8874713771671572, - "grad_norm": 0.8302340507507324, - "learning_rate": 9.671218961730981e-05, - "loss": 0.1047, + "epoch": 7.548511612692182, + "grad_norm": 0.20289865136146545, + "learning_rate": 9.892157055048304e-06, + "loss": 0.0311, "step": 28850 }, { - "epoch": 1.8881256133464182, - "grad_norm": 0.912526547908783, - "learning_rate": 9.670891279989261e-05, - "loss": 0.0886, + "epoch": 7.551128557409225, + "grad_norm": 0.40245625376701355, + "learning_rate": 9.875694767042687e-06, + "loss": 0.0306, "step": 28860 }, { - "epoch": 1.8887798495256787, - "grad_norm": 0.9973543882369995, - "learning_rate": 9.670563440592709e-05, - "loss": 0.0974, + "epoch": 7.553745502126268, + "grad_norm": 0.25056540966033936, + "learning_rate": 9.859242816357092e-06, + "loss": 0.0276, "step": 28870 }, { - "epoch": 1.8894340857049396, - "grad_norm": 0.8430191874504089, - "learning_rate": 9.670235443552391e-05, - "loss": 0.084, + "epoch": 7.556362446843311, + "grad_norm": 0.2368532121181488, + "learning_rate": 9.842801214236255e-06, + "loss": 0.0303, "step": 28880 }, { - "epoch": 1.8900883218842002, - "grad_norm": 0.7398623824119568, - "learning_rate": 9.669907288879379e-05, - "loss": 0.0874, + "epoch": 7.558979391560353, + "grad_norm": 0.32772502303123474, + "learning_rate": 9.826369971917865e-06, + "loss": 0.0279, "step": 28890 }, { - "epoch": 1.890742558063461, - "grad_norm": 0.8663063049316406, - "learning_rate": 9.669578976584748e-05, - "loss": 0.0852, + "epoch": 7.561596336277396, + "grad_norm": 0.3745412528514862, + "learning_rate": 9.80994910063252e-06, + "loss": 0.0268, "step": 28900 }, { - "epoch": 1.8913967942427217, - "grad_norm": 0.8032066226005554, - "learning_rate": 9.669250506679582e-05, - "loss": 0.1002, + "epoch": 7.564213280994439, + "grad_norm": 0.2986398935317993, + "learning_rate": 9.79353861160373e-06, + "loss": 0.0293, "step": 28910 }, { - "epoch": 1.8920510304219822, - "grad_norm": 0.7833549976348877, - "learning_rate": 9.668921879174965e-05, - "loss": 0.0991, + "epoch": 7.566830225711482, + "grad_norm": 0.364200621843338, + "learning_rate": 9.77713851604792e-06, + "loss": 0.0301, "step": 28920 }, { - "epoch": 1.8927052666012432, - "grad_norm": 0.8108584880828857, - "learning_rate": 9.66859309408199e-05, - "loss": 0.0876, + "epoch": 7.569447170428525, + "grad_norm": 0.417913556098938, + "learning_rate": 9.760748825174382e-06, + "loss": 0.0265, "step": 28930 }, { - "epoch": 1.8933595027805037, - "grad_norm": 0.6968421936035156, - "learning_rate": 9.668264151411755e-05, - "loss": 0.098, + "epoch": 7.572064115145568, + "grad_norm": 0.36317500472068787, + "learning_rate": 9.744369550185334e-06, + "loss": 0.0274, "step": 28940 }, { - "epoch": 1.8940137389597644, - "grad_norm": 1.000785231590271, - "learning_rate": 9.66793505117536e-05, - "loss": 0.0879, + "epoch": 7.574681059862611, + "grad_norm": 0.4232650101184845, + "learning_rate": 9.728000702275839e-06, + "loss": 0.027, "step": 28950 }, { - "epoch": 1.8946679751390252, - "grad_norm": 0.9198935627937317, - "learning_rate": 9.667605793383916e-05, - "loss": 0.1009, + "epoch": 7.577298004579653, + "grad_norm": 0.4161834716796875, + "learning_rate": 9.711642292633854e-06, + "loss": 0.0313, "step": 28960 }, { - "epoch": 1.895322211318286, - "grad_norm": 0.9679718613624573, - "learning_rate": 9.667276378048535e-05, - "loss": 0.0987, + "epoch": 7.579914949296696, + "grad_norm": 0.40490856766700745, + "learning_rate": 9.695294332440214e-06, + "loss": 0.0326, "step": 28970 }, { - "epoch": 1.8959764474975467, - "grad_norm": 0.9554332494735718, - "learning_rate": 9.666946805180336e-05, - "loss": 0.101, + "epoch": 7.582531894013739, + "grad_norm": 0.26717275381088257, + "learning_rate": 9.678956832868572e-06, + "loss": 0.0258, "step": 28980 }, { - "epoch": 1.8966306836768072, - "grad_norm": 0.9619019031524658, - "learning_rate": 9.666617074790442e-05, - "loss": 0.103, + "epoch": 7.585148838730782, + "grad_norm": 0.3033079206943512, + "learning_rate": 9.662629805085466e-06, + "loss": 0.0271, "step": 28990 }, { - "epoch": 1.8972849198560682, - "grad_norm": 0.8431410789489746, - "learning_rate": 9.666287186889983e-05, - "loss": 0.097, + "epoch": 7.587765783447825, + "grad_norm": 0.4634299874305725, + "learning_rate": 9.646313260250267e-06, + "loss": 0.0287, + "step": 29000 + }, + { + "epoch": 7.587765783447825, + "eval_loss": 0.03173584048044022, + "eval_runtime": 9.2653, + "eval_samples_per_second": 110.519, + "eval_steps_per_second": 1.727, "step": 29000 }, { - "epoch": 1.8979391560353287, - "grad_norm": 0.9269760847091675, - "learning_rate": 9.665957141490096e-05, - "loss": 0.0859, + "epoch": 7.590382728164867, + "grad_norm": 0.30492672324180603, + "learning_rate": 9.630007209515177e-06, + "loss": 0.03, "step": 29010 }, { - "epoch": 1.8985933922145894, - "grad_norm": 0.9166405200958252, - "learning_rate": 9.665626938601917e-05, - "loss": 0.0887, + "epoch": 7.59299967288191, + "grad_norm": 0.5153994560241699, + "learning_rate": 9.613711664025237e-06, + "loss": 0.0258, "step": 29020 }, { - "epoch": 1.8992476283938502, - "grad_norm": 0.973930835723877, - "learning_rate": 9.665296578236593e-05, - "loss": 0.0897, + "epoch": 7.595616617598953, + "grad_norm": 0.4768589437007904, + "learning_rate": 9.597426634918291e-06, + "loss": 0.0279, "step": 29030 }, { - "epoch": 1.899901864573111, - "grad_norm": 0.9470407366752625, - "learning_rate": 9.664966060405275e-05, - "loss": 0.0925, + "epoch": 7.598233562315996, + "grad_norm": 0.3797157406806946, + "learning_rate": 9.581152133324994e-06, + "loss": 0.0275, "step": 29040 }, { - "epoch": 1.9005561007523717, - "grad_norm": 0.762370228767395, - "learning_rate": 9.664635385119117e-05, - "loss": 0.0958, + "epoch": 7.600850507033039, + "grad_norm": 0.2744821608066559, + "learning_rate": 9.564888170368825e-06, + "loss": 0.0275, "step": 29050 }, { - "epoch": 1.9012103369316322, - "grad_norm": 0.8602021932601929, - "learning_rate": 9.66430455238928e-05, - "loss": 0.091, + "epoch": 7.603467451750082, + "grad_norm": 0.3670021891593933, + "learning_rate": 9.548634757166041e-06, + "loss": 0.034, "step": 29060 }, { - "epoch": 1.9018645731108932, - "grad_norm": 0.8808605074882507, - "learning_rate": 9.663973562226934e-05, - "loss": 0.0837, + "epoch": 7.606084396467125, + "grad_norm": 0.2540300190448761, + "learning_rate": 9.532391904825716e-06, + "loss": 0.03, "step": 29070 }, { - "epoch": 1.9025188092901537, - "grad_norm": 0.7062970399856567, - "learning_rate": 9.663642414643248e-05, - "loss": 0.0916, + "epoch": 7.608701341184167, + "grad_norm": 0.2962121367454529, + "learning_rate": 9.516159624449667e-06, + "loss": 0.025, "step": 29080 }, { - "epoch": 1.9031730454694145, - "grad_norm": 0.7167387008666992, - "learning_rate": 9.6633111096494e-05, - "loss": 0.0866, + "epoch": 7.61131828590121, + "grad_norm": 0.2839810848236084, + "learning_rate": 9.499937927132508e-06, + "loss": 0.027, "step": 29090 }, { - "epoch": 1.9038272816486752, - "grad_norm": 0.91849684715271, - "learning_rate": 9.662979647256572e-05, - "loss": 0.0882, + "epoch": 7.613935230618253, + "grad_norm": 0.3168564736843109, + "learning_rate": 9.483726823961616e-06, + "loss": 0.0262, "step": 29100 }, { - "epoch": 1.9044815178279357, - "grad_norm": 0.7310143709182739, - "learning_rate": 9.662648027475952e-05, - "loss": 0.0947, + "epoch": 7.616552175335296, + "grad_norm": 0.3472195863723755, + "learning_rate": 9.467526326017135e-06, + "loss": 0.0275, "step": 29110 }, { - "epoch": 1.9051357540071967, - "grad_norm": 0.7814351916313171, - "learning_rate": 9.66231625031873e-05, - "loss": 0.102, + "epoch": 7.619169120052339, + "grad_norm": 0.289442777633667, + "learning_rate": 9.451336444371953e-06, + "loss": 0.0296, "step": 29120 }, { - "epoch": 1.9057899901864572, - "grad_norm": 1.0291553735733032, - "learning_rate": 9.661984315796111e-05, - "loss": 0.0924, + "epoch": 7.621786064769382, + "grad_norm": 0.35899093747138977, + "learning_rate": 9.435157190091698e-06, + "loss": 0.0294, "step": 29130 }, { - "epoch": 1.906444226365718, - "grad_norm": 0.6880286335945129, - "learning_rate": 9.661652223919293e-05, - "loss": 0.0787, + "epoch": 7.624403009486425, + "grad_norm": 0.4903254210948944, + "learning_rate": 9.418988574234724e-06, + "loss": 0.0282, "step": 29140 }, { - "epoch": 1.9070984625449787, - "grad_norm": 0.7895125150680542, - "learning_rate": 9.661319974699487e-05, - "loss": 0.0845, + "epoch": 7.627019954203467, + "grad_norm": 0.32704609632492065, + "learning_rate": 9.402830607852145e-06, + "loss": 0.031, "step": 29150 }, { - "epoch": 1.9077526987242395, - "grad_norm": 0.846792995929718, - "learning_rate": 9.660987568147907e-05, - "loss": 0.0884, + "epoch": 7.62963689892051, + "grad_norm": 0.3716278374195099, + "learning_rate": 9.38668330198777e-06, + "loss": 0.029, "step": 29160 }, { - "epoch": 1.9084069349035002, - "grad_norm": 1.0212069749832153, - "learning_rate": 9.660655004275772e-05, - "loss": 0.0912, + "epoch": 7.632253843637553, + "grad_norm": 0.3026614487171173, + "learning_rate": 9.37054666767814e-06, + "loss": 0.0289, "step": 29170 }, { - "epoch": 1.9090611710827607, - "grad_norm": 0.8728976845741272, - "learning_rate": 9.660322283094309e-05, - "loss": 0.0873, + "epoch": 7.634870788354596, + "grad_norm": 0.23007415235042572, + "learning_rate": 9.35442071595248e-06, + "loss": 0.0288, "step": 29180 }, { - "epoch": 1.9097154072620217, - "grad_norm": 0.9119880199432373, - "learning_rate": 9.659989404614746e-05, - "loss": 0.0921, + "epoch": 7.637487733071639, + "grad_norm": 0.36757123470306396, + "learning_rate": 9.33830545783273e-06, + "loss": 0.0278, "step": 29190 }, { - "epoch": 1.9103696434412822, - "grad_norm": 0.7712875008583069, - "learning_rate": 9.65965636884832e-05, - "loss": 0.1135, + "epoch": 7.640104677788682, + "grad_norm": 0.3388332426548004, + "learning_rate": 9.32220090433352e-06, + "loss": 0.0247, "step": 29200 }, { - "epoch": 1.911023879620543, - "grad_norm": 0.8132479786872864, - "learning_rate": 9.659323175806271e-05, - "loss": 0.0848, + "epoch": 7.642721622505725, + "grad_norm": 0.25686201453208923, + "learning_rate": 9.306107066462159e-06, + "loss": 0.0252, "step": 29210 }, { - "epoch": 1.9116781157998037, - "grad_norm": 1.06707763671875, - "learning_rate": 9.658989825499845e-05, - "loss": 0.0922, + "epoch": 7.645338567222767, + "grad_norm": 0.18186792731285095, + "learning_rate": 9.29002395521864e-06, + "loss": 0.0273, "step": 29220 }, { - "epoch": 1.9123323519790645, - "grad_norm": 0.9323616623878479, - "learning_rate": 9.658656317940293e-05, - "loss": 0.1036, + "epoch": 7.64795551193981, + "grad_norm": 0.26176708936691284, + "learning_rate": 9.273951581595614e-06, + "loss": 0.0251, "step": 29230 }, { - "epoch": 1.9129865881583252, - "grad_norm": 0.8813486099243164, - "learning_rate": 9.658322653138873e-05, - "loss": 0.096, + "epoch": 7.650572456656853, + "grad_norm": 0.3464919924736023, + "learning_rate": 9.257889956578383e-06, + "loss": 0.0323, "step": 29240 }, { - "epoch": 1.9136408243375858, - "grad_norm": 0.9012136459350586, - "learning_rate": 9.657988831106847e-05, - "loss": 0.0916, + "epoch": 7.653189401373896, + "grad_norm": 0.31770631670951843, + "learning_rate": 9.24183909114493e-06, + "loss": 0.0272, "step": 29250 }, { - "epoch": 1.9142950605168467, - "grad_norm": 0.8024969696998596, - "learning_rate": 9.657654851855483e-05, - "loss": 0.0926, + "epoch": 7.655806346090939, + "grad_norm": 0.39476248621940613, + "learning_rate": 9.225798996265867e-06, + "loss": 0.0277, "step": 29260 }, { - "epoch": 1.9149492966961073, - "grad_norm": 0.7480851411819458, - "learning_rate": 9.657320715396051e-05, - "loss": 0.0975, + "epoch": 7.658423290807981, + "grad_norm": 0.28465256094932556, + "learning_rate": 9.209769682904457e-06, + "loss": 0.0256, "step": 29270 }, { - "epoch": 1.915603532875368, - "grad_norm": 0.8564775586128235, - "learning_rate": 9.65698642173983e-05, - "loss": 0.0857, + "epoch": 7.661040235525024, + "grad_norm": 0.4322209358215332, + "learning_rate": 9.193751162016565e-06, + "loss": 0.0288, "step": 29280 }, { - "epoch": 1.9162577690546287, - "grad_norm": 0.79481440782547, - "learning_rate": 9.656651970898105e-05, - "loss": 0.0852, + "epoch": 7.663657180242067, + "grad_norm": 0.3503320515155792, + "learning_rate": 9.17774344455071e-06, + "loss": 0.0284, "step": 29290 }, { - "epoch": 1.9169120052338893, - "grad_norm": 0.8546467423439026, - "learning_rate": 9.656317362882164e-05, - "loss": 0.0975, + "epoch": 7.66627412495911, + "grad_norm": 0.36152294278144836, + "learning_rate": 9.161746541448019e-06, + "loss": 0.027, "step": 29300 }, { - "epoch": 1.9175662414131502, - "grad_norm": 0.8550596833229065, - "learning_rate": 9.6559825977033e-05, - "loss": 0.0848, + "epoch": 7.668891069676153, + "grad_norm": 0.2110384702682495, + "learning_rate": 9.145760463642227e-06, + "loss": 0.0276, "step": 29310 }, { - "epoch": 1.9182204775924108, - "grad_norm": 0.8139176368713379, - "learning_rate": 9.655647675372812e-05, - "loss": 0.0843, + "epoch": 7.671508014393196, + "grad_norm": 0.32499822974205017, + "learning_rate": 9.129785222059653e-06, + "loss": 0.026, "step": 29320 }, { - "epoch": 1.9188747137716717, - "grad_norm": 0.847602128982544, - "learning_rate": 9.655312595902004e-05, - "loss": 0.0865, + "epoch": 7.674124959110239, + "grad_norm": 0.286526083946228, + "learning_rate": 9.113820827619244e-06, + "loss": 0.0283, "step": 29330 }, { - "epoch": 1.9195289499509323, - "grad_norm": 0.7845019698143005, - "learning_rate": 9.654977359302189e-05, - "loss": 0.0959, + "epoch": 7.676741903827281, + "grad_norm": 0.24248047173023224, + "learning_rate": 9.09786729123249e-06, + "loss": 0.0253, "step": 29340 }, { - "epoch": 1.920183186130193, - "grad_norm": 0.8975633382797241, - "learning_rate": 9.654641965584678e-05, - "loss": 0.0842, + "epoch": 7.679358848544324, + "grad_norm": 0.38848909735679626, + "learning_rate": 9.081924623803495e-06, + "loss": 0.032, "step": 29350 }, { - "epoch": 1.9208374223094538, - "grad_norm": 0.872686505317688, - "learning_rate": 9.654306414760796e-05, - "loss": 0.0976, + "epoch": 7.681975793261367, + "grad_norm": 0.3085334002971649, + "learning_rate": 9.06599283622892e-06, + "loss": 0.0291, "step": 29360 }, { - "epoch": 1.9214916584887143, - "grad_norm": 1.081772804260254, - "learning_rate": 9.653970706841864e-05, - "loss": 0.0854, + "epoch": 7.68459273797841, + "grad_norm": 0.2823046147823334, + "learning_rate": 9.050071939398003e-06, + "loss": 0.0244, "step": 29370 }, { - "epoch": 1.9221458946679753, - "grad_norm": 0.8015065789222717, - "learning_rate": 9.653634841839216e-05, - "loss": 0.0988, + "epoch": 7.687209682695453, + "grad_norm": 0.3811781406402588, + "learning_rate": 9.034161944192508e-06, + "loss": 0.0268, "step": 29380 }, { - "epoch": 1.9228001308472358, - "grad_norm": 0.8011831045150757, - "learning_rate": 9.653298819764187e-05, - "loss": 0.0838, + "epoch": 7.689826627412496, + "grad_norm": 0.2050655484199524, + "learning_rate": 9.018262861486776e-06, + "loss": 0.0235, "step": 29390 }, { - "epoch": 1.9234543670264965, - "grad_norm": 0.8473259210586548, - "learning_rate": 9.65296264062812e-05, - "loss": 0.0897, + "epoch": 7.692443572129539, + "grad_norm": 0.2599548399448395, + "learning_rate": 9.002374702147676e-06, + "loss": 0.0291, "step": 29400 }, { - "epoch": 1.9241086032057573, - "grad_norm": 0.883358895778656, - "learning_rate": 9.652626304442361e-05, - "loss": 0.0868, + "epoch": 7.695060516846581, + "grad_norm": 0.2197568714618683, + "learning_rate": 8.98649747703463e-06, + "loss": 0.0246, "step": 29410 }, { - "epoch": 1.924762839385018, - "grad_norm": 1.020606279373169, - "learning_rate": 9.652289811218261e-05, - "loss": 0.094, + "epoch": 7.697677461563624, + "grad_norm": 0.23569698631763458, + "learning_rate": 8.970631196999552e-06, + "loss": 0.0289, "step": 29420 }, { - "epoch": 1.9254170755642788, - "grad_norm": 0.7326846718788147, - "learning_rate": 9.65195316096718e-05, - "loss": 0.0892, + "epoch": 7.700294406280667, + "grad_norm": 0.48520246148109436, + "learning_rate": 8.954775872886908e-06, + "loss": 0.0294, "step": 29430 }, { - "epoch": 1.9260713117435393, - "grad_norm": 1.0169782638549805, - "learning_rate": 9.651616353700479e-05, - "loss": 0.088, + "epoch": 7.70291135099771, + "grad_norm": 0.23008428514003754, + "learning_rate": 8.938931515533652e-06, + "loss": 0.0306, "step": 29440 }, { - "epoch": 1.9267255479228003, - "grad_norm": 0.9561719298362732, - "learning_rate": 9.651279389429526e-05, - "loss": 0.0906, + "epoch": 7.705528295714753, + "grad_norm": 0.4600384831428528, + "learning_rate": 8.923098135769258e-06, + "loss": 0.022, "step": 29450 }, { - "epoch": 1.9273797841020608, - "grad_norm": 0.8069345355033875, - "learning_rate": 9.650942268165698e-05, - "loss": 0.1029, + "epoch": 7.708145240431795, + "grad_norm": 0.39224159717559814, + "learning_rate": 8.907275744415692e-06, + "loss": 0.0263, "step": 29460 }, { - "epoch": 1.9280340202813215, - "grad_norm": 1.1819461584091187, - "learning_rate": 9.65060498992037e-05, - "loss": 0.0911, + "epoch": 7.710762185148838, + "grad_norm": 0.24442845582962036, + "learning_rate": 8.891464352287418e-06, + "loss": 0.028, "step": 29470 }, { - "epoch": 1.9286882564605823, - "grad_norm": 0.7691984176635742, - "learning_rate": 9.650267554704924e-05, - "loss": 0.0923, + "epoch": 7.713379129865881, + "grad_norm": 0.25245606899261475, + "learning_rate": 8.875663970191356e-06, + "loss": 0.0281, "step": 29480 }, { - "epoch": 1.929342492639843, - "grad_norm": 0.8642717599868774, - "learning_rate": 9.649929962530756e-05, - "loss": 0.0843, + "epoch": 7.715996074582924, + "grad_norm": 0.28361907601356506, + "learning_rate": 8.859874608926928e-06, + "loss": 0.0254, "step": 29490 }, { - "epoch": 1.9299967288191038, - "grad_norm": 0.7647484540939331, - "learning_rate": 9.649592213409253e-05, - "loss": 0.0902, + "epoch": 7.718613019299967, + "grad_norm": 0.23908400535583496, + "learning_rate": 8.844096279286019e-06, + "loss": 0.0278, "step": 29500 }, { - "epoch": 1.9306509649983643, - "grad_norm": 0.7451311349868774, - "learning_rate": 9.64925430735182e-05, - "loss": 0.0902, + "epoch": 7.72122996401701, + "grad_norm": 0.22012092173099518, + "learning_rate": 8.828328992052953e-06, + "loss": 0.0245, "step": 29510 }, { - "epoch": 1.9313052011776253, - "grad_norm": 0.820818305015564, - "learning_rate": 9.648916244369863e-05, - "loss": 0.0895, + "epoch": 7.723846908734053, + "grad_norm": 0.4234012961387634, + "learning_rate": 8.812572758004534e-06, + "loss": 0.0319, "step": 29520 }, { - "epoch": 1.9319594373568858, - "grad_norm": 1.151523232460022, - "learning_rate": 9.648578024474789e-05, - "loss": 0.092, + "epoch": 7.726463853451095, + "grad_norm": 0.3428882360458374, + "learning_rate": 8.796827587910003e-06, + "loss": 0.0263, "step": 29530 }, { - "epoch": 1.9326136735361465, - "grad_norm": 0.9633611440658569, - "learning_rate": 9.648239647678017e-05, - "loss": 0.0947, + "epoch": 7.729080798168138, + "grad_norm": 0.4102848768234253, + "learning_rate": 8.781093492531023e-06, + "loss": 0.0262, "step": 29540 }, { - "epoch": 1.9332679097154073, - "grad_norm": 0.7115769386291504, - "learning_rate": 9.647901113990964e-05, - "loss": 0.0909, + "epoch": 7.731697742885181, + "grad_norm": 0.39069345593452454, + "learning_rate": 8.765370482621701e-06, + "loss": 0.0251, "step": 29550 }, { - "epoch": 1.9339221458946678, - "grad_norm": 0.7603625655174255, - "learning_rate": 9.647562423425061e-05, - "loss": 0.0846, + "epoch": 7.734314687602224, + "grad_norm": 0.31306493282318115, + "learning_rate": 8.749658568928577e-06, + "loss": 0.024, "step": 29560 }, { - "epoch": 1.9345763820739288, - "grad_norm": 1.0014585256576538, - "learning_rate": 9.647223575991735e-05, - "loss": 0.0954, + "epoch": 7.736931632319267, + "grad_norm": 0.29695364832878113, + "learning_rate": 8.733957762190592e-06, + "loss": 0.0275, "step": 29570 }, { - "epoch": 1.9352306182531893, - "grad_norm": 0.6401631832122803, - "learning_rate": 9.646884571702428e-05, - "loss": 0.0848, + "epoch": 7.73954857703631, + "grad_norm": 0.3999333679676056, + "learning_rate": 8.718268073139091e-06, + "loss": 0.0277, "step": 29580 }, { - "epoch": 1.93588485443245, - "grad_norm": 0.8242216110229492, - "learning_rate": 9.64654541056858e-05, - "loss": 0.0834, + "epoch": 7.742165521753353, + "grad_norm": 0.2966053783893585, + "learning_rate": 8.702589512497844e-06, + "loss": 0.0286, "step": 29590 }, { - "epoch": 1.9365390906117108, - "grad_norm": 0.9129480719566345, - "learning_rate": 9.646206092601636e-05, - "loss": 0.0794, + "epoch": 7.7447824664703955, + "grad_norm": 0.4280104339122772, + "learning_rate": 8.68692209098298e-06, + "loss": 0.024, "step": 29600 }, { - "epoch": 1.9371933267909716, - "grad_norm": 0.8380915522575378, - "learning_rate": 9.645866617813053e-05, - "loss": 0.1019, + "epoch": 7.7473994111874385, + "grad_norm": 0.2608658969402313, + "learning_rate": 8.671265819303046e-06, + "loss": 0.0272, "step": 29610 }, { - "epoch": 1.9378475629702323, - "grad_norm": 0.7691987156867981, - "learning_rate": 9.645526986214286e-05, - "loss": 0.0872, + "epoch": 7.7500163559044815, + "grad_norm": 0.24662499129772186, + "learning_rate": 8.655620708158948e-06, + "loss": 0.023, "step": 29620 }, { - "epoch": 1.9385017991494928, - "grad_norm": 0.9295036792755127, - "learning_rate": 9.6451871978168e-05, - "loss": 0.0978, + "epoch": 7.7526333006215244, + "grad_norm": 0.23771394789218903, + "learning_rate": 8.639986768243991e-06, + "loss": 0.026, "step": 29630 }, { - "epoch": 1.9391560353287538, - "grad_norm": 0.8664805889129639, - "learning_rate": 9.644847252632065e-05, - "loss": 0.0909, + "epoch": 7.755250245338567, + "grad_norm": 0.2451602816581726, + "learning_rate": 8.624364010243805e-06, + "loss": 0.0312, "step": 29640 }, { - "epoch": 1.9398102715080143, - "grad_norm": 0.9215870499610901, - "learning_rate": 9.644507150671554e-05, - "loss": 0.0918, + "epoch": 7.75786719005561, + "grad_norm": 0.19674725830554962, + "learning_rate": 8.608752444836401e-06, + "loss": 0.0256, "step": 29650 }, { - "epoch": 1.940464507687275, - "grad_norm": 0.8502547740936279, - "learning_rate": 9.644166891946745e-05, - "loss": 0.086, + "epoch": 7.760484134772653, + "grad_norm": 0.2612418532371521, + "learning_rate": 8.593152082692143e-06, + "loss": 0.0278, "step": 29660 }, { - "epoch": 1.9411187438665358, - "grad_norm": 0.7667042016983032, - "learning_rate": 9.643826476469124e-05, - "loss": 0.0816, + "epoch": 7.7631010794896955, + "grad_norm": 0.2845847010612488, + "learning_rate": 8.577562934473737e-06, + "loss": 0.0232, "step": 29670 }, { - "epoch": 1.9417729800457966, - "grad_norm": 0.8172445297241211, - "learning_rate": 9.64348590425018e-05, - "loss": 0.0926, + "epoch": 7.7657180242067385, + "grad_norm": 0.2566286325454712, + "learning_rate": 8.561985010836202e-06, + "loss": 0.0235, "step": 29680 }, { - "epoch": 1.9424272162250573, - "grad_norm": 0.8571412563323975, - "learning_rate": 9.643145175301409e-05, - "loss": 0.0836, + "epoch": 7.7683349689237815, + "grad_norm": 0.20483295619487762, + "learning_rate": 8.54641832242692e-06, + "loss": 0.0298, "step": 29690 }, { - "epoch": 1.9430814524043178, - "grad_norm": 0.8760371804237366, - "learning_rate": 9.642804289634311e-05, - "loss": 0.0851, + "epoch": 7.7709519136408245, + "grad_norm": 0.22542567551136017, + "learning_rate": 8.530862879885556e-06, + "loss": 0.0257, "step": 29700 }, { - "epoch": 1.9437356885835788, - "grad_norm": 0.8582215905189514, - "learning_rate": 9.642463247260391e-05, - "loss": 0.085, + "epoch": 7.7735688583578675, + "grad_norm": 0.21155819296836853, + "learning_rate": 8.515318693844124e-06, + "loss": 0.0292, "step": 29710 }, { - "epoch": 1.9443899247628393, - "grad_norm": 0.915457010269165, - "learning_rate": 9.642122048191164e-05, - "loss": 0.091, + "epoch": 7.77618580307491, + "grad_norm": 0.27790671586990356, + "learning_rate": 8.499785774926918e-06, + "loss": 0.0226, "step": 29720 }, { - "epoch": 1.9450441609421, - "grad_norm": 0.8546823859214783, - "learning_rate": 9.641780692438142e-05, - "loss": 0.0907, + "epoch": 7.778802747791953, + "grad_norm": 0.37075120210647583, + "learning_rate": 8.48426413375056e-06, + "loss": 0.0259, "step": 29730 }, { - "epoch": 1.9456983971213608, - "grad_norm": 0.9210653305053711, - "learning_rate": 9.641439180012848e-05, - "loss": 0.0893, + "epoch": 7.781419692508996, + "grad_norm": 0.23859499394893646, + "learning_rate": 8.468753780923922e-06, + "loss": 0.0244, "step": 29740 }, { - "epoch": 1.9463526333006214, - "grad_norm": 0.7805274128913879, - "learning_rate": 9.641097510926809e-05, - "loss": 0.0863, + "epoch": 7.784036637226039, + "grad_norm": 0.3619495630264282, + "learning_rate": 8.453254727048193e-06, + "loss": 0.0262, "step": 29750 }, { - "epoch": 1.9470068694798823, - "grad_norm": 0.8311102390289307, - "learning_rate": 9.640755685191556e-05, - "loss": 0.083, + "epoch": 7.7866535819430815, + "grad_norm": 0.2726457715034485, + "learning_rate": 8.437766982716835e-06, + "loss": 0.0277, "step": 29760 }, { - "epoch": 1.9476611056591429, - "grad_norm": 0.9258164763450623, - "learning_rate": 9.640413702818629e-05, - "loss": 0.0849, + "epoch": 7.7892705266601245, + "grad_norm": 0.3399900794029236, + "learning_rate": 8.422290558515577e-06, + "loss": 0.0293, "step": 29770 }, { - "epoch": 1.9483153418384038, - "grad_norm": 0.8838931918144226, - "learning_rate": 9.64007156381957e-05, - "loss": 0.0955, + "epoch": 7.7918874713771675, + "grad_norm": 0.2570652663707733, + "learning_rate": 8.406825465022405e-06, + "loss": 0.0258, "step": 29780 }, { - "epoch": 1.9489695780176644, - "grad_norm": 0.9353017210960388, - "learning_rate": 9.639729268205926e-05, - "loss": 0.0864, + "epoch": 7.79450441609421, + "grad_norm": 0.3375178575515747, + "learning_rate": 8.391371712807556e-06, + "loss": 0.0293, "step": 29790 }, { - "epoch": 1.949623814196925, - "grad_norm": 0.8417748808860779, - "learning_rate": 9.639386815989252e-05, - "loss": 0.104, + "epoch": 7.797121360811253, + "grad_norm": 0.4316467344760895, + "learning_rate": 8.37592931243353e-06, + "loss": 0.0282, "step": 29800 }, { - "epoch": 1.9502780503761858, - "grad_norm": 0.8374495506286621, - "learning_rate": 9.639044207181105e-05, - "loss": 0.1021, + "epoch": 7.799738305528296, + "grad_norm": 0.292636513710022, + "learning_rate": 8.360498274455064e-06, + "loss": 0.0302, "step": 29810 }, { - "epoch": 1.9509322865554464, - "grad_norm": 0.8973211646080017, - "learning_rate": 9.63870144179305e-05, - "loss": 0.1022, + "epoch": 7.802355250245339, + "grad_norm": 0.3243941366672516, + "learning_rate": 8.345078609419124e-06, + "loss": 0.0228, "step": 29820 }, { - "epoch": 1.9515865227347073, - "grad_norm": 0.9601650834083557, - "learning_rate": 9.638358519836656e-05, - "loss": 0.1001, + "epoch": 7.804972194962382, + "grad_norm": 0.2369060516357422, + "learning_rate": 8.32967032786492e-06, + "loss": 0.0261, "step": 29830 }, { - "epoch": 1.9522407589139679, - "grad_norm": 0.9467968344688416, - "learning_rate": 9.638015441323496e-05, - "loss": 0.0787, + "epoch": 7.807589139679425, + "grad_norm": 0.5797624588012695, + "learning_rate": 8.314273440323844e-06, + "loss": 0.0255, "step": 29840 }, { - "epoch": 1.9528949950932286, - "grad_norm": 1.110438585281372, - "learning_rate": 9.637672206265152e-05, - "loss": 0.1028, + "epoch": 7.810206084396468, + "grad_norm": 0.3765929043292999, + "learning_rate": 8.298887957319538e-06, + "loss": 0.027, "step": 29850 }, { - "epoch": 1.9535492312724894, - "grad_norm": 1.1199315786361694, - "learning_rate": 9.63732881467321e-05, - "loss": 0.0869, + "epoch": 7.81282302911351, + "grad_norm": 0.2738799452781677, + "learning_rate": 8.283513889367827e-06, + "loss": 0.0302, "step": 29860 }, { - "epoch": 1.95420346745175, - "grad_norm": 0.796136736869812, - "learning_rate": 9.636985266559258e-05, - "loss": 0.0975, + "epoch": 7.815439973830553, + "grad_norm": 0.3794039189815521, + "learning_rate": 8.268151246976755e-06, + "loss": 0.0282, "step": 29870 }, { - "epoch": 1.9548577036310109, - "grad_norm": 0.8099967241287231, - "learning_rate": 9.63664156193489e-05, - "loss": 0.0837, + "epoch": 7.818056918547596, + "grad_norm": 0.3417680263519287, + "learning_rate": 8.252800040646536e-06, + "loss": 0.0278, "step": 29880 }, { - "epoch": 1.9555119398102714, - "grad_norm": 0.7563862204551697, - "learning_rate": 9.636297700811712e-05, - "loss": 0.0844, + "epoch": 7.820673863264639, + "grad_norm": 0.2978939414024353, + "learning_rate": 8.23746028086956e-06, + "loss": 0.026, "step": 29890 }, { - "epoch": 1.9561661759895324, - "grad_norm": 1.2173089981079102, - "learning_rate": 9.635953683201325e-05, - "loss": 0.0896, + "epoch": 7.823290807981682, + "grad_norm": 0.2458307147026062, + "learning_rate": 8.222131978130424e-06, + "loss": 0.0314, "step": 29900 }, { - "epoch": 1.9568204121687929, - "grad_norm": 0.7833836078643799, - "learning_rate": 9.635609509115344e-05, - "loss": 0.0898, + "epoch": 7.825907752698724, + "grad_norm": 0.47884535789489746, + "learning_rate": 8.206815142905875e-06, + "loss": 0.0277, "step": 29910 }, { - "epoch": 1.9574746483480536, - "grad_norm": 0.6414048075675964, - "learning_rate": 9.635265178565385e-05, - "loss": 0.0914, + "epoch": 7.828524697415767, + "grad_norm": 0.5162719488143921, + "learning_rate": 8.191509785664825e-06, + "loss": 0.0256, "step": 29920 }, { - "epoch": 1.9581288845273144, - "grad_norm": 0.7374199032783508, - "learning_rate": 9.63492069156307e-05, - "loss": 0.0874, + "epoch": 7.83114164213281, + "grad_norm": 0.41363653540611267, + "learning_rate": 8.176215916868351e-06, + "loss": 0.031, "step": 29930 }, { - "epoch": 1.9587831207065751, - "grad_norm": 1.0497411489486694, - "learning_rate": 9.634576048120027e-05, - "loss": 0.086, + "epoch": 7.833758586849853, + "grad_norm": 0.455657422542572, + "learning_rate": 8.160933546969649e-06, + "loss": 0.0289, "step": 29940 }, { - "epoch": 1.9594373568858359, - "grad_norm": 0.7760803699493408, - "learning_rate": 9.634231248247886e-05, - "loss": 0.091, + "epoch": 7.836375531566896, + "grad_norm": 0.39834895730018616, + "learning_rate": 8.145662686414086e-06, + "loss": 0.0271, "step": 29950 }, { - "epoch": 1.9600915930650964, - "grad_norm": 0.7324857711791992, - "learning_rate": 9.633886291958287e-05, - "loss": 0.0913, + "epoch": 7.838992476283939, + "grad_norm": 0.23709778487682343, + "learning_rate": 8.13040334563915e-06, + "loss": 0.0278, "step": 29960 }, { - "epoch": 1.9607458292443574, - "grad_norm": 0.9544631838798523, - "learning_rate": 9.633541179262874e-05, - "loss": 0.0936, + "epoch": 7.841609421000982, + "grad_norm": 0.46160802245140076, + "learning_rate": 8.115155535074465e-06, + "loss": 0.026, "step": 29970 }, { - "epoch": 1.9614000654236179, - "grad_norm": 0.8895130157470703, - "learning_rate": 9.633195910173294e-05, - "loss": 0.0883, + "epoch": 7.844226365718024, + "grad_norm": 0.2947714626789093, + "learning_rate": 8.099919265141755e-06, + "loss": 0.0247, "step": 29980 }, { - "epoch": 1.9620543016028786, - "grad_norm": 0.8787074089050293, - "learning_rate": 9.632850484701199e-05, - "loss": 0.0989, + "epoch": 7.846843310435067, + "grad_norm": 0.31289541721343994, + "learning_rate": 8.084694546254862e-06, + "loss": 0.0234, "step": 29990 }, { - "epoch": 1.9627085377821394, - "grad_norm": 0.9097030162811279, - "learning_rate": 9.632504902858253e-05, - "loss": 0.0864, + "epoch": 7.84946025515211, + "grad_norm": 0.4130816161632538, + "learning_rate": 8.069481388819747e-06, + "loss": 0.0246, + "step": 30000 + }, + { + "epoch": 7.84946025515211, + "eval_loss": 0.030050184655183172, + "eval_runtime": 9.5757, + "eval_samples_per_second": 106.938, + "eval_steps_per_second": 1.671, "step": 30000 }, { - "epoch": 1.9633627739614, - "grad_norm": 0.9169159531593323, - "learning_rate": 9.632159164656114e-05, - "loss": 0.101, + "epoch": 7.852077199869153, + "grad_norm": 0.3342052102088928, + "learning_rate": 8.054279803234455e-06, + "loss": 0.0265, "step": 30010 }, { - "epoch": 1.9640170101406609, - "grad_norm": 0.8284802436828613, - "learning_rate": 9.631813270106458e-05, - "loss": 0.0929, + "epoch": 7.854694144586196, + "grad_norm": 0.265415757894516, + "learning_rate": 8.03908979988913e-06, + "loss": 0.0313, "step": 30020 }, { - "epoch": 1.9646712463199214, - "grad_norm": 0.69024258852005, - "learning_rate": 9.631467219220955e-05, - "loss": 0.0962, + "epoch": 7.857311089303239, + "grad_norm": 0.2864171266555786, + "learning_rate": 8.023911389166002e-06, + "loss": 0.0284, "step": 30030 }, { - "epoch": 1.9653254824991822, - "grad_norm": 0.8356816172599792, - "learning_rate": 9.631121012011288e-05, - "loss": 0.0838, + "epoch": 7.859928034020282, + "grad_norm": 0.551828920841217, + "learning_rate": 8.008744581439357e-06, + "loss": 0.029, "step": 30040 }, { - "epoch": 1.965979718678443, - "grad_norm": 0.8797650933265686, - "learning_rate": 9.630774648489141e-05, - "loss": 0.0877, + "epoch": 7.862544978737324, + "grad_norm": 0.2644766569137573, + "learning_rate": 7.993589387075574e-06, + "loss": 0.025, "step": 30050 }, { - "epoch": 1.9666339548577036, - "grad_norm": 0.9771427512168884, - "learning_rate": 9.630428128666204e-05, - "loss": 0.1115, + "epoch": 7.865161923454367, + "grad_norm": 0.34633108973503113, + "learning_rate": 7.978445816433092e-06, + "loss": 0.0251, "step": 30060 }, { - "epoch": 1.9672881910369644, - "grad_norm": 0.791826605796814, - "learning_rate": 9.630081452554174e-05, - "loss": 0.0912, + "epoch": 7.86777886817141, + "grad_norm": 0.32692670822143555, + "learning_rate": 7.96331387986238e-06, + "loss": 0.0262, "step": 30070 }, { - "epoch": 1.967942427216225, - "grad_norm": 0.7139357924461365, - "learning_rate": 9.629734620164753e-05, - "loss": 0.084, + "epoch": 7.870395812888453, + "grad_norm": 0.34559890627861023, + "learning_rate": 7.948193587705993e-06, + "loss": 0.031, "step": 30080 }, { - "epoch": 1.968596663395486, - "grad_norm": 0.9226008653640747, - "learning_rate": 9.629387631509646e-05, - "loss": 0.0954, + "epoch": 7.873012757605496, + "grad_norm": 0.29954639077186584, + "learning_rate": 7.933084950298495e-06, + "loss": 0.0229, "step": 30090 }, { - "epoch": 1.9692508995747464, - "grad_norm": 0.9457941055297852, - "learning_rate": 9.629040486600567e-05, - "loss": 0.0924, + "epoch": 7.875629702322539, + "grad_norm": 0.3873235881328583, + "learning_rate": 7.917987977966501e-06, + "loss": 0.0333, "step": 30100 }, { - "epoch": 1.9699051357540072, - "grad_norm": 0.6717526912689209, - "learning_rate": 9.628693185449228e-05, - "loss": 0.0829, + "epoch": 7.878246647039582, + "grad_norm": 0.2134721726179123, + "learning_rate": 7.902902681028648e-06, + "loss": 0.0267, "step": 30110 }, { - "epoch": 1.970559371933268, - "grad_norm": 0.8272339105606079, - "learning_rate": 9.628345728067359e-05, - "loss": 0.0924, + "epoch": 7.880863591756624, + "grad_norm": 0.3731750249862671, + "learning_rate": 7.8878290697956e-06, + "loss": 0.0231, "step": 30120 }, { - "epoch": 1.9712136081125287, - "grad_norm": 0.8791738748550415, - "learning_rate": 9.62799811446668e-05, - "loss": 0.0819, + "epoch": 7.883480536473667, + "grad_norm": 0.33244821429252625, + "learning_rate": 7.87276715457003e-06, + "loss": 0.0291, "step": 30130 }, { - "epoch": 1.9718678442917894, - "grad_norm": 0.899770200252533, - "learning_rate": 9.627650344658929e-05, - "loss": 0.0892, + "epoch": 7.88609748119071, + "grad_norm": 0.3392798602581024, + "learning_rate": 7.857716945646603e-06, + "loss": 0.0292, "step": 30140 }, { - "epoch": 1.97252208047105, - "grad_norm": 0.8326700329780579, - "learning_rate": 9.627302418655844e-05, - "loss": 0.0937, + "epoch": 7.888714425907753, + "grad_norm": 0.291120707988739, + "learning_rate": 7.842678453312008e-06, + "loss": 0.0269, "step": 30150 }, { - "epoch": 1.973176316650311, - "grad_norm": 0.9431321024894714, - "learning_rate": 9.626954336469166e-05, - "loss": 0.0902, + "epoch": 7.891331370624796, + "grad_norm": 0.28149276971817017, + "learning_rate": 7.8276516878449e-06, + "loss": 0.029, "step": 30160 }, { - "epoch": 1.9738305528295714, - "grad_norm": 0.9222663044929504, - "learning_rate": 9.626606098110643e-05, - "loss": 0.0893, + "epoch": 7.893948315341838, + "grad_norm": 0.2638510763645172, + "learning_rate": 7.812636659515937e-06, + "loss": 0.0235, "step": 30170 }, { - "epoch": 1.9744847890088322, - "grad_norm": 0.7812402248382568, - "learning_rate": 9.62625770359203e-05, - "loss": 0.0853, + "epoch": 7.896565260058881, + "grad_norm": 0.26685631275177, + "learning_rate": 7.797633378587759e-06, + "loss": 0.0266, "step": 30180 }, { - "epoch": 1.975139025188093, - "grad_norm": 0.8364319801330566, - "learning_rate": 9.625909152925088e-05, - "loss": 0.0936, + "epoch": 7.899182204775924, + "grad_norm": 0.27392229437828064, + "learning_rate": 7.78264185531495e-06, + "loss": 0.023, "step": 30190 }, { - "epoch": 1.9757932613673534, - "grad_norm": 1.1527427434921265, - "learning_rate": 9.62556044612158e-05, - "loss": 0.0859, + "epoch": 7.901799149492967, + "grad_norm": 0.33641985058784485, + "learning_rate": 7.767662099944082e-06, + "loss": 0.0242, "step": 30200 }, { - "epoch": 1.9764474975466144, - "grad_norm": 0.8224945068359375, - "learning_rate": 9.625211583193275e-05, - "loss": 0.0813, + "epoch": 7.90441609421001, + "grad_norm": 0.32816699147224426, + "learning_rate": 7.752694122713678e-06, + "loss": 0.0249, "step": 30210 }, { - "epoch": 1.977101733725875, - "grad_norm": 0.81529700756073, - "learning_rate": 9.62486256415195e-05, - "loss": 0.0915, + "epoch": 7.907033038927053, + "grad_norm": 0.31403347849845886, + "learning_rate": 7.737737933854209e-06, + "loss": 0.0265, "step": 30220 }, { - "epoch": 1.977755969905136, - "grad_norm": 0.839607834815979, - "learning_rate": 9.624513389009385e-05, - "loss": 0.09, + "epoch": 7.909649983644096, + "grad_norm": 0.24018634855747223, + "learning_rate": 7.722793543588097e-06, + "loss": 0.0218, "step": 30230 }, { - "epoch": 1.9784102060843964, - "grad_norm": 0.7841370105743408, - "learning_rate": 9.624164057777363e-05, - "loss": 0.0921, + "epoch": 7.912266928361138, + "grad_norm": 0.21781381964683533, + "learning_rate": 7.707860962129673e-06, + "loss": 0.0306, "step": 30240 }, { - "epoch": 1.9790644422636572, - "grad_norm": 0.8300483226776123, - "learning_rate": 9.623814570467678e-05, - "loss": 0.083, + "epoch": 7.914883873078181, + "grad_norm": 0.33943313360214233, + "learning_rate": 7.692940199685236e-06, + "loss": 0.0306, "step": 30250 }, { - "epoch": 1.979718678442918, - "grad_norm": 0.7854914665222168, - "learning_rate": 9.623464927092123e-05, - "loss": 0.0886, + "epoch": 7.917500817795224, + "grad_norm": 0.3407123386859894, + "learning_rate": 7.67803126645297e-06, + "loss": 0.0259, "step": 30260 }, { - "epoch": 1.9803729146221785, - "grad_norm": 0.7365754842758179, - "learning_rate": 9.623115127662504e-05, - "loss": 0.0808, + "epoch": 7.920117762512267, + "grad_norm": 0.32126396894454956, + "learning_rate": 7.663134172622996e-06, + "loss": 0.0282, "step": 30270 }, { - "epoch": 1.9810271508014394, - "grad_norm": 0.8063216209411621, - "learning_rate": 9.622765172190624e-05, - "loss": 0.0885, + "epoch": 7.92273470722931, + "grad_norm": 0.42551764845848083, + "learning_rate": 7.648248928377347e-06, + "loss": 0.0269, "step": 30280 }, { - "epoch": 1.9816813869807, - "grad_norm": 0.9440072774887085, - "learning_rate": 9.622415060688294e-05, - "loss": 0.0916, + "epoch": 7.925351651946353, + "grad_norm": 0.2464340627193451, + "learning_rate": 7.633375543889929e-06, + "loss": 0.0246, "step": 30290 }, { - "epoch": 1.9823356231599607, - "grad_norm": 0.9911891222000122, - "learning_rate": 9.622064793167336e-05, - "loss": 0.0785, + "epoch": 7.927968596663396, + "grad_norm": 0.2966938316822052, + "learning_rate": 7.618514029326571e-06, + "loss": 0.0246, "step": 30300 }, { - "epoch": 1.9829898593392215, - "grad_norm": 0.8785775303840637, - "learning_rate": 9.621714369639567e-05, - "loss": 0.0886, + "epoch": 7.930585541380438, + "grad_norm": 0.27248331904411316, + "learning_rate": 7.603664394844973e-06, + "loss": 0.0232, "step": 30310 }, { - "epoch": 1.9836440955184822, - "grad_norm": 0.9227697849273682, - "learning_rate": 9.621363790116819e-05, - "loss": 0.0839, + "epoch": 7.933202486097481, + "grad_norm": 0.322000116109848, + "learning_rate": 7.588826650594727e-06, + "loss": 0.0238, "step": 30320 }, { - "epoch": 1.984298331697743, - "grad_norm": 1.0479682683944702, - "learning_rate": 9.621013054610922e-05, - "loss": 0.0892, + "epoch": 7.935819430814524, + "grad_norm": 0.33744585514068604, + "learning_rate": 7.574000806717293e-06, + "loss": 0.0265, "step": 30330 }, { - "epoch": 1.9849525678770035, - "grad_norm": 0.957587718963623, - "learning_rate": 9.620662163133715e-05, - "loss": 0.0773, + "epoch": 7.938436375531567, + "grad_norm": 0.17370611429214478, + "learning_rate": 7.5591868733459915e-06, + "loss": 0.0284, "step": 30340 }, { - "epoch": 1.9856068040562644, - "grad_norm": 1.071298360824585, - "learning_rate": 9.620311115697043e-05, - "loss": 0.0865, + "epoch": 7.94105332024861, + "grad_norm": 0.2931711673736572, + "learning_rate": 7.544384860605996e-06, + "loss": 0.0291, "step": 30350 }, { - "epoch": 1.986261040235525, - "grad_norm": 0.8113812208175659, - "learning_rate": 9.619959912312752e-05, - "loss": 0.0953, + "epoch": 7.943670264965653, + "grad_norm": 0.25150954723358154, + "learning_rate": 7.529594778614355e-06, + "loss": 0.0227, "step": 30360 }, { - "epoch": 1.9869152764147857, - "grad_norm": 0.7876493334770203, - "learning_rate": 9.6196085529927e-05, - "loss": 0.0948, + "epoch": 7.946287209682695, + "grad_norm": 0.6533074975013733, + "learning_rate": 7.514816637479943e-06, + "loss": 0.0247, "step": 30370 }, { - "epoch": 1.9875695125940465, - "grad_norm": 0.9819501638412476, - "learning_rate": 9.619257037748742e-05, - "loss": 0.0929, + "epoch": 7.948904154399738, + "grad_norm": 0.2975616455078125, + "learning_rate": 7.500050447303494e-06, + "loss": 0.0246, "step": 30380 }, { - "epoch": 1.9882237487733072, - "grad_norm": 0.8558057546615601, - "learning_rate": 9.618905366592745e-05, - "loss": 0.0884, + "epoch": 7.951521099116781, + "grad_norm": 0.37849241495132446, + "learning_rate": 7.4852962181775396e-06, + "loss": 0.026, "step": 30390 }, { - "epoch": 1.988877984952568, - "grad_norm": 0.7599206566810608, - "learning_rate": 9.618553539536579e-05, - "loss": 0.0896, + "epoch": 7.954138043833824, + "grad_norm": 0.33247023820877075, + "learning_rate": 7.470553960186469e-06, + "loss": 0.0265, "step": 30400 }, { - "epoch": 1.9895322211318285, - "grad_norm": 0.8661050200462341, - "learning_rate": 9.618201556592117e-05, - "loss": 0.0754, + "epoch": 7.956754988550867, + "grad_norm": 0.2878034710884094, + "learning_rate": 7.455823683406474e-06, + "loss": 0.0318, "step": 30410 }, { - "epoch": 1.9901864573110895, - "grad_norm": 0.7166074514389038, - "learning_rate": 9.617849417771244e-05, - "loss": 0.086, + "epoch": 7.95937193326791, + "grad_norm": 0.2719588577747345, + "learning_rate": 7.441105397905557e-06, + "loss": 0.0252, "step": 30420 }, { - "epoch": 1.99084069349035, - "grad_norm": 0.7684576511383057, - "learning_rate": 9.61749712308584e-05, - "loss": 0.0905, + "epoch": 7.961988877984952, + "grad_norm": 0.33773666620254517, + "learning_rate": 7.42639911374354e-06, + "loss": 0.0257, "step": 30430 }, { - "epoch": 1.9914949296696107, - "grad_norm": 0.8405929803848267, - "learning_rate": 9.617144672547798e-05, - "loss": 0.09, + "epoch": 7.964605822701995, + "grad_norm": 0.34900739789009094, + "learning_rate": 7.411704840972017e-06, + "loss": 0.0228, "step": 30440 }, { - "epoch": 1.9921491658488715, - "grad_norm": 0.9315941333770752, - "learning_rate": 9.616792066169013e-05, - "loss": 0.091, + "epoch": 7.967222767419038, + "grad_norm": 0.17360925674438477, + "learning_rate": 7.397022589634381e-06, + "loss": 0.022, "step": 30450 }, { - "epoch": 1.992803402028132, - "grad_norm": 0.7858991622924805, - "learning_rate": 9.616439303961391e-05, - "loss": 0.095, + "epoch": 7.969839712136081, + "grad_norm": 0.29911497235298157, + "learning_rate": 7.382352369765821e-06, + "loss": 0.0271, "step": 30460 }, { - "epoch": 1.993457638207393, - "grad_norm": 0.6836791038513184, - "learning_rate": 9.616086385936833e-05, - "loss": 0.0867, + "epoch": 7.972456656853124, + "grad_norm": 0.2711620032787323, + "learning_rate": 7.36769419139329e-06, + "loss": 0.0233, "step": 30470 }, { - "epoch": 1.9941118743866535, - "grad_norm": 0.8010273575782776, - "learning_rate": 9.615733312107255e-05, - "loss": 0.0946, + "epoch": 7.975073601570167, + "grad_norm": 0.18015000224113464, + "learning_rate": 7.353048064535523e-06, + "loss": 0.0226, "step": 30480 }, { - "epoch": 1.9947661105659142, - "grad_norm": 0.9536259770393372, - "learning_rate": 9.615380082484571e-05, - "loss": 0.0926, + "epoch": 7.97769054628721, + "grad_norm": 0.21266216039657593, + "learning_rate": 7.338413999202998e-06, + "loss": 0.0254, "step": 30490 }, { - "epoch": 1.995420346745175, - "grad_norm": 0.880687952041626, - "learning_rate": 9.615026697080707e-05, - "loss": 0.1006, + "epoch": 7.980307491004252, + "grad_norm": 0.3124334514141083, + "learning_rate": 7.323792005397964e-06, + "loss": 0.0226, "step": 30500 }, { - "epoch": 1.9960745829244357, - "grad_norm": 0.9787200689315796, - "learning_rate": 9.614673155907587e-05, - "loss": 0.0929, + "epoch": 7.982924435721295, + "grad_norm": 0.1995360255241394, + "learning_rate": 7.309182093114417e-06, + "loss": 0.0203, "step": 30510 }, { - "epoch": 1.9967288191036965, - "grad_norm": 0.906243085861206, - "learning_rate": 9.614319458977145e-05, - "loss": 0.0931, + "epoch": 7.985541380438338, + "grad_norm": 0.2655145227909088, + "learning_rate": 7.2945842723381035e-06, + "loss": 0.0248, "step": 30520 }, { - "epoch": 1.997383055282957, - "grad_norm": 0.7984228730201721, - "learning_rate": 9.613965606301321e-05, - "loss": 0.0822, + "epoch": 7.988158325155381, + "grad_norm": 0.34035325050354004, + "learning_rate": 7.27999855304648e-06, + "loss": 0.0223, "step": 30530 }, { - "epoch": 1.998037291462218, - "grad_norm": 0.8581556081771851, - "learning_rate": 9.613611597892059e-05, - "loss": 0.0877, + "epoch": 7.990775269872424, + "grad_norm": 0.21241000294685364, + "learning_rate": 7.265424945208765e-06, + "loss": 0.0251, "step": 30540 }, { - "epoch": 1.9986915276414785, - "grad_norm": 0.724999725818634, - "learning_rate": 9.613257433761303e-05, - "loss": 0.0897, + "epoch": 7.993392214589467, + "grad_norm": 0.29762494564056396, + "learning_rate": 7.250863458785864e-06, + "loss": 0.0269, "step": 30550 }, { - "epoch": 1.9993457638207393, - "grad_norm": 0.6740709543228149, - "learning_rate": 9.612903113921011e-05, - "loss": 0.0895, + "epoch": 7.99600915930651, + "grad_norm": 0.28159022331237793, + "learning_rate": 7.236314103730424e-06, + "loss": 0.0247, "step": 30560 }, { - "epoch": 2.0, - "grad_norm": 0.8931834101676941, - "learning_rate": 9.612548638383141e-05, - "loss": 0.0857, + "epoch": 7.998626104023552, + "grad_norm": 0.39448240399360657, + "learning_rate": 7.221776889986792e-06, + "loss": 0.0255, "step": 30570 }, { - "epoch": 2.0006542361792605, - "grad_norm": 0.8923904299736023, - "learning_rate": 9.612194007159657e-05, - "loss": 0.0848, + "epoch": 8.001046777886817, + "grad_norm": 0.22735320031642914, + "learning_rate": 7.2072518274910185e-06, + "loss": 0.0209, "step": 30580 }, { - "epoch": 2.0013084723585215, - "grad_norm": 0.8426147699356079, - "learning_rate": 9.61183922026253e-05, - "loss": 0.0931, + "epoch": 8.00366372260386, + "grad_norm": 0.2990175187587738, + "learning_rate": 7.192738926170853e-06, + "loss": 0.0237, "step": 30590 }, { - "epoch": 2.001962708537782, - "grad_norm": 0.7947473526000977, - "learning_rate": 9.611484277703733e-05, - "loss": 0.0989, + "epoch": 8.006280667320903, + "grad_norm": 0.25007107853889465, + "learning_rate": 7.1782381959457105e-06, + "loss": 0.0262, "step": 30600 }, { - "epoch": 2.002616944717043, - "grad_norm": 0.8306055665016174, - "learning_rate": 9.61112917949525e-05, - "loss": 0.0867, + "epoch": 8.008897612037945, + "grad_norm": 0.2612306475639343, + "learning_rate": 7.1637496467267115e-06, + "loss": 0.0296, "step": 30610 }, { - "epoch": 2.0032711808963035, - "grad_norm": 0.9573779702186584, - "learning_rate": 9.610773925649062e-05, - "loss": 0.0943, + "epoch": 8.011514556754989, + "grad_norm": 0.327019065618515, + "learning_rate": 7.149273288416652e-06, + "loss": 0.0241, "step": 30620 }, { - "epoch": 2.0039254170755645, - "grad_norm": 0.8894043564796448, - "learning_rate": 9.610418516177164e-05, - "loss": 0.0891, + "epoch": 8.014131501472031, + "grad_norm": 0.3024859130382538, + "learning_rate": 7.13480913090997e-06, + "loss": 0.0263, "step": 30630 }, { - "epoch": 2.004579653254825, - "grad_norm": 0.8106326460838318, - "learning_rate": 9.610062951091547e-05, - "loss": 0.0892, + "epoch": 8.016748446189075, + "grad_norm": 0.302470862865448, + "learning_rate": 7.120357184092796e-06, + "loss": 0.0219, "step": 30640 }, { - "epoch": 2.0052338894340855, - "grad_norm": 0.9291239976882935, - "learning_rate": 9.609707230404217e-05, - "loss": 0.0835, + "epoch": 8.019365390906117, + "grad_norm": 0.19182458519935608, + "learning_rate": 7.1059174578428835e-06, + "loss": 0.0227, "step": 30650 }, { - "epoch": 2.0058881256133465, - "grad_norm": 0.7362892627716064, - "learning_rate": 9.609351354127178e-05, - "loss": 0.0838, + "epoch": 8.02198233562316, + "grad_norm": 0.2500637173652649, + "learning_rate": 7.091489962029657e-06, + "loss": 0.0235, "step": 30660 }, { - "epoch": 2.006542361792607, - "grad_norm": 0.7407713532447815, - "learning_rate": 9.608995322272442e-05, - "loss": 0.0951, + "epoch": 8.024599280340203, + "grad_norm": 0.173533633351326, + "learning_rate": 7.077074706514175e-06, + "loss": 0.0238, "step": 30670 }, { - "epoch": 2.007196597971868, - "grad_norm": 0.9148522019386292, - "learning_rate": 9.608639134852028e-05, - "loss": 0.0895, + "epoch": 8.027216225057245, + "grad_norm": 0.3211864233016968, + "learning_rate": 7.0626717011491285e-06, + "loss": 0.0269, "step": 30680 }, { - "epoch": 2.0078508341511285, - "grad_norm": 0.7129114866256714, - "learning_rate": 9.608282791877955e-05, - "loss": 0.0866, + "epoch": 8.029833169774289, + "grad_norm": 0.2305508255958557, + "learning_rate": 7.048280955778844e-06, + "loss": 0.0272, "step": 30690 }, { - "epoch": 2.008505070330389, - "grad_norm": 1.1984944343566895, - "learning_rate": 9.607926293362253e-05, - "loss": 0.0941, + "epoch": 8.032450114491331, + "grad_norm": 0.25919488072395325, + "learning_rate": 7.0339024802392404e-06, + "loss": 0.0252, "step": 30700 }, { - "epoch": 2.00915930650965, - "grad_norm": 0.7906424403190613, - "learning_rate": 9.607569639316953e-05, - "loss": 0.0811, + "epoch": 8.035067059208375, + "grad_norm": 0.2955307960510254, + "learning_rate": 7.019536284357892e-06, + "loss": 0.0257, "step": 30710 }, { - "epoch": 2.0098135426889105, - "grad_norm": 0.9424049258232117, - "learning_rate": 9.607212829754094e-05, - "loss": 0.0872, + "epoch": 8.037684003925417, + "grad_norm": 0.24567954242229462, + "learning_rate": 7.00518237795394e-06, + "loss": 0.0276, "step": 30720 }, { - "epoch": 2.0104677788681715, - "grad_norm": 0.9077602028846741, - "learning_rate": 9.60685586468572e-05, - "loss": 0.0946, + "epoch": 8.04030094864246, + "grad_norm": 0.2553439438343048, + "learning_rate": 6.9908407708381505e-06, + "loss": 0.0246, "step": 30730 }, { - "epoch": 2.011122015047432, - "grad_norm": 0.867131769657135, - "learning_rate": 9.606498744123877e-05, - "loss": 0.0903, + "epoch": 8.042917893359503, + "grad_norm": 0.223582461476326, + "learning_rate": 6.976511472812886e-06, + "loss": 0.0258, "step": 30740 }, { - "epoch": 2.011776251226693, - "grad_norm": 0.7888831496238708, - "learning_rate": 9.606141468080623e-05, - "loss": 0.0829, + "epoch": 8.045534838076545, + "grad_norm": 0.40044087171554565, + "learning_rate": 6.962194493672069e-06, + "loss": 0.0297, "step": 30750 }, { - "epoch": 2.0124304874059535, - "grad_norm": 1.2040573358535767, - "learning_rate": 9.605784036568011e-05, - "loss": 0.1066, + "epoch": 8.048151782793589, + "grad_norm": 0.23338930308818817, + "learning_rate": 6.947889843201233e-06, + "loss": 0.0241, "step": 30760 }, { - "epoch": 2.013084723585214, - "grad_norm": 0.864673376083374, - "learning_rate": 9.605426449598112e-05, - "loss": 0.0929, + "epoch": 8.050768727510631, + "grad_norm": 0.23881715536117554, + "learning_rate": 6.933597531177466e-06, + "loss": 0.0251, "step": 30770 }, { - "epoch": 2.013738959764475, - "grad_norm": 0.7464171648025513, - "learning_rate": 9.60506870718299e-05, - "loss": 0.0877, + "epoch": 8.053385672227675, + "grad_norm": 0.3370324969291687, + "learning_rate": 6.919317567369429e-06, + "loss": 0.027, "step": 30780 }, { - "epoch": 2.0143931959437356, - "grad_norm": 0.8971266150474548, - "learning_rate": 9.604710809334723e-05, - "loss": 0.096, + "epoch": 8.056002616944717, + "grad_norm": 0.24091914296150208, + "learning_rate": 6.905049961537352e-06, + "loss": 0.0227, "step": 30790 }, { - "epoch": 2.0150474321229965, - "grad_norm": 0.9102427959442139, - "learning_rate": 9.60435275606539e-05, - "loss": 0.0975, + "epoch": 8.058619561661759, + "grad_norm": 0.24746572971343994, + "learning_rate": 6.890794723433003e-06, + "loss": 0.0237, "step": 30800 }, { - "epoch": 2.015701668302257, - "grad_norm": 0.8453085422515869, - "learning_rate": 9.603994547387074e-05, - "loss": 0.0989, + "epoch": 8.061236506378803, + "grad_norm": 0.2858780324459076, + "learning_rate": 6.8765518627996936e-06, + "loss": 0.027, "step": 30810 }, { - "epoch": 2.016355904481518, - "grad_norm": 0.9234716892242432, - "learning_rate": 9.603636183311871e-05, - "loss": 0.089, + "epoch": 8.063853451095845, + "grad_norm": 0.2548196315765381, + "learning_rate": 6.8623213893722895e-06, + "loss": 0.027, "step": 30820 }, { - "epoch": 2.0170101406607785, - "grad_norm": 0.8139061331748962, - "learning_rate": 9.60327766385187e-05, - "loss": 0.0854, + "epoch": 8.066470395812889, + "grad_norm": 0.24900494515895844, + "learning_rate": 6.848103312877188e-06, + "loss": 0.0227, "step": 30830 }, { - "epoch": 2.017664376840039, - "grad_norm": 0.8283985257148743, - "learning_rate": 9.602918989019176e-05, - "loss": 0.0881, + "epoch": 8.069087340529931, + "grad_norm": 0.2937477231025696, + "learning_rate": 6.833897643032319e-06, + "loss": 0.0259, "step": 30840 }, { - "epoch": 2.0183186130193, - "grad_norm": 0.780783474445343, - "learning_rate": 9.602560158825896e-05, - "loss": 0.0888, + "epoch": 8.071704285246975, + "grad_norm": 0.25163620710372925, + "learning_rate": 6.819704389547108e-06, + "loss": 0.0222, "step": 30850 }, { - "epoch": 2.0189728491985606, - "grad_norm": 0.6656304001808167, - "learning_rate": 9.602201173284139e-05, - "loss": 0.0792, + "epoch": 8.074321229964017, + "grad_norm": 0.1904536336660385, + "learning_rate": 6.805523562122515e-06, + "loss": 0.0237, "step": 30860 }, { - "epoch": 2.0196270853778215, - "grad_norm": 0.7495284080505371, - "learning_rate": 9.601842032406023e-05, - "loss": 0.0935, + "epoch": 8.07693817468106, + "grad_norm": 0.24911633133888245, + "learning_rate": 6.791355170451005e-06, + "loss": 0.0226, "step": 30870 }, { - "epoch": 2.020281321557082, - "grad_norm": 0.730228066444397, - "learning_rate": 9.60148273620367e-05, - "loss": 0.083, + "epoch": 8.079555119398103, + "grad_norm": 0.27910029888153076, + "learning_rate": 6.777199224216538e-06, + "loss": 0.023, "step": 30880 }, { - "epoch": 2.020935557736343, - "grad_norm": 0.8227654099464417, - "learning_rate": 9.601123284689206e-05, - "loss": 0.0904, + "epoch": 8.082172064115145, + "grad_norm": 0.31502899527549744, + "learning_rate": 6.763055733094578e-06, + "loss": 0.0228, "step": 30890 }, { - "epoch": 2.0215897939156036, - "grad_norm": 0.8423411250114441, - "learning_rate": 9.600763677874764e-05, - "loss": 0.0854, + "epoch": 8.084789008832189, + "grad_norm": 0.3428595960140228, + "learning_rate": 6.7489247067520606e-06, + "loss": 0.0306, "step": 30900 }, { - "epoch": 2.022244030094864, - "grad_norm": 0.9482555985450745, - "learning_rate": 9.600403915772484e-05, - "loss": 0.0973, + "epoch": 8.087405953549231, + "grad_norm": 0.20517821609973907, + "learning_rate": 6.734806154847401e-06, + "loss": 0.0261, "step": 30910 }, { - "epoch": 2.022898266274125, - "grad_norm": 0.7677321434020996, - "learning_rate": 9.600043998394506e-05, - "loss": 0.0847, + "epoch": 8.090022898266275, + "grad_norm": 0.28139781951904297, + "learning_rate": 6.720700087030504e-06, + "loss": 0.0224, "step": 30920 }, { - "epoch": 2.0235525024533856, - "grad_norm": 0.7692193984985352, - "learning_rate": 9.599683925752979e-05, - "loss": 0.1019, + "epoch": 8.092639842983317, + "grad_norm": 0.34407392144203186, + "learning_rate": 6.706606512942734e-06, + "loss": 0.0231, "step": 30930 }, { - "epoch": 2.0242067386326466, - "grad_norm": 0.870507001876831, - "learning_rate": 9.599323697860055e-05, - "loss": 0.0897, + "epoch": 8.09525678770036, + "grad_norm": 0.29121291637420654, + "learning_rate": 6.6925254422169265e-06, + "loss": 0.024, "step": 30940 }, { - "epoch": 2.024860974811907, - "grad_norm": 0.933172345161438, - "learning_rate": 9.598963314727894e-05, - "loss": 0.0922, + "epoch": 8.097873732417403, + "grad_norm": 0.23466309905052185, + "learning_rate": 6.678456884477338e-06, + "loss": 0.0258, "step": 30950 }, { - "epoch": 2.0255152109911676, - "grad_norm": 0.8484349250793457, - "learning_rate": 9.598602776368661e-05, - "loss": 0.0927, + "epoch": 8.100490677134445, + "grad_norm": 0.3875909447669983, + "learning_rate": 6.664400849339708e-06, + "loss": 0.0281, "step": 30960 }, { - "epoch": 2.0261694471704286, - "grad_norm": 0.7965954542160034, - "learning_rate": 9.598242082794524e-05, - "loss": 0.0983, + "epoch": 8.103107621851489, + "grad_norm": 0.2323644906282425, + "learning_rate": 6.6503573464112065e-06, + "loss": 0.0239, "step": 30970 }, { - "epoch": 2.026823683349689, - "grad_norm": 0.7531421184539795, - "learning_rate": 9.597881234017657e-05, - "loss": 0.0841, + "epoch": 8.105724566568531, + "grad_norm": 0.41235247254371643, + "learning_rate": 6.636326385290429e-06, + "loss": 0.0228, "step": 30980 }, { - "epoch": 2.02747791952895, - "grad_norm": 0.9377996325492859, - "learning_rate": 9.597520230050242e-05, - "loss": 0.1034, + "epoch": 8.108341511285573, + "grad_norm": 0.3516373038291931, + "learning_rate": 6.6223079755674154e-06, + "loss": 0.0254, "step": 30990 }, { - "epoch": 2.0281321557082106, - "grad_norm": 0.8710869550704956, - "learning_rate": 9.597159070904458e-05, - "loss": 0.0923, + "epoch": 8.110958456002617, + "grad_norm": 0.4276266098022461, + "learning_rate": 6.608302126823609e-06, + "loss": 0.0244, "step": 31000 }, { - "epoch": 2.0287863918874716, - "grad_norm": 0.9062069058418274, - "learning_rate": 9.596797756592502e-05, - "loss": 0.0931, + "epoch": 8.110958456002617, + "eval_loss": 0.029148803018330102, + "eval_runtime": 9.3161, + "eval_samples_per_second": 109.917, + "eval_steps_per_second": 1.717, + "step": 31000 + }, + { + "epoch": 8.11357540071966, + "grad_norm": 0.275944322347641, + "learning_rate": 6.594308848631869e-06, + "loss": 0.0228, "step": 31010 }, { - "epoch": 2.029440628066732, - "grad_norm": 1.0289934873580933, - "learning_rate": 9.596436287126565e-05, - "loss": 0.0977, + "epoch": 8.116192345436703, + "grad_norm": 0.3111572265625, + "learning_rate": 6.580328150556478e-06, + "loss": 0.0261, "step": 31020 }, { - "epoch": 2.0300948642459926, - "grad_norm": 0.9156734943389893, - "learning_rate": 9.596074662518848e-05, - "loss": 0.0874, + "epoch": 8.118809290153745, + "grad_norm": 0.22269070148468018, + "learning_rate": 6.5663600421531055e-06, + "loss": 0.0225, "step": 31030 }, { - "epoch": 2.0307491004252536, - "grad_norm": 0.9219240546226501, - "learning_rate": 9.595712882781558e-05, - "loss": 0.099, + "epoch": 8.121426234870789, + "grad_norm": 0.21635177731513977, + "learning_rate": 6.552404532968834e-06, + "loss": 0.0256, "step": 31040 }, { - "epoch": 2.031403336604514, - "grad_norm": 0.9066693186759949, - "learning_rate": 9.595350947926907e-05, - "loss": 0.09, + "epoch": 8.124043179587831, + "grad_norm": 0.24473895132541656, + "learning_rate": 6.538461632542106e-06, + "loss": 0.0292, "step": 31050 }, { - "epoch": 2.032057572783775, - "grad_norm": 0.8575799465179443, - "learning_rate": 9.594988857967106e-05, - "loss": 0.0895, + "epoch": 8.126660124304873, + "grad_norm": 0.25067785382270813, + "learning_rate": 6.524531350402771e-06, + "loss": 0.0233, "step": 31060 }, { - "epoch": 2.0327118089630356, - "grad_norm": 0.977655291557312, - "learning_rate": 9.594626612914383e-05, - "loss": 0.0949, + "epoch": 8.129277069021917, + "grad_norm": 0.3121054470539093, + "learning_rate": 6.510613696072046e-06, + "loss": 0.0255, "step": 31070 }, { - "epoch": 2.0333660451422966, - "grad_norm": 0.8753423094749451, - "learning_rate": 9.594264212780962e-05, - "loss": 0.0801, + "epoch": 8.13189401373896, + "grad_norm": 0.4683177173137665, + "learning_rate": 6.4967086790625185e-06, + "loss": 0.0256, "step": 31080 }, { - "epoch": 2.034020281321557, - "grad_norm": 0.7495356798171997, - "learning_rate": 9.593901657579075e-05, - "loss": 0.096, + "epoch": 8.134510958456003, + "grad_norm": 0.25463202595710754, + "learning_rate": 6.482816308878129e-06, + "loss": 0.0262, "step": 31090 }, { - "epoch": 2.0346745175008176, - "grad_norm": 0.8719765543937683, - "learning_rate": 9.593538947320959e-05, - "loss": 0.0975, + "epoch": 8.137127903173045, + "grad_norm": 0.254255086183548, + "learning_rate": 6.468936595014194e-06, + "loss": 0.0237, "step": 31100 }, { - "epoch": 2.0353287536800786, - "grad_norm": 0.7684383988380432, - "learning_rate": 9.593176082018855e-05, - "loss": 0.0857, + "epoch": 8.13974484789009, + "grad_norm": 0.3515304923057556, + "learning_rate": 6.4550695469573485e-06, + "loss": 0.0272, "step": 31110 }, { - "epoch": 2.035982989859339, - "grad_norm": 1.1163846254348755, - "learning_rate": 9.592813061685015e-05, - "loss": 0.1057, + "epoch": 8.142361792607131, + "grad_norm": 0.19320529699325562, + "learning_rate": 6.441215174185602e-06, + "loss": 0.0249, "step": 31120 }, { - "epoch": 2.0366372260386, - "grad_norm": 0.8532420992851257, - "learning_rate": 9.592449886331687e-05, - "loss": 0.0969, + "epoch": 8.144978737324173, + "grad_norm": 0.27671656012535095, + "learning_rate": 6.427373486168284e-06, + "loss": 0.0262, "step": 31130 }, { - "epoch": 2.0372914622178606, - "grad_norm": 0.8979114890098572, - "learning_rate": 9.592086555971131e-05, - "loss": 0.0875, + "epoch": 8.147595682041217, + "grad_norm": 0.23765401542186737, + "learning_rate": 6.413544492366066e-06, + "loss": 0.0254, "step": 31140 }, { - "epoch": 2.037945698397121, - "grad_norm": 0.8417396545410156, - "learning_rate": 9.591723070615612e-05, - "loss": 0.0906, + "epoch": 8.15021262675826, + "grad_norm": 0.3123226761817932, + "learning_rate": 6.39972820223092e-06, + "loss": 0.0241, "step": 31150 }, { - "epoch": 2.038599934576382, - "grad_norm": 1.090818166732788, - "learning_rate": 9.591359430277396e-05, - "loss": 0.0861, + "epoch": 8.152829571475303, + "grad_norm": 0.3324793577194214, + "learning_rate": 6.385924625206158e-06, + "loss": 0.0224, "step": 31160 }, { - "epoch": 2.0392541707556426, - "grad_norm": 0.6975811123847961, - "learning_rate": 9.590995634968759e-05, - "loss": 0.0902, + "epoch": 8.155446516192345, + "grad_norm": 0.35988208651542664, + "learning_rate": 6.372133770726396e-06, + "loss": 0.0244, "step": 31170 }, { - "epoch": 2.0399084069349036, - "grad_norm": 0.996051549911499, - "learning_rate": 9.590631684701979e-05, - "loss": 0.0854, + "epoch": 8.15806346090939, + "grad_norm": 0.4930897057056427, + "learning_rate": 6.358355648217556e-06, + "loss": 0.0266, "step": 31180 }, { - "epoch": 2.040562643114164, - "grad_norm": 0.9619652628898621, - "learning_rate": 9.590267579489338e-05, - "loss": 0.0975, + "epoch": 8.160680405626431, + "grad_norm": 0.4564793109893799, + "learning_rate": 6.344590267096845e-06, + "loss": 0.0256, "step": 31190 }, { - "epoch": 2.041216879293425, - "grad_norm": 0.8321636319160461, - "learning_rate": 9.589903319343129e-05, - "loss": 0.0883, + "epoch": 8.163297350343473, + "grad_norm": 0.4039897322654724, + "learning_rate": 6.330837636772782e-06, + "loss": 0.0293, "step": 31200 }, { - "epoch": 2.0418711154726856, - "grad_norm": 0.8303142189979553, - "learning_rate": 9.589538904275645e-05, - "loss": 0.086, + "epoch": 8.165914295060517, + "grad_norm": 0.40973928570747375, + "learning_rate": 6.31709776664515e-06, + "loss": 0.0244, "step": 31210 }, { - "epoch": 2.042525351651946, - "grad_norm": 0.7571492195129395, - "learning_rate": 9.589174334299189e-05, - "loss": 0.0857, + "epoch": 8.16853123977756, + "grad_norm": 0.5403375625610352, + "learning_rate": 6.303370666105024e-06, + "loss": 0.0239, "step": 31220 }, { - "epoch": 2.043179587831207, - "grad_norm": 0.6824153065681458, - "learning_rate": 9.588809609426061e-05, - "loss": 0.0876, + "epoch": 8.171148184494603, + "grad_norm": 0.30449724197387695, + "learning_rate": 6.289656344534747e-06, + "loss": 0.0251, "step": 31230 }, { - "epoch": 2.0438338240104676, - "grad_norm": 0.907193660736084, - "learning_rate": 9.588444729668575e-05, - "loss": 0.0847, + "epoch": 8.173765129211645, + "grad_norm": 0.23199830949306488, + "learning_rate": 6.275954811307941e-06, + "loss": 0.022, "step": 31240 }, { - "epoch": 2.0444880601897286, - "grad_norm": 0.9690275192260742, - "learning_rate": 9.588079695039047e-05, - "loss": 0.096, + "epoch": 8.176382073928687, + "grad_norm": 0.12945488095283508, + "learning_rate": 6.262266075789455e-06, + "loss": 0.0204, "step": 31250 }, { - "epoch": 2.045142296368989, - "grad_norm": 0.8058456182479858, - "learning_rate": 9.587714505549796e-05, - "loss": 0.0905, + "epoch": 8.178999018645731, + "grad_norm": 0.3655667006969452, + "learning_rate": 6.2485901473354205e-06, + "loss": 0.029, "step": 31260 }, { - "epoch": 2.04579653254825, - "grad_norm": 0.9792216420173645, - "learning_rate": 9.587349161213148e-05, - "loss": 0.0956, + "epoch": 8.181615963362773, + "grad_norm": 0.3365057110786438, + "learning_rate": 6.234927035293212e-06, + "loss": 0.0222, "step": 31270 }, { - "epoch": 2.0464507687275106, - "grad_norm": 0.8495068550109863, - "learning_rate": 9.586983662041434e-05, - "loss": 0.0937, + "epoch": 8.184232908079817, + "grad_norm": 0.2938461899757385, + "learning_rate": 6.2212767490014225e-06, + "loss": 0.0269, "step": 31280 }, { - "epoch": 2.047105004906771, - "grad_norm": 0.9974722862243652, - "learning_rate": 9.586618008046992e-05, - "loss": 0.086, + "epoch": 8.18684985279686, + "grad_norm": 0.22843977808952332, + "learning_rate": 6.207639297789905e-06, + "loss": 0.0264, "step": 31290 }, { - "epoch": 2.047759241086032, - "grad_norm": 0.8604632616043091, - "learning_rate": 9.586252199242166e-05, - "loss": 0.0877, + "epoch": 8.189466797513903, + "grad_norm": 0.21421143412590027, + "learning_rate": 6.19401469097973e-06, + "loss": 0.0233, "step": 31300 }, { - "epoch": 2.0484134772652927, - "grad_norm": 0.803986668586731, - "learning_rate": 9.585886235639299e-05, - "loss": 0.0971, + "epoch": 8.192083742230945, + "grad_norm": 0.28643175959587097, + "learning_rate": 6.1804029378831785e-06, + "loss": 0.0214, "step": 31310 }, { - "epoch": 2.0490677134445536, - "grad_norm": 0.6995072960853577, - "learning_rate": 9.585520117250744e-05, - "loss": 0.0965, + "epoch": 8.194700686947987, + "grad_norm": 0.3194423317909241, + "learning_rate": 6.166804047803762e-06, + "loss": 0.0246, "step": 31320 }, { - "epoch": 2.049721949623814, - "grad_norm": 0.9633769989013672, - "learning_rate": 9.585153844088858e-05, - "loss": 0.0906, + "epoch": 8.197317631665031, + "grad_norm": 0.23330186307430267, + "learning_rate": 6.15321803003619e-06, + "loss": 0.0258, "step": 31330 }, { - "epoch": 2.050376185803075, - "grad_norm": 0.7706478238105774, - "learning_rate": 9.584787416166006e-05, - "loss": 0.0924, + "epoch": 8.199934576382073, + "grad_norm": 0.2894728183746338, + "learning_rate": 6.139644893866389e-06, + "loss": 0.0266, "step": 31340 }, { - "epoch": 2.0510304219823356, - "grad_norm": 0.7993813753128052, - "learning_rate": 9.584420833494555e-05, - "loss": 0.0945, + "epoch": 8.202551521099117, + "grad_norm": 0.2600817084312439, + "learning_rate": 6.126084648571453e-06, + "loss": 0.0252, "step": 31350 }, { - "epoch": 2.051684658161596, - "grad_norm": 0.7742961645126343, - "learning_rate": 9.584054096086877e-05, - "loss": 0.0973, + "epoch": 8.20516846581616, + "grad_norm": 0.31294360756874084, + "learning_rate": 6.112537303419696e-06, + "loss": 0.0244, "step": 31360 }, { - "epoch": 2.052338894340857, - "grad_norm": 0.7687771320343018, - "learning_rate": 9.583687203955352e-05, - "loss": 0.0907, + "epoch": 8.207785410533203, + "grad_norm": 0.39260825514793396, + "learning_rate": 6.0990028676705866e-06, + "loss": 0.0266, "step": 31370 }, { - "epoch": 2.0529931305201177, - "grad_norm": 0.7839317917823792, - "learning_rate": 9.583320157112362e-05, - "loss": 0.0888, + "epoch": 8.210402355250245, + "grad_norm": 0.21804678440093994, + "learning_rate": 6.085481350574792e-06, + "loss": 0.024, "step": 31380 }, { - "epoch": 2.0536473666993786, - "grad_norm": 0.891391396522522, - "learning_rate": 9.582952955570297e-05, - "loss": 0.0929, + "epoch": 8.213019299967288, + "grad_norm": 0.4497775435447693, + "learning_rate": 6.071972761374142e-06, + "loss": 0.0245, "step": 31390 }, { - "epoch": 2.054301602878639, - "grad_norm": 0.699019730091095, - "learning_rate": 9.58258559934155e-05, - "loss": 0.0879, + "epoch": 8.215636244684331, + "grad_norm": 0.3159434199333191, + "learning_rate": 6.058477109301633e-06, + "loss": 0.0246, "step": 31400 }, { - "epoch": 2.0549558390578997, - "grad_norm": 0.765687108039856, - "learning_rate": 9.582218088438522e-05, - "loss": 0.0901, + "epoch": 8.218253189401374, + "grad_norm": 0.4656051695346832, + "learning_rate": 6.044994403581408e-06, + "loss": 0.0257, "step": 31410 }, { - "epoch": 2.0556100752371607, - "grad_norm": 0.7088882327079773, - "learning_rate": 9.581850422873615e-05, - "loss": 0.0878, + "epoch": 8.220870134118417, + "grad_norm": 0.31082114577293396, + "learning_rate": 6.031524653428772e-06, + "loss": 0.025, "step": 31420 }, { - "epoch": 2.056264311416421, - "grad_norm": 0.9518930315971375, - "learning_rate": 9.58148260265924e-05, - "loss": 0.0831, + "epoch": 8.22348707883546, + "grad_norm": 0.35635101795196533, + "learning_rate": 6.018067868050173e-06, + "loss": 0.0233, "step": 31430 }, { - "epoch": 2.056918547595682, - "grad_norm": 0.6729696393013, - "learning_rate": 9.581114627807812e-05, - "loss": 0.0931, + "epoch": 8.226104023552502, + "grad_norm": 0.36490118503570557, + "learning_rate": 6.004624056643205e-06, + "loss": 0.026, "step": 31440 }, { - "epoch": 2.0575727837749427, - "grad_norm": 0.8468174338340759, - "learning_rate": 9.58074649833175e-05, - "loss": 0.0958, + "epoch": 8.228720968269545, + "grad_norm": 0.37780648469924927, + "learning_rate": 5.991193228396571e-06, + "loss": 0.0237, "step": 31450 }, { - "epoch": 2.0582270199542037, - "grad_norm": 0.8482375144958496, - "learning_rate": 9.580378214243482e-05, - "loss": 0.09, + "epoch": 8.231337912986588, + "grad_norm": 0.23720017075538635, + "learning_rate": 5.977775392490128e-06, + "loss": 0.0215, "step": 31460 }, { - "epoch": 2.058881256133464, - "grad_norm": 0.7801223397254944, - "learning_rate": 9.580009775555435e-05, - "loss": 0.0862, + "epoch": 8.233954857703631, + "grad_norm": 0.22219017148017883, + "learning_rate": 5.964370558094831e-06, + "loss": 0.0218, "step": 31470 }, { - "epoch": 2.0595354923127247, - "grad_norm": 0.7654222249984741, - "learning_rate": 9.579641182280049e-05, - "loss": 0.0897, + "epoch": 8.236571802420674, + "grad_norm": 0.19312626123428345, + "learning_rate": 5.950978734372764e-06, + "loss": 0.0251, "step": 31480 }, { - "epoch": 2.0601897284919857, - "grad_norm": 0.9105095267295837, - "learning_rate": 9.57927243442976e-05, - "loss": 0.0818, + "epoch": 8.239188747137717, + "grad_norm": 0.2890303432941437, + "learning_rate": 5.937599930477108e-06, + "loss": 0.0274, "step": 31490 }, { - "epoch": 2.060843964671246, - "grad_norm": 0.8952615261077881, - "learning_rate": 9.578903532017017e-05, - "loss": 0.0898, + "epoch": 8.24180569185476, + "grad_norm": 0.19071075320243835, + "learning_rate": 5.924234155552158e-06, + "loss": 0.0226, "step": 31500 }, { - "epoch": 2.061498200850507, - "grad_norm": 0.9962287545204163, - "learning_rate": 9.578534475054272e-05, - "loss": 0.0912, + "epoch": 8.244422636571802, + "grad_norm": 0.2616288363933563, + "learning_rate": 5.910881418733283e-06, + "loss": 0.023, "step": 31510 }, { - "epoch": 2.0621524370297677, - "grad_norm": 0.9723460674285889, - "learning_rate": 9.578165263553982e-05, - "loss": 0.0816, + "epoch": 8.247039581288846, + "grad_norm": 0.22435951232910156, + "learning_rate": 5.89754172914696e-06, + "loss": 0.023, "step": 31520 }, { - "epoch": 2.0628066732090287, - "grad_norm": 0.8590718507766724, - "learning_rate": 9.577795897528605e-05, - "loss": 0.0866, + "epoch": 8.249656526005888, + "grad_norm": 0.2875134348869324, + "learning_rate": 5.884215095910739e-06, + "loss": 0.0244, "step": 31530 }, { - "epoch": 2.063460909388289, - "grad_norm": 0.8612424731254578, - "learning_rate": 9.577426376990613e-05, - "loss": 0.0951, + "epoch": 8.252273470722931, + "grad_norm": 0.2992618680000305, + "learning_rate": 5.870901528133255e-06, + "loss": 0.0219, "step": 31540 }, { - "epoch": 2.0641151455675497, - "grad_norm": 0.8139830231666565, - "learning_rate": 9.577056701952474e-05, - "loss": 0.0836, + "epoch": 8.254890415439974, + "grad_norm": 0.5597621202468872, + "learning_rate": 5.857601034914201e-06, + "loss": 0.0256, "step": 31550 }, { - "epoch": 2.0647693817468107, - "grad_norm": 0.7661455869674683, - "learning_rate": 9.57668687242667e-05, - "loss": 0.0822, + "epoch": 8.257507360157017, + "grad_norm": 0.3123728632926941, + "learning_rate": 5.844313625344331e-06, + "loss": 0.0224, "step": 31560 }, { - "epoch": 2.065423617926071, - "grad_norm": 0.7846705317497253, - "learning_rate": 9.576316888425681e-05, - "loss": 0.089, + "epoch": 8.26012430487406, + "grad_norm": 0.1956491321325302, + "learning_rate": 5.831039308505467e-06, + "loss": 0.0237, "step": 31570 }, { - "epoch": 2.066077854105332, - "grad_norm": 0.9100658893585205, - "learning_rate": 9.575946749961992e-05, - "loss": 0.078, + "epoch": 8.262741249591102, + "grad_norm": 0.31507089734077454, + "learning_rate": 5.817778093470486e-06, + "loss": 0.0242, "step": 31580 }, { - "epoch": 2.0667320902845927, - "grad_norm": 0.8876975178718567, - "learning_rate": 9.575576457048102e-05, - "loss": 0.0923, + "epoch": 8.265358194308146, + "grad_norm": 0.4155109226703644, + "learning_rate": 5.804529989303301e-06, + "loss": 0.0223, "step": 31590 }, { - "epoch": 2.0673863264638532, - "grad_norm": 0.9435741901397705, - "learning_rate": 9.575206009696507e-05, - "loss": 0.0967, + "epoch": 8.267975139025188, + "grad_norm": 0.23790472745895386, + "learning_rate": 5.7912950050588725e-06, + "loss": 0.0217, "step": 31600 }, { - "epoch": 2.068040562643114, - "grad_norm": 0.7191019058227539, - "learning_rate": 9.574835407919709e-05, - "loss": 0.0965, + "epoch": 8.270592083742232, + "grad_norm": 0.20042049884796143, + "learning_rate": 5.778073149783172e-06, + "loss": 0.02, "step": 31610 }, { - "epoch": 2.0686947988223747, - "grad_norm": 0.7950155735015869, - "learning_rate": 9.574464651730219e-05, - "loss": 0.0873, + "epoch": 8.273209028459274, + "grad_norm": 0.272510290145874, + "learning_rate": 5.764864432513226e-06, + "loss": 0.028, "step": 31620 }, { - "epoch": 2.0693490350016357, - "grad_norm": 0.7821862697601318, - "learning_rate": 9.574093741140549e-05, - "loss": 0.0901, + "epoch": 8.275825973176318, + "grad_norm": 0.3318398892879486, + "learning_rate": 5.75166886227706e-06, + "loss": 0.0256, "step": 31630 }, { - "epoch": 2.070003271180896, - "grad_norm": 0.7659751176834106, - "learning_rate": 9.57372267616322e-05, - "loss": 0.0866, + "epoch": 8.27844291789336, + "grad_norm": 0.26033854484558105, + "learning_rate": 5.738486448093733e-06, + "loss": 0.025, "step": 31640 }, { - "epoch": 2.070657507360157, - "grad_norm": 0.8592097759246826, - "learning_rate": 9.573351456810755e-05, - "loss": 0.0871, + "epoch": 8.281059862610402, + "grad_norm": 0.21934951841831207, + "learning_rate": 5.725317198973296e-06, + "loss": 0.0251, "step": 31650 }, { - "epoch": 2.0713117435394177, - "grad_norm": 0.9904183149337769, - "learning_rate": 9.572980083095684e-05, - "loss": 0.0931, + "epoch": 8.283676807327446, + "grad_norm": 0.2376284897327423, + "learning_rate": 5.712161123916795e-06, + "loss": 0.0245, "step": 31660 }, { - "epoch": 2.0719659797186782, - "grad_norm": 0.8140203952789307, - "learning_rate": 9.572608555030543e-05, - "loss": 0.0921, + "epoch": 8.286293752044488, + "grad_norm": 0.3624178469181061, + "learning_rate": 5.699018231916292e-06, + "loss": 0.0201, "step": 31670 }, { - "epoch": 2.072620215897939, - "grad_norm": 1.019144058227539, - "learning_rate": 9.57223687262787e-05, - "loss": 0.0873, + "epoch": 8.288910696761532, + "grad_norm": 0.34957781434059143, + "learning_rate": 5.685888531954831e-06, + "loss": 0.0244, "step": 31680 }, { - "epoch": 2.0732744520771997, - "grad_norm": 0.8662041425704956, - "learning_rate": 9.571865035900213e-05, - "loss": 0.0883, + "epoch": 8.291527641478574, + "grad_norm": 0.42351824045181274, + "learning_rate": 5.672772033006437e-06, + "loss": 0.0256, "step": 31690 }, { - "epoch": 2.0739286882564607, - "grad_norm": 0.7000994682312012, - "learning_rate": 9.571493044860121e-05, - "loss": 0.0819, + "epoch": 8.294144586195616, + "grad_norm": 0.25669652223587036, + "learning_rate": 5.65966874403612e-06, + "loss": 0.0222, "step": 31700 }, { - "epoch": 2.0745829244357212, - "grad_norm": 0.9109962582588196, - "learning_rate": 9.571120899520148e-05, - "loss": 0.0957, + "epoch": 8.29676153091266, + "grad_norm": 0.30895277857780457, + "learning_rate": 5.646578673999841e-06, + "loss": 0.0246, "step": 31710 }, { - "epoch": 2.075237160614982, - "grad_norm": 0.7647160887718201, - "learning_rate": 9.570748599892858e-05, - "loss": 0.0833, + "epoch": 8.299378475629702, + "grad_norm": 0.1788632720708847, + "learning_rate": 5.6335018318445485e-06, + "loss": 0.0238, "step": 31720 }, { - "epoch": 2.0758913967942427, - "grad_norm": 0.8084906339645386, - "learning_rate": 9.570376145990814e-05, - "loss": 0.086, + "epoch": 8.301995420346746, + "grad_norm": 0.33750054240226746, + "learning_rate": 5.620438226508138e-06, + "loss": 0.0243, "step": 31730 }, { - "epoch": 2.0765456329735033, - "grad_norm": 0.9867994785308838, - "learning_rate": 9.57000353782659e-05, - "loss": 0.0799, + "epoch": 8.304612365063788, + "grad_norm": 0.2295209765434265, + "learning_rate": 5.607387866919467e-06, + "loss": 0.0208, "step": 31740 }, { - "epoch": 2.0771998691527642, - "grad_norm": 0.9291912317276001, - "learning_rate": 9.569630775412762e-05, - "loss": 0.0904, + "epoch": 8.307229309780832, + "grad_norm": 0.1997835338115692, + "learning_rate": 5.59435076199833e-06, + "loss": 0.0255, "step": 31750 }, { - "epoch": 2.0778541053320247, - "grad_norm": 0.9430752992630005, - "learning_rate": 9.569257858761909e-05, - "loss": 0.0868, + "epoch": 8.309846254497874, + "grad_norm": 0.17285549640655518, + "learning_rate": 5.581326920655452e-06, + "loss": 0.0236, "step": 31760 }, { - "epoch": 2.0785083415112857, - "grad_norm": 0.7106003165245056, - "learning_rate": 9.568884787886621e-05, - "loss": 0.0909, + "epoch": 8.312463199214916, + "grad_norm": 0.22822895646095276, + "learning_rate": 5.5683163517925215e-06, + "loss": 0.0254, "step": 31770 }, { - "epoch": 2.0791625776905462, - "grad_norm": 1.0632734298706055, - "learning_rate": 9.56851156279949e-05, - "loss": 0.0883, + "epoch": 8.31508014393196, + "grad_norm": 0.27281373739242554, + "learning_rate": 5.55531906430213e-06, + "loss": 0.0253, "step": 31780 }, { - "epoch": 2.079816813869807, - "grad_norm": 0.8719366192817688, - "learning_rate": 9.568138183513111e-05, - "loss": 0.0842, + "epoch": 8.317697088649002, + "grad_norm": 0.24460025131702423, + "learning_rate": 5.542335067067808e-06, + "loss": 0.0282, "step": 31790 }, { - "epoch": 2.0804710500490677, - "grad_norm": 0.806563675403595, - "learning_rate": 9.567764650040087e-05, - "loss": 0.0993, + "epoch": 8.320314033366046, + "grad_norm": 0.1998259276151657, + "learning_rate": 5.529364368963999e-06, + "loss": 0.026, "step": 31800 }, { - "epoch": 2.0811252862283283, - "grad_norm": 0.8879052996635437, - "learning_rate": 9.567390962393029e-05, - "loss": 0.0953, + "epoch": 8.322930978083088, + "grad_norm": 0.3485002815723419, + "learning_rate": 5.516406978856043e-06, + "loss": 0.0197, "step": 31810 }, { - "epoch": 2.0817795224075892, - "grad_norm": 0.8709869384765625, - "learning_rate": 9.567017120584545e-05, - "loss": 0.087, + "epoch": 8.325547922800132, + "grad_norm": 0.24614469707012177, + "learning_rate": 5.503462905600193e-06, + "loss": 0.0263, "step": 31820 }, { - "epoch": 2.0824337585868498, - "grad_norm": 0.7416978478431702, - "learning_rate": 9.566643124627258e-05, - "loss": 0.0861, + "epoch": 8.328164867517174, + "grad_norm": 0.3442598283290863, + "learning_rate": 5.490532158043616e-06, + "loss": 0.0294, "step": 31830 }, { - "epoch": 2.0830879947661107, - "grad_norm": 0.8955287337303162, - "learning_rate": 9.566268974533789e-05, - "loss": 0.0933, + "epoch": 8.330781812234216, + "grad_norm": 0.20003820955753326, + "learning_rate": 5.477614745024337e-06, + "loss": 0.0206, "step": 31840 }, { - "epoch": 2.0837422309453713, - "grad_norm": 0.8596556186676025, - "learning_rate": 9.565894670316767e-05, - "loss": 0.0911, + "epoch": 8.33339875695126, + "grad_norm": 0.28994110226631165, + "learning_rate": 5.464710675371301e-06, + "loss": 0.0254, "step": 31850 }, { - "epoch": 2.084396467124632, - "grad_norm": 0.8234604597091675, - "learning_rate": 9.565520211988823e-05, - "loss": 0.0864, + "epoch": 8.336015701668302, + "grad_norm": 0.2949778735637665, + "learning_rate": 5.451819957904305e-06, + "loss": 0.0289, "step": 31860 }, { - "epoch": 2.0850507033038927, - "grad_norm": 0.862963855266571, - "learning_rate": 9.5651455995626e-05, - "loss": 0.0942, + "epoch": 8.338632646385346, + "grad_norm": 0.1833481788635254, + "learning_rate": 5.438942601434041e-06, + "loss": 0.0233, "step": 31870 }, { - "epoch": 2.0857049394831533, - "grad_norm": 0.8025641441345215, - "learning_rate": 9.56477083305074e-05, - "loss": 0.0871, + "epoch": 8.341249591102388, + "grad_norm": 0.2757635712623596, + "learning_rate": 5.426078614762059e-06, + "loss": 0.0236, "step": 31880 }, { - "epoch": 2.0863591756624142, - "grad_norm": 0.7878461480140686, - "learning_rate": 9.564395912465893e-05, - "loss": 0.0889, + "epoch": 8.34386653581943, + "grad_norm": 0.49641239643096924, + "learning_rate": 5.413228006680771e-06, + "loss": 0.0279, "step": 31890 }, { - "epoch": 2.0870134118416748, - "grad_norm": 0.8542391061782837, - "learning_rate": 9.564020837820713e-05, - "loss": 0.0936, + "epoch": 8.346483480536474, + "grad_norm": 0.159059077501297, + "learning_rate": 5.400390785973455e-06, + "loss": 0.0257, "step": 31900 }, { - "epoch": 2.0876676480209357, - "grad_norm": 0.7547128796577454, - "learning_rate": 9.56364560912786e-05, - "loss": 0.0922, + "epoch": 8.349100425253516, + "grad_norm": 0.1922084391117096, + "learning_rate": 5.38756696141422e-06, + "loss": 0.0258, "step": 31910 }, { - "epoch": 2.0883218842001963, - "grad_norm": 0.7220647931098938, - "learning_rate": 9.563270226400001e-05, - "loss": 0.0832, + "epoch": 8.35171736997056, + "grad_norm": 0.14421360194683075, + "learning_rate": 5.3747565417680365e-06, + "loss": 0.0226, "step": 31920 }, { - "epoch": 2.088976120379457, - "grad_norm": 0.9882017374038696, - "learning_rate": 9.562894689649802e-05, - "loss": 0.0984, + "epoch": 8.354334314687602, + "grad_norm": 0.24148830771446228, + "learning_rate": 5.361959535790695e-06, + "loss": 0.0197, "step": 31930 }, { - "epoch": 2.0896303565587178, - "grad_norm": 0.949703574180603, - "learning_rate": 9.562518998889942e-05, - "loss": 0.0903, + "epoch": 8.356951259404646, + "grad_norm": 0.3264545202255249, + "learning_rate": 5.349175952228838e-06, + "loss": 0.023, "step": 31940 }, { - "epoch": 2.0902845927379783, - "grad_norm": 1.0962672233581543, - "learning_rate": 9.562143154133099e-05, - "loss": 0.0848, + "epoch": 8.359568204121688, + "grad_norm": 0.23968908190727234, + "learning_rate": 5.336405799819924e-06, + "loss": 0.0257, "step": 31950 }, { - "epoch": 2.0909388289172393, - "grad_norm": 0.9452337622642517, - "learning_rate": 9.561767155391961e-05, - "loss": 0.0901, + "epoch": 8.36218514883873, + "grad_norm": 0.30460986495018005, + "learning_rate": 5.323649087292226e-06, + "loss": 0.0299, "step": 31960 }, { - "epoch": 2.0915930650965, - "grad_norm": 0.9834058880805969, - "learning_rate": 9.561391002679217e-05, - "loss": 0.0849, + "epoch": 8.364802093555774, + "grad_norm": 0.548783004283905, + "learning_rate": 5.3109058233648365e-06, + "loss": 0.022, "step": 31970 }, { - "epoch": 2.0922473012757608, - "grad_norm": 0.8670421838760376, - "learning_rate": 9.561014696007565e-05, - "loss": 0.0926, + "epoch": 8.367419038272816, + "grad_norm": 0.3376639485359192, + "learning_rate": 5.298176016747664e-06, + "loss": 0.0277, "step": 31980 }, { - "epoch": 2.0929015374550213, - "grad_norm": 0.8581560254096985, - "learning_rate": 9.560638235389704e-05, - "loss": 0.0923, + "epoch": 8.37003598298986, + "grad_norm": 0.2433539777994156, + "learning_rate": 5.285459676141405e-06, + "loss": 0.0226, "step": 31990 }, { - "epoch": 2.093555773634282, - "grad_norm": 0.9156831502914429, - "learning_rate": 9.560261620838342e-05, - "loss": 0.0951, + "epoch": 8.372652927706902, + "grad_norm": 0.30453231930732727, + "learning_rate": 5.272756810237567e-06, + "loss": 0.0214, "step": 32000 }, { - "epoch": 2.0942100098135428, - "grad_norm": 0.6964516639709473, - "learning_rate": 9.559884852366191e-05, - "loss": 0.0958, - "step": 32010 - }, - { - "epoch": 2.0948642459928033, - "grad_norm": 0.859331488609314, - "learning_rate": 9.559507929985968e-05, - "loss": 0.0784, - "step": 32020 - }, - { - "epoch": 2.0955184821720643, - "grad_norm": 0.8123635649681091, - "learning_rate": 9.559130853710395e-05, - "loss": 0.0899, - "step": 32030 - }, - { - "epoch": 2.096172718351325, - "grad_norm": 0.7182052731513977, - "learning_rate": 9.558753623552197e-05, - "loss": 0.0874, - "step": 32040 - }, - { - "epoch": 2.0968269545305853, - "grad_norm": 0.8559288382530212, - "learning_rate": 9.558376239524109e-05, - "loss": 0.0788, - "step": 32050 - }, - { - "epoch": 2.0974811907098463, - "grad_norm": 0.8461546301841736, - "learning_rate": 9.557998701638868e-05, - "loss": 0.0858, - "step": 32060 - }, - { - "epoch": 2.098135426889107, - "grad_norm": 0.8831964135169983, - "learning_rate": 9.557621009909218e-05, - "loss": 0.1007, - "step": 32070 - }, - { - "epoch": 2.098789663068368, - "grad_norm": 0.8673757910728455, - "learning_rate": 9.557243164347907e-05, - "loss": 0.0876, - "step": 32080 - }, - { - "epoch": 2.0994438992476283, - "grad_norm": 0.9492565393447876, - "learning_rate": 9.556865164967685e-05, - "loss": 0.0919, - "step": 32090 - }, - { - "epoch": 2.1000981354268893, - "grad_norm": 0.8999154567718506, - "learning_rate": 9.556487011781314e-05, - "loss": 0.087, - "step": 32100 - }, - { - "epoch": 2.10075237160615, - "grad_norm": 0.7991196513175964, - "learning_rate": 9.556108704801558e-05, - "loss": 0.0878, - "step": 32110 - }, - { - "epoch": 2.1014066077854103, - "grad_norm": 0.8117789030075073, - "learning_rate": 9.555730244041182e-05, - "loss": 0.0961, - "step": 32120 - }, - { - "epoch": 2.1020608439646713, - "grad_norm": 0.8746902346611023, - "learning_rate": 9.555351629512963e-05, - "loss": 0.0885, - "step": 32130 - }, - { - "epoch": 2.102715080143932, - "grad_norm": 1.0785053968429565, - "learning_rate": 9.55497286122968e-05, - "loss": 0.0928, - "step": 32140 - }, - { - "epoch": 2.103369316323193, - "grad_norm": 0.9100284576416016, - "learning_rate": 9.554593939204117e-05, - "loss": 0.0856, - "step": 32150 - }, - { - "epoch": 2.1040235525024533, - "grad_norm": 0.7973883748054504, - "learning_rate": 9.554214863449065e-05, - "loss": 0.0826, - "step": 32160 - }, - { - "epoch": 2.1046777886817143, - "grad_norm": 0.9235144257545471, - "learning_rate": 9.553835633977316e-05, - "loss": 0.0876, - "step": 32170 - }, - { - "epoch": 2.105332024860975, - "grad_norm": 0.7327439785003662, - "learning_rate": 9.553456250801671e-05, - "loss": 0.0962, - "step": 32180 - }, - { - "epoch": 2.1059862610402353, - "grad_norm": 0.7771127223968506, - "learning_rate": 9.553076713934936e-05, - "loss": 0.0884, - "step": 32190 - }, - { - "epoch": 2.1066404972194963, - "grad_norm": 0.8690281510353088, - "learning_rate": 9.552697023389922e-05, - "loss": 0.0966, - "step": 32200 - }, - { - "epoch": 2.107294733398757, - "grad_norm": 0.9444182515144348, - "learning_rate": 9.552317179179444e-05, - "loss": 0.1007, - "step": 32210 - }, - { - "epoch": 2.107948969578018, - "grad_norm": 0.7808245420455933, - "learning_rate": 9.551937181316322e-05, - "loss": 0.0893, - "step": 32220 - }, - { - "epoch": 2.1086032057572783, - "grad_norm": 0.830314576625824, - "learning_rate": 9.55155702981338e-05, - "loss": 0.104, - "step": 32230 - }, - { - "epoch": 2.1092574419365393, - "grad_norm": 0.7720317244529724, - "learning_rate": 9.551176724683453e-05, - "loss": 0.0856, - "step": 32240 - }, - { - "epoch": 2.1099116781158, - "grad_norm": 0.8897857666015625, - "learning_rate": 9.550796265939377e-05, - "loss": 0.0877, - "step": 32250 - }, - { - "epoch": 2.1105659142950604, - "grad_norm": 0.8322727084159851, - "learning_rate": 9.550415653593989e-05, - "loss": 0.0817, - "step": 32260 - }, - { - "epoch": 2.1112201504743213, - "grad_norm": 0.8915346264839172, - "learning_rate": 9.550034887660143e-05, - "loss": 0.0801, - "step": 32270 - }, - { - "epoch": 2.111874386653582, - "grad_norm": 0.859603762626648, - "learning_rate": 9.549653968150682e-05, - "loss": 0.0939, - "step": 32280 - }, - { - "epoch": 2.112528622832843, - "grad_norm": 0.7257876396179199, - "learning_rate": 9.54927289507847e-05, - "loss": 0.0931, - "step": 32290 - }, - { - "epoch": 2.1131828590121033, - "grad_norm": 1.2223953008651733, - "learning_rate": 9.548891668456367e-05, - "loss": 0.1026, - "step": 32300 - }, - { - "epoch": 2.113837095191364, - "grad_norm": 0.8096550107002258, - "learning_rate": 9.54851028829724e-05, - "loss": 0.0877, - "step": 32310 - }, - { - "epoch": 2.114491331370625, - "grad_norm": 0.9068686366081238, - "learning_rate": 9.548128754613963e-05, - "loss": 0.1137, - "step": 32320 - }, - { - "epoch": 2.1151455675498854, - "grad_norm": 0.8379555940628052, - "learning_rate": 9.54774706741941e-05, - "loss": 0.0789, - "step": 32330 - }, - { - "epoch": 2.1157998037291463, - "grad_norm": 1.188866376876831, - "learning_rate": 9.547365226726468e-05, - "loss": 0.0919, - "step": 32340 - }, - { - "epoch": 2.116454039908407, - "grad_norm": 0.776324450969696, - "learning_rate": 9.546983232548023e-05, - "loss": 0.0906, - "step": 32350 - }, - { - "epoch": 2.117108276087668, - "grad_norm": 0.7669685482978821, - "learning_rate": 9.546601084896971e-05, - "loss": 0.0905, - "step": 32360 - }, - { - "epoch": 2.1177625122669284, - "grad_norm": 0.8013961911201477, - "learning_rate": 9.546218783786207e-05, - "loss": 0.091, - "step": 32370 - }, - { - "epoch": 2.118416748446189, - "grad_norm": 0.8074659705162048, - "learning_rate": 9.545836329228637e-05, - "loss": 0.0855, - "step": 32380 - }, - { - "epoch": 2.11907098462545, - "grad_norm": 0.8233710527420044, - "learning_rate": 9.545453721237167e-05, - "loss": 0.0879, - "step": 32390 - }, - { - "epoch": 2.1197252208047104, - "grad_norm": 0.8255857229232788, - "learning_rate": 9.545070959824716e-05, - "loss": 0.0876, - "step": 32400 - }, - { - "epoch": 2.1203794569839713, - "grad_norm": 0.8304885625839233, - "learning_rate": 9.544688045004197e-05, - "loss": 0.0903, - "step": 32410 - }, - { - "epoch": 2.121033693163232, - "grad_norm": 0.9255449175834656, - "learning_rate": 9.544304976788541e-05, - "loss": 0.0913, - "step": 32420 - }, - { - "epoch": 2.121687929342493, - "grad_norm": 0.8845074772834778, - "learning_rate": 9.543921755190671e-05, - "loss": 0.0849, - "step": 32430 - }, - { - "epoch": 2.1223421655217534, - "grad_norm": 0.9310880303382874, - "learning_rate": 9.543538380223527e-05, - "loss": 0.0988, - "step": 32440 - }, - { - "epoch": 2.122996401701014, - "grad_norm": 0.953241765499115, - "learning_rate": 9.543154851900045e-05, - "loss": 0.0909, - "step": 32450 - }, - { - "epoch": 2.123650637880275, - "grad_norm": 0.8089374303817749, - "learning_rate": 9.542771170233173e-05, - "loss": 0.0877, - "step": 32460 - }, - { - "epoch": 2.1243048740595354, - "grad_norm": 0.8584646582603455, - "learning_rate": 9.542387335235861e-05, - "loss": 0.0873, - "step": 32470 - }, - { - "epoch": 2.1249591102387964, - "grad_norm": 0.7349388599395752, - "learning_rate": 9.542003346921063e-05, - "loss": 0.0909, - "step": 32480 - }, - { - "epoch": 2.125613346418057, - "grad_norm": 0.9695256352424622, - "learning_rate": 9.541619205301739e-05, - "loss": 0.102, - "step": 32490 - }, - { - "epoch": 2.1262675825973174, - "grad_norm": 0.6882131695747375, - "learning_rate": 9.541234910390857e-05, - "loss": 0.0823, - "step": 32500 - }, - { - "epoch": 2.1269218187765784, - "grad_norm": 0.8628367185592651, - "learning_rate": 9.540850462201387e-05, - "loss": 0.0998, - "step": 32510 - }, - { - "epoch": 2.127576054955839, - "grad_norm": 0.7853913903236389, - "learning_rate": 9.540465860746305e-05, - "loss": 0.0905, - "step": 32520 - }, - { - "epoch": 2.1282302911351, - "grad_norm": 0.9766311049461365, - "learning_rate": 9.540081106038591e-05, - "loss": 0.0928, - "step": 32530 - }, - { - "epoch": 2.1288845273143604, - "grad_norm": 0.7391847372055054, - "learning_rate": 9.539696198091235e-05, - "loss": 0.084, - "step": 32540 - }, - { - "epoch": 2.1295387634936214, - "grad_norm": 0.9374977946281433, - "learning_rate": 9.539311136917227e-05, - "loss": 0.0979, - "step": 32550 - }, - { - "epoch": 2.130192999672882, - "grad_norm": 0.8064951300621033, - "learning_rate": 9.53892592252956e-05, - "loss": 0.0788, - "step": 32560 - }, - { - "epoch": 2.1308472358521424, - "grad_norm": 0.8344107866287231, - "learning_rate": 9.538540554941242e-05, - "loss": 0.0878, - "step": 32570 - }, - { - "epoch": 2.1315014720314034, - "grad_norm": 0.8620904684066772, - "learning_rate": 9.538155034165277e-05, - "loss": 0.0801, - "step": 32580 - }, - { - "epoch": 2.132155708210664, - "grad_norm": 0.764388918876648, - "learning_rate": 9.537769360214678e-05, - "loss": 0.0829, - "step": 32590 - }, - { - "epoch": 2.132809944389925, - "grad_norm": 0.7205486297607422, - "learning_rate": 9.537383533102462e-05, - "loss": 0.0815, - "step": 32600 - }, - { - "epoch": 2.1334641805691854, - "grad_norm": 0.7179217338562012, - "learning_rate": 9.536997552841653e-05, - "loss": 0.097, - "step": 32610 - }, - { - "epoch": 2.1341184167484464, - "grad_norm": 0.8533260226249695, - "learning_rate": 9.536611419445276e-05, - "loss": 0.0957, - "step": 32620 - }, - { - "epoch": 2.134772652927707, - "grad_norm": 0.9501368403434753, - "learning_rate": 9.536225132926366e-05, - "loss": 0.0954, - "step": 32630 - }, - { - "epoch": 2.1354268891069674, - "grad_norm": 0.8990276455879211, - "learning_rate": 9.535838693297963e-05, - "loss": 0.0895, - "step": 32640 - }, - { - "epoch": 2.1360811252862284, - "grad_norm": 0.9564955830574036, - "learning_rate": 9.535452100573108e-05, - "loss": 0.0933, - "step": 32650 - }, - { - "epoch": 2.136735361465489, - "grad_norm": 0.9729709625244141, - "learning_rate": 9.53506535476485e-05, - "loss": 0.1018, - "step": 32660 - }, - { - "epoch": 2.13738959764475, - "grad_norm": 0.8804893493652344, - "learning_rate": 9.534678455886241e-05, - "loss": 0.0903, - "step": 32670 - }, - { - "epoch": 2.1380438338240104, - "grad_norm": 0.8837690353393555, - "learning_rate": 9.534291403950341e-05, - "loss": 0.0917, - "step": 32680 - }, - { - "epoch": 2.1386980700032714, - "grad_norm": 0.9111039042472839, - "learning_rate": 9.533904198970218e-05, - "loss": 0.0803, - "step": 32690 - }, - { - "epoch": 2.139352306182532, - "grad_norm": 0.7728859186172485, - "learning_rate": 9.533516840958934e-05, - "loss": 0.081, - "step": 32700 - }, - { - "epoch": 2.1400065423617924, - "grad_norm": 0.8555951118469238, - "learning_rate": 9.533129329929568e-05, - "loss": 0.0857, - "step": 32710 - }, - { - "epoch": 2.1406607785410534, - "grad_norm": 0.761581540107727, - "learning_rate": 9.5327416658952e-05, - "loss": 0.0819, - "step": 32720 - }, - { - "epoch": 2.141315014720314, - "grad_norm": 0.7355059385299683, - "learning_rate": 9.532353848868914e-05, - "loss": 0.0839, - "step": 32730 - }, - { - "epoch": 2.141969250899575, - "grad_norm": 0.8941879868507385, - "learning_rate": 9.531965878863797e-05, - "loss": 0.0905, - "step": 32740 - }, - { - "epoch": 2.1426234870788354, - "grad_norm": 0.7693347334861755, - "learning_rate": 9.531577755892947e-05, - "loss": 0.0845, - "step": 32750 - }, - { - "epoch": 2.1432777232580964, - "grad_norm": 0.8309009671211243, - "learning_rate": 9.531189479969462e-05, - "loss": 0.0811, - "step": 32760 - }, - { - "epoch": 2.143931959437357, - "grad_norm": 0.8143077492713928, - "learning_rate": 9.530801051106449e-05, - "loss": 0.0857, - "step": 32770 - }, - { - "epoch": 2.1445861956166175, - "grad_norm": 0.9192956686019897, - "learning_rate": 9.53041246931702e-05, - "loss": 0.0907, - "step": 32780 - }, - { - "epoch": 2.1452404317958784, - "grad_norm": 0.9745957255363464, - "learning_rate": 9.530023734614286e-05, - "loss": 0.0898, - "step": 32790 - }, - { - "epoch": 2.145894667975139, - "grad_norm": 1.038955569267273, - "learning_rate": 9.529634847011373e-05, - "loss": 0.0901, - "step": 32800 - }, - { - "epoch": 2.1465489041544, - "grad_norm": 0.9316650032997131, - "learning_rate": 9.529245806521402e-05, - "loss": 0.078, - "step": 32810 - }, - { - "epoch": 2.1472031403336604, - "grad_norm": 0.9526748061180115, - "learning_rate": 9.528856613157509e-05, - "loss": 0.0921, - "step": 32820 - }, - { - "epoch": 2.147857376512921, - "grad_norm": 0.9869240522384644, - "learning_rate": 9.528467266932826e-05, - "loss": 0.0872, - "step": 32830 - }, - { - "epoch": 2.148511612692182, - "grad_norm": 0.9078952670097351, - "learning_rate": 9.528077767860497e-05, - "loss": 0.0867, - "step": 32840 - }, - { - "epoch": 2.1491658488714425, - "grad_norm": 1.0087946653366089, - "learning_rate": 9.527688115953668e-05, - "loss": 0.0835, - "step": 32850 - }, - { - "epoch": 2.1498200850507034, - "grad_norm": 0.8324182629585266, - "learning_rate": 9.527298311225493e-05, - "loss": 0.084, - "step": 32860 - }, - { - "epoch": 2.150474321229964, - "grad_norm": 0.7914925217628479, - "learning_rate": 9.526908353689123e-05, - "loss": 0.09, - "step": 32870 - }, - { - "epoch": 2.151128557409225, - "grad_norm": 0.8204302787780762, - "learning_rate": 9.526518243357725e-05, - "loss": 0.0969, - "step": 32880 - }, - { - "epoch": 2.1517827935884855, - "grad_norm": 0.8147171139717102, - "learning_rate": 9.526127980244466e-05, - "loss": 0.0867, - "step": 32890 - }, - { - "epoch": 2.152437029767746, - "grad_norm": 0.8153404593467712, - "learning_rate": 9.525737564362517e-05, - "loss": 0.078, - "step": 32900 - }, - { - "epoch": 2.153091265947007, - "grad_norm": 0.9826894402503967, - "learning_rate": 9.525346995725057e-05, - "loss": 0.0961, - "step": 32910 - }, - { - "epoch": 2.1537455021262675, - "grad_norm": 0.911864697933197, - "learning_rate": 9.524956274345268e-05, - "loss": 0.0884, - "step": 32920 - }, - { - "epoch": 2.1543997383055284, - "grad_norm": 0.9221819639205933, - "learning_rate": 9.524565400236335e-05, - "loss": 0.0984, - "step": 32930 - }, - { - "epoch": 2.155053974484789, - "grad_norm": 0.8833640217781067, - "learning_rate": 9.524174373411456e-05, - "loss": 0.0844, - "step": 32940 - }, - { - "epoch": 2.1557082106640495, - "grad_norm": 0.9329414963722229, - "learning_rate": 9.523783193883825e-05, - "loss": 0.0762, - "step": 32950 - }, - { - "epoch": 2.1563624468433105, - "grad_norm": 0.8548219799995422, - "learning_rate": 9.523391861666649e-05, - "loss": 0.0892, - "step": 32960 - }, - { - "epoch": 2.157016683022571, - "grad_norm": 0.9100401401519775, - "learning_rate": 9.523000376773132e-05, - "loss": 0.0935, - "step": 32970 - }, - { - "epoch": 2.157670919201832, - "grad_norm": 0.8718044757843018, - "learning_rate": 9.522608739216493e-05, - "loss": 0.0816, - "step": 32980 - }, - { - "epoch": 2.1583251553810925, - "grad_norm": 0.8366650938987732, - "learning_rate": 9.522216949009946e-05, - "loss": 0.079, - "step": 32990 - }, - { - "epoch": 2.1589793915603535, - "grad_norm": 0.9176976084709167, - "learning_rate": 9.52182500616672e-05, - "loss": 0.089, - "step": 33000 - }, - { - "epoch": 2.159633627739614, - "grad_norm": 1.1271123886108398, - "learning_rate": 9.521432910700039e-05, - "loss": 0.0919, - "step": 33010 - }, - { - "epoch": 2.1602878639188745, - "grad_norm": 0.935555100440979, - "learning_rate": 9.521040662623139e-05, - "loss": 0.0968, - "step": 33020 - }, - { - "epoch": 2.1609421000981355, - "grad_norm": 0.7743455767631531, - "learning_rate": 9.52064826194926e-05, - "loss": 0.0987, - "step": 33030 - }, - { - "epoch": 2.161596336277396, - "grad_norm": 0.7663136720657349, - "learning_rate": 9.520255708691646e-05, - "loss": 0.0865, - "step": 33040 - }, - { - "epoch": 2.162250572456657, - "grad_norm": 0.851262092590332, - "learning_rate": 9.519863002863548e-05, - "loss": 0.0805, - "step": 33050 - }, - { - "epoch": 2.1629048086359175, - "grad_norm": 1.051383137702942, - "learning_rate": 9.519470144478219e-05, - "loss": 0.099, - "step": 33060 - }, - { - "epoch": 2.1635590448151785, - "grad_norm": 0.7119585871696472, - "learning_rate": 9.519077133548922e-05, - "loss": 0.0792, - "step": 33070 - }, - { - "epoch": 2.164213280994439, - "grad_norm": 0.7884335517883301, - "learning_rate": 9.518683970088918e-05, - "loss": 0.087, - "step": 33080 - }, - { - "epoch": 2.1648675171736995, - "grad_norm": 0.80777907371521, - "learning_rate": 9.51829065411148e-05, - "loss": 0.0824, - "step": 33090 - }, - { - "epoch": 2.1655217533529605, - "grad_norm": 1.0060641765594482, - "learning_rate": 9.517897185629882e-05, - "loss": 0.0832, - "step": 33100 - }, - { - "epoch": 2.166175989532221, - "grad_norm": 0.9100120067596436, - "learning_rate": 9.517503564657407e-05, - "loss": 0.0868, - "step": 33110 - }, - { - "epoch": 2.166830225711482, - "grad_norm": 0.8094409704208374, - "learning_rate": 9.517109791207337e-05, - "loss": 0.0823, - "step": 33120 - }, - { - "epoch": 2.1674844618907425, - "grad_norm": 1.0569559335708618, - "learning_rate": 9.516715865292967e-05, - "loss": 0.0803, - "step": 33130 - }, - { - "epoch": 2.1681386980700035, - "grad_norm": 0.7555559277534485, - "learning_rate": 9.51632178692759e-05, - "loss": 0.0821, - "step": 33140 - }, - { - "epoch": 2.168792934249264, - "grad_norm": 0.9259467720985413, - "learning_rate": 9.515927556124507e-05, - "loss": 0.0833, - "step": 33150 - }, - { - "epoch": 2.1694471704285245, - "grad_norm": 0.9541639685630798, - "learning_rate": 9.515533172897028e-05, - "loss": 0.0959, - "step": 33160 - }, - { - "epoch": 2.1701014066077855, - "grad_norm": 0.7850528955459595, - "learning_rate": 9.51513863725846e-05, - "loss": 0.0848, - "step": 33170 - }, - { - "epoch": 2.170755642787046, - "grad_norm": 0.921647310256958, - "learning_rate": 9.514743949222122e-05, - "loss": 0.0864, - "step": 33180 - }, - { - "epoch": 2.171409878966307, - "grad_norm": 0.8776901364326477, - "learning_rate": 9.514349108801337e-05, - "loss": 0.1069, - "step": 33190 - }, - { - "epoch": 2.1720641151455675, - "grad_norm": 0.8369219303131104, - "learning_rate": 9.513954116009429e-05, - "loss": 0.0923, - "step": 33200 - }, - { - "epoch": 2.1727183513248285, - "grad_norm": 0.8588852882385254, - "learning_rate": 9.51355897085973e-05, - "loss": 0.0969, - "step": 33210 - }, - { - "epoch": 2.173372587504089, - "grad_norm": 0.8265673518180847, - "learning_rate": 9.513163673365581e-05, - "loss": 0.0817, - "step": 33220 - }, - { - "epoch": 2.1740268236833495, - "grad_norm": 0.8380441069602966, - "learning_rate": 9.512768223540321e-05, - "loss": 0.0827, - "step": 33230 - }, - { - "epoch": 2.1746810598626105, - "grad_norm": 0.7195032835006714, - "learning_rate": 9.512372621397298e-05, - "loss": 0.0922, - "step": 33240 - }, - { - "epoch": 2.175335296041871, - "grad_norm": 0.8967887759208679, - "learning_rate": 9.511976866949864e-05, - "loss": 0.0861, - "step": 33250 - }, - { - "epoch": 2.175989532221132, - "grad_norm": 0.8154705762863159, - "learning_rate": 9.51158096021138e-05, - "loss": 0.0936, - "step": 33260 - }, - { - "epoch": 2.1766437684003925, - "grad_norm": 0.8090271353721619, - "learning_rate": 9.511184901195204e-05, - "loss": 0.084, - "step": 33270 - }, - { - "epoch": 2.177298004579653, - "grad_norm": 0.8615165948867798, - "learning_rate": 9.510788689914707e-05, - "loss": 0.0891, - "step": 33280 - }, - { - "epoch": 2.177952240758914, - "grad_norm": 0.8889568448066711, - "learning_rate": 9.510392326383262e-05, - "loss": 0.0916, - "step": 33290 - }, - { - "epoch": 2.1786064769381746, - "grad_norm": 0.9349720478057861, - "learning_rate": 9.509995810614247e-05, - "loss": 0.0903, - "step": 33300 - }, - { - "epoch": 2.1792607131174355, - "grad_norm": 0.8932443261146545, - "learning_rate": 9.509599142621047e-05, - "loss": 0.0844, - "step": 33310 - }, - { - "epoch": 2.179914949296696, - "grad_norm": 0.8143129944801331, - "learning_rate": 9.509202322417047e-05, - "loss": 0.087, - "step": 33320 - }, - { - "epoch": 2.180569185475957, - "grad_norm": 0.8007728457450867, - "learning_rate": 9.508805350015643e-05, - "loss": 0.0925, - "step": 33330 - }, - { - "epoch": 2.1812234216552175, - "grad_norm": 0.8921264410018921, - "learning_rate": 9.508408225430237e-05, - "loss": 0.0925, - "step": 33340 - }, - { - "epoch": 2.181877657834478, - "grad_norm": 0.9129869341850281, - "learning_rate": 9.508010948674227e-05, - "loss": 0.0911, - "step": 33350 - }, - { - "epoch": 2.182531894013739, - "grad_norm": 0.9659237265586853, - "learning_rate": 9.507613519761022e-05, - "loss": 0.0915, - "step": 33360 - }, - { - "epoch": 2.1831861301929996, - "grad_norm": 0.8064131736755371, - "learning_rate": 9.507215938704043e-05, - "loss": 0.0845, - "step": 33370 - }, - { - "epoch": 2.1838403663722605, - "grad_norm": 0.9400182366371155, - "learning_rate": 9.506818205516705e-05, - "loss": 0.0886, - "step": 33380 - }, - { - "epoch": 2.184494602551521, - "grad_norm": 0.8010856509208679, - "learning_rate": 9.506420320212433e-05, - "loss": 0.0912, - "step": 33390 - }, - { - "epoch": 2.1851488387307816, - "grad_norm": 0.7543482184410095, - "learning_rate": 9.506022282804656e-05, - "loss": 0.0765, - "step": 33400 - }, - { - "epoch": 2.1858030749100426, - "grad_norm": 0.8534349799156189, - "learning_rate": 9.505624093306809e-05, - "loss": 0.0913, - "step": 33410 - }, - { - "epoch": 2.186457311089303, - "grad_norm": 0.8279025554656982, - "learning_rate": 9.505225751732333e-05, - "loss": 0.0815, - "step": 33420 - }, - { - "epoch": 2.187111547268564, - "grad_norm": 0.8194913268089294, - "learning_rate": 9.504827258094673e-05, - "loss": 0.0924, - "step": 33430 - }, - { - "epoch": 2.1877657834478246, - "grad_norm": 0.9188143014907837, - "learning_rate": 9.504428612407277e-05, - "loss": 0.0892, - "step": 33440 - }, - { - "epoch": 2.1884200196270855, - "grad_norm": 0.6940245032310486, - "learning_rate": 9.504029814683603e-05, - "loss": 0.0797, - "step": 33450 - }, - { - "epoch": 2.189074255806346, - "grad_norm": 0.8881454467773438, - "learning_rate": 9.503630864937112e-05, - "loss": 0.0794, - "step": 33460 - }, - { - "epoch": 2.1897284919856066, - "grad_norm": 0.8755089044570923, - "learning_rate": 9.503231763181266e-05, - "loss": 0.0877, - "step": 33470 - }, - { - "epoch": 2.1903827281648676, - "grad_norm": 0.7432528734207153, - "learning_rate": 9.502832509429538e-05, - "loss": 0.0857, - "step": 33480 - }, - { - "epoch": 2.191036964344128, - "grad_norm": 0.7518520355224609, - "learning_rate": 9.502433103695405e-05, - "loss": 0.0862, - "step": 33490 - }, - { - "epoch": 2.191691200523389, - "grad_norm": 0.7373502254486084, - "learning_rate": 9.502033545992347e-05, - "loss": 0.0839, - "step": 33500 - }, - { - "epoch": 2.1923454367026496, - "grad_norm": 0.8541769981384277, - "learning_rate": 9.501633836333847e-05, - "loss": 0.0858, - "step": 33510 - }, - { - "epoch": 2.1929996728819106, - "grad_norm": 0.8180940747261047, - "learning_rate": 9.501233974733402e-05, - "loss": 0.0857, - "step": 33520 - }, - { - "epoch": 2.193653909061171, - "grad_norm": 0.6392052173614502, - "learning_rate": 9.500833961204504e-05, - "loss": 0.0873, - "step": 33530 - }, - { - "epoch": 2.1943081452404316, - "grad_norm": 0.8578611016273499, - "learning_rate": 9.500433795760656e-05, - "loss": 0.0951, - "step": 33540 - }, - { - "epoch": 2.1949623814196926, - "grad_norm": 0.7681283354759216, - "learning_rate": 9.500033478415364e-05, - "loss": 0.0896, - "step": 33550 - }, - { - "epoch": 2.195616617598953, - "grad_norm": 1.0089340209960938, - "learning_rate": 9.499633009182141e-05, - "loss": 0.0847, - "step": 33560 - }, - { - "epoch": 2.196270853778214, - "grad_norm": 0.9923962950706482, - "learning_rate": 9.499232388074503e-05, - "loss": 0.0849, - "step": 33570 - }, - { - "epoch": 2.1969250899574746, - "grad_norm": 0.9396527409553528, - "learning_rate": 9.498831615105974e-05, - "loss": 0.0817, - "step": 33580 - }, - { - "epoch": 2.1975793261367356, - "grad_norm": 0.7648763060569763, - "learning_rate": 9.498430690290078e-05, - "loss": 0.0966, - "step": 33590 - }, - { - "epoch": 2.198233562315996, - "grad_norm": 0.8955914974212646, - "learning_rate": 9.498029613640349e-05, - "loss": 0.1008, - "step": 33600 - }, - { - "epoch": 2.1988877984952566, - "grad_norm": 1.131964921951294, - "learning_rate": 9.497628385170323e-05, - "loss": 0.0985, - "step": 33610 - }, - { - "epoch": 2.1995420346745176, - "grad_norm": 0.738293468952179, - "learning_rate": 9.497227004893544e-05, - "loss": 0.0794, - "step": 33620 - }, - { - "epoch": 2.200196270853778, - "grad_norm": 0.7371350526809692, - "learning_rate": 9.496825472823559e-05, - "loss": 0.0758, - "step": 33630 - }, - { - "epoch": 2.200850507033039, - "grad_norm": 0.8156116604804993, - "learning_rate": 9.496423788973922e-05, - "loss": 0.0796, - "step": 33640 - }, - { - "epoch": 2.2015047432122996, - "grad_norm": 0.832074761390686, - "learning_rate": 9.496021953358189e-05, - "loss": 0.0778, - "step": 33650 - }, - { - "epoch": 2.2021589793915606, - "grad_norm": 0.9009979963302612, - "learning_rate": 9.495619965989924e-05, - "loss": 0.0953, - "step": 33660 - }, - { - "epoch": 2.202813215570821, - "grad_norm": 0.9261824488639832, - "learning_rate": 9.495217826882694e-05, - "loss": 0.0941, - "step": 33670 - }, - { - "epoch": 2.2034674517500816, - "grad_norm": 0.815540611743927, - "learning_rate": 9.494815536050075e-05, - "loss": 0.0858, - "step": 33680 - }, - { - "epoch": 2.2041216879293426, - "grad_norm": 0.8841716647148132, - "learning_rate": 9.494413093505643e-05, - "loss": 0.0789, - "step": 33690 - }, - { - "epoch": 2.204775924108603, - "grad_norm": 0.9141961932182312, - "learning_rate": 9.494010499262982e-05, - "loss": 0.0936, - "step": 33700 - }, - { - "epoch": 2.205430160287864, - "grad_norm": 0.7224341630935669, - "learning_rate": 9.49360775333568e-05, - "loss": 0.0938, - "step": 33710 - }, - { - "epoch": 2.2060843964671246, - "grad_norm": 0.9036959409713745, - "learning_rate": 9.493204855737332e-05, - "loss": 0.0769, - "step": 33720 - }, - { - "epoch": 2.206738632646385, - "grad_norm": 0.9776110649108887, - "learning_rate": 9.492801806481535e-05, - "loss": 0.0925, - "step": 33730 - }, - { - "epoch": 2.207392868825646, - "grad_norm": 0.6493380665779114, - "learning_rate": 9.492398605581896e-05, - "loss": 0.0948, - "step": 33740 - }, - { - "epoch": 2.2080471050049066, - "grad_norm": 0.8940728902816772, - "learning_rate": 9.491995253052022e-05, - "loss": 0.0949, - "step": 33750 - }, - { - "epoch": 2.2087013411841676, - "grad_norm": 0.9045453071594238, - "learning_rate": 9.491591748905527e-05, - "loss": 0.0905, - "step": 33760 - }, - { - "epoch": 2.209355577363428, - "grad_norm": 0.7662135362625122, - "learning_rate": 9.49118809315603e-05, - "loss": 0.0852, - "step": 33770 - }, - { - "epoch": 2.210009813542689, - "grad_norm": 0.7903282642364502, - "learning_rate": 9.490784285817158e-05, - "loss": 0.0792, - "step": 33780 - }, - { - "epoch": 2.2106640497219496, - "grad_norm": 0.9852091073989868, - "learning_rate": 9.490380326902537e-05, - "loss": 0.0826, - "step": 33790 - }, - { - "epoch": 2.21131828590121, - "grad_norm": 0.9630650281906128, - "learning_rate": 9.489976216425804e-05, - "loss": 0.0994, - "step": 33800 - }, - { - "epoch": 2.211972522080471, - "grad_norm": 0.7870905995368958, - "learning_rate": 9.4895719544006e-05, - "loss": 0.0926, - "step": 33810 - }, - { - "epoch": 2.2126267582597317, - "grad_norm": 0.8298508524894714, - "learning_rate": 9.489167540840567e-05, - "loss": 0.0815, - "step": 33820 - }, - { - "epoch": 2.2132809944389926, - "grad_norm": 0.8366494178771973, - "learning_rate": 9.488762975759357e-05, - "loss": 0.1, - "step": 33830 - }, - { - "epoch": 2.213935230618253, - "grad_norm": 0.9144079685211182, - "learning_rate": 9.488358259170622e-05, - "loss": 0.0881, - "step": 33840 - }, - { - "epoch": 2.2145894667975137, - "grad_norm": 0.8029112815856934, - "learning_rate": 9.487953391088027e-05, - "loss": 0.0793, - "step": 33850 - }, - { - "epoch": 2.2152437029767746, - "grad_norm": 0.9416177272796631, - "learning_rate": 9.487548371525234e-05, - "loss": 0.0827, - "step": 33860 - }, - { - "epoch": 2.215897939156035, - "grad_norm": 0.7527725696563721, - "learning_rate": 9.487143200495914e-05, - "loss": 0.0921, - "step": 33870 - }, - { - "epoch": 2.216552175335296, - "grad_norm": 1.0076922178268433, - "learning_rate": 9.486737878013745e-05, - "loss": 0.0852, - "step": 33880 - }, - { - "epoch": 2.2172064115145567, - "grad_norm": 1.0222320556640625, - "learning_rate": 9.486332404092403e-05, - "loss": 0.09, - "step": 33890 - }, - { - "epoch": 2.2178606476938176, - "grad_norm": 0.9221048951148987, - "learning_rate": 9.485926778745579e-05, - "loss": 0.091, - "step": 33900 - }, - { - "epoch": 2.218514883873078, - "grad_norm": 0.8487015962600708, - "learning_rate": 9.485521001986962e-05, - "loss": 0.0853, - "step": 33910 - }, - { - "epoch": 2.2191691200523387, - "grad_norm": 0.8243840336799622, - "learning_rate": 9.485115073830245e-05, - "loss": 0.0884, - "step": 33920 - }, - { - "epoch": 2.2198233562315997, - "grad_norm": 0.8864408731460571, - "learning_rate": 9.48470899428913e-05, - "loss": 0.1056, - "step": 33930 - }, - { - "epoch": 2.22047759241086, - "grad_norm": 1.3254485130310059, - "learning_rate": 9.484302763377328e-05, - "loss": 0.0855, - "step": 33940 - }, - { - "epoch": 2.221131828590121, - "grad_norm": 1.0001471042633057, - "learning_rate": 9.483896381108548e-05, - "loss": 0.0901, - "step": 33950 - }, - { - "epoch": 2.2217860647693817, - "grad_norm": 0.8576057553291321, - "learning_rate": 9.483489847496503e-05, - "loss": 0.0802, - "step": 33960 - }, - { - "epoch": 2.2224403009486426, - "grad_norm": 0.8410997986793518, - "learning_rate": 9.48308316255492e-05, - "loss": 0.0777, - "step": 33970 - }, - { - "epoch": 2.223094537127903, - "grad_norm": 1.0738435983657837, - "learning_rate": 9.482676326297522e-05, - "loss": 0.093, - "step": 33980 - }, - { - "epoch": 2.2237487733071637, - "grad_norm": 0.7275444865226746, - "learning_rate": 9.482269338738038e-05, - "loss": 0.0788, - "step": 33990 - }, - { - "epoch": 2.2244030094864247, - "grad_norm": 0.7812187075614929, - "learning_rate": 9.481862199890213e-05, - "loss": 0.0866, - "step": 34000 - }, - { - "epoch": 2.225057245665685, - "grad_norm": 0.7161325216293335, - "learning_rate": 9.481454909767784e-05, - "loss": 0.0865, - "step": 34010 - }, - { - "epoch": 2.225711481844946, - "grad_norm": 0.9743297696113586, - "learning_rate": 9.481047468384499e-05, - "loss": 0.0829, - "step": 34020 - }, - { - "epoch": 2.2263657180242067, - "grad_norm": 0.9673652648925781, - "learning_rate": 9.480639875754108e-05, - "loss": 0.0828, - "step": 34030 - }, - { - "epoch": 2.2270199542034677, - "grad_norm": 0.8574245572090149, - "learning_rate": 9.480232131890371e-05, - "loss": 0.0832, - "step": 34040 - }, - { - "epoch": 2.227674190382728, - "grad_norm": 1.0479404926300049, - "learning_rate": 9.479824236807051e-05, - "loss": 0.0825, - "step": 34050 - }, - { - "epoch": 2.2283284265619887, - "grad_norm": 0.9028341174125671, - "learning_rate": 9.479416190517914e-05, - "loss": 0.0804, - "step": 34060 - }, - { - "epoch": 2.2289826627412497, - "grad_norm": 0.7553834915161133, - "learning_rate": 9.479007993036733e-05, - "loss": 0.0798, - "step": 34070 - }, - { - "epoch": 2.22963689892051, - "grad_norm": 0.9540044069290161, - "learning_rate": 9.478599644377284e-05, - "loss": 0.092, - "step": 34080 - }, - { - "epoch": 2.230291135099771, - "grad_norm": 1.0096077919006348, - "learning_rate": 9.478191144553352e-05, - "loss": 0.0833, - "step": 34090 - }, - { - "epoch": 2.2309453712790317, - "grad_norm": 0.8619536757469177, - "learning_rate": 9.477782493578725e-05, - "loss": 0.0983, - "step": 34100 - }, - { - "epoch": 2.2315996074582927, - "grad_norm": 0.8202316761016846, - "learning_rate": 9.477373691467195e-05, - "loss": 0.0795, - "step": 34110 - }, - { - "epoch": 2.232253843637553, - "grad_norm": 0.8354489207267761, - "learning_rate": 9.47696473823256e-05, - "loss": 0.0805, - "step": 34120 - }, - { - "epoch": 2.2329080798168137, - "grad_norm": 1.0788700580596924, - "learning_rate": 9.476555633888625e-05, - "loss": 0.0929, - "step": 34130 - }, - { - "epoch": 2.2335623159960747, - "grad_norm": 0.8058347702026367, - "learning_rate": 9.476146378449197e-05, - "loss": 0.0913, - "step": 34140 - }, - { - "epoch": 2.234216552175335, - "grad_norm": 0.8646268844604492, - "learning_rate": 9.475736971928088e-05, - "loss": 0.0819, - "step": 34150 - }, - { - "epoch": 2.234870788354596, - "grad_norm": 0.8829324841499329, - "learning_rate": 9.475327414339121e-05, - "loss": 0.0907, - "step": 34160 - }, - { - "epoch": 2.2355250245338567, - "grad_norm": 0.8465047478675842, - "learning_rate": 9.474917705696114e-05, - "loss": 0.0884, - "step": 34170 - }, - { - "epoch": 2.2361792607131172, - "grad_norm": 0.8332507014274597, - "learning_rate": 9.474507846012901e-05, - "loss": 0.0899, - "step": 34180 - }, - { - "epoch": 2.236833496892378, - "grad_norm": 0.890265941619873, - "learning_rate": 9.474097835303311e-05, - "loss": 0.0799, - "step": 34190 - }, - { - "epoch": 2.2374877330716387, - "grad_norm": 0.9846871495246887, - "learning_rate": 9.473687673581186e-05, - "loss": 0.0828, - "step": 34200 - }, - { - "epoch": 2.2381419692508997, - "grad_norm": 0.7665266394615173, - "learning_rate": 9.47327736086037e-05, - "loss": 0.0792, - "step": 34210 - }, - { - "epoch": 2.2387962054301602, - "grad_norm": 0.82786625623703, - "learning_rate": 9.472866897154712e-05, - "loss": 0.0888, - "step": 34220 - }, - { - "epoch": 2.239450441609421, - "grad_norm": 0.8131442666053772, - "learning_rate": 9.472456282478065e-05, - "loss": 0.0819, - "step": 34230 - }, - { - "epoch": 2.2401046777886817, - "grad_norm": 0.8473891615867615, - "learning_rate": 9.47204551684429e-05, - "loss": 0.0905, - "step": 34240 - }, - { - "epoch": 2.2407589139679422, - "grad_norm": 0.8712317943572998, - "learning_rate": 9.471634600267247e-05, - "loss": 0.0955, - "step": 34250 - }, - { - "epoch": 2.241413150147203, - "grad_norm": 0.7699024081230164, - "learning_rate": 9.471223532760812e-05, - "loss": 0.0799, - "step": 34260 - }, - { - "epoch": 2.2420673863264637, - "grad_norm": 0.7678673267364502, - "learning_rate": 9.470812314338855e-05, - "loss": 0.0846, - "step": 34270 - }, - { - "epoch": 2.2427216225057247, - "grad_norm": 0.8645968437194824, - "learning_rate": 9.470400945015258e-05, - "loss": 0.0767, - "step": 34280 - }, - { - "epoch": 2.2433758586849852, - "grad_norm": 0.9479940533638, - "learning_rate": 9.469989424803907e-05, - "loss": 0.0875, - "step": 34290 - }, - { - "epoch": 2.2440300948642458, - "grad_norm": 0.7931011319160461, - "learning_rate": 9.469577753718689e-05, - "loss": 0.0817, - "step": 34300 - }, - { - "epoch": 2.2446843310435067, - "grad_norm": 0.9428794980049133, - "learning_rate": 9.469165931773498e-05, - "loss": 0.0824, - "step": 34310 - }, - { - "epoch": 2.2453385672227673, - "grad_norm": 0.9132287502288818, - "learning_rate": 9.468753958982238e-05, - "loss": 0.0886, - "step": 34320 - }, - { - "epoch": 2.2459928034020282, - "grad_norm": 0.9737602472305298, - "learning_rate": 9.468341835358809e-05, - "loss": 0.0937, - "step": 34330 - }, - { - "epoch": 2.2466470395812888, - "grad_norm": 0.918177604675293, - "learning_rate": 9.467929560917128e-05, - "loss": 0.0813, - "step": 34340 - }, - { - "epoch": 2.2473012757605497, - "grad_norm": 0.7503966093063354, - "learning_rate": 9.467517135671104e-05, - "loss": 0.0886, - "step": 34350 - }, - { - "epoch": 2.2479555119398102, - "grad_norm": 0.83729487657547, - "learning_rate": 9.467104559634663e-05, - "loss": 0.0983, - "step": 34360 - }, - { - "epoch": 2.2486097481190708, - "grad_norm": 1.0300636291503906, - "learning_rate": 9.466691832821725e-05, - "loss": 0.0959, - "step": 34370 - }, - { - "epoch": 2.2492639842983317, - "grad_norm": 0.8676597476005554, - "learning_rate": 9.466278955246225e-05, - "loss": 0.0833, - "step": 34380 - }, - { - "epoch": 2.2499182204775923, - "grad_norm": 0.9559263586997986, - "learning_rate": 9.465865926922098e-05, - "loss": 0.0978, - "step": 34390 - }, - { - "epoch": 2.2505724566568532, - "grad_norm": 0.7178763151168823, - "learning_rate": 9.465452747863281e-05, - "loss": 0.0909, - "step": 34400 - }, - { - "epoch": 2.2512266928361138, - "grad_norm": 0.7472977638244629, - "learning_rate": 9.465039418083723e-05, - "loss": 0.0954, - "step": 34410 - }, - { - "epoch": 2.2518809290153747, - "grad_norm": 0.766511082649231, - "learning_rate": 9.464625937597377e-05, - "loss": 0.0853, - "step": 34420 - }, - { - "epoch": 2.2525351651946353, - "grad_norm": 0.9715924263000488, - "learning_rate": 9.464212306418194e-05, - "loss": 0.0931, - "step": 34430 - }, - { - "epoch": 2.253189401373896, - "grad_norm": 0.8225154280662537, - "learning_rate": 9.463798524560141e-05, - "loss": 0.0927, - "step": 34440 - }, - { - "epoch": 2.2538436375531568, - "grad_norm": 0.7842962145805359, - "learning_rate": 9.463384592037178e-05, - "loss": 0.0843, - "step": 34450 - }, - { - "epoch": 2.2544978737324173, - "grad_norm": 0.817205011844635, - "learning_rate": 9.46297050886328e-05, - "loss": 0.0868, - "step": 34460 - }, - { - "epoch": 2.2551521099116782, - "grad_norm": 0.8969591856002808, - "learning_rate": 9.462556275052425e-05, - "loss": 0.0888, - "step": 34470 - }, - { - "epoch": 2.2558063460909388, - "grad_norm": 0.924565851688385, - "learning_rate": 9.46214189061859e-05, - "loss": 0.0763, - "step": 34480 - }, - { - "epoch": 2.2564605822701997, - "grad_norm": 0.7565225958824158, - "learning_rate": 9.461727355575764e-05, - "loss": 0.0943, - "step": 34490 - }, - { - "epoch": 2.2571148184494603, - "grad_norm": 0.9678487181663513, - "learning_rate": 9.461312669937938e-05, - "loss": 0.1002, - "step": 34500 - }, - { - "epoch": 2.257769054628721, - "grad_norm": 1.0543937683105469, - "learning_rate": 9.460897833719111e-05, - "loss": 0.0908, - "step": 34510 - }, - { - "epoch": 2.2584232908079818, - "grad_norm": 0.8663365840911865, - "learning_rate": 9.460482846933283e-05, - "loss": 0.0793, - "step": 34520 - }, - { - "epoch": 2.2590775269872423, - "grad_norm": 0.9217035174369812, - "learning_rate": 9.460067709594459e-05, - "loss": 0.0862, - "step": 34530 - }, - { - "epoch": 2.2597317631665033, - "grad_norm": 0.8142322301864624, - "learning_rate": 9.459652421716654e-05, - "loss": 0.0895, - "step": 34540 - }, - { - "epoch": 2.260385999345764, - "grad_norm": 0.7949792146682739, - "learning_rate": 9.459236983313884e-05, - "loss": 0.0831, - "step": 34550 - }, - { - "epoch": 2.2610402355250248, - "grad_norm": 1.0977342128753662, - "learning_rate": 9.45882139440017e-05, - "loss": 0.079, - "step": 34560 - }, - { - "epoch": 2.2616944717042853, - "grad_norm": 0.8566722869873047, - "learning_rate": 9.458405654989542e-05, - "loss": 0.0799, - "step": 34570 - }, - { - "epoch": 2.262348707883546, - "grad_norm": 0.9690262079238892, - "learning_rate": 9.457989765096028e-05, - "loss": 0.0845, - "step": 34580 - }, - { - "epoch": 2.2630029440628068, - "grad_norm": 0.684556245803833, - "learning_rate": 9.45757372473367e-05, - "loss": 0.0934, - "step": 34590 - }, - { - "epoch": 2.2636571802420673, - "grad_norm": 0.804113507270813, - "learning_rate": 9.457157533916508e-05, - "loss": 0.0828, - "step": 34600 - }, - { - "epoch": 2.2643114164213283, - "grad_norm": 0.8007469177246094, - "learning_rate": 9.456741192658589e-05, - "loss": 0.0922, - "step": 34610 - }, - { - "epoch": 2.264965652600589, - "grad_norm": 1.1300758123397827, - "learning_rate": 9.456324700973966e-05, - "loss": 0.0841, - "step": 34620 - }, - { - "epoch": 2.2656198887798498, - "grad_norm": 0.7691265344619751, - "learning_rate": 9.4559080588767e-05, - "loss": 0.088, - "step": 34630 - }, - { - "epoch": 2.2662741249591103, - "grad_norm": 0.6707723140716553, - "learning_rate": 9.455491266380849e-05, - "loss": 0.0842, - "step": 34640 - }, - { - "epoch": 2.266928361138371, - "grad_norm": 0.848480224609375, - "learning_rate": 9.455074323500484e-05, - "loss": 0.0938, - "step": 34650 - }, - { - "epoch": 2.267582597317632, - "grad_norm": 0.9524247050285339, - "learning_rate": 9.454657230249675e-05, - "loss": 0.086, - "step": 34660 - }, - { - "epoch": 2.2682368334968923, - "grad_norm": 0.8271149396896362, - "learning_rate": 9.454239986642502e-05, - "loss": 0.086, - "step": 34670 - }, - { - "epoch": 2.2688910696761533, - "grad_norm": 0.9054349064826965, - "learning_rate": 9.453822592693049e-05, - "loss": 0.0883, - "step": 34680 - }, - { - "epoch": 2.269545305855414, - "grad_norm": 0.9008976221084595, - "learning_rate": 9.453405048415402e-05, - "loss": 0.084, - "step": 34690 - }, - { - "epoch": 2.2701995420346743, - "grad_norm": 0.7716291546821594, - "learning_rate": 9.452987353823654e-05, - "loss": 0.0858, - "step": 34700 - }, - { - "epoch": 2.2708537782139353, - "grad_norm": 0.7925845384597778, - "learning_rate": 9.452569508931908e-05, - "loss": 0.0862, - "step": 34710 - }, - { - "epoch": 2.271508014393196, - "grad_norm": 0.7443545460700989, - "learning_rate": 9.452151513754262e-05, - "loss": 0.0817, - "step": 34720 - }, - { - "epoch": 2.272162250572457, - "grad_norm": 0.7486448884010315, - "learning_rate": 9.451733368304825e-05, - "loss": 0.0779, - "step": 34730 - }, - { - "epoch": 2.2728164867517173, - "grad_norm": 0.8984254002571106, - "learning_rate": 9.451315072597713e-05, - "loss": 0.0854, - "step": 34740 - }, - { - "epoch": 2.273470722930978, - "grad_norm": 0.880130410194397, - "learning_rate": 9.450896626647041e-05, - "loss": 0.09, - "step": 34750 - }, - { - "epoch": 2.274124959110239, - "grad_norm": 0.8903000354766846, - "learning_rate": 9.450478030466938e-05, - "loss": 0.0876, - "step": 34760 - }, - { - "epoch": 2.2747791952894993, - "grad_norm": 0.8735268115997314, - "learning_rate": 9.450059284071529e-05, - "loss": 0.1018, - "step": 34770 - }, - { - "epoch": 2.2754334314687603, - "grad_norm": 0.7384045720100403, - "learning_rate": 9.449640387474948e-05, - "loss": 0.0849, - "step": 34780 - }, - { - "epoch": 2.276087667648021, - "grad_norm": 0.8934576511383057, - "learning_rate": 9.449221340691333e-05, - "loss": 0.0834, - "step": 34790 - }, - { - "epoch": 2.276741903827282, - "grad_norm": 1.060774564743042, - "learning_rate": 9.448802143734831e-05, - "loss": 0.0932, - "step": 34800 - }, - { - "epoch": 2.2773961400065423, - "grad_norm": 0.825049638748169, - "learning_rate": 9.448382796619589e-05, - "loss": 0.0854, - "step": 34810 - }, - { - "epoch": 2.278050376185803, - "grad_norm": 0.8421124815940857, - "learning_rate": 9.44796329935976e-05, - "loss": 0.0836, - "step": 34820 - }, - { - "epoch": 2.278704612365064, - "grad_norm": 0.7823123335838318, - "learning_rate": 9.447543651969506e-05, - "loss": 0.0852, - "step": 34830 - }, - { - "epoch": 2.2793588485443244, - "grad_norm": 1.0496047735214233, - "learning_rate": 9.447123854462989e-05, - "loss": 0.0891, - "step": 34840 - }, - { - "epoch": 2.2800130847235853, - "grad_norm": 0.8308643698692322, - "learning_rate": 9.44670390685438e-05, - "loss": 0.0831, - "step": 34850 - }, - { - "epoch": 2.280667320902846, - "grad_norm": 0.8488014936447144, - "learning_rate": 9.44628380915785e-05, - "loss": 0.0911, - "step": 34860 - }, - { - "epoch": 2.281321557082107, - "grad_norm": 0.9563519358634949, - "learning_rate": 9.445863561387582e-05, - "loss": 0.0957, - "step": 34870 - }, - { - "epoch": 2.2819757932613673, - "grad_norm": 0.8364535570144653, - "learning_rate": 9.44544316355776e-05, - "loss": 0.0821, - "step": 34880 - }, - { - "epoch": 2.282630029440628, - "grad_norm": 0.918013870716095, - "learning_rate": 9.445022615682571e-05, - "loss": 0.0789, - "step": 34890 - }, - { - "epoch": 2.283284265619889, - "grad_norm": 0.7931444048881531, - "learning_rate": 9.44460191777621e-05, - "loss": 0.0908, - "step": 34900 - }, - { - "epoch": 2.2839385017991494, - "grad_norm": 0.9308255314826965, - "learning_rate": 9.44418106985288e-05, - "loss": 0.0819, - "step": 34910 - }, - { - "epoch": 2.2845927379784103, - "grad_norm": 1.0205659866333008, - "learning_rate": 9.443760071926784e-05, - "loss": 0.0908, - "step": 34920 - }, - { - "epoch": 2.285246974157671, - "grad_norm": 0.8457900881767273, - "learning_rate": 9.44333892401213e-05, - "loss": 0.0996, - "step": 34930 - }, - { - "epoch": 2.285901210336932, - "grad_norm": 0.8027629852294922, - "learning_rate": 9.442917626123136e-05, - "loss": 0.0913, - "step": 34940 - }, - { - "epoch": 2.2865554465161924, - "grad_norm": 0.8995937705039978, - "learning_rate": 9.442496178274019e-05, - "loss": 0.0892, - "step": 34950 - }, - { - "epoch": 2.287209682695453, - "grad_norm": 0.9904996752738953, - "learning_rate": 9.442074580479004e-05, - "loss": 0.0921, - "step": 34960 - }, - { - "epoch": 2.287863918874714, - "grad_norm": 0.9785271286964417, - "learning_rate": 9.441652832752324e-05, - "loss": 0.0801, - "step": 34970 - }, - { - "epoch": 2.2885181550539744, - "grad_norm": 0.8374738097190857, - "learning_rate": 9.441230935108212e-05, - "loss": 0.0855, - "step": 34980 - }, - { - "epoch": 2.2891723912332353, - "grad_norm": 0.8071938157081604, - "learning_rate": 9.440808887560907e-05, - "loss": 0.0967, - "step": 34990 - }, - { - "epoch": 2.289826627412496, - "grad_norm": 1.0699573755264282, - "learning_rate": 9.440386690124656e-05, - "loss": 0.0865, - "step": 35000 - }, - { - "epoch": 2.290480863591757, - "grad_norm": 1.1547331809997559, - "learning_rate": 9.43996434281371e-05, - "loss": 0.085, - "step": 35010 - }, - { - "epoch": 2.2911350997710174, - "grad_norm": 0.8544806838035583, - "learning_rate": 9.439541845642322e-05, - "loss": 0.0808, - "step": 35020 - }, - { - "epoch": 2.291789335950278, - "grad_norm": 0.8750424385070801, - "learning_rate": 9.439119198624755e-05, - "loss": 0.0826, - "step": 35030 - }, - { - "epoch": 2.292443572129539, - "grad_norm": 1.0498063564300537, - "learning_rate": 9.438696401775271e-05, - "loss": 0.0912, - "step": 35040 - }, - { - "epoch": 2.2930978083087994, - "grad_norm": 0.7838863730430603, - "learning_rate": 9.438273455108144e-05, - "loss": 0.0985, - "step": 35050 - }, - { - "epoch": 2.2937520444880604, - "grad_norm": 0.7744349241256714, - "learning_rate": 9.437850358637648e-05, - "loss": 0.0799, - "step": 35060 - }, - { - "epoch": 2.294406280667321, - "grad_norm": 0.7421576380729675, - "learning_rate": 9.437427112378063e-05, - "loss": 0.0907, - "step": 35070 - }, - { - "epoch": 2.295060516846582, - "grad_norm": 0.7675076723098755, - "learning_rate": 9.437003716343676e-05, - "loss": 0.0768, - "step": 35080 - }, - { - "epoch": 2.2957147530258424, - "grad_norm": 0.6893305778503418, - "learning_rate": 9.436580170548777e-05, - "loss": 0.076, - "step": 35090 - }, - { - "epoch": 2.296368989205103, - "grad_norm": 0.7479074001312256, - "learning_rate": 9.436156475007662e-05, - "loss": 0.0824, - "step": 35100 - }, - { - "epoch": 2.297023225384364, - "grad_norm": 0.7704381346702576, - "learning_rate": 9.435732629734633e-05, - "loss": 0.0882, - "step": 35110 - }, - { - "epoch": 2.2976774615636244, - "grad_norm": 0.9246371984481812, - "learning_rate": 9.435308634743992e-05, - "loss": 0.0812, - "step": 35120 - }, - { - "epoch": 2.2983316977428854, - "grad_norm": 0.8029863834381104, - "learning_rate": 9.434884490050053e-05, - "loss": 0.0976, - "step": 35130 - }, - { - "epoch": 2.298985933922146, - "grad_norm": 0.8177743554115295, - "learning_rate": 9.434460195667133e-05, - "loss": 0.0784, - "step": 35140 - }, - { - "epoch": 2.2996401701014064, - "grad_norm": 0.8609619140625, - "learning_rate": 9.434035751609551e-05, - "loss": 0.1004, - "step": 35150 - }, - { - "epoch": 2.3002944062806674, - "grad_norm": 0.7648687958717346, - "learning_rate": 9.433611157891633e-05, - "loss": 0.085, - "step": 35160 - }, - { - "epoch": 2.300948642459928, - "grad_norm": 0.9224421977996826, - "learning_rate": 9.433186414527713e-05, - "loss": 0.0882, - "step": 35170 - }, - { - "epoch": 2.301602878639189, - "grad_norm": 0.793401837348938, - "learning_rate": 9.432761521532123e-05, - "loss": 0.0769, - "step": 35180 - }, - { - "epoch": 2.3022571148184494, - "grad_norm": 0.86360102891922, - "learning_rate": 9.432336478919206e-05, - "loss": 0.0851, - "step": 35190 - }, - { - "epoch": 2.30291135099771, - "grad_norm": 0.860308825969696, - "learning_rate": 9.43191128670331e-05, - "loss": 0.0844, - "step": 35200 - }, - { - "epoch": 2.303565587176971, - "grad_norm": 0.8408114314079285, - "learning_rate": 9.431485944898784e-05, - "loss": 0.0892, - "step": 35210 - }, - { - "epoch": 2.3042198233562314, - "grad_norm": 0.8600111603736877, - "learning_rate": 9.431060453519986e-05, - "loss": 0.0838, - "step": 35220 - }, - { - "epoch": 2.3048740595354924, - "grad_norm": 0.9598779082298279, - "learning_rate": 9.430634812581276e-05, - "loss": 0.0891, - "step": 35230 - }, - { - "epoch": 2.305528295714753, - "grad_norm": 0.9931259155273438, - "learning_rate": 9.430209022097023e-05, - "loss": 0.08, - "step": 35240 - }, - { - "epoch": 2.306182531894014, - "grad_norm": 0.687402069568634, - "learning_rate": 9.429783082081596e-05, - "loss": 0.0882, - "step": 35250 - }, - { - "epoch": 2.3068367680732744, - "grad_norm": 0.8022744059562683, - "learning_rate": 9.429356992549372e-05, - "loss": 0.0828, - "step": 35260 - }, - { - "epoch": 2.307491004252535, - "grad_norm": 0.9008263945579529, - "learning_rate": 9.428930753514734e-05, - "loss": 0.0775, - "step": 35270 - }, - { - "epoch": 2.308145240431796, - "grad_norm": 0.8905544877052307, - "learning_rate": 9.428504364992066e-05, - "loss": 0.0863, - "step": 35280 - }, - { - "epoch": 2.3087994766110564, - "grad_norm": 0.9496570825576782, - "learning_rate": 9.428077826995762e-05, - "loss": 0.0863, - "step": 35290 - }, - { - "epoch": 2.3094537127903174, - "grad_norm": 0.8719865679740906, - "learning_rate": 9.427651139540218e-05, - "loss": 0.0777, - "step": 35300 - }, - { - "epoch": 2.310107948969578, - "grad_norm": 1.0145576000213623, - "learning_rate": 9.427224302639837e-05, - "loss": 0.089, - "step": 35310 - }, - { - "epoch": 2.310762185148839, - "grad_norm": 0.8923918604850769, - "learning_rate": 9.426797316309026e-05, - "loss": 0.0766, - "step": 35320 - }, - { - "epoch": 2.3114164213280994, - "grad_norm": 0.7120718955993652, - "learning_rate": 9.426370180562195e-05, - "loss": 0.0789, - "step": 35330 - }, - { - "epoch": 2.31207065750736, - "grad_norm": 0.9214240312576294, - "learning_rate": 9.425942895413761e-05, - "loss": 0.082, - "step": 35340 - }, - { - "epoch": 2.312724893686621, - "grad_norm": 0.7993736863136292, - "learning_rate": 9.425515460878148e-05, - "loss": 0.078, - "step": 35350 - }, - { - "epoch": 2.3133791298658815, - "grad_norm": 0.7409219741821289, - "learning_rate": 9.42508787696978e-05, - "loss": 0.0859, - "step": 35360 - }, - { - "epoch": 2.3140333660451424, - "grad_norm": 0.8568753600120544, - "learning_rate": 9.424660143703092e-05, - "loss": 0.0974, - "step": 35370 - }, - { - "epoch": 2.314687602224403, - "grad_norm": 0.8471707701683044, - "learning_rate": 9.424232261092521e-05, - "loss": 0.0926, - "step": 35380 - }, - { - "epoch": 2.315341838403664, - "grad_norm": 0.9042620658874512, - "learning_rate": 9.423804229152507e-05, - "loss": 0.084, - "step": 35390 - }, - { - "epoch": 2.3159960745829244, - "grad_norm": 0.9078444838523865, - "learning_rate": 9.423376047897499e-05, - "loss": 0.0903, - "step": 35400 - }, - { - "epoch": 2.316650310762185, - "grad_norm": 0.8419270515441895, - "learning_rate": 9.422947717341948e-05, - "loss": 0.0791, - "step": 35410 - }, - { - "epoch": 2.317304546941446, - "grad_norm": 0.8305777907371521, - "learning_rate": 9.422519237500313e-05, - "loss": 0.092, - "step": 35420 - }, - { - "epoch": 2.3179587831207065, - "grad_norm": 0.7356106638908386, - "learning_rate": 9.422090608387055e-05, - "loss": 0.0809, - "step": 35430 - }, - { - "epoch": 2.3186130192999674, - "grad_norm": 0.9605696201324463, - "learning_rate": 9.421661830016642e-05, - "loss": 0.0746, - "step": 35440 - }, - { - "epoch": 2.319267255479228, - "grad_norm": 1.005142092704773, - "learning_rate": 9.421232902403545e-05, - "loss": 0.0917, - "step": 35450 - }, - { - "epoch": 2.319921491658489, - "grad_norm": 0.965522289276123, - "learning_rate": 9.420803825562243e-05, - "loss": 0.0977, - "step": 35460 - }, - { - "epoch": 2.3205757278377495, - "grad_norm": 0.860283613204956, - "learning_rate": 9.420374599507217e-05, - "loss": 0.0872, - "step": 35470 - }, - { - "epoch": 2.32122996401701, - "grad_norm": 0.834602952003479, - "learning_rate": 9.419945224252955e-05, - "loss": 0.1007, - "step": 35480 - }, - { - "epoch": 2.321884200196271, - "grad_norm": 0.8806980848312378, - "learning_rate": 9.419515699813952e-05, - "loss": 0.0795, - "step": 35490 - }, - { - "epoch": 2.3225384363755315, - "grad_norm": 1.0480656623840332, - "learning_rate": 9.419086026204703e-05, - "loss": 0.0898, - "step": 35500 - }, - { - "epoch": 2.3231926725547924, - "grad_norm": 0.8428975343704224, - "learning_rate": 9.41865620343971e-05, - "loss": 0.0792, - "step": 35510 - }, - { - "epoch": 2.323846908734053, - "grad_norm": 0.9427582621574402, - "learning_rate": 9.418226231533482e-05, - "loss": 0.0796, - "step": 35520 - }, - { - "epoch": 2.324501144913314, - "grad_norm": 0.7207076549530029, - "learning_rate": 9.417796110500532e-05, - "loss": 0.0826, - "step": 35530 - }, - { - "epoch": 2.3251553810925745, - "grad_norm": 0.9341986179351807, - "learning_rate": 9.417365840355377e-05, - "loss": 0.0965, - "step": 35540 - }, - { - "epoch": 2.325809617271835, - "grad_norm": 0.8259122371673584, - "learning_rate": 9.416935421112541e-05, - "loss": 0.0909, - "step": 35550 - }, - { - "epoch": 2.326463853451096, - "grad_norm": 0.8562160134315491, - "learning_rate": 9.41650485278655e-05, - "loss": 0.0918, - "step": 35560 - }, - { - "epoch": 2.3271180896303565, - "grad_norm": 0.9982649683952332, - "learning_rate": 9.416074135391937e-05, - "loss": 0.0832, - "step": 35570 - }, - { - "epoch": 2.3277723258096175, - "grad_norm": 0.9082480072975159, - "learning_rate": 9.415643268943239e-05, - "loss": 0.0969, - "step": 35580 - }, - { - "epoch": 2.328426561988878, - "grad_norm": 0.8344404101371765, - "learning_rate": 9.415212253455004e-05, - "loss": 0.0815, - "step": 35590 - }, - { - "epoch": 2.3290807981681385, - "grad_norm": 0.8839426040649414, - "learning_rate": 9.414781088941772e-05, - "loss": 0.0899, - "step": 35600 - }, - { - "epoch": 2.3297350343473995, - "grad_norm": 0.7677926421165466, - "learning_rate": 9.414349775418104e-05, - "loss": 0.0774, - "step": 35610 - }, - { - "epoch": 2.33038927052666, - "grad_norm": 1.086987853050232, - "learning_rate": 9.413918312898551e-05, - "loss": 0.0909, - "step": 35620 - }, - { - "epoch": 2.331043506705921, - "grad_norm": 0.8547496795654297, - "learning_rate": 9.41348670139768e-05, - "loss": 0.0781, - "step": 35630 - }, - { - "epoch": 2.3316977428851815, - "grad_norm": 0.8789479732513428, - "learning_rate": 9.413054940930057e-05, - "loss": 0.0926, - "step": 35640 - }, - { - "epoch": 2.332351979064442, - "grad_norm": 0.8583962917327881, - "learning_rate": 9.412623031510257e-05, - "loss": 0.0854, - "step": 35650 - }, - { - "epoch": 2.333006215243703, - "grad_norm": 0.8400371670722961, - "learning_rate": 9.412190973152858e-05, - "loss": 0.0908, - "step": 35660 - }, - { - "epoch": 2.3336604514229635, - "grad_norm": 0.8149643540382385, - "learning_rate": 9.411758765872441e-05, - "loss": 0.0837, - "step": 35670 - }, - { - "epoch": 2.3343146876022245, - "grad_norm": 0.7172325849533081, - "learning_rate": 9.411326409683596e-05, - "loss": 0.0828, - "step": 35680 - }, - { - "epoch": 2.334968923781485, - "grad_norm": 0.9276403188705444, - "learning_rate": 9.410893904600917e-05, - "loss": 0.083, - "step": 35690 - }, - { - "epoch": 2.335623159960746, - "grad_norm": 0.7993019223213196, - "learning_rate": 9.410461250638997e-05, - "loss": 0.0831, - "step": 35700 - }, - { - "epoch": 2.3362773961400065, - "grad_norm": 0.7336340546607971, - "learning_rate": 9.410028447812447e-05, - "loss": 0.0866, - "step": 35710 - }, - { - "epoch": 2.336931632319267, - "grad_norm": 0.9019221067428589, - "learning_rate": 9.409595496135869e-05, - "loss": 0.0892, - "step": 35720 - }, - { - "epoch": 2.337585868498528, - "grad_norm": 0.8154904842376709, - "learning_rate": 9.409162395623879e-05, - "loss": 0.0865, - "step": 35730 - }, - { - "epoch": 2.3382401046777885, - "grad_norm": 0.8226253986358643, - "learning_rate": 9.408729146291093e-05, - "loss": 0.0901, - "step": 35740 - }, - { - "epoch": 2.3388943408570495, - "grad_norm": 0.7311269640922546, - "learning_rate": 9.408295748152138e-05, - "loss": 0.0889, - "step": 35750 - }, - { - "epoch": 2.33954857703631, - "grad_norm": 0.8451585173606873, - "learning_rate": 9.40786220122164e-05, - "loss": 0.0848, - "step": 35760 - }, - { - "epoch": 2.340202813215571, - "grad_norm": 1.0941405296325684, - "learning_rate": 9.407428505514233e-05, - "loss": 0.0871, - "step": 35770 - }, - { - "epoch": 2.3408570493948315, - "grad_norm": 0.9233847260475159, - "learning_rate": 9.406994661044554e-05, - "loss": 0.0976, - "step": 35780 - }, - { - "epoch": 2.341511285574092, - "grad_norm": 0.8936124444007874, - "learning_rate": 9.406560667827248e-05, - "loss": 0.0839, - "step": 35790 - }, - { - "epoch": 2.342165521753353, - "grad_norm": 0.9002397060394287, - "learning_rate": 9.406126525876963e-05, - "loss": 0.0781, - "step": 35800 - }, - { - "epoch": 2.3428197579326135, - "grad_norm": 0.7002133131027222, - "learning_rate": 9.405692235208353e-05, - "loss": 0.0845, - "step": 35810 - }, - { - "epoch": 2.3434739941118745, - "grad_norm": 0.8459717035293579, - "learning_rate": 9.405257795836074e-05, - "loss": 0.0769, - "step": 35820 - }, - { - "epoch": 2.344128230291135, - "grad_norm": 1.1364301443099976, - "learning_rate": 9.404823207774791e-05, - "loss": 0.0878, - "step": 35830 - }, - { - "epoch": 2.344782466470396, - "grad_norm": 0.8352745771408081, - "learning_rate": 9.404388471039173e-05, - "loss": 0.0828, - "step": 35840 - }, - { - "epoch": 2.3454367026496565, - "grad_norm": 0.8302096128463745, - "learning_rate": 9.403953585643895e-05, - "loss": 0.0885, - "step": 35850 - }, - { - "epoch": 2.346090938828917, - "grad_norm": 0.7666718363761902, - "learning_rate": 9.403518551603632e-05, - "loss": 0.0813, - "step": 35860 - }, - { - "epoch": 2.346745175008178, - "grad_norm": 0.9522207975387573, - "learning_rate": 9.40308336893307e-05, - "loss": 0.0909, - "step": 35870 - }, - { - "epoch": 2.3473994111874386, - "grad_norm": 0.7051165699958801, - "learning_rate": 9.402648037646895e-05, - "loss": 0.0871, - "step": 35880 - }, - { - "epoch": 2.3480536473666995, - "grad_norm": 0.7525814771652222, - "learning_rate": 9.402212557759805e-05, - "loss": 0.09, - "step": 35890 - }, - { - "epoch": 2.34870788354596, - "grad_norm": 1.1107085943222046, - "learning_rate": 9.401776929286494e-05, - "loss": 0.083, - "step": 35900 - }, - { - "epoch": 2.349362119725221, - "grad_norm": 0.7754766941070557, - "learning_rate": 9.401341152241668e-05, - "loss": 0.0819, - "step": 35910 - }, - { - "epoch": 2.3500163559044815, - "grad_norm": 0.9919909834861755, - "learning_rate": 9.400905226640036e-05, - "loss": 0.0859, - "step": 35920 - }, - { - "epoch": 2.350670592083742, - "grad_norm": 0.7626148462295532, - "learning_rate": 9.40046915249631e-05, - "loss": 0.0889, - "step": 35930 - }, - { - "epoch": 2.351324828263003, - "grad_norm": 0.8603907823562622, - "learning_rate": 9.40003292982521e-05, - "loss": 0.0847, - "step": 35940 - }, - { - "epoch": 2.3519790644422636, - "grad_norm": 0.8849925994873047, - "learning_rate": 9.399596558641459e-05, - "loss": 0.0782, - "step": 35950 - }, - { - "epoch": 2.3526333006215245, - "grad_norm": 0.9044029712677002, - "learning_rate": 9.399160038959785e-05, - "loss": 0.0841, - "step": 35960 - }, - { - "epoch": 2.353287536800785, - "grad_norm": 0.7774613499641418, - "learning_rate": 9.398723370794923e-05, - "loss": 0.0725, - "step": 35970 - }, - { - "epoch": 2.353941772980046, - "grad_norm": 0.7712434530258179, - "learning_rate": 9.398286554161612e-05, - "loss": 0.0855, - "step": 35980 - }, - { - "epoch": 2.3545960091593066, - "grad_norm": 0.8731211423873901, - "learning_rate": 9.397849589074593e-05, - "loss": 0.0941, - "step": 35990 - }, - { - "epoch": 2.355250245338567, - "grad_norm": 0.8799338340759277, - "learning_rate": 9.397412475548618e-05, - "loss": 0.0835, - "step": 36000 - }, - { - "epoch": 2.355904481517828, - "grad_norm": 0.8053388595581055, - "learning_rate": 9.396975213598439e-05, - "loss": 0.0872, - "step": 36010 - }, - { - "epoch": 2.3565587176970886, - "grad_norm": 0.8215650320053101, - "learning_rate": 9.396537803238815e-05, - "loss": 0.0894, - "step": 36020 - }, - { - "epoch": 2.3572129538763495, - "grad_norm": 0.7709128856658936, - "learning_rate": 9.39610024448451e-05, - "loss": 0.0852, - "step": 36030 - }, - { - "epoch": 2.35786719005561, - "grad_norm": 0.8839540481567383, - "learning_rate": 9.395662537350292e-05, - "loss": 0.0874, - "step": 36040 - }, - { - "epoch": 2.3585214262348706, - "grad_norm": 0.8985713124275208, - "learning_rate": 9.395224681850935e-05, - "loss": 0.0774, - "step": 36050 - }, - { - "epoch": 2.3591756624141316, - "grad_norm": 0.7541700601577759, - "learning_rate": 9.39478667800122e-05, - "loss": 0.0782, - "step": 36060 - }, - { - "epoch": 2.359829898593392, - "grad_norm": 0.6992083191871643, - "learning_rate": 9.394348525815928e-05, - "loss": 0.0821, - "step": 36070 - }, - { - "epoch": 2.360484134772653, - "grad_norm": 0.9433765411376953, - "learning_rate": 9.393910225309848e-05, - "loss": 0.0825, - "step": 36080 - }, - { - "epoch": 2.3611383709519136, - "grad_norm": 0.892088770866394, - "learning_rate": 9.393471776497776e-05, - "loss": 0.083, - "step": 36090 - }, - { - "epoch": 2.361792607131174, - "grad_norm": 0.7801177501678467, - "learning_rate": 9.393033179394506e-05, - "loss": 0.0998, - "step": 36100 - }, - { - "epoch": 2.362446843310435, - "grad_norm": 0.8416787981987, - "learning_rate": 9.392594434014847e-05, - "loss": 0.0978, - "step": 36110 - }, - { - "epoch": 2.3631010794896956, - "grad_norm": 0.8340370059013367, - "learning_rate": 9.392155540373606e-05, - "loss": 0.0899, - "step": 36120 - }, - { - "epoch": 2.3637553156689566, - "grad_norm": 0.8223138451576233, - "learning_rate": 9.391716498485597e-05, - "loss": 0.0878, - "step": 36130 - }, - { - "epoch": 2.364409551848217, - "grad_norm": 1.106229305267334, - "learning_rate": 9.391277308365638e-05, - "loss": 0.0879, - "step": 36140 - }, - { - "epoch": 2.365063788027478, - "grad_norm": 0.8265957832336426, - "learning_rate": 9.390837970028553e-05, - "loss": 0.0796, - "step": 36150 - }, - { - "epoch": 2.3657180242067386, - "grad_norm": 0.8237302303314209, - "learning_rate": 9.390398483489171e-05, - "loss": 0.0843, - "step": 36160 - }, - { - "epoch": 2.366372260385999, - "grad_norm": 0.819557249546051, - "learning_rate": 9.389958848762327e-05, - "loss": 0.0811, - "step": 36170 - }, - { - "epoch": 2.36702649656526, - "grad_norm": 0.9804088473320007, - "learning_rate": 9.389519065862858e-05, - "loss": 0.0887, - "step": 36180 - }, - { - "epoch": 2.3676807327445206, - "grad_norm": 1.014117956161499, - "learning_rate": 9.389079134805609e-05, - "loss": 0.0732, - "step": 36190 - }, - { - "epoch": 2.3683349689237816, - "grad_norm": 0.8419691920280457, - "learning_rate": 9.388639055605428e-05, - "loss": 0.0909, - "step": 36200 - }, - { - "epoch": 2.368989205103042, - "grad_norm": 0.9921837449073792, - "learning_rate": 9.388198828277169e-05, - "loss": 0.0783, - "step": 36210 - }, - { - "epoch": 2.369643441282303, - "grad_norm": 0.9146479964256287, - "learning_rate": 9.387758452835692e-05, - "loss": 0.078, - "step": 36220 - }, - { - "epoch": 2.3702976774615636, - "grad_norm": 0.7768714427947998, - "learning_rate": 9.387317929295859e-05, - "loss": 0.084, - "step": 36230 - }, - { - "epoch": 2.370951913640824, - "grad_norm": 0.8944301009178162, - "learning_rate": 9.38687725767254e-05, - "loss": 0.0873, - "step": 36240 - }, - { - "epoch": 2.371606149820085, - "grad_norm": 0.7859957814216614, - "learning_rate": 9.38643643798061e-05, - "loss": 0.0849, - "step": 36250 - }, - { - "epoch": 2.3722603859993456, - "grad_norm": 0.8967961668968201, - "learning_rate": 9.385995470234944e-05, - "loss": 0.075, - "step": 36260 - }, - { - "epoch": 2.3729146221786066, - "grad_norm": 0.7501564621925354, - "learning_rate": 9.38555435445043e-05, - "loss": 0.093, - "step": 36270 - }, - { - "epoch": 2.373568858357867, - "grad_norm": 0.7810158729553223, - "learning_rate": 9.385113090641953e-05, - "loss": 0.0823, - "step": 36280 - }, - { - "epoch": 2.374223094537128, - "grad_norm": 0.8977556228637695, - "learning_rate": 9.38467167882441e-05, - "loss": 0.0823, - "step": 36290 - }, - { - "epoch": 2.3748773307163886, - "grad_norm": 0.9205357432365417, - "learning_rate": 9.384230119012698e-05, - "loss": 0.0869, - "step": 36300 - }, - { - "epoch": 2.375531566895649, - "grad_norm": 0.7959444522857666, - "learning_rate": 9.383788411221724e-05, - "loss": 0.0865, - "step": 36310 - }, - { - "epoch": 2.37618580307491, - "grad_norm": 0.7368982434272766, - "learning_rate": 9.383346555466392e-05, - "loss": 0.0879, - "step": 36320 - }, - { - "epoch": 2.3768400392541706, - "grad_norm": 1.0566291809082031, - "learning_rate": 9.382904551761618e-05, - "loss": 0.0976, - "step": 36330 - }, - { - "epoch": 2.3774942754334316, - "grad_norm": 0.7619825601577759, - "learning_rate": 9.38246240012232e-05, - "loss": 0.0847, - "step": 36340 - }, - { - "epoch": 2.378148511612692, - "grad_norm": 0.7798367738723755, - "learning_rate": 9.382020100563425e-05, - "loss": 0.0897, - "step": 36350 - }, - { - "epoch": 2.378802747791953, - "grad_norm": 0.8497381210327148, - "learning_rate": 9.381577653099858e-05, - "loss": 0.0823, - "step": 36360 - }, - { - "epoch": 2.3794569839712136, - "grad_norm": 0.8219572901725769, - "learning_rate": 9.381135057746552e-05, - "loss": 0.0889, - "step": 36370 - }, - { - "epoch": 2.380111220150474, - "grad_norm": 0.8166351914405823, - "learning_rate": 9.380692314518451e-05, - "loss": 0.0907, - "step": 36380 - }, - { - "epoch": 2.380765456329735, - "grad_norm": 1.0758252143859863, - "learning_rate": 9.380249423430494e-05, - "loss": 0.096, - "step": 36390 - }, - { - "epoch": 2.3814196925089957, - "grad_norm": 0.8256650567054749, - "learning_rate": 9.379806384497633e-05, - "loss": 0.0718, - "step": 36400 - }, - { - "epoch": 2.3820739286882566, - "grad_norm": 0.8822137117385864, - "learning_rate": 9.379363197734818e-05, - "loss": 0.095, - "step": 36410 - }, - { - "epoch": 2.382728164867517, - "grad_norm": 0.9527086615562439, - "learning_rate": 9.37891986315701e-05, - "loss": 0.0912, - "step": 36420 - }, - { - "epoch": 2.383382401046778, - "grad_norm": 0.8427805304527283, - "learning_rate": 9.378476380779174e-05, - "loss": 0.0898, - "step": 36430 - }, - { - "epoch": 2.3840366372260386, - "grad_norm": 0.8125025629997253, - "learning_rate": 9.378032750616277e-05, - "loss": 0.0719, - "step": 36440 - }, - { - "epoch": 2.384690873405299, - "grad_norm": 0.7970302700996399, - "learning_rate": 9.377588972683292e-05, - "loss": 0.0855, - "step": 36450 - }, - { - "epoch": 2.38534510958456, - "grad_norm": 0.8656693696975708, - "learning_rate": 9.377145046995198e-05, - "loss": 0.0804, - "step": 36460 - }, - { - "epoch": 2.3859993457638207, - "grad_norm": 0.7493268251419067, - "learning_rate": 9.37670097356698e-05, - "loss": 0.0845, - "step": 36470 - }, - { - "epoch": 2.3866535819430816, - "grad_norm": 0.8390613794326782, - "learning_rate": 9.376256752413626e-05, - "loss": 0.0772, - "step": 36480 - }, - { - "epoch": 2.387307818122342, - "grad_norm": 0.8210413455963135, - "learning_rate": 9.37581238355013e-05, - "loss": 0.0785, - "step": 36490 - }, - { - "epoch": 2.3879620543016027, - "grad_norm": 0.8246368765830994, - "learning_rate": 9.375367866991488e-05, - "loss": 0.0812, - "step": 36500 - }, - { - "epoch": 2.3886162904808637, - "grad_norm": 0.7549490332603455, - "learning_rate": 9.374923202752707e-05, - "loss": 0.0878, - "step": 36510 - }, - { - "epoch": 2.389270526660124, - "grad_norm": 0.8099923729896545, - "learning_rate": 9.374478390848794e-05, - "loss": 0.087, - "step": 36520 - }, - { - "epoch": 2.389924762839385, - "grad_norm": 0.8852878212928772, - "learning_rate": 9.374033431294763e-05, - "loss": 0.0897, - "step": 36530 - }, - { - "epoch": 2.3905789990186457, - "grad_norm": 0.9920669198036194, - "learning_rate": 9.373588324105634e-05, - "loss": 0.0871, - "step": 36540 - }, - { - "epoch": 2.391233235197906, - "grad_norm": 0.8708438873291016, - "learning_rate": 9.373143069296426e-05, - "loss": 0.0796, - "step": 36550 - }, - { - "epoch": 2.391887471377167, - "grad_norm": 0.751469612121582, - "learning_rate": 9.372697666882171e-05, - "loss": 0.0851, - "step": 36560 - }, - { - "epoch": 2.3925417075564277, - "grad_norm": 0.8722557425498962, - "learning_rate": 9.372252116877903e-05, - "loss": 0.0951, - "step": 36570 - }, - { - "epoch": 2.3931959437356887, - "grad_norm": 0.844551146030426, - "learning_rate": 9.371806419298659e-05, - "loss": 0.08, - "step": 36580 - }, - { - "epoch": 2.393850179914949, - "grad_norm": 0.8585530519485474, - "learning_rate": 9.371360574159483e-05, - "loss": 0.086, - "step": 36590 - }, - { - "epoch": 2.39450441609421, - "grad_norm": 0.9330928921699524, - "learning_rate": 9.370914581475423e-05, - "loss": 0.0793, - "step": 36600 - }, - { - "epoch": 2.3951586522734707, - "grad_norm": 1.0671617984771729, - "learning_rate": 9.370468441261532e-05, - "loss": 0.0737, - "step": 36610 - }, - { - "epoch": 2.395812888452731, - "grad_norm": 0.8291199803352356, - "learning_rate": 9.370022153532871e-05, - "loss": 0.0971, - "step": 36620 - }, - { - "epoch": 2.396467124631992, - "grad_norm": 1.0896260738372803, - "learning_rate": 9.3695757183045e-05, - "loss": 0.0821, - "step": 36630 - }, - { - "epoch": 2.3971213608112527, - "grad_norm": 0.7931303977966309, - "learning_rate": 9.369129135591491e-05, - "loss": 0.0857, - "step": 36640 - }, - { - "epoch": 2.3977755969905137, - "grad_norm": 0.9220706224441528, - "learning_rate": 9.368682405408912e-05, - "loss": 0.0844, - "step": 36650 - }, - { - "epoch": 2.398429833169774, - "grad_norm": 0.9693981409072876, - "learning_rate": 9.368235527771847e-05, - "loss": 0.091, - "step": 36660 - }, - { - "epoch": 2.399084069349035, - "grad_norm": 0.8742493987083435, - "learning_rate": 9.367788502695376e-05, - "loss": 0.0868, - "step": 36670 - }, - { - "epoch": 2.3997383055282957, - "grad_norm": 0.8460095524787903, - "learning_rate": 9.367341330194587e-05, - "loss": 0.0846, - "step": 36680 - }, - { - "epoch": 2.4003925417075562, - "grad_norm": 0.883783757686615, - "learning_rate": 9.366894010284576e-05, - "loss": 0.0819, - "step": 36690 - }, - { - "epoch": 2.401046777886817, - "grad_norm": 0.8907153606414795, - "learning_rate": 9.366446542980439e-05, - "loss": 0.0831, - "step": 36700 - }, - { - "epoch": 2.4017010140660777, - "grad_norm": 0.8098263144493103, - "learning_rate": 9.36599892829728e-05, - "loss": 0.0865, - "step": 36710 - }, - { - "epoch": 2.4023552502453387, - "grad_norm": 0.9012731909751892, - "learning_rate": 9.365551166250206e-05, - "loss": 0.0835, - "step": 36720 - }, - { - "epoch": 2.403009486424599, - "grad_norm": 0.817130446434021, - "learning_rate": 9.365103256854332e-05, - "loss": 0.0877, - "step": 36730 - }, - { - "epoch": 2.40366372260386, - "grad_norm": 1.0334715843200684, - "learning_rate": 9.364655200124775e-05, - "loss": 0.0905, - "step": 36740 - }, - { - "epoch": 2.4043179587831207, - "grad_norm": 0.6475959420204163, - "learning_rate": 9.364206996076659e-05, - "loss": 0.0826, - "step": 36750 - }, - { - "epoch": 2.4049721949623812, - "grad_norm": 1.0449169874191284, - "learning_rate": 9.36375864472511e-05, - "loss": 0.0842, - "step": 36760 - }, - { - "epoch": 2.405626431141642, - "grad_norm": 0.8349801898002625, - "learning_rate": 9.363310146085262e-05, - "loss": 0.089, - "step": 36770 - }, - { - "epoch": 2.4062806673209027, - "grad_norm": 0.9147889614105225, - "learning_rate": 9.362861500172255e-05, - "loss": 0.082, - "step": 36780 - }, - { - "epoch": 2.4069349035001637, - "grad_norm": 0.7441397905349731, - "learning_rate": 9.362412707001229e-05, - "loss": 0.0753, - "step": 36790 - }, - { - "epoch": 2.4075891396794242, - "grad_norm": 1.0056695938110352, - "learning_rate": 9.361963766587334e-05, - "loss": 0.08, - "step": 36800 - }, - { - "epoch": 2.408243375858685, - "grad_norm": 0.7822602987289429, - "learning_rate": 9.361514678945722e-05, - "loss": 0.0866, - "step": 36810 - }, - { - "epoch": 2.4088976120379457, - "grad_norm": 0.8155549764633179, - "learning_rate": 9.36106544409155e-05, - "loss": 0.0787, - "step": 36820 - }, - { - "epoch": 2.4095518482172062, - "grad_norm": 0.7978348135948181, - "learning_rate": 9.360616062039985e-05, - "loss": 0.0847, - "step": 36830 - }, - { - "epoch": 2.410206084396467, - "grad_norm": 0.867654025554657, - "learning_rate": 9.360166532806189e-05, - "loss": 0.0844, - "step": 36840 - }, - { - "epoch": 2.4108603205757277, - "grad_norm": 0.975135087966919, - "learning_rate": 9.359716856405339e-05, - "loss": 0.0915, - "step": 36850 - }, - { - "epoch": 2.4115145567549887, - "grad_norm": 0.7564871907234192, - "learning_rate": 9.359267032852609e-05, - "loss": 0.0968, - "step": 36860 - }, - { - "epoch": 2.4121687929342492, - "grad_norm": 0.8742430210113525, - "learning_rate": 9.358817062163188e-05, - "loss": 0.0793, - "step": 36870 - }, - { - "epoch": 2.41282302911351, - "grad_norm": 0.8599095344543457, - "learning_rate": 9.358366944352258e-05, - "loss": 0.0825, - "step": 36880 - }, - { - "epoch": 2.4134772652927707, - "grad_norm": 0.6971985101699829, - "learning_rate": 9.357916679435012e-05, - "loss": 0.0813, - "step": 36890 - }, - { - "epoch": 2.4141315014720313, - "grad_norm": 0.9535905122756958, - "learning_rate": 9.357466267426649e-05, - "loss": 0.0847, - "step": 36900 - }, - { - "epoch": 2.4147857376512922, - "grad_norm": 0.9580764174461365, - "learning_rate": 9.357015708342373e-05, - "loss": 0.0974, - "step": 36910 - }, - { - "epoch": 2.4154399738305528, - "grad_norm": 0.9554745554924011, - "learning_rate": 9.35656500219739e-05, - "loss": 0.08, - "step": 36920 - }, - { - "epoch": 2.4160942100098137, - "grad_norm": 0.7604560852050781, - "learning_rate": 9.356114149006911e-05, - "loss": 0.0854, - "step": 36930 - }, - { - "epoch": 2.4167484461890743, - "grad_norm": 0.876243531703949, - "learning_rate": 9.355663148786158e-05, - "loss": 0.0835, - "step": 36940 - }, - { - "epoch": 2.4174026823683348, - "grad_norm": 0.9993807673454285, - "learning_rate": 9.355212001550349e-05, - "loss": 0.087, - "step": 36950 - }, - { - "epoch": 2.4180569185475957, - "grad_norm": 0.934245228767395, - "learning_rate": 9.354760707314713e-05, - "loss": 0.0764, - "step": 36960 - }, - { - "epoch": 2.4187111547268563, - "grad_norm": 0.8030011057853699, - "learning_rate": 9.354309266094482e-05, - "loss": 0.0891, - "step": 36970 - }, - { - "epoch": 2.4193653909061172, - "grad_norm": 0.7659550309181213, - "learning_rate": 9.353857677904893e-05, - "loss": 0.0852, - "step": 36980 - }, - { - "epoch": 2.4200196270853778, - "grad_norm": 0.8483705520629883, - "learning_rate": 9.353405942761191e-05, - "loss": 0.0777, - "step": 36990 - }, - { - "epoch": 2.4206738632646383, - "grad_norm": 0.7422915101051331, - "learning_rate": 9.35295406067862e-05, - "loss": 0.0827, - "step": 37000 - }, - { - "epoch": 2.4213280994438993, - "grad_norm": 0.8917139172554016, - "learning_rate": 9.352502031672435e-05, - "loss": 0.0917, - "step": 37010 - }, - { - "epoch": 2.42198233562316, - "grad_norm": 0.973841667175293, - "learning_rate": 9.35204985575789e-05, - "loss": 0.0856, - "step": 37020 - }, - { - "epoch": 2.4226365718024208, - "grad_norm": 0.8658559918403625, - "learning_rate": 9.351597532950247e-05, - "loss": 0.0911, - "step": 37030 - }, - { - "epoch": 2.4232908079816813, - "grad_norm": 1.0100637674331665, - "learning_rate": 9.351145063264778e-05, - "loss": 0.0832, - "step": 37040 - }, - { - "epoch": 2.4239450441609423, - "grad_norm": 0.8611866235733032, - "learning_rate": 9.35069244671675e-05, - "loss": 0.0859, - "step": 37050 - }, - { - "epoch": 2.424599280340203, - "grad_norm": 0.8808668255805969, - "learning_rate": 9.350239683321443e-05, - "loss": 0.0876, - "step": 37060 - }, - { - "epoch": 2.4252535165194633, - "grad_norm": 0.767738938331604, - "learning_rate": 9.349786773094137e-05, - "loss": 0.0824, - "step": 37070 - }, - { - "epoch": 2.4259077526987243, - "grad_norm": 0.8980047106742859, - "learning_rate": 9.34933371605012e-05, - "loss": 0.0844, - "step": 37080 - }, - { - "epoch": 2.426561988877985, - "grad_norm": 0.6932862401008606, - "learning_rate": 9.348880512204683e-05, - "loss": 0.0765, - "step": 37090 - }, - { - "epoch": 2.4272162250572458, - "grad_norm": 0.8698898553848267, - "learning_rate": 9.348427161573124e-05, - "loss": 0.0769, - "step": 37100 - }, - { - "epoch": 2.4278704612365063, - "grad_norm": 0.8262230157852173, - "learning_rate": 9.347973664170744e-05, - "loss": 0.0876, - "step": 37110 - }, - { - "epoch": 2.4285246974157673, - "grad_norm": 0.690406322479248, - "learning_rate": 9.347520020012848e-05, - "loss": 0.086, - "step": 37120 - }, - { - "epoch": 2.429178933595028, - "grad_norm": 0.7355995178222656, - "learning_rate": 9.347066229114751e-05, - "loss": 0.0938, - "step": 37130 - }, - { - "epoch": 2.4298331697742883, - "grad_norm": 0.8815680742263794, - "learning_rate": 9.34661229149177e-05, - "loss": 0.0923, - "step": 37140 - }, - { - "epoch": 2.4304874059535493, - "grad_norm": 0.8908131122589111, - "learning_rate": 9.346158207159222e-05, - "loss": 0.0841, - "step": 37150 - }, - { - "epoch": 2.43114164213281, - "grad_norm": 0.7907306551933289, - "learning_rate": 9.345703976132438e-05, - "loss": 0.0847, - "step": 37160 - }, - { - "epoch": 2.431795878312071, - "grad_norm": 0.9056040048599243, - "learning_rate": 9.345249598426746e-05, - "loss": 0.0879, - "step": 37170 - }, - { - "epoch": 2.4324501144913313, - "grad_norm": 0.7010656595230103, - "learning_rate": 9.344795074057487e-05, - "loss": 0.0732, - "step": 37180 - }, - { - "epoch": 2.4331043506705923, - "grad_norm": 0.7615019083023071, - "learning_rate": 9.344340403039998e-05, - "loss": 0.0846, - "step": 37190 - }, - { - "epoch": 2.433758586849853, - "grad_norm": 0.9144221544265747, - "learning_rate": 9.343885585389627e-05, - "loss": 0.0982, - "step": 37200 - }, - { - "epoch": 2.4344128230291133, - "grad_norm": 1.0988235473632812, - "learning_rate": 9.343430621121724e-05, - "loss": 0.0877, - "step": 37210 - }, - { - "epoch": 2.4350670592083743, - "grad_norm": 0.8255583643913269, - "learning_rate": 9.342975510251649e-05, - "loss": 0.0832, - "step": 37220 - }, - { - "epoch": 2.435721295387635, - "grad_norm": 0.7476474642753601, - "learning_rate": 9.342520252794759e-05, - "loss": 0.0843, - "step": 37230 - }, - { - "epoch": 2.436375531566896, - "grad_norm": 0.7885528802871704, - "learning_rate": 9.342064848766423e-05, - "loss": 0.0896, - "step": 37240 - }, - { - "epoch": 2.4370297677461563, - "grad_norm": 0.8484315872192383, - "learning_rate": 9.341609298182008e-05, - "loss": 0.0814, - "step": 37250 - }, - { - "epoch": 2.4376840039254173, - "grad_norm": 1.143760085105896, - "learning_rate": 9.341153601056896e-05, - "loss": 0.0843, - "step": 37260 - }, - { - "epoch": 2.438338240104678, - "grad_norm": 0.7596016526222229, - "learning_rate": 9.340697757406462e-05, - "loss": 0.0813, - "step": 37270 - }, - { - "epoch": 2.4389924762839383, - "grad_norm": 0.9794483184814453, - "learning_rate": 9.340241767246099e-05, - "loss": 0.0755, - "step": 37280 - }, - { - "epoch": 2.4396467124631993, - "grad_norm": 0.7860924005508423, - "learning_rate": 9.33978563059119e-05, - "loss": 0.0782, - "step": 37290 - }, - { - "epoch": 2.44030094864246, - "grad_norm": 0.8889253735542297, - "learning_rate": 9.339329347457135e-05, - "loss": 0.0896, - "step": 37300 - }, - { - "epoch": 2.440955184821721, - "grad_norm": 0.6699261665344238, - "learning_rate": 9.338872917859335e-05, - "loss": 0.0847, - "step": 37310 - }, - { - "epoch": 2.4416094210009813, - "grad_norm": 0.8872549533843994, - "learning_rate": 9.338416341813196e-05, - "loss": 0.081, - "step": 37320 - }, - { - "epoch": 2.4422636571802423, - "grad_norm": 0.8817291259765625, - "learning_rate": 9.337959619334125e-05, - "loss": 0.0812, - "step": 37330 - }, - { - "epoch": 2.442917893359503, - "grad_norm": 0.9922673106193542, - "learning_rate": 9.337502750437542e-05, - "loss": 0.0871, - "step": 37340 - }, - { - "epoch": 2.4435721295387633, - "grad_norm": 0.7725539803504944, - "learning_rate": 9.337045735138865e-05, - "loss": 0.0787, - "step": 37350 - }, - { - "epoch": 2.4442263657180243, - "grad_norm": 0.9328073859214783, - "learning_rate": 9.336588573453521e-05, - "loss": 0.0799, - "step": 37360 - }, - { - "epoch": 2.444880601897285, - "grad_norm": 0.7553234696388245, - "learning_rate": 9.33613126539694e-05, - "loss": 0.0799, - "step": 37370 - }, - { - "epoch": 2.445534838076546, - "grad_norm": 0.8015692830085754, - "learning_rate": 9.335673810984553e-05, - "loss": 0.0894, - "step": 37380 - }, - { - "epoch": 2.4461890742558063, - "grad_norm": 0.7665364146232605, - "learning_rate": 9.335216210231807e-05, - "loss": 0.0815, - "step": 37390 - }, - { - "epoch": 2.446843310435067, - "grad_norm": 0.7971277236938477, - "learning_rate": 9.334758463154145e-05, - "loss": 0.0827, - "step": 37400 - }, - { - "epoch": 2.447497546614328, - "grad_norm": 0.7936835885047913, - "learning_rate": 9.334300569767016e-05, - "loss": 0.0867, - "step": 37410 - }, - { - "epoch": 2.4481517827935884, - "grad_norm": 0.9368561506271362, - "learning_rate": 9.333842530085875e-05, - "loss": 0.0768, - "step": 37420 - }, - { - "epoch": 2.4488060189728493, - "grad_norm": 0.947975218296051, - "learning_rate": 9.333384344126184e-05, - "loss": 0.0807, - "step": 37430 - }, - { - "epoch": 2.44946025515211, - "grad_norm": 0.7947817444801331, - "learning_rate": 9.332926011903405e-05, - "loss": 0.0909, - "step": 37440 - }, - { - "epoch": 2.4501144913313704, - "grad_norm": 0.8594073057174683, - "learning_rate": 9.33246753343301e-05, - "loss": 0.082, - "step": 37450 - }, - { - "epoch": 2.4507687275106314, - "grad_norm": 1.020121693611145, - "learning_rate": 9.332008908730473e-05, - "loss": 0.078, - "step": 37460 - }, - { - "epoch": 2.451422963689892, - "grad_norm": 0.7262875437736511, - "learning_rate": 9.331550137811276e-05, - "loss": 0.0789, - "step": 37470 - }, - { - "epoch": 2.452077199869153, - "grad_norm": 0.7914772033691406, - "learning_rate": 9.331091220690902e-05, - "loss": 0.083, - "step": 37480 - }, - { - "epoch": 2.4527314360484134, - "grad_norm": 0.8740646243095398, - "learning_rate": 9.330632157384838e-05, - "loss": 0.0757, - "step": 37490 - }, - { - "epoch": 2.4533856722276743, - "grad_norm": 0.9303426742553711, - "learning_rate": 9.330172947908583e-05, - "loss": 0.0835, - "step": 37500 - }, - { - "epoch": 2.454039908406935, - "grad_norm": 0.7235417366027832, - "learning_rate": 9.329713592277634e-05, - "loss": 0.0836, - "step": 37510 - }, - { - "epoch": 2.4546941445861954, - "grad_norm": 0.7460285425186157, - "learning_rate": 9.329254090507498e-05, - "loss": 0.0807, - "step": 37520 - }, - { - "epoch": 2.4553483807654564, - "grad_norm": 0.6465203166007996, - "learning_rate": 9.32879444261368e-05, - "loss": 0.0784, - "step": 37530 - }, - { - "epoch": 2.456002616944717, - "grad_norm": 0.80832839012146, - "learning_rate": 9.328334648611699e-05, - "loss": 0.0777, - "step": 37540 - }, - { - "epoch": 2.456656853123978, - "grad_norm": 0.7820184230804443, - "learning_rate": 9.32787470851707e-05, - "loss": 0.0819, - "step": 37550 - }, - { - "epoch": 2.4573110893032384, - "grad_norm": 0.934441864490509, - "learning_rate": 9.32741462234532e-05, - "loss": 0.08, - "step": 37560 - }, - { - "epoch": 2.4579653254824994, - "grad_norm": 0.9643844962120056, - "learning_rate": 9.32695439011198e-05, - "loss": 0.0737, - "step": 37570 - }, - { - "epoch": 2.45861956166176, - "grad_norm": 1.1645561456680298, - "learning_rate": 9.326494011832578e-05, - "loss": 0.0802, - "step": 37580 - }, - { - "epoch": 2.4592737978410204, - "grad_norm": 0.8493544459342957, - "learning_rate": 9.326033487522659e-05, - "loss": 0.0881, - "step": 37590 - }, - { - "epoch": 2.4599280340202814, - "grad_norm": 0.7914831042289734, - "learning_rate": 9.325572817197763e-05, - "loss": 0.088, - "step": 37600 - }, - { - "epoch": 2.460582270199542, - "grad_norm": 0.7926651239395142, - "learning_rate": 9.325112000873439e-05, - "loss": 0.0873, - "step": 37610 - }, - { - "epoch": 2.461236506378803, - "grad_norm": 1.0331966876983643, - "learning_rate": 9.324651038565244e-05, - "loss": 0.0813, - "step": 37620 - }, - { - "epoch": 2.4618907425580634, - "grad_norm": 1.0331898927688599, - "learning_rate": 9.324189930288734e-05, - "loss": 0.0896, - "step": 37630 - }, - { - "epoch": 2.4625449787373244, - "grad_norm": 0.877951443195343, - "learning_rate": 9.323728676059474e-05, - "loss": 0.0802, - "step": 37640 - }, - { - "epoch": 2.463199214916585, - "grad_norm": 0.9235687255859375, - "learning_rate": 9.32326727589303e-05, - "loss": 0.0785, - "step": 37650 - }, - { - "epoch": 2.4638534510958454, - "grad_norm": 0.8438425660133362, - "learning_rate": 9.322805729804979e-05, - "loss": 0.0837, - "step": 37660 - }, - { - "epoch": 2.4645076872751064, - "grad_norm": 0.9116701483726501, - "learning_rate": 9.322344037810898e-05, - "loss": 0.0815, - "step": 37670 - }, - { - "epoch": 2.465161923454367, - "grad_norm": 0.7631514072418213, - "learning_rate": 9.321882199926369e-05, - "loss": 0.0812, - "step": 37680 - }, - { - "epoch": 2.465816159633628, - "grad_norm": 0.6126758456230164, - "learning_rate": 9.321420216166979e-05, - "loss": 0.0787, - "step": 37690 - }, - { - "epoch": 2.4664703958128884, - "grad_norm": 0.8149334192276001, - "learning_rate": 9.320958086548326e-05, - "loss": 0.0836, - "step": 37700 - }, - { - "epoch": 2.4671246319921494, - "grad_norm": 0.9959266781806946, - "learning_rate": 9.320495811086006e-05, - "loss": 0.0878, - "step": 37710 - }, - { - "epoch": 2.46777886817141, - "grad_norm": 0.7576195597648621, - "learning_rate": 9.320033389795619e-05, - "loss": 0.0739, - "step": 37720 - }, - { - "epoch": 2.4684331043506704, - "grad_norm": 0.8124213814735413, - "learning_rate": 9.319570822692778e-05, - "loss": 0.0902, - "step": 37730 - }, - { - "epoch": 2.4690873405299314, - "grad_norm": 0.8176689743995667, - "learning_rate": 9.319108109793091e-05, - "loss": 0.0825, - "step": 37740 - }, - { - "epoch": 2.469741576709192, - "grad_norm": 0.7739967703819275, - "learning_rate": 9.318645251112179e-05, - "loss": 0.0901, - "step": 37750 - }, - { - "epoch": 2.470395812888453, - "grad_norm": 0.7479947209358215, - "learning_rate": 9.318182246665663e-05, - "loss": 0.0813, - "step": 37760 - }, - { - "epoch": 2.4710500490677134, - "grad_norm": 0.8923253417015076, - "learning_rate": 9.317719096469172e-05, - "loss": 0.0796, - "step": 37770 - }, - { - "epoch": 2.4717042852469744, - "grad_norm": 0.9640159010887146, - "learning_rate": 9.317255800538339e-05, - "loss": 0.0797, - "step": 37780 - }, - { - "epoch": 2.472358521426235, - "grad_norm": 0.9136691689491272, - "learning_rate": 9.3167923588888e-05, - "loss": 0.0888, - "step": 37790 - }, - { - "epoch": 2.4730127576054954, - "grad_norm": 0.7730658650398254, - "learning_rate": 9.316328771536195e-05, - "loss": 0.0832, - "step": 37800 - }, - { - "epoch": 2.4736669937847564, - "grad_norm": 0.7457364201545715, - "learning_rate": 9.315865038496177e-05, - "loss": 0.0875, - "step": 37810 - }, - { - "epoch": 2.474321229964017, - "grad_norm": 0.7927015423774719, - "learning_rate": 9.315401159784394e-05, - "loss": 0.0857, - "step": 37820 - }, - { - "epoch": 2.474975466143278, - "grad_norm": 0.861659586429596, - "learning_rate": 9.314937135416506e-05, - "loss": 0.079, - "step": 37830 - }, - { - "epoch": 2.4756297023225384, - "grad_norm": 0.9620111584663391, - "learning_rate": 9.31447296540817e-05, - "loss": 0.0918, - "step": 37840 - }, - { - "epoch": 2.4762839385017994, - "grad_norm": 0.7795575857162476, - "learning_rate": 9.314008649775059e-05, - "loss": 0.0752, - "step": 37850 - }, - { - "epoch": 2.47693817468106, - "grad_norm": 0.9564193487167358, - "learning_rate": 9.313544188532841e-05, - "loss": 0.0756, - "step": 37860 - }, - { - "epoch": 2.4775924108603204, - "grad_norm": 0.7260618805885315, - "learning_rate": 9.313079581697194e-05, - "loss": 0.0773, - "step": 37870 - }, - { - "epoch": 2.4782466470395814, - "grad_norm": 1.0667495727539062, - "learning_rate": 9.312614829283799e-05, - "loss": 0.0941, - "step": 37880 - }, - { - "epoch": 2.478900883218842, - "grad_norm": 0.7811123728752136, - "learning_rate": 9.312149931308345e-05, - "loss": 0.0791, - "step": 37890 - }, - { - "epoch": 2.4795551193981025, - "grad_norm": 0.8247928619384766, - "learning_rate": 9.31168488778652e-05, - "loss": 0.0873, - "step": 37900 - }, - { - "epoch": 2.4802093555773634, - "grad_norm": 0.8802565932273865, - "learning_rate": 9.311219698734024e-05, - "loss": 0.0797, - "step": 37910 - }, - { - "epoch": 2.480863591756624, - "grad_norm": 0.9362874031066895, - "learning_rate": 9.310754364166554e-05, - "loss": 0.0814, - "step": 37920 - }, - { - "epoch": 2.481517827935885, - "grad_norm": 0.9797009825706482, - "learning_rate": 9.310288884099822e-05, - "loss": 0.0892, - "step": 37930 - }, - { - "epoch": 2.4821720641151455, - "grad_norm": 0.7946538925170898, - "learning_rate": 9.309823258549535e-05, - "loss": 0.0809, - "step": 37940 - }, - { - "epoch": 2.4828263002944064, - "grad_norm": 0.8862201571464539, - "learning_rate": 9.30935748753141e-05, - "loss": 0.084, - "step": 37950 - }, - { - "epoch": 2.483480536473667, - "grad_norm": 0.8820653557777405, - "learning_rate": 9.308891571061167e-05, - "loss": 0.0921, - "step": 37960 - }, - { - "epoch": 2.4841347726529275, - "grad_norm": 0.8164055347442627, - "learning_rate": 9.308425509154533e-05, - "loss": 0.0819, - "step": 37970 - }, - { - "epoch": 2.4847890088321885, - "grad_norm": 0.8694265484809875, - "learning_rate": 9.307959301827241e-05, - "loss": 0.0782, - "step": 37980 - }, - { - "epoch": 2.485443245011449, - "grad_norm": 1.1511107683181763, - "learning_rate": 9.307492949095021e-05, - "loss": 0.0913, - "step": 37990 - }, - { - "epoch": 2.48609748119071, - "grad_norm": 0.8150752186775208, - "learning_rate": 9.307026450973619e-05, - "loss": 0.0899, - "step": 38000 - }, - { - "epoch": 2.4867517173699705, - "grad_norm": 0.9316548109054565, - "learning_rate": 9.306559807478779e-05, - "loss": 0.0798, - "step": 38010 - }, - { - "epoch": 2.4874059535492314, - "grad_norm": 0.9588937759399414, - "learning_rate": 9.306093018626252e-05, - "loss": 0.082, - "step": 38020 - }, - { - "epoch": 2.488060189728492, - "grad_norm": 1.0078554153442383, - "learning_rate": 9.30562608443179e-05, - "loss": 0.078, - "step": 38030 - }, - { - "epoch": 2.4887144259077525, - "grad_norm": 1.0711711645126343, - "learning_rate": 9.30515900491116e-05, - "loss": 0.0909, - "step": 38040 - }, - { - "epoch": 2.4893686620870135, - "grad_norm": 0.867173433303833, - "learning_rate": 9.30469178008012e-05, - "loss": 0.0822, - "step": 38050 - }, - { - "epoch": 2.490022898266274, - "grad_norm": 0.7762174606323242, - "learning_rate": 9.304224409954442e-05, - "loss": 0.0857, - "step": 38060 - }, - { - "epoch": 2.490677134445535, - "grad_norm": 1.0067529678344727, - "learning_rate": 9.303756894549903e-05, - "loss": 0.0851, - "step": 38070 - }, - { - "epoch": 2.4913313706247955, - "grad_norm": 0.9543056488037109, - "learning_rate": 9.303289233882281e-05, - "loss": 0.0754, - "step": 38080 - }, - { - "epoch": 2.4919856068040565, - "grad_norm": 1.024979591369629, - "learning_rate": 9.302821427967363e-05, - "loss": 0.0859, - "step": 38090 - }, - { - "epoch": 2.492639842983317, - "grad_norm": 0.7905625700950623, - "learning_rate": 9.302353476820936e-05, - "loss": 0.0789, - "step": 38100 - }, - { - "epoch": 2.4932940791625775, - "grad_norm": 0.8276375532150269, - "learning_rate": 9.301885380458797e-05, - "loss": 0.0807, - "step": 38110 - }, - { - "epoch": 2.4939483153418385, - "grad_norm": 0.7747846245765686, - "learning_rate": 9.301417138896743e-05, - "loss": 0.0798, - "step": 38120 - }, - { - "epoch": 2.494602551521099, - "grad_norm": 0.9668558835983276, - "learning_rate": 9.30094875215058e-05, - "loss": 0.0816, - "step": 38130 - }, - { - "epoch": 2.49525678770036, - "grad_norm": 1.070252537727356, - "learning_rate": 9.300480220236119e-05, - "loss": 0.0888, - "step": 38140 - }, - { - "epoch": 2.4959110238796205, - "grad_norm": 0.9471240043640137, - "learning_rate": 9.30001154316917e-05, - "loss": 0.0879, - "step": 38150 - }, - { - "epoch": 2.4965652600588815, - "grad_norm": 0.7727608680725098, - "learning_rate": 9.299542720965554e-05, - "loss": 0.0879, - "step": 38160 - }, - { - "epoch": 2.497219496238142, - "grad_norm": 1.2870181798934937, - "learning_rate": 9.299073753641096e-05, - "loss": 0.0907, - "step": 38170 - }, - { - "epoch": 2.4978737324174025, - "grad_norm": 1.0817999839782715, - "learning_rate": 9.298604641211624e-05, - "loss": 0.1045, - "step": 38180 - }, - { - "epoch": 2.4985279685966635, - "grad_norm": 0.7632635831832886, - "learning_rate": 9.298135383692972e-05, - "loss": 0.0793, - "step": 38190 - }, - { - "epoch": 2.499182204775924, - "grad_norm": 0.9093852639198303, - "learning_rate": 9.297665981100978e-05, - "loss": 0.0774, - "step": 38200 - }, - { - "epoch": 2.499836440955185, - "grad_norm": 1.0639636516571045, - "learning_rate": 9.297196433451487e-05, - "loss": 0.0728, - "step": 38210 - }, - { - "epoch": 2.5004906771344455, - "grad_norm": 0.9197503924369812, - "learning_rate": 9.296726740760346e-05, - "loss": 0.0849, - "step": 38220 - }, - { - "epoch": 2.5011449133137065, - "grad_norm": 0.8370291590690613, - "learning_rate": 9.296256903043408e-05, - "loss": 0.0764, - "step": 38230 - }, - { - "epoch": 2.501799149492967, - "grad_norm": 0.7800946831703186, - "learning_rate": 9.295786920316533e-05, - "loss": 0.0802, - "step": 38240 - }, - { - "epoch": 2.5024533856722275, - "grad_norm": 1.0258721113204956, - "learning_rate": 9.295316792595586e-05, - "loss": 0.0874, - "step": 38250 - }, - { - "epoch": 2.5031076218514885, - "grad_norm": 0.7372403144836426, - "learning_rate": 9.294846519896429e-05, - "loss": 0.0774, - "step": 38260 - }, - { - "epoch": 2.503761858030749, - "grad_norm": 0.8242309093475342, - "learning_rate": 9.294376102234938e-05, - "loss": 0.0814, - "step": 38270 - }, - { - "epoch": 2.5044160942100095, - "grad_norm": 0.8311209678649902, - "learning_rate": 9.293905539626993e-05, - "loss": 0.079, - "step": 38280 - }, - { - "epoch": 2.5050703303892705, - "grad_norm": 0.7952736616134644, - "learning_rate": 9.293434832088475e-05, - "loss": 0.0779, - "step": 38290 - }, - { - "epoch": 2.5057245665685315, - "grad_norm": 0.9289050102233887, - "learning_rate": 9.29296397963527e-05, - "loss": 0.0824, - "step": 38300 - }, - { - "epoch": 2.506378802747792, - "grad_norm": 0.9673059582710266, - "learning_rate": 9.292492982283272e-05, - "loss": 0.0874, - "step": 38310 - }, - { - "epoch": 2.5070330389270525, - "grad_norm": 0.8958445191383362, - "learning_rate": 9.29202184004838e-05, - "loss": 0.0877, - "step": 38320 - }, - { - "epoch": 2.5076872751063135, - "grad_norm": 0.8611701726913452, - "learning_rate": 9.291550552946493e-05, - "loss": 0.0834, - "step": 38330 - }, - { - "epoch": 2.508341511285574, - "grad_norm": 0.7747957706451416, - "learning_rate": 9.29107912099352e-05, - "loss": 0.077, - "step": 38340 - }, - { - "epoch": 2.5089957474648346, - "grad_norm": 1.0127016305923462, - "learning_rate": 9.290607544205374e-05, - "loss": 0.0961, - "step": 38350 - }, - { - "epoch": 2.5096499836440955, - "grad_norm": 0.829646110534668, - "learning_rate": 9.290135822597969e-05, - "loss": 0.0863, - "step": 38360 - }, - { - "epoch": 2.5103042198233565, - "grad_norm": 0.9763094782829285, - "learning_rate": 9.28966395618723e-05, - "loss": 0.08, - "step": 38370 - }, - { - "epoch": 2.510958456002617, - "grad_norm": 0.8551107048988342, - "learning_rate": 9.289191944989083e-05, - "loss": 0.079, - "step": 38380 - }, - { - "epoch": 2.5116126921818775, - "grad_norm": 0.9324933886528015, - "learning_rate": 9.288719789019458e-05, - "loss": 0.0947, - "step": 38390 - }, - { - "epoch": 2.5122669283611385, - "grad_norm": 0.8589342832565308, - "learning_rate": 9.288247488294293e-05, - "loss": 0.0805, - "step": 38400 - }, - { - "epoch": 2.512921164540399, - "grad_norm": 0.7855199575424194, - "learning_rate": 9.28777504282953e-05, - "loss": 0.0841, - "step": 38410 - }, - { - "epoch": 2.5135754007196596, - "grad_norm": 0.9341410994529724, - "learning_rate": 9.287302452641112e-05, - "loss": 0.0938, - "step": 38420 - }, - { - "epoch": 2.5142296368989205, - "grad_norm": 1.012837290763855, - "learning_rate": 9.286829717744993e-05, - "loss": 0.076, - "step": 38430 - }, - { - "epoch": 2.514883873078181, - "grad_norm": 0.8277202844619751, - "learning_rate": 9.286356838157128e-05, - "loss": 0.0847, - "step": 38440 - }, - { - "epoch": 2.515538109257442, - "grad_norm": 0.7765419483184814, - "learning_rate": 9.28588381389348e-05, - "loss": 0.0896, - "step": 38450 - }, - { - "epoch": 2.5161923454367026, - "grad_norm": 1.0105395317077637, - "learning_rate": 9.285410644970013e-05, - "loss": 0.0918, - "step": 38460 - }, - { - "epoch": 2.5168465816159635, - "grad_norm": 0.7408658266067505, - "learning_rate": 9.284937331402697e-05, - "loss": 0.0808, - "step": 38470 - }, - { - "epoch": 2.517500817795224, - "grad_norm": 0.782345712184906, - "learning_rate": 9.284463873207508e-05, - "loss": 0.0748, - "step": 38480 - }, - { - "epoch": 2.5181550539744846, - "grad_norm": 0.7079125046730042, - "learning_rate": 9.283990270400428e-05, - "loss": 0.0831, - "step": 38490 - }, - { - "epoch": 2.5188092901537456, - "grad_norm": 0.920768678188324, - "learning_rate": 9.28351652299744e-05, - "loss": 0.0907, - "step": 38500 - }, - { - "epoch": 2.519463526333006, - "grad_norm": 0.9361595511436462, - "learning_rate": 9.283042631014535e-05, - "loss": 0.0848, - "step": 38510 - }, - { - "epoch": 2.520117762512267, - "grad_norm": 0.9975919723510742, - "learning_rate": 9.282568594467711e-05, - "loss": 0.0841, - "step": 38520 - }, - { - "epoch": 2.5207719986915276, - "grad_norm": 0.7760083675384521, - "learning_rate": 9.282094413372963e-05, - "loss": 0.0785, - "step": 38530 - }, - { - "epoch": 2.5214262348707885, - "grad_norm": 0.8321374654769897, - "learning_rate": 9.2816200877463e-05, - "loss": 0.0842, - "step": 38540 - }, - { - "epoch": 2.522080471050049, - "grad_norm": 1.0121804475784302, - "learning_rate": 9.28114561760373e-05, - "loss": 0.08, - "step": 38550 - }, - { - "epoch": 2.5227347072293096, - "grad_norm": 0.8037693500518799, - "learning_rate": 9.280671002961267e-05, - "loss": 0.0767, - "step": 38560 - }, - { - "epoch": 2.5233889434085706, - "grad_norm": 0.9284687042236328, - "learning_rate": 9.280196243834931e-05, - "loss": 0.0862, - "step": 38570 - }, - { - "epoch": 2.524043179587831, - "grad_norm": 0.7956443428993225, - "learning_rate": 9.279721340240745e-05, - "loss": 0.0783, - "step": 38580 - }, - { - "epoch": 2.524697415767092, - "grad_norm": 0.7840268015861511, - "learning_rate": 9.279246292194743e-05, - "loss": 0.0842, - "step": 38590 - }, - { - "epoch": 2.5253516519463526, - "grad_norm": 0.9286373257637024, - "learning_rate": 9.278771099712956e-05, - "loss": 0.0921, - "step": 38600 - }, - { - "epoch": 2.5260058881256136, - "grad_norm": 0.8080535531044006, - "learning_rate": 9.27829576281142e-05, - "loss": 0.0879, - "step": 38610 - }, - { - "epoch": 2.526660124304874, - "grad_norm": 0.9005104899406433, - "learning_rate": 9.277820281506184e-05, - "loss": 0.0819, - "step": 38620 - }, - { - "epoch": 2.5273143604841346, - "grad_norm": 0.7505617141723633, - "learning_rate": 9.277344655813292e-05, - "loss": 0.0816, - "step": 38630 - }, - { - "epoch": 2.5279685966633956, - "grad_norm": 0.6954260468482971, - "learning_rate": 9.276868885748802e-05, - "loss": 0.0822, - "step": 38640 - }, - { - "epoch": 2.528622832842656, - "grad_norm": 0.7404685616493225, - "learning_rate": 9.276392971328771e-05, - "loss": 0.071, - "step": 38650 - }, - { - "epoch": 2.529277069021917, - "grad_norm": 0.9218646287918091, - "learning_rate": 9.27591691256926e-05, - "loss": 0.0841, - "step": 38660 - }, - { - "epoch": 2.5299313052011776, - "grad_norm": 0.894137978553772, - "learning_rate": 9.275440709486342e-05, - "loss": 0.0914, - "step": 38670 - }, - { - "epoch": 2.5305855413804386, - "grad_norm": 0.7454158067703247, - "learning_rate": 9.274964362096085e-05, - "loss": 0.0858, - "step": 38680 - }, - { - "epoch": 2.531239777559699, - "grad_norm": 0.8861522674560547, - "learning_rate": 9.274487870414569e-05, - "loss": 0.0809, - "step": 38690 - }, - { - "epoch": 2.5318940137389596, - "grad_norm": 0.653781533241272, - "learning_rate": 9.27401123445788e-05, - "loss": 0.0796, - "step": 38700 - }, - { - "epoch": 2.5325482499182206, - "grad_norm": 0.8634945154190063, - "learning_rate": 9.273534454242101e-05, - "loss": 0.0744, - "step": 38710 - }, - { - "epoch": 2.533202486097481, - "grad_norm": 0.7516806125640869, - "learning_rate": 9.273057529783327e-05, - "loss": 0.0827, - "step": 38720 - }, - { - "epoch": 2.5338567222767416, - "grad_norm": 0.7837287783622742, - "learning_rate": 9.272580461097654e-05, - "loss": 0.0785, - "step": 38730 - }, - { - "epoch": 2.5345109584560026, - "grad_norm": 0.8319099545478821, - "learning_rate": 9.272103248201185e-05, - "loss": 0.0801, - "step": 38740 - }, - { - "epoch": 2.5351651946352636, - "grad_norm": 0.9561896324157715, - "learning_rate": 9.271625891110028e-05, - "loss": 0.0834, - "step": 38750 - }, - { - "epoch": 2.535819430814524, - "grad_norm": 0.8369835615158081, - "learning_rate": 9.271148389840294e-05, - "loss": 0.0829, - "step": 38760 - }, - { - "epoch": 2.5364736669937846, - "grad_norm": 0.8404773473739624, - "learning_rate": 9.270670744408101e-05, - "loss": 0.0795, - "step": 38770 - }, - { - "epoch": 2.5371279031730456, - "grad_norm": 0.7207024693489075, - "learning_rate": 9.270192954829571e-05, - "loss": 0.0775, - "step": 38780 - }, - { - "epoch": 2.537782139352306, - "grad_norm": 0.7661088109016418, - "learning_rate": 9.269715021120827e-05, - "loss": 0.0861, - "step": 38790 - }, - { - "epoch": 2.5384363755315666, - "grad_norm": 1.0687098503112793, - "learning_rate": 9.269236943298006e-05, - "loss": 0.0865, - "step": 38800 - }, - { - "epoch": 2.5390906117108276, - "grad_norm": 0.8831005096435547, - "learning_rate": 9.26875872137724e-05, - "loss": 0.0743, - "step": 38810 - }, - { - "epoch": 2.5397448478900886, - "grad_norm": 0.9108031988143921, - "learning_rate": 9.268280355374673e-05, - "loss": 0.0824, - "step": 38820 - }, - { - "epoch": 2.540399084069349, - "grad_norm": 0.8143110871315002, - "learning_rate": 9.26780184530645e-05, - "loss": 0.0764, - "step": 38830 - }, - { - "epoch": 2.5410533202486096, - "grad_norm": 0.7931360006332397, - "learning_rate": 9.267323191188721e-05, - "loss": 0.0811, - "step": 38840 - }, - { - "epoch": 2.5417075564278706, - "grad_norm": 0.9138084053993225, - "learning_rate": 9.266844393037644e-05, - "loss": 0.0804, - "step": 38850 - }, - { - "epoch": 2.542361792607131, - "grad_norm": 0.8782002329826355, - "learning_rate": 9.266365450869376e-05, - "loss": 0.081, - "step": 38860 - }, - { - "epoch": 2.5430160287863917, - "grad_norm": 0.8105145692825317, - "learning_rate": 9.265886364700089e-05, - "loss": 0.0772, - "step": 38870 - }, - { - "epoch": 2.5436702649656526, - "grad_norm": 1.0995359420776367, - "learning_rate": 9.265407134545947e-05, - "loss": 0.0805, - "step": 38880 - }, - { - "epoch": 2.544324501144913, - "grad_norm": 0.8194625377655029, - "learning_rate": 9.264927760423128e-05, - "loss": 0.0827, - "step": 38890 - }, - { - "epoch": 2.544978737324174, - "grad_norm": 0.9476547837257385, - "learning_rate": 9.264448242347812e-05, - "loss": 0.0885, - "step": 38900 - }, - { - "epoch": 2.5456329735034346, - "grad_norm": 0.9267840385437012, - "learning_rate": 9.263968580336185e-05, - "loss": 0.0826, - "step": 38910 - }, - { - "epoch": 2.5462872096826956, - "grad_norm": 0.8388037085533142, - "learning_rate": 9.263488774404434e-05, - "loss": 0.0834, - "step": 38920 - }, - { - "epoch": 2.546941445861956, - "grad_norm": 1.0958263874053955, - "learning_rate": 9.263008824568756e-05, - "loss": 0.0903, - "step": 38930 - }, - { - "epoch": 2.5475956820412167, - "grad_norm": 0.7269095778465271, - "learning_rate": 9.26252873084535e-05, - "loss": 0.0842, - "step": 38940 - }, - { - "epoch": 2.5482499182204776, - "grad_norm": 0.782738447189331, - "learning_rate": 9.262048493250422e-05, - "loss": 0.0848, - "step": 38950 - }, - { - "epoch": 2.548904154399738, - "grad_norm": 0.8937769532203674, - "learning_rate": 9.261568111800177e-05, - "loss": 0.0772, - "step": 38960 - }, - { - "epoch": 2.549558390578999, - "grad_norm": 0.7501261234283447, - "learning_rate": 9.261087586510834e-05, - "loss": 0.0836, - "step": 38970 - }, - { - "epoch": 2.5502126267582597, - "grad_norm": 0.8389232754707336, - "learning_rate": 9.260606917398609e-05, - "loss": 0.0939, - "step": 38980 - }, - { - "epoch": 2.5508668629375206, - "grad_norm": 0.7912153005599976, - "learning_rate": 9.260126104479727e-05, - "loss": 0.093, - "step": 38990 - }, - { - "epoch": 2.551521099116781, - "grad_norm": 0.8847309947013855, - "learning_rate": 9.259645147770415e-05, - "loss": 0.0745, - "step": 39000 - }, - { - "epoch": 2.5521753352960417, - "grad_norm": 0.7888216376304626, - "learning_rate": 9.25916404728691e-05, - "loss": 0.0829, - "step": 39010 - }, - { - "epoch": 2.5528295714753027, - "grad_norm": 1.0451523065567017, - "learning_rate": 9.258682803045448e-05, - "loss": 0.0973, - "step": 39020 - }, - { - "epoch": 2.553483807654563, - "grad_norm": 0.8907498717308044, - "learning_rate": 9.25820141506227e-05, - "loss": 0.0958, - "step": 39030 - }, - { - "epoch": 2.554138043833824, - "grad_norm": 0.9830831289291382, - "learning_rate": 9.257719883353631e-05, - "loss": 0.0944, - "step": 39040 - }, - { - "epoch": 2.5547922800130847, - "grad_norm": 0.8762632608413696, - "learning_rate": 9.257238207935777e-05, - "loss": 0.0822, - "step": 39050 - }, - { - "epoch": 2.5554465161923456, - "grad_norm": 0.8764397501945496, - "learning_rate": 9.256756388824968e-05, - "loss": 0.0811, - "step": 39060 - }, - { - "epoch": 2.556100752371606, - "grad_norm": 0.8016869425773621, - "learning_rate": 9.256274426037468e-05, - "loss": 0.0797, - "step": 39070 - }, - { - "epoch": 2.5567549885508667, - "grad_norm": 0.9790674448013306, - "learning_rate": 9.255792319589544e-05, - "loss": 0.0853, - "step": 39080 - }, - { - "epoch": 2.5574092247301277, - "grad_norm": 0.8641859889030457, - "learning_rate": 9.255310069497468e-05, - "loss": 0.0833, - "step": 39090 - }, - { - "epoch": 2.558063460909388, - "grad_norm": 0.8329319357872009, - "learning_rate": 9.254827675777517e-05, - "loss": 0.0812, - "step": 39100 - }, - { - "epoch": 2.558717697088649, - "grad_norm": 0.9464491009712219, - "learning_rate": 9.254345138445973e-05, - "loss": 0.0826, - "step": 39110 - }, - { - "epoch": 2.5593719332679097, - "grad_norm": 0.8477292656898499, - "learning_rate": 9.253862457519122e-05, - "loss": 0.0755, - "step": 39120 - }, - { - "epoch": 2.5600261694471707, - "grad_norm": 1.1739881038665771, - "learning_rate": 9.253379633013259e-05, - "loss": 0.0791, - "step": 39130 - }, - { - "epoch": 2.560680405626431, - "grad_norm": 0.8935783505439758, - "learning_rate": 9.252896664944677e-05, - "loss": 0.0834, - "step": 39140 - }, - { - "epoch": 2.5613346418056917, - "grad_norm": 0.8971335291862488, - "learning_rate": 9.25241355332968e-05, - "loss": 0.0781, - "step": 39150 - }, - { - "epoch": 2.5619888779849527, - "grad_norm": 0.8568238019943237, - "learning_rate": 9.251930298184572e-05, - "loss": 0.0795, - "step": 39160 - }, - { - "epoch": 2.562643114164213, - "grad_norm": 0.8689826726913452, - "learning_rate": 9.251446899525667e-05, - "loss": 0.0755, - "step": 39170 - }, - { - "epoch": 2.5632973503434737, - "grad_norm": 0.8828352093696594, - "learning_rate": 9.250963357369278e-05, - "loss": 0.0883, - "step": 39180 - }, - { - "epoch": 2.5639515865227347, - "grad_norm": 0.7668461799621582, - "learning_rate": 9.250479671731726e-05, - "loss": 0.0745, - "step": 39190 - }, - { - "epoch": 2.5646058227019957, - "grad_norm": 0.7046298980712891, - "learning_rate": 9.24999584262934e-05, - "loss": 0.0733, - "step": 39200 - }, - { - "epoch": 2.565260058881256, - "grad_norm": 0.77901691198349, - "learning_rate": 9.249511870078449e-05, - "loss": 0.0894, - "step": 39210 - }, - { - "epoch": 2.5659142950605167, - "grad_norm": 0.7388069033622742, - "learning_rate": 9.249027754095385e-05, - "loss": 0.0842, - "step": 39220 - }, - { - "epoch": 2.5665685312397777, - "grad_norm": 0.841890275478363, - "learning_rate": 9.248543494696494e-05, - "loss": 0.0827, - "step": 39230 - }, - { - "epoch": 2.567222767419038, - "grad_norm": 0.9349285364151001, - "learning_rate": 9.248059091898114e-05, - "loss": 0.0818, - "step": 39240 - }, - { - "epoch": 2.5678770035982987, - "grad_norm": 0.9265575408935547, - "learning_rate": 9.2475745457166e-05, - "loss": 0.0782, - "step": 39250 - }, - { - "epoch": 2.5685312397775597, - "grad_norm": 0.9001551270484924, - "learning_rate": 9.247089856168307e-05, - "loss": 0.0785, - "step": 39260 - }, - { - "epoch": 2.5691854759568207, - "grad_norm": 0.8538965582847595, - "learning_rate": 9.246605023269592e-05, - "loss": 0.0813, - "step": 39270 - }, - { - "epoch": 2.569839712136081, - "grad_norm": 0.9209931492805481, - "learning_rate": 9.24612004703682e-05, - "loss": 0.0824, - "step": 39280 - }, - { - "epoch": 2.5704939483153417, - "grad_norm": 0.6929624676704407, - "learning_rate": 9.245634927486361e-05, - "loss": 0.0781, - "step": 39290 - }, - { - "epoch": 2.5711481844946027, - "grad_norm": 0.7202306985855103, - "learning_rate": 9.245149664634589e-05, - "loss": 0.0899, - "step": 39300 - }, - { - "epoch": 2.571802420673863, - "grad_norm": 1.034266710281372, - "learning_rate": 9.244664258497881e-05, - "loss": 0.0793, - "step": 39310 - }, - { - "epoch": 2.5724566568531237, - "grad_norm": 0.7991123795509338, - "learning_rate": 9.244178709092624e-05, - "loss": 0.0764, - "step": 39320 - }, - { - "epoch": 2.5731108930323847, - "grad_norm": 0.7285871505737305, - "learning_rate": 9.243693016435204e-05, - "loss": 0.0873, - "step": 39330 - }, - { - "epoch": 2.5737651292116452, - "grad_norm": 0.7833579182624817, - "learning_rate": 9.243207180542016e-05, - "loss": 0.0819, - "step": 39340 - }, - { - "epoch": 2.574419365390906, - "grad_norm": 1.0560925006866455, - "learning_rate": 9.242721201429456e-05, - "loss": 0.0778, - "step": 39350 - }, - { - "epoch": 2.5750736015701667, - "grad_norm": 0.9079683423042297, - "learning_rate": 9.242235079113928e-05, - "loss": 0.0715, - "step": 39360 - }, - { - "epoch": 2.5757278377494277, - "grad_norm": 1.1182979345321655, - "learning_rate": 9.241748813611842e-05, - "loss": 0.0802, - "step": 39370 - }, - { - "epoch": 2.5763820739286882, - "grad_norm": 0.8554732799530029, - "learning_rate": 9.241262404939608e-05, - "loss": 0.0885, - "step": 39380 - }, - { - "epoch": 2.5770363101079488, - "grad_norm": 0.9766889214515686, - "learning_rate": 9.240775853113646e-05, - "loss": 0.0796, - "step": 39390 - }, - { - "epoch": 2.5776905462872097, - "grad_norm": 0.7499867081642151, - "learning_rate": 9.240289158150378e-05, - "loss": 0.0782, - "step": 39400 - }, - { - "epoch": 2.5783447824664703, - "grad_norm": 0.8955393433570862, - "learning_rate": 9.239802320066228e-05, - "loss": 0.0791, - "step": 39410 - }, - { - "epoch": 2.5789990186457312, - "grad_norm": 0.8181598782539368, - "learning_rate": 9.239315338877631e-05, - "loss": 0.0776, - "step": 39420 - }, - { - "epoch": 2.5796532548249917, - "grad_norm": 1.1022766828536987, - "learning_rate": 9.238828214601023e-05, - "loss": 0.0855, - "step": 39430 - }, - { - "epoch": 2.5803074910042527, - "grad_norm": 0.7724325656890869, - "learning_rate": 9.238340947252847e-05, - "loss": 0.0685, - "step": 39440 - }, - { - "epoch": 2.5809617271835132, - "grad_norm": 0.8263006806373596, - "learning_rate": 9.237853536849548e-05, - "loss": 0.0796, - "step": 39450 - }, - { - "epoch": 2.5816159633627738, - "grad_norm": 0.7736819386482239, - "learning_rate": 9.237365983407578e-05, - "loss": 0.0894, - "step": 39460 - }, - { - "epoch": 2.5822701995420347, - "grad_norm": 0.9027025699615479, - "learning_rate": 9.236878286943393e-05, - "loss": 0.0799, - "step": 39470 - }, - { - "epoch": 2.5829244357212953, - "grad_norm": 0.7817458510398865, - "learning_rate": 9.236390447473455e-05, - "loss": 0.073, - "step": 39480 - }, - { - "epoch": 2.5835786719005562, - "grad_norm": 1.0722339153289795, - "learning_rate": 9.23590246501423e-05, - "loss": 0.0872, - "step": 39490 - }, - { - "epoch": 2.5842329080798168, - "grad_norm": 1.1376055479049683, - "learning_rate": 9.235414339582185e-05, - "loss": 0.0818, - "step": 39500 - }, - { - "epoch": 2.5848871442590777, - "grad_norm": 0.907859206199646, - "learning_rate": 9.234926071193799e-05, - "loss": 0.0813, - "step": 39510 - }, - { - "epoch": 2.5855413804383383, - "grad_norm": 0.9271683096885681, - "learning_rate": 9.234437659865554e-05, - "loss": 0.0776, - "step": 39520 - }, - { - "epoch": 2.586195616617599, - "grad_norm": 1.029358148574829, - "learning_rate": 9.23394910561393e-05, - "loss": 0.0809, - "step": 39530 - }, - { - "epoch": 2.5868498527968598, - "grad_norm": 0.8963791131973267, - "learning_rate": 9.233460408455422e-05, - "loss": 0.0724, - "step": 39540 - }, - { - "epoch": 2.5875040889761203, - "grad_norm": 0.7677753567695618, - "learning_rate": 9.232971568406518e-05, - "loss": 0.0768, - "step": 39550 - }, - { - "epoch": 2.5881583251553812, - "grad_norm": 0.8181271553039551, - "learning_rate": 9.232482585483725e-05, - "loss": 0.0746, - "step": 39560 - }, - { - "epoch": 2.5888125613346418, - "grad_norm": 0.853274405002594, - "learning_rate": 9.231993459703547e-05, - "loss": 0.0705, - "step": 39570 - }, - { - "epoch": 2.5894667975139027, - "grad_norm": 0.9550169110298157, - "learning_rate": 9.23150419108249e-05, - "loss": 0.0884, - "step": 39580 - }, - { - "epoch": 2.5901210336931633, - "grad_norm": 0.9100688099861145, - "learning_rate": 9.231014779637067e-05, - "loss": 0.0702, - "step": 39590 - }, - { - "epoch": 2.590775269872424, - "grad_norm": 1.1120498180389404, - "learning_rate": 9.2305252253838e-05, - "loss": 0.0797, - "step": 39600 - }, - { - "epoch": 2.5914295060516848, - "grad_norm": 0.8531731963157654, - "learning_rate": 9.230035528339211e-05, - "loss": 0.0931, - "step": 39610 - }, - { - "epoch": 2.5920837422309453, - "grad_norm": 0.7823407649993896, - "learning_rate": 9.229545688519829e-05, - "loss": 0.0795, - "step": 39620 - }, - { - "epoch": 2.592737978410206, - "grad_norm": 0.9250206351280212, - "learning_rate": 9.229055705942189e-05, - "loss": 0.0858, - "step": 39630 - }, - { - "epoch": 2.593392214589467, - "grad_norm": 0.7410492897033691, - "learning_rate": 9.228565580622828e-05, - "loss": 0.0803, - "step": 39640 - }, - { - "epoch": 2.5940464507687278, - "grad_norm": 0.6830748915672302, - "learning_rate": 9.228075312578288e-05, - "loss": 0.0849, - "step": 39650 - }, - { - "epoch": 2.5947006869479883, - "grad_norm": 1.0575082302093506, - "learning_rate": 9.227584901825116e-05, - "loss": 0.0926, - "step": 39660 - }, - { - "epoch": 2.595354923127249, - "grad_norm": 0.950671911239624, - "learning_rate": 9.22709434837987e-05, - "loss": 0.079, - "step": 39670 - }, - { - "epoch": 2.5960091593065098, - "grad_norm": 0.7701423168182373, - "learning_rate": 9.226603652259102e-05, - "loss": 0.0928, - "step": 39680 - }, - { - "epoch": 2.5966633954857703, - "grad_norm": 0.9989359378814697, - "learning_rate": 9.226112813479377e-05, - "loss": 0.0818, - "step": 39690 - }, - { - "epoch": 2.597317631665031, - "grad_norm": 0.8732580542564392, - "learning_rate": 9.225621832057259e-05, - "loss": 0.0892, - "step": 39700 - }, - { - "epoch": 2.597971867844292, - "grad_norm": 0.894294261932373, - "learning_rate": 9.225130708009323e-05, - "loss": 0.0831, - "step": 39710 - }, - { - "epoch": 2.5986261040235528, - "grad_norm": 0.771438479423523, - "learning_rate": 9.224639441352145e-05, - "loss": 0.0812, - "step": 39720 - }, - { - "epoch": 2.5992803402028133, - "grad_norm": 1.0812944173812866, - "learning_rate": 9.224148032102307e-05, - "loss": 0.0876, - "step": 39730 - }, - { - "epoch": 2.599934576382074, - "grad_norm": 0.8493326902389526, - "learning_rate": 9.223656480276394e-05, - "loss": 0.0823, - "step": 39740 - }, - { - "epoch": 2.600588812561335, - "grad_norm": 0.8835618495941162, - "learning_rate": 9.223164785890997e-05, - "loss": 0.0766, - "step": 39750 - }, - { - "epoch": 2.6012430487405953, - "grad_norm": 0.8064636588096619, - "learning_rate": 9.222672948962713e-05, - "loss": 0.0913, - "step": 39760 - }, - { - "epoch": 2.601897284919856, - "grad_norm": 1.0927833318710327, - "learning_rate": 9.222180969508145e-05, - "loss": 0.0847, - "step": 39770 - }, - { - "epoch": 2.602551521099117, - "grad_norm": 0.9413180351257324, - "learning_rate": 9.221688847543894e-05, - "loss": 0.0852, - "step": 39780 - }, - { - "epoch": 2.6032057572783773, - "grad_norm": 0.8473165035247803, - "learning_rate": 9.221196583086573e-05, - "loss": 0.0854, - "step": 39790 - }, - { - "epoch": 2.6038599934576383, - "grad_norm": 0.8541494011878967, - "learning_rate": 9.220704176152797e-05, - "loss": 0.0803, - "step": 39800 - }, - { - "epoch": 2.604514229636899, - "grad_norm": 0.7438517808914185, - "learning_rate": 9.220211626759185e-05, - "loss": 0.0776, - "step": 39810 - }, - { - "epoch": 2.60516846581616, - "grad_norm": 0.6814586520195007, - "learning_rate": 9.219718934922364e-05, - "loss": 0.0854, - "step": 39820 - }, - { - "epoch": 2.6058227019954203, - "grad_norm": 0.7921211123466492, - "learning_rate": 9.219226100658962e-05, - "loss": 0.0748, - "step": 39830 - }, - { - "epoch": 2.606476938174681, - "grad_norm": 0.9955710768699646, - "learning_rate": 9.218733123985613e-05, - "loss": 0.0926, - "step": 39840 - }, - { - "epoch": 2.607131174353942, - "grad_norm": 0.7756005525588989, - "learning_rate": 9.218240004918958e-05, - "loss": 0.0792, - "step": 39850 - }, - { - "epoch": 2.6077854105332023, - "grad_norm": 0.7657015323638916, - "learning_rate": 9.21774674347564e-05, - "loss": 0.077, - "step": 39860 - }, - { - "epoch": 2.6084396467124633, - "grad_norm": 0.8866888880729675, - "learning_rate": 9.217253339672307e-05, - "loss": 0.0916, - "step": 39870 - }, - { - "epoch": 2.609093882891724, - "grad_norm": 1.0825079679489136, - "learning_rate": 9.216759793525615e-05, - "loss": 0.0777, - "step": 39880 - }, - { - "epoch": 2.609748119070985, - "grad_norm": 0.8313194513320923, - "learning_rate": 9.21626610505222e-05, - "loss": 0.0753, - "step": 39890 - }, - { - "epoch": 2.6104023552502453, - "grad_norm": 0.8642783164978027, - "learning_rate": 9.215772274268787e-05, - "loss": 0.0903, - "step": 39900 - }, - { - "epoch": 2.611056591429506, - "grad_norm": 0.8965114951133728, - "learning_rate": 9.215278301191982e-05, - "loss": 0.078, - "step": 39910 - }, - { - "epoch": 2.611710827608767, - "grad_norm": 0.7599003911018372, - "learning_rate": 9.214784185838483e-05, - "loss": 0.0928, - "step": 39920 - }, - { - "epoch": 2.6123650637880274, - "grad_norm": 1.0140457153320312, - "learning_rate": 9.21428992822496e-05, - "loss": 0.0797, - "step": 39930 - }, - { - "epoch": 2.6130192999672883, - "grad_norm": 0.9935689568519592, - "learning_rate": 9.213795528368102e-05, - "loss": 0.0964, - "step": 39940 - }, - { - "epoch": 2.613673536146549, - "grad_norm": 0.7364537715911865, - "learning_rate": 9.213300986284593e-05, - "loss": 0.0837, - "step": 39950 - }, - { - "epoch": 2.61432777232581, - "grad_norm": 0.7998794317245483, - "learning_rate": 9.212806301991125e-05, - "loss": 0.0843, - "step": 39960 - }, - { - "epoch": 2.6149820085050703, - "grad_norm": 0.8715366125106812, - "learning_rate": 9.212311475504398e-05, - "loss": 0.0864, - "step": 39970 - }, - { - "epoch": 2.615636244684331, - "grad_norm": 0.9843313694000244, - "learning_rate": 9.21181650684111e-05, - "loss": 0.0816, - "step": 39980 - }, - { - "epoch": 2.616290480863592, - "grad_norm": 0.8258795738220215, - "learning_rate": 9.21132139601797e-05, - "loss": 0.0795, - "step": 39990 - }, - { - "epoch": 2.6169447170428524, - "grad_norm": 0.8979169130325317, - "learning_rate": 9.210826143051688e-05, - "loss": 0.0815, - "step": 40000 - }, - { - "epoch": 2.6175989532221133, - "grad_norm": 0.8749635815620422, - "learning_rate": 9.210330747958979e-05, - "loss": 0.0771, - "step": 40010 - }, - { - "epoch": 2.618253189401374, - "grad_norm": 0.8296936750411987, - "learning_rate": 9.209835210756565e-05, - "loss": 0.0793, - "step": 40020 - }, - { - "epoch": 2.618907425580635, - "grad_norm": 0.7305333614349365, - "learning_rate": 9.209339531461173e-05, - "loss": 0.0747, - "step": 40030 - }, - { - "epoch": 2.6195616617598954, - "grad_norm": 0.888248860836029, - "learning_rate": 9.208843710089534e-05, - "loss": 0.0855, - "step": 40040 - }, - { - "epoch": 2.620215897939156, - "grad_norm": 0.7976614236831665, - "learning_rate": 9.20834774665838e-05, - "loss": 0.0876, - "step": 40050 - }, - { - "epoch": 2.620870134118417, - "grad_norm": 1.2806037664413452, - "learning_rate": 9.207851641184453e-05, - "loss": 0.0805, - "step": 40060 - }, - { - "epoch": 2.6215243702976774, - "grad_norm": 0.7971034049987793, - "learning_rate": 9.207355393684499e-05, - "loss": 0.0706, - "step": 40070 - }, - { - "epoch": 2.622178606476938, - "grad_norm": 0.7905651330947876, - "learning_rate": 9.206859004175264e-05, - "loss": 0.0784, - "step": 40080 - }, - { - "epoch": 2.622832842656199, - "grad_norm": 0.7352070808410645, - "learning_rate": 9.206362472673505e-05, - "loss": 0.0723, - "step": 40090 - }, - { - "epoch": 2.62348707883546, - "grad_norm": 0.9722936153411865, - "learning_rate": 9.205865799195982e-05, - "loss": 0.081, - "step": 40100 - }, - { - "epoch": 2.6241413150147204, - "grad_norm": 0.849582850933075, - "learning_rate": 9.205368983759457e-05, - "loss": 0.0933, - "step": 40110 - }, - { - "epoch": 2.624795551193981, - "grad_norm": 0.7181478142738342, - "learning_rate": 9.2048720263807e-05, - "loss": 0.0824, - "step": 40120 - }, - { - "epoch": 2.625449787373242, - "grad_norm": 0.8331068158149719, - "learning_rate": 9.204374927076486e-05, - "loss": 0.0774, - "step": 40130 - }, - { - "epoch": 2.6261040235525024, - "grad_norm": 0.7740342617034912, - "learning_rate": 9.20387768586359e-05, - "loss": 0.0791, - "step": 40140 - }, - { - "epoch": 2.626758259731763, - "grad_norm": 0.7920116186141968, - "learning_rate": 9.203380302758797e-05, - "loss": 0.0765, - "step": 40150 - }, - { - "epoch": 2.627412495911024, - "grad_norm": 0.8771019577980042, - "learning_rate": 9.202882777778896e-05, - "loss": 0.08, - "step": 40160 - }, - { - "epoch": 2.628066732090285, - "grad_norm": 0.9749835729598999, - "learning_rate": 9.202385110940678e-05, - "loss": 0.0779, - "step": 40170 - }, - { - "epoch": 2.6287209682695454, - "grad_norm": 0.8514975309371948, - "learning_rate": 9.201887302260943e-05, - "loss": 0.0887, - "step": 40180 - }, - { - "epoch": 2.629375204448806, - "grad_norm": 0.820841372013092, - "learning_rate": 9.201389351756491e-05, - "loss": 0.0747, - "step": 40190 - }, - { - "epoch": 2.630029440628067, - "grad_norm": 0.9230198860168457, - "learning_rate": 9.200891259444129e-05, - "loss": 0.0867, - "step": 40200 - }, - { - "epoch": 2.6306836768073274, - "grad_norm": 0.7532345056533813, - "learning_rate": 9.20039302534067e-05, - "loss": 0.0861, - "step": 40210 - }, - { - "epoch": 2.631337912986588, - "grad_norm": 0.7638508677482605, - "learning_rate": 9.19989464946293e-05, - "loss": 0.087, - "step": 40220 - }, - { - "epoch": 2.631992149165849, - "grad_norm": 0.7697664499282837, - "learning_rate": 9.199396131827731e-05, - "loss": 0.0725, - "step": 40230 - }, - { - "epoch": 2.6326463853451094, - "grad_norm": 0.7672895193099976, - "learning_rate": 9.198897472451901e-05, - "loss": 0.086, - "step": 40240 - }, - { - "epoch": 2.6333006215243704, - "grad_norm": 0.830986499786377, - "learning_rate": 9.198398671352267e-05, - "loss": 0.0793, - "step": 40250 - }, - { - "epoch": 2.633954857703631, - "grad_norm": 0.8693823218345642, - "learning_rate": 9.19789972854567e-05, - "loss": 0.0868, - "step": 40260 - }, - { - "epoch": 2.634609093882892, - "grad_norm": 0.8019348978996277, - "learning_rate": 9.197400644048944e-05, - "loss": 0.0741, - "step": 40270 - }, - { - "epoch": 2.6352633300621524, - "grad_norm": 0.9302101731300354, - "learning_rate": 9.196901417878941e-05, - "loss": 0.0779, - "step": 40280 - }, - { - "epoch": 2.635917566241413, - "grad_norm": 0.9065577387809753, - "learning_rate": 9.196402050052507e-05, - "loss": 0.0985, - "step": 40290 - }, - { - "epoch": 2.636571802420674, - "grad_norm": 0.8283711671829224, - "learning_rate": 9.195902540586498e-05, - "loss": 0.0797, - "step": 40300 - }, - { - "epoch": 2.6372260385999344, - "grad_norm": 0.9816059470176697, - "learning_rate": 9.195402889497777e-05, - "loss": 0.0925, - "step": 40310 - }, - { - "epoch": 2.6378802747791954, - "grad_norm": 0.7747599482536316, - "learning_rate": 9.194903096803202e-05, - "loss": 0.0841, - "step": 40320 - }, - { - "epoch": 2.638534510958456, - "grad_norm": 0.9950234889984131, - "learning_rate": 9.194403162519648e-05, - "loss": 0.0904, - "step": 40330 - }, - { - "epoch": 2.639188747137717, - "grad_norm": 1.1040619611740112, - "learning_rate": 9.193903086663987e-05, - "loss": 0.0822, - "step": 40340 - }, - { - "epoch": 2.6398429833169774, - "grad_norm": 1.090773582458496, - "learning_rate": 9.193402869253097e-05, - "loss": 0.0944, - "step": 40350 - }, - { - "epoch": 2.640497219496238, - "grad_norm": 1.1542222499847412, - "learning_rate": 9.192902510303862e-05, - "loss": 0.0864, - "step": 40360 - }, - { - "epoch": 2.641151455675499, - "grad_norm": 0.9892890453338623, - "learning_rate": 9.192402009833173e-05, - "loss": 0.0861, - "step": 40370 - }, - { - "epoch": 2.6418056918547594, - "grad_norm": 1.2127673625946045, - "learning_rate": 9.19190136785792e-05, - "loss": 0.0837, - "step": 40380 - }, - { - "epoch": 2.6424599280340204, - "grad_norm": 0.8235146999359131, - "learning_rate": 9.191400584395003e-05, - "loss": 0.0825, - "step": 40390 - }, - { - "epoch": 2.643114164213281, - "grad_norm": 0.7639740109443665, - "learning_rate": 9.190899659461323e-05, - "loss": 0.0767, - "step": 40400 - }, - { - "epoch": 2.643768400392542, - "grad_norm": 0.8715619444847107, - "learning_rate": 9.19039859307379e-05, - "loss": 0.0795, - "step": 40410 - }, - { - "epoch": 2.6444226365718024, - "grad_norm": 0.8103946447372437, - "learning_rate": 9.189897385249313e-05, - "loss": 0.0898, - "step": 40420 - }, - { - "epoch": 2.645076872751063, - "grad_norm": 0.9815798997879028, - "learning_rate": 9.189396036004811e-05, - "loss": 0.0782, - "step": 40430 - }, - { - "epoch": 2.645731108930324, - "grad_norm": 0.884543776512146, - "learning_rate": 9.188894545357207e-05, - "loss": 0.0775, - "step": 40440 - }, - { - "epoch": 2.6463853451095845, - "grad_norm": 0.8589003682136536, - "learning_rate": 9.188392913323423e-05, - "loss": 0.0867, - "step": 40450 - }, - { - "epoch": 2.6470395812888454, - "grad_norm": 0.8187326192855835, - "learning_rate": 9.187891139920397e-05, - "loss": 0.081, - "step": 40460 - }, - { - "epoch": 2.647693817468106, - "grad_norm": 0.8445239663124084, - "learning_rate": 9.187389225165062e-05, - "loss": 0.0812, - "step": 40470 - }, - { - "epoch": 2.648348053647367, - "grad_norm": 0.7465826869010925, - "learning_rate": 9.186887169074356e-05, - "loss": 0.08, - "step": 40480 - }, - { - "epoch": 2.6490022898266274, - "grad_norm": 0.8137046098709106, - "learning_rate": 9.186384971665229e-05, - "loss": 0.0792, - "step": 40490 - }, - { - "epoch": 2.649656526005888, - "grad_norm": 0.912079393863678, - "learning_rate": 9.185882632954632e-05, - "loss": 0.0809, - "step": 40500 - }, - { - "epoch": 2.650310762185149, - "grad_norm": 0.9054073691368103, - "learning_rate": 9.185380152959515e-05, - "loss": 0.0777, - "step": 40510 - }, - { - "epoch": 2.6509649983644095, - "grad_norm": 0.9537298083305359, - "learning_rate": 9.184877531696844e-05, - "loss": 0.0822, - "step": 40520 - }, - { - "epoch": 2.65161923454367, - "grad_norm": 0.8448399901390076, - "learning_rate": 9.18437476918358e-05, - "loss": 0.0788, - "step": 40530 - }, - { - "epoch": 2.652273470722931, - "grad_norm": 0.8022094368934631, - "learning_rate": 9.183871865436693e-05, - "loss": 0.0927, - "step": 40540 - }, - { - "epoch": 2.652927706902192, - "grad_norm": 0.7613011598587036, - "learning_rate": 9.183368820473159e-05, - "loss": 0.0743, - "step": 40550 - }, - { - "epoch": 2.6535819430814525, - "grad_norm": 0.8020244836807251, - "learning_rate": 9.182865634309956e-05, - "loss": 0.0729, - "step": 40560 - }, - { - "epoch": 2.654236179260713, - "grad_norm": 0.7869201898574829, - "learning_rate": 9.182362306964067e-05, - "loss": 0.0818, - "step": 40570 - }, - { - "epoch": 2.654890415439974, - "grad_norm": 0.859542965888977, - "learning_rate": 9.181858838452481e-05, - "loss": 0.0909, - "step": 40580 - }, - { - "epoch": 2.6555446516192345, - "grad_norm": 0.8157853484153748, - "learning_rate": 9.181355228792194e-05, - "loss": 0.0727, - "step": 40590 - }, - { - "epoch": 2.656198887798495, - "grad_norm": 0.8959032297134399, - "learning_rate": 9.180851478000199e-05, - "loss": 0.0762, - "step": 40600 - }, - { - "epoch": 2.656853123977756, - "grad_norm": 0.759432315826416, - "learning_rate": 9.180347586093505e-05, - "loss": 0.0805, - "step": 40610 - }, - { - "epoch": 2.657507360157017, - "grad_norm": 0.842648983001709, - "learning_rate": 9.179843553089114e-05, - "loss": 0.0766, - "step": 40620 - }, - { - "epoch": 2.6581615963362775, - "grad_norm": 0.8938133120536804, - "learning_rate": 9.179339379004043e-05, - "loss": 0.0839, - "step": 40630 - }, - { - "epoch": 2.658815832515538, - "grad_norm": 1.022493600845337, - "learning_rate": 9.178835063855306e-05, - "loss": 0.0812, - "step": 40640 - }, - { - "epoch": 2.659470068694799, - "grad_norm": 0.8210968375205994, - "learning_rate": 9.178330607659927e-05, - "loss": 0.0804, - "step": 40650 - }, - { - "epoch": 2.6601243048740595, - "grad_norm": 0.7599284648895264, - "learning_rate": 9.177826010434931e-05, - "loss": 0.0731, - "step": 40660 - }, - { - "epoch": 2.66077854105332, - "grad_norm": 0.8126060962677002, - "learning_rate": 9.177321272197352e-05, - "loss": 0.079, - "step": 40670 - }, - { - "epoch": 2.661432777232581, - "grad_norm": 0.9725340604782104, - "learning_rate": 9.176816392964223e-05, - "loss": 0.0808, - "step": 40680 - }, - { - "epoch": 2.662087013411842, - "grad_norm": 0.7665135264396667, - "learning_rate": 9.176311372752589e-05, - "loss": 0.0781, - "step": 40690 - }, - { - "epoch": 2.6627412495911025, - "grad_norm": 0.9257283806800842, - "learning_rate": 9.175806211579491e-05, - "loss": 0.088, - "step": 40700 - }, - { - "epoch": 2.663395485770363, - "grad_norm": 0.9089275598526001, - "learning_rate": 9.175300909461982e-05, - "loss": 0.0823, - "step": 40710 - }, - { - "epoch": 2.664049721949624, - "grad_norm": 0.8126506209373474, - "learning_rate": 9.174795466417119e-05, - "loss": 0.0721, - "step": 40720 - }, - { - "epoch": 2.6647039581288845, - "grad_norm": 1.0364658832550049, - "learning_rate": 9.17428988246196e-05, - "loss": 0.0827, - "step": 40730 - }, - { - "epoch": 2.665358194308145, - "grad_norm": 0.8109714984893799, - "learning_rate": 9.173784157613568e-05, - "loss": 0.0815, - "step": 40740 - }, - { - "epoch": 2.666012430487406, - "grad_norm": 0.8420194387435913, - "learning_rate": 9.173278291889015e-05, - "loss": 0.0824, - "step": 40750 - }, - { - "epoch": 2.6666666666666665, - "grad_norm": 0.7918026447296143, - "learning_rate": 9.172772285305375e-05, - "loss": 0.0894, - "step": 40760 - }, - { - "epoch": 2.6673209028459275, - "grad_norm": 0.9323230385780334, - "learning_rate": 9.172266137879728e-05, - "loss": 0.0864, - "step": 40770 - }, - { - "epoch": 2.667975139025188, - "grad_norm": 0.8225651383399963, - "learning_rate": 9.171759849629155e-05, - "loss": 0.0806, - "step": 40780 - }, - { - "epoch": 2.668629375204449, - "grad_norm": 0.9506831765174866, - "learning_rate": 9.171253420570748e-05, - "loss": 0.0861, - "step": 40790 - }, - { - "epoch": 2.6692836113837095, - "grad_norm": 0.8589584827423096, - "learning_rate": 9.170746850721598e-05, - "loss": 0.0791, - "step": 40800 - }, - { - "epoch": 2.66993784756297, - "grad_norm": 0.9063448309898376, - "learning_rate": 9.170240140098802e-05, - "loss": 0.0776, - "step": 40810 - }, - { - "epoch": 2.670592083742231, - "grad_norm": 0.7764676809310913, - "learning_rate": 9.169733288719467e-05, - "loss": 0.0784, - "step": 40820 - }, - { - "epoch": 2.6712463199214915, - "grad_norm": 0.8123670816421509, - "learning_rate": 9.169226296600696e-05, - "loss": 0.0658, - "step": 40830 - }, - { - "epoch": 2.6719005561007525, - "grad_norm": 0.8324966430664062, - "learning_rate": 9.168719163759604e-05, - "loss": 0.0741, - "step": 40840 - }, - { - "epoch": 2.672554792280013, - "grad_norm": 0.925599217414856, - "learning_rate": 9.168211890213307e-05, - "loss": 0.0765, - "step": 40850 - }, - { - "epoch": 2.673209028459274, - "grad_norm": 1.0423486232757568, - "learning_rate": 9.167704475978928e-05, - "loss": 0.0788, - "step": 40860 - }, - { - "epoch": 2.6738632646385345, - "grad_norm": 0.8119267821311951, - "learning_rate": 9.167196921073593e-05, - "loss": 0.0855, - "step": 40870 - }, - { - "epoch": 2.674517500817795, - "grad_norm": 0.9077923893928528, - "learning_rate": 9.166689225514431e-05, - "loss": 0.0715, - "step": 40880 - }, - { - "epoch": 2.675171736997056, - "grad_norm": 0.9296697378158569, - "learning_rate": 9.166181389318583e-05, - "loss": 0.0768, - "step": 40890 - }, - { - "epoch": 2.6758259731763165, - "grad_norm": 0.7002086043357849, - "learning_rate": 9.165673412503183e-05, - "loss": 0.0896, - "step": 40900 - }, - { - "epoch": 2.6764802093555775, - "grad_norm": 0.7667686939239502, - "learning_rate": 9.165165295085385e-05, - "loss": 0.0858, - "step": 40910 - }, - { - "epoch": 2.677134445534838, - "grad_norm": 1.128864049911499, - "learning_rate": 9.164657037082331e-05, - "loss": 0.0809, - "step": 40920 - }, - { - "epoch": 2.677788681714099, - "grad_norm": 0.8613152503967285, - "learning_rate": 9.164148638511182e-05, - "loss": 0.0882, - "step": 40930 - }, - { - "epoch": 2.6784429178933595, - "grad_norm": 0.7017923593521118, - "learning_rate": 9.163640099389095e-05, - "loss": 0.0807, - "step": 40940 - }, - { - "epoch": 2.67909715407262, - "grad_norm": 0.8696222305297852, - "learning_rate": 9.163131419733235e-05, - "loss": 0.0817, - "step": 40950 - }, - { - "epoch": 2.679751390251881, - "grad_norm": 0.950098991394043, - "learning_rate": 9.16262259956077e-05, - "loss": 0.0775, - "step": 40960 - }, - { - "epoch": 2.6804056264311416, - "grad_norm": 1.0551055669784546, - "learning_rate": 9.162113638888879e-05, - "loss": 0.0838, - "step": 40970 - }, - { - "epoch": 2.681059862610402, - "grad_norm": 0.8011052012443542, - "learning_rate": 9.161604537734733e-05, - "loss": 0.0891, - "step": 40980 - }, - { - "epoch": 2.681714098789663, - "grad_norm": 0.9275655746459961, - "learning_rate": 9.161095296115523e-05, - "loss": 0.0828, - "step": 40990 - }, - { - "epoch": 2.682368334968924, - "grad_norm": 1.1314018964767456, - "learning_rate": 9.160585914048432e-05, - "loss": 0.0815, - "step": 41000 - }, - { - "epoch": 2.6830225711481845, - "grad_norm": 0.9840704202651978, - "learning_rate": 9.160076391550654e-05, - "loss": 0.0804, - "step": 41010 - }, - { - "epoch": 2.683676807327445, - "grad_norm": 0.8175294399261475, - "learning_rate": 9.15956672863939e-05, - "loss": 0.0729, - "step": 41020 - }, - { - "epoch": 2.684331043506706, - "grad_norm": 0.7896432280540466, - "learning_rate": 9.159056925331837e-05, - "loss": 0.079, - "step": 41030 - }, - { - "epoch": 2.6849852796859666, - "grad_norm": 0.7831984758377075, - "learning_rate": 9.158546981645208e-05, - "loss": 0.0845, - "step": 41040 - }, - { - "epoch": 2.685639515865227, - "grad_norm": 0.8683128952980042, - "learning_rate": 9.158036897596712e-05, - "loss": 0.0869, - "step": 41050 - }, - { - "epoch": 2.686293752044488, - "grad_norm": 0.8791904449462891, - "learning_rate": 9.157526673203565e-05, - "loss": 0.0835, - "step": 41060 - }, - { - "epoch": 2.686947988223749, - "grad_norm": 0.7771376371383667, - "learning_rate": 9.15701630848299e-05, - "loss": 0.0825, - "step": 41070 - }, - { - "epoch": 2.6876022244030096, - "grad_norm": 0.6314225792884827, - "learning_rate": 9.156505803452212e-05, - "loss": 0.0699, - "step": 41080 - }, - { - "epoch": 2.68825646058227, - "grad_norm": 1.073372483253479, - "learning_rate": 9.15599515812846e-05, - "loss": 0.0892, - "step": 41090 - }, - { - "epoch": 2.688910696761531, - "grad_norm": 0.9210318922996521, - "learning_rate": 9.155484372528975e-05, - "loss": 0.0779, - "step": 41100 - }, - { - "epoch": 2.6895649329407916, - "grad_norm": 0.8791244626045227, - "learning_rate": 9.154973446670993e-05, - "loss": 0.0981, - "step": 41110 - }, - { - "epoch": 2.690219169120052, - "grad_norm": 0.8100093007087708, - "learning_rate": 9.154462380571761e-05, - "loss": 0.0867, - "step": 41120 - }, - { - "epoch": 2.690873405299313, - "grad_norm": 1.020846962928772, - "learning_rate": 9.153951174248528e-05, - "loss": 0.0945, - "step": 41130 - }, - { - "epoch": 2.691527641478574, - "grad_norm": 0.929047167301178, - "learning_rate": 9.153439827718549e-05, - "loss": 0.0766, - "step": 41140 - }, - { - "epoch": 2.6921818776578346, - "grad_norm": 0.8004492521286011, - "learning_rate": 9.152928340999083e-05, - "loss": 0.0754, - "step": 41150 - }, - { - "epoch": 2.692836113837095, - "grad_norm": 0.7606692910194397, - "learning_rate": 9.152416714107393e-05, - "loss": 0.0761, - "step": 41160 - }, - { - "epoch": 2.693490350016356, - "grad_norm": 0.9273959994316101, - "learning_rate": 9.15190494706075e-05, - "loss": 0.0838, - "step": 41170 - }, - { - "epoch": 2.6941445861956166, - "grad_norm": 0.8332855701446533, - "learning_rate": 9.151393039876425e-05, - "loss": 0.0781, - "step": 41180 - }, - { - "epoch": 2.694798822374877, - "grad_norm": 0.6218408942222595, - "learning_rate": 9.150880992571698e-05, - "loss": 0.0824, - "step": 41190 - }, - { - "epoch": 2.695453058554138, - "grad_norm": 0.8714901804924011, - "learning_rate": 9.150368805163851e-05, - "loss": 0.0875, - "step": 41200 - }, - { - "epoch": 2.6961072947333986, - "grad_norm": 0.7702926397323608, - "learning_rate": 9.149856477670173e-05, - "loss": 0.0842, - "step": 41210 - }, - { - "epoch": 2.6967615309126596, - "grad_norm": 0.8732930421829224, - "learning_rate": 9.149344010107955e-05, - "loss": 0.086, - "step": 41220 - }, - { - "epoch": 2.69741576709192, - "grad_norm": 0.8141332268714905, - "learning_rate": 9.148831402494495e-05, - "loss": 0.0917, - "step": 41230 - }, - { - "epoch": 2.698070003271181, - "grad_norm": 0.9626291990280151, - "learning_rate": 9.148318654847094e-05, - "loss": 0.0798, - "step": 41240 - }, - { - "epoch": 2.6987242394504416, - "grad_norm": 0.8662136793136597, - "learning_rate": 9.14780576718306e-05, - "loss": 0.0837, - "step": 41250 - }, - { - "epoch": 2.699378475629702, - "grad_norm": 0.842068076133728, - "learning_rate": 9.147292739519702e-05, - "loss": 0.0848, - "step": 41260 - }, - { - "epoch": 2.700032711808963, - "grad_norm": 0.8036767244338989, - "learning_rate": 9.146779571874337e-05, - "loss": 0.071, - "step": 41270 - }, - { - "epoch": 2.7006869479882236, - "grad_norm": 0.8152961134910583, - "learning_rate": 9.146266264264288e-05, - "loss": 0.0795, - "step": 41280 - }, - { - "epoch": 2.7013411841674846, - "grad_norm": 0.9094104766845703, - "learning_rate": 9.145752816706878e-05, - "loss": 0.0715, - "step": 41290 - }, - { - "epoch": 2.701995420346745, - "grad_norm": 0.8534713983535767, - "learning_rate": 9.145239229219438e-05, - "loss": 0.0788, - "step": 41300 - }, - { - "epoch": 2.702649656526006, - "grad_norm": 0.947786271572113, - "learning_rate": 9.144725501819303e-05, - "loss": 0.0888, - "step": 41310 - }, - { - "epoch": 2.7033038927052666, - "grad_norm": 0.9250187277793884, - "learning_rate": 9.14421163452381e-05, - "loss": 0.0849, - "step": 41320 - }, - { - "epoch": 2.703958128884527, - "grad_norm": 0.9689621925354004, - "learning_rate": 9.143697627350308e-05, - "loss": 0.0792, - "step": 41330 - }, - { - "epoch": 2.704612365063788, - "grad_norm": 0.9268077611923218, - "learning_rate": 9.143183480316143e-05, - "loss": 0.079, - "step": 41340 - }, - { - "epoch": 2.7052666012430486, - "grad_norm": 0.7102203369140625, - "learning_rate": 9.142669193438669e-05, - "loss": 0.0812, - "step": 41350 - }, - { - "epoch": 2.7059208374223096, - "grad_norm": 0.8151041269302368, - "learning_rate": 9.142154766735247e-05, - "loss": 0.0776, - "step": 41360 - }, - { - "epoch": 2.70657507360157, - "grad_norm": 1.0687074661254883, - "learning_rate": 9.141640200223236e-05, - "loss": 0.0804, - "step": 41370 - }, - { - "epoch": 2.707229309780831, - "grad_norm": 0.9339028596878052, - "learning_rate": 9.141125493920009e-05, - "loss": 0.0873, - "step": 41380 - }, - { - "epoch": 2.7078835459600916, - "grad_norm": 0.8078384399414062, - "learning_rate": 9.140610647842934e-05, - "loss": 0.0742, - "step": 41390 - }, - { - "epoch": 2.708537782139352, - "grad_norm": 1.0456902980804443, - "learning_rate": 9.140095662009392e-05, - "loss": 0.0764, - "step": 41400 - }, - { - "epoch": 2.709192018318613, - "grad_norm": 0.9626041650772095, - "learning_rate": 9.139580536436763e-05, - "loss": 0.075, - "step": 41410 - }, - { - "epoch": 2.7098462544978736, - "grad_norm": 1.0236674547195435, - "learning_rate": 9.139065271142434e-05, - "loss": 0.0861, - "step": 41420 - }, - { - "epoch": 2.7105004906771346, - "grad_norm": 0.8825036287307739, - "learning_rate": 9.138549866143797e-05, - "loss": 0.0693, - "step": 41430 - }, - { - "epoch": 2.711154726856395, - "grad_norm": 1.041279911994934, - "learning_rate": 9.138034321458248e-05, - "loss": 0.0874, - "step": 41440 - }, - { - "epoch": 2.711808963035656, - "grad_norm": 0.7983617782592773, - "learning_rate": 9.13751863710319e-05, - "loss": 0.082, - "step": 41450 - }, - { - "epoch": 2.7124631992149166, - "grad_norm": 0.7135721445083618, - "learning_rate": 9.137002813096026e-05, - "loss": 0.0747, - "step": 41460 - }, - { - "epoch": 2.713117435394177, - "grad_norm": 0.9826575517654419, - "learning_rate": 9.136486849454167e-05, - "loss": 0.0825, - "step": 41470 - }, - { - "epoch": 2.713771671573438, - "grad_norm": 0.9839840531349182, - "learning_rate": 9.135970746195029e-05, - "loss": 0.0923, - "step": 41480 - }, - { - "epoch": 2.7144259077526987, - "grad_norm": 0.9663382172584534, - "learning_rate": 9.135454503336032e-05, - "loss": 0.0676, - "step": 41490 - }, - { - "epoch": 2.715080143931959, - "grad_norm": 0.8061355352401733, - "learning_rate": 9.134938120894598e-05, - "loss": 0.0809, - "step": 41500 - }, - { - "epoch": 2.71573438011122, - "grad_norm": 0.8093533515930176, - "learning_rate": 9.134421598888159e-05, - "loss": 0.0772, - "step": 41510 - }, - { - "epoch": 2.716388616290481, - "grad_norm": 0.7529310584068298, - "learning_rate": 9.133904937334148e-05, - "loss": 0.0766, - "step": 41520 - }, - { - "epoch": 2.7170428524697416, - "grad_norm": 0.7155537605285645, - "learning_rate": 9.133388136250005e-05, - "loss": 0.0707, - "step": 41530 - }, - { - "epoch": 2.717697088649002, - "grad_norm": 0.8347744345664978, - "learning_rate": 9.13287119565317e-05, - "loss": 0.0737, - "step": 41540 - }, - { - "epoch": 2.718351324828263, - "grad_norm": 0.7836735844612122, - "learning_rate": 9.132354115561094e-05, - "loss": 0.0874, - "step": 41550 - }, - { - "epoch": 2.7190055610075237, - "grad_norm": 0.8764668107032776, - "learning_rate": 9.13183689599123e-05, - "loss": 0.0857, - "step": 41560 - }, - { - "epoch": 2.719659797186784, - "grad_norm": 0.7778729200363159, - "learning_rate": 9.131319536961035e-05, - "loss": 0.0865, - "step": 41570 - }, - { - "epoch": 2.720314033366045, - "grad_norm": 0.8253190517425537, - "learning_rate": 9.13080203848797e-05, - "loss": 0.0805, - "step": 41580 - }, - { - "epoch": 2.720968269545306, - "grad_norm": 0.7873772382736206, - "learning_rate": 9.130284400589503e-05, - "loss": 0.0815, - "step": 41590 - }, - { - "epoch": 2.7216225057245667, - "grad_norm": 0.7975731492042542, - "learning_rate": 9.129766623283105e-05, - "loss": 0.0859, - "step": 41600 - }, - { - "epoch": 2.722276741903827, - "grad_norm": 0.8026714324951172, - "learning_rate": 9.129248706586253e-05, - "loss": 0.0773, - "step": 41610 - }, - { - "epoch": 2.722930978083088, - "grad_norm": 0.8112697005271912, - "learning_rate": 9.128730650516429e-05, - "loss": 0.0743, - "step": 41620 - }, - { - "epoch": 2.7235852142623487, - "grad_norm": 0.8520511388778687, - "learning_rate": 9.128212455091115e-05, - "loss": 0.0847, - "step": 41630 - }, - { - "epoch": 2.724239450441609, - "grad_norm": 0.8587812185287476, - "learning_rate": 9.127694120327806e-05, - "loss": 0.0889, - "step": 41640 - }, - { - "epoch": 2.72489368662087, - "grad_norm": 1.0693318843841553, - "learning_rate": 9.127175646243994e-05, - "loss": 0.0834, - "step": 41650 - }, - { - "epoch": 2.7255479228001307, - "grad_norm": 0.8526062369346619, - "learning_rate": 9.126657032857181e-05, - "loss": 0.0772, - "step": 41660 - }, - { - "epoch": 2.7262021589793917, - "grad_norm": 1.0225043296813965, - "learning_rate": 9.126138280184871e-05, - "loss": 0.0982, - "step": 41670 - }, - { - "epoch": 2.726856395158652, - "grad_norm": 0.7854475378990173, - "learning_rate": 9.125619388244571e-05, - "loss": 0.0768, - "step": 41680 - }, - { - "epoch": 2.727510631337913, - "grad_norm": 0.929229736328125, - "learning_rate": 9.125100357053797e-05, - "loss": 0.0778, - "step": 41690 - }, - { - "epoch": 2.7281648675171737, - "grad_norm": 0.7836943864822388, - "learning_rate": 9.124581186630071e-05, - "loss": 0.0752, - "step": 41700 - }, - { - "epoch": 2.728819103696434, - "grad_norm": 0.9992619156837463, - "learning_rate": 9.124061876990908e-05, - "loss": 0.0795, - "step": 41710 - }, - { - "epoch": 2.729473339875695, - "grad_norm": 0.8057607412338257, - "learning_rate": 9.123542428153844e-05, - "loss": 0.072, - "step": 41720 - }, - { - "epoch": 2.7301275760549557, - "grad_norm": 0.8987215161323547, - "learning_rate": 9.123022840136407e-05, - "loss": 0.0805, - "step": 41730 - }, - { - "epoch": 2.7307818122342167, - "grad_norm": 0.8965569734573364, - "learning_rate": 9.122503112956138e-05, - "loss": 0.0797, - "step": 41740 - }, - { - "epoch": 2.731436048413477, - "grad_norm": 0.7639127969741821, - "learning_rate": 9.121983246630575e-05, - "loss": 0.0801, - "step": 41750 - }, - { - "epoch": 2.732090284592738, - "grad_norm": 0.83674156665802, - "learning_rate": 9.121463241177269e-05, - "loss": 0.0802, - "step": 41760 - }, - { - "epoch": 2.7327445207719987, - "grad_norm": 1.0172474384307861, - "learning_rate": 9.120943096613768e-05, - "loss": 0.0752, - "step": 41770 - }, - { - "epoch": 2.7333987569512592, - "grad_norm": 0.9789382815361023, - "learning_rate": 9.120422812957632e-05, - "loss": 0.0932, - "step": 41780 - }, - { - "epoch": 2.73405299313052, - "grad_norm": 0.934238612651825, - "learning_rate": 9.119902390226418e-05, - "loss": 0.0801, - "step": 41790 - }, - { - "epoch": 2.7347072293097807, - "grad_norm": 0.7868991494178772, - "learning_rate": 9.119381828437694e-05, - "loss": 0.0683, - "step": 41800 - }, - { - "epoch": 2.7353614654890417, - "grad_norm": 0.7209054827690125, - "learning_rate": 9.118861127609031e-05, - "loss": 0.0724, - "step": 41810 - }, - { - "epoch": 2.736015701668302, - "grad_norm": 0.8778037428855896, - "learning_rate": 9.118340287758001e-05, - "loss": 0.0798, - "step": 41820 - }, - { - "epoch": 2.736669937847563, - "grad_norm": 0.9034239053726196, - "learning_rate": 9.117819308902186e-05, - "loss": 0.0832, - "step": 41830 - }, - { - "epoch": 2.7373241740268237, - "grad_norm": 0.9231058359146118, - "learning_rate": 9.117298191059171e-05, - "loss": 0.0784, - "step": 41840 - }, - { - "epoch": 2.7379784102060842, - "grad_norm": 0.8640102744102478, - "learning_rate": 9.116776934246543e-05, - "loss": 0.0845, - "step": 41850 - }, - { - "epoch": 2.738632646385345, - "grad_norm": 0.9153121709823608, - "learning_rate": 9.116255538481896e-05, - "loss": 0.0832, - "step": 41860 - }, - { - "epoch": 2.7392868825646057, - "grad_norm": 0.7354351282119751, - "learning_rate": 9.115734003782832e-05, - "loss": 0.0759, - "step": 41870 - }, - { - "epoch": 2.7399411187438667, - "grad_norm": 0.9227734804153442, - "learning_rate": 9.115212330166949e-05, - "loss": 0.0839, - "step": 41880 - }, - { - "epoch": 2.7405953549231272, - "grad_norm": 0.8186841011047363, - "learning_rate": 9.114690517651859e-05, - "loss": 0.0802, - "step": 41890 - }, - { - "epoch": 2.741249591102388, - "grad_norm": 0.8185954093933105, - "learning_rate": 9.114168566255172e-05, - "loss": 0.0824, - "step": 41900 - }, - { - "epoch": 2.7419038272816487, - "grad_norm": 1.1469035148620605, - "learning_rate": 9.113646475994506e-05, - "loss": 0.0783, - "step": 41910 - }, - { - "epoch": 2.7425580634609092, - "grad_norm": 0.8847156763076782, - "learning_rate": 9.113124246887483e-05, - "loss": 0.0723, - "step": 41920 - }, - { - "epoch": 2.74321229964017, - "grad_norm": 0.870272696018219, - "learning_rate": 9.11260187895173e-05, - "loss": 0.0812, - "step": 41930 - }, - { - "epoch": 2.7438665358194307, - "grad_norm": 1.0734666585922241, - "learning_rate": 9.112079372204877e-05, - "loss": 0.0863, - "step": 41940 - }, - { - "epoch": 2.7445207719986913, - "grad_norm": 0.6786089539527893, - "learning_rate": 9.111556726664563e-05, - "loss": 0.0796, - "step": 41950 - }, - { - "epoch": 2.7451750081779522, - "grad_norm": 0.8662351369857788, - "learning_rate": 9.111033942348426e-05, - "loss": 0.0884, - "step": 41960 - }, - { - "epoch": 2.745829244357213, - "grad_norm": 0.7827929854393005, - "learning_rate": 9.110511019274111e-05, - "loss": 0.0764, - "step": 41970 - }, - { - "epoch": 2.7464834805364737, - "grad_norm": 1.1550228595733643, - "learning_rate": 9.109987957459267e-05, - "loss": 0.0845, - "step": 41980 - }, - { - "epoch": 2.7471377167157343, - "grad_norm": 0.8338910937309265, - "learning_rate": 9.109464756921553e-05, - "loss": 0.0824, - "step": 41990 - }, - { - "epoch": 2.7477919528949952, - "grad_norm": 0.9103622436523438, - "learning_rate": 9.108941417678626e-05, - "loss": 0.0896, - "step": 42000 - }, - { - "epoch": 2.7484461890742558, - "grad_norm": 0.8045547008514404, - "learning_rate": 9.108417939748149e-05, - "loss": 0.0805, - "step": 42010 - }, - { - "epoch": 2.7491004252535163, - "grad_norm": 0.8297812342643738, - "learning_rate": 9.107894323147792e-05, - "loss": 0.0819, - "step": 42020 - }, - { - "epoch": 2.7497546614327772, - "grad_norm": 0.8854258060455322, - "learning_rate": 9.107370567895229e-05, - "loss": 0.0837, - "step": 42030 - }, - { - "epoch": 2.750408897612038, - "grad_norm": 0.8766602277755737, - "learning_rate": 9.106846674008137e-05, - "loss": 0.0776, - "step": 42040 - }, - { - "epoch": 2.7510631337912987, - "grad_norm": 1.0775314569473267, - "learning_rate": 9.1063226415042e-05, - "loss": 0.0909, - "step": 42050 - }, - { - "epoch": 2.7517173699705593, - "grad_norm": 0.7865322828292847, - "learning_rate": 9.105798470401103e-05, - "loss": 0.0914, - "step": 42060 - }, - { - "epoch": 2.7523716061498202, - "grad_norm": 0.9470481872558594, - "learning_rate": 9.10527416071654e-05, - "loss": 0.0841, - "step": 42070 - }, - { - "epoch": 2.7530258423290808, - "grad_norm": 0.6476520299911499, - "learning_rate": 9.104749712468207e-05, - "loss": 0.0813, - "step": 42080 - }, - { - "epoch": 2.7536800785083413, - "grad_norm": 0.8619006276130676, - "learning_rate": 9.104225125673809e-05, - "loss": 0.08, - "step": 42090 - }, - { - "epoch": 2.7543343146876023, - "grad_norm": 0.7804111242294312, - "learning_rate": 9.103700400351047e-05, - "loss": 0.0889, - "step": 42100 - }, - { - "epoch": 2.754988550866863, - "grad_norm": 0.8227022290229797, - "learning_rate": 9.103175536517634e-05, - "loss": 0.08, - "step": 42110 - }, - { - "epoch": 2.7556427870461238, - "grad_norm": 0.8172056674957275, - "learning_rate": 9.102650534191287e-05, - "loss": 0.0863, - "step": 42120 - }, - { - "epoch": 2.7562970232253843, - "grad_norm": 0.8061167001724243, - "learning_rate": 9.102125393389725e-05, - "loss": 0.0707, - "step": 42130 - }, - { - "epoch": 2.7569512594046452, - "grad_norm": 0.9391945004463196, - "learning_rate": 9.10160011413067e-05, - "loss": 0.0777, - "step": 42140 - }, - { - "epoch": 2.7576054955839058, - "grad_norm": 1.0496068000793457, - "learning_rate": 9.101074696431858e-05, - "loss": 0.0854, - "step": 42150 - }, - { - "epoch": 2.7582597317631663, - "grad_norm": 0.8995298743247986, - "learning_rate": 9.100549140311017e-05, - "loss": 0.0802, - "step": 42160 - }, - { - "epoch": 2.7589139679424273, - "grad_norm": 0.8396767973899841, - "learning_rate": 9.100023445785889e-05, - "loss": 0.0736, - "step": 42170 - }, - { - "epoch": 2.759568204121688, - "grad_norm": 1.035605549812317, - "learning_rate": 9.099497612874217e-05, - "loss": 0.079, - "step": 42180 - }, - { - "epoch": 2.7602224403009488, - "grad_norm": 0.8803637027740479, - "learning_rate": 9.09897164159375e-05, - "loss": 0.0805, - "step": 42190 - }, - { - "epoch": 2.7608766764802093, - "grad_norm": 0.6873325109481812, - "learning_rate": 9.09844553196224e-05, - "loss": 0.0776, - "step": 42200 - }, - { - "epoch": 2.7615309126594703, - "grad_norm": 0.7511876225471497, - "learning_rate": 9.097919283997444e-05, - "loss": 0.0746, - "step": 42210 - }, - { - "epoch": 2.762185148838731, - "grad_norm": 0.7745641469955444, - "learning_rate": 9.097392897717126e-05, - "loss": 0.0829, - "step": 42220 - }, - { - "epoch": 2.7628393850179913, - "grad_norm": 0.8654570579528809, - "learning_rate": 9.09686637313905e-05, - "loss": 0.0783, - "step": 42230 - }, - { - "epoch": 2.7634936211972523, - "grad_norm": 0.8436213731765747, - "learning_rate": 9.09633971028099e-05, - "loss": 0.0826, - "step": 42240 - }, - { - "epoch": 2.764147857376513, - "grad_norm": 0.8295747637748718, - "learning_rate": 9.095812909160724e-05, - "loss": 0.0761, - "step": 42250 - }, - { - "epoch": 2.7648020935557738, - "grad_norm": 0.8413954377174377, - "learning_rate": 9.095285969796027e-05, - "loss": 0.0794, - "step": 42260 - }, - { - "epoch": 2.7654563297350343, - "grad_norm": 0.8630293607711792, - "learning_rate": 9.094758892204691e-05, - "loss": 0.0864, - "step": 42270 - }, - { - "epoch": 2.7661105659142953, - "grad_norm": 1.0896077156066895, - "learning_rate": 9.094231676404503e-05, - "loss": 0.0755, - "step": 42280 - }, - { - "epoch": 2.766764802093556, - "grad_norm": 0.9205356240272522, - "learning_rate": 9.093704322413259e-05, - "loss": 0.0888, - "step": 42290 - }, - { - "epoch": 2.7674190382728163, - "grad_norm": 0.8817057013511658, - "learning_rate": 9.093176830248756e-05, - "loss": 0.0755, - "step": 42300 - }, - { - "epoch": 2.7680732744520773, - "grad_norm": 0.9554494619369507, - "learning_rate": 9.092649199928802e-05, - "loss": 0.0742, - "step": 42310 - }, - { - "epoch": 2.768727510631338, - "grad_norm": 1.0500036478042603, - "learning_rate": 9.092121431471203e-05, - "loss": 0.0775, - "step": 42320 - }, - { - "epoch": 2.769381746810599, - "grad_norm": 1.2276242971420288, - "learning_rate": 9.091593524893776e-05, - "loss": 0.0775, - "step": 42330 - }, - { - "epoch": 2.7700359829898593, - "grad_norm": 0.8922539353370667, - "learning_rate": 9.091065480214333e-05, - "loss": 0.0841, - "step": 42340 - }, - { - "epoch": 2.7706902191691203, - "grad_norm": 0.9390623569488525, - "learning_rate": 9.090537297450703e-05, - "loss": 0.0816, - "step": 42350 - }, - { - "epoch": 2.771344455348381, - "grad_norm": 0.7786974310874939, - "learning_rate": 9.090008976620712e-05, - "loss": 0.0739, - "step": 42360 - }, - { - "epoch": 2.7719986915276413, - "grad_norm": 0.7234275341033936, - "learning_rate": 9.08948051774219e-05, - "loss": 0.077, - "step": 42370 - }, - { - "epoch": 2.7726529277069023, - "grad_norm": 0.8942136764526367, - "learning_rate": 9.088951920832978e-05, - "loss": 0.0838, - "step": 42380 - }, - { - "epoch": 2.773307163886163, - "grad_norm": 0.9873566031455994, - "learning_rate": 9.088423185910912e-05, - "loss": 0.0831, - "step": 42390 - }, - { - "epoch": 2.7739614000654234, - "grad_norm": 0.9224919080734253, - "learning_rate": 9.087894312993844e-05, - "loss": 0.0798, - "step": 42400 - }, - { - "epoch": 2.7746156362446843, - "grad_norm": 0.9617442488670349, - "learning_rate": 9.08736530209962e-05, - "loss": 0.082, - "step": 42410 - }, - { - "epoch": 2.7752698724239453, - "grad_norm": 0.9834263324737549, - "learning_rate": 9.086836153246099e-05, - "loss": 0.0846, - "step": 42420 - }, - { - "epoch": 2.775924108603206, - "grad_norm": 0.8556174635887146, - "learning_rate": 9.086306866451139e-05, - "loss": 0.0863, - "step": 42430 - }, - { - "epoch": 2.7765783447824663, - "grad_norm": 0.8653731346130371, - "learning_rate": 9.085777441732606e-05, - "loss": 0.0901, - "step": 42440 - }, - { - "epoch": 2.7772325809617273, - "grad_norm": 1.0214253664016724, - "learning_rate": 9.085247879108367e-05, - "loss": 0.0748, - "step": 42450 - }, - { - "epoch": 2.777886817140988, - "grad_norm": 0.7947637438774109, - "learning_rate": 9.084718178596301e-05, - "loss": 0.0846, - "step": 42460 - }, - { - "epoch": 2.7785410533202484, - "grad_norm": 0.9689314961433411, - "learning_rate": 9.084188340214281e-05, - "loss": 0.0747, - "step": 42470 - }, - { - "epoch": 2.7791952894995093, - "grad_norm": 0.9164670705795288, - "learning_rate": 9.083658363980196e-05, - "loss": 0.0836, - "step": 42480 - }, - { - "epoch": 2.7798495256787703, - "grad_norm": 1.0349498987197876, - "learning_rate": 9.08312824991193e-05, - "loss": 0.0858, - "step": 42490 - }, - { - "epoch": 2.780503761858031, - "grad_norm": 0.8005316257476807, - "learning_rate": 9.082597998027377e-05, - "loss": 0.0791, - "step": 42500 - }, - { - "epoch": 2.7811579980372914, - "grad_norm": 0.7661714553833008, - "learning_rate": 9.082067608344436e-05, - "loss": 0.0844, - "step": 42510 - }, - { - "epoch": 2.7818122342165523, - "grad_norm": 0.9406249523162842, - "learning_rate": 9.081537080881007e-05, - "loss": 0.0822, - "step": 42520 - }, - { - "epoch": 2.782466470395813, - "grad_norm": 0.8644420504570007, - "learning_rate": 9.081006415654995e-05, - "loss": 0.0803, - "step": 42530 - }, - { - "epoch": 2.7831207065750734, - "grad_norm": 0.9211861491203308, - "learning_rate": 9.080475612684316e-05, - "loss": 0.0808, - "step": 42540 - }, - { - "epoch": 2.7837749427543343, - "grad_norm": 1.1344019174575806, - "learning_rate": 9.079944671986883e-05, - "loss": 0.0834, - "step": 42550 - }, - { - "epoch": 2.784429178933595, - "grad_norm": 0.8112305998802185, - "learning_rate": 9.079413593580616e-05, - "loss": 0.0936, - "step": 42560 - }, - { - "epoch": 2.785083415112856, - "grad_norm": 0.973592221736908, - "learning_rate": 9.078882377483444e-05, - "loss": 0.0733, - "step": 42570 - }, - { - "epoch": 2.7857376512921164, - "grad_norm": 0.7716719508171082, - "learning_rate": 9.078351023713294e-05, - "loss": 0.0764, - "step": 42580 - }, - { - "epoch": 2.7863918874713773, - "grad_norm": 0.7506532073020935, - "learning_rate": 9.077819532288102e-05, - "loss": 0.0697, - "step": 42590 - }, - { - "epoch": 2.787046123650638, - "grad_norm": 0.7960049510002136, - "learning_rate": 9.077287903225804e-05, - "loss": 0.0835, - "step": 42600 - }, - { - "epoch": 2.7877003598298984, - "grad_norm": 0.873766303062439, - "learning_rate": 9.076756136544346e-05, - "loss": 0.076, - "step": 42610 - }, - { - "epoch": 2.7883545960091594, - "grad_norm": 0.680327296257019, - "learning_rate": 9.076224232261679e-05, - "loss": 0.0727, - "step": 42620 - }, - { - "epoch": 2.78900883218842, - "grad_norm": 0.9833268523216248, - "learning_rate": 9.075692190395752e-05, - "loss": 0.0864, - "step": 42630 - }, - { - "epoch": 2.789663068367681, - "grad_norm": 0.9118520617485046, - "learning_rate": 9.075160010964526e-05, - "loss": 0.079, - "step": 42640 - }, - { - "epoch": 2.7903173045469414, - "grad_norm": 0.990631639957428, - "learning_rate": 9.074627693985961e-05, - "loss": 0.0777, - "step": 42650 - }, - { - "epoch": 2.7909715407262023, - "grad_norm": 0.91771399974823, - "learning_rate": 9.074095239478026e-05, - "loss": 0.0775, - "step": 42660 - }, - { - "epoch": 2.791625776905463, - "grad_norm": 0.832398533821106, - "learning_rate": 9.07356264745869e-05, - "loss": 0.0769, - "step": 42670 - }, - { - "epoch": 2.7922800130847234, - "grad_norm": 0.7836474180221558, - "learning_rate": 9.073029917945934e-05, - "loss": 0.087, - "step": 42680 - }, - { - "epoch": 2.7929342492639844, - "grad_norm": 0.8258154988288879, - "learning_rate": 9.072497050957736e-05, - "loss": 0.0893, - "step": 42690 - }, - { - "epoch": 2.793588485443245, - "grad_norm": 0.8014414310455322, - "learning_rate": 9.071964046512081e-05, - "loss": 0.0776, - "step": 42700 - }, - { - "epoch": 2.794242721622506, - "grad_norm": 0.9521371126174927, - "learning_rate": 9.071430904626961e-05, - "loss": 0.0834, - "step": 42710 - }, - { - "epoch": 2.7948969578017664, - "grad_norm": 1.008519172668457, - "learning_rate": 9.07089762532037e-05, - "loss": 0.0725, - "step": 42720 - }, - { - "epoch": 2.7955511939810274, - "grad_norm": 1.2083194255828857, - "learning_rate": 9.070364208610307e-05, - "loss": 0.0781, - "step": 42730 - }, - { - "epoch": 2.796205430160288, - "grad_norm": 1.0434963703155518, - "learning_rate": 9.069830654514778e-05, - "loss": 0.0841, - "step": 42740 - }, - { - "epoch": 2.7968596663395484, - "grad_norm": 1.085412859916687, - "learning_rate": 9.06929696305179e-05, - "loss": 0.0919, - "step": 42750 - }, - { - "epoch": 2.7975139025188094, - "grad_norm": 0.9131748676300049, - "learning_rate": 9.06876313423936e-05, - "loss": 0.0799, - "step": 42760 - }, - { - "epoch": 2.79816813869807, - "grad_norm": 0.977798581123352, - "learning_rate": 9.068229168095501e-05, - "loss": 0.0895, - "step": 42770 - }, - { - "epoch": 2.798822374877331, - "grad_norm": 0.891613245010376, - "learning_rate": 9.067695064638237e-05, - "loss": 0.0834, - "step": 42780 - }, - { - "epoch": 2.7994766110565914, - "grad_norm": 0.8886749148368835, - "learning_rate": 9.0671608238856e-05, - "loss": 0.0821, - "step": 42790 - }, - { - "epoch": 2.8001308472358524, - "grad_norm": 0.9983289241790771, - "learning_rate": 9.066626445855617e-05, - "loss": 0.0828, - "step": 42800 - }, - { - "epoch": 2.800785083415113, - "grad_norm": 0.7890607118606567, - "learning_rate": 9.066091930566327e-05, - "loss": 0.0857, - "step": 42810 - }, - { - "epoch": 2.8014393195943734, - "grad_norm": 0.7882013320922852, - "learning_rate": 9.06555727803577e-05, - "loss": 0.0775, - "step": 42820 - }, - { - "epoch": 2.8020935557736344, - "grad_norm": 0.8657622337341309, - "learning_rate": 9.065022488281992e-05, - "loss": 0.0844, - "step": 42830 - }, - { - "epoch": 2.802747791952895, - "grad_norm": 1.038025975227356, - "learning_rate": 9.064487561323045e-05, - "loss": 0.0796, - "step": 42840 - }, - { - "epoch": 2.8034020281321554, - "grad_norm": 0.8643264174461365, - "learning_rate": 9.063952497176983e-05, - "loss": 0.0823, - "step": 42850 - }, - { - "epoch": 2.8040562643114164, - "grad_norm": 0.7769449949264526, - "learning_rate": 9.063417295861866e-05, - "loss": 0.07, - "step": 42860 - }, - { - "epoch": 2.8047105004906774, - "grad_norm": 0.988023579120636, - "learning_rate": 9.06288195739576e-05, - "loss": 0.0752, - "step": 42870 - }, - { - "epoch": 2.805364736669938, - "grad_norm": 0.7536209225654602, - "learning_rate": 9.06234648179673e-05, - "loss": 0.0876, - "step": 42880 - }, - { - "epoch": 2.8060189728491984, - "grad_norm": 0.9088309407234192, - "learning_rate": 9.061810869082855e-05, - "loss": 0.0749, - "step": 42890 - }, - { - "epoch": 2.8066732090284594, - "grad_norm": 0.937651515007019, - "learning_rate": 9.061275119272207e-05, - "loss": 0.0868, - "step": 42900 - }, - { - "epoch": 2.80732744520772, - "grad_norm": 0.8864895701408386, - "learning_rate": 9.060739232382876e-05, - "loss": 0.0844, - "step": 42910 - }, - { - "epoch": 2.8079816813869805, - "grad_norm": 0.8680556416511536, - "learning_rate": 9.060203208432945e-05, - "loss": 0.0741, - "step": 42920 - }, - { - "epoch": 2.8086359175662414, - "grad_norm": 0.7065665125846863, - "learning_rate": 9.059667047440508e-05, - "loss": 0.0766, - "step": 42930 - }, - { - "epoch": 2.8092901537455024, - "grad_norm": 1.2820072174072266, - "learning_rate": 9.059130749423662e-05, - "loss": 0.0857, - "step": 42940 - }, - { - "epoch": 2.809944389924763, - "grad_norm": 0.8697900772094727, - "learning_rate": 9.058594314400506e-05, - "loss": 0.0755, - "step": 42950 - }, - { - "epoch": 2.8105986261040234, - "grad_norm": 0.9628492593765259, - "learning_rate": 9.058057742389147e-05, - "loss": 0.0819, - "step": 42960 - }, - { - "epoch": 2.8112528622832844, - "grad_norm": 1.09038507938385, - "learning_rate": 9.057521033407698e-05, - "loss": 0.0815, - "step": 42970 - }, - { - "epoch": 2.811907098462545, - "grad_norm": 0.9920393824577332, - "learning_rate": 9.056984187474275e-05, - "loss": 0.081, - "step": 42980 - }, - { - "epoch": 2.8125613346418055, - "grad_norm": 0.7963695526123047, - "learning_rate": 9.056447204606993e-05, - "loss": 0.0774, - "step": 42990 - }, - { - "epoch": 2.8132155708210664, - "grad_norm": 1.1177492141723633, - "learning_rate": 9.055910084823979e-05, - "loss": 0.0786, - "step": 43000 - }, - { - "epoch": 2.813869807000327, - "grad_norm": 0.9136730432510376, - "learning_rate": 9.055372828143365e-05, - "loss": 0.0744, - "step": 43010 - }, - { - "epoch": 2.814524043179588, - "grad_norm": 0.7779806852340698, - "learning_rate": 9.054835434583281e-05, - "loss": 0.0788, - "step": 43020 - }, - { - "epoch": 2.8151782793588485, - "grad_norm": 1.0529919862747192, - "learning_rate": 9.054297904161868e-05, - "loss": 0.0788, - "step": 43030 - }, - { - "epoch": 2.8158325155381094, - "grad_norm": 0.7755911946296692, - "learning_rate": 9.053760236897266e-05, - "loss": 0.08, - "step": 43040 - }, - { - "epoch": 2.81648675171737, - "grad_norm": 0.9372928738594055, - "learning_rate": 9.053222432807626e-05, - "loss": 0.0827, - "step": 43050 - }, - { - "epoch": 2.8171409878966305, - "grad_norm": 0.91545170545578, - "learning_rate": 9.052684491911099e-05, - "loss": 0.0858, - "step": 43060 - }, - { - "epoch": 2.8177952240758914, - "grad_norm": 0.7959693670272827, - "learning_rate": 9.052146414225841e-05, - "loss": 0.0906, - "step": 43070 - }, - { - "epoch": 2.818449460255152, - "grad_norm": 0.9491099119186401, - "learning_rate": 9.051608199770016e-05, - "loss": 0.0794, - "step": 43080 - }, - { - "epoch": 2.819103696434413, - "grad_norm": 0.7243504524230957, - "learning_rate": 9.051069848561787e-05, - "loss": 0.0793, - "step": 43090 - }, - { - "epoch": 2.8197579326136735, - "grad_norm": 1.0417028665542603, - "learning_rate": 9.050531360619328e-05, - "loss": 0.0791, - "step": 43100 - }, - { - "epoch": 2.8204121687929344, - "grad_norm": 0.7411209344863892, - "learning_rate": 9.04999273596081e-05, - "loss": 0.0815, - "step": 43110 - }, - { - "epoch": 2.821066404972195, - "grad_norm": 0.7973235249519348, - "learning_rate": 9.049453974604418e-05, - "loss": 0.0752, - "step": 43120 - }, - { - "epoch": 2.8217206411514555, - "grad_norm": 0.9462906718254089, - "learning_rate": 9.048915076568334e-05, - "loss": 0.0808, - "step": 43130 - }, - { - "epoch": 2.8223748773307165, - "grad_norm": 1.0456901788711548, - "learning_rate": 9.048376041870745e-05, - "loss": 0.0969, - "step": 43140 - }, - { - "epoch": 2.823029113509977, - "grad_norm": 1.11058509349823, - "learning_rate": 9.04783687052985e-05, - "loss": 0.0767, - "step": 43150 - }, - { - "epoch": 2.823683349689238, - "grad_norm": 0.7125511765480042, - "learning_rate": 9.047297562563843e-05, - "loss": 0.0781, - "step": 43160 - }, - { - "epoch": 2.8243375858684985, - "grad_norm": 0.7025366425514221, - "learning_rate": 9.04675811799093e-05, - "loss": 0.0664, - "step": 43170 - }, - { - "epoch": 2.8249918220477594, - "grad_norm": 0.9043560028076172, - "learning_rate": 9.046218536829319e-05, - "loss": 0.077, - "step": 43180 - }, - { - "epoch": 2.82564605822702, - "grad_norm": 0.7520603537559509, - "learning_rate": 9.045678819097218e-05, - "loss": 0.0734, - "step": 43190 - }, - { - "epoch": 2.8263002944062805, - "grad_norm": 0.903668224811554, - "learning_rate": 9.045138964812848e-05, - "loss": 0.0824, - "step": 43200 - }, - { - "epoch": 2.8269545305855415, - "grad_norm": 0.8586761355400085, - "learning_rate": 9.044598973994429e-05, - "loss": 0.0787, - "step": 43210 - }, - { - "epoch": 2.827608766764802, - "grad_norm": 0.8279673457145691, - "learning_rate": 9.044058846660187e-05, - "loss": 0.0903, - "step": 43220 - }, - { - "epoch": 2.828263002944063, - "grad_norm": 0.7114253640174866, - "learning_rate": 9.043518582828354e-05, - "loss": 0.0761, - "step": 43230 - }, - { - "epoch": 2.8289172391233235, - "grad_norm": 0.8194983601570129, - "learning_rate": 9.042978182517163e-05, - "loss": 0.0902, - "step": 43240 - }, - { - "epoch": 2.8295714753025845, - "grad_norm": 0.7969952821731567, - "learning_rate": 9.042437645744856e-05, - "loss": 0.0691, - "step": 43250 - }, - { - "epoch": 2.830225711481845, - "grad_norm": 0.7822580337524414, - "learning_rate": 9.041896972529677e-05, - "loss": 0.0752, - "step": 43260 - }, - { - "epoch": 2.8308799476611055, - "grad_norm": 0.7220231294631958, - "learning_rate": 9.041356162889873e-05, - "loss": 0.0728, - "step": 43270 - }, - { - "epoch": 2.8315341838403665, - "grad_norm": 0.7740190029144287, - "learning_rate": 9.040815216843702e-05, - "loss": 0.0816, - "step": 43280 - }, - { - "epoch": 2.832188420019627, - "grad_norm": 0.8419371247291565, - "learning_rate": 9.040274134409419e-05, - "loss": 0.0792, - "step": 43290 - }, - { - "epoch": 2.8328426561988875, - "grad_norm": 0.950294554233551, - "learning_rate": 9.039732915605287e-05, - "loss": 0.0856, - "step": 43300 - }, - { - "epoch": 2.8334968923781485, - "grad_norm": 0.9349603056907654, - "learning_rate": 9.039191560449575e-05, - "loss": 0.0815, - "step": 43310 - }, - { - "epoch": 2.8341511285574095, - "grad_norm": 0.8912672996520996, - "learning_rate": 9.038650068960556e-05, - "loss": 0.0821, - "step": 43320 - }, - { - "epoch": 2.83480536473667, - "grad_norm": 0.9765848517417908, - "learning_rate": 9.038108441156506e-05, - "loss": 0.0797, - "step": 43330 - }, - { - "epoch": 2.8354596009159305, - "grad_norm": 0.8395116329193115, - "learning_rate": 9.037566677055705e-05, - "loss": 0.0738, - "step": 43340 - }, - { - "epoch": 2.8361138370951915, - "grad_norm": 0.7863714098930359, - "learning_rate": 9.037024776676438e-05, - "loss": 0.087, - "step": 43350 - }, - { - "epoch": 2.836768073274452, - "grad_norm": 0.9483030438423157, - "learning_rate": 9.036482740036998e-05, - "loss": 0.0786, - "step": 43360 - }, - { - "epoch": 2.8374223094537125, - "grad_norm": 0.8250223994255066, - "learning_rate": 9.03594056715568e-05, - "loss": 0.0719, - "step": 43370 - }, - { - "epoch": 2.8380765456329735, - "grad_norm": 0.8315423727035522, - "learning_rate": 9.035398258050784e-05, - "loss": 0.0811, - "step": 43380 - }, - { - "epoch": 2.8387307818122345, - "grad_norm": 0.8926718235015869, - "learning_rate": 9.034855812740612e-05, - "loss": 0.0749, - "step": 43390 - }, - { - "epoch": 2.839385017991495, - "grad_norm": 0.8397383689880371, - "learning_rate": 9.034313231243477e-05, - "loss": 0.0738, - "step": 43400 - }, - { - "epoch": 2.8400392541707555, - "grad_norm": 0.900603711605072, - "learning_rate": 9.033770513577688e-05, - "loss": 0.0747, - "step": 43410 - }, - { - "epoch": 2.8406934903500165, - "grad_norm": 0.8045016527175903, - "learning_rate": 9.033227659761565e-05, - "loss": 0.0794, - "step": 43420 - }, - { - "epoch": 2.841347726529277, - "grad_norm": 0.861449122428894, - "learning_rate": 9.032684669813431e-05, - "loss": 0.0744, - "step": 43430 - }, - { - "epoch": 2.8420019627085376, - "grad_norm": 0.8809171319007874, - "learning_rate": 9.032141543751614e-05, - "loss": 0.0807, - "step": 43440 - }, - { - "epoch": 2.8426561988877985, - "grad_norm": 0.8424410820007324, - "learning_rate": 9.031598281594445e-05, - "loss": 0.0771, - "step": 43450 - }, - { - "epoch": 2.843310435067059, - "grad_norm": 0.788705587387085, - "learning_rate": 9.031054883360261e-05, - "loss": 0.0815, - "step": 43460 - }, - { - "epoch": 2.84396467124632, - "grad_norm": 0.7892941832542419, - "learning_rate": 9.030511349067404e-05, - "loss": 0.0707, - "step": 43470 - }, - { - "epoch": 2.8446189074255805, - "grad_norm": 0.8143117427825928, - "learning_rate": 9.029967678734216e-05, - "loss": 0.0738, - "step": 43480 - }, - { - "epoch": 2.8452731436048415, - "grad_norm": 0.9846142530441284, - "learning_rate": 9.02942387237905e-05, - "loss": 0.0776, - "step": 43490 - }, - { - "epoch": 2.845927379784102, - "grad_norm": 0.9837608933448792, - "learning_rate": 9.028879930020262e-05, - "loss": 0.0807, - "step": 43500 - }, - { - "epoch": 2.8465816159633626, - "grad_norm": 1.0797531604766846, - "learning_rate": 9.028335851676211e-05, - "loss": 0.0818, - "step": 43510 - }, - { - "epoch": 2.8472358521426235, - "grad_norm": 0.9254376292228699, - "learning_rate": 9.02779163736526e-05, - "loss": 0.0763, - "step": 43520 - }, - { - "epoch": 2.847890088321884, - "grad_norm": 0.9471592903137207, - "learning_rate": 9.027247287105776e-05, - "loss": 0.0708, - "step": 43530 - }, - { - "epoch": 2.848544324501145, - "grad_norm": 0.9006256461143494, - "learning_rate": 9.026702800916136e-05, - "loss": 0.0736, - "step": 43540 - }, - { - "epoch": 2.8491985606804056, - "grad_norm": 0.888878345489502, - "learning_rate": 9.026158178814715e-05, - "loss": 0.0706, - "step": 43550 - }, - { - "epoch": 2.8498527968596665, - "grad_norm": 0.9179434180259705, - "learning_rate": 9.025613420819897e-05, - "loss": 0.076, - "step": 43560 - }, - { - "epoch": 2.850507033038927, - "grad_norm": 0.9074091911315918, - "learning_rate": 9.025068526950069e-05, - "loss": 0.0775, - "step": 43570 - }, - { - "epoch": 2.8511612692181876, - "grad_norm": 0.7732144594192505, - "learning_rate": 9.024523497223622e-05, - "loss": 0.077, - "step": 43580 - }, - { - "epoch": 2.8518155053974485, - "grad_norm": 0.8621454834938049, - "learning_rate": 9.02397833165895e-05, - "loss": 0.0852, - "step": 43590 - }, - { - "epoch": 2.852469741576709, - "grad_norm": 0.9191235303878784, - "learning_rate": 9.023433030274459e-05, - "loss": 0.0815, - "step": 43600 - }, - { - "epoch": 2.85312397775597, - "grad_norm": 0.7941545248031616, - "learning_rate": 9.022887593088551e-05, - "loss": 0.0837, - "step": 43610 - }, - { - "epoch": 2.8537782139352306, - "grad_norm": 1.0019004344940186, - "learning_rate": 9.022342020119637e-05, - "loss": 0.0839, - "step": 43620 - }, - { - "epoch": 2.8544324501144915, - "grad_norm": 1.2812579870224, - "learning_rate": 9.021796311386128e-05, - "loss": 0.0911, - "step": 43630 - }, - { - "epoch": 2.855086686293752, - "grad_norm": 0.9368396997451782, - "learning_rate": 9.021250466906448e-05, - "loss": 0.0748, - "step": 43640 - }, - { - "epoch": 2.8557409224730126, - "grad_norm": 0.8556075096130371, - "learning_rate": 9.020704486699018e-05, - "loss": 0.0798, - "step": 43650 - }, - { - "epoch": 2.8563951586522736, - "grad_norm": 0.6791936755180359, - "learning_rate": 9.020158370782266e-05, - "loss": 0.0834, - "step": 43660 - }, - { - "epoch": 2.857049394831534, - "grad_norm": 0.7594597339630127, - "learning_rate": 9.019612119174627e-05, - "loss": 0.0826, - "step": 43670 - }, - { - "epoch": 2.857703631010795, - "grad_norm": 0.8290628790855408, - "learning_rate": 9.019065731894537e-05, - "loss": 0.0792, - "step": 43680 - }, - { - "epoch": 2.8583578671900556, - "grad_norm": 0.8027207255363464, - "learning_rate": 9.018519208960439e-05, - "loss": 0.0695, - "step": 43690 - }, - { - "epoch": 2.8590121033693165, - "grad_norm": 0.8073681592941284, - "learning_rate": 9.017972550390777e-05, - "loss": 0.0823, - "step": 43700 - }, - { - "epoch": 2.859666339548577, - "grad_norm": 0.971709668636322, - "learning_rate": 9.017425756204005e-05, - "loss": 0.0919, - "step": 43710 - }, - { - "epoch": 2.8603205757278376, - "grad_norm": 0.8581392765045166, - "learning_rate": 9.016878826418577e-05, - "loss": 0.0817, - "step": 43720 - }, - { - "epoch": 2.8609748119070986, - "grad_norm": 0.8417885303497314, - "learning_rate": 9.016331761052954e-05, - "loss": 0.0865, - "step": 43730 - }, - { - "epoch": 2.861629048086359, - "grad_norm": 1.0083153247833252, - "learning_rate": 9.015784560125602e-05, - "loss": 0.077, - "step": 43740 - }, - { - "epoch": 2.8622832842656196, - "grad_norm": 0.8471361398696899, - "learning_rate": 9.015237223654988e-05, - "loss": 0.0737, - "step": 43750 - }, - { - "epoch": 2.8629375204448806, - "grad_norm": 0.8287156224250793, - "learning_rate": 9.014689751659589e-05, - "loss": 0.0779, - "step": 43760 - }, - { - "epoch": 2.8635917566241416, - "grad_norm": 0.8068413734436035, - "learning_rate": 9.01414214415788e-05, - "loss": 0.0885, - "step": 43770 - }, - { - "epoch": 2.864245992803402, - "grad_norm": 0.8625628352165222, - "learning_rate": 9.013594401168346e-05, - "loss": 0.0933, - "step": 43780 - }, - { - "epoch": 2.8649002289826626, - "grad_norm": 0.9650261998176575, - "learning_rate": 9.013046522709477e-05, - "loss": 0.0869, - "step": 43790 - }, - { - "epoch": 2.8655544651619236, - "grad_norm": 0.7974777817726135, - "learning_rate": 9.01249850879976e-05, - "loss": 0.0733, - "step": 43800 - }, - { - "epoch": 2.866208701341184, - "grad_norm": 1.0560264587402344, - "learning_rate": 9.011950359457699e-05, - "loss": 0.0885, - "step": 43810 - }, - { - "epoch": 2.8668629375204446, - "grad_norm": 0.8955875635147095, - "learning_rate": 9.011402074701788e-05, - "loss": 0.0829, - "step": 43820 - }, - { - "epoch": 2.8675171736997056, - "grad_norm": 0.9224720001220703, - "learning_rate": 9.01085365455054e-05, - "loss": 0.077, - "step": 43830 - }, - { - "epoch": 2.8681714098789666, - "grad_norm": 0.9114225506782532, - "learning_rate": 9.010305099022462e-05, - "loss": 0.0843, - "step": 43840 - }, - { - "epoch": 2.868825646058227, - "grad_norm": 0.8845502138137817, - "learning_rate": 9.009756408136067e-05, - "loss": 0.0749, - "step": 43850 - }, - { - "epoch": 2.8694798822374876, - "grad_norm": 0.8369585275650024, - "learning_rate": 9.009207581909879e-05, - "loss": 0.0707, - "step": 43860 - }, - { - "epoch": 2.8701341184167486, - "grad_norm": 0.8531777262687683, - "learning_rate": 9.008658620362421e-05, - "loss": 0.0816, - "step": 43870 - }, - { - "epoch": 2.870788354596009, - "grad_norm": 0.9684780836105347, - "learning_rate": 9.00810952351222e-05, - "loss": 0.0852, - "step": 43880 - }, - { - "epoch": 2.8714425907752696, - "grad_norm": 1.1896883249282837, - "learning_rate": 9.007560291377813e-05, - "loss": 0.0782, - "step": 43890 - }, - { - "epoch": 2.8720968269545306, - "grad_norm": 0.8664726614952087, - "learning_rate": 9.007010923977732e-05, - "loss": 0.0761, - "step": 43900 - }, - { - "epoch": 2.872751063133791, - "grad_norm": 0.7793802618980408, - "learning_rate": 9.006461421330528e-05, - "loss": 0.0801, - "step": 43910 - }, - { - "epoch": 2.873405299313052, - "grad_norm": 0.8414815664291382, - "learning_rate": 9.005911783454742e-05, - "loss": 0.0774, - "step": 43920 - }, - { - "epoch": 2.8740595354923126, - "grad_norm": 0.9556751251220703, - "learning_rate": 9.005362010368926e-05, - "loss": 0.0771, - "step": 43930 - }, - { - "epoch": 2.8747137716715736, - "grad_norm": 0.720085620880127, - "learning_rate": 9.00481210209164e-05, - "loss": 0.077, - "step": 43940 - }, - { - "epoch": 2.875368007850834, - "grad_norm": 0.7043220400810242, - "learning_rate": 9.004262058641441e-05, - "loss": 0.0704, - "step": 43950 - }, - { - "epoch": 2.8760222440300947, - "grad_norm": 0.9286543726921082, - "learning_rate": 9.003711880036896e-05, - "loss": 0.0778, - "step": 43960 - }, - { - "epoch": 2.8766764802093556, - "grad_norm": 0.8943794369697571, - "learning_rate": 9.003161566296576e-05, - "loss": 0.0684, - "step": 43970 - }, - { - "epoch": 2.877330716388616, - "grad_norm": 0.7501460313796997, - "learning_rate": 9.002611117439054e-05, - "loss": 0.0767, - "step": 43980 - }, - { - "epoch": 2.877984952567877, - "grad_norm": 0.9430428743362427, - "learning_rate": 9.002060533482909e-05, - "loss": 0.0751, - "step": 43990 - }, - { - "epoch": 2.8786391887471376, - "grad_norm": 0.986933171749115, - "learning_rate": 9.001509814446726e-05, - "loss": 0.0842, - "step": 44000 - }, - { - "epoch": 2.8792934249263986, - "grad_norm": 0.7722887992858887, - "learning_rate": 9.000958960349092e-05, - "loss": 0.0782, - "step": 44010 - }, - { - "epoch": 2.879947661105659, - "grad_norm": 0.8584244251251221, - "learning_rate": 9.0004079712086e-05, - "loss": 0.0796, - "step": 44020 - }, - { - "epoch": 2.8806018972849197, - "grad_norm": 0.8732749819755554, - "learning_rate": 8.999856847043849e-05, - "loss": 0.0848, - "step": 44030 - }, - { - "epoch": 2.8812561334641806, - "grad_norm": 0.8751747012138367, - "learning_rate": 8.999305587873437e-05, - "loss": 0.0776, - "step": 44040 - }, - { - "epoch": 2.881910369643441, - "grad_norm": 0.9108805060386658, - "learning_rate": 8.998754193715974e-05, - "loss": 0.0745, - "step": 44050 - }, - { - "epoch": 2.882564605822702, - "grad_norm": 0.9251412749290466, - "learning_rate": 8.99820266459007e-05, - "loss": 0.0819, - "step": 44060 - }, - { - "epoch": 2.8832188420019627, - "grad_norm": 1.0415140390396118, - "learning_rate": 8.99765100051434e-05, - "loss": 0.072, - "step": 44070 - }, - { - "epoch": 2.8838730781812236, - "grad_norm": 0.8129945397377014, - "learning_rate": 8.997099201507406e-05, - "loss": 0.0788, - "step": 44080 - }, - { - "epoch": 2.884527314360484, - "grad_norm": 0.760057270526886, - "learning_rate": 8.996547267587889e-05, - "loss": 0.0876, - "step": 44090 - }, - { - "epoch": 2.8851815505397447, - "grad_norm": 0.9552372694015503, - "learning_rate": 8.995995198774421e-05, - "loss": 0.0968, - "step": 44100 - }, - { - "epoch": 2.8858357867190056, - "grad_norm": 0.7973808646202087, - "learning_rate": 8.995442995085636e-05, - "loss": 0.0824, - "step": 44110 - }, - { - "epoch": 2.886490022898266, - "grad_norm": 0.7297688722610474, - "learning_rate": 8.99489065654017e-05, - "loss": 0.0783, - "step": 44120 - }, - { - "epoch": 2.887144259077527, - "grad_norm": 0.9278412461280823, - "learning_rate": 8.994338183156669e-05, - "loss": 0.083, - "step": 44130 - }, - { - "epoch": 2.8877984952567877, - "grad_norm": 0.9200074672698975, - "learning_rate": 8.993785574953778e-05, - "loss": 0.0781, - "step": 44140 - }, - { - "epoch": 2.8884527314360486, - "grad_norm": 0.8482991456985474, - "learning_rate": 8.99323283195015e-05, - "loss": 0.0826, - "step": 44150 - }, - { - "epoch": 2.889106967615309, - "grad_norm": 0.7914182543754578, - "learning_rate": 8.992679954164442e-05, - "loss": 0.0751, - "step": 44160 - }, - { - "epoch": 2.8897612037945697, - "grad_norm": 0.8093661069869995, - "learning_rate": 8.992126941615313e-05, - "loss": 0.0754, - "step": 44170 - }, - { - "epoch": 2.8904154399738307, - "grad_norm": 0.8084908127784729, - "learning_rate": 8.99157379432143e-05, - "loss": 0.0813, - "step": 44180 - }, - { - "epoch": 2.891069676153091, - "grad_norm": 0.7260185480117798, - "learning_rate": 8.991020512301464e-05, - "loss": 0.0744, - "step": 44190 - }, - { - "epoch": 2.8917239123323517, - "grad_norm": 0.7913504838943481, - "learning_rate": 8.990467095574089e-05, - "loss": 0.0689, - "step": 44200 - }, - { - "epoch": 2.8923781485116127, - "grad_norm": 0.6776607632637024, - "learning_rate": 8.989913544157983e-05, - "loss": 0.0707, - "step": 44210 - }, - { - "epoch": 2.8930323846908736, - "grad_norm": 0.9057397246360779, - "learning_rate": 8.98935985807183e-05, - "loss": 0.0873, - "step": 44220 - }, - { - "epoch": 2.893686620870134, - "grad_norm": 0.9263217449188232, - "learning_rate": 8.988806037334322e-05, - "loss": 0.0689, - "step": 44230 - }, - { - "epoch": 2.8943408570493947, - "grad_norm": 0.9351694583892822, - "learning_rate": 8.988252081964147e-05, - "loss": 0.0806, - "step": 44240 - }, - { - "epoch": 2.8949950932286557, - "grad_norm": 0.75326007604599, - "learning_rate": 8.987697991980007e-05, - "loss": 0.0745, - "step": 44250 - }, - { - "epoch": 2.895649329407916, - "grad_norm": 1.0243571996688843, - "learning_rate": 8.987143767400601e-05, - "loss": 0.0782, - "step": 44260 - }, - { - "epoch": 2.8963035655871767, - "grad_norm": 0.8010449409484863, - "learning_rate": 8.986589408244634e-05, - "loss": 0.0745, - "step": 44270 - }, - { - "epoch": 2.8969578017664377, - "grad_norm": 0.9779613614082336, - "learning_rate": 8.986034914530823e-05, - "loss": 0.0723, - "step": 44280 - }, - { - "epoch": 2.8976120379456987, - "grad_norm": 0.8159413933753967, - "learning_rate": 8.985480286277877e-05, - "loss": 0.0781, - "step": 44290 - }, - { - "epoch": 2.898266274124959, - "grad_norm": 0.8815252184867859, - "learning_rate": 8.984925523504519e-05, - "loss": 0.0839, - "step": 44300 - }, - { - "epoch": 2.8989205103042197, - "grad_norm": 0.7953386306762695, - "learning_rate": 8.984370626229474e-05, - "loss": 0.0898, - "step": 44310 - }, - { - "epoch": 2.8995747464834807, - "grad_norm": 0.8342095017433167, - "learning_rate": 8.983815594471472e-05, - "loss": 0.0854, - "step": 44320 - }, - { - "epoch": 2.900228982662741, - "grad_norm": 0.864380955696106, - "learning_rate": 8.983260428249246e-05, - "loss": 0.0869, - "step": 44330 - }, - { - "epoch": 2.9008832188420017, - "grad_norm": 0.9503182172775269, - "learning_rate": 8.982705127581533e-05, - "loss": 0.0722, - "step": 44340 - }, - { - "epoch": 2.9015374550212627, - "grad_norm": 0.8214649558067322, - "learning_rate": 8.982149692487078e-05, - "loss": 0.0772, - "step": 44350 - }, - { - "epoch": 2.9021916912005232, - "grad_norm": 0.8064943552017212, - "learning_rate": 8.981594122984627e-05, - "loss": 0.0757, - "step": 44360 - }, - { - "epoch": 2.902845927379784, - "grad_norm": 0.7172738909721375, - "learning_rate": 8.981038419092931e-05, - "loss": 0.0714, - "step": 44370 - }, - { - "epoch": 2.9035001635590447, - "grad_norm": 0.9183410406112671, - "learning_rate": 8.980482580830747e-05, - "loss": 0.0785, - "step": 44380 - }, - { - "epoch": 2.9041543997383057, - "grad_norm": 0.8381460905075073, - "learning_rate": 8.97992660821684e-05, - "loss": 0.0714, - "step": 44390 - }, - { - "epoch": 2.904808635917566, - "grad_norm": 0.8850300908088684, - "learning_rate": 8.979370501269971e-05, - "loss": 0.0808, - "step": 44400 - }, - { - "epoch": 2.9054628720968267, - "grad_norm": 0.848743736743927, - "learning_rate": 8.97881426000891e-05, - "loss": 0.0867, - "step": 44410 - }, - { - "epoch": 2.9061171082760877, - "grad_norm": 0.7968313097953796, - "learning_rate": 8.978257884452433e-05, - "loss": 0.0775, - "step": 44420 - }, - { - "epoch": 2.9067713444553482, - "grad_norm": 0.9209117293357849, - "learning_rate": 8.977701374619321e-05, - "loss": 0.0702, - "step": 44430 - }, - { - "epoch": 2.907425580634609, - "grad_norm": 0.7473788857460022, - "learning_rate": 8.977144730528353e-05, - "loss": 0.0786, - "step": 44440 - }, - { - "epoch": 2.9080798168138697, - "grad_norm": 0.7625455856323242, - "learning_rate": 8.97658795219832e-05, - "loss": 0.0802, - "step": 44450 - }, - { - "epoch": 2.9087340529931307, - "grad_norm": 1.0252678394317627, - "learning_rate": 8.976031039648017e-05, - "loss": 0.0686, - "step": 44460 - }, - { - "epoch": 2.9093882891723912, - "grad_norm": 0.7554779052734375, - "learning_rate": 8.975473992896239e-05, - "loss": 0.0744, - "step": 44470 - }, - { - "epoch": 2.9100425253516518, - "grad_norm": 0.8992921710014343, - "learning_rate": 8.974916811961786e-05, - "loss": 0.0847, - "step": 44480 - }, - { - "epoch": 2.9106967615309127, - "grad_norm": 0.7527303695678711, - "learning_rate": 8.974359496863466e-05, - "loss": 0.0777, - "step": 44490 - }, - { - "epoch": 2.9113509977101732, - "grad_norm": 0.8266311883926392, - "learning_rate": 8.97380204762009e-05, - "loss": 0.076, - "step": 44500 - }, - { - "epoch": 2.912005233889434, - "grad_norm": 0.8097569942474365, - "learning_rate": 8.973244464250474e-05, - "loss": 0.0814, - "step": 44510 - }, - { - "epoch": 2.9126594700686947, - "grad_norm": 0.8458988070487976, - "learning_rate": 8.972686746773436e-05, - "loss": 0.0836, - "step": 44520 - }, - { - "epoch": 2.9133137062479557, - "grad_norm": 0.8754256963729858, - "learning_rate": 8.972128895207803e-05, - "loss": 0.0954, - "step": 44530 - }, - { - "epoch": 2.9139679424272162, - "grad_norm": 0.7934485673904419, - "learning_rate": 8.971570909572401e-05, - "loss": 0.0784, - "step": 44540 - }, - { - "epoch": 2.9146221786064768, - "grad_norm": 0.9816693067550659, - "learning_rate": 8.971012789886066e-05, - "loss": 0.0724, - "step": 44550 - }, - { - "epoch": 2.9152764147857377, - "grad_norm": 0.8291686773300171, - "learning_rate": 8.970454536167634e-05, - "loss": 0.0774, - "step": 44560 - }, - { - "epoch": 2.9159306509649983, - "grad_norm": 0.7562156915664673, - "learning_rate": 8.96989614843595e-05, - "loss": 0.0802, - "step": 44570 - }, - { - "epoch": 2.9165848871442592, - "grad_norm": 0.680461049079895, - "learning_rate": 8.969337626709858e-05, - "loss": 0.0795, - "step": 44580 - }, - { - "epoch": 2.9172391233235198, - "grad_norm": 0.8741086721420288, - "learning_rate": 8.968778971008211e-05, - "loss": 0.0826, - "step": 44590 - }, - { - "epoch": 2.9178933595027807, - "grad_norm": 0.79926598072052, - "learning_rate": 8.968220181349866e-05, - "loss": 0.0677, - "step": 44600 - }, - { - "epoch": 2.9185475956820413, - "grad_norm": 0.7387502193450928, - "learning_rate": 8.967661257753683e-05, - "loss": 0.0756, - "step": 44610 - }, - { - "epoch": 2.9192018318613018, - "grad_norm": 0.8162443041801453, - "learning_rate": 8.967102200238527e-05, - "loss": 0.0716, - "step": 44620 - }, - { - "epoch": 2.9198560680405627, - "grad_norm": 0.8832371830940247, - "learning_rate": 8.966543008823268e-05, - "loss": 0.0711, - "step": 44630 - }, - { - "epoch": 2.9205103042198233, - "grad_norm": 0.8890617489814758, - "learning_rate": 8.965983683526779e-05, - "loss": 0.0754, - "step": 44640 - }, - { - "epoch": 2.921164540399084, - "grad_norm": 0.8303614854812622, - "learning_rate": 8.96542422436794e-05, - "loss": 0.0728, - "step": 44650 - }, - { - "epoch": 2.9218187765783448, - "grad_norm": 0.7883252501487732, - "learning_rate": 8.964864631365634e-05, - "loss": 0.0793, - "step": 44660 - }, - { - "epoch": 2.9224730127576057, - "grad_norm": 1.1344659328460693, - "learning_rate": 8.964304904538747e-05, - "loss": 0.0856, - "step": 44670 - }, - { - "epoch": 2.9231272489368663, - "grad_norm": 0.7178243398666382, - "learning_rate": 8.963745043906174e-05, - "loss": 0.0784, - "step": 44680 - }, - { - "epoch": 2.923781485116127, - "grad_norm": 0.9424094557762146, - "learning_rate": 8.96318504948681e-05, - "loss": 0.0805, - "step": 44690 - }, - { - "epoch": 2.9244357212953878, - "grad_norm": 0.6671366691589355, - "learning_rate": 8.962624921299558e-05, - "loss": 0.0822, - "step": 44700 - }, - { - "epoch": 2.9250899574746483, - "grad_norm": 0.7286959290504456, - "learning_rate": 8.962064659363321e-05, - "loss": 0.0875, - "step": 44710 - }, - { - "epoch": 2.925744193653909, - "grad_norm": 0.8117363452911377, - "learning_rate": 8.961504263697011e-05, - "loss": 0.0746, - "step": 44720 - }, - { - "epoch": 2.92639842983317, - "grad_norm": 0.8143247365951538, - "learning_rate": 8.960943734319542e-05, - "loss": 0.083, - "step": 44730 - }, - { - "epoch": 2.9270526660124307, - "grad_norm": 0.8705511689186096, - "learning_rate": 8.960383071249836e-05, - "loss": 0.0858, - "step": 44740 - }, - { - "epoch": 2.9277069021916913, - "grad_norm": 1.2389694452285767, - "learning_rate": 8.959822274506812e-05, - "loss": 0.0911, - "step": 44750 - }, - { - "epoch": 2.928361138370952, - "grad_norm": 0.7589144110679626, - "learning_rate": 8.959261344109404e-05, - "loss": 0.0772, - "step": 44760 - }, - { - "epoch": 2.9290153745502128, - "grad_norm": 0.8595485687255859, - "learning_rate": 8.958700280076542e-05, - "loss": 0.0727, - "step": 44770 - }, - { - "epoch": 2.9296696107294733, - "grad_norm": 1.0086075067520142, - "learning_rate": 8.958139082427162e-05, - "loss": 0.0796, - "step": 44780 - }, - { - "epoch": 2.930323846908734, - "grad_norm": 0.928106963634491, - "learning_rate": 8.957577751180209e-05, - "loss": 0.0692, - "step": 44790 - }, - { - "epoch": 2.930978083087995, - "grad_norm": 0.8739972114562988, - "learning_rate": 8.957016286354626e-05, - "loss": 0.0811, - "step": 44800 - }, - { - "epoch": 2.9316323192672553, - "grad_norm": 0.870585024356842, - "learning_rate": 8.956454687969367e-05, - "loss": 0.0709, - "step": 44810 - }, - { - "epoch": 2.9322865554465163, - "grad_norm": 1.0133564472198486, - "learning_rate": 8.955892956043385e-05, - "loss": 0.0771, - "step": 44820 - }, - { - "epoch": 2.932940791625777, - "grad_norm": 0.8007000684738159, - "learning_rate": 8.955331090595642e-05, - "loss": 0.0804, - "step": 44830 - }, - { - "epoch": 2.933595027805038, - "grad_norm": 0.8323869109153748, - "learning_rate": 8.9547690916451e-05, - "loss": 0.0738, - "step": 44840 - }, - { - "epoch": 2.9342492639842983, - "grad_norm": 0.6912589073181152, - "learning_rate": 8.954206959210731e-05, - "loss": 0.0738, - "step": 44850 - }, - { - "epoch": 2.934903500163559, - "grad_norm": 0.9906479716300964, - "learning_rate": 8.953644693311506e-05, - "loss": 0.0778, - "step": 44860 - }, - { - "epoch": 2.93555773634282, - "grad_norm": 0.6855130791664124, - "learning_rate": 8.953082293966404e-05, - "loss": 0.0713, - "step": 44870 - }, - { - "epoch": 2.9362119725220803, - "grad_norm": 0.7666285634040833, - "learning_rate": 8.952519761194407e-05, - "loss": 0.0767, - "step": 44880 - }, - { - "epoch": 2.9368662087013413, - "grad_norm": 0.8128482103347778, - "learning_rate": 8.951957095014503e-05, - "loss": 0.0734, - "step": 44890 - }, - { - "epoch": 2.937520444880602, - "grad_norm": 1.090126872062683, - "learning_rate": 8.951394295445681e-05, - "loss": 0.0873, - "step": 44900 - }, - { - "epoch": 2.938174681059863, - "grad_norm": 0.8404224514961243, - "learning_rate": 8.950831362506941e-05, - "loss": 0.073, - "step": 44910 - }, - { - "epoch": 2.9388289172391233, - "grad_norm": 0.8833885192871094, - "learning_rate": 8.950268296217279e-05, - "loss": 0.0803, - "step": 44920 - }, - { - "epoch": 2.939483153418384, - "grad_norm": 0.8460028767585754, - "learning_rate": 8.949705096595703e-05, - "loss": 0.0759, - "step": 44930 - }, - { - "epoch": 2.940137389597645, - "grad_norm": 0.7936156988143921, - "learning_rate": 8.949141763661222e-05, - "loss": 0.0911, - "step": 44940 - }, - { - "epoch": 2.9407916257769053, - "grad_norm": 0.8652473092079163, - "learning_rate": 8.948578297432848e-05, - "loss": 0.0829, - "step": 44950 - }, - { - "epoch": 2.9414458619561663, - "grad_norm": 0.9275278449058533, - "learning_rate": 8.948014697929603e-05, - "loss": 0.0766, - "step": 44960 - }, - { - "epoch": 2.942100098135427, - "grad_norm": 0.6803890466690063, - "learning_rate": 8.947450965170505e-05, - "loss": 0.0729, - "step": 44970 - }, - { - "epoch": 2.942754334314688, - "grad_norm": 0.8319152593612671, - "learning_rate": 8.946887099174587e-05, - "loss": 0.0726, - "step": 44980 - }, - { - "epoch": 2.9434085704939483, - "grad_norm": 0.9895662069320679, - "learning_rate": 8.946323099960877e-05, - "loss": 0.0735, - "step": 44990 - }, - { - "epoch": 2.944062806673209, - "grad_norm": 0.899389386177063, - "learning_rate": 8.945758967548415e-05, - "loss": 0.0678, - "step": 45000 - }, - { - "epoch": 2.94471704285247, - "grad_norm": 0.9055632948875427, - "learning_rate": 8.945194701956236e-05, - "loss": 0.0743, - "step": 45010 - }, - { - "epoch": 2.9453712790317303, - "grad_norm": 0.8592407703399658, - "learning_rate": 8.944630303203391e-05, - "loss": 0.0785, - "step": 45020 - }, - { - "epoch": 2.9460255152109913, - "grad_norm": 0.8779101967811584, - "learning_rate": 8.944065771308928e-05, - "loss": 0.0785, - "step": 45030 - }, - { - "epoch": 2.946679751390252, - "grad_norm": 0.8470450639724731, - "learning_rate": 8.943501106291901e-05, - "loss": 0.0746, - "step": 45040 - }, - { - "epoch": 2.947333987569513, - "grad_norm": 0.8635161519050598, - "learning_rate": 8.94293630817137e-05, - "loss": 0.0732, - "step": 45050 - }, - { - "epoch": 2.9479882237487733, - "grad_norm": 0.892550528049469, - "learning_rate": 8.942371376966398e-05, - "loss": 0.0829, - "step": 45060 - }, - { - "epoch": 2.948642459928034, - "grad_norm": 0.7665706276893616, - "learning_rate": 8.941806312696054e-05, - "loss": 0.0738, - "step": 45070 - }, - { - "epoch": 2.949296696107295, - "grad_norm": 0.8420522212982178, - "learning_rate": 8.941241115379408e-05, - "loss": 0.0801, - "step": 45080 - }, - { - "epoch": 2.9499509322865554, - "grad_norm": 0.8662616610527039, - "learning_rate": 8.940675785035538e-05, - "loss": 0.0738, - "step": 45090 - }, - { - "epoch": 2.950605168465816, - "grad_norm": 0.7357196807861328, - "learning_rate": 8.940110321683525e-05, - "loss": 0.0687, - "step": 45100 - }, - { - "epoch": 2.951259404645077, - "grad_norm": 0.8650752305984497, - "learning_rate": 8.939544725342454e-05, - "loss": 0.0807, - "step": 45110 - }, - { - "epoch": 2.951913640824338, - "grad_norm": 1.0531340837478638, - "learning_rate": 8.93897899603142e-05, - "loss": 0.09, - "step": 45120 - }, - { - "epoch": 2.9525678770035984, - "grad_norm": 0.7418067455291748, - "learning_rate": 8.938413133769514e-05, - "loss": 0.0777, - "step": 45130 - }, - { - "epoch": 2.953222113182859, - "grad_norm": 1.0505800247192383, - "learning_rate": 8.937847138575833e-05, - "loss": 0.0821, - "step": 45140 - }, - { - "epoch": 2.95387634936212, - "grad_norm": 0.9159501194953918, - "learning_rate": 8.937281010469486e-05, - "loss": 0.0742, - "step": 45150 - }, - { - "epoch": 2.9545305855413804, - "grad_norm": 0.8090870976448059, - "learning_rate": 8.936714749469579e-05, - "loss": 0.0825, - "step": 45160 - }, - { - "epoch": 2.955184821720641, - "grad_norm": 0.8931045532226562, - "learning_rate": 8.936148355595224e-05, - "loss": 0.0765, - "step": 45170 - }, - { - "epoch": 2.955839057899902, - "grad_norm": 0.8412348628044128, - "learning_rate": 8.93558182886554e-05, - "loss": 0.0762, - "step": 45180 - }, - { - "epoch": 2.956493294079163, - "grad_norm": 0.832350492477417, - "learning_rate": 8.935015169299646e-05, - "loss": 0.086, - "step": 45190 - }, - { - "epoch": 2.9571475302584234, - "grad_norm": 0.8142176866531372, - "learning_rate": 8.934448376916672e-05, - "loss": 0.0752, - "step": 45200 - }, - { - "epoch": 2.957801766437684, - "grad_norm": 0.7122102379798889, - "learning_rate": 8.933881451735746e-05, - "loss": 0.0754, - "step": 45210 - }, - { - "epoch": 2.958456002616945, - "grad_norm": 0.9692149758338928, - "learning_rate": 8.933314393776005e-05, - "loss": 0.0744, - "step": 45220 - }, - { - "epoch": 2.9591102387962054, - "grad_norm": 0.8291851878166199, - "learning_rate": 8.932747203056586e-05, - "loss": 0.0773, - "step": 45230 - }, - { - "epoch": 2.959764474975466, - "grad_norm": 0.7037114500999451, - "learning_rate": 8.932179879596636e-05, - "loss": 0.0838, - "step": 45240 - }, - { - "epoch": 2.960418711154727, - "grad_norm": 0.7591250538825989, - "learning_rate": 8.9316124234153e-05, - "loss": 0.0729, - "step": 45250 - }, - { - "epoch": 2.9610729473339874, - "grad_norm": 0.8546152114868164, - "learning_rate": 8.931044834531737e-05, - "loss": 0.074, - "step": 45260 - }, - { - "epoch": 2.9617271835132484, - "grad_norm": 0.8192047476768494, - "learning_rate": 8.930477112965102e-05, - "loss": 0.0773, - "step": 45270 - }, - { - "epoch": 2.962381419692509, - "grad_norm": 0.7500526905059814, - "learning_rate": 8.929909258734553e-05, - "loss": 0.0756, - "step": 45280 - }, - { - "epoch": 2.96303565587177, - "grad_norm": 0.8506186008453369, - "learning_rate": 8.929341271859262e-05, - "loss": 0.0812, - "step": 45290 - }, - { - "epoch": 2.9636898920510304, - "grad_norm": 0.7039439678192139, - "learning_rate": 8.928773152358398e-05, - "loss": 0.0697, - "step": 45300 - }, - { - "epoch": 2.964344128230291, - "grad_norm": 1.1579927206039429, - "learning_rate": 8.928204900251136e-05, - "loss": 0.0707, - "step": 45310 - }, - { - "epoch": 2.964998364409552, - "grad_norm": 0.8826687932014465, - "learning_rate": 8.927636515556657e-05, - "loss": 0.0853, - "step": 45320 - }, - { - "epoch": 2.9656526005888124, - "grad_norm": 0.7980294823646545, - "learning_rate": 8.927067998294145e-05, - "loss": 0.0846, - "step": 45330 - }, - { - "epoch": 2.9663068367680734, - "grad_norm": 0.7435978651046753, - "learning_rate": 8.926499348482787e-05, - "loss": 0.067, - "step": 45340 - }, - { - "epoch": 2.966961072947334, - "grad_norm": 0.715040922164917, - "learning_rate": 8.92593056614178e-05, - "loss": 0.0712, - "step": 45350 - }, - { - "epoch": 2.967615309126595, - "grad_norm": 0.9521136283874512, - "learning_rate": 8.925361651290321e-05, - "loss": 0.0809, - "step": 45360 - }, - { - "epoch": 2.9682695453058554, - "grad_norm": 0.7266244888305664, - "learning_rate": 8.924792603947611e-05, - "loss": 0.0802, - "step": 45370 - }, - { - "epoch": 2.968923781485116, - "grad_norm": 0.9576558470726013, - "learning_rate": 8.924223424132856e-05, - "loss": 0.0811, - "step": 45380 - }, - { - "epoch": 2.969578017664377, - "grad_norm": 0.8989412784576416, - "learning_rate": 8.92365411186527e-05, - "loss": 0.0844, - "step": 45390 - }, - { - "epoch": 2.9702322538436374, - "grad_norm": 0.9169609546661377, - "learning_rate": 8.923084667164067e-05, - "loss": 0.0728, - "step": 45400 - }, - { - "epoch": 2.9708864900228984, - "grad_norm": 0.9715309739112854, - "learning_rate": 8.92251509004847e-05, - "loss": 0.0868, - "step": 45410 - }, - { - "epoch": 2.971540726202159, - "grad_norm": 0.8212486505508423, - "learning_rate": 8.9219453805377e-05, - "loss": 0.0794, - "step": 45420 - }, - { - "epoch": 2.97219496238142, - "grad_norm": 0.8119533658027649, - "learning_rate": 8.921375538650987e-05, - "loss": 0.0831, - "step": 45430 - }, - { - "epoch": 2.9728491985606804, - "grad_norm": 0.8991037011146545, - "learning_rate": 8.920805564407565e-05, - "loss": 0.081, - "step": 45440 - }, - { - "epoch": 2.973503434739941, - "grad_norm": 0.6757112145423889, - "learning_rate": 8.920235457826675e-05, - "loss": 0.0706, - "step": 45450 - }, - { - "epoch": 2.974157670919202, - "grad_norm": 0.8806977272033691, - "learning_rate": 8.919665218927556e-05, - "loss": 0.068, - "step": 45460 - }, - { - "epoch": 2.9748119070984624, - "grad_norm": 1.013301968574524, - "learning_rate": 8.919094847729455e-05, - "loss": 0.0812, - "step": 45470 - }, - { - "epoch": 2.9754661432777234, - "grad_norm": 0.9366437792778015, - "learning_rate": 8.918524344251626e-05, - "loss": 0.0773, - "step": 45480 - }, - { - "epoch": 2.976120379456984, - "grad_norm": 0.8117071986198425, - "learning_rate": 8.917953708513324e-05, - "loss": 0.0676, - "step": 45490 - }, - { - "epoch": 2.976774615636245, - "grad_norm": 1.0920495986938477, - "learning_rate": 8.917382940533808e-05, - "loss": 0.0787, - "step": 45500 - }, - { - "epoch": 2.9774288518155054, - "grad_norm": 0.7851189970970154, - "learning_rate": 8.916812040332344e-05, - "loss": 0.0745, - "step": 45510 - }, - { - "epoch": 2.978083087994766, - "grad_norm": 0.8046877384185791, - "learning_rate": 8.916241007928203e-05, - "loss": 0.0787, - "step": 45520 - }, - { - "epoch": 2.978737324174027, - "grad_norm": 0.8224988579750061, - "learning_rate": 8.915669843340655e-05, - "loss": 0.0895, - "step": 45530 - }, - { - "epoch": 2.9793915603532874, - "grad_norm": 0.9415143132209778, - "learning_rate": 8.915098546588983e-05, - "loss": 0.0723, - "step": 45540 - }, - { - "epoch": 2.980045796532548, - "grad_norm": 0.9764366149902344, - "learning_rate": 8.914527117692465e-05, - "loss": 0.084, - "step": 45550 - }, - { - "epoch": 2.980700032711809, - "grad_norm": 0.774044930934906, - "learning_rate": 8.913955556670392e-05, - "loss": 0.0786, - "step": 45560 - }, - { - "epoch": 2.98135426889107, - "grad_norm": 0.8664652705192566, - "learning_rate": 8.913383863542054e-05, - "loss": 0.0765, - "step": 45570 - }, - { - "epoch": 2.9820085050703304, - "grad_norm": 0.8627799153327942, - "learning_rate": 8.912812038326746e-05, - "loss": 0.0839, - "step": 45580 - }, - { - "epoch": 2.982662741249591, - "grad_norm": 0.8925349712371826, - "learning_rate": 8.912240081043773e-05, - "loss": 0.0813, - "step": 45590 - }, - { - "epoch": 2.983316977428852, - "grad_norm": 0.8013269305229187, - "learning_rate": 8.911667991712433e-05, - "loss": 0.0722, - "step": 45600 - }, - { - "epoch": 2.9839712136081125, - "grad_norm": 0.6747923493385315, - "learning_rate": 8.911095770352043e-05, - "loss": 0.0757, - "step": 45610 - }, - { - "epoch": 2.984625449787373, - "grad_norm": 0.8834814429283142, - "learning_rate": 8.910523416981911e-05, - "loss": 0.081, - "step": 45620 - }, - { - "epoch": 2.985279685966634, - "grad_norm": 0.8894004225730896, - "learning_rate": 8.90995093162136e-05, - "loss": 0.0748, - "step": 45630 - }, - { - "epoch": 2.985933922145895, - "grad_norm": 0.7748491168022156, - "learning_rate": 8.909378314289708e-05, - "loss": 0.0718, - "step": 45640 - }, - { - "epoch": 2.9865881583251555, - "grad_norm": 0.8032260537147522, - "learning_rate": 8.908805565006288e-05, - "loss": 0.0714, - "step": 45650 - }, - { - "epoch": 2.987242394504416, - "grad_norm": 0.7444846630096436, - "learning_rate": 8.90823268379043e-05, - "loss": 0.0705, - "step": 45660 - }, - { - "epoch": 2.987896630683677, - "grad_norm": 0.8824648261070251, - "learning_rate": 8.907659670661467e-05, - "loss": 0.087, - "step": 45670 - }, - { - "epoch": 2.9885508668629375, - "grad_norm": 0.7613914608955383, - "learning_rate": 8.907086525638741e-05, - "loss": 0.0791, - "step": 45680 - }, - { - "epoch": 2.989205103042198, - "grad_norm": 1.0592745542526245, - "learning_rate": 8.9065132487416e-05, - "loss": 0.0757, - "step": 45690 - }, - { - "epoch": 2.989859339221459, - "grad_norm": 0.8600308299064636, - "learning_rate": 8.905939839989391e-05, - "loss": 0.0767, - "step": 45700 - }, - { - "epoch": 2.99051357540072, - "grad_norm": 0.9262317419052124, - "learning_rate": 8.90536629940147e-05, - "loss": 0.0879, - "step": 45710 - }, - { - "epoch": 2.9911678115799805, - "grad_norm": 0.7141725420951843, - "learning_rate": 8.904792626997191e-05, - "loss": 0.0699, - "step": 45720 - }, - { - "epoch": 2.991822047759241, - "grad_norm": 0.8366238474845886, - "learning_rate": 8.904218822795923e-05, - "loss": 0.0717, - "step": 45730 - }, - { - "epoch": 2.992476283938502, - "grad_norm": 0.937675416469574, - "learning_rate": 8.903644886817029e-05, - "loss": 0.0782, - "step": 45740 - }, - { - "epoch": 2.9931305201177625, - "grad_norm": 0.7373892068862915, - "learning_rate": 8.903070819079884e-05, - "loss": 0.0793, - "step": 45750 - }, - { - "epoch": 2.993784756297023, - "grad_norm": 0.9344786405563354, - "learning_rate": 8.902496619603862e-05, - "loss": 0.0852, - "step": 45760 - }, - { - "epoch": 2.994438992476284, - "grad_norm": 0.7279993295669556, - "learning_rate": 8.901922288408343e-05, - "loss": 0.0699, - "step": 45770 - }, - { - "epoch": 2.9950932286555445, - "grad_norm": 1.427483081817627, - "learning_rate": 8.901347825512715e-05, - "loss": 0.0969, - "step": 45780 - }, - { - "epoch": 2.9957474648348055, - "grad_norm": 0.707801103591919, - "learning_rate": 8.900773230936366e-05, - "loss": 0.0845, - "step": 45790 - }, - { - "epoch": 2.996401701014066, - "grad_norm": 0.7957580089569092, - "learning_rate": 8.900198504698689e-05, - "loss": 0.0735, - "step": 45800 - }, - { - "epoch": 2.997055937193327, - "grad_norm": 1.1683952808380127, - "learning_rate": 8.899623646819087e-05, - "loss": 0.0686, - "step": 45810 - }, - { - "epoch": 2.9977101733725875, - "grad_norm": 0.8779153227806091, - "learning_rate": 8.899048657316956e-05, - "loss": 0.0689, - "step": 45820 - }, - { - "epoch": 2.998364409551848, - "grad_norm": 0.8528333306312561, - "learning_rate": 8.89847353621171e-05, - "loss": 0.0696, - "step": 45830 - }, - { - "epoch": 2.999018645731109, - "grad_norm": 0.9389132261276245, - "learning_rate": 8.897898283522756e-05, - "loss": 0.0808, - "step": 45840 - }, - { - "epoch": 2.9996728819103695, - "grad_norm": 0.8795191049575806, - "learning_rate": 8.897322899269513e-05, - "loss": 0.0823, - "step": 45850 - }, - { - "epoch": 3.0003271180896305, - "grad_norm": 0.8083428740501404, - "learning_rate": 8.896747383471402e-05, - "loss": 0.0703, - "step": 45860 - }, - { - "epoch": 3.000981354268891, - "grad_norm": 1.016609787940979, - "learning_rate": 8.896171736147846e-05, - "loss": 0.0926, - "step": 45870 - }, - { - "epoch": 3.001635590448152, - "grad_norm": 0.9670884609222412, - "learning_rate": 8.895595957318277e-05, - "loss": 0.0756, - "step": 45880 - }, - { - "epoch": 3.0022898266274125, - "grad_norm": 0.9883562922477722, - "learning_rate": 8.895020047002127e-05, - "loss": 0.0797, - "step": 45890 - }, - { - "epoch": 3.002944062806673, - "grad_norm": 0.7922661304473877, - "learning_rate": 8.894444005218835e-05, - "loss": 0.0877, - "step": 45900 - }, - { - "epoch": 3.003598298985934, - "grad_norm": 0.8021111488342285, - "learning_rate": 8.893867831987845e-05, - "loss": 0.0731, - "step": 45910 - }, - { - "epoch": 3.0042525351651945, - "grad_norm": 0.8403140902519226, - "learning_rate": 8.893291527328604e-05, - "loss": 0.0819, - "step": 45920 - }, - { - "epoch": 3.0049067713444555, - "grad_norm": 1.021638035774231, - "learning_rate": 8.892715091260564e-05, - "loss": 0.077, - "step": 45930 - }, - { - "epoch": 3.005561007523716, - "grad_norm": 0.723585307598114, - "learning_rate": 8.89213852380318e-05, - "loss": 0.0711, - "step": 45940 - }, - { - "epoch": 3.006215243702977, - "grad_norm": 0.807429313659668, - "learning_rate": 8.891561824975911e-05, - "loss": 0.0756, - "step": 45950 - }, - { - "epoch": 3.0068694798822375, - "grad_norm": 0.9832606911659241, - "learning_rate": 8.890984994798229e-05, - "loss": 0.0752, - "step": 45960 - }, - { - "epoch": 3.007523716061498, - "grad_norm": 0.9934665560722351, - "learning_rate": 8.890408033289595e-05, - "loss": 0.0826, - "step": 45970 - }, - { - "epoch": 3.008177952240759, - "grad_norm": 0.8675631880760193, - "learning_rate": 8.889830940469487e-05, - "loss": 0.0748, - "step": 45980 - }, - { - "epoch": 3.0088321884200195, - "grad_norm": 0.8953355550765991, - "learning_rate": 8.889253716357385e-05, - "loss": 0.0859, - "step": 45990 - }, - { - "epoch": 3.0094864245992805, - "grad_norm": 0.9766877889633179, - "learning_rate": 8.88867636097277e-05, - "loss": 0.0766, - "step": 46000 - }, - { - "epoch": 3.010140660778541, - "grad_norm": 0.8925821185112, - "learning_rate": 8.888098874335129e-05, - "loss": 0.0754, - "step": 46010 - }, - { - "epoch": 3.0107948969578016, - "grad_norm": 0.9773553013801575, - "learning_rate": 8.887521256463953e-05, - "loss": 0.0707, - "step": 46020 - }, - { - "epoch": 3.0114491331370625, - "grad_norm": 1.1067787408828735, - "learning_rate": 8.886943507378741e-05, - "loss": 0.0846, - "step": 46030 - }, - { - "epoch": 3.012103369316323, - "grad_norm": 0.7693238854408264, - "learning_rate": 8.886365627098991e-05, - "loss": 0.0787, - "step": 46040 - }, - { - "epoch": 3.012757605495584, - "grad_norm": 0.9626716375350952, - "learning_rate": 8.885787615644208e-05, - "loss": 0.0769, - "step": 46050 - }, - { - "epoch": 3.0134118416748445, - "grad_norm": 1.0409643650054932, - "learning_rate": 8.885209473033905e-05, - "loss": 0.076, - "step": 46060 - }, - { - "epoch": 3.0140660778541055, - "grad_norm": 0.8555409908294678, - "learning_rate": 8.88463119928759e-05, - "loss": 0.0866, - "step": 46070 - }, - { - "epoch": 3.014720314033366, - "grad_norm": 0.8579902052879333, - "learning_rate": 8.884052794424785e-05, - "loss": 0.0643, - "step": 46080 - }, - { - "epoch": 3.0153745502126266, - "grad_norm": 1.0672752857208252, - "learning_rate": 8.883474258465013e-05, - "loss": 0.084, - "step": 46090 - }, - { - "epoch": 3.0160287863918875, - "grad_norm": 0.9049206972122192, - "learning_rate": 8.882895591427797e-05, - "loss": 0.0775, - "step": 46100 - }, - { - "epoch": 3.016683022571148, - "grad_norm": 0.9947949647903442, - "learning_rate": 8.882316793332674e-05, - "loss": 0.0693, - "step": 46110 - }, - { - "epoch": 3.017337258750409, - "grad_norm": 0.6652380228042603, - "learning_rate": 8.881737864199177e-05, - "loss": 0.0727, - "step": 46120 - }, - { - "epoch": 3.0179914949296696, - "grad_norm": 0.7874847650527954, - "learning_rate": 8.881158804046847e-05, - "loss": 0.0773, - "step": 46130 - }, - { - "epoch": 3.0186457311089305, - "grad_norm": 0.8908420205116272, - "learning_rate": 8.880579612895227e-05, - "loss": 0.0756, - "step": 46140 - }, - { - "epoch": 3.019299967288191, - "grad_norm": 0.7234876751899719, - "learning_rate": 8.88000029076387e-05, - "loss": 0.0749, - "step": 46150 - }, - { - "epoch": 3.0199542034674516, - "grad_norm": 0.6146116256713867, - "learning_rate": 8.879420837672327e-05, - "loss": 0.073, - "step": 46160 - }, - { - "epoch": 3.0206084396467126, - "grad_norm": 0.8692243099212646, - "learning_rate": 8.878841253640156e-05, - "loss": 0.0831, - "step": 46170 - }, - { - "epoch": 3.021262675825973, - "grad_norm": 0.9530091881752014, - "learning_rate": 8.878261538686921e-05, - "loss": 0.0667, - "step": 46180 - }, - { - "epoch": 3.021916912005234, - "grad_norm": 0.8727622032165527, - "learning_rate": 8.877681692832187e-05, - "loss": 0.075, - "step": 46190 - }, - { - "epoch": 3.0225711481844946, - "grad_norm": 0.7857223749160767, - "learning_rate": 8.877101716095525e-05, - "loss": 0.0768, - "step": 46200 - }, - { - "epoch": 3.023225384363755, - "grad_norm": 0.929732084274292, - "learning_rate": 8.876521608496516e-05, - "loss": 0.0851, - "step": 46210 - }, - { - "epoch": 3.023879620543016, - "grad_norm": 0.8777803182601929, - "learning_rate": 8.875941370054733e-05, - "loss": 0.0718, - "step": 46220 - }, - { - "epoch": 3.0245338567222766, - "grad_norm": 0.99214106798172, - "learning_rate": 8.875361000789764e-05, - "loss": 0.0747, - "step": 46230 - }, - { - "epoch": 3.0251880929015376, - "grad_norm": 1.0649714469909668, - "learning_rate": 8.874780500721198e-05, - "loss": 0.0842, - "step": 46240 - }, - { - "epoch": 3.025842329080798, - "grad_norm": 0.7679323554039001, - "learning_rate": 8.874199869868629e-05, - "loss": 0.0746, - "step": 46250 - }, - { - "epoch": 3.026496565260059, - "grad_norm": 0.8521194458007812, - "learning_rate": 8.873619108251653e-05, - "loss": 0.0822, - "step": 46260 - }, - { - "epoch": 3.0271508014393196, - "grad_norm": 1.05009126663208, - "learning_rate": 8.873038215889872e-05, - "loss": 0.0786, - "step": 46270 - }, - { - "epoch": 3.02780503761858, - "grad_norm": 0.8912391662597656, - "learning_rate": 8.872457192802897e-05, - "loss": 0.0807, - "step": 46280 - }, - { - "epoch": 3.028459273797841, - "grad_norm": 0.7786313891410828, - "learning_rate": 8.871876039010334e-05, - "loss": 0.0881, - "step": 46290 - }, - { - "epoch": 3.0291135099771016, - "grad_norm": 0.6473716497421265, - "learning_rate": 8.8712947545318e-05, - "loss": 0.0724, - "step": 46300 - }, - { - "epoch": 3.0297677461563626, - "grad_norm": 0.8231253027915955, - "learning_rate": 8.870713339386916e-05, - "loss": 0.068, - "step": 46310 - }, - { - "epoch": 3.030421982335623, - "grad_norm": 0.8397748470306396, - "learning_rate": 8.870131793595304e-05, - "loss": 0.0842, - "step": 46320 - }, - { - "epoch": 3.031076218514884, - "grad_norm": 1.0040104389190674, - "learning_rate": 8.869550117176597e-05, - "loss": 0.0821, - "step": 46330 - }, - { - "epoch": 3.0317304546941446, - "grad_norm": 0.7324417233467102, - "learning_rate": 8.868968310150423e-05, - "loss": 0.073, - "step": 46340 - }, - { - "epoch": 3.032384690873405, - "grad_norm": 0.8197325468063354, - "learning_rate": 8.868386372536423e-05, - "loss": 0.0722, - "step": 46350 - }, - { - "epoch": 3.033038927052666, - "grad_norm": 0.8666032552719116, - "learning_rate": 8.867804304354237e-05, - "loss": 0.0721, - "step": 46360 - }, - { - "epoch": 3.0336931632319266, - "grad_norm": 0.9289610981941223, - "learning_rate": 8.867222105623512e-05, - "loss": 0.0715, - "step": 46370 - }, - { - "epoch": 3.0343473994111876, - "grad_norm": 0.851219892501831, - "learning_rate": 8.866639776363898e-05, - "loss": 0.078, - "step": 46380 - }, - { - "epoch": 3.035001635590448, - "grad_norm": 0.9999982118606567, - "learning_rate": 8.866057316595053e-05, - "loss": 0.081, - "step": 46390 - }, - { - "epoch": 3.035655871769709, - "grad_norm": 0.8318057656288147, - "learning_rate": 8.865474726336632e-05, - "loss": 0.0796, - "step": 46400 - }, - { - "epoch": 3.0363101079489696, - "grad_norm": 0.7948132157325745, - "learning_rate": 8.864892005608303e-05, - "loss": 0.0706, - "step": 46410 - }, - { - "epoch": 3.03696434412823, - "grad_norm": 0.9667806625366211, - "learning_rate": 8.864309154429733e-05, - "loss": 0.0714, - "step": 46420 - }, - { - "epoch": 3.037618580307491, - "grad_norm": 0.821867823600769, - "learning_rate": 8.863726172820593e-05, - "loss": 0.0669, - "step": 46430 - }, - { - "epoch": 3.0382728164867516, - "grad_norm": 0.8645114302635193, - "learning_rate": 8.863143060800563e-05, - "loss": 0.0738, - "step": 46440 - }, - { - "epoch": 3.0389270526660126, - "grad_norm": 1.0619080066680908, - "learning_rate": 8.862559818389322e-05, - "loss": 0.0883, - "step": 46450 - }, - { - "epoch": 3.039581288845273, - "grad_norm": 0.9746699333190918, - "learning_rate": 8.861976445606559e-05, - "loss": 0.0819, - "step": 46460 - }, - { - "epoch": 3.0402355250245336, - "grad_norm": 0.8296138644218445, - "learning_rate": 8.861392942471961e-05, - "loss": 0.073, - "step": 46470 - }, - { - "epoch": 3.0408897612037946, - "grad_norm": 0.7673683762550354, - "learning_rate": 8.860809309005224e-05, - "loss": 0.0815, - "step": 46480 - }, - { - "epoch": 3.041543997383055, - "grad_norm": 0.9790584444999695, - "learning_rate": 8.860225545226049e-05, - "loss": 0.084, - "step": 46490 - }, - { - "epoch": 3.042198233562316, - "grad_norm": 0.7433209419250488, - "learning_rate": 8.859641651154138e-05, - "loss": 0.0741, - "step": 46500 - }, - { - "epoch": 3.0428524697415766, - "grad_norm": 0.8458490371704102, - "learning_rate": 8.859057626809198e-05, - "loss": 0.0691, - "step": 46510 - }, - { - "epoch": 3.0435067059208376, - "grad_norm": 0.8308846950531006, - "learning_rate": 8.858473472210944e-05, - "loss": 0.0762, - "step": 46520 - }, - { - "epoch": 3.044160942100098, - "grad_norm": 0.8369269967079163, - "learning_rate": 8.857889187379089e-05, - "loss": 0.0701, - "step": 46530 - }, - { - "epoch": 3.0448151782793587, - "grad_norm": 0.8432846665382385, - "learning_rate": 8.857304772333357e-05, - "loss": 0.0691, - "step": 46540 - }, - { - "epoch": 3.0454694144586196, - "grad_norm": 0.7842530608177185, - "learning_rate": 8.856720227093474e-05, - "loss": 0.085, - "step": 46550 - }, - { - "epoch": 3.04612365063788, - "grad_norm": 0.7288837432861328, - "learning_rate": 8.856135551679166e-05, - "loss": 0.0784, - "step": 46560 - }, - { - "epoch": 3.046777886817141, - "grad_norm": 0.7969174981117249, - "learning_rate": 8.855550746110171e-05, - "loss": 0.0849, - "step": 46570 - }, - { - "epoch": 3.0474321229964016, - "grad_norm": 0.9206051826477051, - "learning_rate": 8.854965810406229e-05, - "loss": 0.083, - "step": 46580 - }, - { - "epoch": 3.0480863591756626, - "grad_norm": 0.8779579401016235, - "learning_rate": 8.854380744587078e-05, - "loss": 0.0749, - "step": 46590 - }, - { - "epoch": 3.048740595354923, - "grad_norm": 0.9052606225013733, - "learning_rate": 8.85379554867247e-05, - "loss": 0.0712, - "step": 46600 - }, - { - "epoch": 3.0493948315341837, - "grad_norm": 0.8449774980545044, - "learning_rate": 8.853210222682156e-05, - "loss": 0.0723, - "step": 46610 - }, - { - "epoch": 3.0500490677134446, - "grad_norm": 0.8391105532646179, - "learning_rate": 8.85262476663589e-05, - "loss": 0.0764, - "step": 46620 - }, - { - "epoch": 3.050703303892705, - "grad_norm": 0.8949602842330933, - "learning_rate": 8.852039180553436e-05, - "loss": 0.0765, - "step": 46630 - }, - { - "epoch": 3.051357540071966, - "grad_norm": 0.7765173316001892, - "learning_rate": 8.851453464454554e-05, - "loss": 0.081, - "step": 46640 - }, - { - "epoch": 3.0520117762512267, - "grad_norm": 0.7792592644691467, - "learning_rate": 8.85086761835902e-05, - "loss": 0.0815, - "step": 46650 - }, - { - "epoch": 3.052666012430487, - "grad_norm": 0.8847215175628662, - "learning_rate": 8.850281642286603e-05, - "loss": 0.071, - "step": 46660 - }, - { - "epoch": 3.053320248609748, - "grad_norm": 0.9311257004737854, - "learning_rate": 8.849695536257083e-05, - "loss": 0.0749, - "step": 46670 - }, - { - "epoch": 3.0539744847890087, - "grad_norm": 0.7684807181358337, - "learning_rate": 8.849109300290242e-05, - "loss": 0.0735, - "step": 46680 - }, - { - "epoch": 3.0546287209682697, - "grad_norm": 0.864181399345398, - "learning_rate": 8.84852293440587e-05, - "loss": 0.0766, - "step": 46690 - }, - { - "epoch": 3.05528295714753, - "grad_norm": 0.8090917468070984, - "learning_rate": 8.847936438623754e-05, - "loss": 0.0857, - "step": 46700 - }, - { - "epoch": 3.055937193326791, - "grad_norm": 0.8556300401687622, - "learning_rate": 8.84734981296369e-05, - "loss": 0.0884, - "step": 46710 - }, - { - "epoch": 3.0565914295060517, - "grad_norm": 0.860185444355011, - "learning_rate": 8.846763057445481e-05, - "loss": 0.0749, - "step": 46720 - }, - { - "epoch": 3.057245665685312, - "grad_norm": 0.983341634273529, - "learning_rate": 8.846176172088932e-05, - "loss": 0.078, - "step": 46730 - }, - { - "epoch": 3.057899901864573, - "grad_norm": 0.7510449886322021, - "learning_rate": 8.84558915691385e-05, - "loss": 0.0709, - "step": 46740 - }, - { - "epoch": 3.0585541380438337, - "grad_norm": 0.8257303237915039, - "learning_rate": 8.845002011940047e-05, - "loss": 0.0808, - "step": 46750 - }, - { - "epoch": 3.0592083742230947, - "grad_norm": 0.8801044821739197, - "learning_rate": 8.844414737187342e-05, - "loss": 0.0736, - "step": 46760 - }, - { - "epoch": 3.059862610402355, - "grad_norm": 0.8823607563972473, - "learning_rate": 8.843827332675558e-05, - "loss": 0.0812, - "step": 46770 - }, - { - "epoch": 3.060516846581616, - "grad_norm": 1.0760579109191895, - "learning_rate": 8.84323979842452e-05, - "loss": 0.0779, - "step": 46780 - }, - { - "epoch": 3.0611710827608767, - "grad_norm": 0.8162868022918701, - "learning_rate": 8.842652134454061e-05, - "loss": 0.0686, - "step": 46790 - }, - { - "epoch": 3.061825318940137, - "grad_norm": 0.8627833724021912, - "learning_rate": 8.842064340784013e-05, - "loss": 0.0785, - "step": 46800 - }, - { - "epoch": 3.062479555119398, - "grad_norm": 0.6602069735527039, - "learning_rate": 8.84147641743422e-05, - "loss": 0.0682, - "step": 46810 - }, - { - "epoch": 3.0631337912986587, - "grad_norm": 0.8161008954048157, - "learning_rate": 8.84088836442452e-05, - "loss": 0.0776, - "step": 46820 - }, - { - "epoch": 3.0637880274779197, - "grad_norm": 0.9480443000793457, - "learning_rate": 8.840300181774767e-05, - "loss": 0.0783, - "step": 46830 - }, - { - "epoch": 3.06444226365718, - "grad_norm": 0.92530757188797, - "learning_rate": 8.83971186950481e-05, - "loss": 0.0822, - "step": 46840 - }, - { - "epoch": 3.065096499836441, - "grad_norm": 1.08670973777771, - "learning_rate": 8.839123427634508e-05, - "loss": 0.0785, - "step": 46850 - }, - { - "epoch": 3.0657507360157017, - "grad_norm": 0.8924565315246582, - "learning_rate": 8.838534856183722e-05, - "loss": 0.0715, - "step": 46860 - }, - { - "epoch": 3.066404972194962, - "grad_norm": 0.6959300637245178, - "learning_rate": 8.837946155172318e-05, - "loss": 0.0748, - "step": 46870 - }, - { - "epoch": 3.067059208374223, - "grad_norm": 0.7405787110328674, - "learning_rate": 8.837357324620164e-05, - "loss": 0.0748, - "step": 46880 - }, - { - "epoch": 3.0677134445534837, - "grad_norm": 0.8853090405464172, - "learning_rate": 8.836768364547139e-05, - "loss": 0.0854, - "step": 46890 - }, - { - "epoch": 3.0683676807327447, - "grad_norm": 0.9329870939254761, - "learning_rate": 8.836179274973118e-05, - "loss": 0.0803, - "step": 46900 - }, - { - "epoch": 3.069021916912005, - "grad_norm": 0.9848145246505737, - "learning_rate": 8.835590055917984e-05, - "loss": 0.0866, - "step": 46910 - }, - { - "epoch": 3.0696761530912657, - "grad_norm": 0.8550765514373779, - "learning_rate": 8.835000707401628e-05, - "loss": 0.0753, - "step": 46920 - }, - { - "epoch": 3.0703303892705267, - "grad_norm": 0.7886782288551331, - "learning_rate": 8.834411229443942e-05, - "loss": 0.0732, - "step": 46930 - }, - { - "epoch": 3.0709846254497872, - "grad_norm": 0.7411202788352966, - "learning_rate": 8.833821622064819e-05, - "loss": 0.0774, - "step": 46940 - }, - { - "epoch": 3.071638861629048, - "grad_norm": 1.315242052078247, - "learning_rate": 8.833231885284164e-05, - "loss": 0.0744, - "step": 46950 - }, - { - "epoch": 3.0722930978083087, - "grad_norm": 0.9081234335899353, - "learning_rate": 8.832642019121877e-05, - "loss": 0.0826, - "step": 46960 - }, - { - "epoch": 3.0729473339875697, - "grad_norm": 0.8793578147888184, - "learning_rate": 8.832052023597872e-05, - "loss": 0.0802, - "step": 46970 - }, - { - "epoch": 3.07360157016683, - "grad_norm": 0.9718412756919861, - "learning_rate": 8.831461898732061e-05, - "loss": 0.0784, - "step": 46980 - }, - { - "epoch": 3.0742558063460907, - "grad_norm": 0.7683785557746887, - "learning_rate": 8.83087164454436e-05, - "loss": 0.0773, - "step": 46990 - }, - { - "epoch": 3.0749100425253517, - "grad_norm": 0.8692295551300049, - "learning_rate": 8.830281261054698e-05, - "loss": 0.0702, - "step": 47000 - }, - { - "epoch": 3.0755642787046122, - "grad_norm": 0.8362782597541809, - "learning_rate": 8.829690748282997e-05, - "loss": 0.0676, - "step": 47010 - }, - { - "epoch": 3.076218514883873, - "grad_norm": 1.0380651950836182, - "learning_rate": 8.829100106249189e-05, - "loss": 0.0862, - "step": 47020 - }, - { - "epoch": 3.0768727510631337, - "grad_norm": 0.9438252449035645, - "learning_rate": 8.828509334973209e-05, - "loss": 0.0737, - "step": 47030 - }, - { - "epoch": 3.0775269872423947, - "grad_norm": 0.8855924606323242, - "learning_rate": 8.827918434475001e-05, - "loss": 0.0872, - "step": 47040 - }, - { - "epoch": 3.0781812234216552, - "grad_norm": 0.8621238470077515, - "learning_rate": 8.827327404774505e-05, - "loss": 0.0671, - "step": 47050 - }, - { - "epoch": 3.0788354596009158, - "grad_norm": 0.7865177392959595, - "learning_rate": 8.826736245891672e-05, - "loss": 0.0722, - "step": 47060 - }, - { - "epoch": 3.0794896957801767, - "grad_norm": 1.0064588785171509, - "learning_rate": 8.826144957846455e-05, - "loss": 0.069, - "step": 47070 - }, - { - "epoch": 3.0801439319594373, - "grad_norm": 0.8774595856666565, - "learning_rate": 8.825553540658811e-05, - "loss": 0.0753, - "step": 47080 - }, - { - "epoch": 3.0807981681386982, - "grad_norm": 0.9509215950965881, - "learning_rate": 8.824961994348701e-05, - "loss": 0.0741, - "step": 47090 - }, - { - "epoch": 3.0814524043179587, - "grad_norm": 0.8457993865013123, - "learning_rate": 8.824370318936095e-05, - "loss": 0.0712, - "step": 47100 - }, - { - "epoch": 3.0821066404972193, - "grad_norm": 0.8608481884002686, - "learning_rate": 8.823778514440959e-05, - "loss": 0.0801, - "step": 47110 - }, - { - "epoch": 3.0827608766764802, - "grad_norm": 0.9374340772628784, - "learning_rate": 8.82318658088327e-05, - "loss": 0.0866, - "step": 47120 - }, - { - "epoch": 3.0834151128557408, - "grad_norm": 0.9799492955207825, - "learning_rate": 8.822594518283008e-05, - "loss": 0.0875, - "step": 47130 - }, - { - "epoch": 3.0840693490350017, - "grad_norm": 0.8801363706588745, - "learning_rate": 8.822002326660154e-05, - "loss": 0.0756, - "step": 47140 - }, - { - "epoch": 3.0847235852142623, - "grad_norm": 0.8965553641319275, - "learning_rate": 8.8214100060347e-05, - "loss": 0.0792, - "step": 47150 - }, - { - "epoch": 3.0853778213935232, - "grad_norm": 0.8476041555404663, - "learning_rate": 8.820817556426636e-05, - "loss": 0.075, - "step": 47160 - }, - { - "epoch": 3.0860320575727838, - "grad_norm": 0.9920099973678589, - "learning_rate": 8.82022497785596e-05, - "loss": 0.0683, - "step": 47170 - }, - { - "epoch": 3.0866862937520443, - "grad_norm": 0.8582835793495178, - "learning_rate": 8.81963227034267e-05, - "loss": 0.0713, - "step": 47180 - }, - { - "epoch": 3.0873405299313053, - "grad_norm": 1.1154992580413818, - "learning_rate": 8.819039433906774e-05, - "loss": 0.0937, - "step": 47190 - }, - { - "epoch": 3.087994766110566, - "grad_norm": 1.022351861000061, - "learning_rate": 8.818446468568282e-05, - "loss": 0.0752, - "step": 47200 - }, - { - "epoch": 3.0886490022898268, - "grad_norm": 0.8366675972938538, - "learning_rate": 8.817853374347206e-05, - "loss": 0.0817, - "step": 47210 - }, - { - "epoch": 3.0893032384690873, - "grad_norm": 0.9166560769081116, - "learning_rate": 8.817260151263568e-05, - "loss": 0.0757, - "step": 47220 - }, - { - "epoch": 3.0899574746483482, - "grad_norm": 1.0350792407989502, - "learning_rate": 8.816666799337388e-05, - "loss": 0.0726, - "step": 47230 - }, - { - "epoch": 3.0906117108276088, - "grad_norm": 1.0422755479812622, - "learning_rate": 8.816073318588693e-05, - "loss": 0.0991, - "step": 47240 - }, - { - "epoch": 3.0912659470068693, - "grad_norm": 1.030333161354065, - "learning_rate": 8.815479709037515e-05, - "loss": 0.0694, - "step": 47250 - }, - { - "epoch": 3.0919201831861303, - "grad_norm": 1.0390321016311646, - "learning_rate": 8.814885970703893e-05, - "loss": 0.0855, - "step": 47260 - }, - { - "epoch": 3.092574419365391, - "grad_norm": 0.8490506410598755, - "learning_rate": 8.814292103607862e-05, - "loss": 0.0812, - "step": 47270 - }, - { - "epoch": 3.0932286555446518, - "grad_norm": 0.8992865681648254, - "learning_rate": 8.813698107769471e-05, - "loss": 0.0766, - "step": 47280 - }, - { - "epoch": 3.0938828917239123, - "grad_norm": 0.9719147086143494, - "learning_rate": 8.813103983208766e-05, - "loss": 0.0738, - "step": 47290 - }, - { - "epoch": 3.0945371279031733, - "grad_norm": 0.7177313566207886, - "learning_rate": 8.812509729945802e-05, - "loss": 0.0762, - "step": 47300 - }, - { - "epoch": 3.095191364082434, - "grad_norm": 0.8135479092597961, - "learning_rate": 8.811915348000635e-05, - "loss": 0.0761, - "step": 47310 - }, - { - "epoch": 3.0958456002616943, - "grad_norm": 0.8107147216796875, - "learning_rate": 8.811320837393329e-05, - "loss": 0.069, - "step": 47320 - }, - { - "epoch": 3.0964998364409553, - "grad_norm": 0.881219744682312, - "learning_rate": 8.810726198143949e-05, - "loss": 0.0791, - "step": 47330 - }, - { - "epoch": 3.097154072620216, - "grad_norm": 0.9612583518028259, - "learning_rate": 8.810131430272564e-05, - "loss": 0.0726, - "step": 47340 - }, - { - "epoch": 3.0978083087994768, - "grad_norm": 1.0407850742340088, - "learning_rate": 8.809536533799253e-05, - "loss": 0.0722, - "step": 47350 - }, - { - "epoch": 3.0984625449787373, - "grad_norm": 0.861651599407196, - "learning_rate": 8.808941508744093e-05, - "loss": 0.0808, - "step": 47360 - }, - { - "epoch": 3.099116781157998, - "grad_norm": 0.8344012498855591, - "learning_rate": 8.808346355127166e-05, - "loss": 0.0697, - "step": 47370 - }, - { - "epoch": 3.099771017337259, - "grad_norm": 0.8500309586524963, - "learning_rate": 8.807751072968563e-05, - "loss": 0.0738, - "step": 47380 - }, - { - "epoch": 3.1004252535165193, - "grad_norm": 0.8990835547447205, - "learning_rate": 8.807155662288375e-05, - "loss": 0.0724, - "step": 47390 - }, - { - "epoch": 3.1010794896957803, - "grad_norm": 0.8191524744033813, - "learning_rate": 8.8065601231067e-05, - "loss": 0.0699, - "step": 47400 - }, - { - "epoch": 3.101733725875041, - "grad_norm": 0.9893473982810974, - "learning_rate": 8.805964455443636e-05, - "loss": 0.0728, - "step": 47410 - }, - { - "epoch": 3.102387962054302, - "grad_norm": 0.8191789984703064, - "learning_rate": 8.805368659319291e-05, - "loss": 0.0682, - "step": 47420 - }, - { - "epoch": 3.1030421982335623, - "grad_norm": 0.8927891850471497, - "learning_rate": 8.804772734753773e-05, - "loss": 0.0692, - "step": 47430 - }, - { - "epoch": 3.103696434412823, - "grad_norm": 0.8646766543388367, - "learning_rate": 8.804176681767196e-05, - "loss": 0.0781, - "step": 47440 - }, - { - "epoch": 3.104350670592084, - "grad_norm": 0.9096436500549316, - "learning_rate": 8.803580500379681e-05, - "loss": 0.0854, - "step": 47450 - }, - { - "epoch": 3.1050049067713443, - "grad_norm": 0.8713791370391846, - "learning_rate": 8.802984190611349e-05, - "loss": 0.0833, - "step": 47460 - }, - { - "epoch": 3.1056591429506053, - "grad_norm": 0.9313731789588928, - "learning_rate": 8.802387752482327e-05, - "loss": 0.0796, - "step": 47470 - }, - { - "epoch": 3.106313379129866, - "grad_norm": 0.8143359422683716, - "learning_rate": 8.801791186012744e-05, - "loss": 0.0747, - "step": 47480 - }, - { - "epoch": 3.106967615309127, - "grad_norm": 0.8441165685653687, - "learning_rate": 8.80119449122274e-05, - "loss": 0.0727, - "step": 47490 - }, - { - "epoch": 3.1076218514883873, - "grad_norm": 0.9759175777435303, - "learning_rate": 8.800597668132452e-05, - "loss": 0.0743, - "step": 47500 - }, - { - "epoch": 3.108276087667648, - "grad_norm": 0.8443938493728638, - "learning_rate": 8.800000716762024e-05, - "loss": 0.0739, - "step": 47510 - }, - { - "epoch": 3.108930323846909, - "grad_norm": 0.9399232864379883, - "learning_rate": 8.799403637131609e-05, - "loss": 0.0766, - "step": 47520 - }, - { - "epoch": 3.1095845600261693, - "grad_norm": 0.8907540440559387, - "learning_rate": 8.798806429261355e-05, - "loss": 0.0757, - "step": 47530 - }, - { - "epoch": 3.1102387962054303, - "grad_norm": 0.9284667372703552, - "learning_rate": 8.798209093171421e-05, - "loss": 0.0811, - "step": 47540 - }, - { - "epoch": 3.110893032384691, - "grad_norm": 0.9799740314483643, - "learning_rate": 8.79761162888197e-05, - "loss": 0.0741, - "step": 47550 - }, - { - "epoch": 3.1115472685639514, - "grad_norm": 0.9789896011352539, - "learning_rate": 8.797014036413167e-05, - "loss": 0.0732, - "step": 47560 - }, - { - "epoch": 3.1122015047432123, - "grad_norm": 0.7399867177009583, - "learning_rate": 8.796416315785181e-05, - "loss": 0.0726, - "step": 47570 - }, - { - "epoch": 3.112855740922473, - "grad_norm": 0.9549564719200134, - "learning_rate": 8.795818467018188e-05, - "loss": 0.0777, - "step": 47580 - }, - { - "epoch": 3.113509977101734, - "grad_norm": 0.9988911151885986, - "learning_rate": 8.795220490132369e-05, - "loss": 0.0775, - "step": 47590 - }, - { - "epoch": 3.1141642132809944, - "grad_norm": 0.9814069271087646, - "learning_rate": 8.794622385147903e-05, - "loss": 0.0853, - "step": 47600 - }, - { - "epoch": 3.1148184494602553, - "grad_norm": 0.7243561148643494, - "learning_rate": 8.79402415208498e-05, - "loss": 0.0778, - "step": 47610 - }, - { - "epoch": 3.115472685639516, - "grad_norm": 0.8936898708343506, - "learning_rate": 8.793425790963792e-05, - "loss": 0.0838, - "step": 47620 - }, - { - "epoch": 3.1161269218187764, - "grad_norm": 0.9839106202125549, - "learning_rate": 8.792827301804536e-05, - "loss": 0.0809, - "step": 47630 - }, - { - "epoch": 3.1167811579980373, - "grad_norm": 0.9321417212486267, - "learning_rate": 8.79222868462741e-05, - "loss": 0.0699, - "step": 47640 - }, - { - "epoch": 3.117435394177298, - "grad_norm": 0.9388688206672668, - "learning_rate": 8.791629939452621e-05, - "loss": 0.0872, - "step": 47650 - }, - { - "epoch": 3.118089630356559, - "grad_norm": 0.7993639707565308, - "learning_rate": 8.791031066300378e-05, - "loss": 0.0793, - "step": 47660 - }, - { - "epoch": 3.1187438665358194, - "grad_norm": 0.7926256060600281, - "learning_rate": 8.790432065190892e-05, - "loss": 0.0762, - "step": 47670 - }, - { - "epoch": 3.1193981027150803, - "grad_norm": 0.8904451131820679, - "learning_rate": 8.789832936144386e-05, - "loss": 0.0817, - "step": 47680 - }, - { - "epoch": 3.120052338894341, - "grad_norm": 1.026097059249878, - "learning_rate": 8.789233679181077e-05, - "loss": 0.0745, - "step": 47690 - }, - { - "epoch": 3.1207065750736014, - "grad_norm": 0.8326067328453064, - "learning_rate": 8.788634294321195e-05, - "loss": 0.0662, - "step": 47700 - }, - { - "epoch": 3.1213608112528624, - "grad_norm": 0.9273573160171509, - "learning_rate": 8.788034781584968e-05, - "loss": 0.0782, - "step": 47710 - }, - { - "epoch": 3.122015047432123, - "grad_norm": 0.7090617418289185, - "learning_rate": 8.787435140992635e-05, - "loss": 0.0721, - "step": 47720 - }, - { - "epoch": 3.122669283611384, - "grad_norm": 0.823361873626709, - "learning_rate": 8.786835372564431e-05, - "loss": 0.0743, - "step": 47730 - }, - { - "epoch": 3.1233235197906444, - "grad_norm": 0.8674502372741699, - "learning_rate": 8.786235476320603e-05, - "loss": 0.064, - "step": 47740 - }, - { - "epoch": 3.1239777559699053, - "grad_norm": 0.8977984189987183, - "learning_rate": 8.785635452281397e-05, - "loss": 0.0797, - "step": 47750 - }, - { - "epoch": 3.124631992149166, - "grad_norm": 0.7301965355873108, - "learning_rate": 8.785035300467068e-05, - "loss": 0.0726, - "step": 47760 - }, - { - "epoch": 3.1252862283284264, - "grad_norm": 0.8135896921157837, - "learning_rate": 8.78443502089787e-05, - "loss": 0.0799, - "step": 47770 - }, - { - "epoch": 3.1259404645076874, - "grad_norm": 0.8342558145523071, - "learning_rate": 8.783834613594064e-05, - "loss": 0.0738, - "step": 47780 - }, - { - "epoch": 3.126594700686948, - "grad_norm": 0.7653647065162659, - "learning_rate": 8.783234078575917e-05, - "loss": 0.0768, - "step": 47790 - }, - { - "epoch": 3.127248936866209, - "grad_norm": 0.7020546793937683, - "learning_rate": 8.7826334158637e-05, - "loss": 0.0703, - "step": 47800 - }, - { - "epoch": 3.1279031730454694, - "grad_norm": 0.9205434918403625, - "learning_rate": 8.782032625477681e-05, - "loss": 0.0709, - "step": 47810 - }, - { - "epoch": 3.1285574092247304, - "grad_norm": 0.830418586730957, - "learning_rate": 8.781431707438145e-05, - "loss": 0.0749, - "step": 47820 - }, - { - "epoch": 3.129211645403991, - "grad_norm": 1.1934880018234253, - "learning_rate": 8.780830661765371e-05, - "loss": 0.072, - "step": 47830 - }, - { - "epoch": 3.1298658815832514, - "grad_norm": 0.9115713834762573, - "learning_rate": 8.780229488479646e-05, - "loss": 0.0744, - "step": 47840 - }, - { - "epoch": 3.1305201177625124, - "grad_norm": 0.963766872882843, - "learning_rate": 8.779628187601261e-05, - "loss": 0.0832, - "step": 47850 - }, - { - "epoch": 3.131174353941773, - "grad_norm": 0.7308657765388489, - "learning_rate": 8.779026759150515e-05, - "loss": 0.0826, - "step": 47860 - }, - { - "epoch": 3.131828590121034, - "grad_norm": 0.8448659777641296, - "learning_rate": 8.778425203147703e-05, - "loss": 0.0819, - "step": 47870 - }, - { - "epoch": 3.1324828263002944, - "grad_norm": 1.0432631969451904, - "learning_rate": 8.777823519613131e-05, - "loss": 0.0763, - "step": 47880 - }, - { - "epoch": 3.133137062479555, - "grad_norm": 0.9087404608726501, - "learning_rate": 8.77722170856711e-05, - "loss": 0.0852, - "step": 47890 - }, - { - "epoch": 3.133791298658816, - "grad_norm": 0.8869801163673401, - "learning_rate": 8.776619770029946e-05, - "loss": 0.0716, - "step": 47900 - }, - { - "epoch": 3.1344455348380764, - "grad_norm": 0.7883999943733215, - "learning_rate": 8.776017704021964e-05, - "loss": 0.0694, - "step": 47910 - }, - { - "epoch": 3.1350997710173374, - "grad_norm": 1.1051850318908691, - "learning_rate": 8.77541551056348e-05, - "loss": 0.083, - "step": 47920 - }, - { - "epoch": 3.135754007196598, - "grad_norm": 0.7399940490722656, - "learning_rate": 8.77481318967482e-05, - "loss": 0.0688, - "step": 47930 - }, - { - "epoch": 3.136408243375859, - "grad_norm": 0.8819198608398438, - "learning_rate": 8.774210741376316e-05, - "loss": 0.0813, - "step": 47940 - }, - { - "epoch": 3.1370624795551194, - "grad_norm": 0.8618602156639099, - "learning_rate": 8.773608165688303e-05, - "loss": 0.081, - "step": 47950 - }, - { - "epoch": 3.13771671573438, - "grad_norm": 0.9459993243217468, - "learning_rate": 8.773005462631115e-05, - "loss": 0.072, - "step": 47960 - }, - { - "epoch": 3.138370951913641, - "grad_norm": 0.8384934067726135, - "learning_rate": 8.772402632225098e-05, - "loss": 0.0748, - "step": 47970 - }, - { - "epoch": 3.1390251880929014, - "grad_norm": 0.803860604763031, - "learning_rate": 8.7717996744906e-05, - "loss": 0.0765, - "step": 47980 - }, - { - "epoch": 3.1396794242721624, - "grad_norm": 1.1907633543014526, - "learning_rate": 8.77119658944797e-05, - "loss": 0.0909, - "step": 47990 - }, - { - "epoch": 3.140333660451423, - "grad_norm": 0.8750865459442139, - "learning_rate": 8.770593377117566e-05, - "loss": 0.067, - "step": 48000 - }, - { - "epoch": 3.1409878966306835, - "grad_norm": 0.7471033930778503, - "learning_rate": 8.769990037519747e-05, - "loss": 0.0719, - "step": 48010 - }, - { - "epoch": 3.1416421328099444, - "grad_norm": 1.0200992822647095, - "learning_rate": 8.769386570674876e-05, - "loss": 0.0804, - "step": 48020 - }, - { - "epoch": 3.142296368989205, - "grad_norm": 0.8135073184967041, - "learning_rate": 8.768782976603323e-05, - "loss": 0.0677, - "step": 48030 - }, - { - "epoch": 3.142950605168466, - "grad_norm": 0.8066560626029968, - "learning_rate": 8.76817925532546e-05, - "loss": 0.0816, - "step": 48040 - }, - { - "epoch": 3.1436048413477264, - "grad_norm": 0.9547781348228455, - "learning_rate": 8.767575406861665e-05, - "loss": 0.0732, - "step": 48050 - }, - { - "epoch": 3.1442590775269874, - "grad_norm": 0.9590566754341125, - "learning_rate": 8.766971431232318e-05, - "loss": 0.0797, - "step": 48060 - }, - { - "epoch": 3.144913313706248, - "grad_norm": 0.8913863897323608, - "learning_rate": 8.766367328457808e-05, - "loss": 0.08, - "step": 48070 - }, - { - "epoch": 3.1455675498855085, - "grad_norm": 0.9507361650466919, - "learning_rate": 8.765763098558521e-05, - "loss": 0.0722, - "step": 48080 - }, - { - "epoch": 3.1462217860647694, - "grad_norm": 1.0220569372177124, - "learning_rate": 8.765158741554855e-05, - "loss": 0.083, - "step": 48090 - }, - { - "epoch": 3.14687602224403, - "grad_norm": 0.9265881776809692, - "learning_rate": 8.764554257467207e-05, - "loss": 0.0781, - "step": 48100 - }, - { - "epoch": 3.147530258423291, - "grad_norm": 0.765208899974823, - "learning_rate": 8.763949646315979e-05, - "loss": 0.0691, - "step": 48110 - }, - { - "epoch": 3.1481844946025515, - "grad_norm": 0.7979570627212524, - "learning_rate": 8.76334490812158e-05, - "loss": 0.0834, - "step": 48120 - }, - { - "epoch": 3.1488387307818124, - "grad_norm": 0.6835886240005493, - "learning_rate": 8.76274004290442e-05, - "loss": 0.0679, - "step": 48130 - }, - { - "epoch": 3.149492966961073, - "grad_norm": 0.8190062642097473, - "learning_rate": 8.762135050684915e-05, - "loss": 0.0679, - "step": 48140 - }, - { - "epoch": 3.1501472031403335, - "grad_norm": 0.946603536605835, - "learning_rate": 8.761529931483487e-05, - "loss": 0.0689, - "step": 48150 - }, - { - "epoch": 3.1508014393195944, - "grad_norm": 0.8142273426055908, - "learning_rate": 8.760924685320557e-05, - "loss": 0.0722, - "step": 48160 - }, - { - "epoch": 3.151455675498855, - "grad_norm": 0.7121886610984802, - "learning_rate": 8.760319312216557e-05, - "loss": 0.0683, - "step": 48170 - }, - { - "epoch": 3.152109911678116, - "grad_norm": 0.9496175646781921, - "learning_rate": 8.759713812191917e-05, - "loss": 0.066, - "step": 48180 - }, - { - "epoch": 3.1527641478573765, - "grad_norm": 0.7960578799247742, - "learning_rate": 8.759108185267078e-05, - "loss": 0.0642, - "step": 48190 - }, - { - "epoch": 3.1534183840366374, - "grad_norm": 1.0279724597930908, - "learning_rate": 8.758502431462476e-05, - "loss": 0.0795, - "step": 48200 - }, - { - "epoch": 3.154072620215898, - "grad_norm": 0.8479052186012268, - "learning_rate": 8.757896550798562e-05, - "loss": 0.0718, - "step": 48210 - }, - { - "epoch": 3.1547268563951585, - "grad_norm": 0.7294306755065918, - "learning_rate": 8.757290543295784e-05, - "loss": 0.0758, - "step": 48220 - }, - { - "epoch": 3.1553810925744195, - "grad_norm": 0.7830544710159302, - "learning_rate": 8.756684408974596e-05, - "loss": 0.0777, - "step": 48230 - }, - { - "epoch": 3.15603532875368, - "grad_norm": 0.8489717841148376, - "learning_rate": 8.756078147855455e-05, - "loss": 0.0847, - "step": 48240 - }, - { - "epoch": 3.156689564932941, - "grad_norm": 0.8141524791717529, - "learning_rate": 8.755471759958828e-05, - "loss": 0.074, - "step": 48250 - }, - { - "epoch": 3.1573438011122015, - "grad_norm": 1.0648870468139648, - "learning_rate": 8.754865245305179e-05, - "loss": 0.085, - "step": 48260 - }, - { - "epoch": 3.1579980372914624, - "grad_norm": 0.9387083649635315, - "learning_rate": 8.754258603914982e-05, - "loss": 0.0737, - "step": 48270 - }, - { - "epoch": 3.158652273470723, - "grad_norm": 0.906022846698761, - "learning_rate": 8.75365183580871e-05, - "loss": 0.0797, - "step": 48280 - }, - { - "epoch": 3.1593065096499835, - "grad_norm": 0.9713001847267151, - "learning_rate": 8.753044941006846e-05, - "loss": 0.0766, - "step": 48290 - }, - { - "epoch": 3.1599607458292445, - "grad_norm": 1.0321050882339478, - "learning_rate": 8.75243791952987e-05, - "loss": 0.0814, - "step": 48300 - }, - { - "epoch": 3.160614982008505, - "grad_norm": 0.7994502186775208, - "learning_rate": 8.751830771398272e-05, - "loss": 0.0746, - "step": 48310 - }, - { - "epoch": 3.161269218187766, - "grad_norm": 1.0904173851013184, - "learning_rate": 8.75122349663255e-05, - "loss": 0.0742, - "step": 48320 - }, - { - "epoch": 3.1619234543670265, - "grad_norm": 1.043516755104065, - "learning_rate": 8.750616095253194e-05, - "loss": 0.085, - "step": 48330 - }, - { - "epoch": 3.162577690546287, - "grad_norm": 0.9570246934890747, - "learning_rate": 8.750008567280709e-05, - "loss": 0.0795, - "step": 48340 - }, - { - "epoch": 3.163231926725548, - "grad_norm": 0.9861446022987366, - "learning_rate": 8.749400912735602e-05, - "loss": 0.0799, - "step": 48350 - }, - { - "epoch": 3.1638861629048085, - "grad_norm": 1.004128336906433, - "learning_rate": 8.748793131638379e-05, - "loss": 0.0798, - "step": 48360 - }, - { - "epoch": 3.1645403990840695, - "grad_norm": 0.8075050115585327, - "learning_rate": 8.748185224009558e-05, - "loss": 0.0796, - "step": 48370 - }, - { - "epoch": 3.16519463526333, - "grad_norm": 0.7377058863639832, - "learning_rate": 8.747577189869653e-05, - "loss": 0.0672, - "step": 48380 - }, - { - "epoch": 3.165848871442591, - "grad_norm": 0.8605074882507324, - "learning_rate": 8.746969029239192e-05, - "loss": 0.071, - "step": 48390 - }, - { - "epoch": 3.1665031076218515, - "grad_norm": 0.8945094347000122, - "learning_rate": 8.746360742138698e-05, - "loss": 0.0756, - "step": 48400 - }, - { - "epoch": 3.167157343801112, - "grad_norm": 0.9711698293685913, - "learning_rate": 8.745752328588703e-05, - "loss": 0.0706, - "step": 48410 - }, - { - "epoch": 3.167811579980373, - "grad_norm": 1.2028074264526367, - "learning_rate": 8.745143788609744e-05, - "loss": 0.0807, - "step": 48420 - }, - { - "epoch": 3.1684658161596335, - "grad_norm": 0.8748884201049805, - "learning_rate": 8.744535122222361e-05, - "loss": 0.0808, - "step": 48430 - }, - { - "epoch": 3.1691200523388945, - "grad_norm": 0.8327896595001221, - "learning_rate": 8.743926329447097e-05, - "loss": 0.0734, - "step": 48440 - }, - { - "epoch": 3.169774288518155, - "grad_norm": 0.7776886224746704, - "learning_rate": 8.743317410304501e-05, - "loss": 0.0674, - "step": 48450 - }, - { - "epoch": 3.1704285246974155, - "grad_norm": 0.8404861688613892, - "learning_rate": 8.742708364815125e-05, - "loss": 0.0654, - "step": 48460 - }, - { - "epoch": 3.1710827608766765, - "grad_norm": 0.9241907596588135, - "learning_rate": 8.742099192999525e-05, - "loss": 0.075, - "step": 48470 - }, - { - "epoch": 3.171736997055937, - "grad_norm": 0.7482475638389587, - "learning_rate": 8.741489894878264e-05, - "loss": 0.0702, - "step": 48480 - }, - { - "epoch": 3.172391233235198, - "grad_norm": 0.9275398254394531, - "learning_rate": 8.740880470471907e-05, - "loss": 0.072, - "step": 48490 - }, - { - "epoch": 3.1730454694144585, - "grad_norm": 0.873641848564148, - "learning_rate": 8.740270919801023e-05, - "loss": 0.0783, - "step": 48500 - }, - { - "epoch": 3.1736997055937195, - "grad_norm": 0.8717873692512512, - "learning_rate": 8.739661242886186e-05, - "loss": 0.0775, - "step": 48510 - }, - { - "epoch": 3.17435394177298, - "grad_norm": 0.8635610938072205, - "learning_rate": 8.739051439747973e-05, - "loss": 0.0736, - "step": 48520 - }, - { - "epoch": 3.1750081779522406, - "grad_norm": 1.115180253982544, - "learning_rate": 8.73844151040697e-05, - "loss": 0.0772, - "step": 48530 - }, - { - "epoch": 3.1756624141315015, - "grad_norm": 0.6833996772766113, - "learning_rate": 8.737831454883761e-05, - "loss": 0.0643, - "step": 48540 - }, - { - "epoch": 3.176316650310762, - "grad_norm": 0.8258626461029053, - "learning_rate": 8.737221273198939e-05, - "loss": 0.0809, - "step": 48550 - }, - { - "epoch": 3.176970886490023, - "grad_norm": 0.9197008609771729, - "learning_rate": 8.736610965373095e-05, - "loss": 0.0705, - "step": 48560 - }, - { - "epoch": 3.1776251226692835, - "grad_norm": 0.9148569703102112, - "learning_rate": 8.736000531426833e-05, - "loss": 0.0733, - "step": 48570 - }, - { - "epoch": 3.1782793588485445, - "grad_norm": 0.9025515913963318, - "learning_rate": 8.735389971380755e-05, - "loss": 0.0835, - "step": 48580 - }, - { - "epoch": 3.178933595027805, - "grad_norm": 0.9588202238082886, - "learning_rate": 8.734779285255469e-05, - "loss": 0.0648, - "step": 48590 - }, - { - "epoch": 3.1795878312070656, - "grad_norm": 0.7988367676734924, - "learning_rate": 8.734168473071587e-05, - "loss": 0.083, - "step": 48600 - }, - { - "epoch": 3.1802420673863265, - "grad_norm": 0.7926768064498901, - "learning_rate": 8.733557534849726e-05, - "loss": 0.0718, - "step": 48610 - }, - { - "epoch": 3.180896303565587, - "grad_norm": 1.0557847023010254, - "learning_rate": 8.732946470610506e-05, - "loss": 0.0718, - "step": 48620 - }, - { - "epoch": 3.181550539744848, - "grad_norm": 0.996004045009613, - "learning_rate": 8.732335280374555e-05, - "loss": 0.081, - "step": 48630 - }, - { - "epoch": 3.1822047759241086, - "grad_norm": 0.9401144981384277, - "learning_rate": 8.731723964162498e-05, - "loss": 0.0697, - "step": 48640 - }, - { - "epoch": 3.1828590121033695, - "grad_norm": 0.7425965666770935, - "learning_rate": 8.731112521994969e-05, - "loss": 0.0752, - "step": 48650 - }, - { - "epoch": 3.18351324828263, - "grad_norm": 0.9412823915481567, - "learning_rate": 8.730500953892609e-05, - "loss": 0.07, - "step": 48660 - }, - { - "epoch": 3.1841674844618906, - "grad_norm": 0.7894400358200073, - "learning_rate": 8.729889259876057e-05, - "loss": 0.0686, - "step": 48670 - }, - { - "epoch": 3.1848217206411515, - "grad_norm": 0.8867034912109375, - "learning_rate": 8.729277439965962e-05, - "loss": 0.081, - "step": 48680 - }, - { - "epoch": 3.185475956820412, - "grad_norm": 0.8769906163215637, - "learning_rate": 8.728665494182971e-05, - "loss": 0.0688, - "step": 48690 - }, - { - "epoch": 3.186130192999673, - "grad_norm": 0.8141859173774719, - "learning_rate": 8.728053422547743e-05, - "loss": 0.0803, - "step": 48700 - }, - { - "epoch": 3.1867844291789336, - "grad_norm": 0.8036234974861145, - "learning_rate": 8.727441225080934e-05, - "loss": 0.0813, - "step": 48710 - }, - { - "epoch": 3.1874386653581945, - "grad_norm": 0.7681657075881958, - "learning_rate": 8.726828901803207e-05, - "loss": 0.0797, - "step": 48720 - }, - { - "epoch": 3.188092901537455, - "grad_norm": 0.9178977608680725, - "learning_rate": 8.726216452735232e-05, - "loss": 0.0743, - "step": 48730 - }, - { - "epoch": 3.1887471377167156, - "grad_norm": 0.8207674026489258, - "learning_rate": 8.72560387789768e-05, - "loss": 0.0692, - "step": 48740 - }, - { - "epoch": 3.1894013738959766, - "grad_norm": 0.7519068121910095, - "learning_rate": 8.724991177311224e-05, - "loss": 0.0789, - "step": 48750 - }, - { - "epoch": 3.190055610075237, - "grad_norm": 0.8407567143440247, - "learning_rate": 8.724378350996549e-05, - "loss": 0.0686, - "step": 48760 - }, - { - "epoch": 3.190709846254498, - "grad_norm": 0.6936128735542297, - "learning_rate": 8.723765398974335e-05, - "loss": 0.076, - "step": 48770 - }, - { - "epoch": 3.1913640824337586, - "grad_norm": 0.7411690354347229, - "learning_rate": 8.723152321265275e-05, - "loss": 0.077, - "step": 48780 - }, - { - "epoch": 3.192018318613019, - "grad_norm": 0.9644950032234192, - "learning_rate": 8.722539117890058e-05, - "loss": 0.0803, - "step": 48790 - }, - { - "epoch": 3.19267255479228, - "grad_norm": 0.9315986633300781, - "learning_rate": 8.721925788869383e-05, - "loss": 0.0792, - "step": 48800 - }, - { - "epoch": 3.1933267909715406, - "grad_norm": 0.8755950331687927, - "learning_rate": 8.721312334223952e-05, - "loss": 0.0743, - "step": 48810 - }, - { - "epoch": 3.1939810271508016, - "grad_norm": 0.7426033616065979, - "learning_rate": 8.720698753974473e-05, - "loss": 0.0711, - "step": 48820 - }, - { - "epoch": 3.194635263330062, - "grad_norm": 0.8731258511543274, - "learning_rate": 8.720085048141649e-05, - "loss": 0.0696, - "step": 48830 - }, - { - "epoch": 3.195289499509323, - "grad_norm": 0.9991781711578369, - "learning_rate": 8.7194712167462e-05, - "loss": 0.0847, - "step": 48840 - }, - { - "epoch": 3.1959437356885836, - "grad_norm": 0.78664630651474, - "learning_rate": 8.718857259808843e-05, - "loss": 0.0788, - "step": 48850 - }, - { - "epoch": 3.196597971867844, - "grad_norm": 0.8035576939582825, - "learning_rate": 8.7182431773503e-05, - "loss": 0.0652, - "step": 48860 - }, - { - "epoch": 3.197252208047105, - "grad_norm": 0.9748409390449524, - "learning_rate": 8.717628969391298e-05, - "loss": 0.0878, - "step": 48870 - }, - { - "epoch": 3.1979064442263656, - "grad_norm": 0.9272085428237915, - "learning_rate": 8.717014635952569e-05, - "loss": 0.0736, - "step": 48880 - }, - { - "epoch": 3.1985606804056266, - "grad_norm": 0.7096579670906067, - "learning_rate": 8.716400177054849e-05, - "loss": 0.0637, - "step": 48890 - }, - { - "epoch": 3.199214916584887, - "grad_norm": 0.9682911038398743, - "learning_rate": 8.715785592718875e-05, - "loss": 0.0681, - "step": 48900 - }, - { - "epoch": 3.1998691527641476, - "grad_norm": 0.7501932382583618, - "learning_rate": 8.715170882965391e-05, - "loss": 0.0706, - "step": 48910 - }, - { - "epoch": 3.2005233889434086, - "grad_norm": 0.8990116715431213, - "learning_rate": 8.714556047815147e-05, - "loss": 0.0719, - "step": 48920 - }, - { - "epoch": 3.201177625122669, - "grad_norm": 0.8213568925857544, - "learning_rate": 8.713941087288897e-05, - "loss": 0.0646, - "step": 48930 - }, - { - "epoch": 3.20183186130193, - "grad_norm": 0.9337528944015503, - "learning_rate": 8.713326001407393e-05, - "loss": 0.0844, - "step": 48940 - }, - { - "epoch": 3.2024860974811906, - "grad_norm": 0.7658262848854065, - "learning_rate": 8.712710790191399e-05, - "loss": 0.0723, - "step": 48950 - }, - { - "epoch": 3.2031403336604516, - "grad_norm": 0.803835391998291, - "learning_rate": 8.712095453661677e-05, - "loss": 0.0714, - "step": 48960 - }, - { - "epoch": 3.203794569839712, - "grad_norm": 0.8985826373100281, - "learning_rate": 8.711479991839e-05, - "loss": 0.0761, - "step": 48970 - }, - { - "epoch": 3.2044488060189726, - "grad_norm": 0.8591262698173523, - "learning_rate": 8.710864404744139e-05, - "loss": 0.0807, - "step": 48980 - }, - { - "epoch": 3.2051030421982336, - "grad_norm": 0.8583003878593445, - "learning_rate": 8.710248692397872e-05, - "loss": 0.0725, - "step": 48990 - }, - { - "epoch": 3.205757278377494, - "grad_norm": 0.8409457802772522, - "learning_rate": 8.709632854820982e-05, - "loss": 0.0805, - "step": 49000 - }, - { - "epoch": 3.206411514556755, - "grad_norm": 0.8898982405662537, - "learning_rate": 8.709016892034252e-05, - "loss": 0.0737, - "step": 49010 - }, - { - "epoch": 3.2070657507360156, - "grad_norm": 0.8796902298927307, - "learning_rate": 8.708400804058478e-05, - "loss": 0.0786, - "step": 49020 - }, - { - "epoch": 3.2077199869152766, - "grad_norm": 0.9222922325134277, - "learning_rate": 8.70778459091445e-05, - "loss": 0.0753, - "step": 49030 - }, - { - "epoch": 3.208374223094537, - "grad_norm": 0.9596317410469055, - "learning_rate": 8.707168252622966e-05, - "loss": 0.0713, - "step": 49040 - }, - { - "epoch": 3.2090284592737977, - "grad_norm": 0.9624839425086975, - "learning_rate": 8.706551789204833e-05, - "loss": 0.0772, - "step": 49050 - }, - { - "epoch": 3.2096826954530586, - "grad_norm": 0.9401121139526367, - "learning_rate": 8.705935200680854e-05, - "loss": 0.0724, - "step": 49060 - }, - { - "epoch": 3.210336931632319, - "grad_norm": 0.7766807675361633, - "learning_rate": 8.705318487071846e-05, - "loss": 0.0691, - "step": 49070 - }, - { - "epoch": 3.21099116781158, - "grad_norm": 0.8511127233505249, - "learning_rate": 8.704701648398621e-05, - "loss": 0.0736, - "step": 49080 - }, - { - "epoch": 3.2116454039908406, - "grad_norm": 0.9326490163803101, - "learning_rate": 8.704084684681998e-05, - "loss": 0.0704, - "step": 49090 - }, - { - "epoch": 3.2122996401701016, - "grad_norm": 1.0042059421539307, - "learning_rate": 8.703467595942803e-05, - "loss": 0.0755, - "step": 49100 - }, - { - "epoch": 3.212953876349362, - "grad_norm": 0.7557867169380188, - "learning_rate": 8.702850382201863e-05, - "loss": 0.0707, - "step": 49110 - }, - { - "epoch": 3.2136081125286227, - "grad_norm": 1.0037438869476318, - "learning_rate": 8.702233043480015e-05, - "loss": 0.0776, - "step": 49120 - }, - { - "epoch": 3.2142623487078836, - "grad_norm": 0.7470929026603699, - "learning_rate": 8.701615579798089e-05, - "loss": 0.0707, - "step": 49130 - }, - { - "epoch": 3.214916584887144, - "grad_norm": 0.8272511959075928, - "learning_rate": 8.70099799117693e-05, - "loss": 0.0767, - "step": 49140 - }, - { - "epoch": 3.215570821066405, - "grad_norm": 1.0610016584396362, - "learning_rate": 8.700380277637384e-05, - "loss": 0.0749, - "step": 49150 - }, - { - "epoch": 3.2162250572456657, - "grad_norm": 0.7501885890960693, - "learning_rate": 8.699762439200298e-05, - "loss": 0.0777, - "step": 49160 - }, - { - "epoch": 3.2168792934249266, - "grad_norm": 0.8301616311073303, - "learning_rate": 8.699144475886526e-05, - "loss": 0.0798, - "step": 49170 - }, - { - "epoch": 3.217533529604187, - "grad_norm": 0.8588626384735107, - "learning_rate": 8.698526387716928e-05, - "loss": 0.0745, - "step": 49180 - }, - { - "epoch": 3.2181877657834477, - "grad_norm": 1.1801152229309082, - "learning_rate": 8.697908174712363e-05, - "loss": 0.0882, - "step": 49190 - }, - { - "epoch": 3.2188420019627086, - "grad_norm": 0.9209977984428406, - "learning_rate": 8.6972898368937e-05, - "loss": 0.0757, - "step": 49200 - }, - { - "epoch": 3.219496238141969, - "grad_norm": 1.0483894348144531, - "learning_rate": 8.696671374281808e-05, - "loss": 0.0722, - "step": 49210 - }, - { - "epoch": 3.22015047432123, - "grad_norm": 0.9337593913078308, - "learning_rate": 8.696052786897563e-05, - "loss": 0.075, - "step": 49220 - }, - { - "epoch": 3.2208047105004907, - "grad_norm": 0.8857582807540894, - "learning_rate": 8.695434074761843e-05, - "loss": 0.0744, - "step": 49230 - }, - { - "epoch": 3.221458946679751, - "grad_norm": 0.8099195957183838, - "learning_rate": 8.69481523789553e-05, - "loss": 0.072, - "step": 49240 - }, - { - "epoch": 3.222113182859012, - "grad_norm": 0.8427746295928955, - "learning_rate": 8.694196276319514e-05, - "loss": 0.0747, - "step": 49250 - }, - { - "epoch": 3.2227674190382727, - "grad_norm": 0.8717663884162903, - "learning_rate": 8.693577190054685e-05, - "loss": 0.0792, - "step": 49260 - }, - { - "epoch": 3.2234216552175337, - "grad_norm": 0.8552377820014954, - "learning_rate": 8.69295797912194e-05, - "loss": 0.0777, - "step": 49270 - }, - { - "epoch": 3.224075891396794, - "grad_norm": 0.8015262484550476, - "learning_rate": 8.692338643542177e-05, - "loss": 0.0846, - "step": 49280 - }, - { - "epoch": 3.224730127576055, - "grad_norm": 0.7176341414451599, - "learning_rate": 8.691719183336302e-05, - "loss": 0.0658, - "step": 49290 - }, - { - "epoch": 3.2253843637553157, - "grad_norm": 0.8882294297218323, - "learning_rate": 8.69109959852522e-05, - "loss": 0.0737, - "step": 49300 - }, - { - "epoch": 3.226038599934576, - "grad_norm": 0.9010905623435974, - "learning_rate": 8.69047988912985e-05, - "loss": 0.0838, - "step": 49310 - }, - { - "epoch": 3.226692836113837, - "grad_norm": 0.8391245007514954, - "learning_rate": 8.689860055171104e-05, - "loss": 0.0727, - "step": 49320 - }, - { - "epoch": 3.2273470722930977, - "grad_norm": 1.213536024093628, - "learning_rate": 8.689240096669903e-05, - "loss": 0.0761, - "step": 49330 - }, - { - "epoch": 3.2280013084723587, - "grad_norm": 0.9631514549255371, - "learning_rate": 8.688620013647175e-05, - "loss": 0.0703, - "step": 49340 - }, - { - "epoch": 3.228655544651619, - "grad_norm": 0.8843837380409241, - "learning_rate": 8.687999806123847e-05, - "loss": 0.0657, - "step": 49350 - }, - { - "epoch": 3.2293097808308797, - "grad_norm": 0.8727587461471558, - "learning_rate": 8.687379474120852e-05, - "loss": 0.085, - "step": 49360 - }, - { - "epoch": 3.2299640170101407, - "grad_norm": 0.8236467838287354, - "learning_rate": 8.686759017659132e-05, - "loss": 0.0652, - "step": 49370 - }, - { - "epoch": 3.230618253189401, - "grad_norm": 0.876015841960907, - "learning_rate": 8.686138436759623e-05, - "loss": 0.0741, - "step": 49380 - }, - { - "epoch": 3.231272489368662, - "grad_norm": 0.8100466132164001, - "learning_rate": 8.685517731443278e-05, - "loss": 0.0748, - "step": 49390 - }, - { - "epoch": 3.2319267255479227, - "grad_norm": 0.8338639140129089, - "learning_rate": 8.684896901731041e-05, - "loss": 0.0731, - "step": 49400 - }, - { - "epoch": 3.2325809617271837, - "grad_norm": 0.8085967302322388, - "learning_rate": 8.684275947643872e-05, - "loss": 0.075, - "step": 49410 - }, - { - "epoch": 3.233235197906444, - "grad_norm": 0.8683192729949951, - "learning_rate": 8.683654869202726e-05, - "loss": 0.072, - "step": 49420 - }, - { - "epoch": 3.2338894340857047, - "grad_norm": 1.1093887090682983, - "learning_rate": 8.683033666428568e-05, - "loss": 0.0715, - "step": 49430 - }, - { - "epoch": 3.2345436702649657, - "grad_norm": 1.2019805908203125, - "learning_rate": 8.682412339342363e-05, - "loss": 0.0825, - "step": 49440 - }, - { - "epoch": 3.2351979064442262, - "grad_norm": 0.8940714001655579, - "learning_rate": 8.681790887965087e-05, - "loss": 0.0761, - "step": 49450 - }, - { - "epoch": 3.235852142623487, - "grad_norm": 0.9425767660140991, - "learning_rate": 8.681169312317709e-05, - "loss": 0.0733, - "step": 49460 - }, - { - "epoch": 3.2365063788027477, - "grad_norm": 0.9423227310180664, - "learning_rate": 8.680547612421215e-05, - "loss": 0.0888, - "step": 49470 - }, - { - "epoch": 3.2371606149820087, - "grad_norm": 1.0191290378570557, - "learning_rate": 8.679925788296586e-05, - "loss": 0.0749, - "step": 49480 - }, - { - "epoch": 3.237814851161269, - "grad_norm": 0.8777630925178528, - "learning_rate": 8.67930383996481e-05, - "loss": 0.083, - "step": 49490 - }, - { - "epoch": 3.2384690873405297, - "grad_norm": 1.0613380670547485, - "learning_rate": 8.678681767446882e-05, - "loss": 0.0801, - "step": 49500 - }, - { - "epoch": 3.2391233235197907, - "grad_norm": 0.8227213621139526, - "learning_rate": 8.678059570763794e-05, - "loss": 0.0692, - "step": 49510 - }, - { - "epoch": 3.2397775596990512, - "grad_norm": 0.8650736212730408, - "learning_rate": 8.677437249936552e-05, - "loss": 0.0787, - "step": 49520 - }, - { - "epoch": 3.240431795878312, - "grad_norm": 0.8299550414085388, - "learning_rate": 8.676814804986158e-05, - "loss": 0.0721, - "step": 49530 - }, - { - "epoch": 3.2410860320575727, - "grad_norm": 1.0536143779754639, - "learning_rate": 8.67619223593362e-05, - "loss": 0.074, - "step": 49540 - }, - { - "epoch": 3.2417402682368337, - "grad_norm": 0.9724963903427124, - "learning_rate": 8.675569542799953e-05, - "loss": 0.0775, - "step": 49550 - }, - { - "epoch": 3.2423945044160942, - "grad_norm": 1.0580730438232422, - "learning_rate": 8.674946725606176e-05, - "loss": 0.0856, - "step": 49560 - }, - { - "epoch": 3.2430487405953548, - "grad_norm": 0.9586568474769592, - "learning_rate": 8.674323784373308e-05, - "loss": 0.0783, - "step": 49570 - }, - { - "epoch": 3.2437029767746157, - "grad_norm": 0.8590267300605774, - "learning_rate": 8.673700719122375e-05, - "loss": 0.0748, - "step": 49580 - }, - { - "epoch": 3.2443572129538762, - "grad_norm": 0.9745254516601562, - "learning_rate": 8.673077529874409e-05, - "loss": 0.0857, - "step": 49590 - }, - { - "epoch": 3.245011449133137, - "grad_norm": 0.7989119291305542, - "learning_rate": 8.672454216650445e-05, - "loss": 0.0696, - "step": 49600 - }, - { - "epoch": 3.2456656853123977, - "grad_norm": 0.9528154730796814, - "learning_rate": 8.671830779471518e-05, - "loss": 0.082, - "step": 49610 - }, - { - "epoch": 3.2463199214916587, - "grad_norm": 0.8685365319252014, - "learning_rate": 8.671207218358672e-05, - "loss": 0.0709, - "step": 49620 - }, - { - "epoch": 3.2469741576709192, - "grad_norm": 0.7714129090309143, - "learning_rate": 8.670583533332957e-05, - "loss": 0.078, - "step": 49630 - }, - { - "epoch": 3.2476283938501798, - "grad_norm": 0.865516722202301, - "learning_rate": 8.669959724415419e-05, - "loss": 0.0676, - "step": 49640 - }, - { - "epoch": 3.2482826300294407, - "grad_norm": 0.8643177151679993, - "learning_rate": 8.669335791627116e-05, - "loss": 0.0699, - "step": 49650 - }, - { - "epoch": 3.2489368662087013, - "grad_norm": 0.9267588257789612, - "learning_rate": 8.668711734989105e-05, - "loss": 0.0746, - "step": 49660 - }, - { - "epoch": 3.2495911023879622, - "grad_norm": 1.2003788948059082, - "learning_rate": 8.668087554522455e-05, - "loss": 0.0673, - "step": 49670 - }, - { - "epoch": 3.2502453385672228, - "grad_norm": 0.869156002998352, - "learning_rate": 8.667463250248228e-05, - "loss": 0.0709, - "step": 49680 - }, - { - "epoch": 3.2508995747464837, - "grad_norm": 1.0042074918746948, - "learning_rate": 8.666838822187498e-05, - "loss": 0.0782, - "step": 49690 - }, - { - "epoch": 3.2515538109257442, - "grad_norm": 0.9066942930221558, - "learning_rate": 8.666214270361342e-05, - "loss": 0.0717, - "step": 49700 - }, - { - "epoch": 3.2522080471050048, - "grad_norm": 0.8262027502059937, - "learning_rate": 8.665589594790838e-05, - "loss": 0.0818, - "step": 49710 - }, - { - "epoch": 3.2528622832842657, - "grad_norm": 0.7925301194190979, - "learning_rate": 8.664964795497073e-05, - "loss": 0.0704, - "step": 49720 - }, - { - "epoch": 3.2535165194635263, - "grad_norm": 0.8457410335540771, - "learning_rate": 8.664339872501133e-05, - "loss": 0.0743, - "step": 49730 - }, - { - "epoch": 3.2541707556427872, - "grad_norm": 1.0192575454711914, - "learning_rate": 8.663714825824114e-05, - "loss": 0.0792, - "step": 49740 - }, - { - "epoch": 3.2548249918220478, - "grad_norm": 0.887332558631897, - "learning_rate": 8.66308965548711e-05, - "loss": 0.0744, - "step": 49750 - }, - { - "epoch": 3.2554792280013083, - "grad_norm": 0.9287620782852173, - "learning_rate": 8.662464361511224e-05, - "loss": 0.0757, - "step": 49760 - }, - { - "epoch": 3.2561334641805693, - "grad_norm": 0.9490354657173157, - "learning_rate": 8.661838943917561e-05, - "loss": 0.0802, - "step": 49770 - }, - { - "epoch": 3.25678770035983, - "grad_norm": 0.7195919752120972, - "learning_rate": 8.661213402727229e-05, - "loss": 0.0737, - "step": 49780 - }, - { - "epoch": 3.2574419365390908, - "grad_norm": 0.7042505145072937, - "learning_rate": 8.660587737961344e-05, - "loss": 0.0644, - "step": 49790 - }, - { - "epoch": 3.2580961727183513, - "grad_norm": 0.7407015562057495, - "learning_rate": 8.659961949641023e-05, - "loss": 0.067, - "step": 49800 - }, - { - "epoch": 3.258750408897612, - "grad_norm": 0.7478853464126587, - "learning_rate": 8.659336037787384e-05, - "loss": 0.0737, - "step": 49810 - }, - { - "epoch": 3.2594046450768728, - "grad_norm": 0.8127663135528564, - "learning_rate": 8.658710002421561e-05, - "loss": 0.0697, - "step": 49820 - }, - { - "epoch": 3.2600588812561333, - "grad_norm": 1.1231528520584106, - "learning_rate": 8.658083843564677e-05, - "loss": 0.0831, - "step": 49830 - }, - { - "epoch": 3.2607131174353943, - "grad_norm": 0.6892096400260925, - "learning_rate": 8.657457561237871e-05, - "loss": 0.0696, - "step": 49840 - }, - { - "epoch": 3.261367353614655, - "grad_norm": 0.8850003480911255, - "learning_rate": 8.656831155462281e-05, - "loss": 0.0721, - "step": 49850 - }, - { - "epoch": 3.2620215897939158, - "grad_norm": 0.7798917889595032, - "learning_rate": 8.656204626259048e-05, - "loss": 0.0768, - "step": 49860 - }, - { - "epoch": 3.2626758259731763, - "grad_norm": 0.934593915939331, - "learning_rate": 8.655577973649321e-05, - "loss": 0.0803, - "step": 49870 - }, - { - "epoch": 3.263330062152437, - "grad_norm": 0.9441463947296143, - "learning_rate": 8.65495119765425e-05, - "loss": 0.0735, - "step": 49880 - }, - { - "epoch": 3.263984298331698, - "grad_norm": 0.6787394881248474, - "learning_rate": 8.65432429829499e-05, - "loss": 0.0688, - "step": 49890 - }, - { - "epoch": 3.2646385345109583, - "grad_norm": 0.8261919021606445, - "learning_rate": 8.653697275592702e-05, - "loss": 0.0669, - "step": 49900 - }, - { - "epoch": 3.2652927706902193, - "grad_norm": 1.0204743146896362, - "learning_rate": 8.653070129568548e-05, - "loss": 0.0691, - "step": 49910 - }, - { - "epoch": 3.26594700686948, - "grad_norm": 1.0907118320465088, - "learning_rate": 8.652442860243698e-05, - "loss": 0.0808, - "step": 49920 - }, - { - "epoch": 3.2666012430487408, - "grad_norm": 1.026333212852478, - "learning_rate": 8.651815467639321e-05, - "loss": 0.0761, - "step": 49930 - }, - { - "epoch": 3.2672554792280013, - "grad_norm": 0.9486099481582642, - "learning_rate": 8.651187951776593e-05, - "loss": 0.0708, - "step": 49940 - }, - { - "epoch": 3.267909715407262, - "grad_norm": 0.9680077433586121, - "learning_rate": 8.6505603126767e-05, - "loss": 0.0742, - "step": 49950 - }, - { - "epoch": 3.268563951586523, - "grad_norm": 0.789247989654541, - "learning_rate": 8.649932550360821e-05, - "loss": 0.0803, - "step": 49960 - }, - { - "epoch": 3.2692181877657833, - "grad_norm": 0.725689709186554, - "learning_rate": 8.649304664850145e-05, - "loss": 0.0762, - "step": 49970 - }, - { - "epoch": 3.2698724239450443, - "grad_norm": 0.7912468314170837, - "learning_rate": 8.648676656165867e-05, - "loss": 0.0754, - "step": 49980 - }, - { - "epoch": 3.270526660124305, - "grad_norm": 0.6831502318382263, - "learning_rate": 8.648048524329182e-05, - "loss": 0.0736, - "step": 49990 - }, - { - "epoch": 3.271180896303566, - "grad_norm": 0.8818649053573608, - "learning_rate": 8.647420269361294e-05, - "loss": 0.0841, - "step": 50000 - }, - { - "epoch": 3.2718351324828263, - "grad_norm": 0.8740116953849792, - "learning_rate": 8.646791891283403e-05, - "loss": 0.07, - "step": 50010 - }, - { - "epoch": 3.272489368662087, - "grad_norm": 0.8219292759895325, - "learning_rate": 8.646163390116723e-05, - "loss": 0.0731, - "step": 50020 - }, - { - "epoch": 3.273143604841348, - "grad_norm": 0.7342281341552734, - "learning_rate": 8.645534765882469e-05, - "loss": 0.0729, - "step": 50030 - }, - { - "epoch": 3.2737978410206083, - "grad_norm": 0.6889681220054626, - "learning_rate": 8.644906018601852e-05, - "loss": 0.0697, - "step": 50040 - }, - { - "epoch": 3.2744520771998693, - "grad_norm": 0.7851772904396057, - "learning_rate": 8.6442771482961e-05, - "loss": 0.0777, - "step": 50050 - }, - { - "epoch": 3.27510631337913, - "grad_norm": 0.9841508865356445, - "learning_rate": 8.643648154986435e-05, - "loss": 0.0747, - "step": 50060 - }, - { - "epoch": 3.275760549558391, - "grad_norm": 0.8496854305267334, - "learning_rate": 8.64301903869409e-05, - "loss": 0.0708, - "step": 50070 - }, - { - "epoch": 3.2764147857376513, - "grad_norm": 0.8300930261611938, - "learning_rate": 8.642389799440298e-05, - "loss": 0.0692, - "step": 50080 - }, - { - "epoch": 3.277069021916912, - "grad_norm": 0.7653340697288513, - "learning_rate": 8.641760437246297e-05, - "loss": 0.0751, - "step": 50090 - }, - { - "epoch": 3.277723258096173, - "grad_norm": 0.7286269068717957, - "learning_rate": 8.641130952133332e-05, - "loss": 0.0665, - "step": 50100 - }, - { - "epoch": 3.2783774942754333, - "grad_norm": 0.9306923151016235, - "learning_rate": 8.64050134412265e-05, - "loss": 0.0827, - "step": 50110 - }, - { - "epoch": 3.2790317304546943, - "grad_norm": 0.7376532554626465, - "learning_rate": 8.639871613235495e-05, - "loss": 0.072, - "step": 50120 - }, - { - "epoch": 3.279685966633955, - "grad_norm": 0.8583968877792358, - "learning_rate": 8.639241759493131e-05, - "loss": 0.0756, - "step": 50130 - }, - { - "epoch": 3.280340202813216, - "grad_norm": 0.9432209134101868, - "learning_rate": 8.638611782916812e-05, - "loss": 0.0757, - "step": 50140 - }, - { - "epoch": 3.2809944389924763, - "grad_norm": 0.8525569438934326, - "learning_rate": 8.637981683527803e-05, - "loss": 0.0743, - "step": 50150 - }, - { - "epoch": 3.281648675171737, - "grad_norm": 1.0892137289047241, - "learning_rate": 8.637351461347371e-05, - "loss": 0.0712, - "step": 50160 - }, - { - "epoch": 3.282302911350998, - "grad_norm": 0.8420467376708984, - "learning_rate": 8.636721116396787e-05, - "loss": 0.0643, - "step": 50170 - }, - { - "epoch": 3.2829571475302584, - "grad_norm": 0.7865369319915771, - "learning_rate": 8.636090648697329e-05, - "loss": 0.0713, - "step": 50180 - }, - { - "epoch": 3.2836113837095193, - "grad_norm": 0.8081366419792175, - "learning_rate": 8.635460058270274e-05, - "loss": 0.0859, - "step": 50190 - }, - { - "epoch": 3.28426561988878, - "grad_norm": 0.8444221019744873, - "learning_rate": 8.63482934513691e-05, - "loss": 0.0721, - "step": 50200 - }, - { - "epoch": 3.2849198560680404, - "grad_norm": 0.9457107782363892, - "learning_rate": 8.634198509318521e-05, - "loss": 0.0678, - "step": 50210 - }, - { - "epoch": 3.2855740922473013, - "grad_norm": 0.908445417881012, - "learning_rate": 8.633567550836403e-05, - "loss": 0.0757, - "step": 50220 - }, - { - "epoch": 3.286228328426562, - "grad_norm": 0.9582734107971191, - "learning_rate": 8.63293646971185e-05, - "loss": 0.0725, - "step": 50230 - }, - { - "epoch": 3.286882564605823, - "grad_norm": 1.1066676378250122, - "learning_rate": 8.632305265966163e-05, - "loss": 0.0838, - "step": 50240 - }, - { - "epoch": 3.2875368007850834, - "grad_norm": 1.1428422927856445, - "learning_rate": 8.631673939620646e-05, - "loss": 0.0725, - "step": 50250 - }, - { - "epoch": 3.288191036964344, - "grad_norm": 0.8173267841339111, - "learning_rate": 8.631042490696612e-05, - "loss": 0.0736, - "step": 50260 - }, - { - "epoch": 3.288845273143605, - "grad_norm": 0.7522175908088684, - "learning_rate": 8.63041091921537e-05, - "loss": 0.067, - "step": 50270 - }, - { - "epoch": 3.2894995093228654, - "grad_norm": 0.7062971591949463, - "learning_rate": 8.629779225198238e-05, - "loss": 0.0683, - "step": 50280 - }, - { - "epoch": 3.2901537455021264, - "grad_norm": 0.9391506314277649, - "learning_rate": 8.629147408666537e-05, - "loss": 0.0705, - "step": 50290 - }, - { - "epoch": 3.290807981681387, - "grad_norm": 0.7575945854187012, - "learning_rate": 8.628515469641593e-05, - "loss": 0.0696, - "step": 50300 - }, - { - "epoch": 3.291462217860648, - "grad_norm": 0.7844806909561157, - "learning_rate": 8.627883408144737e-05, - "loss": 0.0746, - "step": 50310 - }, - { - "epoch": 3.2921164540399084, - "grad_norm": 0.7494459748268127, - "learning_rate": 8.627251224197302e-05, - "loss": 0.0704, - "step": 50320 - }, - { - "epoch": 3.292770690219169, - "grad_norm": 0.7377359867095947, - "learning_rate": 8.626618917820624e-05, - "loss": 0.0679, - "step": 50330 - }, - { - "epoch": 3.29342492639843, - "grad_norm": 0.7377490401268005, - "learning_rate": 8.625986489036048e-05, - "loss": 0.076, - "step": 50340 - }, - { - "epoch": 3.2940791625776904, - "grad_norm": 0.7678962349891663, - "learning_rate": 8.625353937864917e-05, - "loss": 0.071, - "step": 50350 - }, - { - "epoch": 3.2947333987569514, - "grad_norm": 0.9204226136207581, - "learning_rate": 8.624721264328584e-05, - "loss": 0.0675, - "step": 50360 - }, - { - "epoch": 3.295387634936212, - "grad_norm": 0.8501763343811035, - "learning_rate": 8.624088468448401e-05, - "loss": 0.0716, - "step": 50370 - }, - { - "epoch": 3.296041871115473, - "grad_norm": 1.0551090240478516, - "learning_rate": 8.623455550245727e-05, - "loss": 0.0702, - "step": 50380 - }, - { - "epoch": 3.2966961072947334, - "grad_norm": 0.9518636465072632, - "learning_rate": 8.622822509741928e-05, - "loss": 0.0756, - "step": 50390 - }, - { - "epoch": 3.297350343473994, - "grad_norm": 0.971017062664032, - "learning_rate": 8.622189346958365e-05, - "loss": 0.069, - "step": 50400 - }, - { - "epoch": 3.298004579653255, - "grad_norm": 0.8658537268638611, - "learning_rate": 8.621556061916414e-05, - "loss": 0.0779, - "step": 50410 - }, - { - "epoch": 3.2986588158325154, - "grad_norm": 0.9639972448348999, - "learning_rate": 8.620922654637446e-05, - "loss": 0.0717, - "step": 50420 - }, - { - "epoch": 3.2993130520117764, - "grad_norm": 0.9584673643112183, - "learning_rate": 8.620289125142845e-05, - "loss": 0.0691, - "step": 50430 - }, - { - "epoch": 3.299967288191037, - "grad_norm": 0.7982688546180725, - "learning_rate": 8.61965547345399e-05, - "loss": 0.0863, - "step": 50440 - }, - { - "epoch": 3.300621524370298, - "grad_norm": 1.0012849569320679, - "learning_rate": 8.619021699592271e-05, - "loss": 0.0813, - "step": 50450 - }, - { - "epoch": 3.3012757605495584, - "grad_norm": 0.9669886231422424, - "learning_rate": 8.618387803579076e-05, - "loss": 0.0789, - "step": 50460 - }, - { - "epoch": 3.301929996728819, - "grad_norm": 0.7313616871833801, - "learning_rate": 8.617753785435804e-05, - "loss": 0.0641, - "step": 50470 - }, - { - "epoch": 3.30258423290808, - "grad_norm": 0.7947266697883606, - "learning_rate": 8.617119645183856e-05, - "loss": 0.0751, - "step": 50480 - }, - { - "epoch": 3.3032384690873404, - "grad_norm": 0.8369613289833069, - "learning_rate": 8.616485382844631e-05, - "loss": 0.0857, - "step": 50490 - }, - { - "epoch": 3.3038927052666014, - "grad_norm": 1.0216795206069946, - "learning_rate": 8.615850998439542e-05, - "loss": 0.0753, - "step": 50500 - }, - { - "epoch": 3.304546941445862, - "grad_norm": 0.9088335633277893, - "learning_rate": 8.615216491989997e-05, - "loss": 0.0789, - "step": 50510 - }, - { - "epoch": 3.305201177625123, - "grad_norm": 0.9669744372367859, - "learning_rate": 8.614581863517414e-05, - "loss": 0.0717, - "step": 50520 - }, - { - "epoch": 3.3058554138043834, - "grad_norm": 0.8303861021995544, - "learning_rate": 8.613947113043215e-05, - "loss": 0.0739, - "step": 50530 - }, - { - "epoch": 3.306509649983644, - "grad_norm": 0.8657481074333191, - "learning_rate": 8.613312240588822e-05, - "loss": 0.0677, - "step": 50540 - }, - { - "epoch": 3.307163886162905, - "grad_norm": 0.8606223464012146, - "learning_rate": 8.612677246175665e-05, - "loss": 0.0825, - "step": 50550 - }, - { - "epoch": 3.3078181223421654, - "grad_norm": 0.9812949895858765, - "learning_rate": 8.612042129825177e-05, - "loss": 0.0715, - "step": 50560 - }, - { - "epoch": 3.3084723585214264, - "grad_norm": 0.8385498523712158, - "learning_rate": 8.611406891558793e-05, - "loss": 0.0658, - "step": 50570 - }, - { - "epoch": 3.309126594700687, - "grad_norm": 0.820543110370636, - "learning_rate": 8.610771531397957e-05, - "loss": 0.068, - "step": 50580 - }, - { - "epoch": 3.309780830879948, - "grad_norm": 0.8643449544906616, - "learning_rate": 8.61013604936411e-05, - "loss": 0.0748, - "step": 50590 - }, - { - "epoch": 3.3104350670592084, - "grad_norm": 0.8752486705780029, - "learning_rate": 8.609500445478704e-05, - "loss": 0.081, - "step": 50600 - }, - { - "epoch": 3.311089303238469, - "grad_norm": 0.8542597889900208, - "learning_rate": 8.608864719763192e-05, - "loss": 0.0713, - "step": 50610 - }, - { - "epoch": 3.31174353941773, - "grad_norm": 0.7838973999023438, - "learning_rate": 8.608228872239031e-05, - "loss": 0.0758, - "step": 50620 - }, - { - "epoch": 3.3123977755969904, - "grad_norm": 0.9226830005645752, - "learning_rate": 8.607592902927684e-05, - "loss": 0.0815, - "step": 50630 - }, - { - "epoch": 3.3130520117762514, - "grad_norm": 0.9006392955780029, - "learning_rate": 8.606956811850613e-05, - "loss": 0.0697, - "step": 50640 - }, - { - "epoch": 3.313706247955512, - "grad_norm": 0.9188740253448486, - "learning_rate": 8.606320599029292e-05, - "loss": 0.0697, - "step": 50650 - }, - { - "epoch": 3.3143604841347725, - "grad_norm": 0.8508504033088684, - "learning_rate": 8.605684264485192e-05, - "loss": 0.0752, - "step": 50660 - }, - { - "epoch": 3.3150147203140334, - "grad_norm": 0.8384444713592529, - "learning_rate": 8.605047808239791e-05, - "loss": 0.0719, - "step": 50670 - }, - { - "epoch": 3.315668956493294, - "grad_norm": 0.8773384094238281, - "learning_rate": 8.604411230314572e-05, - "loss": 0.0824, - "step": 50680 - }, - { - "epoch": 3.316323192672555, - "grad_norm": 0.7789053320884705, - "learning_rate": 8.603774530731023e-05, - "loss": 0.0652, - "step": 50690 - }, - { - "epoch": 3.3169774288518155, - "grad_norm": 0.7404478788375854, - "learning_rate": 8.60313770951063e-05, - "loss": 0.0704, - "step": 50700 - }, - { - "epoch": 3.317631665031076, - "grad_norm": 1.0024648904800415, - "learning_rate": 8.60250076667489e-05, - "loss": 0.0829, - "step": 50710 - }, - { - "epoch": 3.318285901210337, - "grad_norm": 0.7322053909301758, - "learning_rate": 8.601863702245303e-05, - "loss": 0.0716, - "step": 50720 - }, - { - "epoch": 3.3189401373895975, - "grad_norm": 1.0306543111801147, - "learning_rate": 8.601226516243368e-05, - "loss": 0.0719, - "step": 50730 - }, - { - "epoch": 3.3195943735688584, - "grad_norm": 0.8243963718414307, - "learning_rate": 8.600589208690595e-05, - "loss": 0.0726, - "step": 50740 - }, - { - "epoch": 3.320248609748119, - "grad_norm": 0.7126163244247437, - "learning_rate": 8.599951779608493e-05, - "loss": 0.072, - "step": 50750 - }, - { - "epoch": 3.32090284592738, - "grad_norm": 0.9243032932281494, - "learning_rate": 8.599314229018575e-05, - "loss": 0.0765, - "step": 50760 - }, - { - "epoch": 3.3215570821066405, - "grad_norm": 0.8897315859794617, - "learning_rate": 8.598676556942365e-05, - "loss": 0.0771, - "step": 50770 - }, - { - "epoch": 3.322211318285901, - "grad_norm": 0.8531565070152283, - "learning_rate": 8.598038763401382e-05, - "loss": 0.076, - "step": 50780 - }, - { - "epoch": 3.322865554465162, - "grad_norm": 0.7867346405982971, - "learning_rate": 8.597400848417156e-05, - "loss": 0.0847, - "step": 50790 - }, - { - "epoch": 3.3235197906444225, - "grad_norm": 0.8421074748039246, - "learning_rate": 8.596762812011216e-05, - "loss": 0.0792, - "step": 50800 - }, - { - "epoch": 3.3241740268236835, - "grad_norm": 0.8326323628425598, - "learning_rate": 8.596124654205097e-05, - "loss": 0.0737, - "step": 50810 - }, - { - "epoch": 3.324828263002944, - "grad_norm": 1.0936020612716675, - "learning_rate": 8.595486375020341e-05, - "loss": 0.0802, - "step": 50820 - }, - { - "epoch": 3.325482499182205, - "grad_norm": 1.0565087795257568, - "learning_rate": 8.59484797447849e-05, - "loss": 0.0741, - "step": 50830 - }, - { - "epoch": 3.3261367353614655, - "grad_norm": 0.8278268575668335, - "learning_rate": 8.594209452601092e-05, - "loss": 0.0665, - "step": 50840 - }, - { - "epoch": 3.326790971540726, - "grad_norm": 0.720306932926178, - "learning_rate": 8.593570809409698e-05, - "loss": 0.0753, - "step": 50850 - }, - { - "epoch": 3.327445207719987, - "grad_norm": 0.9338958859443665, - "learning_rate": 8.592932044925866e-05, - "loss": 0.0729, - "step": 50860 - }, - { - "epoch": 3.3280994438992475, - "grad_norm": 0.8932592868804932, - "learning_rate": 8.592293159171155e-05, - "loss": 0.0749, - "step": 50870 - }, - { - "epoch": 3.3287536800785085, - "grad_norm": 0.7628698945045471, - "learning_rate": 8.591654152167128e-05, - "loss": 0.0762, - "step": 50880 - }, - { - "epoch": 3.329407916257769, - "grad_norm": 0.855076789855957, - "learning_rate": 8.591015023935353e-05, - "loss": 0.0792, - "step": 50890 - }, - { - "epoch": 3.33006215243703, - "grad_norm": 0.8656195998191833, - "learning_rate": 8.590375774497406e-05, - "loss": 0.0678, - "step": 50900 - }, - { - "epoch": 3.3307163886162905, - "grad_norm": 1.1251041889190674, - "learning_rate": 8.589736403874858e-05, - "loss": 0.0817, - "step": 50910 - }, - { - "epoch": 3.331370624795551, - "grad_norm": 0.815820038318634, - "learning_rate": 8.589096912089292e-05, - "loss": 0.0795, - "step": 50920 - }, - { - "epoch": 3.332024860974812, - "grad_norm": 0.8184264302253723, - "learning_rate": 8.588457299162293e-05, - "loss": 0.0679, - "step": 50930 - }, - { - "epoch": 3.3326790971540725, - "grad_norm": 0.6833990216255188, - "learning_rate": 8.587817565115449e-05, - "loss": 0.063, - "step": 50940 - }, - { - "epoch": 3.3333333333333335, - "grad_norm": 0.7902950644493103, - "learning_rate": 8.587177709970353e-05, - "loss": 0.0683, - "step": 50950 - }, - { - "epoch": 3.333987569512594, - "grad_norm": 0.8151212334632874, - "learning_rate": 8.586537733748601e-05, - "loss": 0.0757, - "step": 50960 - }, - { - "epoch": 3.334641805691855, - "grad_norm": 0.8784273266792297, - "learning_rate": 8.585897636471796e-05, - "loss": 0.071, - "step": 50970 - }, - { - "epoch": 3.3352960418711155, - "grad_norm": 0.8391785621643066, - "learning_rate": 8.585257418161538e-05, - "loss": 0.0744, - "step": 50980 - }, - { - "epoch": 3.335950278050376, - "grad_norm": 1.0281047821044922, - "learning_rate": 8.584617078839443e-05, - "loss": 0.0855, - "step": 50990 - }, - { - "epoch": 3.336604514229637, - "grad_norm": 0.838431715965271, - "learning_rate": 8.58397661852712e-05, - "loss": 0.0777, - "step": 51000 - }, - { - "epoch": 3.3372587504088975, - "grad_norm": 0.7739060521125793, - "learning_rate": 8.583336037246186e-05, - "loss": 0.067, - "step": 51010 - }, - { - "epoch": 3.3379129865881585, - "grad_norm": 0.9541481733322144, - "learning_rate": 8.582695335018263e-05, - "loss": 0.0777, - "step": 51020 - }, - { - "epoch": 3.338567222767419, - "grad_norm": 0.9411327242851257, - "learning_rate": 8.582054511864977e-05, - "loss": 0.0823, - "step": 51030 - }, - { - "epoch": 3.33922145894668, - "grad_norm": 0.785683274269104, - "learning_rate": 8.581413567807956e-05, - "loss": 0.0685, - "step": 51040 - }, - { - "epoch": 3.3398756951259405, - "grad_norm": 0.7861221432685852, - "learning_rate": 8.580772502868835e-05, - "loss": 0.0741, - "step": 51050 - }, - { - "epoch": 3.340529931305201, - "grad_norm": 0.9560049176216125, - "learning_rate": 8.58013131706925e-05, - "loss": 0.0704, - "step": 51060 - }, - { - "epoch": 3.341184167484462, - "grad_norm": 0.7720146775245667, - "learning_rate": 8.579490010430846e-05, - "loss": 0.069, - "step": 51070 - }, - { - "epoch": 3.3418384036637225, - "grad_norm": 0.7846336364746094, - "learning_rate": 8.578848582975266e-05, - "loss": 0.074, - "step": 51080 - }, - { - "epoch": 3.3424926398429835, - "grad_norm": 0.9714375734329224, - "learning_rate": 8.57820703472416e-05, - "loss": 0.0714, - "step": 51090 - }, - { - "epoch": 3.343146876022244, - "grad_norm": 0.7882207036018372, - "learning_rate": 8.577565365699183e-05, - "loss": 0.0806, - "step": 51100 - }, - { - "epoch": 3.3438011122015046, - "grad_norm": 0.8583884835243225, - "learning_rate": 8.576923575921991e-05, - "loss": 0.076, - "step": 51110 - }, - { - "epoch": 3.3444553483807655, - "grad_norm": 1.0091041326522827, - "learning_rate": 8.576281665414249e-05, - "loss": 0.0838, - "step": 51120 - }, - { - "epoch": 3.345109584560026, - "grad_norm": 0.9896878004074097, - "learning_rate": 8.57563963419762e-05, - "loss": 0.0801, - "step": 51130 - }, - { - "epoch": 3.345763820739287, - "grad_norm": 0.9059678912162781, - "learning_rate": 8.574997482293778e-05, - "loss": 0.0756, - "step": 51140 - }, - { - "epoch": 3.3464180569185475, - "grad_norm": 0.8360334038734436, - "learning_rate": 8.574355209724393e-05, - "loss": 0.0782, - "step": 51150 - }, - { - "epoch": 3.347072293097808, - "grad_norm": 1.1059879064559937, - "learning_rate": 8.573712816511148e-05, - "loss": 0.0771, - "step": 51160 - }, - { - "epoch": 3.347726529277069, - "grad_norm": 0.9057727456092834, - "learning_rate": 8.57307030267572e-05, - "loss": 0.0817, - "step": 51170 - }, - { - "epoch": 3.3483807654563296, - "grad_norm": 0.7811506390571594, - "learning_rate": 8.572427668239802e-05, - "loss": 0.0817, - "step": 51180 - }, - { - "epoch": 3.3490350016355905, - "grad_norm": 0.927189290523529, - "learning_rate": 8.571784913225078e-05, - "loss": 0.0707, - "step": 51190 - }, - { - "epoch": 3.349689237814851, - "grad_norm": 0.7805935740470886, - "learning_rate": 8.571142037653249e-05, - "loss": 0.0763, - "step": 51200 - }, - { - "epoch": 3.350343473994112, - "grad_norm": 0.9339170455932617, - "learning_rate": 8.570499041546007e-05, - "loss": 0.0666, - "step": 51210 - }, - { - "epoch": 3.3509977101733726, - "grad_norm": 0.8374119997024536, - "learning_rate": 8.56985592492506e-05, - "loss": 0.0721, - "step": 51220 - }, - { - "epoch": 3.351651946352633, - "grad_norm": 1.1060223579406738, - "learning_rate": 8.569212687812113e-05, - "loss": 0.0828, - "step": 51230 - }, - { - "epoch": 3.352306182531894, - "grad_norm": 0.7476975917816162, - "learning_rate": 8.568569330228879e-05, - "loss": 0.0794, - "step": 51240 - }, - { - "epoch": 3.3529604187111546, - "grad_norm": 0.9365630149841309, - "learning_rate": 8.56792585219707e-05, - "loss": 0.0771, - "step": 51250 - }, - { - "epoch": 3.3536146548904155, - "grad_norm": 0.8895155787467957, - "learning_rate": 8.567282253738407e-05, - "loss": 0.0798, - "step": 51260 - }, - { - "epoch": 3.354268891069676, - "grad_norm": 0.9719933271408081, - "learning_rate": 8.566638534874612e-05, - "loss": 0.0788, - "step": 51270 - }, - { - "epoch": 3.354923127248937, - "grad_norm": 0.9939890503883362, - "learning_rate": 8.565994695627411e-05, - "loss": 0.0669, - "step": 51280 - }, - { - "epoch": 3.3555773634281976, - "grad_norm": 0.8980028033256531, - "learning_rate": 8.565350736018539e-05, - "loss": 0.0667, - "step": 51290 - }, - { - "epoch": 3.356231599607458, - "grad_norm": 0.8218081593513489, - "learning_rate": 8.564706656069726e-05, - "loss": 0.0762, - "step": 51300 - }, - { - "epoch": 3.356885835786719, - "grad_norm": 0.9311951398849487, - "learning_rate": 8.564062455802718e-05, - "loss": 0.077, - "step": 51310 - }, - { - "epoch": 3.3575400719659796, - "grad_norm": 0.7359833121299744, - "learning_rate": 8.563418135239254e-05, - "loss": 0.0766, - "step": 51320 - }, - { - "epoch": 3.3581943081452406, - "grad_norm": 0.6619185209274292, - "learning_rate": 8.56277369440108e-05, - "loss": 0.0756, - "step": 51330 - }, - { - "epoch": 3.358848544324501, - "grad_norm": 0.9243825078010559, - "learning_rate": 8.562129133309953e-05, - "loss": 0.0796, - "step": 51340 - }, - { - "epoch": 3.359502780503762, - "grad_norm": 0.7941439747810364, - "learning_rate": 8.561484451987626e-05, - "loss": 0.0657, - "step": 51350 - }, - { - "epoch": 3.3601570166830226, - "grad_norm": 1.0141104459762573, - "learning_rate": 8.560839650455857e-05, - "loss": 0.073, - "step": 51360 - }, - { - "epoch": 3.360811252862283, - "grad_norm": 0.7483166456222534, - "learning_rate": 8.560194728736412e-05, - "loss": 0.0696, - "step": 51370 - }, - { - "epoch": 3.361465489041544, - "grad_norm": 0.6753225922584534, - "learning_rate": 8.559549686851057e-05, - "loss": 0.0641, - "step": 51380 - }, - { - "epoch": 3.3621197252208046, - "grad_norm": 0.84939044713974, - "learning_rate": 8.558904524821565e-05, - "loss": 0.0747, - "step": 51390 - }, - { - "epoch": 3.3627739614000656, - "grad_norm": 0.9355828166007996, - "learning_rate": 8.558259242669713e-05, - "loss": 0.0762, - "step": 51400 - }, - { - "epoch": 3.363428197579326, - "grad_norm": 1.0123666524887085, - "learning_rate": 8.557613840417277e-05, - "loss": 0.0796, - "step": 51410 - }, - { - "epoch": 3.364082433758587, - "grad_norm": 0.8886246085166931, - "learning_rate": 8.556968318086047e-05, - "loss": 0.0722, - "step": 51420 - }, - { - "epoch": 3.3647366699378476, - "grad_norm": 0.7857908606529236, - "learning_rate": 8.556322675697806e-05, - "loss": 0.0708, - "step": 51430 - }, - { - "epoch": 3.365390906117108, - "grad_norm": 0.9863922595977783, - "learning_rate": 8.555676913274349e-05, - "loss": 0.0667, - "step": 51440 - }, - { - "epoch": 3.366045142296369, - "grad_norm": 1.3981025218963623, - "learning_rate": 8.55503103083747e-05, - "loss": 0.0731, - "step": 51450 - }, - { - "epoch": 3.3666993784756296, - "grad_norm": 0.8313714265823364, - "learning_rate": 8.55438502840897e-05, - "loss": 0.0696, - "step": 51460 - }, - { - "epoch": 3.3673536146548906, - "grad_norm": 0.7925471067428589, - "learning_rate": 8.553738906010654e-05, - "loss": 0.0685, - "step": 51470 - }, - { - "epoch": 3.368007850834151, - "grad_norm": 0.9922884106636047, - "learning_rate": 8.55309266366433e-05, - "loss": 0.0907, - "step": 51480 - }, - { - "epoch": 3.368662087013412, - "grad_norm": 0.8721675276756287, - "learning_rate": 8.55244630139181e-05, - "loss": 0.0703, - "step": 51490 - }, - { - "epoch": 3.3693163231926726, - "grad_norm": 0.9497895240783691, - "learning_rate": 8.551799819214912e-05, - "loss": 0.0769, - "step": 51500 - }, - { - "epoch": 3.369970559371933, - "grad_norm": 0.9085605144500732, - "learning_rate": 8.551153217155453e-05, - "loss": 0.0754, - "step": 51510 - }, - { - "epoch": 3.370624795551194, - "grad_norm": 0.8060374855995178, - "learning_rate": 8.550506495235262e-05, - "loss": 0.0747, - "step": 51520 - }, - { - "epoch": 3.3712790317304546, - "grad_norm": 0.9739006161689758, - "learning_rate": 8.549859653476164e-05, - "loss": 0.0746, - "step": 51530 - }, - { - "epoch": 3.3719332679097156, - "grad_norm": 0.8732509613037109, - "learning_rate": 8.549212691899993e-05, - "loss": 0.0795, - "step": 51540 - }, - { - "epoch": 3.372587504088976, - "grad_norm": 0.7843785285949707, - "learning_rate": 8.548565610528585e-05, - "loss": 0.0675, - "step": 51550 - }, - { - "epoch": 3.3732417402682366, - "grad_norm": 0.7273816466331482, - "learning_rate": 8.547918409383782e-05, - "loss": 0.0759, - "step": 51560 - }, - { - "epoch": 3.3738959764474976, - "grad_norm": 0.7831087708473206, - "learning_rate": 8.547271088487427e-05, - "loss": 0.0703, - "step": 51570 - }, - { - "epoch": 3.374550212626758, - "grad_norm": 0.7511657476425171, - "learning_rate": 8.54662364786137e-05, - "loss": 0.0706, - "step": 51580 - }, - { - "epoch": 3.375204448806019, - "grad_norm": 0.9097614288330078, - "learning_rate": 8.545976087527463e-05, - "loss": 0.068, - "step": 51590 - }, - { - "epoch": 3.3758586849852796, - "grad_norm": 0.7851161956787109, - "learning_rate": 8.545328407507565e-05, - "loss": 0.074, - "step": 51600 - }, - { - "epoch": 3.37651292116454, - "grad_norm": 0.9306683540344238, - "learning_rate": 8.544680607823534e-05, - "loss": 0.0856, - "step": 51610 - }, - { - "epoch": 3.377167157343801, - "grad_norm": 0.841254472732544, - "learning_rate": 8.544032688497236e-05, - "loss": 0.0771, - "step": 51620 - }, - { - "epoch": 3.3778213935230617, - "grad_norm": 1.2638838291168213, - "learning_rate": 8.543384649550543e-05, - "loss": 0.0771, - "step": 51630 - }, - { - "epoch": 3.3784756297023226, - "grad_norm": 0.802693784236908, - "learning_rate": 8.542736491005322e-05, - "loss": 0.0809, - "step": 51640 - }, - { - "epoch": 3.379129865881583, - "grad_norm": 0.7496511340141296, - "learning_rate": 8.542088212883454e-05, - "loss": 0.0827, - "step": 51650 - }, - { - "epoch": 3.379784102060844, - "grad_norm": 0.8439844250679016, - "learning_rate": 8.541439815206819e-05, - "loss": 0.0753, - "step": 51660 - }, - { - "epoch": 3.3804383382401046, - "grad_norm": 0.7389265894889832, - "learning_rate": 8.540791297997304e-05, - "loss": 0.0754, - "step": 51670 - }, - { - "epoch": 3.381092574419365, - "grad_norm": 0.983440101146698, - "learning_rate": 8.540142661276796e-05, - "loss": 0.067, - "step": 51680 - }, - { - "epoch": 3.381746810598626, - "grad_norm": 0.8338766694068909, - "learning_rate": 8.539493905067189e-05, - "loss": 0.0833, - "step": 51690 - }, - { - "epoch": 3.3824010467778867, - "grad_norm": 0.9599545001983643, - "learning_rate": 8.538845029390378e-05, - "loss": 0.0757, - "step": 51700 - }, - { - "epoch": 3.3830552829571476, - "grad_norm": 0.7370075583457947, - "learning_rate": 8.538196034268268e-05, - "loss": 0.07, - "step": 51710 - }, - { - "epoch": 3.383709519136408, - "grad_norm": 0.8070911765098572, - "learning_rate": 8.537546919722764e-05, - "loss": 0.0717, - "step": 51720 - }, - { - "epoch": 3.384363755315669, - "grad_norm": 0.8971413373947144, - "learning_rate": 8.536897685775772e-05, - "loss": 0.0659, - "step": 51730 - }, - { - "epoch": 3.3850179914949297, - "grad_norm": 0.9960509538650513, - "learning_rate": 8.536248332449207e-05, - "loss": 0.0804, - "step": 51740 - }, - { - "epoch": 3.38567222767419, - "grad_norm": 0.7966985106468201, - "learning_rate": 8.535598859764987e-05, - "loss": 0.0661, - "step": 51750 - }, - { - "epoch": 3.386326463853451, - "grad_norm": 0.9922680854797363, - "learning_rate": 8.534949267745034e-05, - "loss": 0.0726, - "step": 51760 - }, - { - "epoch": 3.3869807000327117, - "grad_norm": 0.8376561403274536, - "learning_rate": 8.534299556411271e-05, - "loss": 0.0679, - "step": 51770 - }, - { - "epoch": 3.3876349362119726, - "grad_norm": 0.915437638759613, - "learning_rate": 8.53364972578563e-05, - "loss": 0.0787, - "step": 51780 - }, - { - "epoch": 3.388289172391233, - "grad_norm": 0.7827186584472656, - "learning_rate": 8.532999775890043e-05, - "loss": 0.0735, - "step": 51790 - }, - { - "epoch": 3.388943408570494, - "grad_norm": 0.7581836581230164, - "learning_rate": 8.532349706746447e-05, - "loss": 0.0699, - "step": 51800 - }, - { - "epoch": 3.3895976447497547, - "grad_norm": 0.7785212397575378, - "learning_rate": 8.531699518376787e-05, - "loss": 0.0668, - "step": 51810 - }, - { - "epoch": 3.390251880929015, - "grad_norm": 0.9209142327308655, - "learning_rate": 8.531049210803003e-05, - "loss": 0.0723, - "step": 51820 - }, - { - "epoch": 3.390906117108276, - "grad_norm": 0.8338945508003235, - "learning_rate": 8.530398784047051e-05, - "loss": 0.081, - "step": 51830 - }, - { - "epoch": 3.3915603532875367, - "grad_norm": 0.8179078102111816, - "learning_rate": 8.529748238130879e-05, - "loss": 0.0715, - "step": 51840 - }, - { - "epoch": 3.3922145894667977, - "grad_norm": 0.7469951510429382, - "learning_rate": 8.529097573076447e-05, - "loss": 0.0812, - "step": 51850 - }, - { - "epoch": 3.392868825646058, - "grad_norm": 0.8803715705871582, - "learning_rate": 8.528446788905718e-05, - "loss": 0.0612, - "step": 51860 - }, - { - "epoch": 3.393523061825319, - "grad_norm": 0.7448040843009949, - "learning_rate": 8.527795885640655e-05, - "loss": 0.0772, - "step": 51870 - }, - { - "epoch": 3.3941772980045797, - "grad_norm": 0.9333431124687195, - "learning_rate": 8.527144863303227e-05, - "loss": 0.0727, - "step": 51880 - }, - { - "epoch": 3.39483153418384, - "grad_norm": 0.8752960562705994, - "learning_rate": 8.526493721915412e-05, - "loss": 0.0793, - "step": 51890 - }, - { - "epoch": 3.395485770363101, - "grad_norm": 0.9031022191047668, - "learning_rate": 8.525842461499185e-05, - "loss": 0.0699, - "step": 51900 - }, - { - "epoch": 3.3961400065423617, - "grad_norm": 0.8795397281646729, - "learning_rate": 8.525191082076527e-05, - "loss": 0.085, - "step": 51910 - }, - { - "epoch": 3.3967942427216227, - "grad_norm": 0.88941890001297, - "learning_rate": 8.524539583669426e-05, - "loss": 0.0742, - "step": 51920 - }, - { - "epoch": 3.397448478900883, - "grad_norm": 0.9229047894477844, - "learning_rate": 8.52388796629987e-05, - "loss": 0.0754, - "step": 51930 - }, - { - "epoch": 3.398102715080144, - "grad_norm": 0.9691248536109924, - "learning_rate": 8.523236229989855e-05, - "loss": 0.0729, - "step": 51940 - }, - { - "epoch": 3.3987569512594047, - "grad_norm": 0.8035882711410522, - "learning_rate": 8.522584374761375e-05, - "loss": 0.0768, - "step": 51950 - }, - { - "epoch": 3.399411187438665, - "grad_norm": 0.7715499997138977, - "learning_rate": 8.521932400636434e-05, - "loss": 0.075, - "step": 51960 - }, - { - "epoch": 3.400065423617926, - "grad_norm": 0.75547194480896, - "learning_rate": 8.52128030763704e-05, - "loss": 0.0622, - "step": 51970 - }, - { - "epoch": 3.4007196597971867, - "grad_norm": 1.1841288805007935, - "learning_rate": 8.520628095785199e-05, - "loss": 0.0721, - "step": 51980 - }, - { - "epoch": 3.4013738959764477, - "grad_norm": 0.8948665261268616, - "learning_rate": 8.519975765102927e-05, - "loss": 0.0674, - "step": 51990 - }, - { - "epoch": 3.402028132155708, - "grad_norm": 0.9164159297943115, - "learning_rate": 8.519323315612242e-05, - "loss": 0.0703, - "step": 52000 - }, - { - "epoch": 3.4026823683349687, - "grad_norm": 0.8155785799026489, - "learning_rate": 8.518670747335165e-05, - "loss": 0.0767, - "step": 52010 - }, - { - "epoch": 3.4033366045142297, - "grad_norm": 0.8073797821998596, - "learning_rate": 8.518018060293722e-05, - "loss": 0.0724, - "step": 52020 - }, - { - "epoch": 3.4039908406934902, - "grad_norm": 0.816716194152832, - "learning_rate": 8.517365254509942e-05, - "loss": 0.0661, - "step": 52030 - }, - { - "epoch": 3.404645076872751, - "grad_norm": 0.8661830425262451, - "learning_rate": 8.516712330005862e-05, - "loss": 0.0731, - "step": 52040 - }, - { - "epoch": 3.4052993130520117, - "grad_norm": 0.8978433012962341, - "learning_rate": 8.516059286803517e-05, - "loss": 0.0668, - "step": 52050 - }, - { - "epoch": 3.4059535492312722, - "grad_norm": 0.892529308795929, - "learning_rate": 8.515406124924949e-05, - "loss": 0.082, - "step": 52060 - }, - { - "epoch": 3.406607785410533, - "grad_norm": 0.6987180709838867, - "learning_rate": 8.514752844392206e-05, - "loss": 0.0676, - "step": 52070 - }, - { - "epoch": 3.4072620215897937, - "grad_norm": 0.8603876829147339, - "learning_rate": 8.514099445227336e-05, - "loss": 0.0658, - "step": 52080 - }, - { - "epoch": 3.4079162577690547, - "grad_norm": 0.7895892858505249, - "learning_rate": 8.513445927452396e-05, - "loss": 0.0637, - "step": 52090 - }, - { - "epoch": 3.4085704939483152, - "grad_norm": 0.8536374568939209, - "learning_rate": 8.51279229108944e-05, - "loss": 0.0765, - "step": 52100 - }, - { - "epoch": 3.409224730127576, - "grad_norm": 1.0041425228118896, - "learning_rate": 8.51213853616053e-05, - "loss": 0.0794, - "step": 52110 - }, - { - "epoch": 3.4098789663068367, - "grad_norm": 0.9329466223716736, - "learning_rate": 8.511484662687737e-05, - "loss": 0.0693, - "step": 52120 - }, - { - "epoch": 3.4105332024860973, - "grad_norm": 0.9463549852371216, - "learning_rate": 8.510830670693124e-05, - "loss": 0.073, - "step": 52130 - }, - { - "epoch": 3.4111874386653582, - "grad_norm": 0.8371836543083191, - "learning_rate": 8.51017656019877e-05, - "loss": 0.0707, - "step": 52140 - }, - { - "epoch": 3.4118416748446188, - "grad_norm": 1.1609052419662476, - "learning_rate": 8.50952233122675e-05, - "loss": 0.0705, - "step": 52150 - }, - { - "epoch": 3.4124959110238797, - "grad_norm": 0.9662759900093079, - "learning_rate": 8.50886798379915e-05, - "loss": 0.0732, - "step": 52160 - }, - { - "epoch": 3.4131501472031402, - "grad_norm": 0.9604242444038391, - "learning_rate": 8.50821351793805e-05, - "loss": 0.0683, - "step": 52170 - }, - { - "epoch": 3.413804383382401, - "grad_norm": 0.8330959677696228, - "learning_rate": 8.507558933665545e-05, - "loss": 0.0712, - "step": 52180 - }, - { - "epoch": 3.4144586195616617, - "grad_norm": 0.7776506543159485, - "learning_rate": 8.506904231003726e-05, - "loss": 0.0555, - "step": 52190 - }, - { - "epoch": 3.4151128557409223, - "grad_norm": 0.8300237059593201, - "learning_rate": 8.506249409974694e-05, - "loss": 0.0837, - "step": 52200 - }, - { - "epoch": 3.4157670919201832, - "grad_norm": 0.8520910143852234, - "learning_rate": 8.505594470600546e-05, - "loss": 0.0696, - "step": 52210 - }, - { - "epoch": 3.4164213280994438, - "grad_norm": 0.8499581217765808, - "learning_rate": 8.504939412903394e-05, - "loss": 0.0756, - "step": 52220 - }, - { - "epoch": 3.4170755642787047, - "grad_norm": 1.0196528434753418, - "learning_rate": 8.504284236905342e-05, - "loss": 0.079, - "step": 52230 - }, - { - "epoch": 3.4177298004579653, - "grad_norm": 1.0071736574172974, - "learning_rate": 8.503628942628508e-05, - "loss": 0.0836, - "step": 52240 - }, - { - "epoch": 3.4183840366372262, - "grad_norm": 0.8221054673194885, - "learning_rate": 8.502973530095008e-05, - "loss": 0.0666, - "step": 52250 - }, - { - "epoch": 3.4190382728164868, - "grad_norm": 0.9382101893424988, - "learning_rate": 8.502317999326965e-05, - "loss": 0.0742, - "step": 52260 - }, - { - "epoch": 3.4196925089957473, - "grad_norm": 0.9391849637031555, - "learning_rate": 8.501662350346505e-05, - "loss": 0.0725, - "step": 52270 - }, - { - "epoch": 3.4203467451750083, - "grad_norm": 0.7886925339698792, - "learning_rate": 8.501006583175757e-05, - "loss": 0.0861, - "step": 52280 - }, - { - "epoch": 3.4210009813542688, - "grad_norm": 0.7476629018783569, - "learning_rate": 8.500350697836855e-05, - "loss": 0.0649, - "step": 52290 - }, - { - "epoch": 3.4216552175335297, - "grad_norm": 1.02727472782135, - "learning_rate": 8.499694694351936e-05, - "loss": 0.0923, - "step": 52300 - }, - { - "epoch": 3.4223094537127903, - "grad_norm": 1.176798939704895, - "learning_rate": 8.499038572743144e-05, - "loss": 0.0644, - "step": 52310 - }, - { - "epoch": 3.4229636898920512, - "grad_norm": 0.8501316905021667, - "learning_rate": 8.498382333032622e-05, - "loss": 0.0703, - "step": 52320 - }, - { - "epoch": 3.4236179260713118, - "grad_norm": 0.7579331994056702, - "learning_rate": 8.497725975242523e-05, - "loss": 0.0661, - "step": 52330 - }, - { - "epoch": 3.4242721622505723, - "grad_norm": 0.8716086149215698, - "learning_rate": 8.497069499394998e-05, - "loss": 0.0781, - "step": 52340 - }, - { - "epoch": 3.4249263984298333, - "grad_norm": 1.071758508682251, - "learning_rate": 8.496412905512207e-05, - "loss": 0.0681, - "step": 52350 - }, - { - "epoch": 3.425580634609094, - "grad_norm": 0.8886385560035706, - "learning_rate": 8.49575619361631e-05, - "loss": 0.0763, - "step": 52360 - }, - { - "epoch": 3.4262348707883548, - "grad_norm": 0.9664177298545837, - "learning_rate": 8.495099363729472e-05, - "loss": 0.0768, - "step": 52370 - }, - { - "epoch": 3.4268891069676153, - "grad_norm": 0.7250670194625854, - "learning_rate": 8.494442415873868e-05, - "loss": 0.0724, - "step": 52380 - }, - { - "epoch": 3.4275433431468763, - "grad_norm": 0.9024300575256348, - "learning_rate": 8.493785350071665e-05, - "loss": 0.0842, - "step": 52390 - }, - { - "epoch": 3.428197579326137, - "grad_norm": 0.9288867712020874, - "learning_rate": 8.493128166345046e-05, - "loss": 0.0703, - "step": 52400 - }, - { - "epoch": 3.4288518155053973, - "grad_norm": 0.7886371612548828, - "learning_rate": 8.492470864716188e-05, - "loss": 0.0677, - "step": 52410 - }, - { - "epoch": 3.4295060516846583, - "grad_norm": 0.9110251665115356, - "learning_rate": 8.491813445207282e-05, - "loss": 0.0747, - "step": 52420 - }, - { - "epoch": 3.430160287863919, - "grad_norm": 0.7221360802650452, - "learning_rate": 8.491155907840511e-05, - "loss": 0.067, - "step": 52430 - }, - { - "epoch": 3.4308145240431798, - "grad_norm": 0.6658129096031189, - "learning_rate": 8.490498252638074e-05, - "loss": 0.0795, - "step": 52440 - }, - { - "epoch": 3.4314687602224403, - "grad_norm": 0.8011776208877563, - "learning_rate": 8.489840479622166e-05, - "loss": 0.068, - "step": 52450 - }, - { - "epoch": 3.432122996401701, - "grad_norm": 0.8690958619117737, - "learning_rate": 8.48918258881499e-05, - "loss": 0.0713, - "step": 52460 - }, - { - "epoch": 3.432777232580962, - "grad_norm": 0.8399226069450378, - "learning_rate": 8.488524580238752e-05, - "loss": 0.0726, - "step": 52470 - }, - { - "epoch": 3.4334314687602223, - "grad_norm": 0.7656417489051819, - "learning_rate": 8.487866453915658e-05, - "loss": 0.0731, - "step": 52480 - }, - { - "epoch": 3.4340857049394833, - "grad_norm": 0.8498892784118652, - "learning_rate": 8.487208209867928e-05, - "loss": 0.0818, - "step": 52490 - }, - { - "epoch": 3.434739941118744, - "grad_norm": 0.9708839058876038, - "learning_rate": 8.48654984811777e-05, - "loss": 0.0719, - "step": 52500 - }, - { - "epoch": 3.4353941772980043, - "grad_norm": 0.9287952184677124, - "learning_rate": 8.485891368687415e-05, - "loss": 0.0764, - "step": 52510 - }, - { - "epoch": 3.4360484134772653, - "grad_norm": 0.7302289605140686, - "learning_rate": 8.485232771599081e-05, - "loss": 0.0717, - "step": 52520 - }, - { - "epoch": 3.436702649656526, - "grad_norm": 0.9905624985694885, - "learning_rate": 8.484574056875003e-05, - "loss": 0.0685, - "step": 52530 - }, - { - "epoch": 3.437356885835787, - "grad_norm": 0.8764323592185974, - "learning_rate": 8.483915224537411e-05, - "loss": 0.0694, - "step": 52540 - }, - { - "epoch": 3.4380111220150473, - "grad_norm": 0.85755854845047, - "learning_rate": 8.483256274608544e-05, - "loss": 0.0723, - "step": 52550 - }, - { - "epoch": 3.4386653581943083, - "grad_norm": 0.8264038562774658, - "learning_rate": 8.482597207110642e-05, - "loss": 0.0667, - "step": 52560 - }, - { - "epoch": 3.439319594373569, - "grad_norm": 0.8556039333343506, - "learning_rate": 8.481938022065951e-05, - "loss": 0.0814, - "step": 52570 - }, - { - "epoch": 3.4399738305528293, - "grad_norm": 0.8458713293075562, - "learning_rate": 8.48127871949672e-05, - "loss": 0.0724, - "step": 52580 - }, - { - "epoch": 3.4406280667320903, - "grad_norm": 0.9362344145774841, - "learning_rate": 8.480619299425202e-05, - "loss": 0.0773, - "step": 52590 - }, - { - "epoch": 3.441282302911351, - "grad_norm": 1.080499291419983, - "learning_rate": 8.479959761873655e-05, - "loss": 0.0651, - "step": 52600 - }, - { - "epoch": 3.441936539090612, - "grad_norm": 1.0057145357131958, - "learning_rate": 8.479300106864338e-05, - "loss": 0.0717, - "step": 52610 - }, - { - "epoch": 3.4425907752698723, - "grad_norm": 0.8773415684700012, - "learning_rate": 8.478640334419519e-05, - "loss": 0.0668, - "step": 52620 - }, - { - "epoch": 3.4432450114491333, - "grad_norm": 0.7790507674217224, - "learning_rate": 8.477980444561465e-05, - "loss": 0.0702, - "step": 52630 - }, - { - "epoch": 3.443899247628394, - "grad_norm": 0.7044256925582886, - "learning_rate": 8.47732043731245e-05, - "loss": 0.0746, - "step": 52640 - }, - { - "epoch": 3.4445534838076544, - "grad_norm": 0.7038286924362183, - "learning_rate": 8.476660312694751e-05, - "loss": 0.0774, - "step": 52650 - }, - { - "epoch": 3.4452077199869153, - "grad_norm": 0.8981152772903442, - "learning_rate": 8.476000070730647e-05, - "loss": 0.0802, - "step": 52660 - }, - { - "epoch": 3.445861956166176, - "grad_norm": 0.8281787633895874, - "learning_rate": 8.475339711442428e-05, - "loss": 0.0639, - "step": 52670 - }, - { - "epoch": 3.446516192345437, - "grad_norm": 0.7848443388938904, - "learning_rate": 8.474679234852377e-05, - "loss": 0.0688, - "step": 52680 - }, - { - "epoch": 3.4471704285246973, - "grad_norm": 0.9246609210968018, - "learning_rate": 8.474018640982789e-05, - "loss": 0.0724, - "step": 52690 - }, - { - "epoch": 3.4478246647039583, - "grad_norm": 0.9001889228820801, - "learning_rate": 8.473357929855958e-05, - "loss": 0.0872, - "step": 52700 - }, - { - "epoch": 3.448478900883219, - "grad_norm": 0.7608310580253601, - "learning_rate": 8.472697101494192e-05, - "loss": 0.076, - "step": 52710 - }, - { - "epoch": 3.4491331370624794, - "grad_norm": 0.9559541940689087, - "learning_rate": 8.472036155919791e-05, - "loss": 0.071, - "step": 52720 - }, - { - "epoch": 3.4497873732417403, - "grad_norm": 0.8626623153686523, - "learning_rate": 8.471375093155061e-05, - "loss": 0.0722, - "step": 52730 - }, - { - "epoch": 3.450441609421001, - "grad_norm": 0.6806735992431641, - "learning_rate": 8.470713913222321e-05, - "loss": 0.0697, - "step": 52740 - }, - { - "epoch": 3.451095845600262, - "grad_norm": 0.9566436409950256, - "learning_rate": 8.470052616143883e-05, - "loss": 0.0716, - "step": 52750 - }, - { - "epoch": 3.4517500817795224, - "grad_norm": 0.8389673829078674, - "learning_rate": 8.469391201942068e-05, - "loss": 0.0661, - "step": 52760 - }, - { - "epoch": 3.4524043179587833, - "grad_norm": 0.8601690530776978, - "learning_rate": 8.468729670639201e-05, - "loss": 0.0764, - "step": 52770 - }, - { - "epoch": 3.453058554138044, - "grad_norm": 0.8300301432609558, - "learning_rate": 8.468068022257611e-05, - "loss": 0.0793, - "step": 52780 - }, - { - "epoch": 3.4537127903173044, - "grad_norm": 0.8958084583282471, - "learning_rate": 8.46740625681963e-05, - "loss": 0.0737, - "step": 52790 - }, - { - "epoch": 3.4543670264965654, - "grad_norm": 0.9533824920654297, - "learning_rate": 8.466744374347593e-05, - "loss": 0.0647, - "step": 52800 - }, - { - "epoch": 3.455021262675826, - "grad_norm": 0.7993425130844116, - "learning_rate": 8.466082374863844e-05, - "loss": 0.0792, - "step": 52810 - }, - { - "epoch": 3.455675498855087, - "grad_norm": 0.8701884746551514, - "learning_rate": 8.465420258390723e-05, - "loss": 0.0681, - "step": 52820 - }, - { - "epoch": 3.4563297350343474, - "grad_norm": 0.8877395987510681, - "learning_rate": 8.464758024950581e-05, - "loss": 0.0706, - "step": 52830 - }, - { - "epoch": 3.4569839712136083, - "grad_norm": 0.9851657152175903, - "learning_rate": 8.464095674565769e-05, - "loss": 0.0668, - "step": 52840 - }, - { - "epoch": 3.457638207392869, - "grad_norm": 1.3369970321655273, - "learning_rate": 8.46343320725864e-05, - "loss": 0.0743, - "step": 52850 - }, - { - "epoch": 3.4582924435721294, - "grad_norm": 1.114278793334961, - "learning_rate": 8.462770623051561e-05, - "loss": 0.0723, - "step": 52860 - }, - { - "epoch": 3.4589466797513904, - "grad_norm": 0.8872862458229065, - "learning_rate": 8.46210792196689e-05, - "loss": 0.0695, - "step": 52870 - }, - { - "epoch": 3.459600915930651, - "grad_norm": 0.9526658654212952, - "learning_rate": 8.461445104026997e-05, - "loss": 0.076, - "step": 52880 - }, - { - "epoch": 3.460255152109912, - "grad_norm": 0.7298262715339661, - "learning_rate": 8.460782169254254e-05, - "loss": 0.0723, - "step": 52890 - }, - { - "epoch": 3.4609093882891724, - "grad_norm": 0.845991313457489, - "learning_rate": 8.460119117671037e-05, - "loss": 0.0694, - "step": 52900 - }, - { - "epoch": 3.4615636244684334, - "grad_norm": 0.8948342800140381, - "learning_rate": 8.459455949299724e-05, - "loss": 0.0685, - "step": 52910 - }, - { - "epoch": 3.462217860647694, - "grad_norm": 0.7942617535591125, - "learning_rate": 8.458792664162702e-05, - "loss": 0.0754, - "step": 52920 - }, - { - "epoch": 3.4628720968269544, - "grad_norm": 0.959294319152832, - "learning_rate": 8.458129262282355e-05, - "loss": 0.0776, - "step": 52930 - }, - { - "epoch": 3.4635263330062154, - "grad_norm": 0.7469987273216248, - "learning_rate": 8.457465743681077e-05, - "loss": 0.0724, - "step": 52940 - }, - { - "epoch": 3.464180569185476, - "grad_norm": 0.8524947166442871, - "learning_rate": 8.456802108381261e-05, - "loss": 0.0758, - "step": 52950 - }, - { - "epoch": 3.4648348053647364, - "grad_norm": 0.8579651713371277, - "learning_rate": 8.45613835640531e-05, - "loss": 0.0835, - "step": 52960 - }, - { - "epoch": 3.4654890415439974, - "grad_norm": 0.9005305767059326, - "learning_rate": 8.455474487775625e-05, - "loss": 0.0741, - "step": 52970 - }, - { - "epoch": 3.466143277723258, - "grad_norm": 0.8470668792724609, - "learning_rate": 8.454810502514614e-05, - "loss": 0.0657, - "step": 52980 - }, - { - "epoch": 3.466797513902519, - "grad_norm": 0.7774618864059448, - "learning_rate": 8.454146400644687e-05, - "loss": 0.0734, - "step": 52990 - }, - { - "epoch": 3.4674517500817794, - "grad_norm": 0.7881149649620056, - "learning_rate": 8.453482182188259e-05, - "loss": 0.0763, - "step": 53000 - }, - { - "epoch": 3.4681059862610404, - "grad_norm": 0.9058419466018677, - "learning_rate": 8.452817847167753e-05, - "loss": 0.0701, - "step": 53010 - }, - { - "epoch": 3.468760222440301, - "grad_norm": 0.927578866481781, - "learning_rate": 8.452153395605587e-05, - "loss": 0.0646, - "step": 53020 - }, - { - "epoch": 3.4694144586195614, - "grad_norm": 0.8536058664321899, - "learning_rate": 8.451488827524192e-05, - "loss": 0.0701, - "step": 53030 - }, - { - "epoch": 3.4700686947988224, - "grad_norm": 0.7442119717597961, - "learning_rate": 8.450824142945997e-05, - "loss": 0.0696, - "step": 53040 - }, - { - "epoch": 3.470722930978083, - "grad_norm": 0.7530878186225891, - "learning_rate": 8.450159341893436e-05, - "loss": 0.0736, - "step": 53050 - }, - { - "epoch": 3.471377167157344, - "grad_norm": 0.8248975276947021, - "learning_rate": 8.449494424388951e-05, - "loss": 0.0658, - "step": 53060 - }, - { - "epoch": 3.4720314033366044, - "grad_norm": 0.973678708076477, - "learning_rate": 8.44882939045498e-05, - "loss": 0.0659, - "step": 53070 - }, - { - "epoch": 3.4726856395158654, - "grad_norm": 0.950872004032135, - "learning_rate": 8.448164240113972e-05, - "loss": 0.0722, - "step": 53080 - }, - { - "epoch": 3.473339875695126, - "grad_norm": 0.7618328332901001, - "learning_rate": 8.447498973388379e-05, - "loss": 0.0724, - "step": 53090 - }, - { - "epoch": 3.4739941118743864, - "grad_norm": 0.7460528612136841, - "learning_rate": 8.446833590300656e-05, - "loss": 0.0718, - "step": 53100 - }, - { - "epoch": 3.4746483480536474, - "grad_norm": 1.07584810256958, - "learning_rate": 8.446168090873257e-05, - "loss": 0.0778, - "step": 53110 - }, - { - "epoch": 3.475302584232908, - "grad_norm": 0.7845546007156372, - "learning_rate": 8.445502475128649e-05, - "loss": 0.0703, - "step": 53120 - }, - { - "epoch": 3.475956820412169, - "grad_norm": 0.9104862809181213, - "learning_rate": 8.444836743089294e-05, - "loss": 0.0711, - "step": 53130 - }, - { - "epoch": 3.4766110565914294, - "grad_norm": 0.8521603941917419, - "learning_rate": 8.444170894777665e-05, - "loss": 0.0751, - "step": 53140 - }, - { - "epoch": 3.4772652927706904, - "grad_norm": 1.0778313875198364, - "learning_rate": 8.443504930216237e-05, - "loss": 0.0772, - "step": 53150 - }, - { - "epoch": 3.477919528949951, - "grad_norm": 1.079459309577942, - "learning_rate": 8.442838849427486e-05, - "loss": 0.086, - "step": 53160 - }, - { - "epoch": 3.4785737651292115, - "grad_norm": 0.8837111592292786, - "learning_rate": 8.442172652433895e-05, - "loss": 0.0755, - "step": 53170 - }, - { - "epoch": 3.4792280013084724, - "grad_norm": 0.906924843788147, - "learning_rate": 8.441506339257949e-05, - "loss": 0.0708, - "step": 53180 - }, - { - "epoch": 3.479882237487733, - "grad_norm": 0.8300266861915588, - "learning_rate": 8.440839909922139e-05, - "loss": 0.0653, - "step": 53190 - }, - { - "epoch": 3.480536473666994, - "grad_norm": 0.9430700540542603, - "learning_rate": 8.440173364448958e-05, - "loss": 0.0642, - "step": 53200 - }, - { - "epoch": 3.4811907098462544, - "grad_norm": 0.9260400533676147, - "learning_rate": 8.439506702860902e-05, - "loss": 0.0705, - "step": 53210 - }, - { - "epoch": 3.4818449460255154, - "grad_norm": 0.8879568576812744, - "learning_rate": 8.438839925180476e-05, - "loss": 0.0817, - "step": 53220 - }, - { - "epoch": 3.482499182204776, - "grad_norm": 0.8787881731987, - "learning_rate": 8.438173031430185e-05, - "loss": 0.072, - "step": 53230 - }, - { - "epoch": 3.4831534183840365, - "grad_norm": 0.9385049343109131, - "learning_rate": 8.437506021632535e-05, - "loss": 0.0777, - "step": 53240 - }, - { - "epoch": 3.4838076545632974, - "grad_norm": 0.8981115818023682, - "learning_rate": 8.436838895810042e-05, - "loss": 0.0648, - "step": 53250 - }, - { - "epoch": 3.484461890742558, - "grad_norm": 0.8167151808738708, - "learning_rate": 8.436171653985223e-05, - "loss": 0.0831, - "step": 53260 - }, - { - "epoch": 3.485116126921819, - "grad_norm": 1.022867202758789, - "learning_rate": 8.4355042961806e-05, - "loss": 0.0701, - "step": 53270 - }, - { - "epoch": 3.4857703631010795, - "grad_norm": 0.6986026167869568, - "learning_rate": 8.434836822418697e-05, - "loss": 0.0672, - "step": 53280 - }, - { - "epoch": 3.4864245992803404, - "grad_norm": 0.8744747042655945, - "learning_rate": 8.434169232722043e-05, - "loss": 0.0703, - "step": 53290 - }, - { - "epoch": 3.487078835459601, - "grad_norm": 1.0003246068954468, - "learning_rate": 8.433501527113169e-05, - "loss": 0.0639, - "step": 53300 - }, - { - "epoch": 3.4877330716388615, - "grad_norm": 0.8735449314117432, - "learning_rate": 8.432833705614616e-05, - "loss": 0.0683, - "step": 53310 - }, - { - "epoch": 3.4883873078181225, - "grad_norm": 0.8451821804046631, - "learning_rate": 8.43216576824892e-05, - "loss": 0.0666, - "step": 53320 - }, - { - "epoch": 3.489041543997383, - "grad_norm": 0.8688971996307373, - "learning_rate": 8.43149771503863e-05, - "loss": 0.0644, - "step": 53330 - }, - { - "epoch": 3.489695780176644, - "grad_norm": 0.7318007349967957, - "learning_rate": 8.430829546006293e-05, - "loss": 0.0689, - "step": 53340 - }, - { - "epoch": 3.4903500163559045, - "grad_norm": 0.8060617446899414, - "learning_rate": 8.430161261174461e-05, - "loss": 0.0713, - "step": 53350 - }, - { - "epoch": 3.4910042525351654, - "grad_norm": 0.8988872170448303, - "learning_rate": 8.42949286056569e-05, - "loss": 0.0741, - "step": 53360 - }, - { - "epoch": 3.491658488714426, - "grad_norm": 1.0134598016738892, - "learning_rate": 8.42882434420254e-05, - "loss": 0.0698, - "step": 53370 - }, - { - "epoch": 3.4923127248936865, - "grad_norm": 0.8395692706108093, - "learning_rate": 8.428155712107577e-05, - "loss": 0.064, - "step": 53380 - }, - { - "epoch": 3.4929669610729475, - "grad_norm": 0.8911391496658325, - "learning_rate": 8.427486964303368e-05, - "loss": 0.0741, - "step": 53390 - }, - { - "epoch": 3.493621197252208, - "grad_norm": 0.7914614081382751, - "learning_rate": 8.426818100812486e-05, - "loss": 0.073, - "step": 53400 - }, - { - "epoch": 3.4942754334314685, - "grad_norm": 0.8505756855010986, - "learning_rate": 8.426149121657504e-05, - "loss": 0.0766, - "step": 53410 - }, - { - "epoch": 3.4949296696107295, - "grad_norm": 0.6521218419075012, - "learning_rate": 8.425480026861006e-05, - "loss": 0.0668, - "step": 53420 - }, - { - "epoch": 3.49558390578999, - "grad_norm": 0.7567178010940552, - "learning_rate": 8.424810816445571e-05, - "loss": 0.0648, - "step": 53430 - }, - { - "epoch": 3.496238141969251, - "grad_norm": 0.8685494065284729, - "learning_rate": 8.42414149043379e-05, - "loss": 0.0749, - "step": 53440 - }, - { - "epoch": 3.4968923781485115, - "grad_norm": 0.8376366496086121, - "learning_rate": 8.423472048848254e-05, - "loss": 0.0722, - "step": 53450 - }, - { - "epoch": 3.4975466143277725, - "grad_norm": 0.8271428346633911, - "learning_rate": 8.422802491711557e-05, - "loss": 0.0799, - "step": 53460 - }, - { - "epoch": 3.498200850507033, - "grad_norm": 0.9023700952529907, - "learning_rate": 8.4221328190463e-05, - "loss": 0.0633, - "step": 53470 - }, - { - "epoch": 3.4988550866862935, - "grad_norm": 0.8496003150939941, - "learning_rate": 8.421463030875085e-05, - "loss": 0.0744, - "step": 53480 - }, - { - "epoch": 3.4995093228655545, - "grad_norm": 0.7680093050003052, - "learning_rate": 8.420793127220521e-05, - "loss": 0.0715, - "step": 53490 - }, - { - "epoch": 3.500163559044815, - "grad_norm": 0.992078423500061, - "learning_rate": 8.420123108105215e-05, - "loss": 0.0807, - "step": 53500 - }, - { - "epoch": 3.500817795224076, - "grad_norm": 0.7784678936004639, - "learning_rate": 8.419452973551786e-05, - "loss": 0.0692, - "step": 53510 - }, - { - "epoch": 3.5014720314033365, - "grad_norm": 0.8641963601112366, - "learning_rate": 8.418782723582852e-05, - "loss": 0.0731, - "step": 53520 - }, - { - "epoch": 3.5021262675825975, - "grad_norm": 0.8115648627281189, - "learning_rate": 8.418112358221036e-05, - "loss": 0.076, - "step": 53530 - }, - { - "epoch": 3.502780503761858, - "grad_norm": 0.9229583144187927, - "learning_rate": 8.417441877488961e-05, - "loss": 0.0721, - "step": 53540 - }, - { - "epoch": 3.5034347399411185, - "grad_norm": 0.9231205582618713, - "learning_rate": 8.416771281409262e-05, - "loss": 0.0679, - "step": 53550 - }, - { - "epoch": 3.5040889761203795, - "grad_norm": 0.8276332020759583, - "learning_rate": 8.41610057000457e-05, - "loss": 0.0737, - "step": 53560 - }, - { - "epoch": 3.50474321229964, - "grad_norm": 0.75644451379776, - "learning_rate": 8.415429743297524e-05, - "loss": 0.0627, - "step": 53570 - }, - { - "epoch": 3.505397448478901, - "grad_norm": 0.7527329325675964, - "learning_rate": 8.41475880131077e-05, - "loss": 0.0754, - "step": 53580 - }, - { - "epoch": 3.5060516846581615, - "grad_norm": 0.8161507248878479, - "learning_rate": 8.414087744066947e-05, - "loss": 0.065, - "step": 53590 - }, - { - "epoch": 3.5067059208374225, - "grad_norm": 0.851140558719635, - "learning_rate": 8.413416571588713e-05, - "loss": 0.0655, - "step": 53600 - }, - { - "epoch": 3.507360157016683, - "grad_norm": 0.8948183059692383, - "learning_rate": 8.412745283898714e-05, - "loss": 0.0698, - "step": 53610 - }, - { - "epoch": 3.5080143931959435, - "grad_norm": 1.1371095180511475, - "learning_rate": 8.412073881019613e-05, - "loss": 0.0762, - "step": 53620 - }, - { - "epoch": 3.5086686293752045, - "grad_norm": 0.9154216647148132, - "learning_rate": 8.41140236297407e-05, - "loss": 0.0722, - "step": 53630 - }, - { - "epoch": 3.509322865554465, - "grad_norm": 0.8594346046447754, - "learning_rate": 8.41073072978475e-05, - "loss": 0.0718, - "step": 53640 - }, - { - "epoch": 3.509977101733726, - "grad_norm": 0.9689813852310181, - "learning_rate": 8.410058981474324e-05, - "loss": 0.0693, - "step": 53650 - }, - { - "epoch": 3.5106313379129865, - "grad_norm": 0.80068039894104, - "learning_rate": 8.409387118065464e-05, - "loss": 0.0758, - "step": 53660 - }, - { - "epoch": 3.5112855740922475, - "grad_norm": 0.8130351901054382, - "learning_rate": 8.408715139580846e-05, - "loss": 0.0785, - "step": 53670 - }, - { - "epoch": 3.511939810271508, - "grad_norm": 0.758581817150116, - "learning_rate": 8.408043046043154e-05, - "loss": 0.0636, - "step": 53680 - }, - { - "epoch": 3.5125940464507686, - "grad_norm": 0.955032467842102, - "learning_rate": 8.407370837475071e-05, - "loss": 0.0794, - "step": 53690 - }, - { - "epoch": 3.5132482826300295, - "grad_norm": 0.7809516191482544, - "learning_rate": 8.406698513899285e-05, - "loss": 0.0668, - "step": 53700 - }, - { - "epoch": 3.51390251880929, - "grad_norm": 0.8768904805183411, - "learning_rate": 8.406026075338489e-05, - "loss": 0.077, - "step": 53710 - }, - { - "epoch": 3.514556754988551, - "grad_norm": 1.029979944229126, - "learning_rate": 8.405353521815382e-05, - "loss": 0.0761, - "step": 53720 - }, - { - "epoch": 3.5152109911678115, - "grad_norm": 0.7512595057487488, - "learning_rate": 8.404680853352662e-05, - "loss": 0.0804, - "step": 53730 - }, - { - "epoch": 3.5158652273470725, - "grad_norm": 0.8705400824546814, - "learning_rate": 8.404008069973035e-05, - "loss": 0.0709, - "step": 53740 - }, - { - "epoch": 3.516519463526333, - "grad_norm": 0.9716213941574097, - "learning_rate": 8.403335171699209e-05, - "loss": 0.0821, - "step": 53750 - }, - { - "epoch": 3.5171736997055936, - "grad_norm": 0.7278077006340027, - "learning_rate": 8.402662158553894e-05, - "loss": 0.0783, - "step": 53760 - }, - { - "epoch": 3.5178279358848545, - "grad_norm": 0.8177416324615479, - "learning_rate": 8.401989030559807e-05, - "loss": 0.0697, - "step": 53770 - }, - { - "epoch": 3.518482172064115, - "grad_norm": 0.7998149991035461, - "learning_rate": 8.401315787739667e-05, - "loss": 0.0764, - "step": 53780 - }, - { - "epoch": 3.5191364082433756, - "grad_norm": 0.9496130347251892, - "learning_rate": 8.400642430116203e-05, - "loss": 0.0741, - "step": 53790 - }, - { - "epoch": 3.5197906444226366, - "grad_norm": 0.8604884743690491, - "learning_rate": 8.399968957712135e-05, - "loss": 0.0614, - "step": 53800 - }, - { - "epoch": 3.5204448806018975, - "grad_norm": 1.4663547277450562, - "learning_rate": 8.3992953705502e-05, - "loss": 0.0646, - "step": 53810 - }, - { - "epoch": 3.521099116781158, - "grad_norm": 0.8717848658561707, - "learning_rate": 8.39862166865313e-05, - "loss": 0.0814, - "step": 53820 - }, - { - "epoch": 3.5217533529604186, - "grad_norm": 0.8972777724266052, - "learning_rate": 8.397947852043666e-05, - "loss": 0.0727, - "step": 53830 - }, - { - "epoch": 3.5224075891396796, - "grad_norm": 1.0742758512496948, - "learning_rate": 8.39727392074455e-05, - "loss": 0.0723, - "step": 53840 - }, - { - "epoch": 3.52306182531894, - "grad_norm": 0.9516122937202454, - "learning_rate": 8.396599874778531e-05, - "loss": 0.0652, - "step": 53850 - }, - { - "epoch": 3.5237160614982006, - "grad_norm": 0.8731452226638794, - "learning_rate": 8.395925714168356e-05, - "loss": 0.0755, - "step": 53860 - }, - { - "epoch": 3.5243702976774616, - "grad_norm": 0.7560011148452759, - "learning_rate": 8.395251438936784e-05, - "loss": 0.0697, - "step": 53870 - }, - { - "epoch": 3.5250245338567225, - "grad_norm": 0.9268012642860413, - "learning_rate": 8.39457704910657e-05, - "loss": 0.0666, - "step": 53880 - }, - { - "epoch": 3.525678770035983, - "grad_norm": 0.8395716547966003, - "learning_rate": 8.393902544700478e-05, - "loss": 0.0705, - "step": 53890 - }, - { - "epoch": 3.5263330062152436, - "grad_norm": 0.9042119979858398, - "learning_rate": 8.393227925741276e-05, - "loss": 0.0776, - "step": 53900 - }, - { - "epoch": 3.5269872423945046, - "grad_norm": 0.8316322565078735, - "learning_rate": 8.392553192251731e-05, - "loss": 0.0697, - "step": 53910 - }, - { - "epoch": 3.527641478573765, - "grad_norm": 0.9326664209365845, - "learning_rate": 8.391878344254618e-05, - "loss": 0.0822, - "step": 53920 - }, - { - "epoch": 3.5282957147530256, - "grad_norm": 0.7888161540031433, - "learning_rate": 8.391203381772716e-05, - "loss": 0.071, - "step": 53930 - }, - { - "epoch": 3.5289499509322866, - "grad_norm": 1.0052454471588135, - "learning_rate": 8.390528304828807e-05, - "loss": 0.0663, - "step": 53940 - }, - { - "epoch": 3.529604187111547, - "grad_norm": 0.800828754901886, - "learning_rate": 8.389853113445676e-05, - "loss": 0.069, - "step": 53950 - }, - { - "epoch": 3.530258423290808, - "grad_norm": 0.9678196310997009, - "learning_rate": 8.38917780764611e-05, - "loss": 0.0734, - "step": 53960 - }, - { - "epoch": 3.5309126594700686, - "grad_norm": 0.9162442088127136, - "learning_rate": 8.388502387452906e-05, - "loss": 0.0679, - "step": 53970 - }, - { - "epoch": 3.5315668956493296, - "grad_norm": 0.7903563380241394, - "learning_rate": 8.38782685288886e-05, - "loss": 0.0633, - "step": 53980 - }, - { - "epoch": 3.53222113182859, - "grad_norm": 0.8106311559677124, - "learning_rate": 8.387151203976772e-05, - "loss": 0.0761, - "step": 53990 - }, - { - "epoch": 3.5328753680078506, - "grad_norm": 0.8316931128501892, - "learning_rate": 8.386475440739447e-05, - "loss": 0.0641, - "step": 54000 - }, - { - "epoch": 3.5335296041871116, - "grad_norm": 0.8786323666572571, - "learning_rate": 8.385799563199697e-05, - "loss": 0.0891, - "step": 54010 - }, - { - "epoch": 3.534183840366372, - "grad_norm": 0.9065492749214172, - "learning_rate": 8.385123571380331e-05, - "loss": 0.0743, - "step": 54020 - }, - { - "epoch": 3.534838076545633, - "grad_norm": 0.744517982006073, - "learning_rate": 8.384447465304166e-05, - "loss": 0.0679, - "step": 54030 - }, - { - "epoch": 3.5354923127248936, - "grad_norm": 0.8627418875694275, - "learning_rate": 8.383771244994023e-05, - "loss": 0.0755, - "step": 54040 - }, - { - "epoch": 3.5361465489041546, - "grad_norm": 0.8611176609992981, - "learning_rate": 8.383094910472728e-05, - "loss": 0.0785, - "step": 54050 - }, - { - "epoch": 3.536800785083415, - "grad_norm": 0.9356468319892883, - "learning_rate": 8.382418461763105e-05, - "loss": 0.0814, - "step": 54060 - }, - { - "epoch": 3.5374550212626756, - "grad_norm": 0.8152367472648621, - "learning_rate": 8.381741898887989e-05, - "loss": 0.0728, - "step": 54070 - }, - { - "epoch": 3.5381092574419366, - "grad_norm": 0.937050998210907, - "learning_rate": 8.381065221870214e-05, - "loss": 0.0698, - "step": 54080 - }, - { - "epoch": 3.538763493621197, - "grad_norm": 0.8761103749275208, - "learning_rate": 8.380388430732623e-05, - "loss": 0.0731, - "step": 54090 - }, - { - "epoch": 3.539417729800458, - "grad_norm": 0.7785167694091797, - "learning_rate": 8.379711525498055e-05, - "loss": 0.0663, - "step": 54100 - }, - { - "epoch": 3.5400719659797186, - "grad_norm": 0.7450352907180786, - "learning_rate": 8.37903450618936e-05, - "loss": 0.0678, - "step": 54110 - }, - { - "epoch": 3.5407262021589796, - "grad_norm": 0.8352435231208801, - "learning_rate": 8.378357372829391e-05, - "loss": 0.0692, - "step": 54120 - }, - { - "epoch": 3.54138043833824, - "grad_norm": 1.016081690788269, - "learning_rate": 8.377680125440997e-05, - "loss": 0.0641, - "step": 54130 - }, - { - "epoch": 3.5420346745175006, - "grad_norm": 0.9595414400100708, - "learning_rate": 8.377002764047042e-05, - "loss": 0.0721, - "step": 54140 - }, - { - "epoch": 3.5426889106967616, - "grad_norm": 1.0154024362564087, - "learning_rate": 8.376325288670386e-05, - "loss": 0.0712, - "step": 54150 - }, - { - "epoch": 3.543343146876022, - "grad_norm": 0.8880914449691772, - "learning_rate": 8.3756476993339e-05, - "loss": 0.0839, - "step": 54160 - }, - { - "epoch": 3.543997383055283, - "grad_norm": 1.0185810327529907, - "learning_rate": 8.374969996060447e-05, - "loss": 0.077, - "step": 54170 - }, - { - "epoch": 3.5446516192345436, - "grad_norm": 0.900940477848053, - "learning_rate": 8.374292178872907e-05, - "loss": 0.0718, - "step": 54180 - }, - { - "epoch": 3.5453058554138046, - "grad_norm": 0.9907526969909668, - "learning_rate": 8.373614247794157e-05, - "loss": 0.0685, - "step": 54190 - }, - { - "epoch": 3.545960091593065, - "grad_norm": 0.9706956148147583, - "learning_rate": 8.37293620284708e-05, - "loss": 0.0699, - "step": 54200 - }, - { - "epoch": 3.5466143277723257, - "grad_norm": 0.8409591913223267, - "learning_rate": 8.372258044054559e-05, - "loss": 0.0759, - "step": 54210 - }, - { - "epoch": 3.5472685639515866, - "grad_norm": 0.7982200980186462, - "learning_rate": 8.371579771439483e-05, - "loss": 0.0678, - "step": 54220 - }, - { - "epoch": 3.547922800130847, - "grad_norm": 0.8349514603614807, - "learning_rate": 8.37090138502475e-05, - "loss": 0.0714, - "step": 54230 - }, - { - "epoch": 3.5485770363101077, - "grad_norm": 0.7618018984794617, - "learning_rate": 8.370222884833254e-05, - "loss": 0.0592, - "step": 54240 - }, - { - "epoch": 3.5492312724893686, - "grad_norm": 1.4026459455490112, - "learning_rate": 8.369544270887897e-05, - "loss": 0.0804, - "step": 54250 - }, - { - "epoch": 3.5498855086686296, - "grad_norm": 0.8770639896392822, - "learning_rate": 8.368865543211584e-05, - "loss": 0.0781, - "step": 54260 - }, - { - "epoch": 3.55053974484789, - "grad_norm": 1.1316838264465332, - "learning_rate": 8.368186701827223e-05, - "loss": 0.0718, - "step": 54270 - }, - { - "epoch": 3.5511939810271507, - "grad_norm": 0.9863431453704834, - "learning_rate": 8.367507746757728e-05, - "loss": 0.0842, - "step": 54280 - }, - { - "epoch": 3.5518482172064116, - "grad_norm": 0.7806870341300964, - "learning_rate": 8.366828678026016e-05, - "loss": 0.0717, - "step": 54290 - }, - { - "epoch": 3.552502453385672, - "grad_norm": 0.8638490438461304, - "learning_rate": 8.366149495655004e-05, - "loss": 0.073, - "step": 54300 - }, - { - "epoch": 3.5531566895649327, - "grad_norm": 0.9356234669685364, - "learning_rate": 8.36547019966762e-05, - "loss": 0.0662, - "step": 54310 - }, - { - "epoch": 3.5538109257441937, - "grad_norm": 0.7376917600631714, - "learning_rate": 8.36479079008679e-05, - "loss": 0.0667, - "step": 54320 - }, - { - "epoch": 3.5544651619234546, - "grad_norm": 0.7717798948287964, - "learning_rate": 8.364111266935446e-05, - "loss": 0.0645, - "step": 54330 - }, - { - "epoch": 3.555119398102715, - "grad_norm": 0.8551239371299744, - "learning_rate": 8.363431630236525e-05, - "loss": 0.0726, - "step": 54340 - }, - { - "epoch": 3.5557736342819757, - "grad_norm": 0.7663902640342712, - "learning_rate": 8.362751880012965e-05, - "loss": 0.076, - "step": 54350 - }, - { - "epoch": 3.5564278704612367, - "grad_norm": 1.0010058879852295, - "learning_rate": 8.362072016287709e-05, - "loss": 0.0705, - "step": 54360 - }, - { - "epoch": 3.557082106640497, - "grad_norm": 0.7997660040855408, - "learning_rate": 8.361392039083706e-05, - "loss": 0.0669, - "step": 54370 - }, - { - "epoch": 3.5577363428197577, - "grad_norm": 0.6869263648986816, - "learning_rate": 8.360711948423906e-05, - "loss": 0.0673, - "step": 54380 - }, - { - "epoch": 3.5583905789990187, - "grad_norm": 1.1054303646087646, - "learning_rate": 8.360031744331264e-05, - "loss": 0.0772, - "step": 54390 - }, - { - "epoch": 3.559044815178279, - "grad_norm": 0.9529674649238586, - "learning_rate": 8.359351426828739e-05, - "loss": 0.067, - "step": 54400 - }, - { - "epoch": 3.55969905135754, - "grad_norm": 0.6143543124198914, - "learning_rate": 8.358670995939293e-05, - "loss": 0.0758, - "step": 54410 - }, - { - "epoch": 3.5603532875368007, - "grad_norm": 0.9302859306335449, - "learning_rate": 8.357990451685892e-05, - "loss": 0.0653, - "step": 54420 - }, - { - "epoch": 3.5610075237160617, - "grad_norm": 0.8283950090408325, - "learning_rate": 8.357309794091507e-05, - "loss": 0.0701, - "step": 54430 - }, - { - "epoch": 3.561661759895322, - "grad_norm": 0.7890059947967529, - "learning_rate": 8.356629023179111e-05, - "loss": 0.0711, - "step": 54440 - }, - { - "epoch": 3.5623159960745827, - "grad_norm": 0.8997254371643066, - "learning_rate": 8.355948138971683e-05, - "loss": 0.0659, - "step": 54450 - }, - { - "epoch": 3.5629702322538437, - "grad_norm": 1.2762157917022705, - "learning_rate": 8.355267141492205e-05, - "loss": 0.0692, - "step": 54460 - }, - { - "epoch": 3.563624468433104, - "grad_norm": 0.8240480422973633, - "learning_rate": 8.354586030763659e-05, - "loss": 0.064, - "step": 54470 - }, - { - "epoch": 3.564278704612365, - "grad_norm": 0.8595862984657288, - "learning_rate": 8.353904806809039e-05, - "loss": 0.0612, - "step": 54480 - }, - { - "epoch": 3.5649329407916257, - "grad_norm": 1.0054336786270142, - "learning_rate": 8.353223469651335e-05, - "loss": 0.075, - "step": 54490 - }, - { - "epoch": 3.5655871769708867, - "grad_norm": 0.9695059061050415, - "learning_rate": 8.352542019313544e-05, - "loss": 0.0723, - "step": 54500 - }, - { - "epoch": 3.566241413150147, - "grad_norm": 0.8478056192398071, - "learning_rate": 8.351860455818667e-05, - "loss": 0.0661, - "step": 54510 - }, - { - "epoch": 3.5668956493294077, - "grad_norm": 0.7672955989837646, - "learning_rate": 8.35117877918971e-05, - "loss": 0.0677, - "step": 54520 - }, - { - "epoch": 3.5675498855086687, - "grad_norm": 0.8970052003860474, - "learning_rate": 8.350496989449681e-05, - "loss": 0.0616, - "step": 54530 - }, - { - "epoch": 3.568204121687929, - "grad_norm": 0.7992386221885681, - "learning_rate": 8.34981508662159e-05, - "loss": 0.0715, - "step": 54540 - }, - { - "epoch": 3.56885835786719, - "grad_norm": 0.8448268175125122, - "learning_rate": 8.349133070728456e-05, - "loss": 0.0611, - "step": 54550 - }, - { - "epoch": 3.5695125940464507, - "grad_norm": 0.821503758430481, - "learning_rate": 8.348450941793298e-05, - "loss": 0.0661, - "step": 54560 - }, - { - "epoch": 3.5701668302257117, - "grad_norm": 0.9001857042312622, - "learning_rate": 8.347768699839139e-05, - "loss": 0.0688, - "step": 54570 - }, - { - "epoch": 3.570821066404972, - "grad_norm": 0.891309380531311, - "learning_rate": 8.347086344889006e-05, - "loss": 0.0628, - "step": 54580 - }, - { - "epoch": 3.5714753025842327, - "grad_norm": 0.8133125305175781, - "learning_rate": 8.34640387696593e-05, - "loss": 0.0648, - "step": 54590 - }, - { - "epoch": 3.5721295387634937, - "grad_norm": 1.0169897079467773, - "learning_rate": 8.345721296092947e-05, - "loss": 0.0794, - "step": 54600 - }, - { - "epoch": 3.5727837749427542, - "grad_norm": 0.8700054883956909, - "learning_rate": 8.345038602293097e-05, - "loss": 0.0629, - "step": 54610 - }, - { - "epoch": 3.573438011122015, - "grad_norm": 0.9666095972061157, - "learning_rate": 8.344355795589421e-05, - "loss": 0.0728, - "step": 54620 - }, - { - "epoch": 3.5740922473012757, - "grad_norm": 0.785966157913208, - "learning_rate": 8.343672876004965e-05, - "loss": 0.0687, - "step": 54630 - }, - { - "epoch": 3.5747464834805367, - "grad_norm": 0.988416314125061, - "learning_rate": 8.342989843562782e-05, - "loss": 0.0689, - "step": 54640 - }, - { - "epoch": 3.575400719659797, - "grad_norm": 0.9573351740837097, - "learning_rate": 8.342306698285923e-05, - "loss": 0.0716, - "step": 54650 - }, - { - "epoch": 3.5760549558390577, - "grad_norm": 0.8525318503379822, - "learning_rate": 8.341623440197448e-05, - "loss": 0.0748, - "step": 54660 - }, - { - "epoch": 3.5767091920183187, - "grad_norm": 1.0283527374267578, - "learning_rate": 8.340940069320418e-05, - "loss": 0.0686, - "step": 54670 - }, - { - "epoch": 3.5773634281975792, - "grad_norm": 0.908446192741394, - "learning_rate": 8.3402565856779e-05, - "loss": 0.0638, - "step": 54680 - }, - { - "epoch": 3.5780176643768398, - "grad_norm": 0.984634518623352, - "learning_rate": 8.339572989292961e-05, - "loss": 0.0744, - "step": 54690 - }, - { - "epoch": 3.5786719005561007, - "grad_norm": 1.0757420063018799, - "learning_rate": 8.338889280188674e-05, - "loss": 0.0748, - "step": 54700 - }, - { - "epoch": 3.5793261367353617, - "grad_norm": 0.6891494393348694, - "learning_rate": 8.338205458388118e-05, - "loss": 0.0737, - "step": 54710 - }, - { - "epoch": 3.5799803729146222, - "grad_norm": 0.9240961074829102, - "learning_rate": 8.337521523914375e-05, - "loss": 0.0848, - "step": 54720 - }, - { - "epoch": 3.5806346090938828, - "grad_norm": 0.8011013269424438, - "learning_rate": 8.336837476790526e-05, - "loss": 0.063, - "step": 54730 - }, - { - "epoch": 3.5812888452731437, - "grad_norm": 0.9099512696266174, - "learning_rate": 8.336153317039662e-05, - "loss": 0.0673, - "step": 54740 - }, - { - "epoch": 3.5819430814524043, - "grad_norm": 0.8403385281562805, - "learning_rate": 8.335469044684872e-05, - "loss": 0.072, - "step": 54750 - }, - { - "epoch": 3.582597317631665, - "grad_norm": 0.8340772986412048, - "learning_rate": 8.334784659749255e-05, - "loss": 0.0665, - "step": 54760 - }, - { - "epoch": 3.5832515538109257, - "grad_norm": 0.8868764042854309, - "learning_rate": 8.334100162255912e-05, - "loss": 0.0762, - "step": 54770 - }, - { - "epoch": 3.5839057899901867, - "grad_norm": 0.8480395078659058, - "learning_rate": 8.33341555222794e-05, - "loss": 0.0677, - "step": 54780 - }, - { - "epoch": 3.5845600261694472, - "grad_norm": 1.0152699947357178, - "learning_rate": 8.332730829688456e-05, - "loss": 0.0769, - "step": 54790 - }, - { - "epoch": 3.5852142623487078, - "grad_norm": 1.082312822341919, - "learning_rate": 8.332045994660563e-05, - "loss": 0.0708, - "step": 54800 - }, - { - "epoch": 3.5858684985279687, - "grad_norm": 0.8725629448890686, - "learning_rate": 8.33136104716738e-05, - "loss": 0.0608, - "step": 54810 - }, - { - "epoch": 3.5865227347072293, - "grad_norm": 0.7891134023666382, - "learning_rate": 8.330675987232024e-05, - "loss": 0.0615, - "step": 54820 - }, - { - "epoch": 3.58717697088649, - "grad_norm": 1.0689077377319336, - "learning_rate": 8.32999081487762e-05, - "loss": 0.0745, - "step": 54830 - }, - { - "epoch": 3.5878312070657508, - "grad_norm": 0.77810138463974, - "learning_rate": 8.329305530127291e-05, - "loss": 0.0666, - "step": 54840 - }, - { - "epoch": 3.5884854432450113, - "grad_norm": 1.0314850807189941, - "learning_rate": 8.32862013300417e-05, - "loss": 0.0775, - "step": 54850 - }, - { - "epoch": 3.5891396794242723, - "grad_norm": 0.9134453535079956, - "learning_rate": 8.32793462353139e-05, - "loss": 0.0732, - "step": 54860 - }, - { - "epoch": 3.589793915603533, - "grad_norm": 0.786392331123352, - "learning_rate": 8.32724900173209e-05, - "loss": 0.0733, - "step": 54870 - }, - { - "epoch": 3.5904481517827938, - "grad_norm": 0.7125436663627625, - "learning_rate": 8.326563267629408e-05, - "loss": 0.0712, - "step": 54880 - }, - { - "epoch": 3.5911023879620543, - "grad_norm": 0.9112322330474854, - "learning_rate": 8.325877421246491e-05, - "loss": 0.0807, - "step": 54890 - }, - { - "epoch": 3.591756624141315, - "grad_norm": 0.8013171553611755, - "learning_rate": 8.325191462606491e-05, - "loss": 0.0722, - "step": 54900 - }, - { - "epoch": 3.5924108603205758, - "grad_norm": 0.7827738523483276, - "learning_rate": 8.324505391732557e-05, - "loss": 0.0701, - "step": 54910 - }, - { - "epoch": 3.5930650964998363, - "grad_norm": 0.8412222266197205, - "learning_rate": 8.323819208647847e-05, - "loss": 0.0771, - "step": 54920 - }, - { - "epoch": 3.5937193326790973, - "grad_norm": 0.9452791213989258, - "learning_rate": 8.323132913375522e-05, - "loss": 0.0665, - "step": 54930 - }, - { - "epoch": 3.594373568858358, - "grad_norm": 0.7313148975372314, - "learning_rate": 8.322446505938746e-05, - "loss": 0.0686, - "step": 54940 - }, - { - "epoch": 3.5950278050376188, - "grad_norm": 0.7917100787162781, - "learning_rate": 8.321759986360687e-05, - "loss": 0.0706, - "step": 54950 - }, - { - "epoch": 3.5956820412168793, - "grad_norm": 1.0193794965744019, - "learning_rate": 8.321073354664516e-05, - "loss": 0.0816, - "step": 54960 - }, - { - "epoch": 3.59633627739614, - "grad_norm": 0.9962078332901001, - "learning_rate": 8.32038661087341e-05, - "loss": 0.0657, - "step": 54970 - }, - { - "epoch": 3.596990513575401, - "grad_norm": 0.8794001340866089, - "learning_rate": 8.319699755010549e-05, - "loss": 0.076, - "step": 54980 - }, - { - "epoch": 3.5976447497546613, - "grad_norm": 0.8249982595443726, - "learning_rate": 8.319012787099115e-05, - "loss": 0.0815, - "step": 54990 - }, - { - "epoch": 3.5982989859339223, - "grad_norm": 0.9822542667388916, - "learning_rate": 8.318325707162293e-05, - "loss": 0.0676, - "step": 55000 - }, - { - "epoch": 3.598953222113183, - "grad_norm": 0.8483203053474426, - "learning_rate": 8.317638515223277e-05, - "loss": 0.0686, - "step": 55010 - }, - { - "epoch": 3.5996074582924438, - "grad_norm": 0.8871238827705383, - "learning_rate": 8.31695121130526e-05, - "loss": 0.0722, - "step": 55020 - }, - { - "epoch": 3.6002616944717043, - "grad_norm": 1.047977089881897, - "learning_rate": 8.31626379543144e-05, - "loss": 0.0836, - "step": 55030 - }, - { - "epoch": 3.600915930650965, - "grad_norm": 0.9134694933891296, - "learning_rate": 8.31557626762502e-05, - "loss": 0.0733, - "step": 55040 - }, - { - "epoch": 3.601570166830226, - "grad_norm": 0.9001691341400146, - "learning_rate": 8.314888627909208e-05, - "loss": 0.0687, - "step": 55050 - }, - { - "epoch": 3.6022244030094863, - "grad_norm": 0.8247069716453552, - "learning_rate": 8.31420087630721e-05, - "loss": 0.0676, - "step": 55060 - }, - { - "epoch": 3.6028786391887473, - "grad_norm": 0.8471789360046387, - "learning_rate": 8.313513012842238e-05, - "loss": 0.0694, - "step": 55070 - }, - { - "epoch": 3.603532875368008, - "grad_norm": 0.7431899905204773, - "learning_rate": 8.312825037537513e-05, - "loss": 0.0741, - "step": 55080 - }, - { - "epoch": 3.604187111547269, - "grad_norm": 0.9894888997077942, - "learning_rate": 8.312136950416256e-05, - "loss": 0.0754, - "step": 55090 - }, - { - "epoch": 3.6048413477265293, - "grad_norm": 0.8959627747535706, - "learning_rate": 8.311448751501689e-05, - "loss": 0.0798, - "step": 55100 - }, - { - "epoch": 3.60549558390579, - "grad_norm": 0.7383559942245483, - "learning_rate": 8.310760440817043e-05, - "loss": 0.0726, - "step": 55110 - }, - { - "epoch": 3.606149820085051, - "grad_norm": 0.8835324645042419, - "learning_rate": 8.31007201838555e-05, - "loss": 0.0751, - "step": 55120 - }, - { - "epoch": 3.6068040562643113, - "grad_norm": 0.8275827169418335, - "learning_rate": 8.309383484230446e-05, - "loss": 0.064, - "step": 55130 - }, - { - "epoch": 3.607458292443572, - "grad_norm": 0.8732197880744934, - "learning_rate": 8.308694838374969e-05, - "loss": 0.0656, - "step": 55140 - }, - { - "epoch": 3.608112528622833, - "grad_norm": 1.2900716066360474, - "learning_rate": 8.308006080842362e-05, - "loss": 0.0748, - "step": 55150 - }, - { - "epoch": 3.608766764802094, - "grad_norm": 0.8810641169548035, - "learning_rate": 8.307317211655877e-05, - "loss": 0.07, - "step": 55160 - }, - { - "epoch": 3.6094210009813543, - "grad_norm": 0.9505282640457153, - "learning_rate": 8.30662823083876e-05, - "loss": 0.0701, - "step": 55170 - }, - { - "epoch": 3.610075237160615, - "grad_norm": 1.0817748308181763, - "learning_rate": 8.30593913841427e-05, - "loss": 0.0761, - "step": 55180 - }, - { - "epoch": 3.610729473339876, - "grad_norm": 0.8153062462806702, - "learning_rate": 8.305249934405664e-05, - "loss": 0.0643, - "step": 55190 - }, - { - "epoch": 3.6113837095191363, - "grad_norm": 0.8734539747238159, - "learning_rate": 8.304560618836204e-05, - "loss": 0.0721, - "step": 55200 - }, - { - "epoch": 3.612037945698397, - "grad_norm": 0.8349213004112244, - "learning_rate": 8.303871191729156e-05, - "loss": 0.0701, - "step": 55210 - }, - { - "epoch": 3.612692181877658, - "grad_norm": 0.8556681871414185, - "learning_rate": 8.303181653107791e-05, - "loss": 0.064, - "step": 55220 - }, - { - "epoch": 3.613346418056919, - "grad_norm": 1.0858343839645386, - "learning_rate": 8.302492002995383e-05, - "loss": 0.0683, - "step": 55230 - }, - { - "epoch": 3.6140006542361793, - "grad_norm": 0.6986839175224304, - "learning_rate": 8.301802241415209e-05, - "loss": 0.0651, - "step": 55240 - }, - { - "epoch": 3.61465489041544, - "grad_norm": 0.9676486849784851, - "learning_rate": 8.301112368390548e-05, - "loss": 0.067, - "step": 55250 - }, - { - "epoch": 3.615309126594701, - "grad_norm": 1.1534702777862549, - "learning_rate": 8.300422383944688e-05, - "loss": 0.0665, - "step": 55260 - }, - { - "epoch": 3.6159633627739614, - "grad_norm": 0.9202556014060974, - "learning_rate": 8.299732288100918e-05, - "loss": 0.0648, - "step": 55270 - }, - { - "epoch": 3.616617598953222, - "grad_norm": 0.9827196598052979, - "learning_rate": 8.299042080882528e-05, - "loss": 0.0777, - "step": 55280 - }, - { - "epoch": 3.617271835132483, - "grad_norm": 0.9134148359298706, - "learning_rate": 8.298351762312816e-05, - "loss": 0.0693, - "step": 55290 - }, - { - "epoch": 3.6179260713117434, - "grad_norm": 0.7244698405265808, - "learning_rate": 8.297661332415083e-05, - "loss": 0.0672, - "step": 55300 - }, - { - "epoch": 3.6185803074910043, - "grad_norm": 0.9606476426124573, - "learning_rate": 8.296970791212631e-05, - "loss": 0.0731, - "step": 55310 - }, - { - "epoch": 3.619234543670265, - "grad_norm": 0.9693164825439453, - "learning_rate": 8.296280138728768e-05, - "loss": 0.07, - "step": 55320 - }, - { - "epoch": 3.619888779849526, - "grad_norm": 0.8021335005760193, - "learning_rate": 8.295589374986804e-05, - "loss": 0.07, - "step": 55330 - }, - { - "epoch": 3.6205430160287864, - "grad_norm": 1.0578341484069824, - "learning_rate": 8.294898500010056e-05, - "loss": 0.0635, - "step": 55340 - }, - { - "epoch": 3.621197252208047, - "grad_norm": 0.8854939341545105, - "learning_rate": 8.294207513821845e-05, - "loss": 0.0651, - "step": 55350 - }, - { - "epoch": 3.621851488387308, - "grad_norm": 0.8481228351593018, - "learning_rate": 8.293516416445488e-05, - "loss": 0.0743, - "step": 55360 - }, - { - "epoch": 3.6225057245665684, - "grad_norm": 0.8567721247673035, - "learning_rate": 8.292825207904316e-05, - "loss": 0.0671, - "step": 55370 - }, - { - "epoch": 3.6231599607458294, - "grad_norm": 0.7755191922187805, - "learning_rate": 8.292133888221659e-05, - "loss": 0.0683, - "step": 55380 - }, - { - "epoch": 3.62381419692509, - "grad_norm": 1.062867522239685, - "learning_rate": 8.291442457420846e-05, - "loss": 0.0778, - "step": 55390 - }, - { - "epoch": 3.624468433104351, - "grad_norm": 0.8446351885795593, - "learning_rate": 8.290750915525219e-05, - "loss": 0.0645, - "step": 55400 - }, - { - "epoch": 3.6251226692836114, - "grad_norm": 0.8222762942314148, - "learning_rate": 8.290059262558119e-05, - "loss": 0.0755, - "step": 55410 - }, - { - "epoch": 3.625776905462872, - "grad_norm": 0.9399179220199585, - "learning_rate": 8.28936749854289e-05, - "loss": 0.0672, - "step": 55420 - }, - { - "epoch": 3.626431141642133, - "grad_norm": 0.7565219402313232, - "learning_rate": 8.288675623502881e-05, - "loss": 0.065, - "step": 55430 - }, - { - "epoch": 3.6270853778213934, - "grad_norm": 0.8849060535430908, - "learning_rate": 8.287983637461447e-05, - "loss": 0.0748, - "step": 55440 - }, - { - "epoch": 3.6277396140006544, - "grad_norm": 0.8676406145095825, - "learning_rate": 8.28729154044194e-05, - "loss": 0.0742, - "step": 55450 - }, - { - "epoch": 3.628393850179915, - "grad_norm": 0.7463305592536926, - "learning_rate": 8.286599332467722e-05, - "loss": 0.0722, - "step": 55460 - }, - { - "epoch": 3.629048086359176, - "grad_norm": 0.9079608917236328, - "learning_rate": 8.285907013562158e-05, - "loss": 0.0723, - "step": 55470 - }, - { - "epoch": 3.6297023225384364, - "grad_norm": 0.9543100595474243, - "learning_rate": 8.285214583748616e-05, - "loss": 0.0726, - "step": 55480 - }, - { - "epoch": 3.630356558717697, - "grad_norm": 0.9939130544662476, - "learning_rate": 8.284522043050463e-05, - "loss": 0.0679, - "step": 55490 - }, - { - "epoch": 3.631010794896958, - "grad_norm": 0.9271246194839478, - "learning_rate": 8.28382939149108e-05, - "loss": 0.0642, - "step": 55500 - }, - { - "epoch": 3.6316650310762184, - "grad_norm": 0.9283300042152405, - "learning_rate": 8.283136629093841e-05, - "loss": 0.067, - "step": 55510 - }, - { - "epoch": 3.6323192672554794, - "grad_norm": 0.9301807284355164, - "learning_rate": 8.28244375588213e-05, - "loss": 0.07, - "step": 55520 - }, - { - "epoch": 3.63297350343474, - "grad_norm": 0.9366194605827332, - "learning_rate": 8.281750771879335e-05, - "loss": 0.0707, - "step": 55530 - }, - { - "epoch": 3.633627739614001, - "grad_norm": 0.9586436748504639, - "learning_rate": 8.281057677108844e-05, - "loss": 0.0641, - "step": 55540 - }, - { - "epoch": 3.6342819757932614, - "grad_norm": 0.8174638748168945, - "learning_rate": 8.280364471594052e-05, - "loss": 0.0686, - "step": 55550 - }, - { - "epoch": 3.634936211972522, - "grad_norm": 0.9204336404800415, - "learning_rate": 8.279671155358355e-05, - "loss": 0.066, - "step": 55560 - }, - { - "epoch": 3.635590448151783, - "grad_norm": 0.7173357009887695, - "learning_rate": 8.278977728425157e-05, - "loss": 0.0668, - "step": 55570 - }, - { - "epoch": 3.6362446843310434, - "grad_norm": 0.8251804113388062, - "learning_rate": 8.27828419081786e-05, - "loss": 0.0689, - "step": 55580 - }, - { - "epoch": 3.636898920510304, - "grad_norm": 0.796325147151947, - "learning_rate": 8.277590542559875e-05, - "loss": 0.0657, - "step": 55590 - }, - { - "epoch": 3.637553156689565, - "grad_norm": 0.8071014881134033, - "learning_rate": 8.276896783674612e-05, - "loss": 0.0692, - "step": 55600 - }, - { - "epoch": 3.638207392868826, - "grad_norm": 0.8185661435127258, - "learning_rate": 8.27620291418549e-05, - "loss": 0.0657, - "step": 55610 - }, - { - "epoch": 3.6388616290480864, - "grad_norm": 0.800986111164093, - "learning_rate": 8.275508934115927e-05, - "loss": 0.0719, - "step": 55620 - }, - { - "epoch": 3.639515865227347, - "grad_norm": 0.9986090660095215, - "learning_rate": 8.274814843489346e-05, - "loss": 0.0764, - "step": 55630 - }, - { - "epoch": 3.640170101406608, - "grad_norm": 0.688541829586029, - "learning_rate": 8.274120642329178e-05, - "loss": 0.0721, - "step": 55640 - }, - { - "epoch": 3.6408243375858684, - "grad_norm": 1.0835293531417847, - "learning_rate": 8.273426330658849e-05, - "loss": 0.0819, - "step": 55650 - }, - { - "epoch": 3.641478573765129, - "grad_norm": 0.7599365711212158, - "learning_rate": 8.272731908501798e-05, - "loss": 0.075, - "step": 55660 - }, - { - "epoch": 3.64213280994439, - "grad_norm": 1.0994549989700317, - "learning_rate": 8.272037375881461e-05, - "loss": 0.0601, - "step": 55670 - }, - { - "epoch": 3.642787046123651, - "grad_norm": 0.8615381121635437, - "learning_rate": 8.27134273282128e-05, - "loss": 0.0733, - "step": 55680 - }, - { - "epoch": 3.6434412823029114, - "grad_norm": 1.0048024654388428, - "learning_rate": 8.270647979344706e-05, - "loss": 0.0808, - "step": 55690 - }, - { - "epoch": 3.644095518482172, - "grad_norm": 1.1392261981964111, - "learning_rate": 8.269953115475183e-05, - "loss": 0.0863, - "step": 55700 - }, - { - "epoch": 3.644749754661433, - "grad_norm": 1.387350082397461, - "learning_rate": 8.269258141236167e-05, - "loss": 0.083, - "step": 55710 - }, - { - "epoch": 3.6454039908406934, - "grad_norm": 0.8560718894004822, - "learning_rate": 8.268563056651115e-05, - "loss": 0.0751, - "step": 55720 - }, - { - "epoch": 3.646058227019954, - "grad_norm": 0.892548143863678, - "learning_rate": 8.267867861743488e-05, - "loss": 0.0709, - "step": 55730 - }, - { - "epoch": 3.646712463199215, - "grad_norm": 0.8326752185821533, - "learning_rate": 8.267172556536748e-05, - "loss": 0.0713, - "step": 55740 - }, - { - "epoch": 3.647366699378476, - "grad_norm": 0.7968476414680481, - "learning_rate": 8.26647714105437e-05, - "loss": 0.0821, - "step": 55750 - }, - { - "epoch": 3.6480209355577364, - "grad_norm": 0.8511175513267517, - "learning_rate": 8.265781615319818e-05, - "loss": 0.0745, - "step": 55760 - }, - { - "epoch": 3.648675171736997, - "grad_norm": 0.9187440872192383, - "learning_rate": 8.265085979356573e-05, - "loss": 0.0699, - "step": 55770 - }, - { - "epoch": 3.649329407916258, - "grad_norm": 0.8875986933708191, - "learning_rate": 8.264390233188113e-05, - "loss": 0.0783, - "step": 55780 - }, - { - "epoch": 3.6499836440955185, - "grad_norm": 0.9916920065879822, - "learning_rate": 8.263694376837923e-05, - "loss": 0.0637, - "step": 55790 - }, - { - "epoch": 3.650637880274779, - "grad_norm": 0.9589375853538513, - "learning_rate": 8.262998410329486e-05, - "loss": 0.0643, - "step": 55800 - }, - { - "epoch": 3.65129211645404, - "grad_norm": 0.9402400851249695, - "learning_rate": 8.262302333686296e-05, - "loss": 0.0665, - "step": 55810 - }, - { - "epoch": 3.6519463526333005, - "grad_norm": 0.9499921798706055, - "learning_rate": 8.261606146931846e-05, - "loss": 0.08, - "step": 55820 - }, - { - "epoch": 3.6526005888125614, - "grad_norm": 1.1458688974380493, - "learning_rate": 8.260909850089636e-05, - "loss": 0.0697, - "step": 55830 - }, - { - "epoch": 3.653254824991822, - "grad_norm": 0.8103237152099609, - "learning_rate": 8.260213443183167e-05, - "loss": 0.0725, - "step": 55840 - }, - { - "epoch": 3.653909061171083, - "grad_norm": 0.7244288921356201, - "learning_rate": 8.259516926235942e-05, - "loss": 0.0653, - "step": 55850 - }, - { - "epoch": 3.6545632973503435, - "grad_norm": 0.8501254320144653, - "learning_rate": 8.258820299271475e-05, - "loss": 0.0662, - "step": 55860 - }, - { - "epoch": 3.655217533529604, - "grad_norm": 0.9807097911834717, - "learning_rate": 8.258123562313274e-05, - "loss": 0.0734, - "step": 55870 - }, - { - "epoch": 3.655871769708865, - "grad_norm": 0.9171924591064453, - "learning_rate": 8.257426715384859e-05, - "loss": 0.0711, - "step": 55880 - }, - { - "epoch": 3.6565260058881255, - "grad_norm": 0.7962984442710876, - "learning_rate": 8.256729758509748e-05, - "loss": 0.0753, - "step": 55890 - }, - { - "epoch": 3.6571802420673865, - "grad_norm": 0.7744097709655762, - "learning_rate": 8.256032691711469e-05, - "loss": 0.075, - "step": 55900 - }, - { - "epoch": 3.657834478246647, - "grad_norm": 0.8903494477272034, - "learning_rate": 8.255335515013545e-05, - "loss": 0.0681, - "step": 55910 - }, - { - "epoch": 3.658488714425908, - "grad_norm": 0.8612160682678223, - "learning_rate": 8.25463822843951e-05, - "loss": 0.0739, - "step": 55920 - }, - { - "epoch": 3.6591429506051685, - "grad_norm": 1.115964651107788, - "learning_rate": 8.253940832012901e-05, - "loss": 0.0776, - "step": 55930 - }, - { - "epoch": 3.659797186784429, - "grad_norm": 0.90339195728302, - "learning_rate": 8.253243325757255e-05, - "loss": 0.0724, - "step": 55940 - }, - { - "epoch": 3.66045142296369, - "grad_norm": 0.8702420592308044, - "learning_rate": 8.252545709696114e-05, - "loss": 0.0617, - "step": 55950 - }, - { - "epoch": 3.6611056591429505, - "grad_norm": 0.7411156892776489, - "learning_rate": 8.251847983853025e-05, - "loss": 0.0724, - "step": 55960 - }, - { - "epoch": 3.6617598953222115, - "grad_norm": 0.856332540512085, - "learning_rate": 8.251150148251538e-05, - "loss": 0.0616, - "step": 55970 - }, - { - "epoch": 3.662414131501472, - "grad_norm": 0.808392345905304, - "learning_rate": 8.250452202915209e-05, - "loss": 0.0727, - "step": 55980 - }, - { - "epoch": 3.663068367680733, - "grad_norm": 0.7766302227973938, - "learning_rate": 8.249754147867592e-05, - "loss": 0.0691, - "step": 55990 - }, - { - "epoch": 3.6637226038599935, - "grad_norm": 0.782347559928894, - "learning_rate": 8.24905598313225e-05, - "loss": 0.0603, - "step": 56000 - }, - { - "epoch": 3.664376840039254, - "grad_norm": 0.721568763256073, - "learning_rate": 8.248357708732749e-05, - "loss": 0.0681, - "step": 56010 - }, - { - "epoch": 3.665031076218515, - "grad_norm": 0.8713940382003784, - "learning_rate": 8.247659324692653e-05, - "loss": 0.0746, - "step": 56020 - }, - { - "epoch": 3.6656853123977755, - "grad_norm": 0.958366334438324, - "learning_rate": 8.246960831035539e-05, - "loss": 0.0689, - "step": 56030 - }, - { - "epoch": 3.666339548577036, - "grad_norm": 1.0349668264389038, - "learning_rate": 8.246262227784982e-05, - "loss": 0.074, - "step": 56040 - }, - { - "epoch": 3.666993784756297, - "grad_norm": 0.8951663374900818, - "learning_rate": 8.245563514964562e-05, - "loss": 0.0762, - "step": 56050 - }, - { - "epoch": 3.667648020935558, - "grad_norm": 0.993411660194397, - "learning_rate": 8.24486469259786e-05, - "loss": 0.0689, - "step": 56060 - }, - { - "epoch": 3.6683022571148185, - "grad_norm": 0.9579418897628784, - "learning_rate": 8.244165760708464e-05, - "loss": 0.0658, - "step": 56070 - }, - { - "epoch": 3.668956493294079, - "grad_norm": 0.8194859623908997, - "learning_rate": 8.243466719319967e-05, - "loss": 0.0703, - "step": 56080 - }, - { - "epoch": 3.66961072947334, - "grad_norm": 0.8599490523338318, - "learning_rate": 8.242767568455963e-05, - "loss": 0.0615, - "step": 56090 - }, - { - "epoch": 3.6702649656526005, - "grad_norm": 0.7916556596755981, - "learning_rate": 8.242068308140047e-05, - "loss": 0.0769, - "step": 56100 - }, - { - "epoch": 3.670919201831861, - "grad_norm": 0.8496348857879639, - "learning_rate": 8.241368938395824e-05, - "loss": 0.066, - "step": 56110 - }, - { - "epoch": 3.671573438011122, - "grad_norm": 0.7955387234687805, - "learning_rate": 8.240669459246897e-05, - "loss": 0.0701, - "step": 56120 - }, - { - "epoch": 3.672227674190383, - "grad_norm": 0.7705776691436768, - "learning_rate": 8.239969870716878e-05, - "loss": 0.0636, - "step": 56130 - }, - { - "epoch": 3.6728819103696435, - "grad_norm": 1.0588468313217163, - "learning_rate": 8.239270172829379e-05, - "loss": 0.0748, - "step": 56140 - }, - { - "epoch": 3.673536146548904, - "grad_norm": 0.9020249247550964, - "learning_rate": 8.238570365608016e-05, - "loss": 0.0689, - "step": 56150 - }, - { - "epoch": 3.674190382728165, - "grad_norm": 0.7228168845176697, - "learning_rate": 8.237870449076411e-05, - "loss": 0.0632, - "step": 56160 - }, - { - "epoch": 3.6748446189074255, - "grad_norm": 0.729026198387146, - "learning_rate": 8.237170423258184e-05, - "loss": 0.0761, - "step": 56170 - }, - { - "epoch": 3.675498855086686, - "grad_norm": 0.7430927157402039, - "learning_rate": 8.236470288176966e-05, - "loss": 0.0644, - "step": 56180 - }, - { - "epoch": 3.676153091265947, - "grad_norm": 0.8580201268196106, - "learning_rate": 8.235770043856389e-05, - "loss": 0.0615, - "step": 56190 - }, - { - "epoch": 3.676807327445208, - "grad_norm": 1.0346200466156006, - "learning_rate": 8.235069690320087e-05, - "loss": 0.079, - "step": 56200 - }, - { - "epoch": 3.6774615636244685, - "grad_norm": 1.0010615587234497, - "learning_rate": 8.234369227591698e-05, - "loss": 0.0673, - "step": 56210 - }, - { - "epoch": 3.678115799803729, - "grad_norm": 0.8165103793144226, - "learning_rate": 8.233668655694865e-05, - "loss": 0.0738, - "step": 56220 - }, - { - "epoch": 3.67877003598299, - "grad_norm": 0.8358020186424255, - "learning_rate": 8.232967974653235e-05, - "loss": 0.0637, - "step": 56230 - }, - { - "epoch": 3.6794242721622505, - "grad_norm": 0.8464856743812561, - "learning_rate": 8.232267184490457e-05, - "loss": 0.0696, - "step": 56240 - }, - { - "epoch": 3.680078508341511, - "grad_norm": 0.8467486500740051, - "learning_rate": 8.231566285230183e-05, - "loss": 0.0697, - "step": 56250 - }, - { - "epoch": 3.680732744520772, - "grad_norm": 0.9639089703559875, - "learning_rate": 8.230865276896074e-05, - "loss": 0.0811, - "step": 56260 - }, - { - "epoch": 3.6813869807000326, - "grad_norm": 0.7469546794891357, - "learning_rate": 8.230164159511788e-05, - "loss": 0.072, - "step": 56270 - }, - { - "epoch": 3.6820412168792935, - "grad_norm": 0.9208206534385681, - "learning_rate": 8.22946293310099e-05, - "loss": 0.0836, - "step": 56280 - }, - { - "epoch": 3.682695453058554, - "grad_norm": 0.7601810693740845, - "learning_rate": 8.228761597687348e-05, - "loss": 0.071, - "step": 56290 - }, - { - "epoch": 3.683349689237815, - "grad_norm": 0.8234832286834717, - "learning_rate": 8.228060153294534e-05, - "loss": 0.0734, - "step": 56300 - }, - { - "epoch": 3.6840039254170756, - "grad_norm": 0.8467697501182556, - "learning_rate": 8.227358599946224e-05, - "loss": 0.0743, - "step": 56310 - }, - { - "epoch": 3.684658161596336, - "grad_norm": 0.6788933873176575, - "learning_rate": 8.2266569376661e-05, - "loss": 0.0639, - "step": 56320 - }, - { - "epoch": 3.685312397775597, - "grad_norm": 0.9612674713134766, - "learning_rate": 8.225955166477839e-05, - "loss": 0.0667, - "step": 56330 - }, - { - "epoch": 3.6859666339548576, - "grad_norm": 0.8955544233322144, - "learning_rate": 8.22525328640513e-05, - "loss": 0.0671, - "step": 56340 - }, - { - "epoch": 3.6866208701341185, - "grad_norm": 0.9106106758117676, - "learning_rate": 8.224551297471668e-05, - "loss": 0.0695, - "step": 56350 - }, - { - "epoch": 3.687275106313379, - "grad_norm": 0.8671728372573853, - "learning_rate": 8.223849199701139e-05, - "loss": 0.06, - "step": 56360 - }, - { - "epoch": 3.68792934249264, - "grad_norm": 0.9457067251205444, - "learning_rate": 8.223146993117247e-05, - "loss": 0.0762, - "step": 56370 - }, - { - "epoch": 3.6885835786719006, - "grad_norm": 1.0926858186721802, - "learning_rate": 8.222444677743691e-05, - "loss": 0.068, - "step": 56380 - }, - { - "epoch": 3.689237814851161, - "grad_norm": 0.964023768901825, - "learning_rate": 8.221742253604175e-05, - "loss": 0.0635, - "step": 56390 - }, - { - "epoch": 3.689892051030422, - "grad_norm": 0.8525315523147583, - "learning_rate": 8.221039720722409e-05, - "loss": 0.0806, - "step": 56400 - }, - { - "epoch": 3.6905462872096826, - "grad_norm": 0.989080548286438, - "learning_rate": 8.220337079122105e-05, - "loss": 0.0713, - "step": 56410 - }, - { - "epoch": 3.6912005233889436, - "grad_norm": 0.7577918171882629, - "learning_rate": 8.21963432882698e-05, - "loss": 0.0716, - "step": 56420 - }, - { - "epoch": 3.691854759568204, - "grad_norm": 0.8016083240509033, - "learning_rate": 8.21893146986075e-05, - "loss": 0.071, - "step": 56430 - }, - { - "epoch": 3.692508995747465, - "grad_norm": 0.7300130128860474, - "learning_rate": 8.218228502247141e-05, - "loss": 0.073, - "step": 56440 - }, - { - "epoch": 3.6931632319267256, - "grad_norm": 0.8202305436134338, - "learning_rate": 8.217525426009882e-05, - "loss": 0.0685, - "step": 56450 - }, - { - "epoch": 3.693817468105986, - "grad_norm": 0.9385225176811218, - "learning_rate": 8.216822241172702e-05, - "loss": 0.0751, - "step": 56460 - }, - { - "epoch": 3.694471704285247, - "grad_norm": 1.1209096908569336, - "learning_rate": 8.216118947759333e-05, - "loss": 0.0709, - "step": 56470 - }, - { - "epoch": 3.6951259404645076, - "grad_norm": 1.0361253023147583, - "learning_rate": 8.215415545793515e-05, - "loss": 0.0765, - "step": 56480 - }, - { - "epoch": 3.6957801766437686, - "grad_norm": 0.8918325901031494, - "learning_rate": 8.214712035298991e-05, - "loss": 0.0622, - "step": 56490 - }, - { - "epoch": 3.696434412823029, - "grad_norm": 0.7726563811302185, - "learning_rate": 8.214008416299505e-05, - "loss": 0.0717, - "step": 56500 - }, - { - "epoch": 3.69708864900229, - "grad_norm": 0.9196798801422119, - "learning_rate": 8.213304688818804e-05, - "loss": 0.081, - "step": 56510 - }, - { - "epoch": 3.6977428851815506, - "grad_norm": 1.0043820142745972, - "learning_rate": 8.212600852880644e-05, - "loss": 0.0756, - "step": 56520 - }, - { - "epoch": 3.698397121360811, - "grad_norm": 0.926852285861969, - "learning_rate": 8.21189690850878e-05, - "loss": 0.0677, - "step": 56530 - }, - { - "epoch": 3.699051357540072, - "grad_norm": 0.8300206661224365, - "learning_rate": 8.211192855726972e-05, - "loss": 0.074, - "step": 56540 - }, - { - "epoch": 3.6997055937193326, - "grad_norm": 1.1823694705963135, - "learning_rate": 8.210488694558982e-05, - "loss": 0.0653, - "step": 56550 - }, - { - "epoch": 3.700359829898593, - "grad_norm": 0.7642592787742615, - "learning_rate": 8.20978442502858e-05, - "loss": 0.0634, - "step": 56560 - }, - { - "epoch": 3.701014066077854, - "grad_norm": 0.9200793504714966, - "learning_rate": 8.209080047159535e-05, - "loss": 0.0862, - "step": 56570 - }, - { - "epoch": 3.701668302257115, - "grad_norm": 0.8010613918304443, - "learning_rate": 8.208375560975624e-05, - "loss": 0.0636, - "step": 56580 - }, - { - "epoch": 3.7023225384363756, - "grad_norm": 0.7096419930458069, - "learning_rate": 8.207670966500621e-05, - "loss": 0.065, - "step": 56590 - }, - { - "epoch": 3.702976774615636, - "grad_norm": 1.8889129161834717, - "learning_rate": 8.206966263758311e-05, - "loss": 0.0617, - "step": 56600 - }, - { - "epoch": 3.703631010794897, - "grad_norm": 0.7090383172035217, - "learning_rate": 8.20626145277248e-05, - "loss": 0.0718, - "step": 56610 - }, - { - "epoch": 3.7042852469741576, - "grad_norm": 0.8787170648574829, - "learning_rate": 8.205556533566916e-05, - "loss": 0.0734, - "step": 56620 - }, - { - "epoch": 3.704939483153418, - "grad_norm": 0.8815798759460449, - "learning_rate": 8.204851506165412e-05, - "loss": 0.0692, - "step": 56630 - }, - { - "epoch": 3.705593719332679, - "grad_norm": 0.6900599002838135, - "learning_rate": 8.204146370591765e-05, - "loss": 0.0632, - "step": 56640 - }, - { - "epoch": 3.70624795551194, - "grad_norm": 0.9211147427558899, - "learning_rate": 8.203441126869773e-05, - "loss": 0.0785, - "step": 56650 - }, - { - "epoch": 3.7069021916912006, - "grad_norm": 0.6548269987106323, - "learning_rate": 8.202735775023244e-05, - "loss": 0.0694, - "step": 56660 - }, - { - "epoch": 3.707556427870461, - "grad_norm": 0.7686384916305542, - "learning_rate": 8.202030315075982e-05, - "loss": 0.0664, - "step": 56670 - }, - { - "epoch": 3.708210664049722, - "grad_norm": 0.7667417526245117, - "learning_rate": 8.2013247470518e-05, - "loss": 0.06, - "step": 56680 - }, - { - "epoch": 3.7088649002289826, - "grad_norm": 0.70942622423172, - "learning_rate": 8.200619070974512e-05, - "loss": 0.0621, - "step": 56690 - }, - { - "epoch": 3.709519136408243, - "grad_norm": 0.9921366572380066, - "learning_rate": 8.199913286867934e-05, - "loss": 0.0727, - "step": 56700 - }, - { - "epoch": 3.710173372587504, - "grad_norm": 0.9375357031822205, - "learning_rate": 8.199207394755893e-05, - "loss": 0.0697, - "step": 56710 - }, - { - "epoch": 3.7108276087667647, - "grad_norm": 0.9243108034133911, - "learning_rate": 8.198501394662212e-05, - "loss": 0.0649, - "step": 56720 - }, - { - "epoch": 3.7114818449460256, - "grad_norm": 0.9306524395942688, - "learning_rate": 8.197795286610719e-05, - "loss": 0.0714, - "step": 56730 - }, - { - "epoch": 3.712136081125286, - "grad_norm": 1.01764976978302, - "learning_rate": 8.19708907062525e-05, - "loss": 0.0695, - "step": 56740 - }, - { - "epoch": 3.712790317304547, - "grad_norm": 0.7517834901809692, - "learning_rate": 8.196382746729641e-05, - "loss": 0.0747, - "step": 56750 - }, - { - "epoch": 3.7134445534838076, - "grad_norm": 0.7786497473716736, - "learning_rate": 8.195676314947729e-05, - "loss": 0.0817, - "step": 56760 - }, - { - "epoch": 3.714098789663068, - "grad_norm": 0.939345121383667, - "learning_rate": 8.194969775303361e-05, - "loss": 0.0732, - "step": 56770 - }, - { - "epoch": 3.714753025842329, - "grad_norm": 1.078481674194336, - "learning_rate": 8.194263127820385e-05, - "loss": 0.0724, - "step": 56780 - }, - { - "epoch": 3.7154072620215897, - "grad_norm": 0.9831727147102356, - "learning_rate": 8.19355637252265e-05, - "loss": 0.0663, - "step": 56790 - }, - { - "epoch": 3.7160614982008506, - "grad_norm": 0.8869754672050476, - "learning_rate": 8.192849509434014e-05, - "loss": 0.074, - "step": 56800 - }, - { - "epoch": 3.716715734380111, - "grad_norm": 0.7121245861053467, - "learning_rate": 8.192142538578331e-05, - "loss": 0.0716, - "step": 56810 - }, - { - "epoch": 3.717369970559372, - "grad_norm": 0.8707069754600525, - "learning_rate": 8.191435459979468e-05, - "loss": 0.0698, - "step": 56820 - }, - { - "epoch": 3.7180242067386327, - "grad_norm": 1.0101736783981323, - "learning_rate": 8.190728273661288e-05, - "loss": 0.0761, - "step": 56830 - }, - { - "epoch": 3.718678442917893, - "grad_norm": 0.8770877122879028, - "learning_rate": 8.190020979647659e-05, - "loss": 0.0802, - "step": 56840 - }, - { - "epoch": 3.719332679097154, - "grad_norm": 0.9432161450386047, - "learning_rate": 8.189313577962457e-05, - "loss": 0.0711, - "step": 56850 - }, - { - "epoch": 3.7199869152764147, - "grad_norm": 0.8004513382911682, - "learning_rate": 8.188606068629558e-05, - "loss": 0.0676, - "step": 56860 - }, - { - "epoch": 3.7206411514556756, - "grad_norm": 0.9776605367660522, - "learning_rate": 8.187898451672841e-05, - "loss": 0.0655, - "step": 56870 - }, - { - "epoch": 3.721295387634936, - "grad_norm": 0.774079442024231, - "learning_rate": 8.18719072711619e-05, - "loss": 0.0632, - "step": 56880 - }, - { - "epoch": 3.721949623814197, - "grad_norm": 0.9294254183769226, - "learning_rate": 8.186482894983494e-05, - "loss": 0.0656, - "step": 56890 - }, - { - "epoch": 3.7226038599934577, - "grad_norm": 0.9034769535064697, - "learning_rate": 8.185774955298644e-05, - "loss": 0.0775, - "step": 56900 - }, - { - "epoch": 3.723258096172718, - "grad_norm": 1.0133552551269531, - "learning_rate": 8.185066908085535e-05, - "loss": 0.0684, - "step": 56910 - }, - { - "epoch": 3.723912332351979, - "grad_norm": 1.045736312866211, - "learning_rate": 8.184358753368062e-05, - "loss": 0.0627, - "step": 56920 - }, - { - "epoch": 3.7245665685312397, - "grad_norm": 0.8517866730690002, - "learning_rate": 8.183650491170132e-05, - "loss": 0.0639, - "step": 56930 - }, - { - "epoch": 3.7252208047105007, - "grad_norm": 0.6794936060905457, - "learning_rate": 8.182942121515648e-05, - "loss": 0.0624, - "step": 56940 - }, - { - "epoch": 3.725875040889761, - "grad_norm": 0.8077151775360107, - "learning_rate": 8.182233644428518e-05, - "loss": 0.0797, - "step": 56950 - }, - { - "epoch": 3.726529277069022, - "grad_norm": 0.8589492440223694, - "learning_rate": 8.18152505993266e-05, - "loss": 0.0772, - "step": 56960 - }, - { - "epoch": 3.7271835132482827, - "grad_norm": 0.8781915307044983, - "learning_rate": 8.180816368051985e-05, - "loss": 0.0741, - "step": 56970 - }, - { - "epoch": 3.727837749427543, - "grad_norm": 0.91343092918396, - "learning_rate": 8.180107568810417e-05, - "loss": 0.0756, - "step": 56980 - }, - { - "epoch": 3.728491985606804, - "grad_norm": 0.8857640624046326, - "learning_rate": 8.179398662231876e-05, - "loss": 0.0808, - "step": 56990 - }, - { - "epoch": 3.7291462217860647, - "grad_norm": 0.807052493095398, - "learning_rate": 8.178689648340294e-05, - "loss": 0.0756, - "step": 57000 - }, - { - "epoch": 3.7298004579653252, - "grad_norm": 0.7484919428825378, - "learning_rate": 8.1779805271596e-05, - "loss": 0.066, - "step": 57010 - }, - { - "epoch": 3.730454694144586, - "grad_norm": 0.8383365869522095, - "learning_rate": 8.17727129871373e-05, - "loss": 0.0684, - "step": 57020 - }, - { - "epoch": 3.731108930323847, - "grad_norm": 0.9385350346565247, - "learning_rate": 8.176561963026618e-05, - "loss": 0.0805, - "step": 57030 - }, - { - "epoch": 3.7317631665031077, - "grad_norm": 0.7031242847442627, - "learning_rate": 8.17585252012221e-05, - "loss": 0.0684, - "step": 57040 - }, - { - "epoch": 3.732417402682368, - "grad_norm": 0.9507877826690674, - "learning_rate": 8.175142970024451e-05, - "loss": 0.0738, - "step": 57050 - }, - { - "epoch": 3.733071638861629, - "grad_norm": 0.9051758646965027, - "learning_rate": 8.174433312757289e-05, - "loss": 0.0642, - "step": 57060 - }, - { - "epoch": 3.7337258750408897, - "grad_norm": 0.8826471567153931, - "learning_rate": 8.173723548344675e-05, - "loss": 0.0764, - "step": 57070 - }, - { - "epoch": 3.7343801112201502, - "grad_norm": 0.886605978012085, - "learning_rate": 8.173013676810573e-05, - "loss": 0.0695, - "step": 57080 - }, - { - "epoch": 3.735034347399411, - "grad_norm": 0.7051117420196533, - "learning_rate": 8.172303698178935e-05, - "loss": 0.069, - "step": 57090 - }, - { - "epoch": 3.735688583578672, - "grad_norm": 0.8651570677757263, - "learning_rate": 8.171593612473727e-05, - "loss": 0.0696, - "step": 57100 - }, - { - "epoch": 3.7363428197579327, - "grad_norm": 0.9195097088813782, - "learning_rate": 8.170883419718917e-05, - "loss": 0.0791, - "step": 57110 - }, - { - "epoch": 3.7369970559371932, - "grad_norm": 0.835708498954773, - "learning_rate": 8.170173119938478e-05, - "loss": 0.0733, - "step": 57120 - }, - { - "epoch": 3.737651292116454, - "grad_norm": 0.8207480311393738, - "learning_rate": 8.16946271315638e-05, - "loss": 0.073, - "step": 57130 - }, - { - "epoch": 3.7383055282957147, - "grad_norm": 1.0654559135437012, - "learning_rate": 8.168752199396603e-05, - "loss": 0.0728, - "step": 57140 - }, - { - "epoch": 3.7389597644749752, - "grad_norm": 0.7882975339889526, - "learning_rate": 8.168041578683129e-05, - "loss": 0.0645, - "step": 57150 - }, - { - "epoch": 3.739614000654236, - "grad_norm": 0.7596569061279297, - "learning_rate": 8.167330851039945e-05, - "loss": 0.0689, - "step": 57160 - }, - { - "epoch": 3.7402682368334967, - "grad_norm": 1.1013258695602417, - "learning_rate": 8.166620016491038e-05, - "loss": 0.0647, - "step": 57170 - }, - { - "epoch": 3.7409224730127577, - "grad_norm": 0.8467496037483215, - "learning_rate": 8.165909075060398e-05, - "loss": 0.0658, - "step": 57180 - }, - { - "epoch": 3.7415767091920182, - "grad_norm": 0.7248758673667908, - "learning_rate": 8.165198026772025e-05, - "loss": 0.0705, - "step": 57190 - }, - { - "epoch": 3.742230945371279, - "grad_norm": 0.8981897234916687, - "learning_rate": 8.164486871649919e-05, - "loss": 0.074, - "step": 57200 - }, - { - "epoch": 3.7428851815505397, - "grad_norm": 0.8243358135223389, - "learning_rate": 8.16377560971808e-05, - "loss": 0.0758, - "step": 57210 - }, - { - "epoch": 3.7435394177298003, - "grad_norm": 0.7456525564193726, - "learning_rate": 8.163064241000517e-05, - "loss": 0.0586, - "step": 57220 - }, - { - "epoch": 3.7441936539090612, - "grad_norm": 0.9510387182235718, - "learning_rate": 8.162352765521243e-05, - "loss": 0.0635, - "step": 57230 - }, - { - "epoch": 3.7448478900883218, - "grad_norm": 0.9684381484985352, - "learning_rate": 8.161641183304267e-05, - "loss": 0.0621, - "step": 57240 - }, - { - "epoch": 3.7455021262675827, - "grad_norm": 0.8673816919326782, - "learning_rate": 8.16092949437361e-05, - "loss": 0.0764, - "step": 57250 - }, - { - "epoch": 3.7461563624468432, - "grad_norm": 0.7953469157218933, - "learning_rate": 8.160217698753291e-05, - "loss": 0.0687, - "step": 57260 - }, - { - "epoch": 3.746810598626104, - "grad_norm": 0.7955390214920044, - "learning_rate": 8.159505796467342e-05, - "loss": 0.0706, - "step": 57270 - }, - { - "epoch": 3.7474648348053647, - "grad_norm": 0.8372431397438049, - "learning_rate": 8.158793787539782e-05, - "loss": 0.0737, - "step": 57280 - }, - { - "epoch": 3.7481190709846253, - "grad_norm": 1.1341184377670288, - "learning_rate": 8.158081671994648e-05, - "loss": 0.0737, - "step": 57290 - }, - { - "epoch": 3.7487733071638862, - "grad_norm": 0.8782973289489746, - "learning_rate": 8.157369449855974e-05, - "loss": 0.0708, - "step": 57300 - }, - { - "epoch": 3.7494275433431468, - "grad_norm": 0.9155938625335693, - "learning_rate": 8.156657121147803e-05, - "loss": 0.0695, - "step": 57310 - }, - { - "epoch": 3.7500817795224077, - "grad_norm": 0.7682180404663086, - "learning_rate": 8.155944685894175e-05, - "loss": 0.0709, - "step": 57320 - }, - { - "epoch": 3.7507360157016683, - "grad_norm": 0.9652989506721497, - "learning_rate": 8.155232144119135e-05, - "loss": 0.0628, - "step": 57330 - }, - { - "epoch": 3.7513902518809292, - "grad_norm": 1.136732578277588, - "learning_rate": 8.154519495846737e-05, - "loss": 0.0689, - "step": 57340 - }, - { - "epoch": 3.7520444880601898, - "grad_norm": 0.9039020538330078, - "learning_rate": 8.153806741101033e-05, - "loss": 0.0792, - "step": 57350 - }, - { - "epoch": 3.7526987242394503, - "grad_norm": 0.8383570313453674, - "learning_rate": 8.15309387990608e-05, - "loss": 0.0765, - "step": 57360 - }, - { - "epoch": 3.7533529604187112, - "grad_norm": 0.9474335312843323, - "learning_rate": 8.15238091228594e-05, - "loss": 0.0641, - "step": 57370 - }, - { - "epoch": 3.7540071965979718, - "grad_norm": 0.7904636859893799, - "learning_rate": 8.151667838264677e-05, - "loss": 0.0684, - "step": 57380 - }, - { - "epoch": 3.7546614327772327, - "grad_norm": 0.861436128616333, - "learning_rate": 8.150954657866356e-05, - "loss": 0.0669, - "step": 57390 - }, - { - "epoch": 3.7553156689564933, - "grad_norm": 0.7418460249900818, - "learning_rate": 8.150241371115055e-05, - "loss": 0.0683, - "step": 57400 - }, - { - "epoch": 3.7559699051357542, - "grad_norm": 0.7864526510238647, - "learning_rate": 8.149527978034844e-05, - "loss": 0.0677, - "step": 57410 - }, - { - "epoch": 3.7566241413150148, - "grad_norm": 0.8774205446243286, - "learning_rate": 8.148814478649805e-05, - "loss": 0.0747, - "step": 57420 - }, - { - "epoch": 3.7572783774942753, - "grad_norm": 0.7475970387458801, - "learning_rate": 8.148100872984019e-05, - "loss": 0.0685, - "step": 57430 - }, - { - "epoch": 3.7579326136735363, - "grad_norm": 1.0144479274749756, - "learning_rate": 8.14738716106157e-05, - "loss": 0.0736, - "step": 57440 - }, - { - "epoch": 3.758586849852797, - "grad_norm": 1.0739880800247192, - "learning_rate": 8.146673342906552e-05, - "loss": 0.0676, - "step": 57450 - }, - { - "epoch": 3.7592410860320573, - "grad_norm": 0.9529927968978882, - "learning_rate": 8.145959418543057e-05, - "loss": 0.0645, - "step": 57460 - }, - { - "epoch": 3.7598953222113183, - "grad_norm": 0.8140749931335449, - "learning_rate": 8.14524538799518e-05, - "loss": 0.0721, - "step": 57470 - }, - { - "epoch": 3.7605495583905793, - "grad_norm": 0.7363076210021973, - "learning_rate": 8.144531251287024e-05, - "loss": 0.0655, - "step": 57480 - }, - { - "epoch": 3.7612037945698398, - "grad_norm": 0.9342272281646729, - "learning_rate": 8.14381700844269e-05, - "loss": 0.0688, - "step": 57490 - }, - { - "epoch": 3.7618580307491003, - "grad_norm": 0.8387247920036316, - "learning_rate": 8.143102659486287e-05, - "loss": 0.0605, - "step": 57500 - }, - { - "epoch": 3.7625122669283613, - "grad_norm": 1.0524139404296875, - "learning_rate": 8.142388204441927e-05, - "loss": 0.0688, - "step": 57510 - }, - { - "epoch": 3.763166503107622, - "grad_norm": 1.1220943927764893, - "learning_rate": 8.141673643333723e-05, - "loss": 0.064, - "step": 57520 - }, - { - "epoch": 3.7638207392868823, - "grad_norm": 0.7932508587837219, - "learning_rate": 8.140958976185794e-05, - "loss": 0.0703, - "step": 57530 - }, - { - "epoch": 3.7644749754661433, - "grad_norm": 0.8882383704185486, - "learning_rate": 8.140244203022262e-05, - "loss": 0.0643, - "step": 57540 - }, - { - "epoch": 3.7651292116454043, - "grad_norm": 0.8746048212051392, - "learning_rate": 8.139529323867254e-05, - "loss": 0.0687, - "step": 57550 - }, - { - "epoch": 3.765783447824665, - "grad_norm": 1.058364987373352, - "learning_rate": 8.138814338744896e-05, - "loss": 0.0711, - "step": 57560 - }, - { - "epoch": 3.7664376840039253, - "grad_norm": 0.7067134976387024, - "learning_rate": 8.138099247679322e-05, - "loss": 0.0648, - "step": 57570 - }, - { - "epoch": 3.7670919201831863, - "grad_norm": 0.7847724556922913, - "learning_rate": 8.13738405069467e-05, - "loss": 0.065, - "step": 57580 - }, - { - "epoch": 3.767746156362447, - "grad_norm": 0.9245442748069763, - "learning_rate": 8.136668747815078e-05, - "loss": 0.0655, - "step": 57590 - }, - { - "epoch": 3.7684003925417073, - "grad_norm": 0.940708339214325, - "learning_rate": 8.135953339064688e-05, - "loss": 0.0632, - "step": 57600 - }, - { - "epoch": 3.7690546287209683, - "grad_norm": 1.0202513933181763, - "learning_rate": 8.13523782446765e-05, - "loss": 0.0688, - "step": 57610 - }, - { - "epoch": 3.769708864900229, - "grad_norm": 1.0422993898391724, - "learning_rate": 8.134522204048112e-05, - "loss": 0.0748, - "step": 57620 - }, - { - "epoch": 3.77036310107949, - "grad_norm": 0.8074596524238586, - "learning_rate": 8.13380647783023e-05, - "loss": 0.0715, - "step": 57630 - }, - { - "epoch": 3.7710173372587503, - "grad_norm": 0.8613049983978271, - "learning_rate": 8.13309064583816e-05, - "loss": 0.0843, - "step": 57640 - }, - { - "epoch": 3.7716715734380113, - "grad_norm": 0.8982073664665222, - "learning_rate": 8.132374708096065e-05, - "loss": 0.0653, - "step": 57650 - }, - { - "epoch": 3.772325809617272, - "grad_norm": 0.8297070860862732, - "learning_rate": 8.131658664628107e-05, - "loss": 0.0653, - "step": 57660 - }, - { - "epoch": 3.7729800457965323, - "grad_norm": 0.9720892906188965, - "learning_rate": 8.130942515458456e-05, - "loss": 0.0639, - "step": 57670 - }, - { - "epoch": 3.7736342819757933, - "grad_norm": 0.9780619144439697, - "learning_rate": 8.130226260611284e-05, - "loss": 0.0744, - "step": 57680 - }, - { - "epoch": 3.774288518155054, - "grad_norm": 0.7452018857002258, - "learning_rate": 8.129509900110767e-05, - "loss": 0.0618, - "step": 57690 - }, - { - "epoch": 3.774942754334315, - "grad_norm": 0.9250910878181458, - "learning_rate": 8.128793433981084e-05, - "loss": 0.0693, - "step": 57700 - }, - { - "epoch": 3.7755969905135753, - "grad_norm": 1.0864530801773071, - "learning_rate": 8.128076862246416e-05, - "loss": 0.0729, - "step": 57710 - }, - { - "epoch": 3.7762512266928363, - "grad_norm": 0.7634531259536743, - "learning_rate": 8.127360184930952e-05, - "loss": 0.0699, - "step": 57720 - }, - { - "epoch": 3.776905462872097, - "grad_norm": 0.8827683925628662, - "learning_rate": 8.126643402058877e-05, - "loss": 0.0723, - "step": 57730 - }, - { - "epoch": 3.7775596990513574, - "grad_norm": 0.6894218325614929, - "learning_rate": 8.12592651365439e-05, - "loss": 0.0621, - "step": 57740 - }, - { - "epoch": 3.7782139352306183, - "grad_norm": 0.8839403986930847, - "learning_rate": 8.125209519741683e-05, - "loss": 0.0728, - "step": 57750 - }, - { - "epoch": 3.778868171409879, - "grad_norm": 0.7568190097808838, - "learning_rate": 8.124492420344961e-05, - "loss": 0.0659, - "step": 57760 - }, - { - "epoch": 3.77952240758914, - "grad_norm": 0.9448241591453552, - "learning_rate": 8.123775215488423e-05, - "loss": 0.0676, - "step": 57770 - }, - { - "epoch": 3.7801766437684003, - "grad_norm": 0.7295737862586975, - "learning_rate": 8.123057905196281e-05, - "loss": 0.0656, - "step": 57780 - }, - { - "epoch": 3.7808308799476613, - "grad_norm": 1.049312710762024, - "learning_rate": 8.122340489492743e-05, - "loss": 0.0662, - "step": 57790 - }, - { - "epoch": 3.781485116126922, - "grad_norm": 0.6659790277481079, - "learning_rate": 8.121622968402025e-05, - "loss": 0.063, - "step": 57800 - }, - { - "epoch": 3.7821393523061824, - "grad_norm": 1.10063636302948, - "learning_rate": 8.120905341948345e-05, - "loss": 0.0679, - "step": 57810 - }, - { - "epoch": 3.7827935884854433, - "grad_norm": 1.0684069395065308, - "learning_rate": 8.120187610155924e-05, - "loss": 0.0664, - "step": 57820 - }, - { - "epoch": 3.783447824664704, - "grad_norm": 0.7764497995376587, - "learning_rate": 8.119469773048992e-05, - "loss": 0.08, - "step": 57830 - }, - { - "epoch": 3.784102060843965, - "grad_norm": 0.8615806698799133, - "learning_rate": 8.11875183065177e-05, - "loss": 0.0738, - "step": 57840 - }, - { - "epoch": 3.7847562970232254, - "grad_norm": 0.8051279783248901, - "learning_rate": 8.118033782988496e-05, - "loss": 0.0642, - "step": 57850 - }, - { - "epoch": 3.7854105332024863, - "grad_norm": 0.7473447322845459, - "learning_rate": 8.117315630083404e-05, - "loss": 0.0814, - "step": 57860 - }, - { - "epoch": 3.786064769381747, - "grad_norm": 0.7426646947860718, - "learning_rate": 8.116597371960734e-05, - "loss": 0.0726, - "step": 57870 - }, - { - "epoch": 3.7867190055610074, - "grad_norm": 0.7152010798454285, - "learning_rate": 8.115879008644729e-05, - "loss": 0.0693, - "step": 57880 - }, - { - "epoch": 3.7873732417402683, - "grad_norm": 1.007025957107544, - "learning_rate": 8.115160540159636e-05, - "loss": 0.0714, - "step": 57890 - }, - { - "epoch": 3.788027477919529, - "grad_norm": 0.8072939515113831, - "learning_rate": 8.114441966529707e-05, - "loss": 0.0619, - "step": 57900 - }, - { - "epoch": 3.7886817140987894, - "grad_norm": 0.8661327362060547, - "learning_rate": 8.11372328777919e-05, - "loss": 0.0662, - "step": 57910 - }, - { - "epoch": 3.7893359502780504, - "grad_norm": 0.8400626182556152, - "learning_rate": 8.113004503932348e-05, - "loss": 0.0643, - "step": 57920 - }, - { - "epoch": 3.7899901864573113, - "grad_norm": 0.8952879309654236, - "learning_rate": 8.11228561501344e-05, - "loss": 0.0638, - "step": 57930 - }, - { - "epoch": 3.790644422636572, - "grad_norm": 1.0547457933425903, - "learning_rate": 8.11156662104673e-05, - "loss": 0.0639, - "step": 57940 - }, - { - "epoch": 3.7912986588158324, - "grad_norm": 1.0903702974319458, - "learning_rate": 8.110847522056485e-05, - "loss": 0.0619, - "step": 57950 - }, - { - "epoch": 3.7919528949950934, - "grad_norm": 0.9032535552978516, - "learning_rate": 8.11012831806698e-05, - "loss": 0.0725, - "step": 57960 - }, - { - "epoch": 3.792607131174354, - "grad_norm": 1.0580861568450928, - "learning_rate": 8.109409009102486e-05, - "loss": 0.0721, - "step": 57970 - }, - { - "epoch": 3.7932613673536144, - "grad_norm": 1.0803550481796265, - "learning_rate": 8.108689595187285e-05, - "loss": 0.0703, - "step": 57980 - }, - { - "epoch": 3.7939156035328754, - "grad_norm": 0.8773861527442932, - "learning_rate": 8.107970076345654e-05, - "loss": 0.067, - "step": 57990 - }, - { - "epoch": 3.7945698397121363, - "grad_norm": 0.7425010204315186, - "learning_rate": 8.107250452601885e-05, - "loss": 0.0721, - "step": 58000 - }, - { - "epoch": 3.795224075891397, - "grad_norm": 0.7278298139572144, - "learning_rate": 8.106530723980261e-05, - "loss": 0.0675, - "step": 58010 - }, - { - "epoch": 3.7958783120706574, - "grad_norm": 0.9992676973342896, - "learning_rate": 8.105810890505081e-05, - "loss": 0.0685, - "step": 58020 - }, - { - "epoch": 3.7965325482499184, - "grad_norm": 0.8017996549606323, - "learning_rate": 8.105090952200637e-05, - "loss": 0.0699, - "step": 58030 - }, - { - "epoch": 3.797186784429179, - "grad_norm": 1.016209363937378, - "learning_rate": 8.10437090909123e-05, - "loss": 0.0703, - "step": 58040 - }, - { - "epoch": 3.7978410206084394, - "grad_norm": 0.8442249894142151, - "learning_rate": 8.103650761201163e-05, - "loss": 0.0719, - "step": 58050 - }, - { - "epoch": 3.7984952567877004, - "grad_norm": 0.8751046061515808, - "learning_rate": 8.102930508554744e-05, - "loss": 0.07, - "step": 58060 - }, - { - "epoch": 3.799149492966961, - "grad_norm": 0.6956860423088074, - "learning_rate": 8.102210151176282e-05, - "loss": 0.0631, - "step": 58070 - }, - { - "epoch": 3.799803729146222, - "grad_norm": 0.9162025451660156, - "learning_rate": 8.101489689090091e-05, - "loss": 0.0699, - "step": 58080 - }, - { - "epoch": 3.8004579653254824, - "grad_norm": 0.9595727920532227, - "learning_rate": 8.10076912232049e-05, - "loss": 0.0604, - "step": 58090 - }, - { - "epoch": 3.8011122015047434, - "grad_norm": 0.9500911235809326, - "learning_rate": 8.100048450891799e-05, - "loss": 0.0625, - "step": 58100 - }, - { - "epoch": 3.801766437684004, - "grad_norm": 0.6980655193328857, - "learning_rate": 8.099327674828342e-05, - "loss": 0.0588, - "step": 58110 - }, - { - "epoch": 3.8024206738632644, - "grad_norm": 0.9078855514526367, - "learning_rate": 8.098606794154448e-05, - "loss": 0.0599, - "step": 58120 - }, - { - "epoch": 3.8030749100425254, - "grad_norm": 0.8954933881759644, - "learning_rate": 8.097885808894449e-05, - "loss": 0.0646, - "step": 58130 - }, - { - "epoch": 3.803729146221786, - "grad_norm": 0.8276561498641968, - "learning_rate": 8.097164719072679e-05, - "loss": 0.0717, - "step": 58140 - }, - { - "epoch": 3.804383382401047, - "grad_norm": 0.8717600703239441, - "learning_rate": 8.096443524713477e-05, - "loss": 0.069, - "step": 58150 - }, - { - "epoch": 3.8050376185803074, - "grad_norm": 0.9472079277038574, - "learning_rate": 8.095722225841185e-05, - "loss": 0.0702, - "step": 58160 - }, - { - "epoch": 3.8056918547595684, - "grad_norm": 0.8693017959594727, - "learning_rate": 8.09500082248015e-05, - "loss": 0.0697, - "step": 58170 - }, - { - "epoch": 3.806346090938829, - "grad_norm": 1.0408918857574463, - "learning_rate": 8.094279314654718e-05, - "loss": 0.0752, - "step": 58180 - }, - { - "epoch": 3.8070003271180894, - "grad_norm": 0.827529788017273, - "learning_rate": 8.093557702389246e-05, - "loss": 0.0712, - "step": 58190 - }, - { - "epoch": 3.8076545632973504, - "grad_norm": 0.6943844556808472, - "learning_rate": 8.092835985708088e-05, - "loss": 0.0648, - "step": 58200 - }, - { - "epoch": 3.808308799476611, - "grad_norm": 0.8800909519195557, - "learning_rate": 8.092114164635604e-05, - "loss": 0.0684, - "step": 58210 - }, - { - "epoch": 3.808963035655872, - "grad_norm": 0.7584820985794067, - "learning_rate": 8.091392239196159e-05, - "loss": 0.072, - "step": 58220 - }, - { - "epoch": 3.8096172718351324, - "grad_norm": 0.9473629593849182, - "learning_rate": 8.090670209414117e-05, - "loss": 0.0664, - "step": 58230 - }, - { - "epoch": 3.8102715080143934, - "grad_norm": 1.0267720222473145, - "learning_rate": 8.08994807531385e-05, - "loss": 0.0638, - "step": 58240 - }, - { - "epoch": 3.810925744193654, - "grad_norm": 0.7905268669128418, - "learning_rate": 8.089225836919732e-05, - "loss": 0.0636, - "step": 58250 - }, - { - "epoch": 3.8115799803729145, - "grad_norm": 0.8722211718559265, - "learning_rate": 8.08850349425614e-05, - "loss": 0.0767, - "step": 58260 - }, - { - "epoch": 3.8122342165521754, - "grad_norm": 0.8511031866073608, - "learning_rate": 8.087781047347455e-05, - "loss": 0.0705, - "step": 58270 - }, - { - "epoch": 3.812888452731436, - "grad_norm": 0.8996208310127258, - "learning_rate": 8.087058496218063e-05, - "loss": 0.0645, - "step": 58280 - }, - { - "epoch": 3.813542688910697, - "grad_norm": 0.975518524646759, - "learning_rate": 8.08633584089235e-05, - "loss": 0.0808, - "step": 58290 - }, - { - "epoch": 3.8141969250899574, - "grad_norm": 0.9929527044296265, - "learning_rate": 8.085613081394708e-05, - "loss": 0.0718, - "step": 58300 - }, - { - "epoch": 3.8148511612692184, - "grad_norm": 0.8056535124778748, - "learning_rate": 8.084890217749532e-05, - "loss": 0.0671, - "step": 58310 - }, - { - "epoch": 3.815505397448479, - "grad_norm": 0.766545832157135, - "learning_rate": 8.084167249981219e-05, - "loss": 0.0711, - "step": 58320 - }, - { - "epoch": 3.8161596336277395, - "grad_norm": 1.0355461835861206, - "learning_rate": 8.083444178114174e-05, - "loss": 0.0759, - "step": 58330 - }, - { - "epoch": 3.8168138698070004, - "grad_norm": 0.8508825898170471, - "learning_rate": 8.082721002172801e-05, - "loss": 0.0696, - "step": 58340 - }, - { - "epoch": 3.817468105986261, - "grad_norm": 0.863624095916748, - "learning_rate": 8.081997722181512e-05, - "loss": 0.073, - "step": 58350 - }, - { - "epoch": 3.8181223421655215, - "grad_norm": 0.8428578972816467, - "learning_rate": 8.081274338164714e-05, - "loss": 0.0697, - "step": 58360 - }, - { - "epoch": 3.8187765783447825, - "grad_norm": 0.8792449235916138, - "learning_rate": 8.080550850146829e-05, - "loss": 0.0665, - "step": 58370 - }, - { - "epoch": 3.8194308145240434, - "grad_norm": 0.7911483645439148, - "learning_rate": 8.079827258152272e-05, - "loss": 0.0738, - "step": 58380 - }, - { - "epoch": 3.820085050703304, - "grad_norm": 0.9824073314666748, - "learning_rate": 8.079103562205468e-05, - "loss": 0.0693, - "step": 58390 - }, - { - "epoch": 3.8207392868825645, - "grad_norm": 0.77364581823349, - "learning_rate": 8.078379762330843e-05, - "loss": 0.0666, - "step": 58400 - }, - { - "epoch": 3.8213935230618254, - "grad_norm": 0.8866234421730042, - "learning_rate": 8.07765585855283e-05, - "loss": 0.0706, - "step": 58410 - }, - { - "epoch": 3.822047759241086, - "grad_norm": 0.7510147094726562, - "learning_rate": 8.076931850895859e-05, - "loss": 0.0679, - "step": 58420 - }, - { - "epoch": 3.8227019954203465, - "grad_norm": 0.9007807374000549, - "learning_rate": 8.076207739384368e-05, - "loss": 0.073, - "step": 58430 - }, - { - "epoch": 3.8233562315996075, - "grad_norm": 0.8261735439300537, - "learning_rate": 8.075483524042797e-05, - "loss": 0.0709, - "step": 58440 - }, - { - "epoch": 3.8240104677788684, - "grad_norm": 0.7386045455932617, - "learning_rate": 8.074759204895593e-05, - "loss": 0.0606, - "step": 58450 - }, - { - "epoch": 3.824664703958129, - "grad_norm": 1.0054618120193481, - "learning_rate": 8.0740347819672e-05, - "loss": 0.0652, - "step": 58460 - }, - { - "epoch": 3.8253189401373895, - "grad_norm": 0.8960070610046387, - "learning_rate": 8.073310255282074e-05, - "loss": 0.0707, - "step": 58470 - }, - { - "epoch": 3.8259731763166505, - "grad_norm": 0.8370187282562256, - "learning_rate": 8.072585624864666e-05, - "loss": 0.0668, - "step": 58480 - }, - { - "epoch": 3.826627412495911, - "grad_norm": 0.9938347935676575, - "learning_rate": 8.071860890739435e-05, - "loss": 0.0786, - "step": 58490 - }, - { - "epoch": 3.8272816486751715, - "grad_norm": 1.007035732269287, - "learning_rate": 8.071136052930843e-05, - "loss": 0.0697, - "step": 58500 - }, - { - "epoch": 3.8279358848544325, - "grad_norm": 0.86492520570755, - "learning_rate": 8.070411111463353e-05, - "loss": 0.0749, - "step": 58510 - }, - { - "epoch": 3.828590121033693, - "grad_norm": 0.7350853085517883, - "learning_rate": 8.069686066361437e-05, - "loss": 0.0648, - "step": 58520 - }, - { - "epoch": 3.829244357212954, - "grad_norm": 1.0213762521743774, - "learning_rate": 8.068960917649566e-05, - "loss": 0.0731, - "step": 58530 - }, - { - "epoch": 3.8298985933922145, - "grad_norm": 0.8180403113365173, - "learning_rate": 8.068235665352214e-05, - "loss": 0.0651, - "step": 58540 - }, - { - "epoch": 3.8305528295714755, - "grad_norm": 0.8237452507019043, - "learning_rate": 8.067510309493861e-05, - "loss": 0.0702, - "step": 58550 - }, - { - "epoch": 3.831207065750736, - "grad_norm": 0.7584102153778076, - "learning_rate": 8.066784850098992e-05, - "loss": 0.062, - "step": 58560 - }, - { - "epoch": 3.8318613019299965, - "grad_norm": 1.0070369243621826, - "learning_rate": 8.066059287192091e-05, - "loss": 0.0657, - "step": 58570 - }, - { - "epoch": 3.8325155381092575, - "grad_norm": 1.0255883932113647, - "learning_rate": 8.065333620797649e-05, - "loss": 0.0699, - "step": 58580 - }, - { - "epoch": 3.833169774288518, - "grad_norm": 0.8956732153892517, - "learning_rate": 8.064607850940156e-05, - "loss": 0.0733, - "step": 58590 - }, - { - "epoch": 3.833824010467779, - "grad_norm": 0.8609063625335693, - "learning_rate": 8.063881977644112e-05, - "loss": 0.0692, - "step": 58600 - }, - { - "epoch": 3.8344782466470395, - "grad_norm": 1.1151846647262573, - "learning_rate": 8.063156000934016e-05, - "loss": 0.0674, - "step": 58610 - }, - { - "epoch": 3.8351324828263005, - "grad_norm": 0.8186620473861694, - "learning_rate": 8.062429920834372e-05, - "loss": 0.0711, - "step": 58620 - }, - { - "epoch": 3.835786719005561, - "grad_norm": 0.8908758759498596, - "learning_rate": 8.061703737369686e-05, - "loss": 0.0635, - "step": 58630 - }, - { - "epoch": 3.8364409551848215, - "grad_norm": 1.0307601690292358, - "learning_rate": 8.060977450564469e-05, - "loss": 0.0708, - "step": 58640 - }, - { - "epoch": 3.8370951913640825, - "grad_norm": 0.8627874851226807, - "learning_rate": 8.060251060443236e-05, - "loss": 0.0646, - "step": 58650 - }, - { - "epoch": 3.837749427543343, - "grad_norm": 0.9624312520027161, - "learning_rate": 8.059524567030503e-05, - "loss": 0.0685, - "step": 58660 - }, - { - "epoch": 3.838403663722604, - "grad_norm": 0.8009755611419678, - "learning_rate": 8.058797970350793e-05, - "loss": 0.064, - "step": 58670 - }, - { - "epoch": 3.8390578999018645, - "grad_norm": 0.9126613140106201, - "learning_rate": 8.058071270428628e-05, - "loss": 0.0725, - "step": 58680 - }, - { - "epoch": 3.8397121360811255, - "grad_norm": 0.7582046389579773, - "learning_rate": 8.057344467288539e-05, - "loss": 0.0622, - "step": 58690 - }, - { - "epoch": 3.840366372260386, - "grad_norm": 0.7236906886100769, - "learning_rate": 8.056617560955056e-05, - "loss": 0.0658, - "step": 58700 - }, - { - "epoch": 3.8410206084396465, - "grad_norm": 0.7379127144813538, - "learning_rate": 8.055890551452714e-05, - "loss": 0.0655, - "step": 58710 - }, - { - "epoch": 3.8416748446189075, - "grad_norm": 0.8637368679046631, - "learning_rate": 8.055163438806051e-05, - "loss": 0.0803, - "step": 58720 - }, - { - "epoch": 3.842329080798168, - "grad_norm": 0.9424805045127869, - "learning_rate": 8.05443622303961e-05, - "loss": 0.0723, - "step": 58730 - }, - { - "epoch": 3.842983316977429, - "grad_norm": 0.9135764241218567, - "learning_rate": 8.053708904177934e-05, - "loss": 0.0717, - "step": 58740 - }, - { - "epoch": 3.8436375531566895, - "grad_norm": 0.7317261695861816, - "learning_rate": 8.052981482245577e-05, - "loss": 0.0679, - "step": 58750 - }, - { - "epoch": 3.8442917893359505, - "grad_norm": 1.1053870916366577, - "learning_rate": 8.052253957267086e-05, - "loss": 0.0699, - "step": 58760 - }, - { - "epoch": 3.844946025515211, - "grad_norm": 1.033487319946289, - "learning_rate": 8.05152632926702e-05, - "loss": 0.0776, - "step": 58770 - }, - { - "epoch": 3.8456002616944716, - "grad_norm": 1.0361863374710083, - "learning_rate": 8.050798598269937e-05, - "loss": 0.0778, - "step": 58780 - }, - { - "epoch": 3.8462544978737325, - "grad_norm": 0.8072547912597656, - "learning_rate": 8.050070764300401e-05, - "loss": 0.0695, - "step": 58790 - }, - { - "epoch": 3.846908734052993, - "grad_norm": 0.7639461159706116, - "learning_rate": 8.049342827382977e-05, - "loss": 0.0718, - "step": 58800 - }, - { - "epoch": 3.8475629702322536, - "grad_norm": 0.9850027561187744, - "learning_rate": 8.048614787542234e-05, - "loss": 0.0647, - "step": 58810 - }, - { - "epoch": 3.8482172064115145, - "grad_norm": 0.8612027168273926, - "learning_rate": 8.047886644802749e-05, - "loss": 0.0667, - "step": 58820 - }, - { - "epoch": 3.8488714425907755, - "grad_norm": 0.7390583157539368, - "learning_rate": 8.047158399189096e-05, - "loss": 0.0606, - "step": 58830 - }, - { - "epoch": 3.849525678770036, - "grad_norm": 0.8510991930961609, - "learning_rate": 8.046430050725854e-05, - "loss": 0.0688, - "step": 58840 - }, - { - "epoch": 3.8501799149492966, - "grad_norm": 0.8368331789970398, - "learning_rate": 8.045701599437609e-05, - "loss": 0.0683, - "step": 58850 - }, - { - "epoch": 3.8508341511285575, - "grad_norm": 1.0364656448364258, - "learning_rate": 8.044973045348949e-05, - "loss": 0.0755, - "step": 58860 - }, - { - "epoch": 3.851488387307818, - "grad_norm": 0.7408995628356934, - "learning_rate": 8.044244388484463e-05, - "loss": 0.0695, - "step": 58870 - }, - { - "epoch": 3.8521426234870786, - "grad_norm": 0.7264641523361206, - "learning_rate": 8.043515628868743e-05, - "loss": 0.0771, - "step": 58880 - }, - { - "epoch": 3.8527968596663396, - "grad_norm": 0.8603324294090271, - "learning_rate": 8.042786766526389e-05, - "loss": 0.0619, - "step": 58890 - }, - { - "epoch": 3.8534510958456005, - "grad_norm": 1.0376695394515991, - "learning_rate": 8.042057801482001e-05, - "loss": 0.0681, - "step": 58900 - }, - { - "epoch": 3.854105332024861, - "grad_norm": 0.7530226111412048, - "learning_rate": 8.041328733760185e-05, - "loss": 0.0709, - "step": 58910 - }, - { - "epoch": 3.8547595682041216, - "grad_norm": 0.8410870432853699, - "learning_rate": 8.040599563385548e-05, - "loss": 0.0628, - "step": 58920 - }, - { - "epoch": 3.8554138043833825, - "grad_norm": 1.0103882551193237, - "learning_rate": 8.039870290382703e-05, - "loss": 0.0743, - "step": 58930 - }, - { - "epoch": 3.856068040562643, - "grad_norm": 0.925134539604187, - "learning_rate": 8.039140914776262e-05, - "loss": 0.0767, - "step": 58940 - }, - { - "epoch": 3.8567222767419036, - "grad_norm": 0.9100943803787231, - "learning_rate": 8.038411436590845e-05, - "loss": 0.0697, - "step": 58950 - }, - { - "epoch": 3.8573765129211646, - "grad_norm": 0.9153822660446167, - "learning_rate": 8.037681855851072e-05, - "loss": 0.0622, - "step": 58960 - }, - { - "epoch": 3.858030749100425, - "grad_norm": 0.9018275737762451, - "learning_rate": 8.036952172581571e-05, - "loss": 0.0669, - "step": 58970 - }, - { - "epoch": 3.858684985279686, - "grad_norm": 0.9553847908973694, - "learning_rate": 8.03622238680697e-05, - "loss": 0.0738, - "step": 58980 - }, - { - "epoch": 3.8593392214589466, - "grad_norm": 0.8879307508468628, - "learning_rate": 8.0354924985519e-05, - "loss": 0.0707, - "step": 58990 - }, - { - "epoch": 3.8599934576382076, - "grad_norm": 0.8077122569084167, - "learning_rate": 8.034762507840997e-05, - "loss": 0.0819, - "step": 59000 - }, - { - "epoch": 3.860647693817468, - "grad_norm": 0.7566487193107605, - "learning_rate": 8.034032414698901e-05, - "loss": 0.0752, - "step": 59010 - }, - { - "epoch": 3.8613019299967286, - "grad_norm": 0.8122360706329346, - "learning_rate": 8.033302219150253e-05, - "loss": 0.0715, - "step": 59020 - }, - { - "epoch": 3.8619561661759896, - "grad_norm": 1.0947107076644897, - "learning_rate": 8.0325719212197e-05, - "loss": 0.0641, - "step": 59030 - }, - { - "epoch": 3.86261040235525, - "grad_norm": 0.782038688659668, - "learning_rate": 8.031841520931893e-05, - "loss": 0.0608, - "step": 59040 - }, - { - "epoch": 3.863264638534511, - "grad_norm": 0.7737072706222534, - "learning_rate": 8.031111018311483e-05, - "loss": 0.0664, - "step": 59050 - }, - { - "epoch": 3.8639188747137716, - "grad_norm": 0.8657538890838623, - "learning_rate": 8.030380413383125e-05, - "loss": 0.0669, - "step": 59060 - }, - { - "epoch": 3.8645731108930326, - "grad_norm": 0.8043084144592285, - "learning_rate": 8.029649706171483e-05, - "loss": 0.0719, - "step": 59070 - }, - { - "epoch": 3.865227347072293, - "grad_norm": 0.7678789496421814, - "learning_rate": 8.028918896701217e-05, - "loss": 0.0679, - "step": 59080 - }, - { - "epoch": 3.8658815832515536, - "grad_norm": 0.935185432434082, - "learning_rate": 8.028187984996993e-05, - "loss": 0.0701, - "step": 59090 - }, - { - "epoch": 3.8665358194308146, - "grad_norm": 0.9353100061416626, - "learning_rate": 8.027456971083485e-05, - "loss": 0.0636, - "step": 59100 - }, - { - "epoch": 3.867190055610075, - "grad_norm": 0.8760929107666016, - "learning_rate": 8.026725854985363e-05, - "loss": 0.0776, - "step": 59110 - }, - { - "epoch": 3.867844291789336, - "grad_norm": 0.9680326581001282, - "learning_rate": 8.025994636727306e-05, - "loss": 0.0749, - "step": 59120 - }, - { - "epoch": 3.8684985279685966, - "grad_norm": 0.6920056343078613, - "learning_rate": 8.025263316333994e-05, - "loss": 0.0715, - "step": 59130 - }, - { - "epoch": 3.8691527641478576, - "grad_norm": 0.9525841474533081, - "learning_rate": 8.024531893830112e-05, - "loss": 0.0673, - "step": 59140 - }, - { - "epoch": 3.869807000327118, - "grad_norm": 0.8676741719245911, - "learning_rate": 8.023800369240344e-05, - "loss": 0.0744, - "step": 59150 - }, - { - "epoch": 3.8704612365063786, - "grad_norm": 0.8752907514572144, - "learning_rate": 8.023068742589386e-05, - "loss": 0.0646, - "step": 59160 - }, - { - "epoch": 3.8711154726856396, - "grad_norm": 0.9634379744529724, - "learning_rate": 8.022337013901928e-05, - "loss": 0.0594, - "step": 59170 - }, - { - "epoch": 3.8717697088649, - "grad_norm": 0.7896409034729004, - "learning_rate": 8.02160518320267e-05, - "loss": 0.072, - "step": 59180 - }, - { - "epoch": 3.872423945044161, - "grad_norm": 0.8738341927528381, - "learning_rate": 8.020873250516312e-05, - "loss": 0.0677, - "step": 59190 - }, - { - "epoch": 3.8730781812234216, - "grad_norm": 0.8021124601364136, - "learning_rate": 8.02014121586756e-05, - "loss": 0.0841, - "step": 59200 - }, - { - "epoch": 3.8737324174026826, - "grad_norm": 0.9408907890319824, - "learning_rate": 8.019409079281122e-05, - "loss": 0.0625, - "step": 59210 - }, - { - "epoch": 3.874386653581943, - "grad_norm": 0.8680176138877869, - "learning_rate": 8.018676840781707e-05, - "loss": 0.0702, - "step": 59220 - }, - { - "epoch": 3.8750408897612036, - "grad_norm": 0.6961979269981384, - "learning_rate": 8.017944500394033e-05, - "loss": 0.0578, - "step": 59230 - }, - { - "epoch": 3.8756951259404646, - "grad_norm": 0.8254890441894531, - "learning_rate": 8.017212058142817e-05, - "loss": 0.0717, - "step": 59240 - }, - { - "epoch": 3.876349362119725, - "grad_norm": 1.0662809610366821, - "learning_rate": 8.016479514052783e-05, - "loss": 0.0675, - "step": 59250 - }, - { - "epoch": 3.8770035982989857, - "grad_norm": 0.8766838908195496, - "learning_rate": 8.015746868148651e-05, - "loss": 0.0688, - "step": 59260 - }, - { - "epoch": 3.8776578344782466, - "grad_norm": 0.8927856683731079, - "learning_rate": 8.015014120455156e-05, - "loss": 0.0628, - "step": 59270 - }, - { - "epoch": 3.8783120706575076, - "grad_norm": 0.8246244192123413, - "learning_rate": 8.014281270997026e-05, - "loss": 0.0691, - "step": 59280 - }, - { - "epoch": 3.878966306836768, - "grad_norm": 0.9650526642799377, - "learning_rate": 8.013548319798998e-05, - "loss": 0.0771, - "step": 59290 - }, - { - "epoch": 3.8796205430160287, - "grad_norm": 0.6766818165779114, - "learning_rate": 8.012815266885811e-05, - "loss": 0.0567, - "step": 59300 - }, - { - "epoch": 3.8802747791952896, - "grad_norm": 0.8025507926940918, - "learning_rate": 8.012082112282207e-05, - "loss": 0.0614, - "step": 59310 - }, - { - "epoch": 3.88092901537455, - "grad_norm": 0.8738349080085754, - "learning_rate": 8.011348856012932e-05, - "loss": 0.0754, - "step": 59320 - }, - { - "epoch": 3.8815832515538107, - "grad_norm": 0.819463312625885, - "learning_rate": 8.010615498102736e-05, - "loss": 0.0634, - "step": 59330 - }, - { - "epoch": 3.8822374877330716, - "grad_norm": 0.7592009902000427, - "learning_rate": 8.009882038576371e-05, - "loss": 0.0649, - "step": 59340 - }, - { - "epoch": 3.8828917239123326, - "grad_norm": 1.0910615921020508, - "learning_rate": 8.009148477458594e-05, - "loss": 0.0777, - "step": 59350 - }, - { - "epoch": 3.883545960091593, - "grad_norm": 0.8218048214912415, - "learning_rate": 8.008414814774163e-05, - "loss": 0.064, - "step": 59360 - }, - { - "epoch": 3.8842001962708537, - "grad_norm": 0.8546844720840454, - "learning_rate": 8.007681050547844e-05, - "loss": 0.0787, - "step": 59370 - }, - { - "epoch": 3.8848544324501146, - "grad_norm": 0.8277244567871094, - "learning_rate": 8.0069471848044e-05, - "loss": 0.0666, - "step": 59380 - }, - { - "epoch": 3.885508668629375, - "grad_norm": 0.7746628522872925, - "learning_rate": 8.006213217568604e-05, - "loss": 0.0641, - "step": 59390 - }, - { - "epoch": 3.8861629048086357, - "grad_norm": 1.0281668901443481, - "learning_rate": 8.005479148865226e-05, - "loss": 0.0729, - "step": 59400 - }, - { - "epoch": 3.8868171409878967, - "grad_norm": 0.7792064547538757, - "learning_rate": 8.004744978719046e-05, - "loss": 0.0722, - "step": 59410 - }, - { - "epoch": 3.887471377167157, - "grad_norm": 0.7201108336448669, - "learning_rate": 8.004010707154843e-05, - "loss": 0.0694, - "step": 59420 - }, - { - "epoch": 3.888125613346418, - "grad_norm": 0.8858181834220886, - "learning_rate": 8.003276334197399e-05, - "loss": 0.0624, - "step": 59430 - }, - { - "epoch": 3.8887798495256787, - "grad_norm": 0.8069655895233154, - "learning_rate": 8.002541859871502e-05, - "loss": 0.0636, - "step": 59440 - }, - { - "epoch": 3.8894340857049396, - "grad_norm": 0.812025249004364, - "learning_rate": 8.001807284201944e-05, - "loss": 0.0638, - "step": 59450 - }, - { - "epoch": 3.8900883218842, - "grad_norm": 0.8210043907165527, - "learning_rate": 8.001072607213518e-05, - "loss": 0.0755, - "step": 59460 - }, - { - "epoch": 3.8907425580634607, - "grad_norm": 0.8998602032661438, - "learning_rate": 8.000337828931021e-05, - "loss": 0.0702, - "step": 59470 - }, - { - "epoch": 3.8913967942427217, - "grad_norm": 0.7417557239532471, - "learning_rate": 7.999602949379252e-05, - "loss": 0.0666, - "step": 59480 - }, - { - "epoch": 3.892051030421982, - "grad_norm": 0.778843104839325, - "learning_rate": 7.998867968583018e-05, - "loss": 0.0689, - "step": 59490 - }, - { - "epoch": 3.892705266601243, - "grad_norm": 0.7285590767860413, - "learning_rate": 7.998132886567125e-05, - "loss": 0.0667, - "step": 59500 - }, - { - "epoch": 3.8933595027805037, - "grad_norm": 0.8084492683410645, - "learning_rate": 7.997397703356384e-05, - "loss": 0.0588, - "step": 59510 - }, - { - "epoch": 3.8940137389597647, - "grad_norm": 0.8009549379348755, - "learning_rate": 7.996662418975609e-05, - "loss": 0.0634, - "step": 59520 - }, - { - "epoch": 3.894667975139025, - "grad_norm": 0.8678660988807678, - "learning_rate": 7.99592703344962e-05, - "loss": 0.0692, - "step": 59530 - }, - { - "epoch": 3.8953222113182857, - "grad_norm": 1.06533682346344, - "learning_rate": 7.995191546803235e-05, - "loss": 0.0712, - "step": 59540 - }, - { - "epoch": 3.8959764474975467, - "grad_norm": 1.0511845350265503, - "learning_rate": 7.99445595906128e-05, - "loss": 0.0764, - "step": 59550 - }, - { - "epoch": 3.896630683676807, - "grad_norm": 0.9604825973510742, - "learning_rate": 7.993720270248584e-05, - "loss": 0.0743, - "step": 59560 - }, - { - "epoch": 3.897284919856068, - "grad_norm": 0.9758427739143372, - "learning_rate": 7.992984480389977e-05, - "loss": 0.0586, - "step": 59570 - }, - { - "epoch": 3.8979391560353287, - "grad_norm": 0.862629771232605, - "learning_rate": 7.992248589510293e-05, - "loss": 0.0597, - "step": 59580 - }, - { - "epoch": 3.8985933922145897, - "grad_norm": 0.876379132270813, - "learning_rate": 7.991512597634375e-05, - "loss": 0.0651, - "step": 59590 - }, - { - "epoch": 3.89924762839385, - "grad_norm": 1.1144251823425293, - "learning_rate": 7.990776504787059e-05, - "loss": 0.0755, - "step": 59600 - }, - { - "epoch": 3.8999018645731107, - "grad_norm": 0.9200652837753296, - "learning_rate": 7.990040310993193e-05, - "loss": 0.058, - "step": 59610 - }, - { - "epoch": 3.9005561007523717, - "grad_norm": 0.8671579360961914, - "learning_rate": 7.989304016277625e-05, - "loss": 0.0728, - "step": 59620 - }, - { - "epoch": 3.901210336931632, - "grad_norm": 0.8644618391990662, - "learning_rate": 7.988567620665206e-05, - "loss": 0.0712, - "step": 59630 - }, - { - "epoch": 3.901864573110893, - "grad_norm": 1.062548279762268, - "learning_rate": 7.987831124180792e-05, - "loss": 0.0605, - "step": 59640 - }, - { - "epoch": 3.9025188092901537, - "grad_norm": 0.7029675841331482, - "learning_rate": 7.987094526849242e-05, - "loss": 0.0605, - "step": 59650 - }, - { - "epoch": 3.9031730454694147, - "grad_norm": 0.7304003834724426, - "learning_rate": 7.986357828695419e-05, - "loss": 0.0624, - "step": 59660 - }, - { - "epoch": 3.903827281648675, - "grad_norm": 0.8633514046669006, - "learning_rate": 7.985621029744186e-05, - "loss": 0.074, - "step": 59670 - }, - { - "epoch": 3.9044815178279357, - "grad_norm": 1.0137802362442017, - "learning_rate": 7.984884130020414e-05, - "loss": 0.0597, - "step": 59680 - }, - { - "epoch": 3.9051357540071967, - "grad_norm": 1.002747893333435, - "learning_rate": 7.984147129548973e-05, - "loss": 0.0765, - "step": 59690 - }, - { - "epoch": 3.9057899901864572, - "grad_norm": 0.7557197213172913, - "learning_rate": 7.983410028354741e-05, - "loss": 0.0758, - "step": 59700 - }, - { - "epoch": 3.9064442263657178, - "grad_norm": 1.0572227239608765, - "learning_rate": 7.982672826462595e-05, - "loss": 0.0619, - "step": 59710 - }, - { - "epoch": 3.9070984625449787, - "grad_norm": 0.8649874925613403, - "learning_rate": 7.981935523897421e-05, - "loss": 0.0617, - "step": 59720 - }, - { - "epoch": 3.9077526987242397, - "grad_norm": 0.711691677570343, - "learning_rate": 7.981198120684101e-05, - "loss": 0.066, - "step": 59730 - }, - { - "epoch": 3.9084069349035, - "grad_norm": 0.9565288424491882, - "learning_rate": 7.980460616847527e-05, - "loss": 0.0708, - "step": 59740 - }, - { - "epoch": 3.9090611710827607, - "grad_norm": 0.9531120657920837, - "learning_rate": 7.97972301241259e-05, - "loss": 0.0662, - "step": 59750 - }, - { - "epoch": 3.9097154072620217, - "grad_norm": 0.8800640106201172, - "learning_rate": 7.978985307404187e-05, - "loss": 0.0723, - "step": 59760 - }, - { - "epoch": 3.9103696434412822, - "grad_norm": 0.7907775044441223, - "learning_rate": 7.978247501847216e-05, - "loss": 0.0713, - "step": 59770 - }, - { - "epoch": 3.9110238796205428, - "grad_norm": 0.8069142699241638, - "learning_rate": 7.977509595766583e-05, - "loss": 0.0609, - "step": 59780 - }, - { - "epoch": 3.9116781157998037, - "grad_norm": 0.9594196677207947, - "learning_rate": 7.976771589187193e-05, - "loss": 0.0742, - "step": 59790 - }, - { - "epoch": 3.9123323519790647, - "grad_norm": 0.9948168992996216, - "learning_rate": 7.976033482133953e-05, - "loss": 0.0713, - "step": 59800 - }, - { - "epoch": 3.9129865881583252, - "grad_norm": 1.1436748504638672, - "learning_rate": 7.975295274631777e-05, - "loss": 0.0675, - "step": 59810 - }, - { - "epoch": 3.9136408243375858, - "grad_norm": 0.8440989255905151, - "learning_rate": 7.974556966705584e-05, - "loss": 0.0783, - "step": 59820 - }, - { - "epoch": 3.9142950605168467, - "grad_norm": 0.7184342741966248, - "learning_rate": 7.973818558380294e-05, - "loss": 0.0633, - "step": 59830 - }, - { - "epoch": 3.9149492966961073, - "grad_norm": 0.8069654703140259, - "learning_rate": 7.973080049680825e-05, - "loss": 0.0722, - "step": 59840 - }, - { - "epoch": 3.9156035328753678, - "grad_norm": 1.0971922874450684, - "learning_rate": 7.972341440632109e-05, - "loss": 0.0677, - "step": 59850 - }, - { - "epoch": 3.9162577690546287, - "grad_norm": 0.946134626865387, - "learning_rate": 7.971602731259075e-05, - "loss": 0.0721, - "step": 59860 - }, - { - "epoch": 3.9169120052338893, - "grad_norm": 0.9672790765762329, - "learning_rate": 7.970863921586655e-05, - "loss": 0.0707, - "step": 59870 - }, - { - "epoch": 3.9175662414131502, - "grad_norm": 1.017909288406372, - "learning_rate": 7.970125011639786e-05, - "loss": 0.0661, - "step": 59880 - }, - { - "epoch": 3.9182204775924108, - "grad_norm": 0.9773590564727783, - "learning_rate": 7.969386001443408e-05, - "loss": 0.0664, - "step": 59890 - }, - { - "epoch": 3.9188747137716717, - "grad_norm": 1.026491403579712, - "learning_rate": 7.968646891022466e-05, - "loss": 0.0707, - "step": 59900 - }, - { - "epoch": 3.9195289499509323, - "grad_norm": 0.9397926926612854, - "learning_rate": 7.967907680401904e-05, - "loss": 0.0628, - "step": 59910 - }, - { - "epoch": 3.920183186130193, - "grad_norm": 0.814552366733551, - "learning_rate": 7.967168369606676e-05, - "loss": 0.0702, - "step": 59920 - }, - { - "epoch": 3.9208374223094538, - "grad_norm": 1.042052149772644, - "learning_rate": 7.966428958661734e-05, - "loss": 0.0678, - "step": 59930 - }, - { - "epoch": 3.9214916584887143, - "grad_norm": 0.8458235859870911, - "learning_rate": 7.965689447592035e-05, - "loss": 0.0752, - "step": 59940 - }, - { - "epoch": 3.9221458946679753, - "grad_norm": 0.748357355594635, - "learning_rate": 7.964949836422537e-05, - "loss": 0.0692, - "step": 59950 - }, - { - "epoch": 3.9228001308472358, - "grad_norm": 0.9684262275695801, - "learning_rate": 7.964210125178209e-05, - "loss": 0.0646, - "step": 59960 - }, - { - "epoch": 3.9234543670264967, - "grad_norm": 1.0138685703277588, - "learning_rate": 7.963470313884011e-05, - "loss": 0.0704, - "step": 59970 - }, - { - "epoch": 3.9241086032057573, - "grad_norm": 0.9098789095878601, - "learning_rate": 7.962730402564924e-05, - "loss": 0.0663, - "step": 59980 - }, - { - "epoch": 3.924762839385018, - "grad_norm": 0.9759104251861572, - "learning_rate": 7.961990391245911e-05, - "loss": 0.0704, - "step": 59990 - }, - { - "epoch": 3.9254170755642788, - "grad_norm": 0.8418338894844055, - "learning_rate": 7.961250279951956e-05, - "loss": 0.066, - "step": 60000 - }, - { - "epoch": 3.9260713117435393, - "grad_norm": 0.8269692659378052, - "learning_rate": 7.960510068708039e-05, - "loss": 0.0734, - "step": 60010 - }, - { - "epoch": 3.9267255479228003, - "grad_norm": 1.259716510772705, - "learning_rate": 7.959769757539142e-05, - "loss": 0.0765, - "step": 60020 - }, - { - "epoch": 3.927379784102061, - "grad_norm": 0.7313357591629028, - "learning_rate": 7.959029346470252e-05, - "loss": 0.0689, - "step": 60030 - }, - { - "epoch": 3.9280340202813218, - "grad_norm": 1.043642282485962, - "learning_rate": 7.958288835526362e-05, - "loss": 0.0626, - "step": 60040 - }, - { - "epoch": 3.9286882564605823, - "grad_norm": 1.0208543539047241, - "learning_rate": 7.957548224732467e-05, - "loss": 0.0666, - "step": 60050 - }, - { - "epoch": 3.929342492639843, - "grad_norm": 0.9676765203475952, - "learning_rate": 7.956807514113562e-05, - "loss": 0.0722, - "step": 60060 - }, - { - "epoch": 3.929996728819104, - "grad_norm": 0.8631986975669861, - "learning_rate": 7.956066703694647e-05, - "loss": 0.0663, - "step": 60070 - }, - { - "epoch": 3.9306509649983643, - "grad_norm": 0.9647482633590698, - "learning_rate": 7.95532579350073e-05, - "loss": 0.064, - "step": 60080 - }, - { - "epoch": 3.9313052011776253, - "grad_norm": 0.7296038866043091, - "learning_rate": 7.954584783556818e-05, - "loss": 0.0616, - "step": 60090 - }, - { - "epoch": 3.931959437356886, - "grad_norm": 0.8985334038734436, - "learning_rate": 7.953843673887919e-05, - "loss": 0.0698, - "step": 60100 - }, - { - "epoch": 3.9326136735361468, - "grad_norm": 0.88104248046875, - "learning_rate": 7.953102464519049e-05, - "loss": 0.0714, - "step": 60110 - }, - { - "epoch": 3.9332679097154073, - "grad_norm": 0.8251603245735168, - "learning_rate": 7.952361155475228e-05, - "loss": 0.0693, - "step": 60120 - }, - { - "epoch": 3.933922145894668, - "grad_norm": 0.8818602561950684, - "learning_rate": 7.951619746781474e-05, - "loss": 0.0675, - "step": 60130 - }, - { - "epoch": 3.934576382073929, - "grad_norm": 0.7289047837257385, - "learning_rate": 7.950878238462812e-05, - "loss": 0.0592, - "step": 60140 - }, - { - "epoch": 3.9352306182531893, - "grad_norm": 1.0739281177520752, - "learning_rate": 7.950136630544272e-05, - "loss": 0.0726, - "step": 60150 - }, - { - "epoch": 3.93588485443245, - "grad_norm": 0.9551029801368713, - "learning_rate": 7.949394923050882e-05, - "loss": 0.0606, - "step": 60160 - }, - { - "epoch": 3.936539090611711, - "grad_norm": 0.7134016752243042, - "learning_rate": 7.94865311600768e-05, - "loss": 0.0611, - "step": 60170 - }, - { - "epoch": 3.937193326790972, - "grad_norm": 0.8816996216773987, - "learning_rate": 7.9479112094397e-05, - "loss": 0.0675, - "step": 60180 - }, - { - "epoch": 3.9378475629702323, - "grad_norm": 1.0640590190887451, - "learning_rate": 7.947169203371986e-05, - "loss": 0.0742, - "step": 60190 - }, - { - "epoch": 3.938501799149493, - "grad_norm": 0.8332981467247009, - "learning_rate": 7.946427097829584e-05, - "loss": 0.0756, - "step": 60200 - }, - { - "epoch": 3.939156035328754, - "grad_norm": 0.9748693704605103, - "learning_rate": 7.94568489283754e-05, - "loss": 0.066, - "step": 60210 - }, - { - "epoch": 3.9398102715080143, - "grad_norm": 0.9146521687507629, - "learning_rate": 7.944942588420903e-05, - "loss": 0.0652, - "step": 60220 - }, - { - "epoch": 3.940464507687275, - "grad_norm": 0.869734525680542, - "learning_rate": 7.944200184604732e-05, - "loss": 0.0743, - "step": 60230 - }, - { - "epoch": 3.941118743866536, - "grad_norm": 1.1516258716583252, - "learning_rate": 7.943457681414084e-05, - "loss": 0.0722, - "step": 60240 - }, - { - "epoch": 3.941772980045797, - "grad_norm": 0.8145464658737183, - "learning_rate": 7.942715078874019e-05, - "loss": 0.0613, - "step": 60250 - }, - { - "epoch": 3.9424272162250573, - "grad_norm": 1.0588626861572266, - "learning_rate": 7.941972377009601e-05, - "loss": 0.0701, - "step": 60260 - }, - { - "epoch": 3.943081452404318, - "grad_norm": 0.8844826221466064, - "learning_rate": 7.941229575845903e-05, - "loss": 0.0676, - "step": 60270 - }, - { - "epoch": 3.943735688583579, - "grad_norm": 0.7616518139839172, - "learning_rate": 7.94048667540799e-05, - "loss": 0.0708, - "step": 60280 - }, - { - "epoch": 3.9443899247628393, - "grad_norm": 1.0263601541519165, - "learning_rate": 7.939743675720942e-05, - "loss": 0.0709, - "step": 60290 - }, - { - "epoch": 3.9450441609421, - "grad_norm": 0.725226640701294, - "learning_rate": 7.939000576809834e-05, - "loss": 0.0628, - "step": 60300 - }, - { - "epoch": 3.945698397121361, - "grad_norm": 0.8320260047912598, - "learning_rate": 7.93825737869975e-05, - "loss": 0.0657, - "step": 60310 - }, - { - "epoch": 3.9463526333006214, - "grad_norm": 0.9669265747070312, - "learning_rate": 7.937514081415773e-05, - "loss": 0.0732, - "step": 60320 - }, - { - "epoch": 3.9470068694798823, - "grad_norm": 1.055615782737732, - "learning_rate": 7.936770684982992e-05, - "loss": 0.0672, - "step": 60330 - }, - { - "epoch": 3.947661105659143, - "grad_norm": 0.8739088177680969, - "learning_rate": 7.936027189426497e-05, - "loss": 0.069, - "step": 60340 - }, - { - "epoch": 3.948315341838404, - "grad_norm": 1.0369230508804321, - "learning_rate": 7.935283594771385e-05, - "loss": 0.0676, - "step": 60350 - }, - { - "epoch": 3.9489695780176644, - "grad_norm": 0.9298377633094788, - "learning_rate": 7.934539901042754e-05, - "loss": 0.0865, - "step": 60360 - }, - { - "epoch": 3.949623814196925, - "grad_norm": 1.1504807472229004, - "learning_rate": 7.933796108265705e-05, - "loss": 0.06, - "step": 60370 - }, - { - "epoch": 3.950278050376186, - "grad_norm": 0.8967733383178711, - "learning_rate": 7.933052216465345e-05, - "loss": 0.0642, - "step": 60380 - }, - { - "epoch": 3.9509322865554464, - "grad_norm": 0.9686173796653748, - "learning_rate": 7.932308225666779e-05, - "loss": 0.071, - "step": 60390 - }, - { - "epoch": 3.9515865227347073, - "grad_norm": 1.0625016689300537, - "learning_rate": 7.93156413589512e-05, - "loss": 0.065, - "step": 60400 - }, - { - "epoch": 3.952240758913968, - "grad_norm": 0.830640971660614, - "learning_rate": 7.930819947175484e-05, - "loss": 0.0665, - "step": 60410 - }, - { - "epoch": 3.952894995093229, - "grad_norm": 0.9542146921157837, - "learning_rate": 7.930075659532987e-05, - "loss": 0.0593, - "step": 60420 - }, - { - "epoch": 3.9535492312724894, - "grad_norm": 0.8056615591049194, - "learning_rate": 7.929331272992753e-05, - "loss": 0.0647, - "step": 60430 - }, - { - "epoch": 3.95420346745175, - "grad_norm": 0.9513580203056335, - "learning_rate": 7.928586787579904e-05, - "loss": 0.0679, - "step": 60440 - }, - { - "epoch": 3.954857703631011, - "grad_norm": 0.8633571267127991, - "learning_rate": 7.927842203319573e-05, - "loss": 0.0692, - "step": 60450 - }, - { - "epoch": 3.9555119398102714, - "grad_norm": 1.0166418552398682, - "learning_rate": 7.927097520236888e-05, - "loss": 0.0779, - "step": 60460 - }, - { - "epoch": 3.9561661759895324, - "grad_norm": 0.9727340340614319, - "learning_rate": 7.926352738356986e-05, - "loss": 0.0689, - "step": 60470 - }, - { - "epoch": 3.956820412168793, - "grad_norm": 1.02824866771698, - "learning_rate": 7.925607857705003e-05, - "loss": 0.0735, - "step": 60480 - }, - { - "epoch": 3.957474648348054, - "grad_norm": 0.790596604347229, - "learning_rate": 7.924862878306083e-05, - "loss": 0.0734, - "step": 60490 - }, - { - "epoch": 3.9581288845273144, - "grad_norm": 1.0034263134002686, - "learning_rate": 7.924117800185372e-05, - "loss": 0.0716, - "step": 60500 - }, - { - "epoch": 3.958783120706575, - "grad_norm": 0.8736487627029419, - "learning_rate": 7.923372623368013e-05, - "loss": 0.0647, - "step": 60510 - }, - { - "epoch": 3.959437356885836, - "grad_norm": 0.8763427734375, - "learning_rate": 7.922627347879162e-05, - "loss": 0.0682, - "step": 60520 - }, - { - "epoch": 3.9600915930650964, - "grad_norm": 0.7622628808021545, - "learning_rate": 7.921881973743974e-05, - "loss": 0.0625, - "step": 60530 - }, - { - "epoch": 3.9607458292443574, - "grad_norm": 0.9626938104629517, - "learning_rate": 7.921136500987607e-05, - "loss": 0.0804, - "step": 60540 - }, - { - "epoch": 3.961400065423618, - "grad_norm": 0.8307819962501526, - "learning_rate": 7.920390929635221e-05, - "loss": 0.0618, - "step": 60550 - }, - { - "epoch": 3.962054301602879, - "grad_norm": 1.078370213508606, - "learning_rate": 7.919645259711982e-05, - "loss": 0.0683, - "step": 60560 - }, - { - "epoch": 3.9627085377821394, - "grad_norm": 0.7358745336532593, - "learning_rate": 7.918899491243059e-05, - "loss": 0.0674, - "step": 60570 - }, - { - "epoch": 3.9633627739614, - "grad_norm": 0.9152573347091675, - "learning_rate": 7.918153624253624e-05, - "loss": 0.0735, - "step": 60580 - }, - { - "epoch": 3.964017010140661, - "grad_norm": 1.0013970136642456, - "learning_rate": 7.91740765876885e-05, - "loss": 0.0639, - "step": 60590 - }, - { - "epoch": 3.9646712463199214, - "grad_norm": 0.7813799381256104, - "learning_rate": 7.916661594813915e-05, - "loss": 0.0686, - "step": 60600 - }, - { - "epoch": 3.965325482499182, - "grad_norm": 1.0708343982696533, - "learning_rate": 7.915915432414005e-05, - "loss": 0.0627, - "step": 60610 - }, - { - "epoch": 3.965979718678443, - "grad_norm": 0.8936020731925964, - "learning_rate": 7.915169171594299e-05, - "loss": 0.0745, - "step": 60620 - }, - { - "epoch": 3.966633954857704, - "grad_norm": 1.0120549201965332, - "learning_rate": 7.914422812379989e-05, - "loss": 0.0654, - "step": 60630 - }, - { - "epoch": 3.9672881910369644, - "grad_norm": 0.7676621079444885, - "learning_rate": 7.913676354796267e-05, - "loss": 0.0603, - "step": 60640 - }, - { - "epoch": 3.967942427216225, - "grad_norm": 0.7795009613037109, - "learning_rate": 7.912929798868324e-05, - "loss": 0.0684, - "step": 60650 - }, - { - "epoch": 3.968596663395486, - "grad_norm": 0.9069650769233704, - "learning_rate": 7.912183144621364e-05, - "loss": 0.0792, - "step": 60660 - }, - { - "epoch": 3.9692508995747464, - "grad_norm": 0.9909687638282776, - "learning_rate": 7.911436392080585e-05, - "loss": 0.0599, - "step": 60670 - }, - { - "epoch": 3.969905135754007, - "grad_norm": 0.9287083745002747, - "learning_rate": 7.91068954127119e-05, - "loss": 0.066, - "step": 60680 - }, - { - "epoch": 3.970559371933268, - "grad_norm": 0.9105084538459778, - "learning_rate": 7.909942592218391e-05, - "loss": 0.069, - "step": 60690 - }, - { - "epoch": 3.971213608112529, - "grad_norm": 0.8097362518310547, - "learning_rate": 7.909195544947398e-05, - "loss": 0.0583, - "step": 60700 - }, - { - "epoch": 3.9718678442917894, - "grad_norm": 0.8813385367393494, - "learning_rate": 7.908448399483423e-05, - "loss": 0.0726, - "step": 60710 - }, - { - "epoch": 3.97252208047105, - "grad_norm": 0.7956545352935791, - "learning_rate": 7.907701155851691e-05, - "loss": 0.0611, - "step": 60720 - }, - { - "epoch": 3.973176316650311, - "grad_norm": 0.8885143995285034, - "learning_rate": 7.906953814077417e-05, - "loss": 0.0635, - "step": 60730 - }, - { - "epoch": 3.9738305528295714, - "grad_norm": 1.125998854637146, - "learning_rate": 7.906206374185828e-05, - "loss": 0.0778, - "step": 60740 - }, - { - "epoch": 3.974484789008832, - "grad_norm": 0.6007951498031616, - "learning_rate": 7.905458836202153e-05, - "loss": 0.0584, - "step": 60750 - }, - { - "epoch": 3.975139025188093, - "grad_norm": 1.0695325136184692, - "learning_rate": 7.904711200151622e-05, - "loss": 0.0726, - "step": 60760 - }, - { - "epoch": 3.9757932613673534, - "grad_norm": 0.9608235359191895, - "learning_rate": 7.90396346605947e-05, - "loss": 0.068, - "step": 60770 - }, - { - "epoch": 3.9764474975466144, - "grad_norm": 0.7435832023620605, - "learning_rate": 7.903215633950934e-05, - "loss": 0.076, - "step": 60780 - }, - { - "epoch": 3.977101733725875, - "grad_norm": 0.7819331288337708, - "learning_rate": 7.902467703851258e-05, - "loss": 0.0699, - "step": 60790 - }, - { - "epoch": 3.977755969905136, - "grad_norm": 0.7933452129364014, - "learning_rate": 7.901719675785685e-05, - "loss": 0.0679, - "step": 60800 - }, - { - "epoch": 3.9784102060843964, - "grad_norm": 0.8844022154808044, - "learning_rate": 7.900971549779461e-05, - "loss": 0.0693, - "step": 60810 - }, - { - "epoch": 3.979064442263657, - "grad_norm": 1.0027797222137451, - "learning_rate": 7.90022332585784e-05, - "loss": 0.0745, - "step": 60820 - }, - { - "epoch": 3.979718678442918, - "grad_norm": 0.8335436582565308, - "learning_rate": 7.899475004046078e-05, - "loss": 0.062, - "step": 60830 - }, - { - "epoch": 3.9803729146221785, - "grad_norm": 1.0376518964767456, - "learning_rate": 7.898726584369427e-05, - "loss": 0.0655, - "step": 60840 - }, - { - "epoch": 3.9810271508014394, - "grad_norm": 0.9191878437995911, - "learning_rate": 7.897978066853155e-05, - "loss": 0.075, - "step": 60850 - }, - { - "epoch": 3.9816813869807, - "grad_norm": 0.7158858180046082, - "learning_rate": 7.897229451522521e-05, - "loss": 0.0654, - "step": 60860 - }, - { - "epoch": 3.982335623159961, - "grad_norm": 0.9076741337776184, - "learning_rate": 7.896480738402795e-05, - "loss": 0.067, - "step": 60870 - }, - { - "epoch": 3.9829898593392215, - "grad_norm": 0.9363234043121338, - "learning_rate": 7.895731927519248e-05, - "loss": 0.0638, - "step": 60880 - }, - { - "epoch": 3.983644095518482, - "grad_norm": 0.9065988659858704, - "learning_rate": 7.894983018897153e-05, - "loss": 0.0649, - "step": 60890 - }, - { - "epoch": 3.984298331697743, - "grad_norm": 0.8118206858634949, - "learning_rate": 7.89423401256179e-05, - "loss": 0.074, - "step": 60900 - }, - { - "epoch": 3.9849525678770035, - "grad_norm": 1.0652772188186646, - "learning_rate": 7.893484908538437e-05, - "loss": 0.0632, - "step": 60910 - }, - { - "epoch": 3.9856068040562644, - "grad_norm": 0.8931100964546204, - "learning_rate": 7.892735706852381e-05, - "loss": 0.0689, - "step": 60920 - }, - { - "epoch": 3.986261040235525, - "grad_norm": 0.8381471633911133, - "learning_rate": 7.891986407528908e-05, - "loss": 0.0657, - "step": 60930 - }, - { - "epoch": 3.986915276414786, - "grad_norm": 1.1571853160858154, - "learning_rate": 7.89123701059331e-05, - "loss": 0.0668, - "step": 60940 - }, - { - "epoch": 3.9875695125940465, - "grad_norm": 0.904621422290802, - "learning_rate": 7.890487516070881e-05, - "loss": 0.0755, - "step": 60950 - }, - { - "epoch": 3.988223748773307, - "grad_norm": 0.9490391612052917, - "learning_rate": 7.889737923986918e-05, - "loss": 0.0705, - "step": 60960 - }, - { - "epoch": 3.988877984952568, - "grad_norm": 1.0775212049484253, - "learning_rate": 7.888988234366719e-05, - "loss": 0.0643, - "step": 60970 - }, - { - "epoch": 3.9895322211318285, - "grad_norm": 0.9176508784294128, - "learning_rate": 7.888238447235592e-05, - "loss": 0.0724, - "step": 60980 - }, - { - "epoch": 3.9901864573110895, - "grad_norm": 1.2155365943908691, - "learning_rate": 7.887488562618844e-05, - "loss": 0.0776, - "step": 60990 - }, - { - "epoch": 3.99084069349035, - "grad_norm": 1.0456100702285767, - "learning_rate": 7.886738580541782e-05, - "loss": 0.065, - "step": 61000 - }, - { - "epoch": 3.991494929669611, - "grad_norm": 0.962174117565155, - "learning_rate": 7.885988501029724e-05, - "loss": 0.0642, - "step": 61010 - }, - { - "epoch": 3.9921491658488715, - "grad_norm": 0.8093088269233704, - "learning_rate": 7.885238324107982e-05, - "loss": 0.0724, - "step": 61020 - }, - { - "epoch": 3.992803402028132, - "grad_norm": 0.9736089706420898, - "learning_rate": 7.884488049801882e-05, - "loss": 0.0679, - "step": 61030 - }, - { - "epoch": 3.993457638207393, - "grad_norm": 0.9136343002319336, - "learning_rate": 7.883737678136746e-05, - "loss": 0.0578, - "step": 61040 - }, - { - "epoch": 3.9941118743866535, - "grad_norm": 0.998466968536377, - "learning_rate": 7.8829872091379e-05, - "loss": 0.0724, - "step": 61050 - }, - { - "epoch": 3.994766110565914, - "grad_norm": 0.8486067652702332, - "learning_rate": 7.882236642830675e-05, - "loss": 0.0747, - "step": 61060 - }, - { - "epoch": 3.995420346745175, - "grad_norm": 0.8077092170715332, - "learning_rate": 7.881485979240404e-05, - "loss": 0.0668, - "step": 61070 - }, - { - "epoch": 3.996074582924436, - "grad_norm": 1.0443347692489624, - "learning_rate": 7.880735218392423e-05, - "loss": 0.068, - "step": 61080 - }, - { - "epoch": 3.9967288191036965, - "grad_norm": 0.664110541343689, - "learning_rate": 7.879984360312077e-05, - "loss": 0.0608, - "step": 61090 - }, - { - "epoch": 3.997383055282957, - "grad_norm": 0.8030071258544922, - "learning_rate": 7.879233405024702e-05, - "loss": 0.0696, - "step": 61100 - }, - { - "epoch": 3.998037291462218, - "grad_norm": 0.9400510191917419, - "learning_rate": 7.87848235255565e-05, - "loss": 0.0756, - "step": 61110 - }, - { - "epoch": 3.9986915276414785, - "grad_norm": 0.8629300594329834, - "learning_rate": 7.87773120293027e-05, - "loss": 0.0779, - "step": 61120 - }, - { - "epoch": 3.999345763820739, - "grad_norm": 0.8903558850288391, - "learning_rate": 7.876979956173914e-05, - "loss": 0.0825, - "step": 61130 - }, - { - "epoch": 4.0, - "grad_norm": 1.0015478134155273, - "learning_rate": 7.87622861231194e-05, - "loss": 0.0693, - "step": 61140 - }, - { - "epoch": 4.000654236179261, - "grad_norm": 0.8407610654830933, - "learning_rate": 7.875477171369707e-05, - "loss": 0.0606, - "step": 61150 - }, - { - "epoch": 4.001308472358521, - "grad_norm": 1.2632551193237305, - "learning_rate": 7.874725633372577e-05, - "loss": 0.0688, - "step": 61160 - }, - { - "epoch": 4.001962708537782, - "grad_norm": 0.812816858291626, - "learning_rate": 7.87397399834592e-05, - "loss": 0.067, - "step": 61170 - }, - { - "epoch": 4.002616944717043, - "grad_norm": 0.790305495262146, - "learning_rate": 7.873222266315101e-05, - "loss": 0.0604, - "step": 61180 - }, - { - "epoch": 4.003271180896304, - "grad_norm": 0.8392609357833862, - "learning_rate": 7.872470437305496e-05, - "loss": 0.0706, - "step": 61190 - }, - { - "epoch": 4.003925417075564, - "grad_norm": 0.8045458793640137, - "learning_rate": 7.87171851134248e-05, - "loss": 0.0696, - "step": 61200 - }, - { - "epoch": 4.004579653254825, - "grad_norm": 0.9475613832473755, - "learning_rate": 7.870966488451434e-05, - "loss": 0.0672, - "step": 61210 - }, - { - "epoch": 4.005233889434086, - "grad_norm": 0.8142966032028198, - "learning_rate": 7.87021436865774e-05, - "loss": 0.0612, - "step": 61220 - }, - { - "epoch": 4.005888125613346, - "grad_norm": 0.8628994822502136, - "learning_rate": 7.869462151986781e-05, - "loss": 0.0615, - "step": 61230 - }, - { - "epoch": 4.006542361792607, - "grad_norm": 0.8791362643241882, - "learning_rate": 7.868709838463952e-05, - "loss": 0.0711, - "step": 61240 - }, - { - "epoch": 4.007196597971868, - "grad_norm": 1.1582494974136353, - "learning_rate": 7.867957428114641e-05, - "loss": 0.0688, - "step": 61250 - }, - { - "epoch": 4.007850834151129, - "grad_norm": 0.9910046458244324, - "learning_rate": 7.867204920964245e-05, - "loss": 0.0659, - "step": 61260 - }, - { - "epoch": 4.008505070330389, - "grad_norm": 0.832694411277771, - "learning_rate": 7.866452317038164e-05, - "loss": 0.0706, - "step": 61270 - }, - { - "epoch": 4.00915930650965, - "grad_norm": 0.8842899203300476, - "learning_rate": 7.865699616361798e-05, - "loss": 0.0744, - "step": 61280 - }, - { - "epoch": 4.009813542688911, - "grad_norm": 0.9337363839149475, - "learning_rate": 7.864946818960557e-05, - "loss": 0.0715, - "step": 61290 - }, - { - "epoch": 4.010467778868171, - "grad_norm": 0.8401015400886536, - "learning_rate": 7.864193924859846e-05, - "loss": 0.0635, - "step": 61300 - }, - { - "epoch": 4.011122015047432, - "grad_norm": 0.9122945666313171, - "learning_rate": 7.86344093408508e-05, - "loss": 0.0663, - "step": 61310 - }, - { - "epoch": 4.011776251226693, - "grad_norm": 0.809889018535614, - "learning_rate": 7.862687846661671e-05, - "loss": 0.0704, - "step": 61320 - }, - { - "epoch": 4.012430487405954, - "grad_norm": 0.9183257818222046, - "learning_rate": 7.86193466261504e-05, - "loss": 0.0702, - "step": 61330 - }, - { - "epoch": 4.013084723585214, - "grad_norm": 0.874580979347229, - "learning_rate": 7.861181381970608e-05, - "loss": 0.072, - "step": 61340 - }, - { - "epoch": 4.013738959764475, - "grad_norm": 0.7762249708175659, - "learning_rate": 7.860428004753801e-05, - "loss": 0.0557, - "step": 61350 - }, - { - "epoch": 4.014393195943736, - "grad_norm": 0.7530931830406189, - "learning_rate": 7.859674530990047e-05, - "loss": 0.0666, - "step": 61360 - }, - { - "epoch": 4.015047432122996, - "grad_norm": 0.9463986158370972, - "learning_rate": 7.858920960704779e-05, - "loss": 0.0644, - "step": 61370 - }, - { - "epoch": 4.015701668302257, - "grad_norm": 0.828858494758606, - "learning_rate": 7.85816729392343e-05, - "loss": 0.0718, - "step": 61380 - }, - { - "epoch": 4.016355904481518, - "grad_norm": 0.7672846913337708, - "learning_rate": 7.857413530671438e-05, - "loss": 0.0615, - "step": 61390 - }, - { - "epoch": 4.017010140660778, - "grad_norm": 1.026138186454773, - "learning_rate": 7.856659670974246e-05, - "loss": 0.0762, - "step": 61400 - }, - { - "epoch": 4.017664376840039, - "grad_norm": 0.8137958645820618, - "learning_rate": 7.855905714857299e-05, - "loss": 0.0633, - "step": 61410 - }, - { - "epoch": 4.0183186130193, - "grad_norm": 0.9455887079238892, - "learning_rate": 7.855151662346043e-05, - "loss": 0.0701, - "step": 61420 - }, - { - "epoch": 4.018972849198561, - "grad_norm": 0.8994813561439514, - "learning_rate": 7.854397513465932e-05, - "loss": 0.0601, - "step": 61430 - }, - { - "epoch": 4.019627085377821, - "grad_norm": 0.7181370854377747, - "learning_rate": 7.853643268242417e-05, - "loss": 0.062, - "step": 61440 - }, - { - "epoch": 4.020281321557082, - "grad_norm": 0.7311094403266907, - "learning_rate": 7.852888926700959e-05, - "loss": 0.0645, - "step": 61450 - }, - { - "epoch": 4.020935557736343, - "grad_norm": 0.8224121928215027, - "learning_rate": 7.852134488867018e-05, - "loss": 0.0667, - "step": 61460 - }, - { - "epoch": 4.021589793915603, - "grad_norm": 1.1656960248947144, - "learning_rate": 7.851379954766058e-05, - "loss": 0.0815, - "step": 61470 - }, - { - "epoch": 4.022244030094864, - "grad_norm": 0.7743502259254456, - "learning_rate": 7.850625324423546e-05, - "loss": 0.0586, - "step": 61480 - }, - { - "epoch": 4.022898266274125, - "grad_norm": 0.8907047510147095, - "learning_rate": 7.849870597864953e-05, - "loss": 0.0665, - "step": 61490 - }, - { - "epoch": 4.023552502453386, - "grad_norm": 1.395200490951538, - "learning_rate": 7.849115775115755e-05, - "loss": 0.0752, - "step": 61500 - }, - { - "epoch": 4.024206738632646, - "grad_norm": 0.8665695190429688, - "learning_rate": 7.848360856201425e-05, - "loss": 0.066, - "step": 61510 - }, - { - "epoch": 4.024860974811907, - "grad_norm": 1.0678719282150269, - "learning_rate": 7.847605841147447e-05, - "loss": 0.0763, - "step": 61520 - }, - { - "epoch": 4.025515210991168, - "grad_norm": 0.961864173412323, - "learning_rate": 7.846850729979304e-05, - "loss": 0.0756, - "step": 61530 - }, - { - "epoch": 4.026169447170428, - "grad_norm": 0.9026308655738831, - "learning_rate": 7.846095522722482e-05, - "loss": 0.0656, - "step": 61540 - }, - { - "epoch": 4.026823683349689, - "grad_norm": 0.8607982397079468, - "learning_rate": 7.845340219402472e-05, - "loss": 0.0614, - "step": 61550 - }, - { - "epoch": 4.02747791952895, - "grad_norm": 0.8916060924530029, - "learning_rate": 7.844584820044769e-05, - "loss": 0.0714, - "step": 61560 - }, - { - "epoch": 4.028132155708211, - "grad_norm": 0.7897719144821167, - "learning_rate": 7.843829324674867e-05, - "loss": 0.0649, - "step": 61570 - }, - { - "epoch": 4.028786391887471, - "grad_norm": 0.8527302742004395, - "learning_rate": 7.843073733318268e-05, - "loss": 0.0782, - "step": 61580 - }, - { - "epoch": 4.029440628066732, - "grad_norm": 0.8170759677886963, - "learning_rate": 7.842318046000475e-05, - "loss": 0.0603, - "step": 61590 - }, - { - "epoch": 4.030094864245993, - "grad_norm": 0.9056137204170227, - "learning_rate": 7.841562262746991e-05, - "loss": 0.064, - "step": 61600 - }, - { - "epoch": 4.030749100425253, - "grad_norm": 0.9502385258674622, - "learning_rate": 7.84080638358333e-05, - "loss": 0.0672, - "step": 61610 - }, - { - "epoch": 4.031403336604514, - "grad_norm": 0.964516818523407, - "learning_rate": 7.840050408535002e-05, - "loss": 0.0765, - "step": 61620 - }, - { - "epoch": 4.032057572783775, - "grad_norm": 1.0085734128952026, - "learning_rate": 7.839294337627525e-05, - "loss": 0.0691, - "step": 61630 - }, - { - "epoch": 4.032711808963036, - "grad_norm": 0.9676178097724915, - "learning_rate": 7.838538170886419e-05, - "loss": 0.0619, - "step": 61640 - }, - { - "epoch": 4.033366045142296, - "grad_norm": 1.0016093254089355, - "learning_rate": 7.837781908337204e-05, - "loss": 0.0653, - "step": 61650 - }, - { - "epoch": 4.034020281321557, - "grad_norm": 0.9340269565582275, - "learning_rate": 7.837025550005408e-05, - "loss": 0.0682, - "step": 61660 - }, - { - "epoch": 4.034674517500818, - "grad_norm": 0.9540444612503052, - "learning_rate": 7.836269095916557e-05, - "loss": 0.0739, - "step": 61670 - }, - { - "epoch": 4.035328753680078, - "grad_norm": 0.7741513848304749, - "learning_rate": 7.835512546096188e-05, - "loss": 0.0704, - "step": 61680 - }, - { - "epoch": 4.035982989859339, - "grad_norm": 0.8669870495796204, - "learning_rate": 7.834755900569834e-05, - "loss": 0.0625, - "step": 61690 - }, - { - "epoch": 4.0366372260386, - "grad_norm": 0.8938657641410828, - "learning_rate": 7.833999159363035e-05, - "loss": 0.0677, - "step": 61700 - }, - { - "epoch": 4.037291462217861, - "grad_norm": 0.7298368215560913, - "learning_rate": 7.83324232250133e-05, - "loss": 0.0606, - "step": 61710 - }, - { - "epoch": 4.037945698397121, - "grad_norm": 0.8773128986358643, - "learning_rate": 7.832485390010266e-05, - "loss": 0.0631, - "step": 61720 - }, - { - "epoch": 4.038599934576382, - "grad_norm": 0.8813597559928894, - "learning_rate": 7.831728361915394e-05, - "loss": 0.0735, - "step": 61730 - }, - { - "epoch": 4.039254170755643, - "grad_norm": 0.7509139776229858, - "learning_rate": 7.830971238242261e-05, - "loss": 0.0666, - "step": 61740 - }, - { - "epoch": 4.039908406934903, - "grad_norm": 0.8333325386047363, - "learning_rate": 7.830214019016426e-05, - "loss": 0.0649, - "step": 61750 - }, - { - "epoch": 4.040562643114164, - "grad_norm": 0.9280720353126526, - "learning_rate": 7.829456704263442e-05, - "loss": 0.06, - "step": 61760 - }, - { - "epoch": 4.041216879293425, - "grad_norm": 0.6866008639335632, - "learning_rate": 7.828699294008877e-05, - "loss": 0.066, - "step": 61770 - }, - { - "epoch": 4.041871115472686, - "grad_norm": 0.8785973787307739, - "learning_rate": 7.827941788278292e-05, - "loss": 0.0582, - "step": 61780 - }, - { - "epoch": 4.042525351651946, - "grad_norm": 0.9217095971107483, - "learning_rate": 7.827184187097253e-05, - "loss": 0.0737, - "step": 61790 - }, - { - "epoch": 4.043179587831207, - "grad_norm": 0.7473239302635193, - "learning_rate": 7.826426490491335e-05, - "loss": 0.0639, - "step": 61800 - }, - { - "epoch": 4.043833824010468, - "grad_norm": 0.8309671878814697, - "learning_rate": 7.82566869848611e-05, - "loss": 0.0633, - "step": 61810 - }, - { - "epoch": 4.044488060189728, - "grad_norm": 1.0445795059204102, - "learning_rate": 7.824910811107156e-05, - "loss": 0.0683, - "step": 61820 - }, - { - "epoch": 4.045142296368989, - "grad_norm": 0.8129024505615234, - "learning_rate": 7.824152828380053e-05, - "loss": 0.0628, - "step": 61830 - }, - { - "epoch": 4.04579653254825, - "grad_norm": 0.8035739660263062, - "learning_rate": 7.823394750330387e-05, - "loss": 0.0664, - "step": 61840 - }, - { - "epoch": 4.04645076872751, - "grad_norm": 1.1228084564208984, - "learning_rate": 7.822636576983741e-05, - "loss": 0.0663, - "step": 61850 - }, - { - "epoch": 4.047105004906771, - "grad_norm": 0.9300438761711121, - "learning_rate": 7.821878308365708e-05, - "loss": 0.0679, - "step": 61860 - }, - { - "epoch": 4.047759241086032, - "grad_norm": 0.8193831443786621, - "learning_rate": 7.821119944501885e-05, - "loss": 0.063, - "step": 61870 - }, - { - "epoch": 4.048413477265293, - "grad_norm": 0.8168728351593018, - "learning_rate": 7.820361485417862e-05, - "loss": 0.0709, - "step": 61880 - }, - { - "epoch": 4.049067713444553, - "grad_norm": 1.0577526092529297, - "learning_rate": 7.819602931139243e-05, - "loss": 0.0641, - "step": 61890 - }, - { - "epoch": 4.049721949623814, - "grad_norm": 0.8535493612289429, - "learning_rate": 7.81884428169163e-05, - "loss": 0.057, - "step": 61900 - }, - { - "epoch": 4.050376185803075, - "grad_norm": 0.8765014410018921, - "learning_rate": 7.81808553710063e-05, - "loss": 0.0662, - "step": 61910 - }, - { - "epoch": 4.051030421982335, - "grad_norm": 0.9272754788398743, - "learning_rate": 7.817326697391853e-05, - "loss": 0.0657, - "step": 61920 - }, - { - "epoch": 4.051684658161596, - "grad_norm": 0.7854313254356384, - "learning_rate": 7.81656776259091e-05, - "loss": 0.068, - "step": 61930 - }, - { - "epoch": 4.052338894340857, - "grad_norm": 0.753021776676178, - "learning_rate": 7.81580873272342e-05, - "loss": 0.0648, - "step": 61940 - }, - { - "epoch": 4.052993130520118, - "grad_norm": 0.939609944820404, - "learning_rate": 7.815049607815e-05, - "loss": 0.0682, - "step": 61950 - }, - { - "epoch": 4.053647366699378, - "grad_norm": 0.9001626372337341, - "learning_rate": 7.814290387891271e-05, - "loss": 0.0608, - "step": 61960 - }, - { - "epoch": 4.054301602878639, - "grad_norm": 0.9824931621551514, - "learning_rate": 7.813531072977863e-05, - "loss": 0.069, - "step": 61970 - }, - { - "epoch": 4.0549558390579, - "grad_norm": 0.9642009139060974, - "learning_rate": 7.812771663100402e-05, - "loss": 0.0602, - "step": 61980 - }, - { - "epoch": 4.05561007523716, - "grad_norm": 0.7149823307991028, - "learning_rate": 7.812012158284521e-05, - "loss": 0.0667, - "step": 61990 - }, - { - "epoch": 4.056264311416421, - "grad_norm": 0.8310834765434265, - "learning_rate": 7.811252558555854e-05, - "loss": 0.0746, - "step": 62000 - }, - { - "epoch": 4.056918547595682, - "grad_norm": 0.7982217073440552, - "learning_rate": 7.810492863940041e-05, - "loss": 0.0852, - "step": 62010 - }, - { - "epoch": 4.057572783774943, - "grad_norm": 0.9088074564933777, - "learning_rate": 7.809733074462722e-05, - "loss": 0.064, - "step": 62020 - }, - { - "epoch": 4.058227019954203, - "grad_norm": 0.8191305994987488, - "learning_rate": 7.808973190149544e-05, - "loss": 0.0684, - "step": 62030 - }, - { - "epoch": 4.058881256133464, - "grad_norm": 0.8682653307914734, - "learning_rate": 7.808213211026153e-05, - "loss": 0.0623, - "step": 62040 - }, - { - "epoch": 4.059535492312725, - "grad_norm": 1.0228744745254517, - "learning_rate": 7.807453137118204e-05, - "loss": 0.0726, - "step": 62050 - }, - { - "epoch": 4.060189728491985, - "grad_norm": 0.8825092315673828, - "learning_rate": 7.806692968451346e-05, - "loss": 0.0669, - "step": 62060 - }, - { - "epoch": 4.060843964671246, - "grad_norm": 0.8798748254776001, - "learning_rate": 7.80593270505124e-05, - "loss": 0.0669, - "step": 62070 - }, - { - "epoch": 4.061498200850507, - "grad_norm": 0.8253244161605835, - "learning_rate": 7.805172346943547e-05, - "loss": 0.058, - "step": 62080 - }, - { - "epoch": 4.062152437029768, - "grad_norm": 0.9651762247085571, - "learning_rate": 7.804411894153932e-05, - "loss": 0.0709, - "step": 62090 - }, - { - "epoch": 4.062806673209028, - "grad_norm": 1.1336904764175415, - "learning_rate": 7.803651346708056e-05, - "loss": 0.072, - "step": 62100 - }, - { - "epoch": 4.063460909388289, - "grad_norm": 1.0221117734909058, - "learning_rate": 7.802890704631598e-05, - "loss": 0.0734, - "step": 62110 - }, - { - "epoch": 4.06411514556755, - "grad_norm": 0.8365046381950378, - "learning_rate": 7.802129967950227e-05, - "loss": 0.0662, - "step": 62120 - }, - { - "epoch": 4.06476938174681, - "grad_norm": 0.8922891020774841, - "learning_rate": 7.801369136689621e-05, - "loss": 0.065, - "step": 62130 - }, - { - "epoch": 4.065423617926071, - "grad_norm": 0.689914345741272, - "learning_rate": 7.80060821087546e-05, - "loss": 0.0674, - "step": 62140 - }, - { - "epoch": 4.066077854105332, - "grad_norm": 0.8463851809501648, - "learning_rate": 7.799847190533428e-05, - "loss": 0.0628, - "step": 62150 - }, - { - "epoch": 4.066732090284593, - "grad_norm": 0.7686126232147217, - "learning_rate": 7.799086075689208e-05, - "loss": 0.0665, - "step": 62160 - }, - { - "epoch": 4.067386326463853, - "grad_norm": 0.9086697697639465, - "learning_rate": 7.798324866368493e-05, - "loss": 0.0655, - "step": 62170 - }, - { - "epoch": 4.068040562643114, - "grad_norm": 0.8965178728103638, - "learning_rate": 7.797563562596974e-05, - "loss": 0.0722, - "step": 62180 - }, - { - "epoch": 4.068694798822375, - "grad_norm": 0.8882538676261902, - "learning_rate": 7.796802164400348e-05, - "loss": 0.0615, - "step": 62190 - }, - { - "epoch": 4.069349035001635, - "grad_norm": 0.8429491519927979, - "learning_rate": 7.796040671804316e-05, - "loss": 0.0628, - "step": 62200 - }, - { - "epoch": 4.070003271180896, - "grad_norm": 0.8130377531051636, - "learning_rate": 7.795279084834577e-05, - "loss": 0.0657, - "step": 62210 - }, - { - "epoch": 4.070657507360157, - "grad_norm": 0.9137871861457825, - "learning_rate": 7.794517403516838e-05, - "loss": 0.0606, - "step": 62220 - }, - { - "epoch": 4.071311743539418, - "grad_norm": 0.8658760190010071, - "learning_rate": 7.793755627876808e-05, - "loss": 0.0676, - "step": 62230 - }, - { - "epoch": 4.071965979718678, - "grad_norm": 0.9319032430648804, - "learning_rate": 7.7929937579402e-05, - "loss": 0.0693, - "step": 62240 - }, - { - "epoch": 4.072620215897939, - "grad_norm": 0.8796036839485168, - "learning_rate": 7.792231793732727e-05, - "loss": 0.0669, - "step": 62250 - }, - { - "epoch": 4.0732744520772, - "grad_norm": 0.7122048139572144, - "learning_rate": 7.791469735280106e-05, - "loss": 0.0639, - "step": 62260 - }, - { - "epoch": 4.07392868825646, - "grad_norm": 1.0055348873138428, - "learning_rate": 7.790707582608063e-05, - "loss": 0.0808, - "step": 62270 - }, - { - "epoch": 4.074582924435721, - "grad_norm": 0.8951319456100464, - "learning_rate": 7.78994533574232e-05, - "loss": 0.0677, - "step": 62280 - }, - { - "epoch": 4.075237160614982, - "grad_norm": 0.9522911906242371, - "learning_rate": 7.789182994708604e-05, - "loss": 0.0545, - "step": 62290 - }, - { - "epoch": 4.075891396794242, - "grad_norm": 0.8506065607070923, - "learning_rate": 7.788420559532646e-05, - "loss": 0.0595, - "step": 62300 - }, - { - "epoch": 4.076545632973503, - "grad_norm": 0.959356427192688, - "learning_rate": 7.787658030240183e-05, - "loss": 0.0713, - "step": 62310 - }, - { - "epoch": 4.077199869152764, - "grad_norm": 0.7769831418991089, - "learning_rate": 7.786895406856952e-05, - "loss": 0.0627, - "step": 62320 - }, - { - "epoch": 4.077854105332025, - "grad_norm": 0.6790773868560791, - "learning_rate": 7.786132689408688e-05, - "loss": 0.068, - "step": 62330 - }, - { - "epoch": 4.078508341511285, - "grad_norm": 0.7900551557540894, - "learning_rate": 7.78536987792114e-05, - "loss": 0.0749, - "step": 62340 - }, - { - "epoch": 4.079162577690546, - "grad_norm": 0.8323625922203064, - "learning_rate": 7.784606972420056e-05, - "loss": 0.0585, - "step": 62350 - }, - { - "epoch": 4.079816813869807, - "grad_norm": 1.0454132556915283, - "learning_rate": 7.783843972931184e-05, - "loss": 0.0613, - "step": 62360 - }, - { - "epoch": 4.080471050049067, - "grad_norm": 0.9331693053245544, - "learning_rate": 7.783080879480274e-05, - "loss": 0.0635, - "step": 62370 - }, - { - "epoch": 4.081125286228328, - "grad_norm": 0.9045494198799133, - "learning_rate": 7.782317692093088e-05, - "loss": 0.0703, - "step": 62380 - }, - { - "epoch": 4.081779522407589, - "grad_norm": 0.9631790518760681, - "learning_rate": 7.781554410795381e-05, - "loss": 0.0675, - "step": 62390 - }, - { - "epoch": 4.08243375858685, - "grad_norm": 0.9105682373046875, - "learning_rate": 7.78079103561292e-05, - "loss": 0.0779, - "step": 62400 - }, - { - "epoch": 4.08308799476611, - "grad_norm": 0.8074235320091248, - "learning_rate": 7.780027566571465e-05, - "loss": 0.07, - "step": 62410 - }, - { - "epoch": 4.083742230945371, - "grad_norm": 0.8759629726409912, - "learning_rate": 7.779264003696794e-05, - "loss": 0.0637, - "step": 62420 - }, - { - "epoch": 4.084396467124632, - "grad_norm": 0.8032673001289368, - "learning_rate": 7.77850034701467e-05, - "loss": 0.064, - "step": 62430 - }, - { - "epoch": 4.085050703303892, - "grad_norm": 0.8850411176681519, - "learning_rate": 7.777736596550874e-05, - "loss": 0.0641, - "step": 62440 - }, - { - "epoch": 4.085704939483153, - "grad_norm": 1.1167175769805908, - "learning_rate": 7.776972752331182e-05, - "loss": 0.0689, - "step": 62450 - }, - { - "epoch": 4.086359175662414, - "grad_norm": 0.9858927726745605, - "learning_rate": 7.776208814381379e-05, - "loss": 0.0725, - "step": 62460 - }, - { - "epoch": 4.087013411841675, - "grad_norm": 0.9899501204490662, - "learning_rate": 7.775444782727245e-05, - "loss": 0.0684, - "step": 62470 - }, - { - "epoch": 4.087667648020935, - "grad_norm": 0.7681850790977478, - "learning_rate": 7.77468065739457e-05, - "loss": 0.0633, - "step": 62480 - }, - { - "epoch": 4.088321884200196, - "grad_norm": 0.6893104314804077, - "learning_rate": 7.773916438409149e-05, - "loss": 0.0572, - "step": 62490 - }, - { - "epoch": 4.088976120379457, - "grad_norm": 0.7790807485580444, - "learning_rate": 7.773152125796772e-05, - "loss": 0.0634, - "step": 62500 - }, - { - "epoch": 4.089630356558717, - "grad_norm": 1.0509587526321411, - "learning_rate": 7.772387719583238e-05, - "loss": 0.063, - "step": 62510 - }, - { - "epoch": 4.090284592737978, - "grad_norm": 1.0617860555648804, - "learning_rate": 7.771623219794346e-05, - "loss": 0.061, - "step": 62520 - }, - { - "epoch": 4.090938828917239, - "grad_norm": 0.7485182881355286, - "learning_rate": 7.770858626455903e-05, - "loss": 0.0698, - "step": 62530 - }, - { - "epoch": 4.0915930650965, - "grad_norm": 0.7513982653617859, - "learning_rate": 7.770093939593716e-05, - "loss": 0.0646, - "step": 62540 - }, - { - "epoch": 4.09224730127576, - "grad_norm": 0.9014798998832703, - "learning_rate": 7.769329159233592e-05, - "loss": 0.0747, - "step": 62550 - }, - { - "epoch": 4.092901537455021, - "grad_norm": 1.199467658996582, - "learning_rate": 7.768564285401346e-05, - "loss": 0.0664, - "step": 62560 - }, - { - "epoch": 4.093555773634282, - "grad_norm": 0.9420590996742249, - "learning_rate": 7.767799318122794e-05, - "loss": 0.0718, - "step": 62570 - }, - { - "epoch": 4.094210009813542, - "grad_norm": 0.9552091360092163, - "learning_rate": 7.767034257423758e-05, - "loss": 0.0804, - "step": 62580 - }, - { - "epoch": 4.094864245992803, - "grad_norm": 1.0671602487564087, - "learning_rate": 7.766269103330057e-05, - "loss": 0.0742, - "step": 62590 - }, - { - "epoch": 4.095518482172064, - "grad_norm": 0.961463451385498, - "learning_rate": 7.76550385586752e-05, - "loss": 0.0706, - "step": 62600 - }, - { - "epoch": 4.096172718351325, - "grad_norm": 0.858124315738678, - "learning_rate": 7.764738515061975e-05, - "loss": 0.057, - "step": 62610 - }, - { - "epoch": 4.096826954530585, - "grad_norm": 0.830001950263977, - "learning_rate": 7.763973080939254e-05, - "loss": 0.0554, - "step": 62620 - }, - { - "epoch": 4.097481190709846, - "grad_norm": 0.9393772482872009, - "learning_rate": 7.763207553525193e-05, - "loss": 0.0708, - "step": 62630 - }, - { - "epoch": 4.098135426889107, - "grad_norm": 0.8235999941825867, - "learning_rate": 7.76244193284563e-05, - "loss": 0.0615, - "step": 62640 - }, - { - "epoch": 4.098789663068367, - "grad_norm": 1.2535772323608398, - "learning_rate": 7.761676218926408e-05, - "loss": 0.0678, - "step": 62650 - }, - { - "epoch": 4.099443899247628, - "grad_norm": 0.9056881070137024, - "learning_rate": 7.76091041179337e-05, - "loss": 0.0637, - "step": 62660 - }, - { - "epoch": 4.100098135426889, - "grad_norm": 1.1135672330856323, - "learning_rate": 7.760144511472365e-05, - "loss": 0.0627, - "step": 62670 - }, - { - "epoch": 4.10075237160615, - "grad_norm": 1.0625379085540771, - "learning_rate": 7.759378517989245e-05, - "loss": 0.0658, - "step": 62680 - }, - { - "epoch": 4.10140660778541, - "grad_norm": 0.947235107421875, - "learning_rate": 7.75861243136986e-05, - "loss": 0.0638, - "step": 62690 - }, - { - "epoch": 4.102060843964671, - "grad_norm": 0.715565025806427, - "learning_rate": 7.757846251640074e-05, - "loss": 0.0647, - "step": 62700 - }, - { - "epoch": 4.102715080143932, - "grad_norm": 0.9149342775344849, - "learning_rate": 7.757079978825744e-05, - "loss": 0.0605, - "step": 62710 - }, - { - "epoch": 4.103369316323192, - "grad_norm": 0.8739985227584839, - "learning_rate": 7.756313612952733e-05, - "loss": 0.0592, - "step": 62720 - }, - { - "epoch": 4.104023552502453, - "grad_norm": 1.1399884223937988, - "learning_rate": 7.755547154046908e-05, - "loss": 0.0712, - "step": 62730 - }, - { - "epoch": 4.104677788681714, - "grad_norm": 0.9350805878639221, - "learning_rate": 7.754780602134142e-05, - "loss": 0.063, - "step": 62740 - }, - { - "epoch": 4.105332024860974, - "grad_norm": 0.8487550616264343, - "learning_rate": 7.754013957240305e-05, - "loss": 0.0741, - "step": 62750 - }, - { - "epoch": 4.105986261040235, - "grad_norm": 0.7867029905319214, - "learning_rate": 7.753247219391273e-05, - "loss": 0.0604, - "step": 62760 - }, - { - "epoch": 4.106640497219496, - "grad_norm": 0.9640946388244629, - "learning_rate": 7.752480388612928e-05, - "loss": 0.063, - "step": 62770 - }, - { - "epoch": 4.107294733398757, - "grad_norm": 1.0482327938079834, - "learning_rate": 7.751713464931151e-05, - "loss": 0.0709, - "step": 62780 - }, - { - "epoch": 4.107948969578017, - "grad_norm": 0.8405677080154419, - "learning_rate": 7.75094644837183e-05, - "loss": 0.0572, - "step": 62790 - }, - { - "epoch": 4.108603205757278, - "grad_norm": 0.8008997440338135, - "learning_rate": 7.750179338960849e-05, - "loss": 0.0732, - "step": 62800 - }, - { - "epoch": 4.109257441936539, - "grad_norm": 0.8923006057739258, - "learning_rate": 7.749412136724103e-05, - "loss": 0.0666, - "step": 62810 - }, - { - "epoch": 4.109911678115799, - "grad_norm": 1.0257177352905273, - "learning_rate": 7.748644841687486e-05, - "loss": 0.0722, - "step": 62820 - }, - { - "epoch": 4.11056591429506, - "grad_norm": 0.7566149234771729, - "learning_rate": 7.747877453876901e-05, - "loss": 0.0667, - "step": 62830 - }, - { - "epoch": 4.111220150474321, - "grad_norm": 0.9833534955978394, - "learning_rate": 7.747109973318242e-05, - "loss": 0.0588, - "step": 62840 - }, - { - "epoch": 4.111874386653582, - "grad_norm": 0.9134366512298584, - "learning_rate": 7.746342400037417e-05, - "loss": 0.0701, - "step": 62850 - }, - { - "epoch": 4.112528622832842, - "grad_norm": 0.8957915902137756, - "learning_rate": 7.745574734060335e-05, - "loss": 0.0588, - "step": 62860 - }, - { - "epoch": 4.113182859012103, - "grad_norm": 0.7879095077514648, - "learning_rate": 7.744806975412904e-05, - "loss": 0.0669, - "step": 62870 - }, - { - "epoch": 4.113837095191364, - "grad_norm": 0.9840167760848999, - "learning_rate": 7.744039124121039e-05, - "loss": 0.0612, - "step": 62880 - }, - { - "epoch": 4.114491331370624, - "grad_norm": 0.656774640083313, - "learning_rate": 7.743271180210657e-05, - "loss": 0.0671, - "step": 62890 - }, - { - "epoch": 4.115145567549885, - "grad_norm": 0.6617786884307861, - "learning_rate": 7.742503143707679e-05, - "loss": 0.0633, - "step": 62900 - }, - { - "epoch": 4.115799803729146, - "grad_norm": 0.9550051093101501, - "learning_rate": 7.741735014638027e-05, - "loss": 0.0704, - "step": 62910 - }, - { - "epoch": 4.116454039908407, - "grad_norm": 0.7846367359161377, - "learning_rate": 7.740966793027626e-05, - "loss": 0.0664, - "step": 62920 - }, - { - "epoch": 4.117108276087667, - "grad_norm": 1.068386435508728, - "learning_rate": 7.740198478902409e-05, - "loss": 0.068, - "step": 62930 - }, - { - "epoch": 4.117762512266928, - "grad_norm": 1.1259160041809082, - "learning_rate": 7.739430072288309e-05, - "loss": 0.0667, - "step": 62940 - }, - { - "epoch": 4.118416748446189, - "grad_norm": 1.1424773931503296, - "learning_rate": 7.738661573211256e-05, - "loss": 0.0688, - "step": 62950 - }, - { - "epoch": 4.119070984625449, - "grad_norm": 0.8278184533119202, - "learning_rate": 7.737892981697194e-05, - "loss": 0.0603, - "step": 62960 - }, - { - "epoch": 4.11972522080471, - "grad_norm": 0.7246772646903992, - "learning_rate": 7.737124297772065e-05, - "loss": 0.0647, - "step": 62970 - }, - { - "epoch": 4.120379456983971, - "grad_norm": 0.9246161580085754, - "learning_rate": 7.736355521461811e-05, - "loss": 0.0655, - "step": 62980 - }, - { - "epoch": 4.121033693163232, - "grad_norm": 0.9616913199424744, - "learning_rate": 7.735586652792382e-05, - "loss": 0.0727, - "step": 62990 - }, - { - "epoch": 4.121687929342492, - "grad_norm": 0.8557519912719727, - "learning_rate": 7.734817691789729e-05, - "loss": 0.066, - "step": 63000 - }, - { - "epoch": 4.122342165521753, - "grad_norm": 0.8217513561248779, - "learning_rate": 7.734048638479807e-05, - "loss": 0.0749, - "step": 63010 - }, - { - "epoch": 4.122996401701014, - "grad_norm": 1.1255154609680176, - "learning_rate": 7.733279492888572e-05, - "loss": 0.0753, - "step": 63020 - }, - { - "epoch": 4.123650637880274, - "grad_norm": 0.9445163607597351, - "learning_rate": 7.732510255041985e-05, - "loss": 0.0624, - "step": 63030 - }, - { - "epoch": 4.124304874059535, - "grad_norm": 0.9506664276123047, - "learning_rate": 7.731740924966014e-05, - "loss": 0.067, - "step": 63040 - }, - { - "epoch": 4.124959110238796, - "grad_norm": 0.9376375079154968, - "learning_rate": 7.730971502686621e-05, - "loss": 0.0673, - "step": 63050 - }, - { - "epoch": 4.125613346418057, - "grad_norm": 0.8614791035652161, - "learning_rate": 7.730201988229777e-05, - "loss": 0.0703, - "step": 63060 - }, - { - "epoch": 4.126267582597317, - "grad_norm": 0.8429438471794128, - "learning_rate": 7.729432381621455e-05, - "loss": 0.0619, - "step": 63070 - }, - { - "epoch": 4.126921818776578, - "grad_norm": 1.0551830530166626, - "learning_rate": 7.728662682887633e-05, - "loss": 0.0709, - "step": 63080 - }, - { - "epoch": 4.127576054955839, - "grad_norm": 0.9198732376098633, - "learning_rate": 7.727892892054289e-05, - "loss": 0.0647, - "step": 63090 - }, - { - "epoch": 4.128230291135099, - "grad_norm": 0.8567238450050354, - "learning_rate": 7.727123009147406e-05, - "loss": 0.0637, - "step": 63100 - }, - { - "epoch": 4.12888452731436, - "grad_norm": 0.947485089302063, - "learning_rate": 7.72635303419297e-05, - "loss": 0.0737, - "step": 63110 - }, - { - "epoch": 4.129538763493621, - "grad_norm": 0.8637893199920654, - "learning_rate": 7.725582967216966e-05, - "loss": 0.0712, - "step": 63120 - }, - { - "epoch": 4.130192999672882, - "grad_norm": 1.1613445281982422, - "learning_rate": 7.724812808245392e-05, - "loss": 0.0756, - "step": 63130 - }, - { - "epoch": 4.130847235852142, - "grad_norm": 0.8602867722511292, - "learning_rate": 7.724042557304238e-05, - "loss": 0.067, - "step": 63140 - }, - { - "epoch": 4.131501472031403, - "grad_norm": 0.8593533635139465, - "learning_rate": 7.723272214419506e-05, - "loss": 0.0615, - "step": 63150 - }, - { - "epoch": 4.132155708210664, - "grad_norm": 0.9966337084770203, - "learning_rate": 7.722501779617193e-05, - "loss": 0.0593, - "step": 63160 - }, - { - "epoch": 4.132809944389924, - "grad_norm": 0.997571587562561, - "learning_rate": 7.721731252923305e-05, - "loss": 0.0681, - "step": 63170 - }, - { - "epoch": 4.133464180569185, - "grad_norm": 0.94794100522995, - "learning_rate": 7.720960634363848e-05, - "loss": 0.0818, - "step": 63180 - }, - { - "epoch": 4.134118416748446, - "grad_norm": 0.8688943982124329, - "learning_rate": 7.720189923964833e-05, - "loss": 0.0653, - "step": 63190 - }, - { - "epoch": 4.1347726529277065, - "grad_norm": 0.8499542474746704, - "learning_rate": 7.719419121752277e-05, - "loss": 0.0734, - "step": 63200 - }, - { - "epoch": 4.135426889106967, - "grad_norm": 0.8392613530158997, - "learning_rate": 7.718648227752192e-05, - "loss": 0.0738, - "step": 63210 - }, - { - "epoch": 4.136081125286228, - "grad_norm": 1.165153980255127, - "learning_rate": 7.7178772419906e-05, - "loss": 0.0717, - "step": 63220 - }, - { - "epoch": 4.136735361465489, - "grad_norm": 0.7674516439437866, - "learning_rate": 7.717106164493523e-05, - "loss": 0.0682, - "step": 63230 - }, - { - "epoch": 4.1373895976447495, - "grad_norm": 0.8776054978370667, - "learning_rate": 7.716334995286988e-05, - "loss": 0.0712, - "step": 63240 - }, - { - "epoch": 4.13804383382401, - "grad_norm": 1.0000853538513184, - "learning_rate": 7.715563734397022e-05, - "loss": 0.0649, - "step": 63250 - }, - { - "epoch": 4.138698070003271, - "grad_norm": 0.9027912616729736, - "learning_rate": 7.714792381849658e-05, - "loss": 0.0746, - "step": 63260 - }, - { - "epoch": 4.1393523061825315, - "grad_norm": 1.001630425453186, - "learning_rate": 7.714020937670931e-05, - "loss": 0.0703, - "step": 63270 - }, - { - "epoch": 4.140006542361792, - "grad_norm": 0.9271620512008667, - "learning_rate": 7.713249401886882e-05, - "loss": 0.0685, - "step": 63280 - }, - { - "epoch": 4.140660778541053, - "grad_norm": 0.795330822467804, - "learning_rate": 7.712477774523547e-05, - "loss": 0.0696, - "step": 63290 - }, - { - "epoch": 4.141315014720314, - "grad_norm": 0.7591599225997925, - "learning_rate": 7.711706055606975e-05, - "loss": 0.063, - "step": 63300 - }, - { - "epoch": 4.1419692508995745, - "grad_norm": 0.9598954319953918, - "learning_rate": 7.710934245163211e-05, - "loss": 0.0658, - "step": 63310 - }, - { - "epoch": 4.142623487078835, - "grad_norm": 0.6453430652618408, - "learning_rate": 7.710162343218307e-05, - "loss": 0.0666, - "step": 63320 - }, - { - "epoch": 4.143277723258096, - "grad_norm": 0.8312489986419678, - "learning_rate": 7.709390349798315e-05, - "loss": 0.0651, - "step": 63330 - }, - { - "epoch": 4.1439319594373565, - "grad_norm": 0.7793470025062561, - "learning_rate": 7.708618264929295e-05, - "loss": 0.0713, - "step": 63340 - }, - { - "epoch": 4.1445861956166175, - "grad_norm": 0.827540397644043, - "learning_rate": 7.707846088637305e-05, - "loss": 0.0641, - "step": 63350 - }, - { - "epoch": 4.145240431795878, - "grad_norm": 0.8500876426696777, - "learning_rate": 7.707073820948407e-05, - "loss": 0.0698, - "step": 63360 - }, - { - "epoch": 4.145894667975139, - "grad_norm": 0.7591197490692139, - "learning_rate": 7.706301461888667e-05, - "loss": 0.0623, - "step": 63370 - }, - { - "epoch": 4.1465489041543995, - "grad_norm": 0.7708221077919006, - "learning_rate": 7.705529011484159e-05, - "loss": 0.0595, - "step": 63380 - }, - { - "epoch": 4.14720314033366, - "grad_norm": 0.8088019490242004, - "learning_rate": 7.704756469760947e-05, - "loss": 0.0609, - "step": 63390 - }, - { - "epoch": 4.147857376512921, - "grad_norm": 0.8570855855941772, - "learning_rate": 7.703983836745112e-05, - "loss": 0.0717, - "step": 63400 - }, - { - "epoch": 4.1485116126921815, - "grad_norm": 0.7673255801200867, - "learning_rate": 7.703211112462731e-05, - "loss": 0.06, - "step": 63410 - }, - { - "epoch": 4.1491658488714425, - "grad_norm": 0.9685210585594177, - "learning_rate": 7.702438296939887e-05, - "loss": 0.0609, - "step": 63420 - }, - { - "epoch": 4.149820085050703, - "grad_norm": 0.7904552817344666, - "learning_rate": 7.701665390202661e-05, - "loss": 0.0636, - "step": 63430 - }, - { - "epoch": 4.150474321229964, - "grad_norm": 0.8309029936790466, - "learning_rate": 7.700892392277144e-05, - "loss": 0.0626, - "step": 63440 - }, - { - "epoch": 4.1511285574092245, - "grad_norm": 0.7211902141571045, - "learning_rate": 7.700119303189424e-05, - "loss": 0.0626, - "step": 63450 - }, - { - "epoch": 4.1517827935884855, - "grad_norm": 0.7313457131385803, - "learning_rate": 7.699346122965599e-05, - "loss": 0.0544, - "step": 63460 - }, - { - "epoch": 4.152437029767746, - "grad_norm": 0.9150106906890869, - "learning_rate": 7.698572851631761e-05, - "loss": 0.0679, - "step": 63470 - }, - { - "epoch": 4.1530912659470065, - "grad_norm": 1.2330702543258667, - "learning_rate": 7.69779948921401e-05, - "loss": 0.0696, - "step": 63480 - }, - { - "epoch": 4.1537455021262675, - "grad_norm": 0.9700013995170593, - "learning_rate": 7.697026035738454e-05, - "loss": 0.0679, - "step": 63490 - }, - { - "epoch": 4.1543997383055284, - "grad_norm": 0.8161741495132446, - "learning_rate": 7.696252491231197e-05, - "loss": 0.0617, - "step": 63500 - }, - { - "epoch": 4.155053974484789, - "grad_norm": 0.9676251411437988, - "learning_rate": 7.695478855718344e-05, - "loss": 0.0713, - "step": 63510 - }, - { - "epoch": 4.1557082106640495, - "grad_norm": 0.9095385074615479, - "learning_rate": 7.694705129226012e-05, - "loss": 0.063, - "step": 63520 - }, - { - "epoch": 4.1563624468433105, - "grad_norm": 0.7652897238731384, - "learning_rate": 7.693931311780315e-05, - "loss": 0.0679, - "step": 63530 - }, - { - "epoch": 4.157016683022571, - "grad_norm": 0.8652570247650146, - "learning_rate": 7.693157403407372e-05, - "loss": 0.0624, - "step": 63540 - }, - { - "epoch": 4.1576709192018315, - "grad_norm": 0.7441179752349854, - "learning_rate": 7.692383404133301e-05, - "loss": 0.0568, - "step": 63550 - }, - { - "epoch": 4.1583251553810925, - "grad_norm": 0.8349494338035583, - "learning_rate": 7.691609313984232e-05, - "loss": 0.0621, - "step": 63560 - }, - { - "epoch": 4.1589793915603535, - "grad_norm": 0.8901420831680298, - "learning_rate": 7.690835132986287e-05, - "loss": 0.0687, - "step": 63570 - }, - { - "epoch": 4.159633627739614, - "grad_norm": 0.8408117890357971, - "learning_rate": 7.690060861165601e-05, - "loss": 0.0641, - "step": 63580 - }, - { - "epoch": 4.1602878639188745, - "grad_norm": 0.8637434244155884, - "learning_rate": 7.689286498548304e-05, - "loss": 0.0681, - "step": 63590 - }, - { - "epoch": 4.1609421000981355, - "grad_norm": 0.8616120219230652, - "learning_rate": 7.688512045160538e-05, - "loss": 0.0649, - "step": 63600 - }, - { - "epoch": 4.1615963362773964, - "grad_norm": 0.8039658069610596, - "learning_rate": 7.687737501028438e-05, - "loss": 0.0656, - "step": 63610 - }, - { - "epoch": 4.1622505724566565, - "grad_norm": 0.9429184198379517, - "learning_rate": 7.686962866178147e-05, - "loss": 0.0668, - "step": 63620 - }, - { - "epoch": 4.1629048086359175, - "grad_norm": 0.9747768044471741, - "learning_rate": 7.686188140635815e-05, - "loss": 0.0672, - "step": 63630 - }, - { - "epoch": 4.1635590448151785, - "grad_norm": 1.0797864198684692, - "learning_rate": 7.685413324427588e-05, - "loss": 0.0729, - "step": 63640 - }, - { - "epoch": 4.1642132809944385, - "grad_norm": 1.0528181791305542, - "learning_rate": 7.684638417579617e-05, - "loss": 0.0679, - "step": 63650 - }, - { - "epoch": 4.1648675171736995, - "grad_norm": 1.1309136152267456, - "learning_rate": 7.68386342011806e-05, - "loss": 0.0744, - "step": 63660 - }, - { - "epoch": 4.1655217533529605, - "grad_norm": 1.1569007635116577, - "learning_rate": 7.683088332069073e-05, - "loss": 0.0766, - "step": 63670 - }, - { - "epoch": 4.1661759895322215, - "grad_norm": 0.8142968416213989, - "learning_rate": 7.682313153458817e-05, - "loss": 0.0634, - "step": 63680 - }, - { - "epoch": 4.1668302257114815, - "grad_norm": 1.0308458805084229, - "learning_rate": 7.68153788431346e-05, - "loss": 0.0671, - "step": 63690 - }, - { - "epoch": 4.1674844618907425, - "grad_norm": 0.8902773857116699, - "learning_rate": 7.680762524659167e-05, - "loss": 0.0575, - "step": 63700 - }, - { - "epoch": 4.1681386980700035, - "grad_norm": 1.0067930221557617, - "learning_rate": 7.679987074522107e-05, - "loss": 0.0612, - "step": 63710 - }, - { - "epoch": 4.168792934249264, - "grad_norm": 0.8824333548545837, - "learning_rate": 7.679211533928454e-05, - "loss": 0.0631, - "step": 63720 - }, - { - "epoch": 4.1694471704285245, - "grad_norm": 0.8123067021369934, - "learning_rate": 7.678435902904386e-05, - "loss": 0.0651, - "step": 63730 - }, - { - "epoch": 4.1701014066077855, - "grad_norm": 1.0830172300338745, - "learning_rate": 7.677660181476081e-05, - "loss": 0.0693, - "step": 63740 - }, - { - "epoch": 4.1707556427870465, - "grad_norm": 0.9839672446250916, - "learning_rate": 7.676884369669723e-05, - "loss": 0.0662, - "step": 63750 - }, - { - "epoch": 4.1714098789663066, - "grad_norm": 0.941388726234436, - "learning_rate": 7.676108467511498e-05, - "loss": 0.066, - "step": 63760 - }, - { - "epoch": 4.1720641151455675, - "grad_norm": 0.8638181090354919, - "learning_rate": 7.675332475027593e-05, - "loss": 0.0714, - "step": 63770 - }, - { - "epoch": 4.1727183513248285, - "grad_norm": 0.8280830979347229, - "learning_rate": 7.674556392244201e-05, - "loss": 0.0676, - "step": 63780 - }, - { - "epoch": 4.173372587504089, - "grad_norm": 0.7582976222038269, - "learning_rate": 7.673780219187518e-05, - "loss": 0.0591, - "step": 63790 - }, - { - "epoch": 4.1740268236833495, - "grad_norm": 0.7644749283790588, - "learning_rate": 7.673003955883737e-05, - "loss": 0.058, - "step": 63800 - }, - { - "epoch": 4.1746810598626105, - "grad_norm": 0.88578200340271, - "learning_rate": 7.672227602359064e-05, - "loss": 0.069, - "step": 63810 - }, - { - "epoch": 4.1753352960418715, - "grad_norm": 0.8269320726394653, - "learning_rate": 7.671451158639702e-05, - "loss": 0.0553, - "step": 63820 - }, - { - "epoch": 4.175989532221132, - "grad_norm": 0.791187584400177, - "learning_rate": 7.670674624751857e-05, - "loss": 0.0666, - "step": 63830 - }, - { - "epoch": 4.1766437684003925, - "grad_norm": 0.8734759092330933, - "learning_rate": 7.669898000721738e-05, - "loss": 0.0613, - "step": 63840 - }, - { - "epoch": 4.1772980045796535, - "grad_norm": 0.9316699504852295, - "learning_rate": 7.66912128657556e-05, - "loss": 0.0632, - "step": 63850 - }, - { - "epoch": 4.177952240758914, - "grad_norm": 0.9909458160400391, - "learning_rate": 7.668344482339539e-05, - "loss": 0.0668, - "step": 63860 - }, - { - "epoch": 4.1786064769381746, - "grad_norm": 1.1051509380340576, - "learning_rate": 7.667567588039895e-05, - "loss": 0.0661, - "step": 63870 - }, - { - "epoch": 4.1792607131174355, - "grad_norm": 0.8877078890800476, - "learning_rate": 7.666790603702846e-05, - "loss": 0.0608, - "step": 63880 - }, - { - "epoch": 4.1799149492966965, - "grad_norm": 0.7553628087043762, - "learning_rate": 7.666013529354621e-05, - "loss": 0.0662, - "step": 63890 - }, - { - "epoch": 4.180569185475957, - "grad_norm": 0.8676695823669434, - "learning_rate": 7.665236365021448e-05, - "loss": 0.0669, - "step": 63900 - }, - { - "epoch": 4.1812234216552175, - "grad_norm": 0.7713704109191895, - "learning_rate": 7.664459110729558e-05, - "loss": 0.0642, - "step": 63910 - }, - { - "epoch": 4.1818776578344785, - "grad_norm": 1.018075704574585, - "learning_rate": 7.663681766505187e-05, - "loss": 0.0665, - "step": 63920 - }, - { - "epoch": 4.182531894013739, - "grad_norm": 0.94621342420578, - "learning_rate": 7.662904332374567e-05, - "loss": 0.0613, - "step": 63930 - }, - { - "epoch": 4.183186130193, - "grad_norm": 0.9044004678726196, - "learning_rate": 7.662126808363946e-05, - "loss": 0.0602, - "step": 63940 - }, - { - "epoch": 4.1838403663722605, - "grad_norm": 0.8055490255355835, - "learning_rate": 7.661349194499561e-05, - "loss": 0.0643, - "step": 63950 - }, - { - "epoch": 4.1844946025515215, - "grad_norm": 0.9511623382568359, - "learning_rate": 7.660571490807662e-05, - "loss": 0.0748, - "step": 63960 - }, - { - "epoch": 4.185148838730782, - "grad_norm": 0.8675695061683655, - "learning_rate": 7.659793697314496e-05, - "loss": 0.0609, - "step": 63970 - }, - { - "epoch": 4.1858030749100426, - "grad_norm": 0.9198430180549622, - "learning_rate": 7.659015814046318e-05, - "loss": 0.0718, - "step": 63980 - }, - { - "epoch": 4.1864573110893035, - "grad_norm": 0.8027588129043579, - "learning_rate": 7.658237841029383e-05, - "loss": 0.0562, - "step": 63990 - }, - { - "epoch": 4.187111547268564, - "grad_norm": 0.7004244327545166, - "learning_rate": 7.657459778289949e-05, - "loss": 0.0602, - "step": 64000 - }, - { - "epoch": 4.187765783447825, - "grad_norm": 0.8982917070388794, - "learning_rate": 7.656681625854278e-05, - "loss": 0.0646, - "step": 64010 - }, - { - "epoch": 4.1884200196270855, - "grad_norm": 0.9718723297119141, - "learning_rate": 7.655903383748637e-05, - "loss": 0.0811, - "step": 64020 - }, - { - "epoch": 4.1890742558063465, - "grad_norm": 0.9556106925010681, - "learning_rate": 7.655125051999289e-05, - "loss": 0.0666, - "step": 64030 - }, - { - "epoch": 4.189728491985607, - "grad_norm": 0.8579035997390747, - "learning_rate": 7.654346630632507e-05, - "loss": 0.0681, - "step": 64040 - }, - { - "epoch": 4.190382728164868, - "grad_norm": 0.8351026177406311, - "learning_rate": 7.653568119674567e-05, - "loss": 0.0722, - "step": 64050 - }, - { - "epoch": 4.1910369643441285, - "grad_norm": 0.8565570116043091, - "learning_rate": 7.652789519151741e-05, - "loss": 0.0584, - "step": 64060 - }, - { - "epoch": 4.191691200523389, - "grad_norm": 0.9618107080459595, - "learning_rate": 7.652010829090312e-05, - "loss": 0.0592, - "step": 64070 - }, - { - "epoch": 4.19234543670265, - "grad_norm": 0.6823198199272156, - "learning_rate": 7.651232049516566e-05, - "loss": 0.0549, - "step": 64080 - }, - { - "epoch": 4.1929996728819106, - "grad_norm": 0.9959099292755127, - "learning_rate": 7.650453180456783e-05, - "loss": 0.068, - "step": 64090 - }, - { - "epoch": 4.193653909061171, - "grad_norm": 0.767525315284729, - "learning_rate": 7.649674221937252e-05, - "loss": 0.0582, - "step": 64100 - }, - { - "epoch": 4.194308145240432, - "grad_norm": 1.036657452583313, - "learning_rate": 7.64889517398427e-05, - "loss": 0.073, - "step": 64110 - }, - { - "epoch": 4.194962381419693, - "grad_norm": 1.0274145603179932, - "learning_rate": 7.648116036624126e-05, - "loss": 0.0697, - "step": 64120 - }, - { - "epoch": 4.1956166175989535, - "grad_norm": 0.8063427209854126, - "learning_rate": 7.647336809883124e-05, - "loss": 0.0559, - "step": 64130 - }, - { - "epoch": 4.196270853778214, - "grad_norm": 0.8468816876411438, - "learning_rate": 7.646557493787558e-05, - "loss": 0.0635, - "step": 64140 - }, - { - "epoch": 4.196925089957475, - "grad_norm": 0.9064197540283203, - "learning_rate": 7.645778088363738e-05, - "loss": 0.0647, - "step": 64150 - }, - { - "epoch": 4.197579326136736, - "grad_norm": 0.9117610454559326, - "learning_rate": 7.644998593637968e-05, - "loss": 0.0668, - "step": 64160 - }, - { - "epoch": 4.198233562315996, - "grad_norm": 0.6964724063873291, - "learning_rate": 7.64421900963656e-05, - "loss": 0.064, - "step": 64170 - }, - { - "epoch": 4.198887798495257, - "grad_norm": 0.8209680318832397, - "learning_rate": 7.643439336385824e-05, - "loss": 0.0636, - "step": 64180 - }, - { - "epoch": 4.199542034674518, - "grad_norm": 0.8586738705635071, - "learning_rate": 7.642659573912078e-05, - "loss": 0.0644, - "step": 64190 - }, - { - "epoch": 4.200196270853779, - "grad_norm": 0.8765914440155029, - "learning_rate": 7.641879722241643e-05, - "loss": 0.0718, - "step": 64200 - }, - { - "epoch": 4.200850507033039, - "grad_norm": 0.9443649649620056, - "learning_rate": 7.641099781400838e-05, - "loss": 0.0835, - "step": 64210 - }, - { - "epoch": 4.2015047432123, - "grad_norm": 1.0230027437210083, - "learning_rate": 7.640319751415987e-05, - "loss": 0.0697, - "step": 64220 - }, - { - "epoch": 4.202158979391561, - "grad_norm": 1.0767507553100586, - "learning_rate": 7.63953963231342e-05, - "loss": 0.0671, - "step": 64230 - }, - { - "epoch": 4.202813215570821, - "grad_norm": 0.9476938247680664, - "learning_rate": 7.63875942411947e-05, - "loss": 0.0624, - "step": 64240 - }, - { - "epoch": 4.203467451750082, - "grad_norm": 0.9079118967056274, - "learning_rate": 7.637979126860468e-05, - "loss": 0.0665, - "step": 64250 - }, - { - "epoch": 4.204121687929343, - "grad_norm": 0.7623441219329834, - "learning_rate": 7.637198740562752e-05, - "loss": 0.0564, - "step": 64260 - }, - { - "epoch": 4.204775924108604, - "grad_norm": 0.7987393736839294, - "learning_rate": 7.636418265252662e-05, - "loss": 0.0642, - "step": 64270 - }, - { - "epoch": 4.205430160287864, - "grad_norm": 0.7768983840942383, - "learning_rate": 7.635637700956542e-05, - "loss": 0.0689, - "step": 64280 - }, - { - "epoch": 4.206084396467125, - "grad_norm": 0.9263744354248047, - "learning_rate": 7.634857047700737e-05, - "loss": 0.0591, - "step": 64290 - }, - { - "epoch": 4.206738632646386, - "grad_norm": 0.9548198580741882, - "learning_rate": 7.634076305511598e-05, - "loss": 0.0626, - "step": 64300 - }, - { - "epoch": 4.207392868825646, - "grad_norm": 0.9324436783790588, - "learning_rate": 7.633295474415473e-05, - "loss": 0.062, - "step": 64310 - }, - { - "epoch": 4.208047105004907, - "grad_norm": 0.9058679342269897, - "learning_rate": 7.63251455443872e-05, - "loss": 0.0726, - "step": 64320 - }, - { - "epoch": 4.208701341184168, - "grad_norm": 0.6921377778053284, - "learning_rate": 7.631733545607697e-05, - "loss": 0.0582, - "step": 64330 - }, - { - "epoch": 4.209355577363429, - "grad_norm": 0.7973593473434448, - "learning_rate": 7.630952447948765e-05, - "loss": 0.0667, - "step": 64340 - }, - { - "epoch": 4.210009813542689, - "grad_norm": 0.8190951943397522, - "learning_rate": 7.630171261488289e-05, - "loss": 0.0559, - "step": 64350 - }, - { - "epoch": 4.21066404972195, - "grad_norm": 0.9391740560531616, - "learning_rate": 7.629389986252634e-05, - "loss": 0.0589, - "step": 64360 - }, - { - "epoch": 4.211318285901211, - "grad_norm": 0.8340182304382324, - "learning_rate": 7.628608622268171e-05, - "loss": 0.0663, - "step": 64370 - }, - { - "epoch": 4.211972522080471, - "grad_norm": 0.9111093878746033, - "learning_rate": 7.627827169561275e-05, - "loss": 0.0664, - "step": 64380 - }, - { - "epoch": 4.212626758259732, - "grad_norm": 0.8605757355690002, - "learning_rate": 7.627045628158318e-05, - "loss": 0.0681, - "step": 64390 - }, - { - "epoch": 4.213280994438993, - "grad_norm": 0.9816390872001648, - "learning_rate": 7.626263998085683e-05, - "loss": 0.0607, - "step": 64400 - }, - { - "epoch": 4.213935230618254, - "grad_norm": 0.8772796988487244, - "learning_rate": 7.625482279369749e-05, - "loss": 0.0548, - "step": 64410 - }, - { - "epoch": 4.214589466797514, - "grad_norm": 0.8973631858825684, - "learning_rate": 7.624700472036904e-05, - "loss": 0.0561, - "step": 64420 - }, - { - "epoch": 4.215243702976775, - "grad_norm": 1.0034244060516357, - "learning_rate": 7.623918576113533e-05, - "loss": 0.0648, - "step": 64430 - }, - { - "epoch": 4.215897939156036, - "grad_norm": 0.8082411885261536, - "learning_rate": 7.62313659162603e-05, - "loss": 0.0651, - "step": 64440 - }, - { - "epoch": 4.216552175335296, - "grad_norm": 0.8607401847839355, - "learning_rate": 7.622354518600786e-05, - "loss": 0.063, - "step": 64450 - }, - { - "epoch": 4.217206411514557, - "grad_norm": 1.0780029296875, - "learning_rate": 7.621572357064202e-05, - "loss": 0.0556, - "step": 64460 - }, - { - "epoch": 4.217860647693818, - "grad_norm": 0.8647873997688293, - "learning_rate": 7.620790107042674e-05, - "loss": 0.0654, - "step": 64470 - }, - { - "epoch": 4.218514883873079, - "grad_norm": 0.9308034181594849, - "learning_rate": 7.620007768562606e-05, - "loss": 0.0719, - "step": 64480 - }, - { - "epoch": 4.219169120052339, - "grad_norm": 1.1081721782684326, - "learning_rate": 7.619225341650404e-05, - "loss": 0.0765, - "step": 64490 - }, - { - "epoch": 4.2198233562316, - "grad_norm": 0.7883846163749695, - "learning_rate": 7.618442826332482e-05, - "loss": 0.0683, - "step": 64500 - }, - { - "epoch": 4.220477592410861, - "grad_norm": 0.6897490620613098, - "learning_rate": 7.617660222635243e-05, - "loss": 0.0579, - "step": 64510 - }, - { - "epoch": 4.221131828590121, - "grad_norm": 0.7844201326370239, - "learning_rate": 7.616877530585107e-05, - "loss": 0.0584, - "step": 64520 - }, - { - "epoch": 4.221786064769382, - "grad_norm": 0.8688981533050537, - "learning_rate": 7.616094750208493e-05, - "loss": 0.0738, - "step": 64530 - }, - { - "epoch": 4.222440300948643, - "grad_norm": 0.886189341545105, - "learning_rate": 7.61531188153182e-05, - "loss": 0.0587, - "step": 64540 - }, - { - "epoch": 4.223094537127903, - "grad_norm": 1.0094472169876099, - "learning_rate": 7.61452892458151e-05, - "loss": 0.0697, - "step": 64550 - }, - { - "epoch": 4.223748773307164, - "grad_norm": 1.0703370571136475, - "learning_rate": 7.613745879383995e-05, - "loss": 0.0695, - "step": 64560 - }, - { - "epoch": 4.224403009486425, - "grad_norm": 0.8577084541320801, - "learning_rate": 7.612962745965699e-05, - "loss": 0.0613, - "step": 64570 - }, - { - "epoch": 4.225057245665686, - "grad_norm": 0.6969327330589294, - "learning_rate": 7.612179524353058e-05, - "loss": 0.0734, - "step": 64580 - }, - { - "epoch": 4.225711481844946, - "grad_norm": 0.8476243019104004, - "learning_rate": 7.611396214572508e-05, - "loss": 0.0715, - "step": 64590 - }, - { - "epoch": 4.226365718024207, - "grad_norm": 0.8455610871315002, - "learning_rate": 7.610612816650488e-05, - "loss": 0.0605, - "step": 64600 - }, - { - "epoch": 4.227019954203468, - "grad_norm": 1.1364946365356445, - "learning_rate": 7.609829330613439e-05, - "loss": 0.0741, - "step": 64610 - }, - { - "epoch": 4.227674190382728, - "grad_norm": 1.1362121105194092, - "learning_rate": 7.609045756487805e-05, - "loss": 0.0667, - "step": 64620 - }, - { - "epoch": 4.228328426561989, - "grad_norm": 1.0451109409332275, - "learning_rate": 7.608262094300034e-05, - "loss": 0.0725, - "step": 64630 - }, - { - "epoch": 4.22898266274125, - "grad_norm": 0.9875683188438416, - "learning_rate": 7.607478344076577e-05, - "loss": 0.057, - "step": 64640 - }, - { - "epoch": 4.229636898920511, - "grad_norm": 1.0684120655059814, - "learning_rate": 7.606694505843887e-05, - "loss": 0.0645, - "step": 64650 - }, - { - "epoch": 4.230291135099771, - "grad_norm": 1.1099648475646973, - "learning_rate": 7.605910579628421e-05, - "loss": 0.0743, - "step": 64660 - }, - { - "epoch": 4.230945371279032, - "grad_norm": 1.0001264810562134, - "learning_rate": 7.60512656545664e-05, - "loss": 0.0664, - "step": 64670 - }, - { - "epoch": 4.231599607458293, - "grad_norm": 0.8117485046386719, - "learning_rate": 7.604342463355003e-05, - "loss": 0.0688, - "step": 64680 - }, - { - "epoch": 4.232253843637553, - "grad_norm": 0.9693712592124939, - "learning_rate": 7.60355827334998e-05, - "loss": 0.0747, - "step": 64690 - }, - { - "epoch": 4.232908079816814, - "grad_norm": 0.9077291488647461, - "learning_rate": 7.602773995468036e-05, - "loss": 0.0593, - "step": 64700 - }, - { - "epoch": 4.233562315996075, - "grad_norm": 1.1760419607162476, - "learning_rate": 7.601989629735643e-05, - "loss": 0.076, - "step": 64710 - }, - { - "epoch": 4.234216552175336, - "grad_norm": 0.9444860219955444, - "learning_rate": 7.601205176179279e-05, - "loss": 0.0658, - "step": 64720 - }, - { - "epoch": 4.234870788354596, - "grad_norm": 0.9393828511238098, - "learning_rate": 7.600420634825416e-05, - "loss": 0.0571, - "step": 64730 - }, - { - "epoch": 4.235525024533857, - "grad_norm": 1.003543496131897, - "learning_rate": 7.599636005700537e-05, - "loss": 0.0668, - "step": 64740 - }, - { - "epoch": 4.236179260713118, - "grad_norm": 0.8727512955665588, - "learning_rate": 7.598851288831124e-05, - "loss": 0.0624, - "step": 64750 - }, - { - "epoch": 4.236833496892378, - "grad_norm": 1.0206762552261353, - "learning_rate": 7.598066484243667e-05, - "loss": 0.0789, - "step": 64760 - }, - { - "epoch": 4.237487733071639, - "grad_norm": 1.1758549213409424, - "learning_rate": 7.597281591964649e-05, - "loss": 0.0747, - "step": 64770 - }, - { - "epoch": 4.2381419692509, - "grad_norm": 0.7732208371162415, - "learning_rate": 7.596496612020567e-05, - "loss": 0.0596, - "step": 64780 - }, - { - "epoch": 4.238796205430161, - "grad_norm": 0.8588206768035889, - "learning_rate": 7.595711544437917e-05, - "loss": 0.063, - "step": 64790 - }, - { - "epoch": 4.239450441609421, - "grad_norm": 0.811578094959259, - "learning_rate": 7.594926389243193e-05, - "loss": 0.0732, - "step": 64800 - }, - { - "epoch": 4.240104677788682, - "grad_norm": 0.7321622371673584, - "learning_rate": 7.594141146462897e-05, - "loss": 0.0645, - "step": 64810 - }, - { - "epoch": 4.240758913967943, - "grad_norm": 0.690522313117981, - "learning_rate": 7.593355816123535e-05, - "loss": 0.0696, - "step": 64820 - }, - { - "epoch": 4.241413150147203, - "grad_norm": 0.8633084297180176, - "learning_rate": 7.592570398251614e-05, - "loss": 0.0669, - "step": 64830 - }, - { - "epoch": 4.242067386326464, - "grad_norm": 0.8941366076469421, - "learning_rate": 7.591784892873642e-05, - "loss": 0.0604, - "step": 64840 - }, - { - "epoch": 4.242721622505725, - "grad_norm": 0.9090977311134338, - "learning_rate": 7.590999300016131e-05, - "loss": 0.0656, - "step": 64850 - }, - { - "epoch": 4.243375858684986, - "grad_norm": 0.734219491481781, - "learning_rate": 7.5902136197056e-05, - "loss": 0.0589, - "step": 64860 - }, - { - "epoch": 4.244030094864246, - "grad_norm": 0.9445002675056458, - "learning_rate": 7.589427851968567e-05, - "loss": 0.0696, - "step": 64870 - }, - { - "epoch": 4.244684331043507, - "grad_norm": 0.8005701899528503, - "learning_rate": 7.58864199683155e-05, - "loss": 0.0658, - "step": 64880 - }, - { - "epoch": 4.245338567222768, - "grad_norm": 1.0320810079574585, - "learning_rate": 7.58785605432108e-05, - "loss": 0.0689, - "step": 64890 - }, - { - "epoch": 4.245992803402028, - "grad_norm": 0.820569634437561, - "learning_rate": 7.58707002446368e-05, - "loss": 0.0665, - "step": 64900 - }, - { - "epoch": 4.246647039581289, - "grad_norm": 0.8624978065490723, - "learning_rate": 7.58628390728588e-05, - "loss": 0.0674, - "step": 64910 - }, - { - "epoch": 4.24730127576055, - "grad_norm": 0.9039542078971863, - "learning_rate": 7.58549770281422e-05, - "loss": 0.0871, - "step": 64920 - }, - { - "epoch": 4.247955511939811, - "grad_norm": 0.8915390968322754, - "learning_rate": 7.584711411075227e-05, - "loss": 0.0585, - "step": 64930 - }, - { - "epoch": 4.248609748119071, - "grad_norm": 0.9667330980300903, - "learning_rate": 7.583925032095447e-05, - "loss": 0.0598, - "step": 64940 - }, - { - "epoch": 4.249263984298332, - "grad_norm": 0.9445711970329285, - "learning_rate": 7.583138565901422e-05, - "loss": 0.0579, - "step": 64950 - }, - { - "epoch": 4.249918220477593, - "grad_norm": 1.0453894138336182, - "learning_rate": 7.582352012519694e-05, - "loss": 0.0719, - "step": 64960 - }, - { - "epoch": 4.250572456656853, - "grad_norm": 0.9736664891242981, - "learning_rate": 7.581565371976813e-05, - "loss": 0.063, - "step": 64970 - }, - { - "epoch": 4.251226692836114, - "grad_norm": 0.7672659158706665, - "learning_rate": 7.580778644299332e-05, - "loss": 0.0648, - "step": 64980 - }, - { - "epoch": 4.251880929015375, - "grad_norm": 0.7975163459777832, - "learning_rate": 7.579991829513802e-05, - "loss": 0.0604, - "step": 64990 - }, - { - "epoch": 4.252535165194635, - "grad_norm": 0.8647359013557434, - "learning_rate": 7.579204927646782e-05, - "loss": 0.0739, - "step": 65000 - }, - { - "epoch": 4.253189401373896, - "grad_norm": 0.7860682606697083, - "learning_rate": 7.57841793872483e-05, - "loss": 0.0621, - "step": 65010 - }, - { - "epoch": 4.253843637553157, - "grad_norm": 0.9216861128807068, - "learning_rate": 7.577630862774515e-05, - "loss": 0.0593, - "step": 65020 - }, - { - "epoch": 4.254497873732418, - "grad_norm": 0.9444064497947693, - "learning_rate": 7.576843699822394e-05, - "loss": 0.0583, - "step": 65030 - }, - { - "epoch": 4.255152109911678, - "grad_norm": 0.833625853061676, - "learning_rate": 7.57605644989504e-05, - "loss": 0.0653, - "step": 65040 - }, - { - "epoch": 4.255806346090939, - "grad_norm": 0.8485620617866516, - "learning_rate": 7.575269113019027e-05, - "loss": 0.0723, - "step": 65050 - }, - { - "epoch": 4.2564605822702, - "grad_norm": 0.9002768397331238, - "learning_rate": 7.574481689220926e-05, - "loss": 0.0667, - "step": 65060 - }, - { - "epoch": 4.257114818449461, - "grad_norm": 0.7625710964202881, - "learning_rate": 7.573694178527316e-05, - "loss": 0.0671, - "step": 65070 - }, - { - "epoch": 4.257769054628721, - "grad_norm": 0.912322998046875, - "learning_rate": 7.572906580964779e-05, - "loss": 0.0707, - "step": 65080 - }, - { - "epoch": 4.258423290807982, - "grad_norm": 0.8781670331954956, - "learning_rate": 7.572118896559896e-05, - "loss": 0.0731, - "step": 65090 - }, - { - "epoch": 4.259077526987243, - "grad_norm": 1.0701106786727905, - "learning_rate": 7.571331125339256e-05, - "loss": 0.062, - "step": 65100 - }, - { - "epoch": 4.259731763166503, - "grad_norm": 1.057981014251709, - "learning_rate": 7.570543267329446e-05, - "loss": 0.0619, - "step": 65110 - }, - { - "epoch": 4.260385999345764, - "grad_norm": 0.6521860361099243, - "learning_rate": 7.56975532255706e-05, - "loss": 0.0532, - "step": 65120 - }, - { - "epoch": 4.261040235525025, - "grad_norm": 0.7353628277778625, - "learning_rate": 7.568967291048692e-05, - "loss": 0.0675, - "step": 65130 - }, - { - "epoch": 4.261694471704285, - "grad_norm": 0.8143835067749023, - "learning_rate": 7.56817917283094e-05, - "loss": 0.0615, - "step": 65140 - }, - { - "epoch": 4.262348707883546, - "grad_norm": 0.7694095373153687, - "learning_rate": 7.567390967930406e-05, - "loss": 0.065, - "step": 65150 - }, - { - "epoch": 4.263002944062807, - "grad_norm": 1.1253188848495483, - "learning_rate": 7.566602676373694e-05, - "loss": 0.0745, - "step": 65160 - }, - { - "epoch": 4.263657180242068, - "grad_norm": 0.760852575302124, - "learning_rate": 7.56581429818741e-05, - "loss": 0.064, - "step": 65170 - }, - { - "epoch": 4.264311416421328, - "grad_norm": 0.808857262134552, - "learning_rate": 7.565025833398164e-05, - "loss": 0.0654, - "step": 65180 - }, - { - "epoch": 4.264965652600589, - "grad_norm": 0.7772423028945923, - "learning_rate": 7.56423728203257e-05, - "loss": 0.0645, - "step": 65190 - }, - { - "epoch": 4.26561988877985, - "grad_norm": 0.8490001559257507, - "learning_rate": 7.563448644117242e-05, - "loss": 0.0618, - "step": 65200 - }, - { - "epoch": 4.26627412495911, - "grad_norm": 0.776184618473053, - "learning_rate": 7.562659919678801e-05, - "loss": 0.0724, - "step": 65210 - }, - { - "epoch": 4.266928361138371, - "grad_norm": 0.765455424785614, - "learning_rate": 7.561871108743865e-05, - "loss": 0.0681, - "step": 65220 - }, - { - "epoch": 4.267582597317632, - "grad_norm": 0.8536463975906372, - "learning_rate": 7.561082211339062e-05, - "loss": 0.0693, - "step": 65230 - }, - { - "epoch": 4.268236833496893, - "grad_norm": 0.7764464616775513, - "learning_rate": 7.560293227491017e-05, - "loss": 0.0532, - "step": 65240 - }, - { - "epoch": 4.268891069676153, - "grad_norm": 0.8680564165115356, - "learning_rate": 7.55950415722636e-05, - "loss": 0.0577, - "step": 65250 - }, - { - "epoch": 4.269545305855414, - "grad_norm": 0.8063961267471313, - "learning_rate": 7.558715000571726e-05, - "loss": 0.0709, - "step": 65260 - }, - { - "epoch": 4.270199542034675, - "grad_norm": 0.8812915086746216, - "learning_rate": 7.55792575755375e-05, - "loss": 0.0586, - "step": 65270 - }, - { - "epoch": 4.270853778213935, - "grad_norm": 0.7072269320487976, - "learning_rate": 7.55713642819907e-05, - "loss": 0.071, - "step": 65280 - }, - { - "epoch": 4.271508014393196, - "grad_norm": 0.9915341138839722, - "learning_rate": 7.55634701253433e-05, - "loss": 0.0548, - "step": 65290 - }, - { - "epoch": 4.272162250572457, - "grad_norm": 0.9940981864929199, - "learning_rate": 7.555557510586175e-05, - "loss": 0.0664, - "step": 65300 - }, - { - "epoch": 4.272816486751718, - "grad_norm": 1.065885305404663, - "learning_rate": 7.554767922381253e-05, - "loss": 0.0663, - "step": 65310 - }, - { - "epoch": 4.273470722930978, - "grad_norm": 0.7399426102638245, - "learning_rate": 7.553978247946212e-05, - "loss": 0.0616, - "step": 65320 - }, - { - "epoch": 4.274124959110239, - "grad_norm": 0.7760506868362427, - "learning_rate": 7.553188487307705e-05, - "loss": 0.0592, - "step": 65330 - }, - { - "epoch": 4.2747791952895, - "grad_norm": 0.9584298133850098, - "learning_rate": 7.552398640492393e-05, - "loss": 0.0646, - "step": 65340 - }, - { - "epoch": 4.27543343146876, - "grad_norm": 0.807236909866333, - "learning_rate": 7.551608707526933e-05, - "loss": 0.0694, - "step": 65350 - }, - { - "epoch": 4.276087667648021, - "grad_norm": 0.9472173452377319, - "learning_rate": 7.550818688437986e-05, - "loss": 0.0608, - "step": 65360 - }, - { - "epoch": 4.276741903827282, - "grad_norm": 0.9590891003608704, - "learning_rate": 7.55002858325222e-05, - "loss": 0.0615, - "step": 65370 - }, - { - "epoch": 4.277396140006543, - "grad_norm": 0.8383166193962097, - "learning_rate": 7.549238391996302e-05, - "loss": 0.0632, - "step": 65380 - }, - { - "epoch": 4.278050376185803, - "grad_norm": 0.7799770832061768, - "learning_rate": 7.5484481146969e-05, - "loss": 0.0631, - "step": 65390 - }, - { - "epoch": 4.278704612365064, - "grad_norm": 1.1362422704696655, - "learning_rate": 7.547657751380694e-05, - "loss": 0.0738, - "step": 65400 - }, - { - "epoch": 4.279358848544325, - "grad_norm": 0.9604383111000061, - "learning_rate": 7.546867302074354e-05, - "loss": 0.0579, - "step": 65410 - }, - { - "epoch": 4.280013084723585, - "grad_norm": 0.9436408877372742, - "learning_rate": 7.546076766804567e-05, - "loss": 0.0669, - "step": 65420 - }, - { - "epoch": 4.280667320902846, - "grad_norm": 0.8412542939186096, - "learning_rate": 7.545286145598007e-05, - "loss": 0.0744, - "step": 65430 - }, - { - "epoch": 4.281321557082107, - "grad_norm": 0.7971706986427307, - "learning_rate": 7.544495438481367e-05, - "loss": 0.0613, - "step": 65440 - }, - { - "epoch": 4.281975793261367, - "grad_norm": 0.8335549235343933, - "learning_rate": 7.543704645481333e-05, - "loss": 0.0696, - "step": 65450 - }, - { - "epoch": 4.282630029440628, - "grad_norm": 0.7970999479293823, - "learning_rate": 7.542913766624596e-05, - "loss": 0.0631, - "step": 65460 - }, - { - "epoch": 4.283284265619889, - "grad_norm": 0.990964949131012, - "learning_rate": 7.542122801937849e-05, - "loss": 0.0677, - "step": 65470 - }, - { - "epoch": 4.28393850179915, - "grad_norm": 1.0644537210464478, - "learning_rate": 7.541331751447792e-05, - "loss": 0.0689, - "step": 65480 - }, - { - "epoch": 4.28459273797841, - "grad_norm": 0.9389525055885315, - "learning_rate": 7.540540615181123e-05, - "loss": 0.0643, - "step": 65490 - }, - { - "epoch": 4.285246974157671, - "grad_norm": 0.8526713252067566, - "learning_rate": 7.539749393164546e-05, - "loss": 0.0662, - "step": 65500 - }, - { - "epoch": 4.285901210336932, - "grad_norm": 0.9678772687911987, - "learning_rate": 7.538958085424765e-05, - "loss": 0.065, - "step": 65510 - }, - { - "epoch": 4.286555446516193, - "grad_norm": 0.8349334001541138, - "learning_rate": 7.53816669198849e-05, - "loss": 0.0737, - "step": 65520 - }, - { - "epoch": 4.287209682695453, - "grad_norm": 0.8528868556022644, - "learning_rate": 7.537375212882433e-05, - "loss": 0.0585, - "step": 65530 - }, - { - "epoch": 4.287863918874714, - "grad_norm": 0.7921110987663269, - "learning_rate": 7.536583648133311e-05, - "loss": 0.0621, - "step": 65540 - }, - { - "epoch": 4.288518155053975, - "grad_norm": 1.04022216796875, - "learning_rate": 7.535791997767834e-05, - "loss": 0.061, - "step": 65550 - }, - { - "epoch": 4.289172391233235, - "grad_norm": 0.7660422325134277, - "learning_rate": 7.535000261812729e-05, - "loss": 0.0618, - "step": 65560 - }, - { - "epoch": 4.289826627412496, - "grad_norm": 1.0147818326950073, - "learning_rate": 7.534208440294717e-05, - "loss": 0.0584, - "step": 65570 - }, - { - "epoch": 4.290480863591757, - "grad_norm": 0.7676869630813599, - "learning_rate": 7.533416533240523e-05, - "loss": 0.0664, - "step": 65580 - }, - { - "epoch": 4.291135099771017, - "grad_norm": 0.7164015173912048, - "learning_rate": 7.532624540676876e-05, - "loss": 0.0643, - "step": 65590 - }, - { - "epoch": 4.291789335950278, - "grad_norm": 0.7891098856925964, - "learning_rate": 7.53183246263051e-05, - "loss": 0.0799, - "step": 65600 - }, - { - "epoch": 4.292443572129539, - "grad_norm": 0.8349225521087646, - "learning_rate": 7.531040299128158e-05, - "loss": 0.0689, - "step": 65610 - }, - { - "epoch": 4.2930978083088, - "grad_norm": 0.9937372803688049, - "learning_rate": 7.530248050196557e-05, - "loss": 0.0657, - "step": 65620 - }, - { - "epoch": 4.29375204448806, - "grad_norm": 0.9106799960136414, - "learning_rate": 7.529455715862452e-05, - "loss": 0.0684, - "step": 65630 - }, - { - "epoch": 4.294406280667321, - "grad_norm": 0.9614897966384888, - "learning_rate": 7.52866329615258e-05, - "loss": 0.0642, - "step": 65640 - }, - { - "epoch": 4.295060516846582, - "grad_norm": 0.914214015007019, - "learning_rate": 7.527870791093691e-05, - "loss": 0.0648, - "step": 65650 - }, - { - "epoch": 4.295714753025842, - "grad_norm": 0.8048765063285828, - "learning_rate": 7.527078200712533e-05, - "loss": 0.0655, - "step": 65660 - }, - { - "epoch": 4.296368989205103, - "grad_norm": 0.9100319743156433, - "learning_rate": 7.526285525035858e-05, - "loss": 0.0678, - "step": 65670 - }, - { - "epoch": 4.297023225384364, - "grad_norm": 0.9633187055587769, - "learning_rate": 7.52549276409042e-05, - "loss": 0.0681, - "step": 65680 - }, - { - "epoch": 4.297677461563625, - "grad_norm": 0.7984033823013306, - "learning_rate": 7.52469991790298e-05, - "loss": 0.0609, - "step": 65690 - }, - { - "epoch": 4.298331697742885, - "grad_norm": 0.9760509729385376, - "learning_rate": 7.523906986500296e-05, - "loss": 0.0739, - "step": 65700 - }, - { - "epoch": 4.298985933922146, - "grad_norm": 1.10667085647583, - "learning_rate": 7.52311396990913e-05, - "loss": 0.0653, - "step": 65710 - }, - { - "epoch": 4.299640170101407, - "grad_norm": 1.0350356101989746, - "learning_rate": 7.522320868156253e-05, - "loss": 0.0656, - "step": 65720 - }, - { - "epoch": 4.300294406280667, - "grad_norm": 1.0190367698669434, - "learning_rate": 7.521527681268431e-05, - "loss": 0.0714, - "step": 65730 - }, - { - "epoch": 4.300948642459928, - "grad_norm": 0.7813534140586853, - "learning_rate": 7.520734409272437e-05, - "loss": 0.0592, - "step": 65740 - }, - { - "epoch": 4.301602878639189, - "grad_norm": 0.9878474473953247, - "learning_rate": 7.519941052195045e-05, - "loss": 0.0642, - "step": 65750 - }, - { - "epoch": 4.30225711481845, - "grad_norm": 0.7413721680641174, - "learning_rate": 7.519147610063035e-05, - "loss": 0.0593, - "step": 65760 - }, - { - "epoch": 4.30291135099771, - "grad_norm": 0.8811038732528687, - "learning_rate": 7.518354082903184e-05, - "loss": 0.0664, - "step": 65770 - }, - { - "epoch": 4.303565587176971, - "grad_norm": 0.9010792970657349, - "learning_rate": 7.517560470742279e-05, - "loss": 0.0669, - "step": 65780 - }, - { - "epoch": 4.304219823356232, - "grad_norm": 0.8902906775474548, - "learning_rate": 7.516766773607107e-05, - "loss": 0.068, - "step": 65790 - }, - { - "epoch": 4.304874059535492, - "grad_norm": 0.9187259674072266, - "learning_rate": 7.515972991524454e-05, - "loss": 0.0681, - "step": 65800 - }, - { - "epoch": 4.305528295714753, - "grad_norm": 0.7792700529098511, - "learning_rate": 7.515179124521116e-05, - "loss": 0.0599, - "step": 65810 - }, - { - "epoch": 4.306182531894014, - "grad_norm": 0.7929428815841675, - "learning_rate": 7.514385172623886e-05, - "loss": 0.0596, - "step": 65820 - }, - { - "epoch": 4.306836768073275, - "grad_norm": 0.9037280678749084, - "learning_rate": 7.513591135859561e-05, - "loss": 0.0637, - "step": 65830 - }, - { - "epoch": 4.307491004252535, - "grad_norm": 0.8752690553665161, - "learning_rate": 7.512797014254944e-05, - "loss": 0.0617, - "step": 65840 - }, - { - "epoch": 4.308145240431796, - "grad_norm": 0.9982262253761292, - "learning_rate": 7.512002807836838e-05, - "loss": 0.0617, - "step": 65850 - }, - { - "epoch": 4.308799476611057, - "grad_norm": 1.1322745084762573, - "learning_rate": 7.511208516632047e-05, - "loss": 0.0665, - "step": 65860 - }, - { - "epoch": 4.309453712790317, - "grad_norm": 0.8582587242126465, - "learning_rate": 7.510414140667385e-05, - "loss": 0.0717, - "step": 65870 - }, - { - "epoch": 4.310107948969578, - "grad_norm": 1.0602346658706665, - "learning_rate": 7.50961967996966e-05, - "loss": 0.0662, - "step": 65880 - }, - { - "epoch": 4.310762185148839, - "grad_norm": 0.9373438954353333, - "learning_rate": 7.508825134565692e-05, - "loss": 0.0596, - "step": 65890 - }, - { - "epoch": 4.311416421328099, - "grad_norm": 0.8777552843093872, - "learning_rate": 7.508030504482296e-05, - "loss": 0.0611, - "step": 65900 - }, - { - "epoch": 4.31207065750736, - "grad_norm": 0.8960667848587036, - "learning_rate": 7.50723578974629e-05, - "loss": 0.0631, - "step": 65910 - }, - { - "epoch": 4.312724893686621, - "grad_norm": 1.027182698249817, - "learning_rate": 7.506440990384502e-05, - "loss": 0.0668, - "step": 65920 - }, - { - "epoch": 4.313379129865882, - "grad_norm": 0.7670314311981201, - "learning_rate": 7.505646106423756e-05, - "loss": 0.0557, - "step": 65930 - }, - { - "epoch": 4.314033366045142, - "grad_norm": 0.9890926480293274, - "learning_rate": 7.504851137890885e-05, - "loss": 0.0718, - "step": 65940 - }, - { - "epoch": 4.314687602224403, - "grad_norm": 0.7140153050422668, - "learning_rate": 7.504056084812718e-05, - "loss": 0.0695, - "step": 65950 - }, - { - "epoch": 4.315341838403664, - "grad_norm": 0.8227149844169617, - "learning_rate": 7.50326094721609e-05, - "loss": 0.0566, - "step": 65960 - }, - { - "epoch": 4.315996074582925, - "grad_norm": 1.0063236951828003, - "learning_rate": 7.502465725127839e-05, - "loss": 0.0601, - "step": 65970 - }, - { - "epoch": 4.316650310762185, - "grad_norm": 0.8858514428138733, - "learning_rate": 7.501670418574808e-05, - "loss": 0.0561, - "step": 65980 - }, - { - "epoch": 4.317304546941446, - "grad_norm": 0.9521268606185913, - "learning_rate": 7.500875027583843e-05, - "loss": 0.0743, - "step": 65990 - }, - { - "epoch": 4.317958783120707, - "grad_norm": 0.860778272151947, - "learning_rate": 7.500079552181782e-05, - "loss": 0.056, - "step": 66000 - }, - { - "epoch": 4.318613019299967, - "grad_norm": 0.6561046838760376, - "learning_rate": 7.499283992395483e-05, - "loss": 0.0579, - "step": 66010 - }, - { - "epoch": 4.319267255479228, - "grad_norm": 0.9803190231323242, - "learning_rate": 7.498488348251794e-05, - "loss": 0.0664, - "step": 66020 - }, - { - "epoch": 4.319921491658489, - "grad_norm": 0.782440185546875, - "learning_rate": 7.497692619777568e-05, - "loss": 0.067, - "step": 66030 - }, - { - "epoch": 4.320575727837749, - "grad_norm": 0.900099515914917, - "learning_rate": 7.496896806999667e-05, - "loss": 0.0708, - "step": 66040 - }, - { - "epoch": 4.32122996401701, - "grad_norm": 0.7509415745735168, - "learning_rate": 7.496100909944952e-05, - "loss": 0.0639, - "step": 66050 - }, - { - "epoch": 4.321884200196271, - "grad_norm": 0.8425498008728027, - "learning_rate": 7.495304928640284e-05, - "loss": 0.0697, - "step": 66060 - }, - { - "epoch": 4.322538436375532, - "grad_norm": 0.8149582147598267, - "learning_rate": 7.494508863112529e-05, - "loss": 0.0615, - "step": 66070 - }, - { - "epoch": 4.323192672554792, - "grad_norm": 0.8617916703224182, - "learning_rate": 7.49371271338856e-05, - "loss": 0.0663, - "step": 66080 - }, - { - "epoch": 4.323846908734053, - "grad_norm": 0.7951449155807495, - "learning_rate": 7.492916479495246e-05, - "loss": 0.061, - "step": 66090 - }, - { - "epoch": 4.324501144913314, - "grad_norm": 0.9237793684005737, - "learning_rate": 7.492120161459463e-05, - "loss": 0.0655, - "step": 66100 - }, - { - "epoch": 4.325155381092574, - "grad_norm": 0.8202549815177917, - "learning_rate": 7.491323759308089e-05, - "loss": 0.0588, - "step": 66110 - }, - { - "epoch": 4.325809617271835, - "grad_norm": 0.8115940690040588, - "learning_rate": 7.490527273068003e-05, - "loss": 0.0609, - "step": 66120 - }, - { - "epoch": 4.326463853451096, - "grad_norm": 0.8623285889625549, - "learning_rate": 7.489730702766092e-05, - "loss": 0.0694, - "step": 66130 - }, - { - "epoch": 4.327118089630357, - "grad_norm": 0.9182894825935364, - "learning_rate": 7.488934048429239e-05, - "loss": 0.0687, - "step": 66140 - }, - { - "epoch": 4.327772325809617, - "grad_norm": 0.8562094569206238, - "learning_rate": 7.488137310084334e-05, - "loss": 0.0567, - "step": 66150 - }, - { - "epoch": 4.328426561988878, - "grad_norm": 0.6775592565536499, - "learning_rate": 7.487340487758271e-05, - "loss": 0.0616, - "step": 66160 - }, - { - "epoch": 4.329080798168139, - "grad_norm": 0.9210445880889893, - "learning_rate": 7.486543581477942e-05, - "loss": 0.066, - "step": 66170 - }, - { - "epoch": 4.329735034347399, - "grad_norm": 0.8640944957733154, - "learning_rate": 7.485746591270247e-05, - "loss": 0.0679, - "step": 66180 - }, - { - "epoch": 4.33038927052666, - "grad_norm": 0.9306260943412781, - "learning_rate": 7.484949517162083e-05, - "loss": 0.0677, - "step": 66190 - }, - { - "epoch": 4.331043506705921, - "grad_norm": 0.9359939098358154, - "learning_rate": 7.484152359180358e-05, - "loss": 0.063, - "step": 66200 - }, - { - "epoch": 4.331697742885182, - "grad_norm": 0.6757792830467224, - "learning_rate": 7.483355117351975e-05, - "loss": 0.0645, - "step": 66210 - }, - { - "epoch": 4.332351979064442, - "grad_norm": 1.1225383281707764, - "learning_rate": 7.482557791703843e-05, - "loss": 0.0551, - "step": 66220 - }, - { - "epoch": 4.333006215243703, - "grad_norm": 0.8238813877105713, - "learning_rate": 7.481760382262876e-05, - "loss": 0.0648, - "step": 66230 - }, - { - "epoch": 4.333660451422964, - "grad_norm": 0.8821508288383484, - "learning_rate": 7.480962889055989e-05, - "loss": 0.064, - "step": 66240 - }, - { - "epoch": 4.334314687602224, - "grad_norm": 0.8771295547485352, - "learning_rate": 7.480165312110096e-05, - "loss": 0.0546, - "step": 66250 - }, - { - "epoch": 4.334968923781485, - "grad_norm": 0.9171880483627319, - "learning_rate": 7.479367651452119e-05, - "loss": 0.0613, - "step": 66260 - }, - { - "epoch": 4.335623159960746, - "grad_norm": 0.8392135500907898, - "learning_rate": 7.478569907108983e-05, - "loss": 0.0664, - "step": 66270 - }, - { - "epoch": 4.336277396140007, - "grad_norm": 0.8099778890609741, - "learning_rate": 7.477772079107612e-05, - "loss": 0.0602, - "step": 66280 - }, - { - "epoch": 4.336931632319267, - "grad_norm": 0.8127948641777039, - "learning_rate": 7.476974167474934e-05, - "loss": 0.0604, - "step": 66290 - }, - { - "epoch": 4.337585868498528, - "grad_norm": 0.9666029214859009, - "learning_rate": 7.476176172237883e-05, - "loss": 0.061, - "step": 66300 - }, - { - "epoch": 4.338240104677789, - "grad_norm": 0.9391130208969116, - "learning_rate": 7.475378093423391e-05, - "loss": 0.0647, - "step": 66310 - }, - { - "epoch": 4.338894340857049, - "grad_norm": 0.7683467864990234, - "learning_rate": 7.474579931058397e-05, - "loss": 0.0622, - "step": 66320 - }, - { - "epoch": 4.33954857703631, - "grad_norm": 0.8079342246055603, - "learning_rate": 7.47378168516984e-05, - "loss": 0.0572, - "step": 66330 - }, - { - "epoch": 4.340202813215571, - "grad_norm": 1.2922800779342651, - "learning_rate": 7.472983355784664e-05, - "loss": 0.0781, - "step": 66340 - }, - { - "epoch": 4.340857049394831, - "grad_norm": 0.7486710548400879, - "learning_rate": 7.472184942929815e-05, - "loss": 0.0631, - "step": 66350 - }, - { - "epoch": 4.341511285574092, - "grad_norm": 0.7846554517745972, - "learning_rate": 7.471386446632238e-05, - "loss": 0.063, - "step": 66360 - }, - { - "epoch": 4.342165521753353, - "grad_norm": 0.6966447234153748, - "learning_rate": 7.470587866918889e-05, - "loss": 0.0614, - "step": 66370 - }, - { - "epoch": 4.342819757932614, - "grad_norm": 0.8768433928489685, - "learning_rate": 7.469789203816719e-05, - "loss": 0.0598, - "step": 66380 - }, - { - "epoch": 4.343473994111874, - "grad_norm": 1.1041473150253296, - "learning_rate": 7.468990457352687e-05, - "loss": 0.0684, - "step": 66390 - }, - { - "epoch": 4.344128230291135, - "grad_norm": 0.9726035594940186, - "learning_rate": 7.468191627553753e-05, - "loss": 0.0635, - "step": 66400 - }, - { - "epoch": 4.344782466470396, - "grad_norm": 0.9272595643997192, - "learning_rate": 7.467392714446876e-05, - "loss": 0.0695, - "step": 66410 - }, - { - "epoch": 4.345436702649657, - "grad_norm": 0.8459830284118652, - "learning_rate": 7.466593718059026e-05, - "loss": 0.0527, - "step": 66420 - }, - { - "epoch": 4.346090938828917, - "grad_norm": 0.9491352438926697, - "learning_rate": 7.465794638417167e-05, - "loss": 0.0611, - "step": 66430 - }, - { - "epoch": 4.346745175008178, - "grad_norm": 0.9533461928367615, - "learning_rate": 7.464995475548275e-05, - "loss": 0.0687, - "step": 66440 - }, - { - "epoch": 4.347399411187439, - "grad_norm": 0.8215408325195312, - "learning_rate": 7.464196229479317e-05, - "loss": 0.0617, - "step": 66450 - }, - { - "epoch": 4.348053647366699, - "grad_norm": 0.7235555052757263, - "learning_rate": 7.463396900237277e-05, - "loss": 0.058, - "step": 66460 - }, - { - "epoch": 4.34870788354596, - "grad_norm": 0.8705189824104309, - "learning_rate": 7.462597487849131e-05, - "loss": 0.0595, - "step": 66470 - }, - { - "epoch": 4.349362119725221, - "grad_norm": 0.906433641910553, - "learning_rate": 7.461797992341861e-05, - "loss": 0.0626, - "step": 66480 - }, - { - "epoch": 4.350016355904481, - "grad_norm": 0.7737306356430054, - "learning_rate": 7.460998413742451e-05, - "loss": 0.0704, - "step": 66490 - }, - { - "epoch": 4.350670592083742, - "grad_norm": 0.782434344291687, - "learning_rate": 7.460198752077892e-05, - "loss": 0.0603, - "step": 66500 - }, - { - "epoch": 4.351324828263003, - "grad_norm": 0.7962082028388977, - "learning_rate": 7.459399007375172e-05, - "loss": 0.0656, - "step": 66510 - }, - { - "epoch": 4.351979064442264, - "grad_norm": 0.8921648263931274, - "learning_rate": 7.458599179661286e-05, - "loss": 0.0634, - "step": 66520 - }, - { - "epoch": 4.352633300621524, - "grad_norm": 0.868022084236145, - "learning_rate": 7.45779926896323e-05, - "loss": 0.0594, - "step": 66530 - }, - { - "epoch": 4.353287536800785, - "grad_norm": 1.0638127326965332, - "learning_rate": 7.456999275308002e-05, - "loss": 0.0655, - "step": 66540 - }, - { - "epoch": 4.353941772980046, - "grad_norm": 0.8913347721099854, - "learning_rate": 7.456199198722604e-05, - "loss": 0.056, - "step": 66550 - }, - { - "epoch": 4.354596009159306, - "grad_norm": 0.9430878162384033, - "learning_rate": 7.455399039234043e-05, - "loss": 0.0648, - "step": 66560 - }, - { - "epoch": 4.355250245338567, - "grad_norm": 1.181059718132019, - "learning_rate": 7.454598796869325e-05, - "loss": 0.0701, - "step": 66570 - }, - { - "epoch": 4.355904481517828, - "grad_norm": 0.9202277064323425, - "learning_rate": 7.45379847165546e-05, - "loss": 0.0633, - "step": 66580 - }, - { - "epoch": 4.356558717697089, - "grad_norm": 0.9049434661865234, - "learning_rate": 7.45299806361946e-05, - "loss": 0.0625, - "step": 66590 - }, - { - "epoch": 4.357212953876349, - "grad_norm": 1.0947158336639404, - "learning_rate": 7.452197572788345e-05, - "loss": 0.0639, - "step": 66600 - }, - { - "epoch": 4.35786719005561, - "grad_norm": 0.9060896635055542, - "learning_rate": 7.451396999189129e-05, - "loss": 0.0599, - "step": 66610 - }, - { - "epoch": 4.358521426234871, - "grad_norm": 0.9136892557144165, - "learning_rate": 7.450596342848835e-05, - "loss": 0.0671, - "step": 66620 - }, - { - "epoch": 4.359175662414131, - "grad_norm": 0.7498853802680969, - "learning_rate": 7.449795603794487e-05, - "loss": 0.0585, - "step": 66630 - }, - { - "epoch": 4.359829898593392, - "grad_norm": 0.9020270705223083, - "learning_rate": 7.448994782053114e-05, - "loss": 0.0678, - "step": 66640 - }, - { - "epoch": 4.360484134772653, - "grad_norm": 0.9565746188163757, - "learning_rate": 7.448193877651743e-05, - "loss": 0.0662, - "step": 66650 - }, - { - "epoch": 4.361138370951914, - "grad_norm": 0.9026892185211182, - "learning_rate": 7.447392890617408e-05, - "loss": 0.0647, - "step": 66660 - }, - { - "epoch": 4.361792607131174, - "grad_norm": 1.156490683555603, - "learning_rate": 7.446591820977144e-05, - "loss": 0.0648, - "step": 66670 - }, - { - "epoch": 4.362446843310435, - "grad_norm": 0.8447253704071045, - "learning_rate": 7.445790668757992e-05, - "loss": 0.0662, - "step": 66680 - }, - { - "epoch": 4.363101079489696, - "grad_norm": 0.8401336073875427, - "learning_rate": 7.44498943398699e-05, - "loss": 0.0621, - "step": 66690 - }, - { - "epoch": 4.363755315668956, - "grad_norm": 0.88148033618927, - "learning_rate": 7.44418811669118e-05, - "loss": 0.0688, - "step": 66700 - }, - { - "epoch": 4.364409551848217, - "grad_norm": 1.0117405652999878, - "learning_rate": 7.443386716897614e-05, - "loss": 0.0547, - "step": 66710 - }, - { - "epoch": 4.365063788027478, - "grad_norm": 0.9008330702781677, - "learning_rate": 7.442585234633337e-05, - "loss": 0.0676, - "step": 66720 - }, - { - "epoch": 4.365718024206739, - "grad_norm": 0.8661233186721802, - "learning_rate": 7.441783669925402e-05, - "loss": 0.0613, - "step": 66730 - }, - { - "epoch": 4.366372260385999, - "grad_norm": 0.849505603313446, - "learning_rate": 7.440982022800864e-05, - "loss": 0.0652, - "step": 66740 - }, - { - "epoch": 4.36702649656526, - "grad_norm": 0.7633360028266907, - "learning_rate": 7.440180293286783e-05, - "loss": 0.0601, - "step": 66750 - }, - { - "epoch": 4.367680732744521, - "grad_norm": 0.9463858008384705, - "learning_rate": 7.439378481410215e-05, - "loss": 0.0651, - "step": 66760 - }, - { - "epoch": 4.368334968923781, - "grad_norm": 0.9335405826568604, - "learning_rate": 7.438576587198228e-05, - "loss": 0.0639, - "step": 66770 - }, - { - "epoch": 4.368989205103042, - "grad_norm": 0.8489439487457275, - "learning_rate": 7.437774610677884e-05, - "loss": 0.057, - "step": 66780 - }, - { - "epoch": 4.369643441282303, - "grad_norm": 0.9657695889472961, - "learning_rate": 7.436972551876255e-05, - "loss": 0.0632, - "step": 66790 - }, - { - "epoch": 4.370297677461563, - "grad_norm": 0.8305515050888062, - "learning_rate": 7.43617041082041e-05, - "loss": 0.0664, - "step": 66800 - }, - { - "epoch": 4.370951913640824, - "grad_norm": 1.0466861724853516, - "learning_rate": 7.435368187537424e-05, - "loss": 0.057, - "step": 66810 - }, - { - "epoch": 4.371606149820085, - "grad_norm": 1.0870552062988281, - "learning_rate": 7.434565882054377e-05, - "loss": 0.0614, - "step": 66820 - }, - { - "epoch": 4.372260385999346, - "grad_norm": 0.9726265072822571, - "learning_rate": 7.433763494398345e-05, - "loss": 0.0579, - "step": 66830 - }, - { - "epoch": 4.372914622178606, - "grad_norm": 0.8406111001968384, - "learning_rate": 7.432961024596413e-05, - "loss": 0.0625, - "step": 66840 - }, - { - "epoch": 4.373568858357867, - "grad_norm": 0.9154638648033142, - "learning_rate": 7.432158472675665e-05, - "loss": 0.0695, - "step": 66850 - }, - { - "epoch": 4.374223094537128, - "grad_norm": 0.8148864507675171, - "learning_rate": 7.43135583866319e-05, - "loss": 0.0657, - "step": 66860 - }, - { - "epoch": 4.374877330716389, - "grad_norm": 1.0831327438354492, - "learning_rate": 7.430553122586079e-05, - "loss": 0.0759, - "step": 66870 - }, - { - "epoch": 4.375531566895649, - "grad_norm": 0.7323132157325745, - "learning_rate": 7.429750324471425e-05, - "loss": 0.0575, - "step": 66880 - }, - { - "epoch": 4.37618580307491, - "grad_norm": 0.7715146541595459, - "learning_rate": 7.428947444346327e-05, - "loss": 0.0591, - "step": 66890 - }, - { - "epoch": 4.376840039254171, - "grad_norm": 0.7162055969238281, - "learning_rate": 7.428144482237882e-05, - "loss": 0.0633, - "step": 66900 - }, - { - "epoch": 4.377494275433431, - "grad_norm": 0.9126048684120178, - "learning_rate": 7.427341438173192e-05, - "loss": 0.0738, - "step": 66910 - }, - { - "epoch": 4.378148511612692, - "grad_norm": 0.8502604365348816, - "learning_rate": 7.426538312179364e-05, - "loss": 0.0591, - "step": 66920 - }, - { - "epoch": 4.378802747791953, - "grad_norm": 1.0491002798080444, - "learning_rate": 7.425735104283502e-05, - "loss": 0.0642, - "step": 66930 - }, - { - "epoch": 4.379456983971213, - "grad_norm": 1.0379705429077148, - "learning_rate": 7.424931814512721e-05, - "loss": 0.0692, - "step": 66940 - }, - { - "epoch": 4.380111220150474, - "grad_norm": 0.9952113032341003, - "learning_rate": 7.42412844289413e-05, - "loss": 0.075, - "step": 66950 - }, - { - "epoch": 4.380765456329735, - "grad_norm": 0.9746718406677246, - "learning_rate": 7.423324989454847e-05, - "loss": 0.0598, - "step": 66960 - }, - { - "epoch": 4.381419692508996, - "grad_norm": 0.8205549716949463, - "learning_rate": 7.42252145422199e-05, - "loss": 0.0565, - "step": 66970 - }, - { - "epoch": 4.382073928688256, - "grad_norm": 0.9923220276832581, - "learning_rate": 7.421717837222682e-05, - "loss": 0.0717, - "step": 66980 - }, - { - "epoch": 4.382728164867517, - "grad_norm": 0.804681658744812, - "learning_rate": 7.420914138484045e-05, - "loss": 0.0645, - "step": 66990 - }, - { - "epoch": 4.383382401046778, - "grad_norm": 0.7509680390357971, - "learning_rate": 7.420110358033205e-05, - "loss": 0.0627, - "step": 67000 - }, - { - "epoch": 4.384036637226038, - "grad_norm": 0.7687981128692627, - "learning_rate": 7.419306495897295e-05, - "loss": 0.069, - "step": 67010 - }, - { - "epoch": 4.384690873405299, - "grad_norm": 0.8498753309249878, - "learning_rate": 7.418502552103446e-05, - "loss": 0.0653, - "step": 67020 - }, - { - "epoch": 4.38534510958456, - "grad_norm": 0.9675426483154297, - "learning_rate": 7.417698526678792e-05, - "loss": 0.0728, - "step": 67030 - }, - { - "epoch": 4.385999345763821, - "grad_norm": 0.9091965556144714, - "learning_rate": 7.416894419650473e-05, - "loss": 0.0641, - "step": 67040 - }, - { - "epoch": 4.386653581943081, - "grad_norm": 0.8700924515724182, - "learning_rate": 7.416090231045629e-05, - "loss": 0.057, - "step": 67050 - }, - { - "epoch": 4.387307818122342, - "grad_norm": 0.7336810231208801, - "learning_rate": 7.415285960891403e-05, - "loss": 0.0683, - "step": 67060 - }, - { - "epoch": 4.387962054301603, - "grad_norm": 0.9728383421897888, - "learning_rate": 7.414481609214941e-05, - "loss": 0.0596, - "step": 67070 - }, - { - "epoch": 4.388616290480863, - "grad_norm": 0.6819276809692383, - "learning_rate": 7.413677176043393e-05, - "loss": 0.0561, - "step": 67080 - }, - { - "epoch": 4.389270526660124, - "grad_norm": 0.9143689870834351, - "learning_rate": 7.41287266140391e-05, - "loss": 0.0592, - "step": 67090 - }, - { - "epoch": 4.389924762839385, - "grad_norm": 0.8440172672271729, - "learning_rate": 7.412068065323648e-05, - "loss": 0.0643, - "step": 67100 - }, - { - "epoch": 4.390578999018646, - "grad_norm": 0.8751739859580994, - "learning_rate": 7.411263387829761e-05, - "loss": 0.0652, - "step": 67110 - }, - { - "epoch": 4.391233235197906, - "grad_norm": 1.0761669874191284, - "learning_rate": 7.41045862894941e-05, - "loss": 0.0728, - "step": 67120 - }, - { - "epoch": 4.391887471377167, - "grad_norm": 0.7881830334663391, - "learning_rate": 7.409653788709762e-05, - "loss": 0.0655, - "step": 67130 - }, - { - "epoch": 4.392541707556428, - "grad_norm": 0.9639447331428528, - "learning_rate": 7.408848867137977e-05, - "loss": 0.0584, - "step": 67140 - }, - { - "epoch": 4.393195943735688, - "grad_norm": 0.8875806331634521, - "learning_rate": 7.408043864261225e-05, - "loss": 0.0745, - "step": 67150 - }, - { - "epoch": 4.393850179914949, - "grad_norm": 0.7383980751037598, - "learning_rate": 7.407238780106679e-05, - "loss": 0.0572, - "step": 67160 - }, - { - "epoch": 4.39450441609421, - "grad_norm": 0.848702609539032, - "learning_rate": 7.406433614701509e-05, - "loss": 0.0632, - "step": 67170 - }, - { - "epoch": 4.395158652273471, - "grad_norm": 0.7714468240737915, - "learning_rate": 7.405628368072894e-05, - "loss": 0.0595, - "step": 67180 - }, - { - "epoch": 4.395812888452731, - "grad_norm": 0.775048017501831, - "learning_rate": 7.404823040248013e-05, - "loss": 0.0612, - "step": 67190 - }, - { - "epoch": 4.396467124631992, - "grad_norm": 0.6783040165901184, - "learning_rate": 7.404017631254047e-05, - "loss": 0.0576, - "step": 67200 - }, - { - "epoch": 4.397121360811253, - "grad_norm": 0.9950321912765503, - "learning_rate": 7.403212141118182e-05, - "loss": 0.0645, - "step": 67210 - }, - { - "epoch": 4.397775596990513, - "grad_norm": 0.9404067993164062, - "learning_rate": 7.402406569867604e-05, - "loss": 0.0632, - "step": 67220 - }, - { - "epoch": 4.398429833169774, - "grad_norm": 0.8246940970420837, - "learning_rate": 7.401600917529504e-05, - "loss": 0.0574, - "step": 67230 - }, - { - "epoch": 4.399084069349035, - "grad_norm": 0.8735194802284241, - "learning_rate": 7.400795184131075e-05, - "loss": 0.0571, - "step": 67240 - }, - { - "epoch": 4.399738305528295, - "grad_norm": 0.8854332566261292, - "learning_rate": 7.399989369699512e-05, - "loss": 0.0599, - "step": 67250 - }, - { - "epoch": 4.400392541707556, - "grad_norm": 1.0150554180145264, - "learning_rate": 7.399183474262011e-05, - "loss": 0.0703, - "step": 67260 - }, - { - "epoch": 4.401046777886817, - "grad_norm": 1.0136165618896484, - "learning_rate": 7.398377497845779e-05, - "loss": 0.0587, - "step": 67270 - }, - { - "epoch": 4.401701014066078, - "grad_norm": 0.7560604214668274, - "learning_rate": 7.397571440478015e-05, - "loss": 0.0743, - "step": 67280 - }, - { - "epoch": 4.402355250245338, - "grad_norm": 0.8288975358009338, - "learning_rate": 7.396765302185928e-05, - "loss": 0.0663, - "step": 67290 - }, - { - "epoch": 4.403009486424599, - "grad_norm": 1.1244462728500366, - "learning_rate": 7.395959082996725e-05, - "loss": 0.0719, - "step": 67300 - }, - { - "epoch": 4.40366372260386, - "grad_norm": 0.8826910257339478, - "learning_rate": 7.39515278293762e-05, - "loss": 0.0693, - "step": 67310 - }, - { - "epoch": 4.404317958783121, - "grad_norm": 0.8879522681236267, - "learning_rate": 7.394346402035828e-05, - "loss": 0.0762, - "step": 67320 - }, - { - "epoch": 4.404972194962381, - "grad_norm": 0.8239995241165161, - "learning_rate": 7.393539940318563e-05, - "loss": 0.0548, - "step": 67330 - }, - { - "epoch": 4.405626431141642, - "grad_norm": 1.0602549314498901, - "learning_rate": 7.39273339781305e-05, - "loss": 0.0598, - "step": 67340 - }, - { - "epoch": 4.406280667320903, - "grad_norm": 1.0400123596191406, - "learning_rate": 7.391926774546509e-05, - "loss": 0.0723, - "step": 67350 - }, - { - "epoch": 4.406934903500163, - "grad_norm": 0.9202961325645447, - "learning_rate": 7.391120070546165e-05, - "loss": 0.0636, - "step": 67360 - }, - { - "epoch": 4.407589139679424, - "grad_norm": 0.8202659487724304, - "learning_rate": 7.39031328583925e-05, - "loss": 0.0651, - "step": 67370 - }, - { - "epoch": 4.408243375858685, - "grad_norm": 0.8003469109535217, - "learning_rate": 7.389506420452991e-05, - "loss": 0.0572, - "step": 67380 - }, - { - "epoch": 4.408897612037945, - "grad_norm": 0.8167177438735962, - "learning_rate": 7.388699474414624e-05, - "loss": 0.0746, - "step": 67390 - }, - { - "epoch": 4.409551848217206, - "grad_norm": 0.949032187461853, - "learning_rate": 7.387892447751387e-05, - "loss": 0.0654, - "step": 67400 - }, - { - "epoch": 4.410206084396467, - "grad_norm": 0.9453916549682617, - "learning_rate": 7.387085340490514e-05, - "loss": 0.0617, - "step": 67410 - }, - { - "epoch": 4.410860320575728, - "grad_norm": 1.0809295177459717, - "learning_rate": 7.386278152659254e-05, - "loss": 0.0699, - "step": 67420 - }, - { - "epoch": 4.411514556754988, - "grad_norm": 1.0627373456954956, - "learning_rate": 7.385470884284845e-05, - "loss": 0.0636, - "step": 67430 - }, - { - "epoch": 4.412168792934249, - "grad_norm": 0.8991785645484924, - "learning_rate": 7.384663535394541e-05, - "loss": 0.064, - "step": 67440 - }, - { - "epoch": 4.41282302911351, - "grad_norm": 0.9950599074363708, - "learning_rate": 7.383856106015585e-05, - "loss": 0.0611, - "step": 67450 - }, - { - "epoch": 4.41347726529277, - "grad_norm": 0.7597075700759888, - "learning_rate": 7.383048596175236e-05, - "loss": 0.0645, - "step": 67460 - }, - { - "epoch": 4.414131501472031, - "grad_norm": 0.9785023927688599, - "learning_rate": 7.382241005900745e-05, - "loss": 0.062, - "step": 67470 - }, - { - "epoch": 4.414785737651292, - "grad_norm": 0.8805351257324219, - "learning_rate": 7.381433335219374e-05, - "loss": 0.0663, - "step": 67480 - }, - { - "epoch": 4.415439973830553, - "grad_norm": 0.7586386203765869, - "learning_rate": 7.38062558415838e-05, - "loss": 0.0644, - "step": 67490 - }, - { - "epoch": 4.416094210009813, - "grad_norm": 0.8569243550300598, - "learning_rate": 7.379817752745033e-05, - "loss": 0.0602, - "step": 67500 - }, - { - "epoch": 4.416748446189074, - "grad_norm": 0.8760682344436646, - "learning_rate": 7.379009841006593e-05, - "loss": 0.0596, - "step": 67510 - }, - { - "epoch": 4.417402682368335, - "grad_norm": 0.9443554878234863, - "learning_rate": 7.378201848970332e-05, - "loss": 0.0648, - "step": 67520 - }, - { - "epoch": 4.418056918547595, - "grad_norm": 0.9215699434280396, - "learning_rate": 7.377393776663523e-05, - "loss": 0.0693, - "step": 67530 - }, - { - "epoch": 4.418711154726856, - "grad_norm": 1.1914721727371216, - "learning_rate": 7.376585624113437e-05, - "loss": 0.0702, - "step": 67540 - }, - { - "epoch": 4.419365390906117, - "grad_norm": 0.8695497512817383, - "learning_rate": 7.375777391347355e-05, - "loss": 0.0529, - "step": 67550 - }, - { - "epoch": 4.420019627085378, - "grad_norm": 0.8049673438072205, - "learning_rate": 7.374969078392555e-05, - "loss": 0.0646, - "step": 67560 - }, - { - "epoch": 4.420673863264638, - "grad_norm": 1.0911378860473633, - "learning_rate": 7.37416068527632e-05, - "loss": 0.0677, - "step": 67570 - }, - { - "epoch": 4.421328099443899, - "grad_norm": 0.8425750136375427, - "learning_rate": 7.373352212025935e-05, - "loss": 0.0604, - "step": 67580 - }, - { - "epoch": 4.42198233562316, - "grad_norm": 1.0606712102890015, - "learning_rate": 7.372543658668688e-05, - "loss": 0.0569, - "step": 67590 - }, - { - "epoch": 4.42263657180242, - "grad_norm": 0.8173533082008362, - "learning_rate": 7.371735025231871e-05, - "loss": 0.0595, - "step": 67600 - }, - { - "epoch": 4.423290807981681, - "grad_norm": 1.046075701713562, - "learning_rate": 7.370926311742776e-05, - "loss": 0.0632, - "step": 67610 - }, - { - "epoch": 4.423945044160942, - "grad_norm": 0.6317776441574097, - "learning_rate": 7.3701175182287e-05, - "loss": 0.0688, - "step": 67620 - }, - { - "epoch": 4.424599280340203, - "grad_norm": 0.8393598198890686, - "learning_rate": 7.369308644716944e-05, - "loss": 0.0633, - "step": 67630 - }, - { - "epoch": 4.425253516519463, - "grad_norm": 0.9631373286247253, - "learning_rate": 7.368499691234806e-05, - "loss": 0.0578, - "step": 67640 - }, - { - "epoch": 4.425907752698724, - "grad_norm": 0.829046905040741, - "learning_rate": 7.367690657809592e-05, - "loss": 0.0647, - "step": 67650 - }, - { - "epoch": 4.426561988877985, - "grad_norm": 0.9944875836372375, - "learning_rate": 7.366881544468609e-05, - "loss": 0.0598, - "step": 67660 - }, - { - "epoch": 4.427216225057245, - "grad_norm": 0.9320645928382874, - "learning_rate": 7.366072351239165e-05, - "loss": 0.0583, - "step": 67670 - }, - { - "epoch": 4.427870461236506, - "grad_norm": 1.0392950773239136, - "learning_rate": 7.365263078148575e-05, - "loss": 0.0623, - "step": 67680 - }, - { - "epoch": 4.428524697415767, - "grad_norm": 0.7760281562805176, - "learning_rate": 7.364453725224154e-05, - "loss": 0.0646, - "step": 67690 - }, - { - "epoch": 4.429178933595027, - "grad_norm": 0.8868238925933838, - "learning_rate": 7.363644292493218e-05, - "loss": 0.0664, - "step": 67700 - }, - { - "epoch": 4.429833169774288, - "grad_norm": 0.9134417772293091, - "learning_rate": 7.362834779983087e-05, - "loss": 0.0597, - "step": 67710 - }, - { - "epoch": 4.430487405953549, - "grad_norm": 0.7500336170196533, - "learning_rate": 7.362025187721086e-05, - "loss": 0.0573, - "step": 67720 - }, - { - "epoch": 4.43114164213281, - "grad_norm": 0.777053713798523, - "learning_rate": 7.361215515734541e-05, - "loss": 0.0696, - "step": 67730 - }, - { - "epoch": 4.43179587831207, - "grad_norm": 0.8513823747634888, - "learning_rate": 7.36040576405078e-05, - "loss": 0.0611, - "step": 67740 - }, - { - "epoch": 4.432450114491331, - "grad_norm": 0.7951275706291199, - "learning_rate": 7.359595932697134e-05, - "loss": 0.06, - "step": 67750 - }, - { - "epoch": 4.433104350670592, - "grad_norm": 0.9466954469680786, - "learning_rate": 7.358786021700936e-05, - "loss": 0.0645, - "step": 67760 - }, - { - "epoch": 4.433758586849853, - "grad_norm": 0.8108925819396973, - "learning_rate": 7.357976031089524e-05, - "loss": 0.0666, - "step": 67770 - }, - { - "epoch": 4.434412823029113, - "grad_norm": 0.8179001212120056, - "learning_rate": 7.357165960890237e-05, - "loss": 0.069, - "step": 67780 - }, - { - "epoch": 4.435067059208374, - "grad_norm": 0.8625308275222778, - "learning_rate": 7.356355811130419e-05, - "loss": 0.0601, - "step": 67790 - }, - { - "epoch": 4.435721295387635, - "grad_norm": 1.066897988319397, - "learning_rate": 7.35554558183741e-05, - "loss": 0.069, - "step": 67800 - }, - { - "epoch": 4.436375531566895, - "grad_norm": 0.8988897800445557, - "learning_rate": 7.35473527303856e-05, - "loss": 0.0809, - "step": 67810 - }, - { - "epoch": 4.437029767746156, - "grad_norm": 0.8011250495910645, - "learning_rate": 7.35392488476122e-05, - "loss": 0.0633, - "step": 67820 - }, - { - "epoch": 4.437684003925417, - "grad_norm": 0.9169591665267944, - "learning_rate": 7.353114417032742e-05, - "loss": 0.057, - "step": 67830 - }, - { - "epoch": 4.438338240104677, - "grad_norm": 0.9640666246414185, - "learning_rate": 7.35230386988048e-05, - "loss": 0.0624, - "step": 67840 - }, - { - "epoch": 4.438992476283938, - "grad_norm": 0.7338112592697144, - "learning_rate": 7.351493243331794e-05, - "loss": 0.0615, - "step": 67850 - }, - { - "epoch": 4.439646712463199, - "grad_norm": 0.8081340789794922, - "learning_rate": 7.350682537414044e-05, - "loss": 0.0662, - "step": 67860 - }, - { - "epoch": 4.44030094864246, - "grad_norm": 1.210657000541687, - "learning_rate": 7.349871752154593e-05, - "loss": 0.0784, - "step": 67870 - }, - { - "epoch": 4.44095518482172, - "grad_norm": 0.9144163727760315, - "learning_rate": 7.349060887580808e-05, - "loss": 0.0635, - "step": 67880 - }, - { - "epoch": 4.441609421000981, - "grad_norm": 0.9125617742538452, - "learning_rate": 7.348249943720058e-05, - "loss": 0.0627, - "step": 67890 - }, - { - "epoch": 4.442263657180242, - "grad_norm": 0.8639421463012695, - "learning_rate": 7.347438920599712e-05, - "loss": 0.0649, - "step": 67900 - }, - { - "epoch": 4.442917893359502, - "grad_norm": 0.795691728591919, - "learning_rate": 7.346627818247149e-05, - "loss": 0.0674, - "step": 67910 - }, - { - "epoch": 4.443572129538763, - "grad_norm": 0.9141677618026733, - "learning_rate": 7.34581663668974e-05, - "loss": 0.0657, - "step": 67920 - }, - { - "epoch": 4.444226365718024, - "grad_norm": 0.9653276205062866, - "learning_rate": 7.345005375954869e-05, - "loss": 0.0506, - "step": 67930 - }, - { - "epoch": 4.444880601897285, - "grad_norm": 1.0340138673782349, - "learning_rate": 7.344194036069916e-05, - "loss": 0.0768, - "step": 67940 - }, - { - "epoch": 4.445534838076545, - "grad_norm": 0.7893163561820984, - "learning_rate": 7.343382617062266e-05, - "loss": 0.0591, - "step": 67950 - }, - { - "epoch": 4.446189074255806, - "grad_norm": 0.840461015701294, - "learning_rate": 7.342571118959307e-05, - "loss": 0.059, - "step": 67960 - }, - { - "epoch": 4.446843310435067, - "grad_norm": 1.0385587215423584, - "learning_rate": 7.34175954178843e-05, - "loss": 0.0784, - "step": 67970 - }, - { - "epoch": 4.447497546614327, - "grad_norm": 0.9671325087547302, - "learning_rate": 7.340947885577028e-05, - "loss": 0.0583, - "step": 67980 - }, - { - "epoch": 4.448151782793588, - "grad_norm": 0.8433911800384521, - "learning_rate": 7.340136150352492e-05, - "loss": 0.0611, - "step": 67990 - }, - { - "epoch": 4.448806018972849, - "grad_norm": 0.8698466420173645, - "learning_rate": 7.339324336142226e-05, - "loss": 0.0668, - "step": 68000 - }, - { - "epoch": 4.44946025515211, - "grad_norm": 0.8934153318405151, - "learning_rate": 7.338512442973628e-05, - "loss": 0.0628, - "step": 68010 - }, - { - "epoch": 4.45011449133137, - "grad_norm": 0.9508333206176758, - "learning_rate": 7.337700470874103e-05, - "loss": 0.0582, - "step": 68020 - }, - { - "epoch": 4.450768727510631, - "grad_norm": 0.8118358254432678, - "learning_rate": 7.336888419871055e-05, - "loss": 0.0571, - "step": 68030 - }, - { - "epoch": 4.451422963689892, - "grad_norm": 0.7559012174606323, - "learning_rate": 7.336076289991895e-05, - "loss": 0.0645, - "step": 68040 - }, - { - "epoch": 4.452077199869152, - "grad_norm": 1.2024433612823486, - "learning_rate": 7.335264081264035e-05, - "loss": 0.0652, - "step": 68050 - }, - { - "epoch": 4.452731436048413, - "grad_norm": 0.8405249714851379, - "learning_rate": 7.334451793714885e-05, - "loss": 0.0679, - "step": 68060 - }, - { - "epoch": 4.453385672227674, - "grad_norm": 0.9205291271209717, - "learning_rate": 7.333639427371866e-05, - "loss": 0.0597, - "step": 68070 - }, - { - "epoch": 4.454039908406935, - "grad_norm": 1.0684325695037842, - "learning_rate": 7.332826982262395e-05, - "loss": 0.0664, - "step": 68080 - }, - { - "epoch": 4.454694144586195, - "grad_norm": 0.9805346727371216, - "learning_rate": 7.332014458413897e-05, - "loss": 0.0621, - "step": 68090 - }, - { - "epoch": 4.455348380765456, - "grad_norm": 0.7872211337089539, - "learning_rate": 7.331201855853794e-05, - "loss": 0.073, - "step": 68100 - }, - { - "epoch": 4.456002616944717, - "grad_norm": 0.7325934767723083, - "learning_rate": 7.330389174609515e-05, - "loss": 0.0638, - "step": 68110 - }, - { - "epoch": 4.456656853123977, - "grad_norm": 0.7671561241149902, - "learning_rate": 7.32957641470849e-05, - "loss": 0.0569, - "step": 68120 - }, - { - "epoch": 4.457311089303238, - "grad_norm": 0.9188522100448608, - "learning_rate": 7.328763576178151e-05, - "loss": 0.0645, - "step": 68130 - }, - { - "epoch": 4.457965325482499, - "grad_norm": 1.0198373794555664, - "learning_rate": 7.327950659045935e-05, - "loss": 0.0603, - "step": 68140 - }, - { - "epoch": 4.458619561661759, - "grad_norm": 0.9594274759292603, - "learning_rate": 7.327137663339276e-05, - "loss": 0.0574, - "step": 68150 - }, - { - "epoch": 4.45927379784102, - "grad_norm": 1.0565496683120728, - "learning_rate": 7.32632458908562e-05, - "loss": 0.0609, - "step": 68160 - }, - { - "epoch": 4.459928034020281, - "grad_norm": 0.9333295226097107, - "learning_rate": 7.325511436312408e-05, - "loss": 0.0769, - "step": 68170 - }, - { - "epoch": 4.460582270199542, - "grad_norm": 0.9997411966323853, - "learning_rate": 7.324698205047087e-05, - "loss": 0.0607, - "step": 68180 - }, - { - "epoch": 4.461236506378802, - "grad_norm": 1.1432136297225952, - "learning_rate": 7.323884895317102e-05, - "loss": 0.0573, - "step": 68190 - }, - { - "epoch": 4.461890742558063, - "grad_norm": 0.8411798477172852, - "learning_rate": 7.32307150714991e-05, - "loss": 0.0577, - "step": 68200 - }, - { - "epoch": 4.462544978737324, - "grad_norm": 0.8566927313804626, - "learning_rate": 7.32225804057296e-05, - "loss": 0.0617, - "step": 68210 - }, - { - "epoch": 4.463199214916585, - "grad_norm": 0.9064366221427917, - "learning_rate": 7.321444495613712e-05, - "loss": 0.0702, - "step": 68220 - }, - { - "epoch": 4.463853451095845, - "grad_norm": 0.6811468005180359, - "learning_rate": 7.320630872299624e-05, - "loss": 0.0588, - "step": 68230 - }, - { - "epoch": 4.464507687275106, - "grad_norm": 0.8559892177581787, - "learning_rate": 7.319817170658158e-05, - "loss": 0.0576, - "step": 68240 - }, - { - "epoch": 4.465161923454367, - "grad_norm": 0.7884718775749207, - "learning_rate": 7.319003390716779e-05, - "loss": 0.0656, - "step": 68250 - }, - { - "epoch": 4.465816159633627, - "grad_norm": 0.9382063150405884, - "learning_rate": 7.318189532502953e-05, - "loss": 0.0734, - "step": 68260 - }, - { - "epoch": 4.466470395812888, - "grad_norm": 0.9593857526779175, - "learning_rate": 7.317375596044152e-05, - "loss": 0.0686, - "step": 68270 - }, - { - "epoch": 4.467124631992149, - "grad_norm": 0.887768030166626, - "learning_rate": 7.316561581367845e-05, - "loss": 0.0603, - "step": 68280 - }, - { - "epoch": 4.4677788681714095, - "grad_norm": 0.8870118260383606, - "learning_rate": 7.315747488501509e-05, - "loss": 0.0611, - "step": 68290 - }, - { - "epoch": 4.46843310435067, - "grad_norm": 0.7766203284263611, - "learning_rate": 7.314933317472624e-05, - "loss": 0.0577, - "step": 68300 - }, - { - "epoch": 4.469087340529931, - "grad_norm": 0.6949660181999207, - "learning_rate": 7.314119068308668e-05, - "loss": 0.0602, - "step": 68310 - }, - { - "epoch": 4.469741576709192, - "grad_norm": 0.8689270615577698, - "learning_rate": 7.313304741037124e-05, - "loss": 0.0587, - "step": 68320 - }, - { - "epoch": 4.4703958128884524, - "grad_norm": 0.9906384348869324, - "learning_rate": 7.312490335685477e-05, - "loss": 0.0614, - "step": 68330 - }, - { - "epoch": 4.471050049067713, - "grad_norm": 0.8280085325241089, - "learning_rate": 7.311675852281218e-05, - "loss": 0.0673, - "step": 68340 - }, - { - "epoch": 4.471704285246974, - "grad_norm": 0.6465350985527039, - "learning_rate": 7.310861290851836e-05, - "loss": 0.0626, - "step": 68350 - }, - { - "epoch": 4.4723585214262345, - "grad_norm": 0.8401268720626831, - "learning_rate": 7.310046651424824e-05, - "loss": 0.0602, - "step": 68360 - }, - { - "epoch": 4.473012757605495, - "grad_norm": 0.9460334777832031, - "learning_rate": 7.309231934027681e-05, - "loss": 0.0631, - "step": 68370 - }, - { - "epoch": 4.473666993784756, - "grad_norm": 0.826092004776001, - "learning_rate": 7.308417138687902e-05, - "loss": 0.0529, - "step": 68380 - }, - { - "epoch": 4.474321229964017, - "grad_norm": 0.9380882978439331, - "learning_rate": 7.307602265432993e-05, - "loss": 0.0649, - "step": 68390 - }, - { - "epoch": 4.4749754661432775, - "grad_norm": 0.720436155796051, - "learning_rate": 7.306787314290455e-05, - "loss": 0.0636, - "step": 68400 - }, - { - "epoch": 4.475629702322538, - "grad_norm": 0.8422783017158508, - "learning_rate": 7.305972285287793e-05, - "loss": 0.0531, - "step": 68410 - }, - { - "epoch": 4.476283938501799, - "grad_norm": 0.8465076684951782, - "learning_rate": 7.30515717845252e-05, - "loss": 0.0689, - "step": 68420 - }, - { - "epoch": 4.4769381746810595, - "grad_norm": 0.7343236804008484, - "learning_rate": 7.304341993812149e-05, - "loss": 0.0649, - "step": 68430 - }, - { - "epoch": 4.4775924108603204, - "grad_norm": 1.0546813011169434, - "learning_rate": 7.30352673139419e-05, - "loss": 0.0678, - "step": 68440 - }, - { - "epoch": 4.478246647039581, - "grad_norm": 0.815166711807251, - "learning_rate": 7.302711391226163e-05, - "loss": 0.0632, - "step": 68450 - }, - { - "epoch": 4.478900883218842, - "grad_norm": 0.7304500341415405, - "learning_rate": 7.301895973335587e-05, - "loss": 0.0643, - "step": 68460 - }, - { - "epoch": 4.4795551193981025, - "grad_norm": 0.8881242871284485, - "learning_rate": 7.301080477749987e-05, - "loss": 0.0757, - "step": 68470 - }, - { - "epoch": 4.480209355577363, - "grad_norm": 0.8304605484008789, - "learning_rate": 7.300264904496883e-05, - "loss": 0.0634, - "step": 68480 - }, - { - "epoch": 4.480863591756624, - "grad_norm": 0.691174328327179, - "learning_rate": 7.299449253603808e-05, - "loss": 0.0649, - "step": 68490 - }, - { - "epoch": 4.4815178279358845, - "grad_norm": 0.9316902756690979, - "learning_rate": 7.29863352509829e-05, - "loss": 0.0646, - "step": 68500 - }, - { - "epoch": 4.4821720641151455, - "grad_norm": 0.9124321937561035, - "learning_rate": 7.297817719007861e-05, - "loss": 0.067, - "step": 68510 - }, - { - "epoch": 4.482826300294406, - "grad_norm": 1.0439180135726929, - "learning_rate": 7.297001835360058e-05, - "loss": 0.0721, - "step": 68520 - }, - { - "epoch": 4.483480536473667, - "grad_norm": 0.8428361415863037, - "learning_rate": 7.296185874182421e-05, - "loss": 0.0645, - "step": 68530 - }, - { - "epoch": 4.4841347726529275, - "grad_norm": 0.8357476592063904, - "learning_rate": 7.295369835502485e-05, - "loss": 0.0679, - "step": 68540 - }, - { - "epoch": 4.4847890088321885, - "grad_norm": 0.942311704158783, - "learning_rate": 7.2945537193478e-05, - "loss": 0.0604, - "step": 68550 - }, - { - "epoch": 4.485443245011449, - "grad_norm": 0.8047347068786621, - "learning_rate": 7.293737525745908e-05, - "loss": 0.0733, - "step": 68560 - }, - { - "epoch": 4.4860974811907095, - "grad_norm": 0.9718091487884521, - "learning_rate": 7.29292125472436e-05, - "loss": 0.0591, - "step": 68570 - }, - { - "epoch": 4.4867517173699705, - "grad_norm": 0.843663215637207, - "learning_rate": 7.292104906310707e-05, - "loss": 0.0692, - "step": 68580 - }, - { - "epoch": 4.487405953549231, - "grad_norm": 0.8390454649925232, - "learning_rate": 7.2912884805325e-05, - "loss": 0.0613, - "step": 68590 - }, - { - "epoch": 4.4880601897284915, - "grad_norm": 0.7371008396148682, - "learning_rate": 7.2904719774173e-05, - "loss": 0.057, - "step": 68600 - }, - { - "epoch": 4.4887144259077525, - "grad_norm": 1.1576838493347168, - "learning_rate": 7.289655396992661e-05, - "loss": 0.0655, - "step": 68610 - }, - { - "epoch": 4.4893686620870135, - "grad_norm": 1.0340911149978638, - "learning_rate": 7.28883873928615e-05, - "loss": 0.0626, - "step": 68620 - }, - { - "epoch": 4.490022898266274, - "grad_norm": 0.9780303835868835, - "learning_rate": 7.288022004325327e-05, - "loss": 0.0646, - "step": 68630 - }, - { - "epoch": 4.4906771344455345, - "grad_norm": 1.1134583950042725, - "learning_rate": 7.287205192137763e-05, - "loss": 0.0665, - "step": 68640 - }, - { - "epoch": 4.4913313706247955, - "grad_norm": 0.9058104157447815, - "learning_rate": 7.286388302751023e-05, - "loss": 0.0627, - "step": 68650 - }, - { - "epoch": 4.4919856068040565, - "grad_norm": 0.9129322171211243, - "learning_rate": 7.285571336192683e-05, - "loss": 0.0526, - "step": 68660 - }, - { - "epoch": 4.492639842983317, - "grad_norm": 0.7618893384933472, - "learning_rate": 7.284754292490314e-05, - "loss": 0.0584, - "step": 68670 - }, - { - "epoch": 4.4932940791625775, - "grad_norm": 0.9606322050094604, - "learning_rate": 7.283937171671498e-05, - "loss": 0.0634, - "step": 68680 - }, - { - "epoch": 4.4939483153418385, - "grad_norm": 0.9516724348068237, - "learning_rate": 7.283119973763813e-05, - "loss": 0.0665, - "step": 68690 - }, - { - "epoch": 4.494602551521099, - "grad_norm": 0.8869428038597107, - "learning_rate": 7.282302698794838e-05, - "loss": 0.0613, - "step": 68700 - }, - { - "epoch": 4.4952567877003595, - "grad_norm": 0.7321935892105103, - "learning_rate": 7.281485346792165e-05, - "loss": 0.0611, - "step": 68710 - }, - { - "epoch": 4.4959110238796205, - "grad_norm": 0.9069811701774597, - "learning_rate": 7.280667917783376e-05, - "loss": 0.0624, - "step": 68720 - }, - { - "epoch": 4.4965652600588815, - "grad_norm": 0.7614008188247681, - "learning_rate": 7.279850411796065e-05, - "loss": 0.0561, - "step": 68730 - }, - { - "epoch": 4.4972194962381415, - "grad_norm": 1.1257866621017456, - "learning_rate": 7.279032828857822e-05, - "loss": 0.0602, - "step": 68740 - }, - { - "epoch": 4.4978737324174025, - "grad_norm": 0.8876655101776123, - "learning_rate": 7.278215168996245e-05, - "loss": 0.0628, - "step": 68750 - }, - { - "epoch": 4.4985279685966635, - "grad_norm": 1.048601746559143, - "learning_rate": 7.27739743223893e-05, - "loss": 0.0645, - "step": 68760 - }, - { - "epoch": 4.4991822047759245, - "grad_norm": 0.7537106871604919, - "learning_rate": 7.27657961861348e-05, - "loss": 0.0627, - "step": 68770 - }, - { - "epoch": 4.4998364409551845, - "grad_norm": 0.9030201435089111, - "learning_rate": 7.275761728147497e-05, - "loss": 0.0707, - "step": 68780 - }, - { - "epoch": 4.5004906771344455, - "grad_norm": 0.799105703830719, - "learning_rate": 7.274943760868589e-05, - "loss": 0.0571, - "step": 68790 - }, - { - "epoch": 4.5011449133137065, - "grad_norm": 0.8551787734031677, - "learning_rate": 7.27412571680436e-05, - "loss": 0.0625, - "step": 68800 - }, - { - "epoch": 4.5017991494929674, - "grad_norm": 0.8719650506973267, - "learning_rate": 7.273307595982424e-05, - "loss": 0.0596, - "step": 68810 - }, - { - "epoch": 4.5024533856722275, - "grad_norm": 0.7927690148353577, - "learning_rate": 7.272489398430397e-05, - "loss": 0.0626, - "step": 68820 - }, - { - "epoch": 4.5031076218514885, - "grad_norm": 0.9607967138290405, - "learning_rate": 7.271671124175893e-05, - "loss": 0.0632, - "step": 68830 - }, - { - "epoch": 4.5037618580307495, - "grad_norm": 0.8309448957443237, - "learning_rate": 7.270852773246528e-05, - "loss": 0.0614, - "step": 68840 - }, - { - "epoch": 4.5044160942100095, - "grad_norm": 0.9089359045028687, - "learning_rate": 7.27003434566993e-05, - "loss": 0.0693, - "step": 68850 - }, - { - "epoch": 4.5050703303892705, - "grad_norm": 0.9338735342025757, - "learning_rate": 7.269215841473717e-05, - "loss": 0.0612, - "step": 68860 - }, - { - "epoch": 4.5057245665685315, - "grad_norm": 0.8226944804191589, - "learning_rate": 7.268397260685518e-05, - "loss": 0.068, - "step": 68870 - }, - { - "epoch": 4.506378802747792, - "grad_norm": 0.8187761902809143, - "learning_rate": 7.267578603332963e-05, - "loss": 0.0563, - "step": 68880 - }, - { - "epoch": 4.5070330389270525, - "grad_norm": 0.8850693702697754, - "learning_rate": 7.266759869443683e-05, - "loss": 0.0683, - "step": 68890 - }, - { - "epoch": 4.5076872751063135, - "grad_norm": 0.8132352232933044, - "learning_rate": 7.265941059045314e-05, - "loss": 0.0595, - "step": 68900 - }, - { - "epoch": 4.5083415112855745, - "grad_norm": 0.8258399367332458, - "learning_rate": 7.265122172165489e-05, - "loss": 0.0637, - "step": 68910 - }, - { - "epoch": 4.508995747464835, - "grad_norm": 0.7450704574584961, - "learning_rate": 7.264303208831854e-05, - "loss": 0.0644, - "step": 68920 - }, - { - "epoch": 4.5096499836440955, - "grad_norm": 0.7500464916229248, - "learning_rate": 7.263484169072044e-05, - "loss": 0.0666, - "step": 68930 - }, - { - "epoch": 4.5103042198233565, - "grad_norm": 0.8899851441383362, - "learning_rate": 7.262665052913707e-05, - "loss": 0.0559, - "step": 68940 - }, - { - "epoch": 4.510958456002617, - "grad_norm": 0.9518905282020569, - "learning_rate": 7.261845860384492e-05, - "loss": 0.0637, - "step": 68950 - }, - { - "epoch": 4.5116126921818775, - "grad_norm": 1.0154138803482056, - "learning_rate": 7.261026591512047e-05, - "loss": 0.0598, - "step": 68960 - }, - { - "epoch": 4.5122669283611385, - "grad_norm": 0.7621309757232666, - "learning_rate": 7.260207246324024e-05, - "loss": 0.0729, - "step": 68970 - }, - { - "epoch": 4.5129211645403995, - "grad_norm": 0.8984786868095398, - "learning_rate": 7.25938782484808e-05, - "loss": 0.0629, - "step": 68980 - }, - { - "epoch": 4.51357540071966, - "grad_norm": 0.9162548780441284, - "learning_rate": 7.25856832711187e-05, - "loss": 0.0639, - "step": 68990 - }, - { - "epoch": 4.5142296368989205, - "grad_norm": 0.844033420085907, - "learning_rate": 7.257748753143057e-05, - "loss": 0.0614, - "step": 69000 - }, - { - "epoch": 4.5148838730781815, - "grad_norm": 0.8666905760765076, - "learning_rate": 7.256929102969302e-05, - "loss": 0.0638, - "step": 69010 - }, - { - "epoch": 4.515538109257442, - "grad_norm": 0.9945257902145386, - "learning_rate": 7.256109376618271e-05, - "loss": 0.0602, - "step": 69020 - }, - { - "epoch": 4.516192345436703, - "grad_norm": 1.0383166074752808, - "learning_rate": 7.25528957411763e-05, - "loss": 0.0587, - "step": 69030 - }, - { - "epoch": 4.5168465816159635, - "grad_norm": 0.9124101400375366, - "learning_rate": 7.254469695495054e-05, - "loss": 0.0667, - "step": 69040 - }, - { - "epoch": 4.517500817795224, - "grad_norm": 0.7469764947891235, - "learning_rate": 7.253649740778212e-05, - "loss": 0.0641, - "step": 69050 - }, - { - "epoch": 4.518155053974485, - "grad_norm": 0.8720923662185669, - "learning_rate": 7.25282970999478e-05, - "loss": 0.0733, - "step": 69060 - }, - { - "epoch": 4.5188092901537456, - "grad_norm": 0.8793687224388123, - "learning_rate": 7.252009603172436e-05, - "loss": 0.0716, - "step": 69070 - }, - { - "epoch": 4.5194635263330065, - "grad_norm": 0.8005584478378296, - "learning_rate": 7.251189420338865e-05, - "loss": 0.0605, - "step": 69080 - }, - { - "epoch": 4.520117762512267, - "grad_norm": 0.868381142616272, - "learning_rate": 7.250369161521746e-05, - "loss": 0.0644, - "step": 69090 - }, - { - "epoch": 4.520771998691528, - "grad_norm": 0.8739510774612427, - "learning_rate": 7.249548826748764e-05, - "loss": 0.067, - "step": 69100 - }, - { - "epoch": 4.5214262348707885, - "grad_norm": 0.9072023034095764, - "learning_rate": 7.248728416047611e-05, - "loss": 0.0612, - "step": 69110 - }, - { - "epoch": 4.5220804710500495, - "grad_norm": 0.9014410972595215, - "learning_rate": 7.247907929445976e-05, - "loss": 0.0686, - "step": 69120 - }, - { - "epoch": 4.52273470722931, - "grad_norm": 0.883929967880249, - "learning_rate": 7.247087366971554e-05, - "loss": 0.054, - "step": 69130 - }, - { - "epoch": 4.523388943408571, - "grad_norm": 1.133589744567871, - "learning_rate": 7.24626672865204e-05, - "loss": 0.0698, - "step": 69140 - }, - { - "epoch": 4.5240431795878315, - "grad_norm": 0.7994188666343689, - "learning_rate": 7.245446014515132e-05, - "loss": 0.0606, - "step": 69150 - }, - { - "epoch": 4.524697415767092, - "grad_norm": 1.077908992767334, - "learning_rate": 7.244625224588533e-05, - "loss": 0.0638, - "step": 69160 - }, - { - "epoch": 4.525351651946353, - "grad_norm": 0.7507327795028687, - "learning_rate": 7.243804358899943e-05, - "loss": 0.0579, - "step": 69170 - }, - { - "epoch": 4.5260058881256136, - "grad_norm": 0.8101299405097961, - "learning_rate": 7.242983417477076e-05, - "loss": 0.0637, - "step": 69180 - }, - { - "epoch": 4.526660124304874, - "grad_norm": 0.8456845879554749, - "learning_rate": 7.242162400347634e-05, - "loss": 0.0586, - "step": 69190 - }, - { - "epoch": 4.527314360484135, - "grad_norm": 0.812188982963562, - "learning_rate": 7.24134130753933e-05, - "loss": 0.0627, - "step": 69200 - }, - { - "epoch": 4.527968596663396, - "grad_norm": 1.0019590854644775, - "learning_rate": 7.24052013907988e-05, - "loss": 0.0598, - "step": 69210 - }, - { - "epoch": 4.5286228328426565, - "grad_norm": 1.0545828342437744, - "learning_rate": 7.239698894996997e-05, - "loss": 0.062, - "step": 69220 - }, - { - "epoch": 4.529277069021917, - "grad_norm": 0.7775808572769165, - "learning_rate": 7.238877575318405e-05, - "loss": 0.0578, - "step": 69230 - }, - { - "epoch": 4.529931305201178, - "grad_norm": 0.9661263227462769, - "learning_rate": 7.238056180071823e-05, - "loss": 0.0685, - "step": 69240 - }, - { - "epoch": 4.530585541380439, - "grad_norm": 0.8217857480049133, - "learning_rate": 7.237234709284975e-05, - "loss": 0.0544, - "step": 69250 - }, - { - "epoch": 4.5312397775596995, - "grad_norm": 0.7686980962753296, - "learning_rate": 7.236413162985587e-05, - "loss": 0.06, - "step": 69260 - }, - { - "epoch": 4.53189401373896, - "grad_norm": 0.7001199722290039, - "learning_rate": 7.235591541201391e-05, - "loss": 0.0599, - "step": 69270 - }, - { - "epoch": 4.532548249918221, - "grad_norm": 0.6666156649589539, - "learning_rate": 7.234769843960116e-05, - "loss": 0.0668, - "step": 69280 - }, - { - "epoch": 4.5332024860974816, - "grad_norm": 0.9138648509979248, - "learning_rate": 7.233948071289499e-05, - "loss": 0.06, - "step": 69290 - }, - { - "epoch": 4.533856722276742, - "grad_norm": 1.0828460454940796, - "learning_rate": 7.233126223217275e-05, - "loss": 0.0742, - "step": 69300 - }, - { - "epoch": 4.534510958456003, - "grad_norm": 0.9059333801269531, - "learning_rate": 7.232304299771187e-05, - "loss": 0.0619, - "step": 69310 - }, - { - "epoch": 4.535165194635264, - "grad_norm": 0.7670040130615234, - "learning_rate": 7.231482300978971e-05, - "loss": 0.0635, - "step": 69320 - }, - { - "epoch": 4.535819430814524, - "grad_norm": 0.7467994689941406, - "learning_rate": 7.230660226868376e-05, - "loss": 0.0586, - "step": 69330 - }, - { - "epoch": 4.536473666993785, - "grad_norm": 0.8463179469108582, - "learning_rate": 7.22983807746715e-05, - "loss": 0.0564, - "step": 69340 - }, - { - "epoch": 4.537127903173046, - "grad_norm": 0.6910958290100098, - "learning_rate": 7.22901585280304e-05, - "loss": 0.0637, - "step": 69350 - }, - { - "epoch": 4.537782139352307, - "grad_norm": 0.78616863489151, - "learning_rate": 7.228193552903798e-05, - "loss": 0.059, - "step": 69360 - }, - { - "epoch": 4.538436375531567, - "grad_norm": 0.934597373008728, - "learning_rate": 7.227371177797181e-05, - "loss": 0.0633, - "step": 69370 - }, - { - "epoch": 4.539090611710828, - "grad_norm": 0.9315169453620911, - "learning_rate": 7.226548727510945e-05, - "loss": 0.0573, - "step": 69380 - }, - { - "epoch": 4.539744847890089, - "grad_norm": 1.0546493530273438, - "learning_rate": 7.22572620207285e-05, - "loss": 0.069, - "step": 69390 - }, - { - "epoch": 4.540399084069349, - "grad_norm": 0.7650620937347412, - "learning_rate": 7.224903601510658e-05, - "loss": 0.0564, - "step": 69400 - }, - { - "epoch": 4.54105332024861, - "grad_norm": 0.7123848795890808, - "learning_rate": 7.224080925852136e-05, - "loss": 0.065, - "step": 69410 - }, - { - "epoch": 4.541707556427871, - "grad_norm": 0.8295034766197205, - "learning_rate": 7.22325817512505e-05, - "loss": 0.0546, - "step": 69420 - }, - { - "epoch": 4.542361792607132, - "grad_norm": 0.9228529930114746, - "learning_rate": 7.222435349357169e-05, - "loss": 0.0676, - "step": 69430 - }, - { - "epoch": 4.543016028786392, - "grad_norm": 0.7449051141738892, - "learning_rate": 7.221612448576266e-05, - "loss": 0.0575, - "step": 69440 - }, - { - "epoch": 4.543670264965653, - "grad_norm": 0.832432210445404, - "learning_rate": 7.220789472810115e-05, - "loss": 0.0595, - "step": 69450 - }, - { - "epoch": 4.544324501144914, - "grad_norm": 1.0171115398406982, - "learning_rate": 7.219966422086497e-05, - "loss": 0.0561, - "step": 69460 - }, - { - "epoch": 4.544978737324174, - "grad_norm": 0.8047521710395813, - "learning_rate": 7.219143296433191e-05, - "loss": 0.0636, - "step": 69470 - }, - { - "epoch": 4.545632973503435, - "grad_norm": 0.7108994126319885, - "learning_rate": 7.218320095877976e-05, - "loss": 0.0626, - "step": 69480 - }, - { - "epoch": 4.546287209682696, - "grad_norm": 0.9195765256881714, - "learning_rate": 7.217496820448642e-05, - "loss": 0.0578, - "step": 69490 - }, - { - "epoch": 4.546941445861956, - "grad_norm": 1.0596858263015747, - "learning_rate": 7.216673470172975e-05, - "loss": 0.0603, - "step": 69500 - }, - { - "epoch": 4.547595682041217, - "grad_norm": 0.7993507385253906, - "learning_rate": 7.215850045078765e-05, - "loss": 0.0695, - "step": 69510 - }, - { - "epoch": 4.548249918220478, - "grad_norm": 1.125396728515625, - "learning_rate": 7.215026545193802e-05, - "loss": 0.0666, - "step": 69520 - }, - { - "epoch": 4.548904154399739, - "grad_norm": 0.8783840537071228, - "learning_rate": 7.214202970545888e-05, - "loss": 0.0567, - "step": 69530 - }, - { - "epoch": 4.549558390578999, - "grad_norm": 0.7901387214660645, - "learning_rate": 7.213379321162814e-05, - "loss": 0.0585, - "step": 69540 - }, - { - "epoch": 4.55021262675826, - "grad_norm": 0.8331219553947449, - "learning_rate": 7.212555597072384e-05, - "loss": 0.0596, - "step": 69550 - }, - { - "epoch": 4.550866862937521, - "grad_norm": 0.8178802132606506, - "learning_rate": 7.2117317983024e-05, - "loss": 0.0582, - "step": 69560 - }, - { - "epoch": 4.551521099116782, - "grad_norm": 0.7966588139533997, - "learning_rate": 7.210907924880668e-05, - "loss": 0.0586, - "step": 69570 - }, - { - "epoch": 4.552175335296042, - "grad_norm": 0.7240191698074341, - "learning_rate": 7.210083976834994e-05, - "loss": 0.0554, - "step": 69580 - }, - { - "epoch": 4.552829571475303, - "grad_norm": 0.6991345882415771, - "learning_rate": 7.20925995419319e-05, - "loss": 0.0614, - "step": 69590 - }, - { - "epoch": 4.553483807654564, - "grad_norm": 0.9733833074569702, - "learning_rate": 7.208435856983068e-05, - "loss": 0.0631, - "step": 69600 - }, - { - "epoch": 4.554138043833824, - "grad_norm": 0.8843761086463928, - "learning_rate": 7.207611685232447e-05, - "loss": 0.0647, - "step": 69610 - }, - { - "epoch": 4.554792280013085, - "grad_norm": 1.0091403722763062, - "learning_rate": 7.206787438969138e-05, - "loss": 0.0581, - "step": 69620 - }, - { - "epoch": 4.555446516192346, - "grad_norm": 1.0898354053497314, - "learning_rate": 7.205963118220967e-05, - "loss": 0.0744, - "step": 69630 - }, - { - "epoch": 4.556100752371606, - "grad_norm": 0.7941951155662537, - "learning_rate": 7.205138723015756e-05, - "loss": 0.0728, - "step": 69640 - }, - { - "epoch": 4.556754988550867, - "grad_norm": 1.1205252408981323, - "learning_rate": 7.204314253381329e-05, - "loss": 0.0603, - "step": 69650 - }, - { - "epoch": 4.557409224730128, - "grad_norm": 0.9061139225959778, - "learning_rate": 7.203489709345515e-05, - "loss": 0.0613, - "step": 69660 - }, - { - "epoch": 4.558063460909389, - "grad_norm": 0.958292543888092, - "learning_rate": 7.202665090936145e-05, - "loss": 0.0667, - "step": 69670 - }, - { - "epoch": 4.558717697088649, - "grad_norm": 0.6560031771659851, - "learning_rate": 7.201840398181052e-05, - "loss": 0.0674, - "step": 69680 - }, - { - "epoch": 4.55937193326791, - "grad_norm": 0.8264176845550537, - "learning_rate": 7.201015631108071e-05, - "loss": 0.0682, - "step": 69690 - }, - { - "epoch": 4.560026169447171, - "grad_norm": 1.0350055694580078, - "learning_rate": 7.200190789745038e-05, - "loss": 0.061, - "step": 69700 - }, - { - "epoch": 4.560680405626432, - "grad_norm": 0.8044286370277405, - "learning_rate": 7.199365874119796e-05, - "loss": 0.0655, - "step": 69710 - }, - { - "epoch": 4.561334641805692, - "grad_norm": 0.9388670325279236, - "learning_rate": 7.198540884260189e-05, - "loss": 0.06, - "step": 69720 - }, - { - "epoch": 4.561988877984953, - "grad_norm": 0.7581605315208435, - "learning_rate": 7.197715820194062e-05, - "loss": 0.0535, - "step": 69730 - }, - { - "epoch": 4.562643114164214, - "grad_norm": 1.0746386051177979, - "learning_rate": 7.19689068194926e-05, - "loss": 0.0575, - "step": 69740 - }, - { - "epoch": 4.563297350343474, - "grad_norm": 0.7626873254776001, - "learning_rate": 7.196065469553637e-05, - "loss": 0.0616, - "step": 69750 - }, - { - "epoch": 4.563951586522735, - "grad_norm": 0.9289671182632446, - "learning_rate": 7.195240183035045e-05, - "loss": 0.0586, - "step": 69760 - }, - { - "epoch": 4.564605822701996, - "grad_norm": 0.8049091100692749, - "learning_rate": 7.194414822421341e-05, - "loss": 0.0527, - "step": 69770 - }, - { - "epoch": 4.565260058881256, - "grad_norm": 0.7008090615272522, - "learning_rate": 7.19358938774038e-05, - "loss": 0.0763, - "step": 69780 - }, - { - "epoch": 4.565914295060517, - "grad_norm": 1.1009507179260254, - "learning_rate": 7.192763879020025e-05, - "loss": 0.0732, - "step": 69790 - }, - { - "epoch": 4.566568531239778, - "grad_norm": 0.9115166068077087, - "learning_rate": 7.191938296288138e-05, - "loss": 0.0635, - "step": 69800 - }, - { - "epoch": 4.567222767419039, - "grad_norm": 0.9855472445487976, - "learning_rate": 7.191112639572585e-05, - "loss": 0.0594, - "step": 69810 - }, - { - "epoch": 4.567877003598299, - "grad_norm": 1.0158358812332153, - "learning_rate": 7.190286908901234e-05, - "loss": 0.0598, - "step": 69820 - }, - { - "epoch": 4.56853123977756, - "grad_norm": 0.7461121082305908, - "learning_rate": 7.189461104301955e-05, - "loss": 0.0578, - "step": 69830 - }, - { - "epoch": 4.569185475956821, - "grad_norm": 0.8417521119117737, - "learning_rate": 7.188635225802622e-05, - "loss": 0.0617, - "step": 69840 - }, - { - "epoch": 4.569839712136081, - "grad_norm": 0.9156501293182373, - "learning_rate": 7.187809273431112e-05, - "loss": 0.0567, - "step": 69850 - }, - { - "epoch": 4.570493948315342, - "grad_norm": 1.0383937358856201, - "learning_rate": 7.186983247215299e-05, - "loss": 0.0659, - "step": 69860 - }, - { - "epoch": 4.571148184494603, - "grad_norm": 0.6928499937057495, - "learning_rate": 7.186157147183067e-05, - "loss": 0.066, - "step": 69870 - }, - { - "epoch": 4.571802420673864, - "grad_norm": 0.7794655561447144, - "learning_rate": 7.185330973362298e-05, - "loss": 0.0602, - "step": 69880 - }, - { - "epoch": 4.572456656853124, - "grad_norm": 0.9522107839584351, - "learning_rate": 7.184504725780876e-05, - "loss": 0.0602, - "step": 69890 - }, - { - "epoch": 4.573110893032385, - "grad_norm": 0.9727455377578735, - "learning_rate": 7.183678404466691e-05, - "loss": 0.061, - "step": 69900 - }, - { - "epoch": 4.573765129211646, - "grad_norm": 0.8245759606361389, - "learning_rate": 7.182852009447633e-05, - "loss": 0.07, - "step": 69910 - }, - { - "epoch": 4.574419365390906, - "grad_norm": 0.9080676436424255, - "learning_rate": 7.182025540751595e-05, - "loss": 0.0647, - "step": 69920 - }, - { - "epoch": 4.575073601570167, - "grad_norm": 0.8655606508255005, - "learning_rate": 7.181198998406473e-05, - "loss": 0.0543, - "step": 69930 - }, - { - "epoch": 4.575727837749428, - "grad_norm": 1.0954169034957886, - "learning_rate": 7.180372382440164e-05, - "loss": 0.064, - "step": 69940 - }, - { - "epoch": 4.576382073928688, - "grad_norm": 0.817754328250885, - "learning_rate": 7.179545692880569e-05, - "loss": 0.0651, - "step": 69950 - }, - { - "epoch": 4.577036310107949, - "grad_norm": 0.6452881693840027, - "learning_rate": 7.17871892975559e-05, - "loss": 0.0624, - "step": 69960 - }, - { - "epoch": 4.57769054628721, - "grad_norm": 0.8822810053825378, - "learning_rate": 7.177892093093134e-05, - "loss": 0.0512, - "step": 69970 - }, - { - "epoch": 4.578344782466471, - "grad_norm": 0.7924016714096069, - "learning_rate": 7.17706518292111e-05, - "loss": 0.0663, - "step": 69980 - }, - { - "epoch": 4.578999018645731, - "grad_norm": 1.0157910585403442, - "learning_rate": 7.176238199267424e-05, - "loss": 0.0615, - "step": 69990 - }, - { - "epoch": 4.579653254824992, - "grad_norm": 0.6557690501213074, - "learning_rate": 7.175411142159991e-05, - "loss": 0.0521, - "step": 70000 - }, - { - "epoch": 4.580307491004253, - "grad_norm": 0.7803752422332764, - "learning_rate": 7.174584011626728e-05, - "loss": 0.0611, - "step": 70010 - }, - { - "epoch": 4.580961727183514, - "grad_norm": 0.889769971370697, - "learning_rate": 7.17375680769555e-05, - "loss": 0.0615, - "step": 70020 - }, - { - "epoch": 4.581615963362774, - "grad_norm": 0.706065833568573, - "learning_rate": 7.17292953039438e-05, - "loss": 0.0589, - "step": 70030 - }, - { - "epoch": 4.582270199542035, - "grad_norm": 0.9106086492538452, - "learning_rate": 7.172102179751141e-05, - "loss": 0.0638, - "step": 70040 - }, - { - "epoch": 4.582924435721296, - "grad_norm": 0.7716435790061951, - "learning_rate": 7.171274755793756e-05, - "loss": 0.0571, - "step": 70050 - }, - { - "epoch": 4.583578671900556, - "grad_norm": 0.8700406551361084, - "learning_rate": 7.170447258550152e-05, - "loss": 0.0659, - "step": 70060 - }, - { - "epoch": 4.584232908079817, - "grad_norm": 0.902850329875946, - "learning_rate": 7.169619688048262e-05, - "loss": 0.06, - "step": 70070 - }, - { - "epoch": 4.584887144259078, - "grad_norm": 1.30494225025177, - "learning_rate": 7.168792044316017e-05, - "loss": 0.0595, - "step": 70080 - }, - { - "epoch": 4.585541380438338, - "grad_norm": 0.878594160079956, - "learning_rate": 7.167964327381355e-05, - "loss": 0.0602, - "step": 70090 - }, - { - "epoch": 4.586195616617599, - "grad_norm": 1.009494662284851, - "learning_rate": 7.167136537272208e-05, - "loss": 0.0622, - "step": 70100 - }, - { - "epoch": 4.58684985279686, - "grad_norm": 0.858677864074707, - "learning_rate": 7.166308674016522e-05, - "loss": 0.0642, - "step": 70110 - }, - { - "epoch": 4.587504088976121, - "grad_norm": 0.8348826169967651, - "learning_rate": 7.165480737642234e-05, - "loss": 0.0596, - "step": 70120 - }, - { - "epoch": 4.588158325155381, - "grad_norm": 0.9198652505874634, - "learning_rate": 7.164652728177294e-05, - "loss": 0.0584, - "step": 70130 - }, - { - "epoch": 4.588812561334642, - "grad_norm": 0.8578175902366638, - "learning_rate": 7.163824645649648e-05, - "loss": 0.0597, - "step": 70140 - }, - { - "epoch": 4.589466797513903, - "grad_norm": 0.8815035223960876, - "learning_rate": 7.162996490087243e-05, - "loss": 0.0548, - "step": 70150 - }, - { - "epoch": 4.590121033693164, - "grad_norm": 0.8675861954689026, - "learning_rate": 7.162168261518036e-05, - "loss": 0.0571, - "step": 70160 - }, - { - "epoch": 4.590775269872424, - "grad_norm": 0.9907076358795166, - "learning_rate": 7.161339959969979e-05, - "loss": 0.0618, - "step": 70170 - }, - { - "epoch": 4.591429506051685, - "grad_norm": 0.8220086693763733, - "learning_rate": 7.160511585471031e-05, - "loss": 0.0592, - "step": 70180 - }, - { - "epoch": 4.592083742230946, - "grad_norm": 0.9038228988647461, - "learning_rate": 7.159683138049148e-05, - "loss": 0.0595, - "step": 70190 - }, - { - "epoch": 4.592737978410206, - "grad_norm": 0.9181106686592102, - "learning_rate": 7.158854617732297e-05, - "loss": 0.0556, - "step": 70200 - }, - { - "epoch": 4.593392214589467, - "grad_norm": 0.8551499843597412, - "learning_rate": 7.158026024548441e-05, - "loss": 0.0594, - "step": 70210 - }, - { - "epoch": 4.594046450768728, - "grad_norm": 0.9677573442459106, - "learning_rate": 7.157197358525546e-05, - "loss": 0.06, - "step": 70220 - }, - { - "epoch": 4.594700686947988, - "grad_norm": 0.8928607702255249, - "learning_rate": 7.156368619691582e-05, - "loss": 0.0647, - "step": 70230 - }, - { - "epoch": 4.595354923127249, - "grad_norm": 0.8647032976150513, - "learning_rate": 7.155539808074525e-05, - "loss": 0.0641, - "step": 70240 - }, - { - "epoch": 4.59600915930651, - "grad_norm": 0.9325335621833801, - "learning_rate": 7.154710923702345e-05, - "loss": 0.0613, - "step": 70250 - }, - { - "epoch": 4.596663395485771, - "grad_norm": 0.8028609156608582, - "learning_rate": 7.153881966603019e-05, - "loss": 0.0595, - "step": 70260 - }, - { - "epoch": 4.597317631665031, - "grad_norm": 0.9629660248756409, - "learning_rate": 7.153052936804529e-05, - "loss": 0.0555, - "step": 70270 - }, - { - "epoch": 4.597971867844292, - "grad_norm": 0.9992688298225403, - "learning_rate": 7.152223834334855e-05, - "loss": 0.0691, - "step": 70280 - }, - { - "epoch": 4.598626104023553, - "grad_norm": 0.7620879411697388, - "learning_rate": 7.151394659221984e-05, - "loss": 0.0601, - "step": 70290 - }, - { - "epoch": 4.599280340202813, - "grad_norm": 0.883135199546814, - "learning_rate": 7.150565411493899e-05, - "loss": 0.0554, - "step": 70300 - }, - { - "epoch": 4.599934576382074, - "grad_norm": 0.8866992592811584, - "learning_rate": 7.149736091178593e-05, - "loss": 0.0674, - "step": 70310 - }, - { - "epoch": 4.600588812561335, - "grad_norm": 1.0410959720611572, - "learning_rate": 7.148906698304054e-05, - "loss": 0.0634, - "step": 70320 - }, - { - "epoch": 4.601243048740596, - "grad_norm": 0.7747849225997925, - "learning_rate": 7.14807723289828e-05, - "loss": 0.0543, - "step": 70330 - }, - { - "epoch": 4.601897284919856, - "grad_norm": 0.9914830923080444, - "learning_rate": 7.147247694989265e-05, - "loss": 0.058, - "step": 70340 - }, - { - "epoch": 4.602551521099117, - "grad_norm": 0.8402534127235413, - "learning_rate": 7.146418084605008e-05, - "loss": 0.0634, - "step": 70350 - }, - { - "epoch": 4.603205757278378, - "grad_norm": 0.840990424156189, - "learning_rate": 7.145588401773513e-05, - "loss": 0.0559, - "step": 70360 - }, - { - "epoch": 4.603859993457638, - "grad_norm": 0.8721504807472229, - "learning_rate": 7.144758646522782e-05, - "loss": 0.0638, - "step": 70370 - }, - { - "epoch": 4.604514229636899, - "grad_norm": 0.8527781963348389, - "learning_rate": 7.143928818880823e-05, - "loss": 0.0532, - "step": 70380 - }, - { - "epoch": 4.60516846581616, - "grad_norm": 1.0726641416549683, - "learning_rate": 7.143098918875643e-05, - "loss": 0.0602, - "step": 70390 - }, - { - "epoch": 4.60582270199542, - "grad_norm": 1.0144989490509033, - "learning_rate": 7.142268946535254e-05, - "loss": 0.0642, - "step": 70400 - }, - { - "epoch": 4.606476938174681, - "grad_norm": 1.0396263599395752, - "learning_rate": 7.141438901887669e-05, - "loss": 0.0704, - "step": 70410 - }, - { - "epoch": 4.607131174353942, - "grad_norm": 0.7926657199859619, - "learning_rate": 7.140608784960904e-05, - "loss": 0.0692, - "step": 70420 - }, - { - "epoch": 4.607785410533203, - "grad_norm": 1.1654900312423706, - "learning_rate": 7.13977859578298e-05, - "loss": 0.0588, - "step": 70430 - }, - { - "epoch": 4.608439646712463, - "grad_norm": 0.9347232580184937, - "learning_rate": 7.138948334381917e-05, - "loss": 0.0604, - "step": 70440 - }, - { - "epoch": 4.609093882891724, - "grad_norm": 1.0925201177597046, - "learning_rate": 7.138118000785736e-05, - "loss": 0.0708, - "step": 70450 - }, - { - "epoch": 4.609748119070985, - "grad_norm": 0.9445613026618958, - "learning_rate": 7.137287595022467e-05, - "loss": 0.0523, - "step": 70460 - }, - { - "epoch": 4.610402355250246, - "grad_norm": 0.795276939868927, - "learning_rate": 7.136457117120136e-05, - "loss": 0.0655, - "step": 70470 - }, - { - "epoch": 4.611056591429506, - "grad_norm": 0.9454329609870911, - "learning_rate": 7.135626567106775e-05, - "loss": 0.0694, - "step": 70480 - }, - { - "epoch": 4.611710827608767, - "grad_norm": 0.8667651414871216, - "learning_rate": 7.134795945010416e-05, - "loss": 0.0673, - "step": 70490 - }, - { - "epoch": 4.612365063788028, - "grad_norm": 0.7597100734710693, - "learning_rate": 7.133965250859094e-05, - "loss": 0.0647, - "step": 70500 - }, - { - "epoch": 4.613019299967288, - "grad_norm": 1.0633947849273682, - "learning_rate": 7.13313448468085e-05, - "loss": 0.0676, - "step": 70510 - }, - { - "epoch": 4.613673536146549, - "grad_norm": 0.9230862259864807, - "learning_rate": 7.13230364650372e-05, - "loss": 0.0718, - "step": 70520 - }, - { - "epoch": 4.61432777232581, - "grad_norm": 0.8642282485961914, - "learning_rate": 7.131472736355754e-05, - "loss": 0.0678, - "step": 70530 - }, - { - "epoch": 4.61498200850507, - "grad_norm": 0.8637971878051758, - "learning_rate": 7.130641754264991e-05, - "loss": 0.0718, - "step": 70540 - }, - { - "epoch": 4.615636244684331, - "grad_norm": 0.943467915058136, - "learning_rate": 7.12981070025948e-05, - "loss": 0.0527, - "step": 70550 - }, - { - "epoch": 4.616290480863592, - "grad_norm": 0.843879222869873, - "learning_rate": 7.128979574367272e-05, - "loss": 0.0649, - "step": 70560 - }, - { - "epoch": 4.616944717042853, - "grad_norm": 0.9107741713523865, - "learning_rate": 7.128148376616422e-05, - "loss": 0.0617, - "step": 70570 - }, - { - "epoch": 4.617598953222113, - "grad_norm": 0.8617720007896423, - "learning_rate": 7.127317107034981e-05, - "loss": 0.061, - "step": 70580 - }, - { - "epoch": 4.618253189401374, - "grad_norm": 0.8260530829429626, - "learning_rate": 7.12648576565101e-05, - "loss": 0.0536, - "step": 70590 - }, - { - "epoch": 4.618907425580635, - "grad_norm": 0.9773645401000977, - "learning_rate": 7.125654352492567e-05, - "loss": 0.0531, - "step": 70600 - }, - { - "epoch": 4.619561661759896, - "grad_norm": 1.027761459350586, - "learning_rate": 7.124822867587715e-05, - "loss": 0.0566, - "step": 70610 - }, - { - "epoch": 4.620215897939156, - "grad_norm": 1.0088636875152588, - "learning_rate": 7.123991310964519e-05, - "loss": 0.0577, - "step": 70620 - }, - { - "epoch": 4.620870134118417, - "grad_norm": 0.9433162212371826, - "learning_rate": 7.123159682651045e-05, - "loss": 0.06, - "step": 70630 - }, - { - "epoch": 4.621524370297678, - "grad_norm": 1.0811787843704224, - "learning_rate": 7.122327982675363e-05, - "loss": 0.0597, - "step": 70640 - }, - { - "epoch": 4.622178606476938, - "grad_norm": 0.8905490636825562, - "learning_rate": 7.121496211065547e-05, - "loss": 0.0581, - "step": 70650 - }, - { - "epoch": 4.622832842656199, - "grad_norm": 0.801537811756134, - "learning_rate": 7.12066436784967e-05, - "loss": 0.0565, - "step": 70660 - }, - { - "epoch": 4.62348707883546, - "grad_norm": 0.9896419048309326, - "learning_rate": 7.119832453055809e-05, - "loss": 0.0645, - "step": 70670 - }, - { - "epoch": 4.62414131501472, - "grad_norm": 0.9551672339439392, - "learning_rate": 7.119000466712042e-05, - "loss": 0.058, - "step": 70680 - }, - { - "epoch": 4.624795551193981, - "grad_norm": 0.8658273220062256, - "learning_rate": 7.118168408846454e-05, - "loss": 0.0601, - "step": 70690 - }, - { - "epoch": 4.625449787373242, - "grad_norm": 0.7730889916419983, - "learning_rate": 7.117336279487124e-05, - "loss": 0.0589, - "step": 70700 - }, - { - "epoch": 4.626104023552503, - "grad_norm": 1.2071642875671387, - "learning_rate": 7.116504078662144e-05, - "loss": 0.0607, - "step": 70710 - }, - { - "epoch": 4.626758259731763, - "grad_norm": 0.6655055284500122, - "learning_rate": 7.1156718063996e-05, - "loss": 0.0672, - "step": 70720 - }, - { - "epoch": 4.627412495911024, - "grad_norm": 0.765287458896637, - "learning_rate": 7.114839462727585e-05, - "loss": 0.0657, - "step": 70730 - }, - { - "epoch": 4.628066732090285, - "grad_norm": 0.9906727075576782, - "learning_rate": 7.114007047674189e-05, - "loss": 0.0612, - "step": 70740 - }, - { - "epoch": 4.628720968269545, - "grad_norm": 0.8197884559631348, - "learning_rate": 7.113174561267514e-05, - "loss": 0.0638, - "step": 70750 - }, - { - "epoch": 4.629375204448806, - "grad_norm": 0.8256412744522095, - "learning_rate": 7.112342003535654e-05, - "loss": 0.062, - "step": 70760 - }, - { - "epoch": 4.630029440628067, - "grad_norm": 1.0937832593917847, - "learning_rate": 7.111509374506711e-05, - "loss": 0.0612, - "step": 70770 - }, - { - "epoch": 4.630683676807328, - "grad_norm": 0.8414881825447083, - "learning_rate": 7.11067667420879e-05, - "loss": 0.0588, - "step": 70780 - }, - { - "epoch": 4.631337912986588, - "grad_norm": 1.1070261001586914, - "learning_rate": 7.109843902669997e-05, - "loss": 0.0754, - "step": 70790 - }, - { - "epoch": 4.631992149165849, - "grad_norm": 1.002337098121643, - "learning_rate": 7.109011059918438e-05, - "loss": 0.062, - "step": 70800 - }, - { - "epoch": 4.63264638534511, - "grad_norm": 0.7594942450523376, - "learning_rate": 7.108178145982223e-05, - "loss": 0.06, - "step": 70810 - }, - { - "epoch": 4.63330062152437, - "grad_norm": 0.8567973375320435, - "learning_rate": 7.107345160889469e-05, - "loss": 0.0627, - "step": 70820 - }, - { - "epoch": 4.633954857703631, - "grad_norm": 0.919387936592102, - "learning_rate": 7.106512104668287e-05, - "loss": 0.0654, - "step": 70830 - }, - { - "epoch": 4.634609093882892, - "grad_norm": 0.9777571558952332, - "learning_rate": 7.1056789773468e-05, - "loss": 0.0661, - "step": 70840 - }, - { - "epoch": 4.635263330062152, - "grad_norm": 0.7844616174697876, - "learning_rate": 7.104845778953122e-05, - "loss": 0.0597, - "step": 70850 - }, - { - "epoch": 4.635917566241413, - "grad_norm": 0.8564948439598083, - "learning_rate": 7.10401250951538e-05, - "loss": 0.0679, - "step": 70860 - }, - { - "epoch": 4.636571802420674, - "grad_norm": 0.8882778286933899, - "learning_rate": 7.103179169061697e-05, - "loss": 0.0637, - "step": 70870 - }, - { - "epoch": 4.637226038599935, - "grad_norm": 0.8899109363555908, - "learning_rate": 7.102345757620204e-05, - "loss": 0.0639, - "step": 70880 - }, - { - "epoch": 4.637880274779195, - "grad_norm": 0.7381958961486816, - "learning_rate": 7.101512275219026e-05, - "loss": 0.0618, - "step": 70890 - }, - { - "epoch": 4.638534510958456, - "grad_norm": 1.1825381517410278, - "learning_rate": 7.100678721886296e-05, - "loss": 0.069, - "step": 70900 - }, - { - "epoch": 4.639188747137717, - "grad_norm": 0.8245072960853577, - "learning_rate": 7.099845097650152e-05, - "loss": 0.0648, - "step": 70910 - }, - { - "epoch": 4.639842983316978, - "grad_norm": 1.0069905519485474, - "learning_rate": 7.099011402538729e-05, - "loss": 0.0565, - "step": 70920 - }, - { - "epoch": 4.640497219496238, - "grad_norm": 0.9455946087837219, - "learning_rate": 7.098177636580165e-05, - "loss": 0.0644, - "step": 70930 - }, - { - "epoch": 4.641151455675499, - "grad_norm": 1.0115948915481567, - "learning_rate": 7.097343799802603e-05, - "loss": 0.0598, - "step": 70940 - }, - { - "epoch": 4.64180569185476, - "grad_norm": 0.910192608833313, - "learning_rate": 7.096509892234188e-05, - "loss": 0.062, - "step": 70950 - }, - { - "epoch": 4.64245992803402, - "grad_norm": 1.005014419555664, - "learning_rate": 7.095675913903067e-05, - "loss": 0.0573, - "step": 70960 - }, - { - "epoch": 4.643114164213281, - "grad_norm": 0.8233394622802734, - "learning_rate": 7.094841864837385e-05, - "loss": 0.0607, - "step": 70970 - }, - { - "epoch": 4.643768400392542, - "grad_norm": 0.84469074010849, - "learning_rate": 7.094007745065298e-05, - "loss": 0.0547, - "step": 70980 - }, - { - "epoch": 4.644422636571802, - "grad_norm": 0.846990168094635, - "learning_rate": 7.093173554614958e-05, - "loss": 0.0596, - "step": 70990 - }, - { - "epoch": 4.645076872751063, - "grad_norm": 1.1320626735687256, - "learning_rate": 7.092339293514521e-05, - "loss": 0.0595, - "step": 71000 - }, - { - "epoch": 4.645731108930324, - "grad_norm": 0.9012898206710815, - "learning_rate": 7.091504961792145e-05, - "loss": 0.0565, - "step": 71010 - }, - { - "epoch": 4.646385345109585, - "grad_norm": 0.9785043001174927, - "learning_rate": 7.090670559475991e-05, - "loss": 0.0636, - "step": 71020 - }, - { - "epoch": 4.647039581288845, - "grad_norm": 0.9479057788848877, - "learning_rate": 7.089836086594223e-05, - "loss": 0.063, - "step": 71030 - }, - { - "epoch": 4.647693817468106, - "grad_norm": 0.8791054487228394, - "learning_rate": 7.089001543175007e-05, - "loss": 0.0666, - "step": 71040 - }, - { - "epoch": 4.648348053647367, - "grad_norm": 1.0434389114379883, - "learning_rate": 7.088166929246509e-05, - "loss": 0.068, - "step": 71050 - }, - { - "epoch": 4.649002289826628, - "grad_norm": 0.8566122651100159, - "learning_rate": 7.087332244836901e-05, - "loss": 0.0675, - "step": 71060 - }, - { - "epoch": 4.649656526005888, - "grad_norm": 0.9243730902671814, - "learning_rate": 7.086497489974355e-05, - "loss": 0.0639, - "step": 71070 - }, - { - "epoch": 4.650310762185149, - "grad_norm": 0.8859608173370361, - "learning_rate": 7.085662664687049e-05, - "loss": 0.06, - "step": 71080 - }, - { - "epoch": 4.65096499836441, - "grad_norm": 1.0583763122558594, - "learning_rate": 7.084827769003157e-05, - "loss": 0.0573, - "step": 71090 - }, - { - "epoch": 4.65161923454367, - "grad_norm": 0.8830341100692749, - "learning_rate": 7.083992802950859e-05, - "loss": 0.0557, - "step": 71100 - }, - { - "epoch": 4.652273470722931, - "grad_norm": 0.8435945510864258, - "learning_rate": 7.08315776655834e-05, - "loss": 0.0535, - "step": 71110 - }, - { - "epoch": 4.652927706902192, - "grad_norm": 0.9333560466766357, - "learning_rate": 7.082322659853782e-05, - "loss": 0.065, - "step": 71120 - }, - { - "epoch": 4.653581943081452, - "grad_norm": 0.8376295566558838, - "learning_rate": 7.081487482865375e-05, - "loss": 0.0518, - "step": 71130 - }, - { - "epoch": 4.654236179260713, - "grad_norm": 0.7499420642852783, - "learning_rate": 7.080652235621304e-05, - "loss": 0.0536, - "step": 71140 - }, - { - "epoch": 4.654890415439974, - "grad_norm": 0.7663356065750122, - "learning_rate": 7.079816918149764e-05, - "loss": 0.0598, - "step": 71150 - }, - { - "epoch": 4.655544651619235, - "grad_norm": 1.0369300842285156, - "learning_rate": 7.07898153047895e-05, - "loss": 0.0584, - "step": 71160 - }, - { - "epoch": 4.656198887798495, - "grad_norm": 0.8538703322410583, - "learning_rate": 7.078146072637055e-05, - "loss": 0.0608, - "step": 71170 - }, - { - "epoch": 4.656853123977756, - "grad_norm": 0.9272082448005676, - "learning_rate": 7.077310544652282e-05, - "loss": 0.0666, - "step": 71180 - }, - { - "epoch": 4.657507360157017, - "grad_norm": 0.8066675066947937, - "learning_rate": 7.076474946552828e-05, - "loss": 0.0551, - "step": 71190 - }, - { - "epoch": 4.658161596336277, - "grad_norm": 0.9657180905342102, - "learning_rate": 7.0756392783669e-05, - "loss": 0.067, - "step": 71200 - }, - { - "epoch": 4.658815832515538, - "grad_norm": 0.9008907675743103, - "learning_rate": 7.074803540122703e-05, - "loss": 0.0601, - "step": 71210 - }, - { - "epoch": 4.659470068694799, - "grad_norm": 0.7514058947563171, - "learning_rate": 7.073967731848445e-05, - "loss": 0.0573, - "step": 71220 - }, - { - "epoch": 4.66012430487406, - "grad_norm": 0.8084816336631775, - "learning_rate": 7.073131853572335e-05, - "loss": 0.0641, - "step": 71230 - }, - { - "epoch": 4.66077854105332, - "grad_norm": 0.7935675978660583, - "learning_rate": 7.072295905322592e-05, - "loss": 0.0567, - "step": 71240 - }, - { - "epoch": 4.661432777232581, - "grad_norm": 0.9092238545417786, - "learning_rate": 7.071459887127424e-05, - "loss": 0.0674, - "step": 71250 - }, - { - "epoch": 4.662087013411842, - "grad_norm": 0.9651187062263489, - "learning_rate": 7.070623799015052e-05, - "loss": 0.0626, - "step": 71260 - }, - { - "epoch": 4.662741249591102, - "grad_norm": 0.7449125051498413, - "learning_rate": 7.069787641013699e-05, - "loss": 0.0598, - "step": 71270 - }, - { - "epoch": 4.663395485770363, - "grad_norm": 0.8150971531867981, - "learning_rate": 7.068951413151583e-05, - "loss": 0.0704, - "step": 71280 - }, - { - "epoch": 4.664049721949624, - "grad_norm": 1.1093558073043823, - "learning_rate": 7.06811511545693e-05, - "loss": 0.0529, - "step": 71290 - }, - { - "epoch": 4.664703958128884, - "grad_norm": 0.9424728155136108, - "learning_rate": 7.06727874795797e-05, - "loss": 0.0607, - "step": 71300 - }, - { - "epoch": 4.665358194308145, - "grad_norm": 0.9972444772720337, - "learning_rate": 7.06644231068293e-05, - "loss": 0.0585, - "step": 71310 - }, - { - "epoch": 4.666012430487406, - "grad_norm": 0.8003326654434204, - "learning_rate": 7.065605803660042e-05, - "loss": 0.0623, - "step": 71320 - }, - { - "epoch": 4.666666666666667, - "grad_norm": 0.8432341814041138, - "learning_rate": 7.064769226917543e-05, - "loss": 0.0526, - "step": 71330 - }, - { - "epoch": 4.667320902845927, - "grad_norm": 0.828727126121521, - "learning_rate": 7.063932580483665e-05, - "loss": 0.0643, - "step": 71340 - }, - { - "epoch": 4.667975139025188, - "grad_norm": 0.9466299414634705, - "learning_rate": 7.063095864386651e-05, - "loss": 0.057, - "step": 71350 - }, - { - "epoch": 4.668629375204449, - "grad_norm": 0.6953588128089905, - "learning_rate": 7.06225907865474e-05, - "loss": 0.0522, - "step": 71360 - }, - { - "epoch": 4.66928361138371, - "grad_norm": 0.8251481056213379, - "learning_rate": 7.061422223316176e-05, - "loss": 0.06, - "step": 71370 - }, - { - "epoch": 4.66993784756297, - "grad_norm": 1.085668683052063, - "learning_rate": 7.060585298399207e-05, - "loss": 0.0653, - "step": 71380 - }, - { - "epoch": 4.670592083742231, - "grad_norm": 1.0009715557098389, - "learning_rate": 7.05974830393208e-05, - "loss": 0.065, - "step": 71390 - }, - { - "epoch": 4.671246319921492, - "grad_norm": 0.8641042709350586, - "learning_rate": 7.058911239943046e-05, - "loss": 0.0589, - "step": 71400 - }, - { - "epoch": 4.671900556100752, - "grad_norm": 0.8176802396774292, - "learning_rate": 7.058074106460357e-05, - "loss": 0.058, - "step": 71410 - }, - { - "epoch": 4.672554792280013, - "grad_norm": 0.8100340366363525, - "learning_rate": 7.057236903512269e-05, - "loss": 0.0636, - "step": 71420 - }, - { - "epoch": 4.673209028459274, - "grad_norm": 0.9662539958953857, - "learning_rate": 7.056399631127043e-05, - "loss": 0.0568, - "step": 71430 - }, - { - "epoch": 4.673863264638534, - "grad_norm": 1.049534559249878, - "learning_rate": 7.055562289332934e-05, - "loss": 0.0701, - "step": 71440 - }, - { - "epoch": 4.674517500817795, - "grad_norm": 0.670726478099823, - "learning_rate": 7.054724878158206e-05, - "loss": 0.0551, - "step": 71450 - }, - { - "epoch": 4.675171736997056, - "grad_norm": 1.3592555522918701, - "learning_rate": 7.053887397631127e-05, - "loss": 0.0645, - "step": 71460 - }, - { - "epoch": 4.675825973176317, - "grad_norm": 0.7808588147163391, - "learning_rate": 7.053049847779961e-05, - "loss": 0.0649, - "step": 71470 - }, - { - "epoch": 4.676480209355577, - "grad_norm": 0.8169131875038147, - "learning_rate": 7.052212228632977e-05, - "loss": 0.0547, - "step": 71480 - }, - { - "epoch": 4.677134445534838, - "grad_norm": 0.8290903568267822, - "learning_rate": 7.051374540218451e-05, - "loss": 0.065, - "step": 71490 - }, - { - "epoch": 4.677788681714099, - "grad_norm": 1.1689528226852417, - "learning_rate": 7.050536782564653e-05, - "loss": 0.0707, - "step": 71500 - }, - { - "epoch": 4.67844291789336, - "grad_norm": 0.7976114749908447, - "learning_rate": 7.04969895569986e-05, - "loss": 0.0652, - "step": 71510 - }, - { - "epoch": 4.67909715407262, - "grad_norm": 0.9616780281066895, - "learning_rate": 7.04886105965235e-05, - "loss": 0.061, - "step": 71520 - }, - { - "epoch": 4.679751390251881, - "grad_norm": 0.9938467741012573, - "learning_rate": 7.048023094450411e-05, - "loss": 0.062, - "step": 71530 - }, - { - "epoch": 4.680405626431142, - "grad_norm": 0.8165993690490723, - "learning_rate": 7.047185060122317e-05, - "loss": 0.0564, - "step": 71540 - }, - { - "epoch": 4.681059862610402, - "grad_norm": 0.8592141270637512, - "learning_rate": 7.046346956696359e-05, - "loss": 0.0621, - "step": 71550 - }, - { - "epoch": 4.681714098789663, - "grad_norm": 0.957463800907135, - "learning_rate": 7.045508784200826e-05, - "loss": 0.0575, - "step": 71560 - }, - { - "epoch": 4.682368334968924, - "grad_norm": 0.7497161626815796, - "learning_rate": 7.044670542664007e-05, - "loss": 0.0598, - "step": 71570 - }, - { - "epoch": 4.683022571148184, - "grad_norm": 0.8132786750793457, - "learning_rate": 7.043832232114194e-05, - "loss": 0.0562, - "step": 71580 - }, - { - "epoch": 4.683676807327445, - "grad_norm": 0.8646707534790039, - "learning_rate": 7.042993852579683e-05, - "loss": 0.056, - "step": 71590 - }, - { - "epoch": 4.684331043506706, - "grad_norm": 0.82687908411026, - "learning_rate": 7.042155404088772e-05, - "loss": 0.0574, - "step": 71600 - }, - { - "epoch": 4.684985279685967, - "grad_norm": 0.9083144664764404, - "learning_rate": 7.04131688666976e-05, - "loss": 0.0564, - "step": 71610 - }, - { - "epoch": 4.685639515865227, - "grad_norm": 0.7439766526222229, - "learning_rate": 7.04047830035095e-05, - "loss": 0.06, - "step": 71620 - }, - { - "epoch": 4.686293752044488, - "grad_norm": 0.8073269128799438, - "learning_rate": 7.039639645160646e-05, - "loss": 0.064, - "step": 71630 - }, - { - "epoch": 4.686947988223749, - "grad_norm": 1.0741745233535767, - "learning_rate": 7.038800921127152e-05, - "loss": 0.0638, - "step": 71640 - }, - { - "epoch": 4.687602224403009, - "grad_norm": 0.9227171540260315, - "learning_rate": 7.037962128278783e-05, - "loss": 0.0577, - "step": 71650 - }, - { - "epoch": 4.68825646058227, - "grad_norm": 1.138973355293274, - "learning_rate": 7.037123266643846e-05, - "loss": 0.0701, - "step": 71660 - }, - { - "epoch": 4.688910696761531, - "grad_norm": 1.039119839668274, - "learning_rate": 7.036284336250658e-05, - "loss": 0.0623, - "step": 71670 - }, - { - "epoch": 4.689564932940792, - "grad_norm": 1.0590921640396118, - "learning_rate": 7.035445337127532e-05, - "loss": 0.0638, - "step": 71680 - }, - { - "epoch": 4.690219169120052, - "grad_norm": 0.8854411244392395, - "learning_rate": 7.034606269302789e-05, - "loss": 0.0663, - "step": 71690 - }, - { - "epoch": 4.690873405299313, - "grad_norm": 0.7692389488220215, - "learning_rate": 7.033767132804747e-05, - "loss": 0.0529, - "step": 71700 - }, - { - "epoch": 4.691527641478574, - "grad_norm": 0.6598808765411377, - "learning_rate": 7.03292792766173e-05, - "loss": 0.0584, - "step": 71710 - }, - { - "epoch": 4.692181877657834, - "grad_norm": 0.8510991334915161, - "learning_rate": 7.032088653902067e-05, - "loss": 0.0621, - "step": 71720 - }, - { - "epoch": 4.692836113837095, - "grad_norm": 0.9420493245124817, - "learning_rate": 7.031249311554079e-05, - "loss": 0.0635, - "step": 71730 - }, - { - "epoch": 4.693490350016356, - "grad_norm": 0.8198224306106567, - "learning_rate": 7.0304099006461e-05, - "loss": 0.0673, - "step": 71740 - }, - { - "epoch": 4.694144586195616, - "grad_norm": 1.035709023475647, - "learning_rate": 7.029570421206464e-05, - "loss": 0.064, - "step": 71750 - }, - { - "epoch": 4.694798822374877, - "grad_norm": 0.8997196555137634, - "learning_rate": 7.028730873263502e-05, - "loss": 0.0555, - "step": 71760 - }, - { - "epoch": 4.695453058554138, - "grad_norm": 0.9662396907806396, - "learning_rate": 7.027891256845553e-05, - "loss": 0.0634, - "step": 71770 - }, - { - "epoch": 4.696107294733399, - "grad_norm": 0.7257537245750427, - "learning_rate": 7.027051571980957e-05, - "loss": 0.0614, - "step": 71780 - }, - { - "epoch": 4.696761530912659, - "grad_norm": 0.9215990900993347, - "learning_rate": 7.026211818698053e-05, - "loss": 0.0698, - "step": 71790 - }, - { - "epoch": 4.69741576709192, - "grad_norm": 0.7259654402732849, - "learning_rate": 7.025371997025185e-05, - "loss": 0.0678, - "step": 71800 - }, - { - "epoch": 4.698070003271181, - "grad_norm": 0.9841349124908447, - "learning_rate": 7.0245321069907e-05, - "loss": 0.0576, - "step": 71810 - }, - { - "epoch": 4.698724239450442, - "grad_norm": 0.9428102374076843, - "learning_rate": 7.02369214862295e-05, - "loss": 0.0601, - "step": 71820 - }, - { - "epoch": 4.699378475629702, - "grad_norm": 0.7501752376556396, - "learning_rate": 7.022852121950281e-05, - "loss": 0.0564, - "step": 71830 - }, - { - "epoch": 4.700032711808963, - "grad_norm": 0.705059289932251, - "learning_rate": 7.022012027001048e-05, - "loss": 0.0499, - "step": 71840 - }, - { - "epoch": 4.700686947988224, - "grad_norm": 0.7624207735061646, - "learning_rate": 7.021171863803606e-05, - "loss": 0.072, - "step": 71850 - }, - { - "epoch": 4.701341184167484, - "grad_norm": 0.7773285508155823, - "learning_rate": 7.020331632386312e-05, - "loss": 0.0545, - "step": 71860 - }, - { - "epoch": 4.701995420346745, - "grad_norm": 1.0297824144363403, - "learning_rate": 7.019491332777528e-05, - "loss": 0.0549, - "step": 71870 - }, - { - "epoch": 4.702649656526006, - "grad_norm": 0.9911094307899475, - "learning_rate": 7.018650965005616e-05, - "loss": 0.0671, - "step": 71880 - }, - { - "epoch": 4.703303892705266, - "grad_norm": 0.960195779800415, - "learning_rate": 7.017810529098938e-05, - "loss": 0.0664, - "step": 71890 - }, - { - "epoch": 4.703958128884527, - "grad_norm": 0.7779977321624756, - "learning_rate": 7.016970025085864e-05, - "loss": 0.0593, - "step": 71900 - }, - { - "epoch": 4.704612365063788, - "grad_norm": 0.8631134629249573, - "learning_rate": 7.016129452994761e-05, - "loss": 0.06, - "step": 71910 - }, - { - "epoch": 4.705266601243049, - "grad_norm": 0.854272186756134, - "learning_rate": 7.015288812854003e-05, - "loss": 0.0593, - "step": 71920 - }, - { - "epoch": 4.705920837422309, - "grad_norm": 1.105404257774353, - "learning_rate": 7.01444810469196e-05, - "loss": 0.0675, - "step": 71930 - }, - { - "epoch": 4.70657507360157, - "grad_norm": 0.8611233234405518, - "learning_rate": 7.013607328537012e-05, - "loss": 0.0567, - "step": 71940 - }, - { - "epoch": 4.707229309780831, - "grad_norm": 0.8065164089202881, - "learning_rate": 7.012766484417536e-05, - "loss": 0.065, - "step": 71950 - }, - { - "epoch": 4.707883545960092, - "grad_norm": 0.853921115398407, - "learning_rate": 7.011925572361912e-05, - "loss": 0.0626, - "step": 71960 - }, - { - "epoch": 4.708537782139352, - "grad_norm": 0.850261926651001, - "learning_rate": 7.011084592398523e-05, - "loss": 0.0757, - "step": 71970 - }, - { - "epoch": 4.709192018318613, - "grad_norm": 1.0401116609573364, - "learning_rate": 7.010243544555756e-05, - "loss": 0.0597, - "step": 71980 - }, - { - "epoch": 4.709846254497874, - "grad_norm": 1.0012506246566772, - "learning_rate": 7.009402428861995e-05, - "loss": 0.0637, - "step": 71990 - }, - { - "epoch": 4.710500490677134, - "grad_norm": 0.854714572429657, - "learning_rate": 7.008561245345634e-05, - "loss": 0.0605, - "step": 72000 - }, - { - "epoch": 4.711154726856395, - "grad_norm": 1.0572654008865356, - "learning_rate": 7.007719994035063e-05, - "loss": 0.0718, - "step": 72010 - }, - { - "epoch": 4.711808963035656, - "grad_norm": 0.9308390021324158, - "learning_rate": 7.006878674958676e-05, - "loss": 0.0705, - "step": 72020 - }, - { - "epoch": 4.712463199214916, - "grad_norm": 0.945894181728363, - "learning_rate": 7.006037288144872e-05, - "loss": 0.0676, - "step": 72030 - }, - { - "epoch": 4.713117435394177, - "grad_norm": 1.6244274377822876, - "learning_rate": 7.005195833622048e-05, - "loss": 0.059, - "step": 72040 - }, - { - "epoch": 4.713771671573438, - "grad_norm": 0.9291044473648071, - "learning_rate": 7.004354311418606e-05, - "loss": 0.0635, - "step": 72050 - }, - { - "epoch": 4.714425907752699, - "grad_norm": 0.9611480832099915, - "learning_rate": 7.003512721562949e-05, - "loss": 0.0566, - "step": 72060 - }, - { - "epoch": 4.715080143931959, - "grad_norm": 0.7825486660003662, - "learning_rate": 7.002671064083482e-05, - "loss": 0.0575, - "step": 72070 - }, - { - "epoch": 4.71573438011122, - "grad_norm": 0.8755865693092346, - "learning_rate": 7.001829339008616e-05, - "loss": 0.065, - "step": 72080 - }, - { - "epoch": 4.716388616290481, - "grad_norm": 0.8769670724868774, - "learning_rate": 7.000987546366758e-05, - "loss": 0.068, - "step": 72090 - }, - { - "epoch": 4.717042852469741, - "grad_norm": 0.7498029470443726, - "learning_rate": 7.000145686186324e-05, - "loss": 0.0537, - "step": 72100 - }, - { - "epoch": 4.717697088649002, - "grad_norm": 1.0412774085998535, - "learning_rate": 6.999303758495727e-05, - "loss": 0.0519, - "step": 72110 - }, - { - "epoch": 4.718351324828263, - "grad_norm": 0.8896167278289795, - "learning_rate": 6.998461763323385e-05, - "loss": 0.0576, - "step": 72120 - }, - { - "epoch": 4.719005561007524, - "grad_norm": 1.0666289329528809, - "learning_rate": 6.997619700697719e-05, - "loss": 0.0625, - "step": 72130 - }, - { - "epoch": 4.719659797186784, - "grad_norm": 1.330458164215088, - "learning_rate": 6.996777570647147e-05, - "loss": 0.07, - "step": 72140 - }, - { - "epoch": 4.720314033366045, - "grad_norm": 1.0409564971923828, - "learning_rate": 6.995935373200095e-05, - "loss": 0.0543, - "step": 72150 - }, - { - "epoch": 4.720968269545306, - "grad_norm": 0.8398634195327759, - "learning_rate": 6.995093108384992e-05, - "loss": 0.0582, - "step": 72160 - }, - { - "epoch": 4.721622505724566, - "grad_norm": 0.9149483442306519, - "learning_rate": 6.994250776230262e-05, - "loss": 0.0605, - "step": 72170 - }, - { - "epoch": 4.722276741903827, - "grad_norm": 0.8937086462974548, - "learning_rate": 6.993408376764339e-05, - "loss": 0.0579, - "step": 72180 - }, - { - "epoch": 4.722930978083088, - "grad_norm": 0.8356661796569824, - "learning_rate": 6.992565910015655e-05, - "loss": 0.0569, - "step": 72190 - }, - { - "epoch": 4.723585214262348, - "grad_norm": 1.0383862257003784, - "learning_rate": 6.991723376012646e-05, - "loss": 0.0665, - "step": 72200 - }, - { - "epoch": 4.724239450441609, - "grad_norm": 0.9304238557815552, - "learning_rate": 6.99088077478375e-05, - "loss": 0.054, - "step": 72210 - }, - { - "epoch": 4.72489368662087, - "grad_norm": 1.0787382125854492, - "learning_rate": 6.990038106357407e-05, - "loss": 0.059, - "step": 72220 - }, - { - "epoch": 4.725547922800131, - "grad_norm": 0.9750184416770935, - "learning_rate": 6.989195370762057e-05, - "loss": 0.0621, - "step": 72230 - }, - { - "epoch": 4.726202158979391, - "grad_norm": 0.9108086824417114, - "learning_rate": 6.988352568026148e-05, - "loss": 0.0576, - "step": 72240 - }, - { - "epoch": 4.726856395158652, - "grad_norm": 1.0215849876403809, - "learning_rate": 6.987509698178125e-05, - "loss": 0.0619, - "step": 72250 - }, - { - "epoch": 4.727510631337913, - "grad_norm": 1.3526380062103271, - "learning_rate": 6.986666761246436e-05, - "loss": 0.0573, - "step": 72260 - }, - { - "epoch": 4.728164867517174, - "grad_norm": 0.9454095363616943, - "learning_rate": 6.985823757259535e-05, - "loss": 0.0601, - "step": 72270 - }, - { - "epoch": 4.728819103696434, - "grad_norm": 1.032045841217041, - "learning_rate": 6.984980686245874e-05, - "loss": 0.0577, - "step": 72280 - }, - { - "epoch": 4.729473339875695, - "grad_norm": 0.8855615854263306, - "learning_rate": 6.98413754823391e-05, - "loss": 0.0683, - "step": 72290 - }, - { - "epoch": 4.730127576054956, - "grad_norm": 0.9899473786354065, - "learning_rate": 6.983294343252098e-05, - "loss": 0.0705, - "step": 72300 - }, - { - "epoch": 4.730781812234216, - "grad_norm": 0.9037162065505981, - "learning_rate": 6.982451071328902e-05, - "loss": 0.0659, - "step": 72310 - }, - { - "epoch": 4.731436048413477, - "grad_norm": 0.9520037174224854, - "learning_rate": 6.98160773249278e-05, - "loss": 0.0558, - "step": 72320 - }, - { - "epoch": 4.732090284592738, - "grad_norm": 0.6964289546012878, - "learning_rate": 6.980764326772204e-05, - "loss": 0.054, - "step": 72330 - }, - { - "epoch": 4.732744520771998, - "grad_norm": 1.047905683517456, - "learning_rate": 6.979920854195637e-05, - "loss": 0.0623, - "step": 72340 - }, - { - "epoch": 4.733398756951259, - "grad_norm": 1.0188045501708984, - "learning_rate": 6.979077314791546e-05, - "loss": 0.0634, - "step": 72350 - }, - { - "epoch": 4.73405299313052, - "grad_norm": 0.7242242097854614, - "learning_rate": 6.978233708588407e-05, - "loss": 0.0592, - "step": 72360 - }, - { - "epoch": 4.734707229309781, - "grad_norm": 1.0020005702972412, - "learning_rate": 6.977390035614692e-05, - "loss": 0.0608, - "step": 72370 - }, - { - "epoch": 4.735361465489041, - "grad_norm": 0.9756394028663635, - "learning_rate": 6.976546295898878e-05, - "loss": 0.0702, - "step": 72380 - }, - { - "epoch": 4.736015701668302, - "grad_norm": 0.9397419095039368, - "learning_rate": 6.975702489469442e-05, - "loss": 0.0573, - "step": 72390 - }, - { - "epoch": 4.736669937847563, - "grad_norm": 0.9236851334571838, - "learning_rate": 6.974858616354867e-05, - "loss": 0.0593, - "step": 72400 - }, - { - "epoch": 4.737324174026824, - "grad_norm": 0.7359132170677185, - "learning_rate": 6.974014676583632e-05, - "loss": 0.0565, - "step": 72410 - }, - { - "epoch": 4.737978410206084, - "grad_norm": 1.030502438545227, - "learning_rate": 6.973170670184226e-05, - "loss": 0.0568, - "step": 72420 - }, - { - "epoch": 4.738632646385345, - "grad_norm": 0.7706143856048584, - "learning_rate": 6.972326597185136e-05, - "loss": 0.0571, - "step": 72430 - }, - { - "epoch": 4.739286882564606, - "grad_norm": 0.8049507737159729, - "learning_rate": 6.971482457614848e-05, - "loss": 0.0619, - "step": 72440 - }, - { - "epoch": 4.739941118743866, - "grad_norm": 0.9160942435264587, - "learning_rate": 6.970638251501859e-05, - "loss": 0.0699, - "step": 72450 - }, - { - "epoch": 4.740595354923127, - "grad_norm": 0.8005819916725159, - "learning_rate": 6.96979397887466e-05, - "loss": 0.0572, - "step": 72460 - }, - { - "epoch": 4.741249591102388, - "grad_norm": 0.9010353088378906, - "learning_rate": 6.96894963976175e-05, - "loss": 0.06, - "step": 72470 - }, - { - "epoch": 4.741903827281648, - "grad_norm": 0.8157947659492493, - "learning_rate": 6.968105234191623e-05, - "loss": 0.0609, - "step": 72480 - }, - { - "epoch": 4.742558063460909, - "grad_norm": 0.968302309513092, - "learning_rate": 6.967260762192785e-05, - "loss": 0.0542, - "step": 72490 - }, - { - "epoch": 4.74321229964017, - "grad_norm": 1.0909911394119263, - "learning_rate": 6.966416223793736e-05, - "loss": 0.0624, - "step": 72500 - }, - { - "epoch": 4.743866535819431, - "grad_norm": 0.7629793286323547, - "learning_rate": 6.965571619022981e-05, - "loss": 0.0592, - "step": 72510 - }, - { - "epoch": 4.744520771998691, - "grad_norm": 0.7732744216918945, - "learning_rate": 6.964726947909031e-05, - "loss": 0.0632, - "step": 72520 - }, - { - "epoch": 4.745175008177952, - "grad_norm": 0.8113254904747009, - "learning_rate": 6.963882210480394e-05, - "loss": 0.0632, - "step": 72530 - }, - { - "epoch": 4.745829244357213, - "grad_norm": 1.129671573638916, - "learning_rate": 6.963037406765581e-05, - "loss": 0.0577, - "step": 72540 - }, - { - "epoch": 4.746483480536473, - "grad_norm": 0.8630444407463074, - "learning_rate": 6.962192536793106e-05, - "loss": 0.0709, - "step": 72550 - }, - { - "epoch": 4.747137716715734, - "grad_norm": 0.8324365019798279, - "learning_rate": 6.961347600591489e-05, - "loss": 0.057, - "step": 72560 - }, - { - "epoch": 4.747791952894995, - "grad_norm": 0.919032633304596, - "learning_rate": 6.960502598189245e-05, - "loss": 0.0581, - "step": 72570 - }, - { - "epoch": 4.748446189074256, - "grad_norm": 1.0946227312088013, - "learning_rate": 6.959657529614898e-05, - "loss": 0.0643, - "step": 72580 - }, - { - "epoch": 4.749100425253516, - "grad_norm": 0.8766310214996338, - "learning_rate": 6.95881239489697e-05, - "loss": 0.0577, - "step": 72590 - }, - { - "epoch": 4.749754661432777, - "grad_norm": 0.9453266859054565, - "learning_rate": 6.957967194063985e-05, - "loss": 0.0631, - "step": 72600 - }, - { - "epoch": 4.750408897612038, - "grad_norm": 0.74550861120224, - "learning_rate": 6.957121927144474e-05, - "loss": 0.0627, - "step": 72610 - }, - { - "epoch": 4.751063133791298, - "grad_norm": 0.7826007008552551, - "learning_rate": 6.956276594166964e-05, - "loss": 0.0572, - "step": 72620 - }, - { - "epoch": 4.751717369970559, - "grad_norm": 0.9839434027671814, - "learning_rate": 6.955431195159989e-05, - "loss": 0.0724, - "step": 72630 - }, - { - "epoch": 4.75237160614982, - "grad_norm": 0.808029294013977, - "learning_rate": 6.954585730152083e-05, - "loss": 0.0634, - "step": 72640 - }, - { - "epoch": 4.75302584232908, - "grad_norm": 0.935702383518219, - "learning_rate": 6.953740199171782e-05, - "loss": 0.0734, - "step": 72650 - }, - { - "epoch": 4.753680078508341, - "grad_norm": 1.2413403987884521, - "learning_rate": 6.952894602247626e-05, - "loss": 0.0655, - "step": 72660 - }, - { - "epoch": 4.754334314687602, - "grad_norm": 1.0404640436172485, - "learning_rate": 6.952048939408156e-05, - "loss": 0.0731, - "step": 72670 - }, - { - "epoch": 4.754988550866863, - "grad_norm": 0.900425136089325, - "learning_rate": 6.951203210681914e-05, - "loss": 0.0559, - "step": 72680 - }, - { - "epoch": 4.755642787046123, - "grad_norm": 0.9717649817466736, - "learning_rate": 6.950357416097446e-05, - "loss": 0.0594, - "step": 72690 - }, - { - "epoch": 4.756297023225384, - "grad_norm": 1.1071817874908447, - "learning_rate": 6.949511555683301e-05, - "loss": 0.0638, - "step": 72700 - }, - { - "epoch": 4.756951259404645, - "grad_norm": 0.9194473028182983, - "learning_rate": 6.948665629468027e-05, - "loss": 0.0655, - "step": 72710 - }, - { - "epoch": 4.757605495583906, - "grad_norm": 0.7610183358192444, - "learning_rate": 6.94781963748018e-05, - "loss": 0.0632, - "step": 72720 - }, - { - "epoch": 4.758259731763166, - "grad_norm": 0.8666313886642456, - "learning_rate": 6.946973579748309e-05, - "loss": 0.0659, - "step": 72730 - }, - { - "epoch": 4.758913967942427, - "grad_norm": 0.8880037665367126, - "learning_rate": 6.946127456300974e-05, - "loss": 0.062, - "step": 72740 - }, - { - "epoch": 4.759568204121688, - "grad_norm": 0.8450115323066711, - "learning_rate": 6.945281267166736e-05, - "loss": 0.072, - "step": 72750 - }, - { - "epoch": 4.760222440300948, - "grad_norm": 1.100077509880066, - "learning_rate": 6.94443501237415e-05, - "loss": 0.0661, - "step": 72760 - }, - { - "epoch": 4.760876676480209, - "grad_norm": 0.956047534942627, - "learning_rate": 6.943588691951785e-05, - "loss": 0.0607, - "step": 72770 - }, - { - "epoch": 4.76153091265947, - "grad_norm": 0.7662896513938904, - "learning_rate": 6.942742305928205e-05, - "loss": 0.0627, - "step": 72780 - }, - { - "epoch": 4.76218514883873, - "grad_norm": 0.9403966069221497, - "learning_rate": 6.941895854331977e-05, - "loss": 0.0646, - "step": 72790 - }, - { - "epoch": 4.762839385017991, - "grad_norm": 0.918593168258667, - "learning_rate": 6.94104933719167e-05, - "loss": 0.0521, - "step": 72800 - }, - { - "epoch": 4.763493621197252, - "grad_norm": 0.8933160901069641, - "learning_rate": 6.940202754535856e-05, - "loss": 0.0629, - "step": 72810 - }, - { - "epoch": 4.764147857376513, - "grad_norm": 0.8830978274345398, - "learning_rate": 6.939356106393113e-05, - "loss": 0.059, - "step": 72820 - }, - { - "epoch": 4.764802093555773, - "grad_norm": 0.9082246422767639, - "learning_rate": 6.938509392792016e-05, - "loss": 0.057, - "step": 72830 - }, - { - "epoch": 4.765456329735034, - "grad_norm": 1.0356862545013428, - "learning_rate": 6.93766261376114e-05, - "loss": 0.0589, - "step": 72840 - }, - { - "epoch": 4.766110565914295, - "grad_norm": 0.8933234810829163, - "learning_rate": 6.936815769329071e-05, - "loss": 0.0596, - "step": 72850 - }, - { - "epoch": 4.766764802093556, - "grad_norm": 0.8427624106407166, - "learning_rate": 6.935968859524389e-05, - "loss": 0.0563, - "step": 72860 - }, - { - "epoch": 4.767419038272816, - "grad_norm": 0.8006119728088379, - "learning_rate": 6.935121884375683e-05, - "loss": 0.0591, - "step": 72870 - }, - { - "epoch": 4.768073274452077, - "grad_norm": 0.7767458558082581, - "learning_rate": 6.934274843911537e-05, - "loss": 0.0603, - "step": 72880 - }, - { - "epoch": 4.768727510631338, - "grad_norm": 0.8365658521652222, - "learning_rate": 6.933427738160542e-05, - "loss": 0.0648, - "step": 72890 - }, - { - "epoch": 4.769381746810598, - "grad_norm": 0.9282712340354919, - "learning_rate": 6.93258056715129e-05, - "loss": 0.0722, - "step": 72900 - }, - { - "epoch": 4.770035982989859, - "grad_norm": 0.7623592615127563, - "learning_rate": 6.931733330912375e-05, - "loss": 0.0618, - "step": 72910 - }, - { - "epoch": 4.77069021916912, - "grad_norm": 0.8280691504478455, - "learning_rate": 6.930886029472396e-05, - "loss": 0.0585, - "step": 72920 - }, - { - "epoch": 4.77134445534838, - "grad_norm": 0.8656930923461914, - "learning_rate": 6.930038662859947e-05, - "loss": 0.06, - "step": 72930 - }, - { - "epoch": 4.771998691527641, - "grad_norm": 0.8957453370094299, - "learning_rate": 6.929191231103634e-05, - "loss": 0.0626, - "step": 72940 - }, - { - "epoch": 4.772652927706902, - "grad_norm": 0.8083595037460327, - "learning_rate": 6.928343734232057e-05, - "loss": 0.0679, - "step": 72950 - }, - { - "epoch": 4.773307163886163, - "grad_norm": 0.9139804244041443, - "learning_rate": 6.92749617227382e-05, - "loss": 0.0612, - "step": 72960 - }, - { - "epoch": 4.773961400065423, - "grad_norm": 0.9249205589294434, - "learning_rate": 6.926648545257534e-05, - "loss": 0.0599, - "step": 72970 - }, - { - "epoch": 4.774615636244684, - "grad_norm": 0.9592366218566895, - "learning_rate": 6.925800853211807e-05, - "loss": 0.054, - "step": 72980 - }, - { - "epoch": 4.775269872423945, - "grad_norm": 0.9399287700653076, - "learning_rate": 6.924953096165248e-05, - "loss": 0.0585, - "step": 72990 - }, - { - "epoch": 4.775924108603205, - "grad_norm": 0.8287541270256042, - "learning_rate": 6.924105274146476e-05, - "loss": 0.0568, - "step": 73000 - }, - { - "epoch": 4.776578344782466, - "grad_norm": 0.6379443407058716, - "learning_rate": 6.923257387184103e-05, - "loss": 0.0574, - "step": 73010 - }, - { - "epoch": 4.777232580961727, - "grad_norm": 0.7155212163925171, - "learning_rate": 6.922409435306751e-05, - "loss": 0.0646, - "step": 73020 - }, - { - "epoch": 4.777886817140988, - "grad_norm": 0.8226625919342041, - "learning_rate": 6.921561418543037e-05, - "loss": 0.0576, - "step": 73030 - }, - { - "epoch": 4.778541053320248, - "grad_norm": 0.8837935328483582, - "learning_rate": 6.920713336921588e-05, - "loss": 0.0607, - "step": 73040 - }, - { - "epoch": 4.779195289499509, - "grad_norm": 0.8143121004104614, - "learning_rate": 6.919865190471026e-05, - "loss": 0.0559, - "step": 73050 - }, - { - "epoch": 4.77984952567877, - "grad_norm": 0.7172751426696777, - "learning_rate": 6.919016979219978e-05, - "loss": 0.052, - "step": 73060 - }, - { - "epoch": 4.78050376185803, - "grad_norm": 0.9022568464279175, - "learning_rate": 6.918168703197073e-05, - "loss": 0.0589, - "step": 73070 - }, - { - "epoch": 4.781157998037291, - "grad_norm": 0.9577611684799194, - "learning_rate": 6.917320362430945e-05, - "loss": 0.0654, - "step": 73080 - }, - { - "epoch": 4.781812234216552, - "grad_norm": 0.7758936285972595, - "learning_rate": 6.916471956950228e-05, - "loss": 0.0634, - "step": 73090 - }, - { - "epoch": 4.782466470395812, - "grad_norm": 0.9259306192398071, - "learning_rate": 6.915623486783555e-05, - "loss": 0.064, - "step": 73100 - }, - { - "epoch": 4.783120706575073, - "grad_norm": 0.9896383285522461, - "learning_rate": 6.914774951959565e-05, - "loss": 0.0581, - "step": 73110 - }, - { - "epoch": 4.783774942754334, - "grad_norm": 0.8976383209228516, - "learning_rate": 6.913926352506898e-05, - "loss": 0.0583, - "step": 73120 - }, - { - "epoch": 4.784429178933595, - "grad_norm": 0.9846516251564026, - "learning_rate": 6.913077688454198e-05, - "loss": 0.0639, - "step": 73130 - }, - { - "epoch": 4.785083415112855, - "grad_norm": 0.7982523441314697, - "learning_rate": 6.912228959830109e-05, - "loss": 0.062, - "step": 73140 - }, - { - "epoch": 4.785737651292116, - "grad_norm": 0.7241307497024536, - "learning_rate": 6.911380166663278e-05, - "loss": 0.0607, - "step": 73150 - }, - { - "epoch": 4.786391887471377, - "grad_norm": 0.7322345972061157, - "learning_rate": 6.910531308982353e-05, - "loss": 0.0645, - "step": 73160 - }, - { - "epoch": 4.787046123650638, - "grad_norm": 0.8883731961250305, - "learning_rate": 6.909682386815987e-05, - "loss": 0.0642, - "step": 73170 - }, - { - "epoch": 4.787700359829898, - "grad_norm": 0.8451051115989685, - "learning_rate": 6.908833400192829e-05, - "loss": 0.0635, - "step": 73180 - }, - { - "epoch": 4.788354596009159, - "grad_norm": 1.008467435836792, - "learning_rate": 6.90798434914154e-05, - "loss": 0.0643, - "step": 73190 - }, - { - "epoch": 4.78900883218842, - "grad_norm": 0.964712381362915, - "learning_rate": 6.907135233690774e-05, - "loss": 0.0602, - "step": 73200 - }, - { - "epoch": 4.78966306836768, - "grad_norm": 0.9574010968208313, - "learning_rate": 6.906286053869194e-05, - "loss": 0.0586, - "step": 73210 - }, - { - "epoch": 4.790317304546941, - "grad_norm": 0.9692860245704651, - "learning_rate": 6.90543680970546e-05, - "loss": 0.0646, - "step": 73220 - }, - { - "epoch": 4.790971540726202, - "grad_norm": 0.8585093021392822, - "learning_rate": 6.904587501228236e-05, - "loss": 0.0654, - "step": 73230 - }, - { - "epoch": 4.791625776905462, - "grad_norm": 0.9351073503494263, - "learning_rate": 6.903738128466188e-05, - "loss": 0.0553, - "step": 73240 - }, - { - "epoch": 4.792280013084723, - "grad_norm": 0.7714135050773621, - "learning_rate": 6.902888691447986e-05, - "loss": 0.0557, - "step": 73250 - }, - { - "epoch": 4.792934249263984, - "grad_norm": 0.977641224861145, - "learning_rate": 6.9020391902023e-05, - "loss": 0.0522, - "step": 73260 - }, - { - "epoch": 4.793588485443245, - "grad_norm": 1.0779730081558228, - "learning_rate": 6.901189624757803e-05, - "loss": 0.0606, - "step": 73270 - }, - { - "epoch": 4.794242721622505, - "grad_norm": 0.9080124497413635, - "learning_rate": 6.900339995143172e-05, - "loss": 0.0553, - "step": 73280 - }, - { - "epoch": 4.794896957801766, - "grad_norm": 0.8090987801551819, - "learning_rate": 6.899490301387079e-05, - "loss": 0.0565, - "step": 73290 - }, - { - "epoch": 4.795551193981027, - "grad_norm": 1.0443229675292969, - "learning_rate": 6.89864054351821e-05, - "loss": 0.0588, - "step": 73300 - }, - { - "epoch": 4.796205430160288, - "grad_norm": 0.7326402068138123, - "learning_rate": 6.897790721565243e-05, - "loss": 0.0569, - "step": 73310 - }, - { - "epoch": 4.796859666339548, - "grad_norm": 0.8527650237083435, - "learning_rate": 6.89694083555686e-05, - "loss": 0.0687, - "step": 73320 - }, - { - "epoch": 4.797513902518809, - "grad_norm": 1.0699238777160645, - "learning_rate": 6.896090885521749e-05, - "loss": 0.0616, - "step": 73330 - }, - { - "epoch": 4.79816813869807, - "grad_norm": 0.9271957874298096, - "learning_rate": 6.895240871488599e-05, - "loss": 0.062, - "step": 73340 - }, - { - "epoch": 4.79882237487733, - "grad_norm": 0.6596797704696655, - "learning_rate": 6.894390793486098e-05, - "loss": 0.0538, - "step": 73350 - }, - { - "epoch": 4.799476611056591, - "grad_norm": 0.8873812556266785, - "learning_rate": 6.89354065154294e-05, - "loss": 0.0678, - "step": 73360 - }, - { - "epoch": 4.800130847235852, - "grad_norm": 1.0261344909667969, - "learning_rate": 6.89269044568782e-05, - "loss": 0.0636, - "step": 73370 - }, - { - "epoch": 4.8007850834151125, - "grad_norm": 0.9206944108009338, - "learning_rate": 6.891840175949432e-05, - "loss": 0.0535, - "step": 73380 - }, - { - "epoch": 4.801439319594373, - "grad_norm": 0.9742897152900696, - "learning_rate": 6.890989842356479e-05, - "loss": 0.0599, - "step": 73390 - }, - { - "epoch": 4.802093555773634, - "grad_norm": 0.8803852796554565, - "learning_rate": 6.890139444937657e-05, - "loss": 0.066, - "step": 73400 - }, - { - "epoch": 4.802747791952895, - "grad_norm": 1.0795328617095947, - "learning_rate": 6.889288983721673e-05, - "loss": 0.0587, - "step": 73410 - }, - { - "epoch": 4.803402028132155, - "grad_norm": 0.8851546049118042, - "learning_rate": 6.888438458737232e-05, - "loss": 0.0598, - "step": 73420 - }, - { - "epoch": 4.804056264311416, - "grad_norm": 0.9637795686721802, - "learning_rate": 6.887587870013039e-05, - "loss": 0.0534, - "step": 73430 - }, - { - "epoch": 4.804710500490677, - "grad_norm": 1.036117434501648, - "learning_rate": 6.886737217577805e-05, - "loss": 0.0589, - "step": 73440 - }, - { - "epoch": 4.8053647366699375, - "grad_norm": 0.8892022967338562, - "learning_rate": 6.88588650146024e-05, - "loss": 0.058, - "step": 73450 - }, - { - "epoch": 4.806018972849198, - "grad_norm": 0.7642195224761963, - "learning_rate": 6.885035721689062e-05, - "loss": 0.0707, - "step": 73460 - }, - { - "epoch": 4.806673209028459, - "grad_norm": 0.8952467441558838, - "learning_rate": 6.884184878292985e-05, - "loss": 0.0618, - "step": 73470 - }, - { - "epoch": 4.80732744520772, - "grad_norm": 0.7745544910430908, - "learning_rate": 6.883333971300725e-05, - "loss": 0.0601, - "step": 73480 - }, - { - "epoch": 4.8079816813869805, - "grad_norm": 0.8764887452125549, - "learning_rate": 6.882483000741007e-05, - "loss": 0.0614, - "step": 73490 - }, - { - "epoch": 4.808635917566241, - "grad_norm": 0.8772285580635071, - "learning_rate": 6.881631966642549e-05, - "loss": 0.0532, - "step": 73500 - }, - { - "epoch": 4.809290153745502, - "grad_norm": 1.0648410320281982, - "learning_rate": 6.880780869034077e-05, - "loss": 0.0655, - "step": 73510 - }, - { - "epoch": 4.8099443899247625, - "grad_norm": 1.125880479812622, - "learning_rate": 6.879929707944317e-05, - "loss": 0.0671, - "step": 73520 - }, - { - "epoch": 4.8105986261040234, - "grad_norm": 0.867889404296875, - "learning_rate": 6.879078483402002e-05, - "loss": 0.0512, - "step": 73530 - }, - { - "epoch": 4.811252862283284, - "grad_norm": 0.8720964789390564, - "learning_rate": 6.878227195435859e-05, - "loss": 0.0601, - "step": 73540 - }, - { - "epoch": 4.8119070984625445, - "grad_norm": 0.8965845108032227, - "learning_rate": 6.877375844074622e-05, - "loss": 0.0582, - "step": 73550 - }, - { - "epoch": 4.8125613346418055, - "grad_norm": 0.7062650322914124, - "learning_rate": 6.876524429347027e-05, - "loss": 0.0674, - "step": 73560 - }, - { - "epoch": 4.813215570821066, - "grad_norm": 1.128409743309021, - "learning_rate": 6.87567295128181e-05, - "loss": 0.0603, - "step": 73570 - }, - { - "epoch": 4.813869807000327, - "grad_norm": 0.952961802482605, - "learning_rate": 6.874821409907713e-05, - "loss": 0.0563, - "step": 73580 - }, - { - "epoch": 4.8145240431795875, - "grad_norm": 0.8573753833770752, - "learning_rate": 6.873969805253477e-05, - "loss": 0.06, - "step": 73590 - }, - { - "epoch": 4.8151782793588485, - "grad_norm": 0.8490943312644958, - "learning_rate": 6.873118137347844e-05, - "loss": 0.0584, - "step": 73600 - }, - { - "epoch": 4.815832515538109, - "grad_norm": 0.9143913388252258, - "learning_rate": 6.872266406219562e-05, - "loss": 0.0739, - "step": 73610 - }, - { - "epoch": 4.81648675171737, - "grad_norm": 0.973491370677948, - "learning_rate": 6.871414611897379e-05, - "loss": 0.0572, - "step": 73620 - }, - { - "epoch": 4.8171409878966305, - "grad_norm": 0.913781464099884, - "learning_rate": 6.870562754410044e-05, - "loss": 0.0646, - "step": 73630 - }, - { - "epoch": 4.8177952240758914, - "grad_norm": 1.1292165517807007, - "learning_rate": 6.86971083378631e-05, - "loss": 0.0538, - "step": 73640 - }, - { - "epoch": 4.818449460255152, - "grad_norm": 0.815707802772522, - "learning_rate": 6.868858850054933e-05, - "loss": 0.0577, - "step": 73650 - }, - { - "epoch": 4.8191036964344125, - "grad_norm": 1.1895596981048584, - "learning_rate": 6.868006803244669e-05, - "loss": 0.0534, - "step": 73660 - }, - { - "epoch": 4.8197579326136735, - "grad_norm": 0.672910749912262, - "learning_rate": 6.867154693384275e-05, - "loss": 0.053, - "step": 73670 - }, - { - "epoch": 4.820412168792934, - "grad_norm": 1.2378745079040527, - "learning_rate": 6.866302520502515e-05, - "loss": 0.0745, - "step": 73680 - }, - { - "epoch": 4.8210664049721945, - "grad_norm": 1.0225582122802734, - "learning_rate": 6.865450284628148e-05, - "loss": 0.0637, - "step": 73690 - }, - { - "epoch": 4.8217206411514555, - "grad_norm": 0.8776040077209473, - "learning_rate": 6.864597985789944e-05, - "loss": 0.0664, - "step": 73700 - }, - { - "epoch": 4.8223748773307165, - "grad_norm": 0.7907946705818176, - "learning_rate": 6.863745624016666e-05, - "loss": 0.078, - "step": 73710 - }, - { - "epoch": 4.823029113509977, - "grad_norm": 0.7620204091072083, - "learning_rate": 6.862893199337087e-05, - "loss": 0.0517, - "step": 73720 - }, - { - "epoch": 4.8236833496892375, - "grad_norm": 0.7737089991569519, - "learning_rate": 6.862040711779976e-05, - "loss": 0.0583, - "step": 73730 - }, - { - "epoch": 4.8243375858684985, - "grad_norm": 0.7591310739517212, - "learning_rate": 6.861188161374106e-05, - "loss": 0.0602, - "step": 73740 - }, - { - "epoch": 4.8249918220477594, - "grad_norm": 0.8480481505393982, - "learning_rate": 6.860335548148257e-05, - "loss": 0.0603, - "step": 73750 - }, - { - "epoch": 4.82564605822702, - "grad_norm": 0.8201214671134949, - "learning_rate": 6.859482872131203e-05, - "loss": 0.0617, - "step": 73760 - }, - { - "epoch": 4.8263002944062805, - "grad_norm": 1.0084785223007202, - "learning_rate": 6.858630133351726e-05, - "loss": 0.0647, - "step": 73770 - }, - { - "epoch": 4.8269545305855415, - "grad_norm": 0.8590355515480042, - "learning_rate": 6.857777331838607e-05, - "loss": 0.0621, - "step": 73780 - }, - { - "epoch": 4.827608766764802, - "grad_norm": 1.0885320901870728, - "learning_rate": 6.856924467620631e-05, - "loss": 0.0556, - "step": 73790 - }, - { - "epoch": 4.8282630029440625, - "grad_norm": 0.7727965116500854, - "learning_rate": 6.856071540726585e-05, - "loss": 0.0552, - "step": 73800 - }, - { - "epoch": 4.8289172391233235, - "grad_norm": 0.9133251309394836, - "learning_rate": 6.855218551185255e-05, - "loss": 0.0601, - "step": 73810 - }, - { - "epoch": 4.8295714753025845, - "grad_norm": 0.782835841178894, - "learning_rate": 6.854365499025435e-05, - "loss": 0.0563, - "step": 73820 - }, - { - "epoch": 4.8302257114818445, - "grad_norm": 1.1449490785598755, - "learning_rate": 6.853512384275916e-05, - "loss": 0.0622, - "step": 73830 - }, - { - "epoch": 4.8308799476611055, - "grad_norm": 0.7554023265838623, - "learning_rate": 6.852659206965493e-05, - "loss": 0.0579, - "step": 73840 - }, - { - "epoch": 4.8315341838403665, - "grad_norm": 1.0752381086349487, - "learning_rate": 6.851805967122962e-05, - "loss": 0.057, - "step": 73850 - }, - { - "epoch": 4.8321884200196275, - "grad_norm": 0.9013246893882751, - "learning_rate": 6.850952664777124e-05, - "loss": 0.0545, - "step": 73860 - }, - { - "epoch": 4.8328426561988875, - "grad_norm": 0.8916118741035461, - "learning_rate": 6.85009929995678e-05, - "loss": 0.0527, - "step": 73870 - }, - { - "epoch": 4.8334968923781485, - "grad_norm": 0.8229614496231079, - "learning_rate": 6.849245872690731e-05, - "loss": 0.0666, - "step": 73880 - }, - { - "epoch": 4.8341511285574095, - "grad_norm": 0.7096636891365051, - "learning_rate": 6.848392383007784e-05, - "loss": 0.0589, - "step": 73890 - }, - { - "epoch": 4.8348053647366696, - "grad_norm": 0.8488548398017883, - "learning_rate": 6.847538830936746e-05, - "loss": 0.0672, - "step": 73900 - }, - { - "epoch": 4.8354596009159305, - "grad_norm": 0.8226892948150635, - "learning_rate": 6.84668521650643e-05, - "loss": 0.0587, - "step": 73910 - }, - { - "epoch": 4.8361138370951915, - "grad_norm": 0.7988223433494568, - "learning_rate": 6.845831539745643e-05, - "loss": 0.0618, - "step": 73920 - }, - { - "epoch": 4.8367680732744525, - "grad_norm": 0.9851675033569336, - "learning_rate": 6.8449778006832e-05, - "loss": 0.057, - "step": 73930 - }, - { - "epoch": 4.8374223094537125, - "grad_norm": 1.0461887121200562, - "learning_rate": 6.84412399934792e-05, - "loss": 0.0602, - "step": 73940 - }, - { - "epoch": 4.8380765456329735, - "grad_norm": 1.085433006286621, - "learning_rate": 6.843270135768616e-05, - "loss": 0.0609, - "step": 73950 - }, - { - "epoch": 4.8387307818122345, - "grad_norm": 0.9901970028877258, - "learning_rate": 6.842416209974111e-05, - "loss": 0.0498, - "step": 73960 - }, - { - "epoch": 4.839385017991495, - "grad_norm": 0.7033235430717468, - "learning_rate": 6.841562221993228e-05, - "loss": 0.0648, - "step": 73970 - }, - { - "epoch": 4.8400392541707555, - "grad_norm": 0.8223658800125122, - "learning_rate": 6.84070817185479e-05, - "loss": 0.0568, - "step": 73980 - }, - { - "epoch": 4.8406934903500165, - "grad_norm": 0.8278485536575317, - "learning_rate": 6.839854059587624e-05, - "loss": 0.062, - "step": 73990 - }, - { - "epoch": 4.841347726529277, - "grad_norm": 0.8352931141853333, - "learning_rate": 6.838999885220558e-05, - "loss": 0.0645, - "step": 74000 - }, - { - "epoch": 4.8420019627085376, - "grad_norm": 1.1042554378509521, - "learning_rate": 6.838145648782422e-05, - "loss": 0.0612, - "step": 74010 - }, - { - "epoch": 4.8426561988877985, - "grad_norm": 0.8824191689491272, - "learning_rate": 6.837291350302052e-05, - "loss": 0.0611, - "step": 74020 - }, - { - "epoch": 4.8433104350670595, - "grad_norm": 0.9088771939277649, - "learning_rate": 6.836436989808278e-05, - "loss": 0.0644, - "step": 74030 - }, - { - "epoch": 4.84396467124632, - "grad_norm": 0.8344402313232422, - "learning_rate": 6.835582567329942e-05, - "loss": 0.0568, - "step": 74040 - }, - { - "epoch": 4.8446189074255805, - "grad_norm": 0.8632908463478088, - "learning_rate": 6.834728082895878e-05, - "loss": 0.0596, - "step": 74050 - }, - { - "epoch": 4.8452731436048415, - "grad_norm": 0.7749559879302979, - "learning_rate": 6.83387353653493e-05, - "loss": 0.062, - "step": 74060 - }, - { - "epoch": 4.8459273797841025, - "grad_norm": 0.9213570952415466, - "learning_rate": 6.83301892827594e-05, - "loss": 0.0571, - "step": 74070 - }, - { - "epoch": 4.846581615963363, - "grad_norm": 0.7469812035560608, - "learning_rate": 6.832164258147756e-05, - "loss": 0.0528, - "step": 74080 - }, - { - "epoch": 4.8472358521426235, - "grad_norm": 0.6800960898399353, - "learning_rate": 6.831309526179221e-05, - "loss": 0.063, - "step": 74090 - }, - { - "epoch": 4.8478900883218845, - "grad_norm": 0.8719648122787476, - "learning_rate": 6.830454732399188e-05, - "loss": 0.0532, - "step": 74100 - }, - { - "epoch": 4.848544324501145, - "grad_norm": 0.9584552645683289, - "learning_rate": 6.829599876836507e-05, - "loss": 0.0636, - "step": 74110 - }, - { - "epoch": 4.849198560680406, - "grad_norm": 0.9922688603401184, - "learning_rate": 6.828744959520031e-05, - "loss": 0.0601, - "step": 74120 - }, - { - "epoch": 4.8498527968596665, - "grad_norm": 0.8676353693008423, - "learning_rate": 6.82788998047862e-05, - "loss": 0.0577, - "step": 74130 - }, - { - "epoch": 4.850507033038927, - "grad_norm": 0.9001226425170898, - "learning_rate": 6.827034939741125e-05, - "loss": 0.0597, - "step": 74140 - }, - { - "epoch": 4.851161269218188, - "grad_norm": 0.8655483722686768, - "learning_rate": 6.82617983733641e-05, - "loss": 0.0614, - "step": 74150 - }, - { - "epoch": 4.8518155053974485, - "grad_norm": 0.9211030602455139, - "learning_rate": 6.825324673293336e-05, - "loss": 0.0545, - "step": 74160 - }, - { - "epoch": 4.8524697415767095, - "grad_norm": 0.8237171173095703, - "learning_rate": 6.824469447640766e-05, - "loss": 0.0544, - "step": 74170 - }, - { - "epoch": 4.85312397775597, - "grad_norm": 1.0644043684005737, - "learning_rate": 6.82361416040757e-05, - "loss": 0.0702, - "step": 74180 - }, - { - "epoch": 4.853778213935231, - "grad_norm": 0.8750390410423279, - "learning_rate": 6.82275881162261e-05, - "loss": 0.0582, - "step": 74190 - }, - { - "epoch": 4.8544324501144915, - "grad_norm": 0.9315994381904602, - "learning_rate": 6.821903401314764e-05, - "loss": 0.0555, - "step": 74200 - }, - { - "epoch": 4.8550866862937525, - "grad_norm": 1.0597717761993408, - "learning_rate": 6.821047929512898e-05, - "loss": 0.0574, - "step": 74210 - }, - { - "epoch": 4.855740922473013, - "grad_norm": 0.8364295959472656, - "learning_rate": 6.820192396245886e-05, - "loss": 0.0573, - "step": 74220 - }, - { - "epoch": 4.856395158652274, - "grad_norm": 0.7751799821853638, - "learning_rate": 6.81933680154261e-05, - "loss": 0.0636, - "step": 74230 - }, - { - "epoch": 4.8570493948315345, - "grad_norm": 0.9654813408851624, - "learning_rate": 6.818481145431947e-05, - "loss": 0.0612, - "step": 74240 - }, - { - "epoch": 4.857703631010795, - "grad_norm": 0.922402024269104, - "learning_rate": 6.817625427942773e-05, - "loss": 0.058, - "step": 74250 - }, - { - "epoch": 4.858357867190056, - "grad_norm": 0.663686990737915, - "learning_rate": 6.816769649103974e-05, - "loss": 0.0621, - "step": 74260 - }, - { - "epoch": 4.8590121033693165, - "grad_norm": 1.038624882698059, - "learning_rate": 6.815913808944436e-05, - "loss": 0.0532, - "step": 74270 - }, - { - "epoch": 4.859666339548577, - "grad_norm": 0.7460023760795593, - "learning_rate": 6.815057907493045e-05, - "loss": 0.066, - "step": 74280 - }, - { - "epoch": 4.860320575727838, - "grad_norm": 0.8232451677322388, - "learning_rate": 6.814201944778689e-05, - "loss": 0.0635, - "step": 74290 - }, - { - "epoch": 4.860974811907099, - "grad_norm": 0.7617900967597961, - "learning_rate": 6.81334592083026e-05, - "loss": 0.0578, - "step": 74300 - }, - { - "epoch": 4.8616290480863595, - "grad_norm": 0.8007754683494568, - "learning_rate": 6.81248983567665e-05, - "loss": 0.0553, - "step": 74310 - }, - { - "epoch": 4.86228328426562, - "grad_norm": 0.8569623231887817, - "learning_rate": 6.811633689346752e-05, - "loss": 0.0563, - "step": 74320 - }, - { - "epoch": 4.862937520444881, - "grad_norm": 0.979371964931488, - "learning_rate": 6.810777481869471e-05, - "loss": 0.059, - "step": 74330 - }, - { - "epoch": 4.863591756624142, - "grad_norm": 0.7974584102630615, - "learning_rate": 6.809921213273697e-05, - "loss": 0.0644, - "step": 74340 - }, - { - "epoch": 4.864245992803402, - "grad_norm": 0.9594705104827881, - "learning_rate": 6.809064883588336e-05, - "loss": 0.0618, - "step": 74350 - }, - { - "epoch": 4.864900228982663, - "grad_norm": 0.8043680191040039, - "learning_rate": 6.808208492842291e-05, - "loss": 0.0581, - "step": 74360 - }, - { - "epoch": 4.865554465161924, - "grad_norm": 0.8218439221382141, - "learning_rate": 6.807352041064467e-05, - "loss": 0.0607, - "step": 74370 - }, - { - "epoch": 4.8662087013411846, - "grad_norm": 1.0335355997085571, - "learning_rate": 6.806495528283771e-05, - "loss": 0.0679, - "step": 74380 - }, - { - "epoch": 4.866862937520445, - "grad_norm": 1.074435830116272, - "learning_rate": 6.805638954529117e-05, - "loss": 0.0628, - "step": 74390 - }, - { - "epoch": 4.867517173699706, - "grad_norm": 1.0719654560089111, - "learning_rate": 6.80478231982941e-05, - "loss": 0.0632, - "step": 74400 - }, - { - "epoch": 4.868171409878967, - "grad_norm": 0.9550548791885376, - "learning_rate": 6.803925624213565e-05, - "loss": 0.0554, - "step": 74410 - }, - { - "epoch": 4.868825646058227, - "grad_norm": 0.9627480506896973, - "learning_rate": 6.803068867710503e-05, - "loss": 0.0676, - "step": 74420 - }, - { - "epoch": 4.869479882237488, - "grad_norm": 1.0203245878219604, - "learning_rate": 6.802212050349135e-05, - "loss": 0.0535, - "step": 74430 - }, - { - "epoch": 4.870134118416749, - "grad_norm": 0.996152937412262, - "learning_rate": 6.801355172158385e-05, - "loss": 0.0572, - "step": 74440 - }, - { - "epoch": 4.870788354596009, - "grad_norm": 0.8001197576522827, - "learning_rate": 6.800498233167172e-05, - "loss": 0.06, - "step": 74450 - }, - { - "epoch": 4.87144259077527, - "grad_norm": 1.0456775426864624, - "learning_rate": 6.799641233404423e-05, - "loss": 0.0618, - "step": 74460 - }, - { - "epoch": 4.872096826954531, - "grad_norm": 0.9173516631126404, - "learning_rate": 6.798784172899064e-05, - "loss": 0.0584, - "step": 74470 - }, - { - "epoch": 4.872751063133792, - "grad_norm": 1.2566989660263062, - "learning_rate": 6.79792705168002e-05, - "loss": 0.0665, - "step": 74480 - }, - { - "epoch": 4.873405299313052, - "grad_norm": 0.9484615921974182, - "learning_rate": 6.797069869776222e-05, - "loss": 0.0643, - "step": 74490 - }, - { - "epoch": 4.874059535492313, - "grad_norm": 0.8985061049461365, - "learning_rate": 6.796212627216605e-05, - "loss": 0.0664, - "step": 74500 - }, - { - "epoch": 4.874713771671574, - "grad_norm": 0.8476536273956299, - "learning_rate": 6.795355324030099e-05, - "loss": 0.0543, - "step": 74510 - }, - { - "epoch": 4.875368007850835, - "grad_norm": 1.0744060277938843, - "learning_rate": 6.794497960245644e-05, - "loss": 0.0656, - "step": 74520 - }, - { - "epoch": 4.876022244030095, - "grad_norm": 1.001219391822815, - "learning_rate": 6.793640535892176e-05, - "loss": 0.063, - "step": 74530 - }, - { - "epoch": 4.876676480209356, - "grad_norm": 1.04497492313385, - "learning_rate": 6.792783050998637e-05, - "loss": 0.0629, - "step": 74540 - }, - { - "epoch": 4.877330716388617, - "grad_norm": 1.1128970384597778, - "learning_rate": 6.791925505593965e-05, - "loss": 0.0598, - "step": 74550 - }, - { - "epoch": 4.877984952567877, - "grad_norm": 0.9689093828201294, - "learning_rate": 6.791067899707113e-05, - "loss": 0.0577, - "step": 74560 - }, - { - "epoch": 4.878639188747138, - "grad_norm": 1.0436021089553833, - "learning_rate": 6.790210233367019e-05, - "loss": 0.0557, - "step": 74570 - }, - { - "epoch": 4.879293424926399, - "grad_norm": 0.6721805334091187, - "learning_rate": 6.789352506602632e-05, - "loss": 0.0665, - "step": 74580 - }, - { - "epoch": 4.879947661105659, - "grad_norm": 0.8921915888786316, - "learning_rate": 6.78849471944291e-05, - "loss": 0.0541, - "step": 74590 - }, - { - "epoch": 4.88060189728492, - "grad_norm": 0.7612684965133667, - "learning_rate": 6.787636871916798e-05, - "loss": 0.0611, - "step": 74600 - }, - { - "epoch": 4.881256133464181, - "grad_norm": 0.7255061268806458, - "learning_rate": 6.786778964053253e-05, - "loss": 0.068, - "step": 74610 - }, - { - "epoch": 4.881910369643442, - "grad_norm": 0.7865713834762573, - "learning_rate": 6.785920995881234e-05, - "loss": 0.0665, - "step": 74620 - }, - { - "epoch": 4.882564605822702, - "grad_norm": 0.8941336274147034, - "learning_rate": 6.785062967429697e-05, - "loss": 0.0531, - "step": 74630 - }, - { - "epoch": 4.883218842001963, - "grad_norm": 0.8453198671340942, - "learning_rate": 6.784204878727601e-05, - "loss": 0.0552, - "step": 74640 - }, - { - "epoch": 4.883873078181224, - "grad_norm": 0.9211252927780151, - "learning_rate": 6.783346729803913e-05, - "loss": 0.064, - "step": 74650 - }, - { - "epoch": 4.884527314360485, - "grad_norm": 0.7787407040596008, - "learning_rate": 6.782488520687596e-05, - "loss": 0.064, - "step": 74660 - }, - { - "epoch": 4.885181550539745, - "grad_norm": 0.8281296491622925, - "learning_rate": 6.781630251407617e-05, - "loss": 0.0531, - "step": 74670 - }, - { - "epoch": 4.885835786719006, - "grad_norm": 0.9956787824630737, - "learning_rate": 6.780771921992945e-05, - "loss": 0.0556, - "step": 74680 - }, - { - "epoch": 4.886490022898267, - "grad_norm": 1.0824602842330933, - "learning_rate": 6.779913532472548e-05, - "loss": 0.0639, - "step": 74690 - }, - { - "epoch": 4.887144259077527, - "grad_norm": 0.9609588980674744, - "learning_rate": 6.779055082875403e-05, - "loss": 0.0633, - "step": 74700 - }, - { - "epoch": 4.887798495256788, - "grad_norm": 0.9233350157737732, - "learning_rate": 6.778196573230481e-05, - "loss": 0.0671, - "step": 74710 - }, - { - "epoch": 4.888452731436049, - "grad_norm": 0.7840463519096375, - "learning_rate": 6.777338003566765e-05, - "loss": 0.0555, - "step": 74720 - }, - { - "epoch": 4.889106967615309, - "grad_norm": 0.8873052000999451, - "learning_rate": 6.776479373913228e-05, - "loss": 0.0642, - "step": 74730 - }, - { - "epoch": 4.88976120379457, - "grad_norm": 1.742559790611267, - "learning_rate": 6.775620684298853e-05, - "loss": 0.0636, - "step": 74740 - }, - { - "epoch": 4.890415439973831, - "grad_norm": 1.021909475326538, - "learning_rate": 6.774761934752624e-05, - "loss": 0.0568, - "step": 74750 - }, - { - "epoch": 4.891069676153092, - "grad_norm": 0.7874606847763062, - "learning_rate": 6.773903125303524e-05, - "loss": 0.0537, - "step": 74760 - }, - { - "epoch": 4.891723912332352, - "grad_norm": 0.8984439969062805, - "learning_rate": 6.773044255980543e-05, - "loss": 0.071, - "step": 74770 - }, - { - "epoch": 4.892378148511613, - "grad_norm": 0.8923892378807068, - "learning_rate": 6.772185326812668e-05, - "loss": 0.0552, - "step": 74780 - }, - { - "epoch": 4.893032384690874, - "grad_norm": 1.0217084884643555, - "learning_rate": 6.77132633782889e-05, - "loss": 0.0557, - "step": 74790 - }, - { - "epoch": 4.893686620870134, - "grad_norm": 0.9023916721343994, - "learning_rate": 6.770467289058203e-05, - "loss": 0.065, - "step": 74800 - }, - { - "epoch": 4.894340857049395, - "grad_norm": 1.1047594547271729, - "learning_rate": 6.7696081805296e-05, - "loss": 0.0521, - "step": 74810 - }, - { - "epoch": 4.894995093228656, - "grad_norm": 1.103213906288147, - "learning_rate": 6.768749012272081e-05, - "loss": 0.0691, - "step": 74820 - }, - { - "epoch": 4.895649329407917, - "grad_norm": 0.9325263500213623, - "learning_rate": 6.767889784314645e-05, - "loss": 0.0625, - "step": 74830 - }, - { - "epoch": 4.896303565587177, - "grad_norm": 1.620137333869934, - "learning_rate": 6.76703049668629e-05, - "loss": 0.0641, - "step": 74840 - }, - { - "epoch": 4.896957801766438, - "grad_norm": 1.0535722970962524, - "learning_rate": 6.766171149416023e-05, - "loss": 0.052, - "step": 74850 - }, - { - "epoch": 4.897612037945699, - "grad_norm": 0.9076412320137024, - "learning_rate": 6.765311742532849e-05, - "loss": 0.0637, - "step": 74860 - }, - { - "epoch": 4.898266274124959, - "grad_norm": 1.7605758905410767, - "learning_rate": 6.764452276065774e-05, - "loss": 0.0574, - "step": 74870 - }, - { - "epoch": 4.89892051030422, - "grad_norm": 0.7411386966705322, - "learning_rate": 6.763592750043805e-05, - "loss": 0.0568, - "step": 74880 - }, - { - "epoch": 4.899574746483481, - "grad_norm": 0.8387102484703064, - "learning_rate": 6.762733164495956e-05, - "loss": 0.0567, - "step": 74890 - }, - { - "epoch": 4.900228982662741, - "grad_norm": 0.9322872161865234, - "learning_rate": 6.761873519451241e-05, - "loss": 0.0554, - "step": 74900 - }, - { - "epoch": 4.900883218842002, - "grad_norm": 0.9480436444282532, - "learning_rate": 6.761013814938673e-05, - "loss": 0.0575, - "step": 74910 - }, - { - "epoch": 4.901537455021263, - "grad_norm": 0.8049683570861816, - "learning_rate": 6.760154050987272e-05, - "loss": 0.0543, - "step": 74920 - }, - { - "epoch": 4.902191691200524, - "grad_norm": 1.016456961631775, - "learning_rate": 6.759294227626054e-05, - "loss": 0.0538, - "step": 74930 - }, - { - "epoch": 4.902845927379784, - "grad_norm": 0.9667950868606567, - "learning_rate": 6.758434344884042e-05, - "loss": 0.0569, - "step": 74940 - }, - { - "epoch": 4.903500163559045, - "grad_norm": 0.902862548828125, - "learning_rate": 6.75757440279026e-05, - "loss": 0.0586, - "step": 74950 - }, - { - "epoch": 4.904154399738306, - "grad_norm": 0.7040520906448364, - "learning_rate": 6.756714401373732e-05, - "loss": 0.0593, - "step": 74960 - }, - { - "epoch": 4.904808635917567, - "grad_norm": 0.824975848197937, - "learning_rate": 6.755854340663484e-05, - "loss": 0.0565, - "step": 74970 - }, - { - "epoch": 4.905462872096827, - "grad_norm": 0.8577737212181091, - "learning_rate": 6.754994220688551e-05, - "loss": 0.0581, - "step": 74980 - }, - { - "epoch": 4.906117108276088, - "grad_norm": 0.8069767951965332, - "learning_rate": 6.754134041477957e-05, - "loss": 0.0481, - "step": 74990 - }, - { - "epoch": 4.906771344455349, - "grad_norm": 0.7826264500617981, - "learning_rate": 6.75327380306074e-05, - "loss": 0.0642, - "step": 75000 - }, - { - "epoch": 4.907425580634609, - "grad_norm": 0.895269513130188, - "learning_rate": 6.752413505465935e-05, - "loss": 0.0549, - "step": 75010 - }, - { - "epoch": 4.90807981681387, - "grad_norm": 0.6955884099006653, - "learning_rate": 6.751553148722576e-05, - "loss": 0.0546, - "step": 75020 - }, - { - "epoch": 4.908734052993131, - "grad_norm": 0.8068095445632935, - "learning_rate": 6.750692732859706e-05, - "loss": 0.0569, - "step": 75030 - }, - { - "epoch": 4.909388289172391, - "grad_norm": 0.6808393001556396, - "learning_rate": 6.749832257906365e-05, - "loss": 0.0557, - "step": 75040 - }, - { - "epoch": 4.910042525351652, - "grad_norm": 0.9451243281364441, - "learning_rate": 6.748971723891597e-05, - "loss": 0.0634, - "step": 75050 - }, - { - "epoch": 4.910696761530913, - "grad_norm": 0.7461559772491455, - "learning_rate": 6.748111130844445e-05, - "loss": 0.062, - "step": 75060 - }, - { - "epoch": 4.911350997710174, - "grad_norm": 0.8373140096664429, - "learning_rate": 6.747250478793959e-05, - "loss": 0.0538, - "step": 75070 - }, - { - "epoch": 4.912005233889434, - "grad_norm": 0.9267625212669373, - "learning_rate": 6.746389767769185e-05, - "loss": 0.0657, - "step": 75080 - }, - { - "epoch": 4.912659470068695, - "grad_norm": 0.8543652892112732, - "learning_rate": 6.745528997799178e-05, - "loss": 0.0612, - "step": 75090 - }, - { - "epoch": 4.913313706247956, - "grad_norm": 0.9111425280570984, - "learning_rate": 6.744668168912989e-05, - "loss": 0.0604, - "step": 75100 - }, - { - "epoch": 4.913967942427217, - "grad_norm": 1.040820598602295, - "learning_rate": 6.743807281139675e-05, - "loss": 0.0614, - "step": 75110 - }, - { - "epoch": 4.914622178606477, - "grad_norm": 0.8948336839675903, - "learning_rate": 6.74294633450829e-05, - "loss": 0.0527, - "step": 75120 - }, - { - "epoch": 4.915276414785738, - "grad_norm": 0.923996090888977, - "learning_rate": 6.742085329047895e-05, - "loss": 0.0636, - "step": 75130 - }, - { - "epoch": 4.915930650964999, - "grad_norm": 0.8746695518493652, - "learning_rate": 6.741224264787553e-05, - "loss": 0.0587, - "step": 75140 - }, - { - "epoch": 4.916584887144259, - "grad_norm": 0.7787982821464539, - "learning_rate": 6.740363141756325e-05, - "loss": 0.0542, - "step": 75150 - }, - { - "epoch": 4.91723912332352, - "grad_norm": 0.9488137364387512, - "learning_rate": 6.739501959983277e-05, - "loss": 0.0602, - "step": 75160 - }, - { - "epoch": 4.917893359502781, - "grad_norm": 0.7842972874641418, - "learning_rate": 6.738640719497475e-05, - "loss": 0.0538, - "step": 75170 - }, - { - "epoch": 4.918547595682041, - "grad_norm": 0.9717864394187927, - "learning_rate": 6.73777942032799e-05, - "loss": 0.0614, - "step": 75180 - }, - { - "epoch": 4.919201831861302, - "grad_norm": 0.7789155840873718, - "learning_rate": 6.736918062503889e-05, - "loss": 0.068, - "step": 75190 - }, - { - "epoch": 4.919856068040563, - "grad_norm": 0.8811984062194824, - "learning_rate": 6.736056646054251e-05, - "loss": 0.0531, - "step": 75200 - }, - { - "epoch": 4.920510304219824, - "grad_norm": 0.9033105969429016, - "learning_rate": 6.73519517100815e-05, - "loss": 0.0591, - "step": 75210 - }, - { - "epoch": 4.921164540399084, - "grad_norm": 0.7949878573417664, - "learning_rate": 6.734333637394657e-05, - "loss": 0.0538, - "step": 75220 - }, - { - "epoch": 4.921818776578345, - "grad_norm": 0.9282236099243164, - "learning_rate": 6.73347204524286e-05, - "loss": 0.0672, - "step": 75230 - }, - { - "epoch": 4.922473012757606, - "grad_norm": 0.8523067831993103, - "learning_rate": 6.732610394581831e-05, - "loss": 0.0512, - "step": 75240 - }, - { - "epoch": 4.923127248936867, - "grad_norm": 0.9058271646499634, - "learning_rate": 6.731748685440658e-05, - "loss": 0.064, - "step": 75250 - }, - { - "epoch": 4.923781485116127, - "grad_norm": 0.7754740715026855, - "learning_rate": 6.730886917848426e-05, - "loss": 0.06, - "step": 75260 - }, - { - "epoch": 4.924435721295388, - "grad_norm": 0.9523494839668274, - "learning_rate": 6.730025091834223e-05, - "loss": 0.062, - "step": 75270 - }, - { - "epoch": 4.925089957474649, - "grad_norm": 0.825276255607605, - "learning_rate": 6.729163207427134e-05, - "loss": 0.0568, - "step": 75280 - }, - { - "epoch": 4.925744193653909, - "grad_norm": 1.0787001848220825, - "learning_rate": 6.728301264656251e-05, - "loss": 0.0591, - "step": 75290 - }, - { - "epoch": 4.92639842983317, - "grad_norm": 0.859107494354248, - "learning_rate": 6.727439263550669e-05, - "loss": 0.0568, - "step": 75300 - }, - { - "epoch": 4.927052666012431, - "grad_norm": 0.897099494934082, - "learning_rate": 6.726577204139482e-05, - "loss": 0.063, - "step": 75310 - }, - { - "epoch": 4.927706902191691, - "grad_norm": 0.7519966959953308, - "learning_rate": 6.725715086451784e-05, - "loss": 0.0582, - "step": 75320 - }, - { - "epoch": 4.928361138370952, - "grad_norm": 0.7944568991661072, - "learning_rate": 6.724852910516677e-05, - "loss": 0.0605, - "step": 75330 - }, - { - "epoch": 4.929015374550213, - "grad_norm": 1.1941640377044678, - "learning_rate": 6.723990676363262e-05, - "loss": 0.065, - "step": 75340 - }, - { - "epoch": 4.929669610729473, - "grad_norm": 0.7790656089782715, - "learning_rate": 6.723128384020638e-05, - "loss": 0.0591, - "step": 75350 - }, - { - "epoch": 4.930323846908734, - "grad_norm": 0.9245325326919556, - "learning_rate": 6.722266033517913e-05, - "loss": 0.0511, - "step": 75360 - }, - { - "epoch": 4.930978083087995, - "grad_norm": 0.9800746440887451, - "learning_rate": 6.721403624884194e-05, - "loss": 0.0634, - "step": 75370 - }, - { - "epoch": 4.931632319267256, - "grad_norm": 0.7326732277870178, - "learning_rate": 6.720541158148587e-05, - "loss": 0.0555, - "step": 75380 - }, - { - "epoch": 4.932286555446516, - "grad_norm": 0.8972048759460449, - "learning_rate": 6.719678633340202e-05, - "loss": 0.0515, - "step": 75390 - }, - { - "epoch": 4.932940791625777, - "grad_norm": 0.9582744240760803, - "learning_rate": 6.718816050488157e-05, - "loss": 0.0626, - "step": 75400 - }, - { - "epoch": 4.933595027805038, - "grad_norm": 0.7547270655632019, - "learning_rate": 6.717953409621559e-05, - "loss": 0.0576, - "step": 75410 - }, - { - "epoch": 4.934249263984299, - "grad_norm": 1.1005964279174805, - "learning_rate": 6.71709071076953e-05, - "loss": 0.0601, - "step": 75420 - }, - { - "epoch": 4.934903500163559, - "grad_norm": 0.8800162672996521, - "learning_rate": 6.716227953961185e-05, - "loss": 0.0592, - "step": 75430 - }, - { - "epoch": 4.93555773634282, - "grad_norm": 0.7670087814331055, - "learning_rate": 6.715365139225647e-05, - "loss": 0.06, - "step": 75440 - }, - { - "epoch": 4.936211972522081, - "grad_norm": 0.9640793204307556, - "learning_rate": 6.714502266592034e-05, - "loss": 0.0634, - "step": 75450 - }, - { - "epoch": 4.936866208701341, - "grad_norm": 0.8268434405326843, - "learning_rate": 6.713639336089476e-05, - "loss": 0.0591, - "step": 75460 - }, - { - "epoch": 4.937520444880602, - "grad_norm": 1.175800085067749, - "learning_rate": 6.712776347747096e-05, - "loss": 0.0675, - "step": 75470 - }, - { - "epoch": 4.938174681059863, - "grad_norm": 0.8210011720657349, - "learning_rate": 6.71191330159402e-05, - "loss": 0.0622, - "step": 75480 - }, - { - "epoch": 4.938828917239123, - "grad_norm": 0.8271769881248474, - "learning_rate": 6.711050197659384e-05, - "loss": 0.0598, - "step": 75490 - }, - { - "epoch": 4.939483153418384, - "grad_norm": 0.8794811964035034, - "learning_rate": 6.710187035972314e-05, - "loss": 0.0586, - "step": 75500 - }, - { - "epoch": 4.940137389597645, - "grad_norm": 0.8313082456588745, - "learning_rate": 6.709323816561946e-05, - "loss": 0.061, - "step": 75510 - }, - { - "epoch": 4.940791625776906, - "grad_norm": 0.9724137783050537, - "learning_rate": 6.708460539457418e-05, - "loss": 0.0639, - "step": 75520 - }, - { - "epoch": 4.941445861956166, - "grad_norm": 0.9716455936431885, - "learning_rate": 6.707597204687865e-05, - "loss": 0.0561, - "step": 75530 - }, - { - "epoch": 4.942100098135427, - "grad_norm": 0.6842145323753357, - "learning_rate": 6.706733812282428e-05, - "loss": 0.0518, - "step": 75540 - }, - { - "epoch": 4.942754334314688, - "grad_norm": 0.767361581325531, - "learning_rate": 6.705870362270248e-05, - "loss": 0.0609, - "step": 75550 - }, - { - "epoch": 4.943408570493949, - "grad_norm": 0.9804648756980896, - "learning_rate": 6.705006854680471e-05, - "loss": 0.066, - "step": 75560 - }, - { - "epoch": 4.944062806673209, - "grad_norm": 0.6798174977302551, - "learning_rate": 6.704143289542241e-05, - "loss": 0.0534, - "step": 75570 - }, - { - "epoch": 4.94471704285247, - "grad_norm": 0.8720900416374207, - "learning_rate": 6.703279666884705e-05, - "loss": 0.0508, - "step": 75580 - }, - { - "epoch": 4.945371279031731, - "grad_norm": 1.0530959367752075, - "learning_rate": 6.702415986737014e-05, - "loss": 0.0507, - "step": 75590 - }, - { - "epoch": 4.946025515210991, - "grad_norm": 0.9512234330177307, - "learning_rate": 6.701552249128318e-05, - "loss": 0.0578, - "step": 75600 - }, - { - "epoch": 4.946679751390252, - "grad_norm": 0.7706218361854553, - "learning_rate": 6.70068845408777e-05, - "loss": 0.0596, - "step": 75610 - }, - { - "epoch": 4.947333987569513, - "grad_norm": 0.896196722984314, - "learning_rate": 6.69982460164453e-05, - "loss": 0.0652, - "step": 75620 - }, - { - "epoch": 4.947988223748773, - "grad_norm": 0.8798102140426636, - "learning_rate": 6.69896069182775e-05, - "loss": 0.0601, - "step": 75630 - }, - { - "epoch": 4.948642459928034, - "grad_norm": 0.9904597997665405, - "learning_rate": 6.69809672466659e-05, - "loss": 0.0522, - "step": 75640 - }, - { - "epoch": 4.949296696107295, - "grad_norm": 0.8591594696044922, - "learning_rate": 6.697232700190213e-05, - "loss": 0.0663, - "step": 75650 - }, - { - "epoch": 4.949950932286556, - "grad_norm": 0.8061467409133911, - "learning_rate": 6.696368618427779e-05, - "loss": 0.0626, - "step": 75660 - }, - { - "epoch": 4.950605168465816, - "grad_norm": 0.7854329943656921, - "learning_rate": 6.695504479408458e-05, - "loss": 0.0545, - "step": 75670 - }, - { - "epoch": 4.951259404645077, - "grad_norm": 0.7829649448394775, - "learning_rate": 6.694640283161413e-05, - "loss": 0.0505, - "step": 75680 - }, - { - "epoch": 4.951913640824338, - "grad_norm": 0.6959307193756104, - "learning_rate": 6.693776029715814e-05, - "loss": 0.0623, - "step": 75690 - }, - { - "epoch": 4.952567877003599, - "grad_norm": 0.7448030710220337, - "learning_rate": 6.692911719100833e-05, - "loss": 0.0537, - "step": 75700 - }, - { - "epoch": 4.953222113182859, - "grad_norm": 0.686690628528595, - "learning_rate": 6.692047351345641e-05, - "loss": 0.0572, - "step": 75710 - }, - { - "epoch": 4.95387634936212, - "grad_norm": 1.1191184520721436, - "learning_rate": 6.691182926479413e-05, - "loss": 0.0699, - "step": 75720 - }, - { - "epoch": 4.954530585541381, - "grad_norm": 0.7493934631347656, - "learning_rate": 6.690318444531328e-05, - "loss": 0.0585, - "step": 75730 - }, - { - "epoch": 4.955184821720641, - "grad_norm": 1.0006502866744995, - "learning_rate": 6.689453905530559e-05, - "loss": 0.0562, - "step": 75740 - }, - { - "epoch": 4.955839057899902, - "grad_norm": 0.9580645561218262, - "learning_rate": 6.688589309506292e-05, - "loss": 0.0582, - "step": 75750 - }, - { - "epoch": 4.956493294079163, - "grad_norm": 1.0518616437911987, - "learning_rate": 6.687724656487707e-05, - "loss": 0.0699, - "step": 75760 - }, - { - "epoch": 4.957147530258423, - "grad_norm": 1.082767128944397, - "learning_rate": 6.686859946503989e-05, - "loss": 0.055, - "step": 75770 - }, - { - "epoch": 4.957801766437684, - "grad_norm": 0.933538556098938, - "learning_rate": 6.685995179584324e-05, - "loss": 0.0637, - "step": 75780 - }, - { - "epoch": 4.958456002616945, - "grad_norm": 0.955034077167511, - "learning_rate": 6.685130355757899e-05, - "loss": 0.0596, - "step": 75790 - }, - { - "epoch": 4.959110238796205, - "grad_norm": 0.9905108213424683, - "learning_rate": 6.684265475053905e-05, - "loss": 0.068, - "step": 75800 - }, - { - "epoch": 4.959764474975466, - "grad_norm": 0.7588913440704346, - "learning_rate": 6.683400537501534e-05, - "loss": 0.054, - "step": 75810 - }, - { - "epoch": 4.960418711154727, - "grad_norm": 0.8682058453559875, - "learning_rate": 6.68253554312998e-05, - "loss": 0.0569, - "step": 75820 - }, - { - "epoch": 4.961072947333988, - "grad_norm": 0.8857309818267822, - "learning_rate": 6.68167049196844e-05, - "loss": 0.0706, - "step": 75830 - }, - { - "epoch": 4.961727183513248, - "grad_norm": 0.8646987676620483, - "learning_rate": 6.680805384046109e-05, - "loss": 0.0552, - "step": 75840 - }, - { - "epoch": 4.962381419692509, - "grad_norm": 0.9026349782943726, - "learning_rate": 6.67994021939219e-05, - "loss": 0.0506, - "step": 75850 - }, - { - "epoch": 4.96303565587177, - "grad_norm": 0.9483004808425903, - "learning_rate": 6.679074998035881e-05, - "loss": 0.0557, - "step": 75860 - }, - { - "epoch": 4.963689892051031, - "grad_norm": 0.9569962024688721, - "learning_rate": 6.67820972000639e-05, - "loss": 0.0614, - "step": 75870 - }, - { - "epoch": 4.964344128230291, - "grad_norm": 0.9677038192749023, - "learning_rate": 6.677344385332918e-05, - "loss": 0.063, - "step": 75880 - }, - { - "epoch": 4.964998364409552, - "grad_norm": 0.8951756358146667, - "learning_rate": 6.676478994044673e-05, - "loss": 0.0585, - "step": 75890 - }, - { - "epoch": 4.965652600588813, - "grad_norm": 0.9659004211425781, - "learning_rate": 6.675613546170866e-05, - "loss": 0.0542, - "step": 75900 - }, - { - "epoch": 4.966306836768073, - "grad_norm": 0.7111480236053467, - "learning_rate": 6.674748041740707e-05, - "loss": 0.0635, - "step": 75910 - }, - { - "epoch": 4.966961072947334, - "grad_norm": 0.9708228707313538, - "learning_rate": 6.673882480783412e-05, - "loss": 0.0607, - "step": 75920 - }, - { - "epoch": 4.967615309126595, - "grad_norm": 0.7749221324920654, - "learning_rate": 6.673016863328189e-05, - "loss": 0.0577, - "step": 75930 - }, - { - "epoch": 4.968269545305855, - "grad_norm": 0.9135736227035522, - "learning_rate": 6.672151189404262e-05, - "loss": 0.0608, - "step": 75940 - }, - { - "epoch": 4.968923781485116, - "grad_norm": 0.9059275388717651, - "learning_rate": 6.671285459040847e-05, - "loss": 0.0552, - "step": 75950 - }, - { - "epoch": 4.969578017664377, - "grad_norm": 0.874383270740509, - "learning_rate": 6.670419672267163e-05, - "loss": 0.066, - "step": 75960 - }, - { - "epoch": 4.970232253843638, - "grad_norm": 0.8657017946243286, - "learning_rate": 6.669553829112435e-05, - "loss": 0.0529, - "step": 75970 - }, - { - "epoch": 4.970886490022898, - "grad_norm": 0.919597864151001, - "learning_rate": 6.668687929605889e-05, - "loss": 0.0594, - "step": 75980 - }, - { - "epoch": 4.971540726202159, - "grad_norm": 0.8647698163986206, - "learning_rate": 6.667821973776747e-05, - "loss": 0.0531, - "step": 75990 - }, - { - "epoch": 4.97219496238142, - "grad_norm": 0.6683645248413086, - "learning_rate": 6.666955961654238e-05, - "loss": 0.058, - "step": 76000 - }, - { - "epoch": 4.972849198560681, - "grad_norm": 0.7765533924102783, - "learning_rate": 6.666089893267595e-05, - "loss": 0.0542, - "step": 76010 - }, - { - "epoch": 4.973503434739941, - "grad_norm": 0.8284790515899658, - "learning_rate": 6.665223768646049e-05, - "loss": 0.0565, - "step": 76020 - }, - { - "epoch": 4.974157670919202, - "grad_norm": 0.869004487991333, - "learning_rate": 6.664357587818832e-05, - "loss": 0.0616, - "step": 76030 - }, - { - "epoch": 4.974811907098463, - "grad_norm": 0.99537593126297, - "learning_rate": 6.663491350815184e-05, - "loss": 0.0506, - "step": 76040 - }, - { - "epoch": 4.975466143277723, - "grad_norm": 0.8469506502151489, - "learning_rate": 6.66262505766434e-05, - "loss": 0.0676, - "step": 76050 - }, - { - "epoch": 4.976120379456984, - "grad_norm": 0.9716391563415527, - "learning_rate": 6.661758708395537e-05, - "loss": 0.0547, - "step": 76060 - }, - { - "epoch": 4.976774615636245, - "grad_norm": 1.0621095895767212, - "learning_rate": 6.660892303038022e-05, - "loss": 0.0573, - "step": 76070 - }, - { - "epoch": 4.977428851815505, - "grad_norm": 0.9522149562835693, - "learning_rate": 6.660025841621035e-05, - "loss": 0.056, - "step": 76080 - }, - { - "epoch": 4.978083087994766, - "grad_norm": 1.1043899059295654, - "learning_rate": 6.659159324173823e-05, - "loss": 0.0607, - "step": 76090 - }, - { - "epoch": 4.978737324174027, - "grad_norm": 0.8276578783988953, - "learning_rate": 6.658292750725632e-05, - "loss": 0.0537, - "step": 76100 - }, - { - "epoch": 4.979391560353288, - "grad_norm": 0.8061304688453674, - "learning_rate": 6.657426121305711e-05, - "loss": 0.0655, - "step": 76110 - }, - { - "epoch": 4.980045796532548, - "grad_norm": 0.778200626373291, - "learning_rate": 6.656559435943313e-05, - "loss": 0.0538, - "step": 76120 - }, - { - "epoch": 4.980700032711809, - "grad_norm": 0.8858152031898499, - "learning_rate": 6.655692694667688e-05, - "loss": 0.0532, - "step": 76130 - }, - { - "epoch": 4.98135426889107, - "grad_norm": 0.631655216217041, - "learning_rate": 6.654825897508095e-05, - "loss": 0.0556, - "step": 76140 - }, - { - "epoch": 4.982008505070331, - "grad_norm": 0.7844308018684387, - "learning_rate": 6.653959044493785e-05, - "loss": 0.048, - "step": 76150 - }, - { - "epoch": 4.982662741249591, - "grad_norm": 1.1140578985214233, - "learning_rate": 6.65309213565402e-05, - "loss": 0.0558, - "step": 76160 - }, - { - "epoch": 4.983316977428852, - "grad_norm": 1.0043301582336426, - "learning_rate": 6.652225171018061e-05, - "loss": 0.0553, - "step": 76170 - }, - { - "epoch": 4.983971213608113, - "grad_norm": 0.8583059310913086, - "learning_rate": 6.65135815061517e-05, - "loss": 0.0594, - "step": 76180 - }, - { - "epoch": 4.984625449787373, - "grad_norm": 0.8297145366668701, - "learning_rate": 6.650491074474608e-05, - "loss": 0.0639, - "step": 76190 - }, - { - "epoch": 4.985279685966634, - "grad_norm": 0.7165802121162415, - "learning_rate": 6.649623942625647e-05, - "loss": 0.0607, - "step": 76200 - }, - { - "epoch": 4.985933922145895, - "grad_norm": 0.9171334505081177, - "learning_rate": 6.64875675509755e-05, - "loss": 0.0548, - "step": 76210 - }, - { - "epoch": 4.986588158325155, - "grad_norm": 1.0761557817459106, - "learning_rate": 6.647889511919588e-05, - "loss": 0.061, - "step": 76220 - }, - { - "epoch": 4.987242394504416, - "grad_norm": 0.9394139051437378, - "learning_rate": 6.647022213121035e-05, - "loss": 0.0599, - "step": 76230 - }, - { - "epoch": 4.987896630683677, - "grad_norm": 0.7724557518959045, - "learning_rate": 6.646154858731162e-05, - "loss": 0.0599, - "step": 76240 - }, - { - "epoch": 4.988550866862937, - "grad_norm": 0.7974488139152527, - "learning_rate": 6.645287448779243e-05, - "loss": 0.0499, - "step": 76250 - }, - { - "epoch": 4.989205103042198, - "grad_norm": 0.9333834648132324, - "learning_rate": 6.64441998329456e-05, - "loss": 0.0697, - "step": 76260 - }, - { - "epoch": 4.989859339221459, - "grad_norm": 0.9652996063232422, - "learning_rate": 6.64355246230639e-05, - "loss": 0.0572, - "step": 76270 - }, - { - "epoch": 4.99051357540072, - "grad_norm": 1.0794624090194702, - "learning_rate": 6.642684885844013e-05, - "loss": 0.0615, - "step": 76280 - }, - { - "epoch": 4.99116781157998, - "grad_norm": 0.8401366472244263, - "learning_rate": 6.641817253936713e-05, - "loss": 0.0558, - "step": 76290 - }, - { - "epoch": 4.991822047759241, - "grad_norm": 0.6911501288414001, - "learning_rate": 6.640949566613777e-05, - "loss": 0.0558, - "step": 76300 - }, - { - "epoch": 4.992476283938502, - "grad_norm": 0.7828950881958008, - "learning_rate": 6.640081823904487e-05, - "loss": 0.0589, - "step": 76310 - }, - { - "epoch": 4.993130520117763, - "grad_norm": 0.6949774622917175, - "learning_rate": 6.639214025838135e-05, - "loss": 0.0613, - "step": 76320 - }, - { - "epoch": 4.993784756297023, - "grad_norm": 0.9022195339202881, - "learning_rate": 6.638346172444011e-05, - "loss": 0.0609, - "step": 76330 - }, - { - "epoch": 4.994438992476284, - "grad_norm": 0.824991762638092, - "learning_rate": 6.637478263751407e-05, - "loss": 0.0599, - "step": 76340 - }, - { - "epoch": 4.995093228655545, - "grad_norm": 1.041867733001709, - "learning_rate": 6.636610299789616e-05, - "loss": 0.0673, - "step": 76350 - }, - { - "epoch": 4.995747464834805, - "grad_norm": 1.073553204536438, - "learning_rate": 6.635742280587935e-05, - "loss": 0.0762, - "step": 76360 - }, - { - "epoch": 4.996401701014066, - "grad_norm": 0.8955064415931702, - "learning_rate": 6.634874206175666e-05, - "loss": 0.0659, - "step": 76370 - }, - { - "epoch": 4.997055937193327, - "grad_norm": 1.0121138095855713, - "learning_rate": 6.6340060765821e-05, - "loss": 0.0524, - "step": 76380 - }, - { - "epoch": 4.997710173372587, - "grad_norm": 1.0602036714553833, - "learning_rate": 6.633137891836546e-05, - "loss": 0.0647, - "step": 76390 - }, - { - "epoch": 4.998364409551848, - "grad_norm": 0.947830319404602, - "learning_rate": 6.632269651968306e-05, - "loss": 0.0552, - "step": 76400 - }, - { - "epoch": 4.999018645731109, - "grad_norm": 0.9607699513435364, - "learning_rate": 6.631401357006683e-05, - "loss": 0.073, - "step": 76410 - }, - { - "epoch": 4.99967288191037, - "grad_norm": 0.7737681269645691, - "learning_rate": 6.630533006980986e-05, - "loss": 0.0596, - "step": 76420 - }, - { - "epoch": 5.00032711808963, - "grad_norm": 0.8846662044525146, - "learning_rate": 6.629664601920524e-05, - "loss": 0.057, - "step": 76430 - }, - { - "epoch": 5.000981354268891, - "grad_norm": 1.1258676052093506, - "learning_rate": 6.628796141854608e-05, - "loss": 0.0602, - "step": 76440 - }, - { - "epoch": 5.001635590448152, - "grad_norm": 0.844662070274353, - "learning_rate": 6.627927626812548e-05, - "loss": 0.0529, - "step": 76450 - }, - { - "epoch": 5.002289826627412, - "grad_norm": 0.9069094657897949, - "learning_rate": 6.627059056823665e-05, - "loss": 0.056, - "step": 76460 - }, - { - "epoch": 5.002944062806673, - "grad_norm": 0.8290544152259827, - "learning_rate": 6.62619043191727e-05, - "loss": 0.0576, - "step": 76470 - }, - { - "epoch": 5.003598298985934, - "grad_norm": 1.026877522468567, - "learning_rate": 6.625321752122682e-05, - "loss": 0.0594, - "step": 76480 - }, - { - "epoch": 5.004252535165195, - "grad_norm": 1.0628007650375366, - "learning_rate": 6.624453017469223e-05, - "loss": 0.0548, - "step": 76490 - }, - { - "epoch": 5.004906771344455, - "grad_norm": 0.9122229218482971, - "learning_rate": 6.623584227986215e-05, - "loss": 0.0582, - "step": 76500 - }, - { - "epoch": 5.005561007523716, - "grad_norm": 0.9323796629905701, - "learning_rate": 6.622715383702981e-05, - "loss": 0.0652, - "step": 76510 - }, - { - "epoch": 5.006215243702977, - "grad_norm": 0.9565145969390869, - "learning_rate": 6.621846484648849e-05, - "loss": 0.0614, - "step": 76520 - }, - { - "epoch": 5.006869479882237, - "grad_norm": 0.9995282292366028, - "learning_rate": 6.620977530853141e-05, - "loss": 0.0626, - "step": 76530 - }, - { - "epoch": 5.007523716061498, - "grad_norm": 1.09110689163208, - "learning_rate": 6.620108522345192e-05, - "loss": 0.0655, - "step": 76540 - }, - { - "epoch": 5.008177952240759, - "grad_norm": 0.8959344625473022, - "learning_rate": 6.619239459154331e-05, - "loss": 0.0594, - "step": 76550 - }, - { - "epoch": 5.00883218842002, - "grad_norm": 0.7433099150657654, - "learning_rate": 6.618370341309891e-05, - "loss": 0.0563, - "step": 76560 - }, - { - "epoch": 5.00948642459928, - "grad_norm": 0.8508242964744568, - "learning_rate": 6.61750116884121e-05, - "loss": 0.0621, - "step": 76570 - }, - { - "epoch": 5.010140660778541, - "grad_norm": 1.0113868713378906, - "learning_rate": 6.616631941777621e-05, - "loss": 0.0628, - "step": 76580 - }, - { - "epoch": 5.010794896957802, - "grad_norm": 0.8707073330879211, - "learning_rate": 6.615762660148464e-05, - "loss": 0.062, - "step": 76590 - }, - { - "epoch": 5.011449133137062, - "grad_norm": 0.8594477772712708, - "learning_rate": 6.61489332398308e-05, - "loss": 0.0603, - "step": 76600 - }, - { - "epoch": 5.012103369316323, - "grad_norm": 0.7275976538658142, - "learning_rate": 6.614023933310813e-05, - "loss": 0.0524, - "step": 76610 - }, - { - "epoch": 5.012757605495584, - "grad_norm": 0.8984965682029724, - "learning_rate": 6.613154488161003e-05, - "loss": 0.0653, - "step": 76620 - }, - { - "epoch": 5.013411841674845, - "grad_norm": 0.8797445893287659, - "learning_rate": 6.612284988562997e-05, - "loss": 0.0669, - "step": 76630 - }, - { - "epoch": 5.014066077854105, - "grad_norm": 0.9222575426101685, - "learning_rate": 6.611415434546147e-05, - "loss": 0.0512, - "step": 76640 - }, - { - "epoch": 5.014720314033366, - "grad_norm": 0.7390499114990234, - "learning_rate": 6.6105458261398e-05, - "loss": 0.0504, - "step": 76650 - }, - { - "epoch": 5.015374550212627, - "grad_norm": 0.9179421663284302, - "learning_rate": 6.609676163373306e-05, - "loss": 0.0605, - "step": 76660 - }, - { - "epoch": 5.016028786391887, - "grad_norm": 0.7605708837509155, - "learning_rate": 6.608806446276021e-05, - "loss": 0.0527, - "step": 76670 - }, - { - "epoch": 5.016683022571148, - "grad_norm": 0.8641387820243835, - "learning_rate": 6.6079366748773e-05, - "loss": 0.0592, - "step": 76680 - }, - { - "epoch": 5.017337258750409, - "grad_norm": 0.8649716377258301, - "learning_rate": 6.607066849206498e-05, - "loss": 0.0731, - "step": 76690 - }, - { - "epoch": 5.01799149492967, - "grad_norm": 0.8245428800582886, - "learning_rate": 6.606196969292974e-05, - "loss": 0.0621, - "step": 76700 - }, - { - "epoch": 5.01864573110893, - "grad_norm": 1.014141321182251, - "learning_rate": 6.605327035166091e-05, - "loss": 0.0639, - "step": 76710 - }, - { - "epoch": 5.019299967288191, - "grad_norm": 0.9020232558250427, - "learning_rate": 6.604457046855212e-05, - "loss": 0.0535, - "step": 76720 - }, - { - "epoch": 5.019954203467452, - "grad_norm": 0.9599230885505676, - "learning_rate": 6.603587004389697e-05, - "loss": 0.0541, - "step": 76730 - }, - { - "epoch": 5.020608439646712, - "grad_norm": 0.9951395988464355, - "learning_rate": 6.602716907798917e-05, - "loss": 0.0624, - "step": 76740 - }, - { - "epoch": 5.021262675825973, - "grad_norm": 0.8179054260253906, - "learning_rate": 6.601846757112238e-05, - "loss": 0.053, - "step": 76750 - }, - { - "epoch": 5.021916912005234, - "grad_norm": 0.8312829732894897, - "learning_rate": 6.600976552359029e-05, - "loss": 0.0579, - "step": 76760 - }, - { - "epoch": 5.022571148184495, - "grad_norm": 0.8234156370162964, - "learning_rate": 6.600106293568663e-05, - "loss": 0.0565, - "step": 76770 - }, - { - "epoch": 5.023225384363755, - "grad_norm": 0.7905632853507996, - "learning_rate": 6.599235980770514e-05, - "loss": 0.0578, - "step": 76780 - }, - { - "epoch": 5.023879620543016, - "grad_norm": 0.83710116147995, - "learning_rate": 6.598365613993956e-05, - "loss": 0.058, - "step": 76790 - }, - { - "epoch": 5.024533856722277, - "grad_norm": 1.039841890335083, - "learning_rate": 6.597495193268366e-05, - "loss": 0.0648, - "step": 76800 - }, - { - "epoch": 5.025188092901537, - "grad_norm": 1.0208362340927124, - "learning_rate": 6.596624718623124e-05, - "loss": 0.063, - "step": 76810 - }, - { - "epoch": 5.025842329080798, - "grad_norm": 0.7998032569885254, - "learning_rate": 6.59575419008761e-05, - "loss": 0.0638, - "step": 76820 - }, - { - "epoch": 5.026496565260059, - "grad_norm": 0.8299915790557861, - "learning_rate": 6.594883607691209e-05, - "loss": 0.0595, - "step": 76830 - }, - { - "epoch": 5.02715080143932, - "grad_norm": 0.8311706185340881, - "learning_rate": 6.594012971463302e-05, - "loss": 0.0569, - "step": 76840 - }, - { - "epoch": 5.02780503761858, - "grad_norm": 1.0256335735321045, - "learning_rate": 6.593142281433277e-05, - "loss": 0.0599, - "step": 76850 - }, - { - "epoch": 5.028459273797841, - "grad_norm": 0.8700340390205383, - "learning_rate": 6.592271537630521e-05, - "loss": 0.0632, - "step": 76860 - }, - { - "epoch": 5.029113509977102, - "grad_norm": 0.7327423095703125, - "learning_rate": 6.591400740084425e-05, - "loss": 0.0572, - "step": 76870 - }, - { - "epoch": 5.029767746156362, - "grad_norm": 0.907089352607727, - "learning_rate": 6.590529888824381e-05, - "loss": 0.0511, - "step": 76880 - }, - { - "epoch": 5.030421982335623, - "grad_norm": 1.0830025672912598, - "learning_rate": 6.589658983879782e-05, - "loss": 0.0629, - "step": 76890 - }, - { - "epoch": 5.031076218514884, - "grad_norm": 0.7621312737464905, - "learning_rate": 6.588788025280022e-05, - "loss": 0.0578, - "step": 76900 - }, - { - "epoch": 5.031730454694144, - "grad_norm": 0.8991169929504395, - "learning_rate": 6.587917013054503e-05, - "loss": 0.0497, - "step": 76910 - }, - { - "epoch": 5.032384690873405, - "grad_norm": 1.5194605588912964, - "learning_rate": 6.587045947232616e-05, - "loss": 0.0594, - "step": 76920 - }, - { - "epoch": 5.033038927052666, - "grad_norm": 1.2414143085479736, - "learning_rate": 6.586174827843768e-05, - "loss": 0.0645, - "step": 76930 - }, - { - "epoch": 5.033693163231927, - "grad_norm": 1.0320972204208374, - "learning_rate": 6.58530365491736e-05, - "loss": 0.0553, - "step": 76940 - }, - { - "epoch": 5.034347399411187, - "grad_norm": 0.9476144313812256, - "learning_rate": 6.584432428482797e-05, - "loss": 0.0631, - "step": 76950 - }, - { - "epoch": 5.035001635590448, - "grad_norm": 1.1414525508880615, - "learning_rate": 6.583561148569481e-05, - "loss": 0.0699, - "step": 76960 - }, - { - "epoch": 5.035655871769709, - "grad_norm": 0.8667239546775818, - "learning_rate": 6.582689815206825e-05, - "loss": 0.0637, - "step": 76970 - }, - { - "epoch": 5.036310107948969, - "grad_norm": 0.6989625692367554, - "learning_rate": 6.581818428424238e-05, - "loss": 0.0546, - "step": 76980 - }, - { - "epoch": 5.03696434412823, - "grad_norm": 0.7704555988311768, - "learning_rate": 6.580946988251128e-05, - "loss": 0.0492, - "step": 76990 - }, - { - "epoch": 5.037618580307491, - "grad_norm": 1.1614646911621094, - "learning_rate": 6.580075494716912e-05, - "loss": 0.0721, - "step": 77000 - }, - { - "epoch": 5.038272816486752, - "grad_norm": 1.0777459144592285, - "learning_rate": 6.579203947851006e-05, - "loss": 0.0586, - "step": 77010 - }, - { - "epoch": 5.038927052666012, - "grad_norm": 0.9171910285949707, - "learning_rate": 6.578332347682824e-05, - "loss": 0.0592, - "step": 77020 - }, - { - "epoch": 5.039581288845273, - "grad_norm": 0.9307588934898376, - "learning_rate": 6.577460694241784e-05, - "loss": 0.0551, - "step": 77030 - }, - { - "epoch": 5.040235525024534, - "grad_norm": 0.7374948263168335, - "learning_rate": 6.576588987557312e-05, - "loss": 0.0557, - "step": 77040 - }, - { - "epoch": 5.040889761203794, - "grad_norm": 0.7271601557731628, - "learning_rate": 6.575717227658825e-05, - "loss": 0.057, - "step": 77050 - }, - { - "epoch": 5.041543997383055, - "grad_norm": 0.8527136445045471, - "learning_rate": 6.57484541457575e-05, - "loss": 0.0569, - "step": 77060 - }, - { - "epoch": 5.042198233562316, - "grad_norm": 0.8172820806503296, - "learning_rate": 6.57397354833751e-05, - "loss": 0.059, - "step": 77070 - }, - { - "epoch": 5.042852469741577, - "grad_norm": 0.8320928812026978, - "learning_rate": 6.573101628973537e-05, - "loss": 0.0557, - "step": 77080 - }, - { - "epoch": 5.043506705920837, - "grad_norm": 0.8559585213661194, - "learning_rate": 6.572229656513258e-05, - "loss": 0.0589, - "step": 77090 - }, - { - "epoch": 5.044160942100098, - "grad_norm": 0.797950267791748, - "learning_rate": 6.571357630986104e-05, - "loss": 0.0525, - "step": 77100 - }, - { - "epoch": 5.044815178279359, - "grad_norm": 0.9195685982704163, - "learning_rate": 6.570485552421509e-05, - "loss": 0.0576, - "step": 77110 - }, - { - "epoch": 5.045469414458619, - "grad_norm": 0.6936109066009521, - "learning_rate": 6.569613420848908e-05, - "loss": 0.0578, - "step": 77120 - }, - { - "epoch": 5.04612365063788, - "grad_norm": 0.9056085348129272, - "learning_rate": 6.568741236297738e-05, - "loss": 0.0642, - "step": 77130 - }, - { - "epoch": 5.046777886817141, - "grad_norm": 0.9640796184539795, - "learning_rate": 6.567868998797438e-05, - "loss": 0.0651, - "step": 77140 - }, - { - "epoch": 5.047432122996402, - "grad_norm": 0.8185075521469116, - "learning_rate": 6.566996708377444e-05, - "loss": 0.0585, - "step": 77150 - }, - { - "epoch": 5.048086359175662, - "grad_norm": 0.7742406129837036, - "learning_rate": 6.566124365067203e-05, - "loss": 0.0685, - "step": 77160 - }, - { - "epoch": 5.048740595354923, - "grad_norm": 0.9033831357955933, - "learning_rate": 6.56525196889616e-05, - "loss": 0.0545, - "step": 77170 - }, - { - "epoch": 5.049394831534184, - "grad_norm": 1.0280879735946655, - "learning_rate": 6.564379519893756e-05, - "loss": 0.0575, - "step": 77180 - }, - { - "epoch": 5.050049067713444, - "grad_norm": 0.8477870225906372, - "learning_rate": 6.56350701808944e-05, - "loss": 0.0596, - "step": 77190 - }, - { - "epoch": 5.050703303892705, - "grad_norm": 0.7754467129707336, - "learning_rate": 6.562634463512663e-05, - "loss": 0.057, - "step": 77200 - }, - { - "epoch": 5.051357540071966, - "grad_norm": 0.9043933749198914, - "learning_rate": 6.561761856192873e-05, - "loss": 0.0679, - "step": 77210 - }, - { - "epoch": 5.052011776251227, - "grad_norm": 0.8377389311790466, - "learning_rate": 6.560889196159525e-05, - "loss": 0.0571, - "step": 77220 - }, - { - "epoch": 5.052666012430487, - "grad_norm": 0.8205317854881287, - "learning_rate": 6.560016483442075e-05, - "loss": 0.0512, - "step": 77230 - }, - { - "epoch": 5.053320248609748, - "grad_norm": 0.6298996806144714, - "learning_rate": 6.559143718069977e-05, - "loss": 0.0653, - "step": 77240 - }, - { - "epoch": 5.053974484789009, - "grad_norm": 1.1496926546096802, - "learning_rate": 6.558270900072687e-05, - "loss": 0.0769, - "step": 77250 - }, - { - "epoch": 5.054628720968269, - "grad_norm": 1.5552153587341309, - "learning_rate": 6.557398029479669e-05, - "loss": 0.0614, - "step": 77260 - }, - { - "epoch": 5.05528295714753, - "grad_norm": 0.9075239300727844, - "learning_rate": 6.556525106320382e-05, - "loss": 0.0637, - "step": 77270 - }, - { - "epoch": 5.055937193326791, - "grad_norm": 0.866381049156189, - "learning_rate": 6.555652130624292e-05, - "loss": 0.0659, - "step": 77280 - }, - { - "epoch": 5.056591429506052, - "grad_norm": 0.8625958561897278, - "learning_rate": 6.554779102420863e-05, - "loss": 0.0547, - "step": 77290 - }, - { - "epoch": 5.057245665685312, - "grad_norm": 0.7577462196350098, - "learning_rate": 6.55390602173956e-05, - "loss": 0.0527, - "step": 77300 - }, - { - "epoch": 5.057899901864573, - "grad_norm": 0.868981659412384, - "learning_rate": 6.553032888609856e-05, - "loss": 0.0541, - "step": 77310 - }, - { - "epoch": 5.058554138043834, - "grad_norm": 1.144863486289978, - "learning_rate": 6.552159703061216e-05, - "loss": 0.0578, - "step": 77320 - }, - { - "epoch": 5.059208374223094, - "grad_norm": 0.9633135795593262, - "learning_rate": 6.551286465123118e-05, - "loss": 0.0614, - "step": 77330 - }, - { - "epoch": 5.059862610402355, - "grad_norm": 0.9470215439796448, - "learning_rate": 6.55041317482503e-05, - "loss": 0.0568, - "step": 77340 - }, - { - "epoch": 5.060516846581616, - "grad_norm": 1.0022468566894531, - "learning_rate": 6.549539832196436e-05, - "loss": 0.0624, - "step": 77350 - }, - { - "epoch": 5.061171082760876, - "grad_norm": 0.7574188113212585, - "learning_rate": 6.548666437266806e-05, - "loss": 0.0468, - "step": 77360 - }, - { - "epoch": 5.061825318940137, - "grad_norm": 0.8999849557876587, - "learning_rate": 6.547792990065622e-05, - "loss": 0.0618, - "step": 77370 - }, - { - "epoch": 5.062479555119398, - "grad_norm": 0.6741798520088196, - "learning_rate": 6.546919490622365e-05, - "loss": 0.0581, - "step": 77380 - }, - { - "epoch": 5.063133791298659, - "grad_norm": 0.9915578365325928, - "learning_rate": 6.546045938966518e-05, - "loss": 0.0584, - "step": 77390 - }, - { - "epoch": 5.063788027477919, - "grad_norm": 0.8000921607017517, - "learning_rate": 6.545172335127568e-05, - "loss": 0.0561, - "step": 77400 - }, - { - "epoch": 5.06444226365718, - "grad_norm": 1.028486728668213, - "learning_rate": 6.544298679134998e-05, - "loss": 0.0495, - "step": 77410 - }, - { - "epoch": 5.065096499836441, - "grad_norm": 0.7525604367256165, - "learning_rate": 6.543424971018298e-05, - "loss": 0.0567, - "step": 77420 - }, - { - "epoch": 5.065750736015701, - "grad_norm": 1.2675774097442627, - "learning_rate": 6.542551210806959e-05, - "loss": 0.0497, - "step": 77430 - }, - { - "epoch": 5.066404972194962, - "grad_norm": 0.8450837731361389, - "learning_rate": 6.541677398530468e-05, - "loss": 0.0589, - "step": 77440 - }, - { - "epoch": 5.067059208374223, - "grad_norm": 0.6929242610931396, - "learning_rate": 6.540803534218322e-05, - "loss": 0.0615, - "step": 77450 - }, - { - "epoch": 5.067713444553484, - "grad_norm": 0.9417129158973694, - "learning_rate": 6.539929617900019e-05, - "loss": 0.0595, - "step": 77460 - }, - { - "epoch": 5.068367680732744, - "grad_norm": 0.944251537322998, - "learning_rate": 6.53905564960505e-05, - "loss": 0.0538, - "step": 77470 - }, - { - "epoch": 5.069021916912005, - "grad_norm": 0.9781545400619507, - "learning_rate": 6.538181629362916e-05, - "loss": 0.0592, - "step": 77480 - }, - { - "epoch": 5.069676153091266, - "grad_norm": 0.9492000937461853, - "learning_rate": 6.537307557203119e-05, - "loss": 0.0644, - "step": 77490 - }, - { - "epoch": 5.070330389270526, - "grad_norm": 0.7973672151565552, - "learning_rate": 6.536433433155161e-05, - "loss": 0.0495, - "step": 77500 - }, - { - "epoch": 5.070984625449787, - "grad_norm": 0.8644813895225525, - "learning_rate": 6.535559257248545e-05, - "loss": 0.0611, - "step": 77510 - }, - { - "epoch": 5.071638861629048, - "grad_norm": 0.9546951651573181, - "learning_rate": 6.534685029512777e-05, - "loss": 0.061, - "step": 77520 - }, - { - "epoch": 5.072293097808309, - "grad_norm": 0.9337143898010254, - "learning_rate": 6.533810749977363e-05, - "loss": 0.0528, - "step": 77530 - }, - { - "epoch": 5.072947333987569, - "grad_norm": 0.8706703186035156, - "learning_rate": 6.532936418671815e-05, - "loss": 0.0517, - "step": 77540 - }, - { - "epoch": 5.07360157016683, - "grad_norm": 0.788536548614502, - "learning_rate": 6.532062035625641e-05, - "loss": 0.0638, - "step": 77550 - }, - { - "epoch": 5.074255806346091, - "grad_norm": 0.8205485343933105, - "learning_rate": 6.531187600868357e-05, - "loss": 0.0554, - "step": 77560 - }, - { - "epoch": 5.074910042525351, - "grad_norm": 0.8340345621109009, - "learning_rate": 6.530313114429475e-05, - "loss": 0.0518, - "step": 77570 - }, - { - "epoch": 5.075564278704612, - "grad_norm": 0.793782651424408, - "learning_rate": 6.529438576338512e-05, - "loss": 0.052, - "step": 77580 - }, - { - "epoch": 5.076218514883873, - "grad_norm": 0.9006875157356262, - "learning_rate": 6.528563986624987e-05, - "loss": 0.0585, - "step": 77590 - }, - { - "epoch": 5.076872751063134, - "grad_norm": 0.9196233749389648, - "learning_rate": 6.527689345318416e-05, - "loss": 0.0536, - "step": 77600 - }, - { - "epoch": 5.077526987242394, - "grad_norm": 0.987467885017395, - "learning_rate": 6.526814652448325e-05, - "loss": 0.0572, - "step": 77610 - }, - { - "epoch": 5.078181223421655, - "grad_norm": 0.994082510471344, - "learning_rate": 6.525939908044236e-05, - "loss": 0.0542, - "step": 77620 - }, - { - "epoch": 5.078835459600916, - "grad_norm": 1.2005406618118286, - "learning_rate": 6.525065112135672e-05, - "loss": 0.0593, - "step": 77630 - }, - { - "epoch": 5.079489695780176, - "grad_norm": 1.0194610357284546, - "learning_rate": 6.52419026475216e-05, - "loss": 0.0574, - "step": 77640 - }, - { - "epoch": 5.080143931959437, - "grad_norm": 0.9802126884460449, - "learning_rate": 6.52331536592323e-05, - "loss": 0.0615, - "step": 77650 - }, - { - "epoch": 5.080798168138698, - "grad_norm": 0.977064311504364, - "learning_rate": 6.522440415678413e-05, - "loss": 0.0562, - "step": 77660 - }, - { - "epoch": 5.081452404317959, - "grad_norm": 0.8457438945770264, - "learning_rate": 6.521565414047237e-05, - "loss": 0.0578, - "step": 77670 - }, - { - "epoch": 5.082106640497219, - "grad_norm": 0.8083365559577942, - "learning_rate": 6.52069036105924e-05, - "loss": 0.0578, - "step": 77680 - }, - { - "epoch": 5.08276087667648, - "grad_norm": 1.0686533451080322, - "learning_rate": 6.519815256743954e-05, - "loss": 0.0556, - "step": 77690 - }, - { - "epoch": 5.083415112855741, - "grad_norm": 0.9897651672363281, - "learning_rate": 6.518940101130916e-05, - "loss": 0.0556, - "step": 77700 - }, - { - "epoch": 5.084069349035001, - "grad_norm": 0.7918063998222351, - "learning_rate": 6.518064894249667e-05, - "loss": 0.0558, - "step": 77710 - }, - { - "epoch": 5.084723585214262, - "grad_norm": 0.7735630869865417, - "learning_rate": 6.517189636129749e-05, - "loss": 0.0557, - "step": 77720 - }, - { - "epoch": 5.085377821393523, - "grad_norm": 0.947536289691925, - "learning_rate": 6.516314326800698e-05, - "loss": 0.0578, - "step": 77730 - }, - { - "epoch": 5.086032057572784, - "grad_norm": 0.74627685546875, - "learning_rate": 6.515438966292062e-05, - "loss": 0.0529, - "step": 77740 - }, - { - "epoch": 5.086686293752044, - "grad_norm": 1.1361474990844727, - "learning_rate": 6.514563554633388e-05, - "loss": 0.0678, - "step": 77750 - }, - { - "epoch": 5.087340529931305, - "grad_norm": 1.06615149974823, - "learning_rate": 6.513688091854224e-05, - "loss": 0.0656, - "step": 77760 - }, - { - "epoch": 5.087994766110566, - "grad_norm": 0.8696459531784058, - "learning_rate": 6.512812577984114e-05, - "loss": 0.0606, - "step": 77770 - }, - { - "epoch": 5.088649002289826, - "grad_norm": 0.7366641163825989, - "learning_rate": 6.511937013052612e-05, - "loss": 0.0488, - "step": 77780 - }, - { - "epoch": 5.089303238469087, - "grad_norm": 0.7504761815071106, - "learning_rate": 6.511061397089271e-05, - "loss": 0.061, - "step": 77790 - }, - { - "epoch": 5.089957474648348, - "grad_norm": 0.8598735332489014, - "learning_rate": 6.510185730123646e-05, - "loss": 0.0606, - "step": 77800 - }, - { - "epoch": 5.090611710827609, - "grad_norm": 0.8309386968612671, - "learning_rate": 6.50931001218529e-05, - "loss": 0.069, - "step": 77810 - }, - { - "epoch": 5.091265947006869, - "grad_norm": 0.9889697432518005, - "learning_rate": 6.508434243303764e-05, - "loss": 0.0667, - "step": 77820 - }, - { - "epoch": 5.09192018318613, - "grad_norm": 0.9573984742164612, - "learning_rate": 6.507558423508629e-05, - "loss": 0.0594, - "step": 77830 - }, - { - "epoch": 5.092574419365391, - "grad_norm": 1.049492359161377, - "learning_rate": 6.50668255282944e-05, - "loss": 0.0538, - "step": 77840 - }, - { - "epoch": 5.093228655544651, - "grad_norm": 0.8158976435661316, - "learning_rate": 6.505806631295765e-05, - "loss": 0.0592, - "step": 77850 - }, - { - "epoch": 5.093882891723912, - "grad_norm": 0.7729289531707764, - "learning_rate": 6.504930658937165e-05, - "loss": 0.0601, - "step": 77860 - }, - { - "epoch": 5.094537127903173, - "grad_norm": 0.7943597435951233, - "learning_rate": 6.50405463578321e-05, - "loss": 0.0674, - "step": 77870 - }, - { - "epoch": 5.095191364082433, - "grad_norm": 1.0894060134887695, - "learning_rate": 6.503178561863466e-05, - "loss": 0.0594, - "step": 77880 - }, - { - "epoch": 5.095845600261694, - "grad_norm": 0.8474149703979492, - "learning_rate": 6.502302437207504e-05, - "loss": 0.059, - "step": 77890 - }, - { - "epoch": 5.096499836440955, - "grad_norm": 0.9915545582771301, - "learning_rate": 6.501426261844894e-05, - "loss": 0.0529, - "step": 77900 - }, - { - "epoch": 5.097154072620216, - "grad_norm": 1.004508137702942, - "learning_rate": 6.500550035805212e-05, - "loss": 0.0504, - "step": 77910 - }, - { - "epoch": 5.097808308799476, - "grad_norm": 0.861196756362915, - "learning_rate": 6.499673759118028e-05, - "loss": 0.0581, - "step": 77920 - }, - { - "epoch": 5.098462544978737, - "grad_norm": 0.7143073081970215, - "learning_rate": 6.498797431812923e-05, - "loss": 0.057, - "step": 77930 - }, - { - "epoch": 5.099116781157998, - "grad_norm": 0.7340794205665588, - "learning_rate": 6.497921053919475e-05, - "loss": 0.0698, - "step": 77940 - }, - { - "epoch": 5.099771017337258, - "grad_norm": 0.691623866558075, - "learning_rate": 6.497044625467263e-05, - "loss": 0.0555, - "step": 77950 - }, - { - "epoch": 5.100425253516519, - "grad_norm": 1.062201738357544, - "learning_rate": 6.496168146485865e-05, - "loss": 0.0621, - "step": 77960 - }, - { - "epoch": 5.10107948969578, - "grad_norm": 0.9496244192123413, - "learning_rate": 6.495291617004873e-05, - "loss": 0.0566, - "step": 77970 - }, - { - "epoch": 5.101733725875041, - "grad_norm": 0.9271469116210938, - "learning_rate": 6.494415037053865e-05, - "loss": 0.0504, - "step": 77980 - }, - { - "epoch": 5.102387962054301, - "grad_norm": 0.8734414577484131, - "learning_rate": 6.493538406662429e-05, - "loss": 0.0591, - "step": 77990 - }, - { - "epoch": 5.103042198233562, - "grad_norm": 0.9724219441413879, - "learning_rate": 6.492661725860157e-05, - "loss": 0.0653, - "step": 78000 - }, - { - "epoch": 5.103696434412823, - "grad_norm": 0.709010124206543, - "learning_rate": 6.491784994676637e-05, - "loss": 0.0516, - "step": 78010 - }, - { - "epoch": 5.104350670592083, - "grad_norm": 0.7753589749336243, - "learning_rate": 6.490908213141461e-05, - "loss": 0.0519, - "step": 78020 - }, - { - "epoch": 5.105004906771344, - "grad_norm": 0.8125455379486084, - "learning_rate": 6.490031381284221e-05, - "loss": 0.0581, - "step": 78030 - }, - { - "epoch": 5.105659142950605, - "grad_norm": 1.1419060230255127, - "learning_rate": 6.489154499134517e-05, - "loss": 0.0563, - "step": 78040 - }, - { - "epoch": 5.106313379129866, - "grad_norm": 0.945682942867279, - "learning_rate": 6.488277566721941e-05, - "loss": 0.0652, - "step": 78050 - }, - { - "epoch": 5.106967615309126, - "grad_norm": 0.9057697653770447, - "learning_rate": 6.487400584076094e-05, - "loss": 0.0589, - "step": 78060 - }, - { - "epoch": 5.107621851488387, - "grad_norm": 0.9359369874000549, - "learning_rate": 6.486523551226577e-05, - "loss": 0.0612, - "step": 78070 - }, - { - "epoch": 5.108276087667648, - "grad_norm": 0.8022433519363403, - "learning_rate": 6.485646468202993e-05, - "loss": 0.0674, - "step": 78080 - }, - { - "epoch": 5.108930323846908, - "grad_norm": 0.8566710948944092, - "learning_rate": 6.484769335034942e-05, - "loss": 0.0553, - "step": 78090 - }, - { - "epoch": 5.109584560026169, - "grad_norm": 0.9492458701133728, - "learning_rate": 6.483892151752034e-05, - "loss": 0.0629, - "step": 78100 - }, - { - "epoch": 5.11023879620543, - "grad_norm": 0.8819661736488342, - "learning_rate": 6.483014918383873e-05, - "loss": 0.0587, - "step": 78110 - }, - { - "epoch": 5.110893032384691, - "grad_norm": 0.9766054749488831, - "learning_rate": 6.482137634960068e-05, - "loss": 0.0615, - "step": 78120 - }, - { - "epoch": 5.111547268563951, - "grad_norm": 0.8624576926231384, - "learning_rate": 6.481260301510233e-05, - "loss": 0.0602, - "step": 78130 - }, - { - "epoch": 5.112201504743212, - "grad_norm": 0.9257481098175049, - "learning_rate": 6.480382918063978e-05, - "loss": 0.0551, - "step": 78140 - }, - { - "epoch": 5.112855740922473, - "grad_norm": 0.9216415882110596, - "learning_rate": 6.479505484650916e-05, - "loss": 0.0538, - "step": 78150 - }, - { - "epoch": 5.113509977101733, - "grad_norm": 0.7884585857391357, - "learning_rate": 6.478628001300664e-05, - "loss": 0.0556, - "step": 78160 - }, - { - "epoch": 5.114164213280994, - "grad_norm": 0.9612773060798645, - "learning_rate": 6.477750468042841e-05, - "loss": 0.0561, - "step": 78170 - }, - { - "epoch": 5.114818449460255, - "grad_norm": 0.7759575247764587, - "learning_rate": 6.476872884907062e-05, - "loss": 0.0633, - "step": 78180 - }, - { - "epoch": 5.115472685639516, - "grad_norm": 0.9810332655906677, - "learning_rate": 6.475995251922949e-05, - "loss": 0.0571, - "step": 78190 - }, - { - "epoch": 5.116126921818776, - "grad_norm": 0.7260820865631104, - "learning_rate": 6.475117569120127e-05, - "loss": 0.0624, - "step": 78200 - }, - { - "epoch": 5.116781157998037, - "grad_norm": 0.998258113861084, - "learning_rate": 6.474239836528219e-05, - "loss": 0.055, - "step": 78210 - }, - { - "epoch": 5.117435394177298, - "grad_norm": 0.7317970395088196, - "learning_rate": 6.473362054176847e-05, - "loss": 0.0594, - "step": 78220 - }, - { - "epoch": 5.118089630356558, - "grad_norm": 1.0319292545318604, - "learning_rate": 6.472484222095645e-05, - "loss": 0.0558, - "step": 78230 - }, - { - "epoch": 5.118743866535819, - "grad_norm": 0.6902409195899963, - "learning_rate": 6.471606340314238e-05, - "loss": 0.0631, - "step": 78240 - }, - { - "epoch": 5.11939810271508, - "grad_norm": 1.0602803230285645, - "learning_rate": 6.470728408862257e-05, - "loss": 0.053, - "step": 78250 - }, - { - "epoch": 5.120052338894341, - "grad_norm": 0.9925687313079834, - "learning_rate": 6.469850427769336e-05, - "loss": 0.0685, - "step": 78260 - }, - { - "epoch": 5.120706575073601, - "grad_norm": 0.8387590050697327, - "learning_rate": 6.468972397065108e-05, - "loss": 0.0516, - "step": 78270 - }, - { - "epoch": 5.121360811252862, - "grad_norm": 0.93202805519104, - "learning_rate": 6.468094316779207e-05, - "loss": 0.0657, - "step": 78280 - }, - { - "epoch": 5.122015047432123, - "grad_norm": 0.8118649125099182, - "learning_rate": 6.467216186941274e-05, - "loss": 0.0504, - "step": 78290 - }, - { - "epoch": 5.122669283611383, - "grad_norm": 0.8819132447242737, - "learning_rate": 6.466338007580948e-05, - "loss": 0.0667, - "step": 78300 - }, - { - "epoch": 5.123323519790644, - "grad_norm": 1.2045010328292847, - "learning_rate": 6.465459778727867e-05, - "loss": 0.068, - "step": 78310 - }, - { - "epoch": 5.123977755969905, - "grad_norm": 0.8089714646339417, - "learning_rate": 6.464581500411675e-05, - "loss": 0.0599, - "step": 78320 - }, - { - "epoch": 5.124631992149165, - "grad_norm": 0.8597386479377747, - "learning_rate": 6.463703172662019e-05, - "loss": 0.0561, - "step": 78330 - }, - { - "epoch": 5.125286228328426, - "grad_norm": 0.7467419505119324, - "learning_rate": 6.46282479550854e-05, - "loss": 0.0557, - "step": 78340 - }, - { - "epoch": 5.125940464507687, - "grad_norm": 0.8646554946899414, - "learning_rate": 6.461946368980888e-05, - "loss": 0.0532, - "step": 78350 - }, - { - "epoch": 5.126594700686948, - "grad_norm": 0.681583821773529, - "learning_rate": 6.461067893108712e-05, - "loss": 0.0543, - "step": 78360 - }, - { - "epoch": 5.127248936866208, - "grad_norm": 0.9374173879623413, - "learning_rate": 6.460189367921663e-05, - "loss": 0.0547, - "step": 78370 - }, - { - "epoch": 5.127903173045469, - "grad_norm": 0.9243113398551941, - "learning_rate": 6.459310793449391e-05, - "loss": 0.0538, - "step": 78380 - }, - { - "epoch": 5.12855740922473, - "grad_norm": 0.8543265461921692, - "learning_rate": 6.458432169721556e-05, - "loss": 0.0603, - "step": 78390 - }, - { - "epoch": 5.12921164540399, - "grad_norm": 0.811305046081543, - "learning_rate": 6.457553496767809e-05, - "loss": 0.0606, - "step": 78400 - }, - { - "epoch": 5.129865881583251, - "grad_norm": 0.9650918245315552, - "learning_rate": 6.456674774617809e-05, - "loss": 0.0528, - "step": 78410 - }, - { - "epoch": 5.130520117762512, - "grad_norm": 0.989138126373291, - "learning_rate": 6.455796003301215e-05, - "loss": 0.0625, - "step": 78420 - }, - { - "epoch": 5.131174353941773, - "grad_norm": 0.9064369201660156, - "learning_rate": 6.45491718284769e-05, - "loss": 0.0566, - "step": 78430 - }, - { - "epoch": 5.131828590121033, - "grad_norm": 0.7815073728561401, - "learning_rate": 6.454038313286891e-05, - "loss": 0.0654, - "step": 78440 - }, - { - "epoch": 5.132482826300294, - "grad_norm": 0.9094436168670654, - "learning_rate": 6.453159394648487e-05, - "loss": 0.0573, - "step": 78450 - }, - { - "epoch": 5.133137062479555, - "grad_norm": 0.8054584860801697, - "learning_rate": 6.452280426962143e-05, - "loss": 0.054, - "step": 78460 - }, - { - "epoch": 5.1337912986588154, - "grad_norm": 0.8743946552276611, - "learning_rate": 6.451401410257525e-05, - "loss": 0.0594, - "step": 78470 - }, - { - "epoch": 5.134445534838076, - "grad_norm": 0.9614243507385254, - "learning_rate": 6.450522344564303e-05, - "loss": 0.0619, - "step": 78480 - }, - { - "epoch": 5.135099771017337, - "grad_norm": 0.9176281094551086, - "learning_rate": 6.449643229912148e-05, - "loss": 0.0582, - "step": 78490 - }, - { - "epoch": 5.135754007196598, - "grad_norm": 0.9308127164840698, - "learning_rate": 6.448764066330733e-05, - "loss": 0.0574, - "step": 78500 - }, - { - "epoch": 5.136408243375858, - "grad_norm": 0.8228217363357544, - "learning_rate": 6.44788485384973e-05, - "loss": 0.0693, - "step": 78510 - }, - { - "epoch": 5.137062479555119, - "grad_norm": 0.9158545136451721, - "learning_rate": 6.447005592498816e-05, - "loss": 0.0499, - "step": 78520 - }, - { - "epoch": 5.13771671573438, - "grad_norm": 0.8017662763595581, - "learning_rate": 6.446126282307669e-05, - "loss": 0.0561, - "step": 78530 - }, - { - "epoch": 5.1383709519136405, - "grad_norm": 0.8243115544319153, - "learning_rate": 6.445246923305966e-05, - "loss": 0.056, - "step": 78540 - }, - { - "epoch": 5.139025188092901, - "grad_norm": 0.7880877256393433, - "learning_rate": 6.44436751552339e-05, - "loss": 0.0569, - "step": 78550 - }, - { - "epoch": 5.139679424272162, - "grad_norm": 0.9965237379074097, - "learning_rate": 6.443488058989624e-05, - "loss": 0.0646, - "step": 78560 - }, - { - "epoch": 5.140333660451423, - "grad_norm": 0.9175546765327454, - "learning_rate": 6.442608553734348e-05, - "loss": 0.0679, - "step": 78570 - }, - { - "epoch": 5.1409878966306835, - "grad_norm": 0.8487790822982788, - "learning_rate": 6.441728999787251e-05, - "loss": 0.0511, - "step": 78580 - }, - { - "epoch": 5.141642132809944, - "grad_norm": 0.9188631772994995, - "learning_rate": 6.44084939717802e-05, - "loss": 0.0572, - "step": 78590 - }, - { - "epoch": 5.142296368989205, - "grad_norm": 0.8880249857902527, - "learning_rate": 6.439969745936341e-05, - "loss": 0.0729, - "step": 78600 - }, - { - "epoch": 5.1429506051684655, - "grad_norm": 0.7750711441040039, - "learning_rate": 6.439090046091907e-05, - "loss": 0.062, - "step": 78610 - }, - { - "epoch": 5.143604841347726, - "grad_norm": 0.8902852535247803, - "learning_rate": 6.438210297674411e-05, - "loss": 0.0582, - "step": 78620 - }, - { - "epoch": 5.144259077526987, - "grad_norm": 0.9351129531860352, - "learning_rate": 6.437330500713545e-05, - "loss": 0.0535, - "step": 78630 - }, - { - "epoch": 5.144913313706248, - "grad_norm": 0.8082217574119568, - "learning_rate": 6.436450655239004e-05, - "loss": 0.0601, - "step": 78640 - }, - { - "epoch": 5.1455675498855085, - "grad_norm": 0.9158061146736145, - "learning_rate": 6.435570761280487e-05, - "loss": 0.0562, - "step": 78650 - }, - { - "epoch": 5.146221786064769, - "grad_norm": 0.8845149278640747, - "learning_rate": 6.434690818867693e-05, - "loss": 0.0515, - "step": 78660 - }, - { - "epoch": 5.14687602224403, - "grad_norm": 0.9246596693992615, - "learning_rate": 6.43381082803032e-05, - "loss": 0.0547, - "step": 78670 - }, - { - "epoch": 5.1475302584232905, - "grad_norm": 0.8602831363677979, - "learning_rate": 6.432930788798072e-05, - "loss": 0.0475, - "step": 78680 - }, - { - "epoch": 5.1481844946025515, - "grad_norm": 0.7768710255622864, - "learning_rate": 6.432050701200651e-05, - "loss": 0.0519, - "step": 78690 - }, - { - "epoch": 5.148838730781812, - "grad_norm": 0.8790880441665649, - "learning_rate": 6.431170565267764e-05, - "loss": 0.0609, - "step": 78700 - }, - { - "epoch": 5.149492966961073, - "grad_norm": 1.016045331954956, - "learning_rate": 6.430290381029116e-05, - "loss": 0.0583, - "step": 78710 - }, - { - "epoch": 5.1501472031403335, - "grad_norm": 1.148781418800354, - "learning_rate": 6.429410148514419e-05, - "loss": 0.0667, - "step": 78720 - }, - { - "epoch": 5.150801439319594, - "grad_norm": 0.873008668422699, - "learning_rate": 6.428529867753377e-05, - "loss": 0.0617, - "step": 78730 - }, - { - "epoch": 5.151455675498855, - "grad_norm": 0.7208111882209778, - "learning_rate": 6.427649538775708e-05, - "loss": 0.0633, - "step": 78740 - }, - { - "epoch": 5.1521099116781155, - "grad_norm": 1.1463489532470703, - "learning_rate": 6.426769161611121e-05, - "loss": 0.056, - "step": 78750 - }, - { - "epoch": 5.1527641478573765, - "grad_norm": 0.7651934623718262, - "learning_rate": 6.425888736289336e-05, - "loss": 0.0522, - "step": 78760 - }, - { - "epoch": 5.153418384036637, - "grad_norm": 0.9302027821540833, - "learning_rate": 6.425008262840064e-05, - "loss": 0.063, - "step": 78770 - }, - { - "epoch": 5.1540726202158975, - "grad_norm": 0.8309484720230103, - "learning_rate": 6.424127741293027e-05, - "loss": 0.0663, - "step": 78780 - }, - { - "epoch": 5.1547268563951585, - "grad_norm": 0.8712273240089417, - "learning_rate": 6.423247171677943e-05, - "loss": 0.0648, - "step": 78790 - }, - { - "epoch": 5.1553810925744195, - "grad_norm": 0.9975765943527222, - "learning_rate": 6.422366554024536e-05, - "loss": 0.0636, - "step": 78800 - }, - { - "epoch": 5.15603532875368, - "grad_norm": 1.0808351039886475, - "learning_rate": 6.421485888362526e-05, - "loss": 0.0615, - "step": 78810 - }, - { - "epoch": 5.1566895649329405, - "grad_norm": 0.7145622968673706, - "learning_rate": 6.42060517472164e-05, - "loss": 0.0601, - "step": 78820 - }, - { - "epoch": 5.1573438011122015, - "grad_norm": 0.8157432675361633, - "learning_rate": 6.419724413131604e-05, - "loss": 0.0544, - "step": 78830 - }, - { - "epoch": 5.1579980372914624, - "grad_norm": 0.8889109492301941, - "learning_rate": 6.418843603622144e-05, - "loss": 0.0558, - "step": 78840 - }, - { - "epoch": 5.1586522734707225, - "grad_norm": 0.8644919991493225, - "learning_rate": 6.417962746222993e-05, - "loss": 0.057, - "step": 78850 - }, - { - "epoch": 5.1593065096499835, - "grad_norm": 0.9862198233604431, - "learning_rate": 6.41708184096388e-05, - "loss": 0.0577, - "step": 78860 - }, - { - "epoch": 5.1599607458292445, - "grad_norm": 0.806100606918335, - "learning_rate": 6.41620088787454e-05, - "loss": 0.0588, - "step": 78870 - }, - { - "epoch": 5.160614982008505, - "grad_norm": 0.9563096165657043, - "learning_rate": 6.415319886984703e-05, - "loss": 0.0603, - "step": 78880 - }, - { - "epoch": 5.1612692181877655, - "grad_norm": 0.9669555425643921, - "learning_rate": 6.41443883832411e-05, - "loss": 0.0509, - "step": 78890 - }, - { - "epoch": 5.1619234543670265, - "grad_norm": 0.8312886357307434, - "learning_rate": 6.413557741922495e-05, - "loss": 0.056, - "step": 78900 - }, - { - "epoch": 5.1625776905462875, - "grad_norm": 0.8751150369644165, - "learning_rate": 6.412676597809602e-05, - "loss": 0.0561, - "step": 78910 - }, - { - "epoch": 5.1632319267255475, - "grad_norm": 0.9386425018310547, - "learning_rate": 6.411795406015166e-05, - "loss": 0.0561, - "step": 78920 - }, - { - "epoch": 5.1638861629048085, - "grad_norm": 1.336055874824524, - "learning_rate": 6.410914166568933e-05, - "loss": 0.0501, - "step": 78930 - }, - { - "epoch": 5.1645403990840695, - "grad_norm": 0.7663227915763855, - "learning_rate": 6.410032879500647e-05, - "loss": 0.0583, - "step": 78940 - }, - { - "epoch": 5.1651946352633304, - "grad_norm": 0.8238208889961243, - "learning_rate": 6.409151544840055e-05, - "loss": 0.0637, - "step": 78950 - }, - { - "epoch": 5.1658488714425905, - "grad_norm": 0.8930804133415222, - "learning_rate": 6.4082701626169e-05, - "loss": 0.0519, - "step": 78960 - }, - { - "epoch": 5.1665031076218515, - "grad_norm": 0.7918708324432373, - "learning_rate": 6.407388732860935e-05, - "loss": 0.0553, - "step": 78970 - }, - { - "epoch": 5.1671573438011125, - "grad_norm": 0.7444891333580017, - "learning_rate": 6.40650725560191e-05, - "loss": 0.0473, - "step": 78980 - }, - { - "epoch": 5.1678115799803725, - "grad_norm": 0.9502220749855042, - "learning_rate": 6.405625730869575e-05, - "loss": 0.057, - "step": 78990 - }, - { - "epoch": 5.1684658161596335, - "grad_norm": 0.8537470698356628, - "learning_rate": 6.404744158693685e-05, - "loss": 0.0563, - "step": 79000 - }, - { - "epoch": 5.1691200523388945, - "grad_norm": 0.902565062046051, - "learning_rate": 6.403862539103998e-05, - "loss": 0.0536, - "step": 79010 - }, - { - "epoch": 5.1697742885181555, - "grad_norm": 0.9885287284851074, - "learning_rate": 6.402980872130266e-05, - "loss": 0.0589, - "step": 79020 - }, - { - "epoch": 5.1704285246974155, - "grad_norm": 0.8051989674568176, - "learning_rate": 6.402099157802252e-05, - "loss": 0.0661, - "step": 79030 - }, - { - "epoch": 5.1710827608766765, - "grad_norm": 1.0724586248397827, - "learning_rate": 6.401217396149713e-05, - "loss": 0.0528, - "step": 79040 - }, - { - "epoch": 5.1717369970559375, - "grad_norm": 0.9905614256858826, - "learning_rate": 6.400335587202413e-05, - "loss": 0.0511, - "step": 79050 - }, - { - "epoch": 5.172391233235198, - "grad_norm": 0.8915742635726929, - "learning_rate": 6.399453730990113e-05, - "loss": 0.0595, - "step": 79060 - }, - { - "epoch": 5.1730454694144585, - "grad_norm": 0.8688280582427979, - "learning_rate": 6.398571827542581e-05, - "loss": 0.0592, - "step": 79070 - }, - { - "epoch": 5.1736997055937195, - "grad_norm": 0.9881342053413391, - "learning_rate": 6.39768987688958e-05, - "loss": 0.0612, - "step": 79080 - }, - { - "epoch": 5.1743539417729805, - "grad_norm": 0.8909708261489868, - "learning_rate": 6.396807879060882e-05, - "loss": 0.0552, - "step": 79090 - }, - { - "epoch": 5.1750081779522406, - "grad_norm": 1.0886934995651245, - "learning_rate": 6.395925834086254e-05, - "loss": 0.0531, - "step": 79100 - }, - { - "epoch": 5.1756624141315015, - "grad_norm": 0.8756330609321594, - "learning_rate": 6.395043741995468e-05, - "loss": 0.0536, - "step": 79110 - }, - { - "epoch": 5.1763166503107625, - "grad_norm": 0.8764585852622986, - "learning_rate": 6.394161602818296e-05, - "loss": 0.0525, - "step": 79120 - }, - { - "epoch": 5.176970886490023, - "grad_norm": 0.8468575477600098, - "learning_rate": 6.393279416584516e-05, - "loss": 0.0644, - "step": 79130 - }, - { - "epoch": 5.1776251226692835, - "grad_norm": 0.982122540473938, - "learning_rate": 6.392397183323901e-05, - "loss": 0.0528, - "step": 79140 - }, - { - "epoch": 5.1782793588485445, - "grad_norm": 0.9616347551345825, - "learning_rate": 6.391514903066228e-05, - "loss": 0.0526, - "step": 79150 - }, - { - "epoch": 5.1789335950278055, - "grad_norm": 0.8660449981689453, - "learning_rate": 6.390632575841278e-05, - "loss": 0.0643, - "step": 79160 - }, - { - "epoch": 5.179587831207066, - "grad_norm": 0.7441914081573486, - "learning_rate": 6.389750201678833e-05, - "loss": 0.0664, - "step": 79170 - }, - { - "epoch": 5.1802420673863265, - "grad_norm": 0.796704888343811, - "learning_rate": 6.388867780608672e-05, - "loss": 0.0534, - "step": 79180 - }, - { - "epoch": 5.1808963035655875, - "grad_norm": 0.6812049746513367, - "learning_rate": 6.387985312660582e-05, - "loss": 0.0643, - "step": 79190 - }, - { - "epoch": 5.181550539744848, - "grad_norm": 0.8211969137191772, - "learning_rate": 6.387102797864349e-05, - "loss": 0.0599, - "step": 79200 - }, - { - "epoch": 5.1822047759241086, - "grad_norm": 1.1284834146499634, - "learning_rate": 6.386220236249757e-05, - "loss": 0.0679, - "step": 79210 - }, - { - "epoch": 5.1828590121033695, - "grad_norm": 0.8889693021774292, - "learning_rate": 6.385337627846596e-05, - "loss": 0.0503, - "step": 79220 - }, - { - "epoch": 5.18351324828263, - "grad_norm": 0.8759040236473083, - "learning_rate": 6.384454972684658e-05, - "loss": 0.0576, - "step": 79230 - }, - { - "epoch": 5.184167484461891, - "grad_norm": 0.808881938457489, - "learning_rate": 6.383572270793733e-05, - "loss": 0.0624, - "step": 79240 - }, - { - "epoch": 5.1848217206411515, - "grad_norm": 0.8137565851211548, - "learning_rate": 6.382689522203616e-05, - "loss": 0.0621, - "step": 79250 - }, - { - "epoch": 5.1854759568204125, - "grad_norm": 1.0435824394226074, - "learning_rate": 6.381806726944101e-05, - "loss": 0.0673, - "step": 79260 - }, - { - "epoch": 5.186130192999673, - "grad_norm": 0.6479164361953735, - "learning_rate": 6.380923885044985e-05, - "loss": 0.0537, - "step": 79270 - }, - { - "epoch": 5.186784429178934, - "grad_norm": 0.784147322177887, - "learning_rate": 6.380040996536067e-05, - "loss": 0.0504, - "step": 79280 - }, - { - "epoch": 5.1874386653581945, - "grad_norm": 0.8886092901229858, - "learning_rate": 6.379158061447145e-05, - "loss": 0.0666, - "step": 79290 - }, - { - "epoch": 5.188092901537455, - "grad_norm": 0.8956160545349121, - "learning_rate": 6.378275079808022e-05, - "loss": 0.0645, - "step": 79300 - }, - { - "epoch": 5.188747137716716, - "grad_norm": 0.7196115851402283, - "learning_rate": 6.3773920516485e-05, - "loss": 0.0659, - "step": 79310 - }, - { - "epoch": 5.1894013738959766, - "grad_norm": 0.9113817811012268, - "learning_rate": 6.376508976998386e-05, - "loss": 0.0635, - "step": 79320 - }, - { - "epoch": 5.1900556100752375, - "grad_norm": 1.1962367296218872, - "learning_rate": 6.375625855887481e-05, - "loss": 0.0569, - "step": 79330 - }, - { - "epoch": 5.190709846254498, - "grad_norm": 0.8185920715332031, - "learning_rate": 6.374742688345598e-05, - "loss": 0.0622, - "step": 79340 - }, - { - "epoch": 5.191364082433759, - "grad_norm": 0.9433519244194031, - "learning_rate": 6.373859474402542e-05, - "loss": 0.0622, - "step": 79350 - }, - { - "epoch": 5.1920183186130195, - "grad_norm": 0.8798799514770508, - "learning_rate": 6.372976214088127e-05, - "loss": 0.0532, - "step": 79360 - }, - { - "epoch": 5.19267255479228, - "grad_norm": 0.9233197569847107, - "learning_rate": 6.372092907432163e-05, - "loss": 0.0551, - "step": 79370 - }, - { - "epoch": 5.193326790971541, - "grad_norm": 1.0525169372558594, - "learning_rate": 6.371209554464465e-05, - "loss": 0.0567, - "step": 79380 - }, - { - "epoch": 5.193981027150802, - "grad_norm": 0.9267981648445129, - "learning_rate": 6.37032615521485e-05, - "loss": 0.0595, - "step": 79390 - }, - { - "epoch": 5.1946352633300625, - "grad_norm": 0.8152212500572205, - "learning_rate": 6.369442709713132e-05, - "loss": 0.0625, - "step": 79400 - }, - { - "epoch": 5.195289499509323, - "grad_norm": 0.7746557593345642, - "learning_rate": 6.36855921798913e-05, - "loss": 0.0582, - "step": 79410 - }, - { - "epoch": 5.195943735688584, - "grad_norm": 0.8332052826881409, - "learning_rate": 6.367675680072668e-05, - "loss": 0.061, - "step": 79420 - }, - { - "epoch": 5.196597971867845, - "grad_norm": 0.7936050295829773, - "learning_rate": 6.366792095993563e-05, - "loss": 0.0552, - "step": 79430 - }, - { - "epoch": 5.197252208047105, - "grad_norm": 0.8946577906608582, - "learning_rate": 6.365908465781641e-05, - "loss": 0.063, - "step": 79440 - }, - { - "epoch": 5.197906444226366, - "grad_norm": 1.0016376972198486, - "learning_rate": 6.365024789466725e-05, - "loss": 0.0655, - "step": 79450 - }, - { - "epoch": 5.198560680405627, - "grad_norm": 0.8920219540596008, - "learning_rate": 6.364141067078645e-05, - "loss": 0.0636, - "step": 79460 - }, - { - "epoch": 5.1992149165848875, - "grad_norm": 0.9238928556442261, - "learning_rate": 6.363257298647224e-05, - "loss": 0.0572, - "step": 79470 - }, - { - "epoch": 5.199869152764148, - "grad_norm": 0.8631906509399414, - "learning_rate": 6.362373484202294e-05, - "loss": 0.0568, - "step": 79480 - }, - { - "epoch": 5.200523388943409, - "grad_norm": 0.8432829976081848, - "learning_rate": 6.361489623773686e-05, - "loss": 0.0616, - "step": 79490 - }, - { - "epoch": 5.20117762512267, - "grad_norm": 0.8368816375732422, - "learning_rate": 6.360605717391235e-05, - "loss": 0.0518, - "step": 79500 - }, - { - "epoch": 5.20183186130193, - "grad_norm": 0.8626534342765808, - "learning_rate": 6.35972176508477e-05, - "loss": 0.0623, - "step": 79510 - }, - { - "epoch": 5.202486097481191, - "grad_norm": 0.7735602259635925, - "learning_rate": 6.35883776688413e-05, - "loss": 0.0494, - "step": 79520 - }, - { - "epoch": 5.203140333660452, - "grad_norm": 0.8893035650253296, - "learning_rate": 6.357953722819151e-05, - "loss": 0.0573, - "step": 79530 - }, - { - "epoch": 5.203794569839713, - "grad_norm": 0.8206644058227539, - "learning_rate": 6.357069632919674e-05, - "loss": 0.0615, - "step": 79540 - }, - { - "epoch": 5.204448806018973, - "grad_norm": 1.0286612510681152, - "learning_rate": 6.356185497215537e-05, - "loss": 0.0694, - "step": 79550 - }, - { - "epoch": 5.205103042198234, - "grad_norm": 1.0252892971038818, - "learning_rate": 6.355301315736583e-05, - "loss": 0.0615, - "step": 79560 - }, - { - "epoch": 5.205757278377495, - "grad_norm": 0.8636437058448792, - "learning_rate": 6.354417088512655e-05, - "loss": 0.0534, - "step": 79570 - }, - { - "epoch": 5.206411514556755, - "grad_norm": 0.9313245415687561, - "learning_rate": 6.353532815573599e-05, - "loss": 0.054, - "step": 79580 - }, - { - "epoch": 5.207065750736016, - "grad_norm": 0.9524334073066711, - "learning_rate": 6.35264849694926e-05, - "loss": 0.0564, - "step": 79590 - }, - { - "epoch": 5.207719986915277, - "grad_norm": 0.7033629417419434, - "learning_rate": 6.351764132669486e-05, - "loss": 0.0526, - "step": 79600 - }, - { - "epoch": 5.208374223094538, - "grad_norm": 0.8188180923461914, - "learning_rate": 6.350879722764129e-05, - "loss": 0.0498, - "step": 79610 - }, - { - "epoch": 5.209028459273798, - "grad_norm": 0.8515734672546387, - "learning_rate": 6.349995267263038e-05, - "loss": 0.0608, - "step": 79620 - }, - { - "epoch": 5.209682695453059, - "grad_norm": 0.7757560014724731, - "learning_rate": 6.349110766196065e-05, - "loss": 0.0531, - "step": 79630 - }, - { - "epoch": 5.21033693163232, - "grad_norm": 0.9493059515953064, - "learning_rate": 6.348226219593066e-05, - "loss": 0.0522, - "step": 79640 - }, - { - "epoch": 5.21099116781158, - "grad_norm": 0.7367783188819885, - "learning_rate": 6.347341627483897e-05, - "loss": 0.0652, - "step": 79650 - }, - { - "epoch": 5.211645403990841, - "grad_norm": 0.8916527628898621, - "learning_rate": 6.346456989898415e-05, - "loss": 0.0641, - "step": 79660 - }, - { - "epoch": 5.212299640170102, - "grad_norm": 0.9219633936882019, - "learning_rate": 6.345572306866477e-05, - "loss": 0.0557, - "step": 79670 - }, - { - "epoch": 5.212953876349362, - "grad_norm": 0.934571385383606, - "learning_rate": 6.344687578417945e-05, - "loss": 0.0571, - "step": 79680 - }, - { - "epoch": 5.213608112528623, - "grad_norm": 0.8010873198509216, - "learning_rate": 6.343802804582681e-05, - "loss": 0.0538, - "step": 79690 - }, - { - "epoch": 5.214262348707884, - "grad_norm": 1.0014433860778809, - "learning_rate": 6.342917985390548e-05, - "loss": 0.0588, - "step": 79700 - }, - { - "epoch": 5.214916584887145, - "grad_norm": 0.7398951649665833, - "learning_rate": 6.342033120871411e-05, - "loss": 0.0599, - "step": 79710 - }, - { - "epoch": 5.215570821066405, - "grad_norm": 0.9867905974388123, - "learning_rate": 6.341148211055138e-05, - "loss": 0.0536, - "step": 79720 - }, - { - "epoch": 5.216225057245666, - "grad_norm": 0.8380576372146606, - "learning_rate": 6.340263255971594e-05, - "loss": 0.0582, - "step": 79730 - }, - { - "epoch": 5.216879293424927, - "grad_norm": 0.7339340448379517, - "learning_rate": 6.33937825565065e-05, - "loss": 0.0551, - "step": 79740 - }, - { - "epoch": 5.217533529604187, - "grad_norm": 0.9456587433815002, - "learning_rate": 6.338493210122177e-05, - "loss": 0.0566, - "step": 79750 - }, - { - "epoch": 5.218187765783448, - "grad_norm": 0.8973842859268188, - "learning_rate": 6.33760811941605e-05, - "loss": 0.0538, - "step": 79760 - }, - { - "epoch": 5.218842001962709, - "grad_norm": 1.1991941928863525, - "learning_rate": 6.336722983562138e-05, - "loss": 0.0595, - "step": 79770 - }, - { - "epoch": 5.21949623814197, - "grad_norm": 0.8048141598701477, - "learning_rate": 6.335837802590322e-05, - "loss": 0.0571, - "step": 79780 - }, - { - "epoch": 5.22015047432123, - "grad_norm": 0.7296687960624695, - "learning_rate": 6.334952576530475e-05, - "loss": 0.0605, - "step": 79790 - }, - { - "epoch": 5.220804710500491, - "grad_norm": 1.0618641376495361, - "learning_rate": 6.334067305412479e-05, - "loss": 0.0621, - "step": 79800 - }, - { - "epoch": 5.221458946679752, - "grad_norm": 1.0293445587158203, - "learning_rate": 6.333181989266213e-05, - "loss": 0.0624, - "step": 79810 - }, - { - "epoch": 5.222113182859012, - "grad_norm": 0.7226553559303284, - "learning_rate": 6.332296628121557e-05, - "loss": 0.0597, - "step": 79820 - }, - { - "epoch": 5.222767419038273, - "grad_norm": 0.8370351195335388, - "learning_rate": 6.331411222008397e-05, - "loss": 0.0536, - "step": 79830 - }, - { - "epoch": 5.223421655217534, - "grad_norm": 0.97150719165802, - "learning_rate": 6.330525770956615e-05, - "loss": 0.0544, - "step": 79840 - }, - { - "epoch": 5.224075891396795, - "grad_norm": 0.9659034013748169, - "learning_rate": 6.3296402749961e-05, - "loss": 0.0608, - "step": 79850 - }, - { - "epoch": 5.224730127576055, - "grad_norm": 0.9073391556739807, - "learning_rate": 6.328754734156737e-05, - "loss": 0.0587, - "step": 79860 - }, - { - "epoch": 5.225384363755316, - "grad_norm": 0.9242889881134033, - "learning_rate": 6.327869148468418e-05, - "loss": 0.0547, - "step": 79870 - }, - { - "epoch": 5.226038599934577, - "grad_norm": 0.8243046998977661, - "learning_rate": 6.326983517961033e-05, - "loss": 0.06, - "step": 79880 - }, - { - "epoch": 5.226692836113837, - "grad_norm": 0.8463775515556335, - "learning_rate": 6.326097842664473e-05, - "loss": 0.0539, - "step": 79890 - }, - { - "epoch": 5.227347072293098, - "grad_norm": 0.8504155278205872, - "learning_rate": 6.325212122608635e-05, - "loss": 0.0629, - "step": 79900 - }, - { - "epoch": 5.228001308472359, - "grad_norm": 0.6780961751937866, - "learning_rate": 6.324326357823413e-05, - "loss": 0.0494, - "step": 79910 - }, - { - "epoch": 5.22865554465162, - "grad_norm": 0.8905619978904724, - "learning_rate": 6.3234405483387e-05, - "loss": 0.0537, - "step": 79920 - }, - { - "epoch": 5.22930978083088, - "grad_norm": 0.8399018049240112, - "learning_rate": 6.3225546941844e-05, - "loss": 0.0542, - "step": 79930 - }, - { - "epoch": 5.229964017010141, - "grad_norm": 1.064653992652893, - "learning_rate": 6.32166879539041e-05, - "loss": 0.0607, - "step": 79940 - }, - { - "epoch": 5.230618253189402, - "grad_norm": 0.7720884084701538, - "learning_rate": 6.320782851986631e-05, - "loss": 0.047, - "step": 79950 - }, - { - "epoch": 5.231272489368662, - "grad_norm": 0.7760630249977112, - "learning_rate": 6.319896864002968e-05, - "loss": 0.0611, - "step": 79960 - }, - { - "epoch": 5.231926725547923, - "grad_norm": 0.9401276111602783, - "learning_rate": 6.319010831469324e-05, - "loss": 0.0509, - "step": 79970 - }, - { - "epoch": 5.232580961727184, - "grad_norm": 0.865756630897522, - "learning_rate": 6.318124754415605e-05, - "loss": 0.0533, - "step": 79980 - }, - { - "epoch": 5.233235197906445, - "grad_norm": 0.7279103994369507, - "learning_rate": 6.317238632871718e-05, - "loss": 0.0492, - "step": 79990 - }, - { - "epoch": 5.233889434085705, - "grad_norm": 0.7956836223602295, - "learning_rate": 6.316352466867574e-05, - "loss": 0.0549, - "step": 80000 + "epoch": 8.372652927706902, + "eval_loss": 0.02760984484014828, + "eval_runtime": 9.3035, + "eval_samples_per_second": 110.066, + "eval_steps_per_second": 1.72, + "step": 32000 } ], "logging_steps": 10, - "max_steps": 180000, + "max_steps": 40000, "num_input_tokens_seen": 0, - "num_train_epochs": 12, - "save_steps": 10000, + "num_train_epochs": 11, + "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { @@ -56028,7 +22684,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 16, + "train_batch_size": 8, "trial_name": null, "trial_params": null }