{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9987519068090416, "eval_steps": 500, "global_step": 2703, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001109416169740674, "grad_norm": 0.06963343173265457, "learning_rate": 1.107011070110701e-06, "loss": 0.5928, "step": 1 }, { "epoch": 0.002218832339481348, "grad_norm": 0.08431016653776169, "learning_rate": 2.214022140221402e-06, "loss": 0.7915, "step": 2 }, { "epoch": 0.003328248509222022, "grad_norm": 0.06818119436502457, "learning_rate": 3.321033210332103e-06, "loss": 0.6594, "step": 3 }, { "epoch": 0.004437664678962696, "grad_norm": 0.08563707023859024, "learning_rate": 4.428044280442804e-06, "loss": 0.5879, "step": 4 }, { "epoch": 0.00554708084870337, "grad_norm": 0.053242169320583344, "learning_rate": 5.535055350553505e-06, "loss": 0.6156, "step": 5 }, { "epoch": 0.006656497018444044, "grad_norm": 0.06964253634214401, "learning_rate": 6.642066420664206e-06, "loss": 0.7194, "step": 6 }, { "epoch": 0.007765913188184718, "grad_norm": 0.07358719408512115, "learning_rate": 7.749077490774907e-06, "loss": 0.6253, "step": 7 }, { "epoch": 0.008875329357925392, "grad_norm": 0.04413611814379692, "learning_rate": 8.856088560885607e-06, "loss": 0.5087, "step": 8 }, { "epoch": 0.009984745527666065, "grad_norm": 0.06562013179063797, "learning_rate": 9.96309963099631e-06, "loss": 0.7106, "step": 9 }, { "epoch": 0.01109416169740674, "grad_norm": 0.06969469785690308, "learning_rate": 1.107011070110701e-05, "loss": 0.5231, "step": 10 }, { "epoch": 0.012203577867147413, "grad_norm": 0.06136368215084076, "learning_rate": 1.2177121771217711e-05, "loss": 0.5043, "step": 11 }, { "epoch": 0.013312994036888088, "grad_norm": 0.22236867249011993, "learning_rate": 1.3284132841328412e-05, "loss": 1.1332, "step": 12 }, { "epoch": 0.014422410206628761, "grad_norm": 0.05163656920194626, "learning_rate": 1.4391143911439114e-05, "loss": 0.6196, "step": 13 }, { "epoch": 0.015531826376369436, "grad_norm": 0.07425156235694885, "learning_rate": 1.5498154981549814e-05, "loss": 0.6824, "step": 14 }, { "epoch": 0.01664124254611011, "grad_norm": 0.10796231776475906, "learning_rate": 1.6605166051660514e-05, "loss": 0.6668, "step": 15 }, { "epoch": 0.017750658715850784, "grad_norm": 0.08407709002494812, "learning_rate": 1.7712177121771215e-05, "loss": 0.6919, "step": 16 }, { "epoch": 0.01886007488559146, "grad_norm": 0.09712005406618118, "learning_rate": 1.8819188191881916e-05, "loss": 0.6337, "step": 17 }, { "epoch": 0.01996949105533213, "grad_norm": 0.09893519431352615, "learning_rate": 1.992619926199262e-05, "loss": 0.5787, "step": 18 }, { "epoch": 0.021078907225072805, "grad_norm": 0.06742904335260391, "learning_rate": 2.1033210332103317e-05, "loss": 0.5236, "step": 19 }, { "epoch": 0.02218832339481348, "grad_norm": 0.09684479981660843, "learning_rate": 2.214022140221402e-05, "loss": 0.8853, "step": 20 }, { "epoch": 0.023297739564554155, "grad_norm": 0.07542835175991058, "learning_rate": 2.3247232472324722e-05, "loss": 0.5122, "step": 21 }, { "epoch": 0.024407155734294826, "grad_norm": 0.10576613247394562, "learning_rate": 2.4354243542435423e-05, "loss": 0.5764, "step": 22 }, { "epoch": 0.0255165719040355, "grad_norm": 0.09776762127876282, "learning_rate": 2.5461254612546123e-05, "loss": 0.5908, "step": 23 }, { "epoch": 0.026625988073776176, "grad_norm": 0.11171472072601318, "learning_rate": 2.6568265682656824e-05, "loss": 0.6293, "step": 24 }, { "epoch": 0.02773540424351685, "grad_norm": 0.07923085242509842, "learning_rate": 2.7675276752767525e-05, "loss": 0.5343, "step": 25 }, { "epoch": 0.028844820413257522, "grad_norm": 0.13260161876678467, "learning_rate": 2.878228782287823e-05, "loss": 0.6485, "step": 26 }, { "epoch": 0.029954236582998197, "grad_norm": 0.08974039554595947, "learning_rate": 2.9889298892988926e-05, "loss": 0.5814, "step": 27 }, { "epoch": 0.031063652752738872, "grad_norm": 0.08319084346294403, "learning_rate": 3.099630996309963e-05, "loss": 0.5042, "step": 28 }, { "epoch": 0.03217306892247954, "grad_norm": 0.21256183087825775, "learning_rate": 3.2103321033210324e-05, "loss": 0.7798, "step": 29 }, { "epoch": 0.03328248509222022, "grad_norm": 0.15112732350826263, "learning_rate": 3.321033210332103e-05, "loss": 0.6845, "step": 30 }, { "epoch": 0.03439190126196089, "grad_norm": 0.09808047860860825, "learning_rate": 3.431734317343173e-05, "loss": 0.631, "step": 31 }, { "epoch": 0.03550131743170157, "grad_norm": 0.12207363545894623, "learning_rate": 3.542435424354243e-05, "loss": 0.6708, "step": 32 }, { "epoch": 0.03661073360144224, "grad_norm": 0.11131646484136581, "learning_rate": 3.6531365313653134e-05, "loss": 0.4812, "step": 33 }, { "epoch": 0.03772014977118292, "grad_norm": 0.15473362803459167, "learning_rate": 3.763837638376383e-05, "loss": 0.5202, "step": 34 }, { "epoch": 0.03882956594092359, "grad_norm": 0.1214248314499855, "learning_rate": 3.8745387453874535e-05, "loss": 0.5629, "step": 35 }, { "epoch": 0.03993898211066426, "grad_norm": 0.12858295440673828, "learning_rate": 3.985239852398524e-05, "loss": 0.6535, "step": 36 }, { "epoch": 0.041048398280404935, "grad_norm": 0.23708263039588928, "learning_rate": 4.0959409594095944e-05, "loss": 0.7139, "step": 37 }, { "epoch": 0.04215781445014561, "grad_norm": 0.11044642329216003, "learning_rate": 4.2066420664206634e-05, "loss": 0.5751, "step": 38 }, { "epoch": 0.043267230619886285, "grad_norm": 0.23254919052124023, "learning_rate": 4.317343173431734e-05, "loss": 0.4629, "step": 39 }, { "epoch": 0.04437664678962696, "grad_norm": 0.16974490880966187, "learning_rate": 4.428044280442804e-05, "loss": 0.7469, "step": 40 }, { "epoch": 0.045486062959367635, "grad_norm": 0.1873021423816681, "learning_rate": 4.538745387453874e-05, "loss": 0.6589, "step": 41 }, { "epoch": 0.04659547912910831, "grad_norm": 0.17700991034507751, "learning_rate": 4.6494464944649444e-05, "loss": 0.7062, "step": 42 }, { "epoch": 0.04770489529884898, "grad_norm": 0.20159931480884552, "learning_rate": 4.760147601476014e-05, "loss": 0.6955, "step": 43 }, { "epoch": 0.04881431146858965, "grad_norm": 0.18304705619812012, "learning_rate": 4.8708487084870845e-05, "loss": 0.6543, "step": 44 }, { "epoch": 0.04992372763833033, "grad_norm": 0.1452740579843521, "learning_rate": 4.981549815498154e-05, "loss": 0.5875, "step": 45 }, { "epoch": 0.051033143808071, "grad_norm": 0.16509367525577545, "learning_rate": 5.092250922509225e-05, "loss": 0.4751, "step": 46 }, { "epoch": 0.05214255997781168, "grad_norm": 0.19247905910015106, "learning_rate": 5.202952029520295e-05, "loss": 0.4989, "step": 47 }, { "epoch": 0.05325197614755235, "grad_norm": 0.20170418918132782, "learning_rate": 5.313653136531365e-05, "loss": 0.6385, "step": 48 }, { "epoch": 0.05436139231729303, "grad_norm": 0.21076518297195435, "learning_rate": 5.4243542435424346e-05, "loss": 0.4535, "step": 49 }, { "epoch": 0.0554708084870337, "grad_norm": 0.16010227799415588, "learning_rate": 5.535055350553505e-05, "loss": 0.5345, "step": 50 }, { "epoch": 0.05658022465677437, "grad_norm": 0.21507202088832855, "learning_rate": 5.6457564575645754e-05, "loss": 0.5626, "step": 51 }, { "epoch": 0.057689640826515044, "grad_norm": 0.2764558494091034, "learning_rate": 5.756457564575646e-05, "loss": 0.7726, "step": 52 }, { "epoch": 0.05879905699625572, "grad_norm": 0.19346186518669128, "learning_rate": 5.867158671586715e-05, "loss": 0.6975, "step": 53 }, { "epoch": 0.059908473165996394, "grad_norm": 0.15796959400177002, "learning_rate": 5.977859778597785e-05, "loss": 0.5838, "step": 54 }, { "epoch": 0.06101788933573707, "grad_norm": 0.26421332359313965, "learning_rate": 6.088560885608856e-05, "loss": 0.4547, "step": 55 }, { "epoch": 0.062127305505477744, "grad_norm": 0.15716983377933502, "learning_rate": 6.199261992619925e-05, "loss": 0.5811, "step": 56 }, { "epoch": 0.06323672167521842, "grad_norm": 0.31450599431991577, "learning_rate": 6.309963099630996e-05, "loss": 0.6177, "step": 57 }, { "epoch": 0.06434613784495909, "grad_norm": 0.1722506731748581, "learning_rate": 6.420664206642065e-05, "loss": 0.4451, "step": 58 }, { "epoch": 0.06545555401469977, "grad_norm": 0.19947190582752228, "learning_rate": 6.531365313653135e-05, "loss": 0.6172, "step": 59 }, { "epoch": 0.06656497018444044, "grad_norm": 0.24372801184654236, "learning_rate": 6.642066420664206e-05, "loss": 0.7055, "step": 60 }, { "epoch": 0.06767438635418112, "grad_norm": 0.2935488224029541, "learning_rate": 6.752767527675276e-05, "loss": 0.6359, "step": 61 }, { "epoch": 0.06878380252392179, "grad_norm": 0.30399176478385925, "learning_rate": 6.863468634686347e-05, "loss": 0.6141, "step": 62 }, { "epoch": 0.06989321869366245, "grad_norm": 0.2780139446258545, "learning_rate": 6.974169741697416e-05, "loss": 0.6173, "step": 63 }, { "epoch": 0.07100263486340314, "grad_norm": 0.22119928896427155, "learning_rate": 7.084870848708486e-05, "loss": 0.6311, "step": 64 }, { "epoch": 0.0721120510331438, "grad_norm": 0.17710696160793304, "learning_rate": 7.195571955719556e-05, "loss": 0.5011, "step": 65 }, { "epoch": 0.07322146720288449, "grad_norm": 0.24316059052944183, "learning_rate": 7.306273062730627e-05, "loss": 0.5671, "step": 66 }, { "epoch": 0.07433088337262515, "grad_norm": 0.2396726906299591, "learning_rate": 7.416974169741697e-05, "loss": 0.5996, "step": 67 }, { "epoch": 0.07544029954236584, "grad_norm": 0.25187021493911743, "learning_rate": 7.527675276752766e-05, "loss": 0.4867, "step": 68 }, { "epoch": 0.0765497157121065, "grad_norm": 0.18911105394363403, "learning_rate": 7.638376383763837e-05, "loss": 0.5302, "step": 69 }, { "epoch": 0.07765913188184718, "grad_norm": 0.2970142364501953, "learning_rate": 7.749077490774907e-05, "loss": 0.6817, "step": 70 }, { "epoch": 0.07876854805158785, "grad_norm": 0.255367249250412, "learning_rate": 7.859778597785978e-05, "loss": 0.53, "step": 71 }, { "epoch": 0.07987796422132852, "grad_norm": 0.2670186758041382, "learning_rate": 7.970479704797048e-05, "loss": 0.6765, "step": 72 }, { "epoch": 0.0809873803910692, "grad_norm": 0.29176634550094604, "learning_rate": 8.081180811808118e-05, "loss": 0.6788, "step": 73 }, { "epoch": 0.08209679656080987, "grad_norm": 0.1941952258348465, "learning_rate": 8.191881918819189e-05, "loss": 0.5154, "step": 74 }, { "epoch": 0.08320621273055055, "grad_norm": 0.2700209617614746, "learning_rate": 8.302583025830258e-05, "loss": 0.6123, "step": 75 }, { "epoch": 0.08431562890029122, "grad_norm": 0.2537286579608917, "learning_rate": 8.413284132841327e-05, "loss": 0.6377, "step": 76 }, { "epoch": 0.0854250450700319, "grad_norm": 0.3083483874797821, "learning_rate": 8.523985239852397e-05, "loss": 0.6263, "step": 77 }, { "epoch": 0.08653446123977257, "grad_norm": 0.1838807612657547, "learning_rate": 8.634686346863468e-05, "loss": 0.4246, "step": 78 }, { "epoch": 0.08764387740951324, "grad_norm": 0.19544388353824615, "learning_rate": 8.745387453874538e-05, "loss": 0.6333, "step": 79 }, { "epoch": 0.08875329357925392, "grad_norm": 0.11637648195028305, "learning_rate": 8.856088560885608e-05, "loss": 0.3302, "step": 80 }, { "epoch": 0.08986270974899459, "grad_norm": 0.22084759175777435, "learning_rate": 8.966789667896679e-05, "loss": 0.5961, "step": 81 }, { "epoch": 0.09097212591873527, "grad_norm": 0.20843982696533203, "learning_rate": 9.077490774907748e-05, "loss": 0.4923, "step": 82 }, { "epoch": 0.09208154208847594, "grad_norm": 0.18717604875564575, "learning_rate": 9.188191881918818e-05, "loss": 0.7945, "step": 83 }, { "epoch": 0.09319095825821662, "grad_norm": 0.5555791854858398, "learning_rate": 9.298892988929889e-05, "loss": 0.7418, "step": 84 }, { "epoch": 0.09430037442795729, "grad_norm": 0.35459521412849426, "learning_rate": 9.409594095940959e-05, "loss": 0.4774, "step": 85 }, { "epoch": 0.09540979059769795, "grad_norm": 0.33607375621795654, "learning_rate": 9.520295202952028e-05, "loss": 0.6068, "step": 86 }, { "epoch": 0.09651920676743864, "grad_norm": 0.2963770031929016, "learning_rate": 9.630996309963099e-05, "loss": 0.6738, "step": 87 }, { "epoch": 0.0976286229371793, "grad_norm": 0.1896924376487732, "learning_rate": 9.741697416974169e-05, "loss": 0.4507, "step": 88 }, { "epoch": 0.09873803910691999, "grad_norm": 0.17777574062347412, "learning_rate": 9.852398523985238e-05, "loss": 0.7229, "step": 89 }, { "epoch": 0.09984745527666065, "grad_norm": 0.2619733512401581, "learning_rate": 9.963099630996309e-05, "loss": 0.6666, "step": 90 }, { "epoch": 0.10095687144640134, "grad_norm": 0.24837014079093933, "learning_rate": 0.00010073800738007379, "loss": 0.6532, "step": 91 }, { "epoch": 0.102066287616142, "grad_norm": 0.16982443630695343, "learning_rate": 0.0001018450184501845, "loss": 0.6299, "step": 92 }, { "epoch": 0.10317570378588269, "grad_norm": 0.25121009349823, "learning_rate": 0.0001029520295202952, "loss": 0.5286, "step": 93 }, { "epoch": 0.10428511995562335, "grad_norm": 0.17865754663944244, "learning_rate": 0.0001040590405904059, "loss": 0.6088, "step": 94 }, { "epoch": 0.10539453612536402, "grad_norm": 0.21300899982452393, "learning_rate": 0.0001051660516605166, "loss": 0.5066, "step": 95 }, { "epoch": 0.1065039522951047, "grad_norm": 0.18651628494262695, "learning_rate": 0.0001062730627306273, "loss": 0.5439, "step": 96 }, { "epoch": 0.10761336846484537, "grad_norm": 0.18930092453956604, "learning_rate": 0.00010738007380073799, "loss": 0.4745, "step": 97 }, { "epoch": 0.10872278463458605, "grad_norm": 0.1967380791902542, "learning_rate": 0.00010848708487084869, "loss": 0.5471, "step": 98 }, { "epoch": 0.10983220080432672, "grad_norm": 0.16241402924060822, "learning_rate": 0.0001095940959409594, "loss": 0.6002, "step": 99 }, { "epoch": 0.1109416169740674, "grad_norm": 0.22282204031944275, "learning_rate": 0.0001107011070110701, "loss": 0.4687, "step": 100 }, { "epoch": 0.11205103314380807, "grad_norm": 0.1546042561531067, "learning_rate": 0.0001118081180811808, "loss": 0.599, "step": 101 }, { "epoch": 0.11316044931354874, "grad_norm": 0.19606682658195496, "learning_rate": 0.00011291512915129151, "loss": 0.5543, "step": 102 }, { "epoch": 0.11426986548328942, "grad_norm": 0.16278928518295288, "learning_rate": 0.00011402214022140221, "loss": 0.4827, "step": 103 }, { "epoch": 0.11537928165303009, "grad_norm": 0.18518386781215668, "learning_rate": 0.00011512915129151292, "loss": 0.6719, "step": 104 }, { "epoch": 0.11648869782277077, "grad_norm": 0.3365796208381653, "learning_rate": 0.00011623616236162362, "loss": 0.3853, "step": 105 }, { "epoch": 0.11759811399251144, "grad_norm": 0.15232332050800323, "learning_rate": 0.0001173431734317343, "loss": 0.6667, "step": 106 }, { "epoch": 0.11870753016225212, "grad_norm": 0.15742583572864532, "learning_rate": 0.000118450184501845, "loss": 0.6997, "step": 107 }, { "epoch": 0.11981694633199279, "grad_norm": 0.18328164517879486, "learning_rate": 0.0001195571955719557, "loss": 0.4386, "step": 108 }, { "epoch": 0.12092636250173346, "grad_norm": 0.19277305901050568, "learning_rate": 0.00012066420664206641, "loss": 0.4237, "step": 109 }, { "epoch": 0.12203577867147414, "grad_norm": 0.20428280532360077, "learning_rate": 0.00012177121771217711, "loss": 0.8255, "step": 110 }, { "epoch": 0.1231451948412148, "grad_norm": 0.21956562995910645, "learning_rate": 0.0001228782287822878, "loss": 0.5919, "step": 111 }, { "epoch": 0.12425461101095549, "grad_norm": 0.16980376839637756, "learning_rate": 0.0001239852398523985, "loss": 0.3784, "step": 112 }, { "epoch": 0.12536402718069617, "grad_norm": 0.14789903163909912, "learning_rate": 0.0001250922509225092, "loss": 0.4501, "step": 113 }, { "epoch": 0.12647344335043684, "grad_norm": 0.22972984611988068, "learning_rate": 0.00012619926199261992, "loss": 0.5988, "step": 114 }, { "epoch": 0.1275828595201775, "grad_norm": 0.345342218875885, "learning_rate": 0.00012730627306273062, "loss": 0.6277, "step": 115 }, { "epoch": 0.12869227568991817, "grad_norm": 0.22726920247077942, "learning_rate": 0.0001284132841328413, "loss": 0.7774, "step": 116 }, { "epoch": 0.12980169185965884, "grad_norm": 0.18863454461097717, "learning_rate": 0.000129520295202952, "loss": 0.8846, "step": 117 }, { "epoch": 0.13091110802939954, "grad_norm": 0.14772231876850128, "learning_rate": 0.0001306273062730627, "loss": 0.546, "step": 118 }, { "epoch": 0.1320205241991402, "grad_norm": 0.16707932949066162, "learning_rate": 0.0001317343173431734, "loss": 0.4935, "step": 119 }, { "epoch": 0.13312994036888087, "grad_norm": 0.17367342114448547, "learning_rate": 0.00013284132841328411, "loss": 0.5853, "step": 120 }, { "epoch": 0.13423935653862154, "grad_norm": 0.20932604372501373, "learning_rate": 0.00013394833948339482, "loss": 0.5861, "step": 121 }, { "epoch": 0.13534877270836224, "grad_norm": 0.19261795282363892, "learning_rate": 0.00013505535055350552, "loss": 0.5912, "step": 122 }, { "epoch": 0.1364581888781029, "grad_norm": 0.16762207448482513, "learning_rate": 0.00013616236162361623, "loss": 0.6856, "step": 123 }, { "epoch": 0.13756760504784357, "grad_norm": 0.22033065557479858, "learning_rate": 0.00013726937269372693, "loss": 0.6217, "step": 124 }, { "epoch": 0.13867702121758424, "grad_norm": 0.1888565719127655, "learning_rate": 0.00013837638376383763, "loss": 0.4837, "step": 125 }, { "epoch": 0.1397864373873249, "grad_norm": 0.13172639906406403, "learning_rate": 0.0001394833948339483, "loss": 0.4298, "step": 126 }, { "epoch": 0.1408958535570656, "grad_norm": 0.20497459173202515, "learning_rate": 0.00014059040590405902, "loss": 0.7032, "step": 127 }, { "epoch": 0.14200526972680627, "grad_norm": 0.6339800357818604, "learning_rate": 0.00014169741697416972, "loss": 0.5669, "step": 128 }, { "epoch": 0.14311468589654694, "grad_norm": 0.25784072279930115, "learning_rate": 0.00014280442804428042, "loss": 0.6115, "step": 129 }, { "epoch": 0.1442241020662876, "grad_norm": 0.1341128647327423, "learning_rate": 0.00014391143911439113, "loss": 0.5487, "step": 130 }, { "epoch": 0.1453335182360283, "grad_norm": 0.15084423124790192, "learning_rate": 0.00014501845018450183, "loss": 0.5858, "step": 131 }, { "epoch": 0.14644293440576897, "grad_norm": 0.3109219968318939, "learning_rate": 0.00014612546125461254, "loss": 0.5663, "step": 132 }, { "epoch": 0.14755235057550964, "grad_norm": 0.20024478435516357, "learning_rate": 0.00014723247232472324, "loss": 0.49, "step": 133 }, { "epoch": 0.1486617667452503, "grad_norm": 0.2169143110513687, "learning_rate": 0.00014833948339483394, "loss": 0.5955, "step": 134 }, { "epoch": 0.14977118291499097, "grad_norm": 0.12287949025630951, "learning_rate": 0.00014944649446494465, "loss": 0.5406, "step": 135 }, { "epoch": 0.15088059908473167, "grad_norm": 0.1676974594593048, "learning_rate": 0.00015055350553505533, "loss": 0.5796, "step": 136 }, { "epoch": 0.15199001525447234, "grad_norm": 0.18620753288269043, "learning_rate": 0.00015166051660516606, "loss": 0.4466, "step": 137 }, { "epoch": 0.153099431424213, "grad_norm": 0.2430751472711563, "learning_rate": 0.00015276752767527673, "loss": 0.5272, "step": 138 }, { "epoch": 0.15420884759395367, "grad_norm": 0.21565105020999908, "learning_rate": 0.00015387453874538746, "loss": 0.7445, "step": 139 }, { "epoch": 0.15531826376369437, "grad_norm": 0.14998437464237213, "learning_rate": 0.00015498154981549814, "loss": 0.7069, "step": 140 }, { "epoch": 0.15642767993343504, "grad_norm": 0.22991682589054108, "learning_rate": 0.00015608856088560882, "loss": 0.5109, "step": 141 }, { "epoch": 0.1575370961031757, "grad_norm": 0.21704424917697906, "learning_rate": 0.00015719557195571955, "loss": 0.5226, "step": 142 }, { "epoch": 0.15864651227291637, "grad_norm": 0.141441211104393, "learning_rate": 0.00015830258302583023, "loss": 0.6123, "step": 143 }, { "epoch": 0.15975592844265704, "grad_norm": 0.14867329597473145, "learning_rate": 0.00015940959409594096, "loss": 0.606, "step": 144 }, { "epoch": 0.16086534461239774, "grad_norm": 0.37929290533065796, "learning_rate": 0.00016051660516605164, "loss": 0.5971, "step": 145 }, { "epoch": 0.1619747607821384, "grad_norm": 0.19486796855926514, "learning_rate": 0.00016162361623616237, "loss": 0.554, "step": 146 }, { "epoch": 0.16308417695187907, "grad_norm": 0.16117985546588898, "learning_rate": 0.00016273062730627304, "loss": 0.5379, "step": 147 }, { "epoch": 0.16419359312161974, "grad_norm": 0.21857589483261108, "learning_rate": 0.00016383763837638377, "loss": 0.6984, "step": 148 }, { "epoch": 0.1653030092913604, "grad_norm": 0.18841134011745453, "learning_rate": 0.00016494464944649445, "loss": 0.7714, "step": 149 }, { "epoch": 0.1664124254611011, "grad_norm": 0.16624371707439423, "learning_rate": 0.00016605166051660516, "loss": 0.5173, "step": 150 }, { "epoch": 0.16752184163084177, "grad_norm": 0.17951254546642303, "learning_rate": 0.00016715867158671586, "loss": 0.8636, "step": 151 }, { "epoch": 0.16863125780058244, "grad_norm": 0.1839190572500229, "learning_rate": 0.00016826568265682654, "loss": 0.4827, "step": 152 }, { "epoch": 0.1697406739703231, "grad_norm": 0.12314256280660629, "learning_rate": 0.00016937269372693727, "loss": 0.512, "step": 153 }, { "epoch": 0.1708500901400638, "grad_norm": 0.12132935225963593, "learning_rate": 0.00017047970479704795, "loss": 0.5284, "step": 154 }, { "epoch": 0.17195950630980447, "grad_norm": 0.29458367824554443, "learning_rate": 0.00017158671586715868, "loss": 0.703, "step": 155 }, { "epoch": 0.17306892247954514, "grad_norm": 0.15419234335422516, "learning_rate": 0.00017269372693726935, "loss": 0.6275, "step": 156 }, { "epoch": 0.1741783386492858, "grad_norm": 0.12458593398332596, "learning_rate": 0.00017380073800738006, "loss": 0.4877, "step": 157 }, { "epoch": 0.17528775481902648, "grad_norm": 0.1632385402917862, "learning_rate": 0.00017490774907749076, "loss": 0.5432, "step": 158 }, { "epoch": 0.17639717098876717, "grad_norm": 0.12995319068431854, "learning_rate": 0.00017601476014760147, "loss": 0.5964, "step": 159 }, { "epoch": 0.17750658715850784, "grad_norm": 0.09651859104633331, "learning_rate": 0.00017712177121771217, "loss": 0.5907, "step": 160 }, { "epoch": 0.1786160033282485, "grad_norm": 0.20671844482421875, "learning_rate": 0.00017822878228782285, "loss": 0.574, "step": 161 }, { "epoch": 0.17972541949798918, "grad_norm": 0.1272733509540558, "learning_rate": 0.00017933579335793358, "loss": 0.5413, "step": 162 }, { "epoch": 0.18083483566772987, "grad_norm": 0.12072700262069702, "learning_rate": 0.00018044280442804426, "loss": 0.4555, "step": 163 }, { "epoch": 0.18194425183747054, "grad_norm": 0.18685540556907654, "learning_rate": 0.00018154981549815496, "loss": 0.7267, "step": 164 }, { "epoch": 0.1830536680072112, "grad_norm": 0.18687789142131805, "learning_rate": 0.00018265682656826566, "loss": 0.5785, "step": 165 }, { "epoch": 0.18416308417695187, "grad_norm": 0.2580098807811737, "learning_rate": 0.00018376383763837637, "loss": 0.5662, "step": 166 }, { "epoch": 0.18527250034669254, "grad_norm": 0.1476382166147232, "learning_rate": 0.00018487084870848707, "loss": 0.5427, "step": 167 }, { "epoch": 0.18638191651643324, "grad_norm": 0.14074724912643433, "learning_rate": 0.00018597785977859778, "loss": 0.4759, "step": 168 }, { "epoch": 0.1874913326861739, "grad_norm": 0.2741679847240448, "learning_rate": 0.00018708487084870848, "loss": 0.6973, "step": 169 }, { "epoch": 0.18860074885591457, "grad_norm": 0.2278490662574768, "learning_rate": 0.00018819188191881918, "loss": 0.586, "step": 170 }, { "epoch": 0.18971016502565524, "grad_norm": 0.1516968458890915, "learning_rate": 0.00018929889298892986, "loss": 0.378, "step": 171 }, { "epoch": 0.1908195811953959, "grad_norm": 0.17562097311019897, "learning_rate": 0.00019040590405904056, "loss": 0.6113, "step": 172 }, { "epoch": 0.1919289973651366, "grad_norm": 0.14954030513763428, "learning_rate": 0.00019151291512915127, "loss": 0.4926, "step": 173 }, { "epoch": 0.19303841353487727, "grad_norm": 0.1906110644340515, "learning_rate": 0.00019261992619926197, "loss": 0.4884, "step": 174 }, { "epoch": 0.19414782970461794, "grad_norm": 0.18351547420024872, "learning_rate": 0.00019372693726937268, "loss": 0.5295, "step": 175 }, { "epoch": 0.1952572458743586, "grad_norm": 0.2049722820520401, "learning_rate": 0.00019483394833948338, "loss": 0.4562, "step": 176 }, { "epoch": 0.1963666620440993, "grad_norm": 0.21823738515377045, "learning_rate": 0.00019594095940959409, "loss": 0.5329, "step": 177 }, { "epoch": 0.19747607821383997, "grad_norm": 0.1077185720205307, "learning_rate": 0.00019704797047970476, "loss": 0.4787, "step": 178 }, { "epoch": 0.19858549438358064, "grad_norm": 0.19151321053504944, "learning_rate": 0.0001981549815498155, "loss": 0.7764, "step": 179 }, { "epoch": 0.1996949105533213, "grad_norm": 0.16796617209911346, "learning_rate": 0.00019926199261992617, "loss": 0.4962, "step": 180 }, { "epoch": 0.20080432672306198, "grad_norm": 0.31866180896759033, "learning_rate": 0.00020036900369003687, "loss": 0.8462, "step": 181 }, { "epoch": 0.20191374289280267, "grad_norm": 0.2093004584312439, "learning_rate": 0.00020147601476014758, "loss": 0.5916, "step": 182 }, { "epoch": 0.20302315906254334, "grad_norm": 0.2198261171579361, "learning_rate": 0.00020258302583025828, "loss": 0.434, "step": 183 }, { "epoch": 0.204132575232284, "grad_norm": 0.19753794372081757, "learning_rate": 0.000203690036900369, "loss": 0.6142, "step": 184 }, { "epoch": 0.20524199140202468, "grad_norm": 0.1537386178970337, "learning_rate": 0.00020479704797047966, "loss": 0.4053, "step": 185 }, { "epoch": 0.20635140757176537, "grad_norm": 0.194119393825531, "learning_rate": 0.0002059040590405904, "loss": 0.6506, "step": 186 }, { "epoch": 0.20746082374150604, "grad_norm": 0.16899624466896057, "learning_rate": 0.00020701107011070107, "loss": 0.572, "step": 187 }, { "epoch": 0.2085702399112467, "grad_norm": 0.14211402833461761, "learning_rate": 0.0002081180811808118, "loss": 0.5999, "step": 188 }, { "epoch": 0.20967965608098738, "grad_norm": 0.17124126851558685, "learning_rate": 0.00020922509225092248, "loss": 0.4797, "step": 189 }, { "epoch": 0.21078907225072804, "grad_norm": 0.1637413501739502, "learning_rate": 0.0002103321033210332, "loss": 0.3899, "step": 190 }, { "epoch": 0.21189848842046874, "grad_norm": 0.2928311228752136, "learning_rate": 0.0002114391143911439, "loss": 0.4297, "step": 191 }, { "epoch": 0.2130079045902094, "grad_norm": 0.19800159335136414, "learning_rate": 0.0002125461254612546, "loss": 0.6641, "step": 192 }, { "epoch": 0.21411732075995007, "grad_norm": 0.1297946572303772, "learning_rate": 0.0002136531365313653, "loss": 0.4557, "step": 193 }, { "epoch": 0.21522673692969074, "grad_norm": 0.24120758473873138, "learning_rate": 0.00021476014760147597, "loss": 0.7058, "step": 194 }, { "epoch": 0.2163361530994314, "grad_norm": 0.15839487314224243, "learning_rate": 0.0002158671586715867, "loss": 0.5106, "step": 195 }, { "epoch": 0.2174455692691721, "grad_norm": 0.18157745897769928, "learning_rate": 0.00021697416974169738, "loss": 0.8263, "step": 196 }, { "epoch": 0.21855498543891277, "grad_norm": 0.2212788164615631, "learning_rate": 0.0002180811808118081, "loss": 0.4721, "step": 197 }, { "epoch": 0.21966440160865344, "grad_norm": 0.2079571783542633, "learning_rate": 0.0002191881918819188, "loss": 0.5491, "step": 198 }, { "epoch": 0.2207738177783941, "grad_norm": 0.29977646470069885, "learning_rate": 0.00022029520295202952, "loss": 0.5165, "step": 199 }, { "epoch": 0.2218832339481348, "grad_norm": 0.15032988786697388, "learning_rate": 0.0002214022140221402, "loss": 0.5833, "step": 200 }, { "epoch": 0.22299265011787547, "grad_norm": 0.1367197334766388, "learning_rate": 0.00022250922509225088, "loss": 0.4895, "step": 201 }, { "epoch": 0.22410206628761614, "grad_norm": 0.20162621140480042, "learning_rate": 0.0002236162361623616, "loss": 0.6069, "step": 202 }, { "epoch": 0.2252114824573568, "grad_norm": 0.1476079821586609, "learning_rate": 0.00022472324723247228, "loss": 0.3195, "step": 203 }, { "epoch": 0.22632089862709748, "grad_norm": 0.16391322016716003, "learning_rate": 0.00022583025830258302, "loss": 0.5242, "step": 204 }, { "epoch": 0.22743031479683817, "grad_norm": 0.32637152075767517, "learning_rate": 0.0002269372693726937, "loss": 0.608, "step": 205 }, { "epoch": 0.22853973096657884, "grad_norm": 0.18414971232414246, "learning_rate": 0.00022804428044280442, "loss": 0.5414, "step": 206 }, { "epoch": 0.2296491471363195, "grad_norm": 0.13513615727424622, "learning_rate": 0.0002291512915129151, "loss": 0.452, "step": 207 }, { "epoch": 0.23075856330606018, "grad_norm": 0.1715661883354187, "learning_rate": 0.00023025830258302583, "loss": 0.5552, "step": 208 }, { "epoch": 0.23186797947580087, "grad_norm": 0.2078085094690323, "learning_rate": 0.0002313653136531365, "loss": 0.6519, "step": 209 }, { "epoch": 0.23297739564554154, "grad_norm": 0.45334669947624207, "learning_rate": 0.00023247232472324724, "loss": 0.9767, "step": 210 }, { "epoch": 0.2340868118152822, "grad_norm": 0.17819558084011078, "learning_rate": 0.00023357933579335792, "loss": 0.7568, "step": 211 }, { "epoch": 0.23519622798502288, "grad_norm": 0.4460853338241577, "learning_rate": 0.0002346863468634686, "loss": 0.4265, "step": 212 }, { "epoch": 0.23630564415476354, "grad_norm": 0.2014990597963333, "learning_rate": 0.00023579335793357933, "loss": 0.6479, "step": 213 }, { "epoch": 0.23741506032450424, "grad_norm": 0.17922167479991913, "learning_rate": 0.00023690036900369, "loss": 0.6739, "step": 214 }, { "epoch": 0.2385244764942449, "grad_norm": 0.14204619824886322, "learning_rate": 0.00023800738007380073, "loss": 0.4641, "step": 215 }, { "epoch": 0.23963389266398558, "grad_norm": 0.17847496271133423, "learning_rate": 0.0002391143911439114, "loss": 0.6229, "step": 216 }, { "epoch": 0.24074330883372624, "grad_norm": 0.20707765221595764, "learning_rate": 0.00024022140221402214, "loss": 0.6337, "step": 217 }, { "epoch": 0.2418527250034669, "grad_norm": 0.1623317301273346, "learning_rate": 0.00024132841328413282, "loss": 0.5256, "step": 218 }, { "epoch": 0.2429621411732076, "grad_norm": 0.18768182396888733, "learning_rate": 0.00024243542435424352, "loss": 0.5793, "step": 219 }, { "epoch": 0.24407155734294828, "grad_norm": 0.16495372354984283, "learning_rate": 0.00024354243542435423, "loss": 0.4539, "step": 220 }, { "epoch": 0.24518097351268894, "grad_norm": 0.14188359677791595, "learning_rate": 0.00024464944649446493, "loss": 0.5089, "step": 221 }, { "epoch": 0.2462903896824296, "grad_norm": 0.18091897666454315, "learning_rate": 0.0002457564575645756, "loss": 0.5689, "step": 222 }, { "epoch": 0.2473998058521703, "grad_norm": 0.214166522026062, "learning_rate": 0.00024686346863468634, "loss": 0.6479, "step": 223 }, { "epoch": 0.24850922202191097, "grad_norm": 0.19823165237903595, "learning_rate": 0.000247970479704797, "loss": 0.4772, "step": 224 }, { "epoch": 0.24961863819165164, "grad_norm": 0.20246858894824982, "learning_rate": 0.0002490774907749077, "loss": 0.6838, "step": 225 }, { "epoch": 0.25072805436139234, "grad_norm": 0.17839235067367554, "learning_rate": 0.0002501845018450184, "loss": 0.6234, "step": 226 }, { "epoch": 0.251837470531133, "grad_norm": 0.15513737499713898, "learning_rate": 0.0002512915129151291, "loss": 0.6836, "step": 227 }, { "epoch": 0.2529468867008737, "grad_norm": 0.3762029707431793, "learning_rate": 0.00025239852398523983, "loss": 0.6588, "step": 228 }, { "epoch": 0.2540563028706143, "grad_norm": 0.24218355119228363, "learning_rate": 0.0002535055350553505, "loss": 0.663, "step": 229 }, { "epoch": 0.255165719040355, "grad_norm": 0.23746894299983978, "learning_rate": 0.00025461254612546124, "loss": 0.8055, "step": 230 }, { "epoch": 0.2562751352100957, "grad_norm": 0.19097627699375153, "learning_rate": 0.0002557195571955719, "loss": 0.5028, "step": 231 }, { "epoch": 0.25738455137983635, "grad_norm": 0.16248895227909088, "learning_rate": 0.0002568265682656826, "loss": 0.4644, "step": 232 }, { "epoch": 0.25849396754957704, "grad_norm": 0.15660254657268524, "learning_rate": 0.0002579335793357933, "loss": 0.4822, "step": 233 }, { "epoch": 0.2596033837193177, "grad_norm": 0.18094629049301147, "learning_rate": 0.000259040590405904, "loss": 0.6761, "step": 234 }, { "epoch": 0.2607127998890584, "grad_norm": 0.20552562177181244, "learning_rate": 0.00026014760147601473, "loss": 0.654, "step": 235 }, { "epoch": 0.2618222160587991, "grad_norm": 0.1620112657546997, "learning_rate": 0.0002612546125461254, "loss": 0.3786, "step": 236 }, { "epoch": 0.2629316322285397, "grad_norm": 0.25460562109947205, "learning_rate": 0.00026236162361623614, "loss": 0.6258, "step": 237 }, { "epoch": 0.2640410483982804, "grad_norm": 0.17765319347381592, "learning_rate": 0.0002634686346863468, "loss": 0.3955, "step": 238 }, { "epoch": 0.2651504645680211, "grad_norm": 0.2620762884616852, "learning_rate": 0.00026457564575645755, "loss": 0.5884, "step": 239 }, { "epoch": 0.26625988073776174, "grad_norm": 0.14502941071987152, "learning_rate": 0.00026568265682656823, "loss": 0.5391, "step": 240 }, { "epoch": 0.26736929690750244, "grad_norm": 0.15013466775417328, "learning_rate": 0.0002667896678966789, "loss": 0.5676, "step": 241 }, { "epoch": 0.2684787130772431, "grad_norm": 0.2311590611934662, "learning_rate": 0.00026789667896678964, "loss": 0.6409, "step": 242 }, { "epoch": 0.2695881292469838, "grad_norm": 0.16972240805625916, "learning_rate": 0.0002690036900369003, "loss": 0.488, "step": 243 }, { "epoch": 0.27069754541672447, "grad_norm": 0.13695970177650452, "learning_rate": 0.00027011070110701104, "loss": 0.5997, "step": 244 }, { "epoch": 0.2718069615864651, "grad_norm": 0.16296394169330597, "learning_rate": 0.0002712177121771217, "loss": 0.5645, "step": 245 }, { "epoch": 0.2729163777562058, "grad_norm": 0.15607796609401703, "learning_rate": 0.00027232472324723245, "loss": 0.3976, "step": 246 }, { "epoch": 0.27402579392594645, "grad_norm": 0.17693723738193512, "learning_rate": 0.00027343173431734313, "loss": 0.6873, "step": 247 }, { "epoch": 0.27513521009568714, "grad_norm": 0.16682426631450653, "learning_rate": 0.00027453874538745386, "loss": 0.5259, "step": 248 }, { "epoch": 0.27624462626542784, "grad_norm": 0.16855113208293915, "learning_rate": 0.00027564575645756454, "loss": 0.5889, "step": 249 }, { "epoch": 0.2773540424351685, "grad_norm": 0.160100057721138, "learning_rate": 0.00027675276752767527, "loss": 0.5832, "step": 250 }, { "epoch": 0.2784634586049092, "grad_norm": 0.24193750321865082, "learning_rate": 0.00027785977859778595, "loss": 0.4325, "step": 251 }, { "epoch": 0.2795728747746498, "grad_norm": 0.12794721126556396, "learning_rate": 0.0002789667896678966, "loss": 0.7371, "step": 252 }, { "epoch": 0.2806822909443905, "grad_norm": 0.22222159802913666, "learning_rate": 0.00028007380073800735, "loss": 0.6177, "step": 253 }, { "epoch": 0.2817917071141312, "grad_norm": 0.24400316178798676, "learning_rate": 0.00028118081180811803, "loss": 0.5169, "step": 254 }, { "epoch": 0.28290112328387185, "grad_norm": 0.23530688881874084, "learning_rate": 0.00028228782287822876, "loss": 0.4384, "step": 255 }, { "epoch": 0.28401053945361254, "grad_norm": 0.20001864433288574, "learning_rate": 0.00028339483394833944, "loss": 0.4855, "step": 256 }, { "epoch": 0.2851199556233532, "grad_norm": 0.1441182792186737, "learning_rate": 0.00028450184501845017, "loss": 0.5467, "step": 257 }, { "epoch": 0.2862293717930939, "grad_norm": 0.17499203979969025, "learning_rate": 0.00028560885608856085, "loss": 0.6093, "step": 258 }, { "epoch": 0.2873387879628346, "grad_norm": 0.13634347915649414, "learning_rate": 0.0002867158671586716, "loss": 0.5183, "step": 259 }, { "epoch": 0.2884482041325752, "grad_norm": 0.1825854778289795, "learning_rate": 0.00028782287822878226, "loss": 0.524, "step": 260 }, { "epoch": 0.2895576203023159, "grad_norm": 0.1797315776348114, "learning_rate": 0.00028892988929889293, "loss": 0.9663, "step": 261 }, { "epoch": 0.2906670364720566, "grad_norm": 0.22019031643867493, "learning_rate": 0.00029003690036900366, "loss": 0.8477, "step": 262 }, { "epoch": 0.29177645264179725, "grad_norm": 0.18698155879974365, "learning_rate": 0.00029114391143911434, "loss": 0.5928, "step": 263 }, { "epoch": 0.29288586881153794, "grad_norm": 0.18208318948745728, "learning_rate": 0.00029225092250922507, "loss": 0.5586, "step": 264 }, { "epoch": 0.2939952849812786, "grad_norm": 0.17802134156227112, "learning_rate": 0.00029335793357933575, "loss": 0.499, "step": 265 }, { "epoch": 0.2951047011510193, "grad_norm": 0.25552839040756226, "learning_rate": 0.0002944649446494465, "loss": 0.6052, "step": 266 }, { "epoch": 0.29621411732076, "grad_norm": 0.15605288743972778, "learning_rate": 0.00029557195571955716, "loss": 0.4767, "step": 267 }, { "epoch": 0.2973235334905006, "grad_norm": 0.18597030639648438, "learning_rate": 0.0002966789667896679, "loss": 0.6442, "step": 268 }, { "epoch": 0.2984329496602413, "grad_norm": 0.2584044933319092, "learning_rate": 0.00029778597785977857, "loss": 0.6259, "step": 269 }, { "epoch": 0.29954236582998195, "grad_norm": 0.13973256945610046, "learning_rate": 0.0002988929889298893, "loss": 0.6127, "step": 270 }, { "epoch": 0.30065178199972264, "grad_norm": 0.21028122305870056, "learning_rate": 0.0003, "loss": 0.3969, "step": 271 }, { "epoch": 0.30176119816946334, "grad_norm": 0.232606902718544, "learning_rate": 0.00029987664473684207, "loss": 0.6801, "step": 272 }, { "epoch": 0.302870614339204, "grad_norm": 0.24834930896759033, "learning_rate": 0.00029975328947368416, "loss": 0.5395, "step": 273 }, { "epoch": 0.3039800305089447, "grad_norm": 0.2343815118074417, "learning_rate": 0.00029962993421052625, "loss": 0.6029, "step": 274 }, { "epoch": 0.3050894466786853, "grad_norm": 0.23860520124435425, "learning_rate": 0.0002995065789473684, "loss": 0.4872, "step": 275 }, { "epoch": 0.306198862848426, "grad_norm": 0.1793919801712036, "learning_rate": 0.0002993832236842105, "loss": 0.6896, "step": 276 }, { "epoch": 0.3073082790181667, "grad_norm": 0.2507120668888092, "learning_rate": 0.00029925986842105264, "loss": 0.3792, "step": 277 }, { "epoch": 0.30841769518790735, "grad_norm": 0.27677059173583984, "learning_rate": 0.00029913651315789473, "loss": 0.7856, "step": 278 }, { "epoch": 0.30952711135764804, "grad_norm": 0.18887469172477722, "learning_rate": 0.00029901315789473683, "loss": 0.5406, "step": 279 }, { "epoch": 0.31063652752738874, "grad_norm": 0.23371614515781403, "learning_rate": 0.0002988898026315789, "loss": 0.5748, "step": 280 }, { "epoch": 0.3117459436971294, "grad_norm": 0.22486557066440582, "learning_rate": 0.000298766447368421, "loss": 0.7443, "step": 281 }, { "epoch": 0.3128553598668701, "grad_norm": 0.17992804944515228, "learning_rate": 0.0002986430921052631, "loss": 0.5819, "step": 282 }, { "epoch": 0.3139647760366107, "grad_norm": 0.20137208700180054, "learning_rate": 0.00029851973684210525, "loss": 0.722, "step": 283 }, { "epoch": 0.3150741922063514, "grad_norm": 0.25975537300109863, "learning_rate": 0.00029839638157894735, "loss": 0.5349, "step": 284 }, { "epoch": 0.3161836083760921, "grad_norm": 0.2687530219554901, "learning_rate": 0.00029827302631578944, "loss": 0.4361, "step": 285 }, { "epoch": 0.31729302454583275, "grad_norm": 0.18794257938861847, "learning_rate": 0.0002981496710526316, "loss": 0.5822, "step": 286 }, { "epoch": 0.31840244071557344, "grad_norm": 0.12230537086725235, "learning_rate": 0.0002980263157894737, "loss": 0.3639, "step": 287 }, { "epoch": 0.3195118568853141, "grad_norm": 0.20607517659664154, "learning_rate": 0.0002979029605263158, "loss": 0.6476, "step": 288 }, { "epoch": 0.3206212730550548, "grad_norm": 0.11566425859928131, "learning_rate": 0.00029777960526315787, "loss": 0.5026, "step": 289 }, { "epoch": 0.3217306892247955, "grad_norm": 0.22872845828533173, "learning_rate": 0.00029765624999999996, "loss": 0.5727, "step": 290 }, { "epoch": 0.3228401053945361, "grad_norm": 0.17686223983764648, "learning_rate": 0.00029753289473684205, "loss": 0.6867, "step": 291 }, { "epoch": 0.3239495215642768, "grad_norm": 0.2232068032026291, "learning_rate": 0.0002974095394736842, "loss": 0.4983, "step": 292 }, { "epoch": 0.32505893773401745, "grad_norm": 0.26865360140800476, "learning_rate": 0.0002972861842105263, "loss": 0.5151, "step": 293 }, { "epoch": 0.32616835390375815, "grad_norm": 0.14152151346206665, "learning_rate": 0.0002971628289473684, "loss": 0.7316, "step": 294 }, { "epoch": 0.32727777007349884, "grad_norm": 0.20797161757946014, "learning_rate": 0.00029703947368421054, "loss": 0.4479, "step": 295 }, { "epoch": 0.3283871862432395, "grad_norm": 0.1608234941959381, "learning_rate": 0.00029691611842105263, "loss": 0.5482, "step": 296 }, { "epoch": 0.3294966024129802, "grad_norm": 0.2219133973121643, "learning_rate": 0.0002967927631578947, "loss": 0.4474, "step": 297 }, { "epoch": 0.3306060185827208, "grad_norm": 0.3990642726421356, "learning_rate": 0.0002966694078947368, "loss": 0.6421, "step": 298 }, { "epoch": 0.3317154347524615, "grad_norm": 0.20786860585212708, "learning_rate": 0.0002965460526315789, "loss": 0.5569, "step": 299 }, { "epoch": 0.3328248509222022, "grad_norm": 0.1967337280511856, "learning_rate": 0.000296422697368421, "loss": 0.6013, "step": 300 }, { "epoch": 0.33393426709194285, "grad_norm": 0.19602453708648682, "learning_rate": 0.00029629934210526315, "loss": 0.5516, "step": 301 }, { "epoch": 0.33504368326168354, "grad_norm": 0.23872393369674683, "learning_rate": 0.00029617598684210524, "loss": 0.5124, "step": 302 }, { "epoch": 0.33615309943142424, "grad_norm": 0.20506146550178528, "learning_rate": 0.00029605263157894733, "loss": 0.6206, "step": 303 }, { "epoch": 0.3372625156011649, "grad_norm": 0.20456762611865997, "learning_rate": 0.00029592927631578943, "loss": 0.4852, "step": 304 }, { "epoch": 0.3383719317709056, "grad_norm": 0.23360048234462738, "learning_rate": 0.0002958059210526316, "loss": 0.4821, "step": 305 }, { "epoch": 0.3394813479406462, "grad_norm": 0.16443900763988495, "learning_rate": 0.00029568256578947367, "loss": 0.4089, "step": 306 }, { "epoch": 0.3405907641103869, "grad_norm": 0.15696674585342407, "learning_rate": 0.00029555921052631576, "loss": 0.5479, "step": 307 }, { "epoch": 0.3417001802801276, "grad_norm": 0.20780624449253082, "learning_rate": 0.00029543585526315785, "loss": 0.4644, "step": 308 }, { "epoch": 0.34280959644986825, "grad_norm": 0.185526043176651, "learning_rate": 0.00029531249999999995, "loss": 0.7259, "step": 309 }, { "epoch": 0.34391901261960894, "grad_norm": 0.1540479063987732, "learning_rate": 0.0002951891447368421, "loss": 0.6167, "step": 310 }, { "epoch": 0.3450284287893496, "grad_norm": 0.26093733310699463, "learning_rate": 0.0002950657894736842, "loss": 0.6427, "step": 311 }, { "epoch": 0.3461378449590903, "grad_norm": 0.21596834063529968, "learning_rate": 0.0002949424342105263, "loss": 0.5916, "step": 312 }, { "epoch": 0.347247261128831, "grad_norm": 0.20977520942687988, "learning_rate": 0.0002948190789473684, "loss": 0.6572, "step": 313 }, { "epoch": 0.3483566772985716, "grad_norm": 0.1886155605316162, "learning_rate": 0.0002946957236842105, "loss": 0.5399, "step": 314 }, { "epoch": 0.3494660934683123, "grad_norm": 0.18215329945087433, "learning_rate": 0.0002945723684210526, "loss": 0.5724, "step": 315 }, { "epoch": 0.35057550963805295, "grad_norm": 0.2055482119321823, "learning_rate": 0.0002944490131578947, "loss": 0.5834, "step": 316 }, { "epoch": 0.35168492580779365, "grad_norm": 0.15489786863327026, "learning_rate": 0.0002943256578947368, "loss": 0.5003, "step": 317 }, { "epoch": 0.35279434197753434, "grad_norm": 0.17881543934345245, "learning_rate": 0.00029420230263157895, "loss": 0.5105, "step": 318 }, { "epoch": 0.353903758147275, "grad_norm": 0.1768861562013626, "learning_rate": 0.00029407894736842104, "loss": 0.5363, "step": 319 }, { "epoch": 0.3550131743170157, "grad_norm": 0.2283925861120224, "learning_rate": 0.00029395559210526314, "loss": 0.5452, "step": 320 }, { "epoch": 0.3561225904867563, "grad_norm": 0.19439998269081116, "learning_rate": 0.00029383223684210523, "loss": 0.5603, "step": 321 }, { "epoch": 0.357232006656497, "grad_norm": 0.2097710222005844, "learning_rate": 0.0002937088815789473, "loss": 0.625, "step": 322 }, { "epoch": 0.3583414228262377, "grad_norm": 0.186342254281044, "learning_rate": 0.0002935855263157894, "loss": 0.3553, "step": 323 }, { "epoch": 0.35945083899597835, "grad_norm": 0.275612473487854, "learning_rate": 0.00029346217105263156, "loss": 0.6274, "step": 324 }, { "epoch": 0.36056025516571905, "grad_norm": 0.19332240521907806, "learning_rate": 0.00029333881578947366, "loss": 0.5827, "step": 325 }, { "epoch": 0.36166967133545974, "grad_norm": 0.18259958922863007, "learning_rate": 0.00029321546052631575, "loss": 0.4502, "step": 326 }, { "epoch": 0.3627790875052004, "grad_norm": 0.25983044505119324, "learning_rate": 0.0002930921052631579, "loss": 0.8896, "step": 327 }, { "epoch": 0.3638885036749411, "grad_norm": 0.1671958863735199, "learning_rate": 0.00029296875, "loss": 0.5232, "step": 328 }, { "epoch": 0.3649979198446817, "grad_norm": 0.18751101195812225, "learning_rate": 0.0002928453947368421, "loss": 0.4556, "step": 329 }, { "epoch": 0.3661073360144224, "grad_norm": 0.19776886701583862, "learning_rate": 0.0002927220394736842, "loss": 0.8467, "step": 330 }, { "epoch": 0.3672167521841631, "grad_norm": 0.2738226354122162, "learning_rate": 0.00029259868421052627, "loss": 0.7323, "step": 331 }, { "epoch": 0.36832616835390375, "grad_norm": 0.16836367547512054, "learning_rate": 0.00029247532894736836, "loss": 0.43, "step": 332 }, { "epoch": 0.36943558452364444, "grad_norm": 0.30866721272468567, "learning_rate": 0.0002923519736842105, "loss": 0.3759, "step": 333 }, { "epoch": 0.3705450006933851, "grad_norm": 0.20256434381008148, "learning_rate": 0.0002922286184210526, "loss": 0.6727, "step": 334 }, { "epoch": 0.3716544168631258, "grad_norm": 0.18781127035617828, "learning_rate": 0.0002921052631578947, "loss": 0.5784, "step": 335 }, { "epoch": 0.3727638330328665, "grad_norm": 0.3620914816856384, "learning_rate": 0.00029198190789473684, "loss": 0.8316, "step": 336 }, { "epoch": 0.3738732492026071, "grad_norm": 0.1937960535287857, "learning_rate": 0.00029185855263157894, "loss": 0.5956, "step": 337 }, { "epoch": 0.3749826653723478, "grad_norm": 0.21955540776252747, "learning_rate": 0.00029173519736842103, "loss": 0.6847, "step": 338 }, { "epoch": 0.37609208154208845, "grad_norm": 0.22091850638389587, "learning_rate": 0.0002916118421052631, "loss": 0.7571, "step": 339 }, { "epoch": 0.37720149771182915, "grad_norm": 0.17953120172023773, "learning_rate": 0.0002914884868421052, "loss": 0.596, "step": 340 }, { "epoch": 0.37831091388156984, "grad_norm": 0.2171243131160736, "learning_rate": 0.0002913651315789473, "loss": 0.5556, "step": 341 }, { "epoch": 0.3794203300513105, "grad_norm": 0.21151772141456604, "learning_rate": 0.00029124177631578946, "loss": 0.8455, "step": 342 }, { "epoch": 0.3805297462210512, "grad_norm": 0.21299928426742554, "learning_rate": 0.00029111842105263155, "loss": 0.4473, "step": 343 }, { "epoch": 0.3816391623907918, "grad_norm": 0.20761217176914215, "learning_rate": 0.00029099506578947364, "loss": 0.6742, "step": 344 }, { "epoch": 0.3827485785605325, "grad_norm": 0.21071919798851013, "learning_rate": 0.0002908717105263158, "loss": 0.5404, "step": 345 }, { "epoch": 0.3838579947302732, "grad_norm": 0.3248625099658966, "learning_rate": 0.0002907483552631579, "loss": 0.7006, "step": 346 }, { "epoch": 0.38496741090001385, "grad_norm": 0.3169274628162384, "learning_rate": 0.000290625, "loss": 0.5953, "step": 347 }, { "epoch": 0.38607682706975455, "grad_norm": 0.16656579077243805, "learning_rate": 0.00029050164473684207, "loss": 0.5392, "step": 348 }, { "epoch": 0.38718624323949524, "grad_norm": 0.1759122908115387, "learning_rate": 0.00029037828947368416, "loss": 0.6298, "step": 349 }, { "epoch": 0.3882956594092359, "grad_norm": 0.17183220386505127, "learning_rate": 0.00029025493421052626, "loss": 0.49, "step": 350 }, { "epoch": 0.3894050755789766, "grad_norm": 0.2180389016866684, "learning_rate": 0.0002901315789473684, "loss": 0.4447, "step": 351 }, { "epoch": 0.3905144917487172, "grad_norm": 0.3889177143573761, "learning_rate": 0.0002900082236842105, "loss": 0.48, "step": 352 }, { "epoch": 0.3916239079184579, "grad_norm": 0.22550411522388458, "learning_rate": 0.00028988486842105264, "loss": 0.5976, "step": 353 }, { "epoch": 0.3927333240881986, "grad_norm": 0.20059050619602203, "learning_rate": 0.00028976151315789474, "loss": 0.578, "step": 354 }, { "epoch": 0.39384274025793925, "grad_norm": 0.2586449086666107, "learning_rate": 0.00028963815789473683, "loss": 0.6453, "step": 355 }, { "epoch": 0.39495215642767995, "grad_norm": 0.24946491420269012, "learning_rate": 0.0002895148026315789, "loss": 0.649, "step": 356 }, { "epoch": 0.3960615725974206, "grad_norm": 0.1763986498117447, "learning_rate": 0.000289391447368421, "loss": 0.6183, "step": 357 }, { "epoch": 0.3971709887671613, "grad_norm": 0.1732664704322815, "learning_rate": 0.0002892680921052631, "loss": 0.8931, "step": 358 }, { "epoch": 0.398280404936902, "grad_norm": 0.22204923629760742, "learning_rate": 0.0002891447368421052, "loss": 0.5979, "step": 359 }, { "epoch": 0.3993898211066426, "grad_norm": 0.1942061334848404, "learning_rate": 0.00028902138157894735, "loss": 0.5693, "step": 360 }, { "epoch": 0.4004992372763833, "grad_norm": 0.2240975797176361, "learning_rate": 0.00028889802631578944, "loss": 0.6672, "step": 361 }, { "epoch": 0.40160865344612395, "grad_norm": 0.19991931319236755, "learning_rate": 0.0002887746710526316, "loss": 0.7095, "step": 362 }, { "epoch": 0.40271806961586465, "grad_norm": 0.1844676285982132, "learning_rate": 0.0002886513157894737, "loss": 0.5337, "step": 363 }, { "epoch": 0.40382748578560534, "grad_norm": 0.2115306556224823, "learning_rate": 0.0002885279605263158, "loss": 0.6032, "step": 364 }, { "epoch": 0.404936901955346, "grad_norm": 0.2993430495262146, "learning_rate": 0.00028840460526315787, "loss": 0.8077, "step": 365 }, { "epoch": 0.4060463181250867, "grad_norm": 0.41001948714256287, "learning_rate": 0.00028828124999999996, "loss": 0.7461, "step": 366 }, { "epoch": 0.4071557342948273, "grad_norm": 0.1562204509973526, "learning_rate": 0.00028815789473684206, "loss": 0.4517, "step": 367 }, { "epoch": 0.408265150464568, "grad_norm": 0.27741947770118713, "learning_rate": 0.0002880345394736842, "loss": 0.6076, "step": 368 }, { "epoch": 0.4093745666343087, "grad_norm": 0.205497607588768, "learning_rate": 0.0002879111842105263, "loss": 0.6499, "step": 369 }, { "epoch": 0.41048398280404935, "grad_norm": 0.412622332572937, "learning_rate": 0.0002877878289473684, "loss": 0.6487, "step": 370 }, { "epoch": 0.41159339897379005, "grad_norm": 0.19786472618579865, "learning_rate": 0.00028766447368421054, "loss": 0.4202, "step": 371 }, { "epoch": 0.41270281514353074, "grad_norm": 0.1875920295715332, "learning_rate": 0.00028754111842105263, "loss": 0.5377, "step": 372 }, { "epoch": 0.4138122313132714, "grad_norm": 0.22071506083011627, "learning_rate": 0.0002874177631578947, "loss": 0.4671, "step": 373 }, { "epoch": 0.4149216474830121, "grad_norm": 0.22277134656906128, "learning_rate": 0.0002872944078947368, "loss": 0.6436, "step": 374 }, { "epoch": 0.4160310636527527, "grad_norm": 0.13663825392723083, "learning_rate": 0.0002871710526315789, "loss": 0.3595, "step": 375 }, { "epoch": 0.4171404798224934, "grad_norm": 0.25280505418777466, "learning_rate": 0.000287047697368421, "loss": 0.7154, "step": 376 }, { "epoch": 0.4182498959922341, "grad_norm": 0.2542460262775421, "learning_rate": 0.00028692434210526315, "loss": 0.5229, "step": 377 }, { "epoch": 0.41935931216197475, "grad_norm": 0.20687228441238403, "learning_rate": 0.00028680098684210524, "loss": 0.4229, "step": 378 }, { "epoch": 0.42046872833171545, "grad_norm": 0.1827574074268341, "learning_rate": 0.00028667763157894734, "loss": 0.504, "step": 379 }, { "epoch": 0.4215781445014561, "grad_norm": 0.22924618422985077, "learning_rate": 0.00028655427631578943, "loss": 0.4645, "step": 380 }, { "epoch": 0.4226875606711968, "grad_norm": 0.1500721126794815, "learning_rate": 0.0002864309210526316, "loss": 0.2866, "step": 381 }, { "epoch": 0.4237969768409375, "grad_norm": 0.19137370586395264, "learning_rate": 0.00028630756578947367, "loss": 0.4739, "step": 382 }, { "epoch": 0.4249063930106781, "grad_norm": 0.1940913051366806, "learning_rate": 0.00028618421052631576, "loss": 0.6433, "step": 383 }, { "epoch": 0.4260158091804188, "grad_norm": 0.17999312281608582, "learning_rate": 0.00028606085526315786, "loss": 0.6118, "step": 384 }, { "epoch": 0.42712522535015945, "grad_norm": 0.21522557735443115, "learning_rate": 0.00028593749999999995, "loss": 0.5608, "step": 385 }, { "epoch": 0.42823464151990015, "grad_norm": 0.23753724992275238, "learning_rate": 0.0002858141447368421, "loss": 0.4775, "step": 386 }, { "epoch": 0.42934405768964085, "grad_norm": 0.28104132413864136, "learning_rate": 0.0002856907894736842, "loss": 0.7179, "step": 387 }, { "epoch": 0.4304534738593815, "grad_norm": 0.16530390083789825, "learning_rate": 0.0002855674342105263, "loss": 0.5765, "step": 388 }, { "epoch": 0.4315628900291222, "grad_norm": 0.20358699560165405, "learning_rate": 0.0002854440789473684, "loss": 0.6325, "step": 389 }, { "epoch": 0.4326723061988628, "grad_norm": 0.17629845440387726, "learning_rate": 0.0002853207236842105, "loss": 0.5528, "step": 390 }, { "epoch": 0.4337817223686035, "grad_norm": 0.25051596760749817, "learning_rate": 0.0002851973684210526, "loss": 0.6712, "step": 391 }, { "epoch": 0.4348911385383442, "grad_norm": 0.19358691573143005, "learning_rate": 0.0002850740131578947, "loss": 0.5572, "step": 392 }, { "epoch": 0.43600055470808485, "grad_norm": 0.13769972324371338, "learning_rate": 0.0002849506578947368, "loss": 0.5055, "step": 393 }, { "epoch": 0.43710997087782555, "grad_norm": 0.14807964861392975, "learning_rate": 0.0002848273026315789, "loss": 0.5963, "step": 394 }, { "epoch": 0.43821938704756624, "grad_norm": 0.16840098798274994, "learning_rate": 0.00028470394736842104, "loss": 0.4203, "step": 395 }, { "epoch": 0.4393288032173069, "grad_norm": 0.16224174201488495, "learning_rate": 0.00028458059210526314, "loss": 0.7647, "step": 396 }, { "epoch": 0.4404382193870476, "grad_norm": 0.2029973566532135, "learning_rate": 0.00028445723684210523, "loss": 0.6427, "step": 397 }, { "epoch": 0.4415476355567882, "grad_norm": 0.23025457561016083, "learning_rate": 0.0002843338815789473, "loss": 0.5838, "step": 398 }, { "epoch": 0.4426570517265289, "grad_norm": 0.2005675584077835, "learning_rate": 0.0002842105263157894, "loss": 0.6441, "step": 399 }, { "epoch": 0.4437664678962696, "grad_norm": 0.2208050787448883, "learning_rate": 0.00028408717105263156, "loss": 0.656, "step": 400 }, { "epoch": 0.44487588406601025, "grad_norm": 0.28568586707115173, "learning_rate": 0.00028396381578947366, "loss": 0.6623, "step": 401 }, { "epoch": 0.44598530023575095, "grad_norm": 0.22206373512744904, "learning_rate": 0.00028384046052631575, "loss": 0.5584, "step": 402 }, { "epoch": 0.4470947164054916, "grad_norm": 0.20780105888843536, "learning_rate": 0.0002837171052631579, "loss": 0.5801, "step": 403 }, { "epoch": 0.4482041325752323, "grad_norm": 0.20285826921463013, "learning_rate": 0.00028359375, "loss": 0.6751, "step": 404 }, { "epoch": 0.449313548744973, "grad_norm": 0.15252311527729034, "learning_rate": 0.0002834703947368421, "loss": 1.1206, "step": 405 }, { "epoch": 0.4504229649147136, "grad_norm": 0.23378504812717438, "learning_rate": 0.0002833470394736842, "loss": 0.6577, "step": 406 }, { "epoch": 0.4515323810844543, "grad_norm": 0.1787406951189041, "learning_rate": 0.00028322368421052627, "loss": 0.7497, "step": 407 }, { "epoch": 0.45264179725419496, "grad_norm": 0.48000597953796387, "learning_rate": 0.00028310032894736836, "loss": 0.752, "step": 408 }, { "epoch": 0.45375121342393565, "grad_norm": 0.2794741094112396, "learning_rate": 0.0002829769736842105, "loss": 0.5731, "step": 409 }, { "epoch": 0.45486062959367635, "grad_norm": 0.2653048634529114, "learning_rate": 0.0002828536184210526, "loss": 0.4867, "step": 410 }, { "epoch": 0.455970045763417, "grad_norm": 0.19287265837192535, "learning_rate": 0.0002827302631578947, "loss": 0.643, "step": 411 }, { "epoch": 0.4570794619331577, "grad_norm": 0.2332431823015213, "learning_rate": 0.00028260690789473685, "loss": 0.4386, "step": 412 }, { "epoch": 0.4581888781028983, "grad_norm": 0.21826831996440887, "learning_rate": 0.00028248355263157894, "loss": 0.548, "step": 413 }, { "epoch": 0.459298294272639, "grad_norm": 0.23967108130455017, "learning_rate": 0.00028236019736842103, "loss": 0.5739, "step": 414 }, { "epoch": 0.4604077104423797, "grad_norm": 0.18406794965267181, "learning_rate": 0.0002822368421052631, "loss": 0.4547, "step": 415 }, { "epoch": 0.46151712661212035, "grad_norm": 0.18926838040351868, "learning_rate": 0.0002821134868421052, "loss": 0.5208, "step": 416 }, { "epoch": 0.46262654278186105, "grad_norm": 0.16055501997470856, "learning_rate": 0.0002819901315789473, "loss": 0.4273, "step": 417 }, { "epoch": 0.46373595895160175, "grad_norm": 0.18228095769882202, "learning_rate": 0.00028186677631578946, "loss": 0.6194, "step": 418 }, { "epoch": 0.4648453751213424, "grad_norm": 0.1525285542011261, "learning_rate": 0.00028174342105263155, "loss": 0.5515, "step": 419 }, { "epoch": 0.4659547912910831, "grad_norm": 0.24812228977680206, "learning_rate": 0.00028162006578947365, "loss": 0.4666, "step": 420 }, { "epoch": 0.4670642074608237, "grad_norm": 0.3236676752567291, "learning_rate": 0.0002814967105263158, "loss": 0.6294, "step": 421 }, { "epoch": 0.4681736236305644, "grad_norm": 0.15236404538154602, "learning_rate": 0.0002813733552631579, "loss": 0.4148, "step": 422 }, { "epoch": 0.4692830398003051, "grad_norm": 0.20661397278308868, "learning_rate": 0.00028125, "loss": 0.4696, "step": 423 }, { "epoch": 0.47039245597004575, "grad_norm": 0.3039199113845825, "learning_rate": 0.00028112664473684207, "loss": 0.7764, "step": 424 }, { "epoch": 0.47150187213978645, "grad_norm": 0.2243487536907196, "learning_rate": 0.00028100328947368417, "loss": 0.575, "step": 425 }, { "epoch": 0.4726112883095271, "grad_norm": 0.15252256393432617, "learning_rate": 0.00028087993421052626, "loss": 0.6139, "step": 426 }, { "epoch": 0.4737207044792678, "grad_norm": 0.2288883924484253, "learning_rate": 0.0002807565789473684, "loss": 0.5199, "step": 427 }, { "epoch": 0.4748301206490085, "grad_norm": 0.1678112894296646, "learning_rate": 0.0002806332236842105, "loss": 0.699, "step": 428 }, { "epoch": 0.4759395368187491, "grad_norm": 0.27088475227355957, "learning_rate": 0.00028050986842105265, "loss": 0.5913, "step": 429 }, { "epoch": 0.4770489529884898, "grad_norm": 0.19018007814884186, "learning_rate": 0.00028038651315789474, "loss": 0.5341, "step": 430 }, { "epoch": 0.47815836915823046, "grad_norm": 0.19086478650569916, "learning_rate": 0.00028026315789473683, "loss": 0.565, "step": 431 }, { "epoch": 0.47926778532797115, "grad_norm": 0.26616349816322327, "learning_rate": 0.0002801398026315789, "loss": 0.7576, "step": 432 }, { "epoch": 0.48037720149771185, "grad_norm": 0.16013029217720032, "learning_rate": 0.000280016447368421, "loss": 0.5242, "step": 433 }, { "epoch": 0.4814866176674525, "grad_norm": 0.16926300525665283, "learning_rate": 0.0002798930921052631, "loss": 0.4289, "step": 434 }, { "epoch": 0.4825960338371932, "grad_norm": 0.2056371569633484, "learning_rate": 0.0002797697368421052, "loss": 0.5971, "step": 435 }, { "epoch": 0.4837054500069338, "grad_norm": 0.1635441929101944, "learning_rate": 0.00027964638157894735, "loss": 0.4277, "step": 436 }, { "epoch": 0.4848148661766745, "grad_norm": 0.24154643714427948, "learning_rate": 0.00027952302631578945, "loss": 0.6508, "step": 437 }, { "epoch": 0.4859242823464152, "grad_norm": 0.2069096565246582, "learning_rate": 0.0002793996710526316, "loss": 0.5105, "step": 438 }, { "epoch": 0.48703369851615586, "grad_norm": 0.23238608241081238, "learning_rate": 0.0002792763157894737, "loss": 0.7069, "step": 439 }, { "epoch": 0.48814311468589655, "grad_norm": 0.18234537541866302, "learning_rate": 0.0002791529605263158, "loss": 0.5141, "step": 440 }, { "epoch": 0.48925253085563725, "grad_norm": 0.1497894525527954, "learning_rate": 0.00027902960526315787, "loss": 0.5206, "step": 441 }, { "epoch": 0.4903619470253779, "grad_norm": 0.2433656007051468, "learning_rate": 0.00027890624999999997, "loss": 0.5697, "step": 442 }, { "epoch": 0.4914713631951186, "grad_norm": 0.1533818542957306, "learning_rate": 0.00027878289473684206, "loss": 0.359, "step": 443 }, { "epoch": 0.4925807793648592, "grad_norm": 0.2589110732078552, "learning_rate": 0.00027865953947368415, "loss": 0.6381, "step": 444 }, { "epoch": 0.4936901955345999, "grad_norm": 0.2857501208782196, "learning_rate": 0.0002785361842105263, "loss": 0.7064, "step": 445 }, { "epoch": 0.4947996117043406, "grad_norm": 0.21398474276065826, "learning_rate": 0.0002784128289473684, "loss": 0.5152, "step": 446 }, { "epoch": 0.49590902787408125, "grad_norm": 0.23514775931835175, "learning_rate": 0.00027828947368421054, "loss": 0.8143, "step": 447 }, { "epoch": 0.49701844404382195, "grad_norm": 0.18278779089450836, "learning_rate": 0.00027816611842105263, "loss": 0.4275, "step": 448 }, { "epoch": 0.4981278602135626, "grad_norm": 0.16910268366336823, "learning_rate": 0.0002780427631578947, "loss": 0.5477, "step": 449 }, { "epoch": 0.4992372763833033, "grad_norm": 0.17349810898303986, "learning_rate": 0.0002779194078947368, "loss": 0.5224, "step": 450 }, { "epoch": 0.500346692553044, "grad_norm": 0.2979370653629303, "learning_rate": 0.0002777960526315789, "loss": 0.6723, "step": 451 }, { "epoch": 0.5014561087227847, "grad_norm": 0.2329479455947876, "learning_rate": 0.000277672697368421, "loss": 0.5004, "step": 452 }, { "epoch": 0.5025655248925253, "grad_norm": 0.18267230689525604, "learning_rate": 0.00027754934210526315, "loss": 0.5729, "step": 453 }, { "epoch": 0.503674941062266, "grad_norm": 0.21524755656719208, "learning_rate": 0.00027742598684210525, "loss": 0.4295, "step": 454 }, { "epoch": 0.5047843572320067, "grad_norm": 0.18233224749565125, "learning_rate": 0.00027730263157894734, "loss": 0.4668, "step": 455 }, { "epoch": 0.5058937734017473, "grad_norm": 0.19789119064807892, "learning_rate": 0.00027717927631578943, "loss": 0.6149, "step": 456 }, { "epoch": 0.507003189571488, "grad_norm": 0.16243144869804382, "learning_rate": 0.0002770559210526316, "loss": 0.415, "step": 457 }, { "epoch": 0.5081126057412286, "grad_norm": 0.22004704177379608, "learning_rate": 0.0002769325657894737, "loss": 0.6058, "step": 458 }, { "epoch": 0.5092220219109693, "grad_norm": 0.1755845546722412, "learning_rate": 0.00027680921052631577, "loss": 0.659, "step": 459 }, { "epoch": 0.51033143808071, "grad_norm": 0.16181863844394684, "learning_rate": 0.00027668585526315786, "loss": 0.5118, "step": 460 }, { "epoch": 0.5114408542504507, "grad_norm": 0.30584779381752014, "learning_rate": 0.00027656249999999995, "loss": 0.5185, "step": 461 }, { "epoch": 0.5125502704201914, "grad_norm": 0.2442709058523178, "learning_rate": 0.0002764391447368421, "loss": 0.6267, "step": 462 }, { "epoch": 0.513659686589932, "grad_norm": 0.17913980782032013, "learning_rate": 0.0002763157894736842, "loss": 0.4123, "step": 463 }, { "epoch": 0.5147691027596727, "grad_norm": 0.16953568160533905, "learning_rate": 0.0002761924342105263, "loss": 0.4336, "step": 464 }, { "epoch": 0.5158785189294134, "grad_norm": 0.2636931538581848, "learning_rate": 0.0002760690789473684, "loss": 0.8255, "step": 465 }, { "epoch": 0.5169879350991541, "grad_norm": 0.1953415870666504, "learning_rate": 0.00027594572368421053, "loss": 0.3879, "step": 466 }, { "epoch": 0.5180973512688948, "grad_norm": 0.23631513118743896, "learning_rate": 0.0002758223684210526, "loss": 0.4591, "step": 467 }, { "epoch": 0.5192067674386354, "grad_norm": 0.25506916642189026, "learning_rate": 0.0002756990131578947, "loss": 0.6347, "step": 468 }, { "epoch": 0.5203161836083761, "grad_norm": 0.1907813549041748, "learning_rate": 0.0002755756578947368, "loss": 0.6049, "step": 469 }, { "epoch": 0.5214255997781168, "grad_norm": 0.26337459683418274, "learning_rate": 0.0002754523026315789, "loss": 0.9162, "step": 470 }, { "epoch": 0.5225350159478575, "grad_norm": 0.2560301721096039, "learning_rate": 0.00027532894736842105, "loss": 0.5243, "step": 471 }, { "epoch": 0.5236444321175981, "grad_norm": 0.21078939735889435, "learning_rate": 0.00027520559210526314, "loss": 0.528, "step": 472 }, { "epoch": 0.5247538482873388, "grad_norm": 0.19924308359622955, "learning_rate": 0.00027508223684210523, "loss": 0.484, "step": 473 }, { "epoch": 0.5258632644570794, "grad_norm": 0.1312379091978073, "learning_rate": 0.0002749588815789473, "loss": 0.5321, "step": 474 }, { "epoch": 0.5269726806268201, "grad_norm": 0.32874926924705505, "learning_rate": 0.0002748355263157894, "loss": 0.5229, "step": 475 }, { "epoch": 0.5280820967965608, "grad_norm": 0.2065068781375885, "learning_rate": 0.00027471217105263157, "loss": 0.5401, "step": 476 }, { "epoch": 0.5291915129663015, "grad_norm": 0.18827693164348602, "learning_rate": 0.00027458881578947366, "loss": 0.3993, "step": 477 }, { "epoch": 0.5303009291360422, "grad_norm": 0.20950929820537567, "learning_rate": 0.00027446546052631575, "loss": 0.6372, "step": 478 }, { "epoch": 0.5314103453057828, "grad_norm": 0.20649929344654083, "learning_rate": 0.0002743421052631579, "loss": 0.647, "step": 479 }, { "epoch": 0.5325197614755235, "grad_norm": 0.14667537808418274, "learning_rate": 0.00027421875, "loss": 0.5186, "step": 480 }, { "epoch": 0.5336291776452642, "grad_norm": 0.19212156534194946, "learning_rate": 0.0002740953947368421, "loss": 0.8352, "step": 481 }, { "epoch": 0.5347385938150049, "grad_norm": 0.2529224753379822, "learning_rate": 0.0002739720394736842, "loss": 0.5348, "step": 482 }, { "epoch": 0.5358480099847456, "grad_norm": 0.2153153419494629, "learning_rate": 0.0002738486842105263, "loss": 0.6096, "step": 483 }, { "epoch": 0.5369574261544862, "grad_norm": 0.2908189296722412, "learning_rate": 0.00027372532894736837, "loss": 0.6725, "step": 484 }, { "epoch": 0.5380668423242269, "grad_norm": 0.2697788178920746, "learning_rate": 0.0002736019736842105, "loss": 0.4649, "step": 485 }, { "epoch": 0.5391762584939676, "grad_norm": 0.20288242399692535, "learning_rate": 0.0002734786184210526, "loss": 0.5319, "step": 486 }, { "epoch": 0.5402856746637082, "grad_norm": 0.25883370637893677, "learning_rate": 0.0002733552631578947, "loss": 0.3363, "step": 487 }, { "epoch": 0.5413950908334489, "grad_norm": 0.15326879918575287, "learning_rate": 0.00027323190789473685, "loss": 0.4186, "step": 488 }, { "epoch": 0.5425045070031895, "grad_norm": 0.20244112610816956, "learning_rate": 0.00027310855263157894, "loss": 0.5039, "step": 489 }, { "epoch": 0.5436139231729302, "grad_norm": 0.21093213558197021, "learning_rate": 0.00027298519736842103, "loss": 0.7453, "step": 490 }, { "epoch": 0.5447233393426709, "grad_norm": 0.22425080835819244, "learning_rate": 0.00027286184210526313, "loss": 0.5743, "step": 491 }, { "epoch": 0.5458327555124116, "grad_norm": 0.2680664360523224, "learning_rate": 0.0002727384868421052, "loss": 0.5446, "step": 492 }, { "epoch": 0.5469421716821523, "grad_norm": 0.24040096998214722, "learning_rate": 0.0002726151315789473, "loss": 0.4714, "step": 493 }, { "epoch": 0.5480515878518929, "grad_norm": 0.30965113639831543, "learning_rate": 0.0002724917763157894, "loss": 0.8154, "step": 494 }, { "epoch": 0.5491610040216336, "grad_norm": 0.16201867163181305, "learning_rate": 0.00027236842105263155, "loss": 0.4898, "step": 495 }, { "epoch": 0.5502704201913743, "grad_norm": 0.3444017767906189, "learning_rate": 0.00027224506578947365, "loss": 0.5012, "step": 496 }, { "epoch": 0.551379836361115, "grad_norm": 0.5116562247276306, "learning_rate": 0.0002721217105263158, "loss": 0.4425, "step": 497 }, { "epoch": 0.5524892525308557, "grad_norm": 0.17541073262691498, "learning_rate": 0.0002719983552631579, "loss": 0.6107, "step": 498 }, { "epoch": 0.5535986687005963, "grad_norm": 0.4572921097278595, "learning_rate": 0.000271875, "loss": 0.5645, "step": 499 }, { "epoch": 0.554708084870337, "grad_norm": 0.35991188883781433, "learning_rate": 0.0002717516447368421, "loss": 0.4648, "step": 500 }, { "epoch": 0.5558175010400777, "grad_norm": 0.2327331304550171, "learning_rate": 0.00027162828947368417, "loss": 0.5978, "step": 501 }, { "epoch": 0.5569269172098184, "grad_norm": 0.25607866048812866, "learning_rate": 0.00027150493421052626, "loss": 0.7341, "step": 502 }, { "epoch": 0.558036333379559, "grad_norm": 0.26063939929008484, "learning_rate": 0.0002713815789473684, "loss": 0.6693, "step": 503 }, { "epoch": 0.5591457495492996, "grad_norm": 0.23963363468647003, "learning_rate": 0.0002712582236842105, "loss": 0.4704, "step": 504 }, { "epoch": 0.5602551657190403, "grad_norm": 0.21853481233119965, "learning_rate": 0.0002711348684210526, "loss": 0.827, "step": 505 }, { "epoch": 0.561364581888781, "grad_norm": 0.2731577754020691, "learning_rate": 0.00027101151315789474, "loss": 0.5781, "step": 506 }, { "epoch": 0.5624739980585217, "grad_norm": 0.1824404001235962, "learning_rate": 0.00027088815789473684, "loss": 0.5114, "step": 507 }, { "epoch": 0.5635834142282624, "grad_norm": 0.26666054129600525, "learning_rate": 0.00027076480263157893, "loss": 0.5057, "step": 508 }, { "epoch": 0.564692830398003, "grad_norm": 0.22783181071281433, "learning_rate": 0.000270641447368421, "loss": 0.6204, "step": 509 }, { "epoch": 0.5658022465677437, "grad_norm": 0.20822562277317047, "learning_rate": 0.0002705180921052631, "loss": 0.3751, "step": 510 }, { "epoch": 0.5669116627374844, "grad_norm": 0.1988370418548584, "learning_rate": 0.0002703947368421052, "loss": 0.5364, "step": 511 }, { "epoch": 0.5680210789072251, "grad_norm": 0.26172971725463867, "learning_rate": 0.00027027138157894736, "loss": 0.7982, "step": 512 }, { "epoch": 0.5691304950769658, "grad_norm": 0.25788214802742004, "learning_rate": 0.00027014802631578945, "loss": 0.5565, "step": 513 }, { "epoch": 0.5702399112467064, "grad_norm": 0.23320072889328003, "learning_rate": 0.0002700246710526316, "loss": 0.586, "step": 514 }, { "epoch": 0.5713493274164471, "grad_norm": 0.1965775191783905, "learning_rate": 0.0002699013157894737, "loss": 0.681, "step": 515 }, { "epoch": 0.5724587435861878, "grad_norm": 0.4177470803260803, "learning_rate": 0.0002697779605263158, "loss": 0.5532, "step": 516 }, { "epoch": 0.5735681597559285, "grad_norm": 0.16181616485118866, "learning_rate": 0.0002696546052631579, "loss": 0.5, "step": 517 }, { "epoch": 0.5746775759256691, "grad_norm": 0.20417065918445587, "learning_rate": 0.00026953124999999997, "loss": 0.5589, "step": 518 }, { "epoch": 0.5757869920954098, "grad_norm": 0.2022491842508316, "learning_rate": 0.00026940789473684206, "loss": 0.5517, "step": 519 }, { "epoch": 0.5768964082651504, "grad_norm": 0.3004019558429718, "learning_rate": 0.00026928453947368415, "loss": 0.46, "step": 520 }, { "epoch": 0.5780058244348911, "grad_norm": 0.2016931027173996, "learning_rate": 0.0002691611842105263, "loss": 0.3938, "step": 521 }, { "epoch": 0.5791152406046318, "grad_norm": 0.22006861865520477, "learning_rate": 0.0002690378289473684, "loss": 0.6507, "step": 522 }, { "epoch": 0.5802246567743725, "grad_norm": 0.2743866741657257, "learning_rate": 0.00026891447368421054, "loss": 0.6233, "step": 523 }, { "epoch": 0.5813340729441132, "grad_norm": 0.2324676811695099, "learning_rate": 0.00026879111842105264, "loss": 0.7202, "step": 524 }, { "epoch": 0.5824434891138538, "grad_norm": 0.2942185401916504, "learning_rate": 0.00026866776315789473, "loss": 0.577, "step": 525 }, { "epoch": 0.5835529052835945, "grad_norm": 0.20303772389888763, "learning_rate": 0.0002685444078947368, "loss": 0.5867, "step": 526 }, { "epoch": 0.5846623214533352, "grad_norm": 0.3175172507762909, "learning_rate": 0.0002684210526315789, "loss": 0.5936, "step": 527 }, { "epoch": 0.5857717376230759, "grad_norm": 0.26434624195098877, "learning_rate": 0.000268297697368421, "loss": 0.8496, "step": 528 }, { "epoch": 0.5868811537928166, "grad_norm": 0.20476919412612915, "learning_rate": 0.00026817434210526316, "loss": 0.5323, "step": 529 }, { "epoch": 0.5879905699625572, "grad_norm": 0.17890197038650513, "learning_rate": 0.00026805098684210525, "loss": 0.618, "step": 530 }, { "epoch": 0.5890999861322979, "grad_norm": 0.12501509487628937, "learning_rate": 0.00026792763157894734, "loss": 0.3985, "step": 531 }, { "epoch": 0.5902094023020386, "grad_norm": 0.14128711819648743, "learning_rate": 0.00026780427631578944, "loss": 0.3521, "step": 532 }, { "epoch": 0.5913188184717793, "grad_norm": 0.24814924597740173, "learning_rate": 0.0002676809210526316, "loss": 0.5524, "step": 533 }, { "epoch": 0.59242823464152, "grad_norm": 0.1946108043193817, "learning_rate": 0.0002675575657894737, "loss": 0.6405, "step": 534 }, { "epoch": 0.5935376508112605, "grad_norm": 0.20254820585250854, "learning_rate": 0.00026743421052631577, "loss": 0.5621, "step": 535 }, { "epoch": 0.5946470669810012, "grad_norm": 0.19255991280078888, "learning_rate": 0.00026731085526315786, "loss": 0.7144, "step": 536 }, { "epoch": 0.5957564831507419, "grad_norm": 0.20394358038902283, "learning_rate": 0.00026718749999999996, "loss": 0.4172, "step": 537 }, { "epoch": 0.5968658993204826, "grad_norm": 0.2700938582420349, "learning_rate": 0.0002670641447368421, "loss": 0.6688, "step": 538 }, { "epoch": 0.5979753154902233, "grad_norm": 0.3247049152851105, "learning_rate": 0.0002669407894736842, "loss": 0.632, "step": 539 }, { "epoch": 0.5990847316599639, "grad_norm": 0.2315102368593216, "learning_rate": 0.0002668174342105263, "loss": 0.5881, "step": 540 }, { "epoch": 0.6001941478297046, "grad_norm": 0.28765103220939636, "learning_rate": 0.0002666940789473684, "loss": 0.6717, "step": 541 }, { "epoch": 0.6013035639994453, "grad_norm": 0.24762357771396637, "learning_rate": 0.00026657072368421053, "loss": 0.692, "step": 542 }, { "epoch": 0.602412980169186, "grad_norm": 0.25794705748558044, "learning_rate": 0.0002664473684210526, "loss": 0.5335, "step": 543 }, { "epoch": 0.6035223963389267, "grad_norm": 0.2661387622356415, "learning_rate": 0.0002663240131578947, "loss": 0.7539, "step": 544 }, { "epoch": 0.6046318125086673, "grad_norm": 0.2539893388748169, "learning_rate": 0.0002662006578947368, "loss": 0.4355, "step": 545 }, { "epoch": 0.605741228678408, "grad_norm": 0.23424486815929413, "learning_rate": 0.0002660773026315789, "loss": 0.5143, "step": 546 }, { "epoch": 0.6068506448481487, "grad_norm": 0.1962471306324005, "learning_rate": 0.00026595394736842105, "loss": 0.4722, "step": 547 }, { "epoch": 0.6079600610178894, "grad_norm": 0.1904420107603073, "learning_rate": 0.00026583059210526314, "loss": 0.5893, "step": 548 }, { "epoch": 0.60906947718763, "grad_norm": 0.3046864867210388, "learning_rate": 0.00026570723684210524, "loss": 0.8071, "step": 549 }, { "epoch": 0.6101788933573706, "grad_norm": 0.1840696930885315, "learning_rate": 0.00026558388157894733, "loss": 0.4925, "step": 550 }, { "epoch": 0.6112883095271113, "grad_norm": 0.33538711071014404, "learning_rate": 0.0002654605263157894, "loss": 0.6222, "step": 551 }, { "epoch": 0.612397725696852, "grad_norm": 0.22219829261302948, "learning_rate": 0.00026533717105263157, "loss": 0.4808, "step": 552 }, { "epoch": 0.6135071418665927, "grad_norm": 0.3794260621070862, "learning_rate": 0.00026521381578947366, "loss": 0.4464, "step": 553 }, { "epoch": 0.6146165580363334, "grad_norm": 0.21123401820659637, "learning_rate": 0.00026509046052631576, "loss": 0.6366, "step": 554 }, { "epoch": 0.615725974206074, "grad_norm": 0.19497540593147278, "learning_rate": 0.00026496710526315785, "loss": 0.5467, "step": 555 }, { "epoch": 0.6168353903758147, "grad_norm": 0.18902145326137543, "learning_rate": 0.00026484375, "loss": 0.4334, "step": 556 }, { "epoch": 0.6179448065455554, "grad_norm": 0.24114537239074707, "learning_rate": 0.0002647203947368421, "loss": 0.5268, "step": 557 }, { "epoch": 0.6190542227152961, "grad_norm": 0.18477365374565125, "learning_rate": 0.0002645970394736842, "loss": 0.6423, "step": 558 }, { "epoch": 0.6201636388850368, "grad_norm": 0.21324200928211212, "learning_rate": 0.0002644736842105263, "loss": 0.6366, "step": 559 }, { "epoch": 0.6212730550547775, "grad_norm": 0.38751551508903503, "learning_rate": 0.00026435032894736837, "loss": 0.6193, "step": 560 }, { "epoch": 0.6223824712245181, "grad_norm": 0.22451990842819214, "learning_rate": 0.0002642269736842105, "loss": 0.6747, "step": 561 }, { "epoch": 0.6234918873942588, "grad_norm": 0.2260679006576538, "learning_rate": 0.0002641036184210526, "loss": 0.8031, "step": 562 }, { "epoch": 0.6246013035639995, "grad_norm": 0.22427742183208466, "learning_rate": 0.0002639802631578947, "loss": 0.7171, "step": 563 }, { "epoch": 0.6257107197337402, "grad_norm": 0.20108933746814728, "learning_rate": 0.00026385690789473685, "loss": 0.4898, "step": 564 }, { "epoch": 0.6268201359034808, "grad_norm": 0.3800278604030609, "learning_rate": 0.00026373355263157894, "loss": 0.6466, "step": 565 }, { "epoch": 0.6279295520732214, "grad_norm": 0.22784464061260223, "learning_rate": 0.00026361019736842104, "loss": 0.5532, "step": 566 }, { "epoch": 0.6290389682429621, "grad_norm": 0.23498325049877167, "learning_rate": 0.00026348684210526313, "loss": 0.8687, "step": 567 }, { "epoch": 0.6301483844127028, "grad_norm": 0.1839025616645813, "learning_rate": 0.0002633634868421052, "loss": 0.6382, "step": 568 }, { "epoch": 0.6312578005824435, "grad_norm": 0.22980616986751556, "learning_rate": 0.0002632401315789473, "loss": 0.6109, "step": 569 }, { "epoch": 0.6323672167521842, "grad_norm": 0.17458495497703552, "learning_rate": 0.0002631167763157894, "loss": 0.5498, "step": 570 }, { "epoch": 0.6334766329219248, "grad_norm": 0.22085556387901306, "learning_rate": 0.00026299342105263156, "loss": 0.5102, "step": 571 }, { "epoch": 0.6345860490916655, "grad_norm": 0.3213456869125366, "learning_rate": 0.00026287006578947365, "loss": 0.7377, "step": 572 }, { "epoch": 0.6356954652614062, "grad_norm": 0.2649673819541931, "learning_rate": 0.0002627467105263158, "loss": 0.6664, "step": 573 }, { "epoch": 0.6368048814311469, "grad_norm": 0.3784686326980591, "learning_rate": 0.0002626233552631579, "loss": 0.9214, "step": 574 }, { "epoch": 0.6379142976008876, "grad_norm": 0.1708430051803589, "learning_rate": 0.0002625, "loss": 0.4383, "step": 575 }, { "epoch": 0.6390237137706282, "grad_norm": 0.26163679361343384, "learning_rate": 0.0002623766447368421, "loss": 0.8272, "step": 576 }, { "epoch": 0.6401331299403689, "grad_norm": 0.16542355716228485, "learning_rate": 0.00026225328947368417, "loss": 0.4739, "step": 577 }, { "epoch": 0.6412425461101096, "grad_norm": 0.2180267572402954, "learning_rate": 0.00026212993421052626, "loss": 0.6221, "step": 578 }, { "epoch": 0.6423519622798503, "grad_norm": 0.21957628428936005, "learning_rate": 0.0002620065789473684, "loss": 0.6062, "step": 579 }, { "epoch": 0.643461378449591, "grad_norm": 0.20948325097560883, "learning_rate": 0.0002618832236842105, "loss": 0.5542, "step": 580 }, { "epoch": 0.6445707946193315, "grad_norm": 0.28024452924728394, "learning_rate": 0.0002617598684210526, "loss": 0.718, "step": 581 }, { "epoch": 0.6456802107890722, "grad_norm": 0.1376865804195404, "learning_rate": 0.00026163651315789474, "loss": 0.4168, "step": 582 }, { "epoch": 0.6467896269588129, "grad_norm": 0.15050861239433289, "learning_rate": 0.00026151315789473684, "loss": 0.5308, "step": 583 }, { "epoch": 0.6478990431285536, "grad_norm": 0.21709243953227997, "learning_rate": 0.00026138980263157893, "loss": 0.677, "step": 584 }, { "epoch": 0.6490084592982943, "grad_norm": 0.37020203471183777, "learning_rate": 0.000261266447368421, "loss": 0.6208, "step": 585 }, { "epoch": 0.6501178754680349, "grad_norm": 0.2256883978843689, "learning_rate": 0.0002611430921052631, "loss": 0.7711, "step": 586 }, { "epoch": 0.6512272916377756, "grad_norm": 0.22099201381206512, "learning_rate": 0.0002610197368421052, "loss": 0.6091, "step": 587 }, { "epoch": 0.6523367078075163, "grad_norm": 0.23496872186660767, "learning_rate": 0.00026089638157894736, "loss": 0.2471, "step": 588 }, { "epoch": 0.653446123977257, "grad_norm": 0.2800827622413635, "learning_rate": 0.00026077302631578945, "loss": 0.5343, "step": 589 }, { "epoch": 0.6545555401469977, "grad_norm": 0.22502388060092926, "learning_rate": 0.00026064967105263154, "loss": 0.5117, "step": 590 }, { "epoch": 0.6556649563167383, "grad_norm": 0.1460188329219818, "learning_rate": 0.0002605263157894737, "loss": 0.4772, "step": 591 }, { "epoch": 0.656774372486479, "grad_norm": 0.16591776907444, "learning_rate": 0.0002604029605263158, "loss": 0.6087, "step": 592 }, { "epoch": 0.6578837886562197, "grad_norm": 0.13937248289585114, "learning_rate": 0.0002602796052631579, "loss": 0.5587, "step": 593 }, { "epoch": 0.6589932048259604, "grad_norm": 0.2956066131591797, "learning_rate": 0.00026015624999999997, "loss": 0.7117, "step": 594 }, { "epoch": 0.660102620995701, "grad_norm": 0.495911568403244, "learning_rate": 0.00026003289473684206, "loss": 0.8364, "step": 595 }, { "epoch": 0.6612120371654416, "grad_norm": 0.1725756675004959, "learning_rate": 0.00025990953947368416, "loss": 0.4979, "step": 596 }, { "epoch": 0.6623214533351823, "grad_norm": 0.25791987776756287, "learning_rate": 0.0002597861842105263, "loss": 0.7304, "step": 597 }, { "epoch": 0.663430869504923, "grad_norm": 0.18017613887786865, "learning_rate": 0.0002596628289473684, "loss": 0.5706, "step": 598 }, { "epoch": 0.6645402856746637, "grad_norm": 0.25128671526908875, "learning_rate": 0.00025953947368421055, "loss": 0.5846, "step": 599 }, { "epoch": 0.6656497018444044, "grad_norm": 0.34931543469429016, "learning_rate": 0.00025941611842105264, "loss": 0.7017, "step": 600 }, { "epoch": 0.666759118014145, "grad_norm": 0.25030258297920227, "learning_rate": 0.00025929276315789473, "loss": 0.5193, "step": 601 }, { "epoch": 0.6678685341838857, "grad_norm": 0.236861452460289, "learning_rate": 0.0002591694078947368, "loss": 0.6901, "step": 602 }, { "epoch": 0.6689779503536264, "grad_norm": 0.308292418718338, "learning_rate": 0.0002590460526315789, "loss": 0.4285, "step": 603 }, { "epoch": 0.6700873665233671, "grad_norm": 0.2141687422990799, "learning_rate": 0.000258922697368421, "loss": 0.3857, "step": 604 }, { "epoch": 0.6711967826931078, "grad_norm": 0.164393350481987, "learning_rate": 0.0002587993421052631, "loss": 0.5326, "step": 605 }, { "epoch": 0.6723061988628485, "grad_norm": 0.30191662907600403, "learning_rate": 0.00025867598684210525, "loss": 0.4978, "step": 606 }, { "epoch": 0.6734156150325891, "grad_norm": 0.2955259382724762, "learning_rate": 0.00025855263157894734, "loss": 0.5253, "step": 607 }, { "epoch": 0.6745250312023298, "grad_norm": 0.22022663056850433, "learning_rate": 0.00025842927631578944, "loss": 0.7104, "step": 608 }, { "epoch": 0.6756344473720705, "grad_norm": 0.21236523985862732, "learning_rate": 0.0002583059210526316, "loss": 0.5324, "step": 609 }, { "epoch": 0.6767438635418112, "grad_norm": 0.1945660263299942, "learning_rate": 0.0002581825657894737, "loss": 0.5858, "step": 610 }, { "epoch": 0.6778532797115518, "grad_norm": 0.31970614194869995, "learning_rate": 0.00025805921052631577, "loss": 0.7485, "step": 611 }, { "epoch": 0.6789626958812924, "grad_norm": 0.3211202621459961, "learning_rate": 0.00025793585526315786, "loss": 0.5554, "step": 612 }, { "epoch": 0.6800721120510331, "grad_norm": 0.17990931868553162, "learning_rate": 0.00025781249999999996, "loss": 0.6221, "step": 613 }, { "epoch": 0.6811815282207738, "grad_norm": 0.3069283366203308, "learning_rate": 0.0002576891447368421, "loss": 0.7163, "step": 614 }, { "epoch": 0.6822909443905145, "grad_norm": 0.19691799581050873, "learning_rate": 0.0002575657894736842, "loss": 0.4445, "step": 615 }, { "epoch": 0.6834003605602552, "grad_norm": 0.18806682527065277, "learning_rate": 0.0002574424342105263, "loss": 0.5781, "step": 616 }, { "epoch": 0.6845097767299958, "grad_norm": 0.24056103825569153, "learning_rate": 0.0002573190789473684, "loss": 0.5351, "step": 617 }, { "epoch": 0.6856191928997365, "grad_norm": 0.2140192836523056, "learning_rate": 0.00025719572368421053, "loss": 0.6222, "step": 618 }, { "epoch": 0.6867286090694772, "grad_norm": 0.227885901927948, "learning_rate": 0.0002570723684210526, "loss": 0.6394, "step": 619 }, { "epoch": 0.6878380252392179, "grad_norm": 0.37848934531211853, "learning_rate": 0.0002569490131578947, "loss": 0.6231, "step": 620 }, { "epoch": 0.6889474414089586, "grad_norm": 0.290159672498703, "learning_rate": 0.0002568256578947368, "loss": 0.4614, "step": 621 }, { "epoch": 0.6900568575786992, "grad_norm": 0.18309064209461212, "learning_rate": 0.0002567023026315789, "loss": 0.7418, "step": 622 }, { "epoch": 0.6911662737484399, "grad_norm": 0.20930887758731842, "learning_rate": 0.00025657894736842105, "loss": 0.6572, "step": 623 }, { "epoch": 0.6922756899181806, "grad_norm": 0.24094976484775543, "learning_rate": 0.00025645559210526315, "loss": 0.5702, "step": 624 }, { "epoch": 0.6933851060879213, "grad_norm": 0.22989119589328766, "learning_rate": 0.00025633223684210524, "loss": 0.4517, "step": 625 }, { "epoch": 0.694494522257662, "grad_norm": 0.2922836244106293, "learning_rate": 0.00025620888157894733, "loss": 0.6436, "step": 626 }, { "epoch": 0.6956039384274025, "grad_norm": 0.2564910650253296, "learning_rate": 0.0002560855263157894, "loss": 0.7321, "step": 627 }, { "epoch": 0.6967133545971432, "grad_norm": 0.26571327447891235, "learning_rate": 0.00025596217105263157, "loss": 0.4964, "step": 628 }, { "epoch": 0.6978227707668839, "grad_norm": 0.5190631151199341, "learning_rate": 0.00025583881578947367, "loss": 0.7778, "step": 629 }, { "epoch": 0.6989321869366246, "grad_norm": 0.17522084712982178, "learning_rate": 0.00025571546052631576, "loss": 0.4768, "step": 630 }, { "epoch": 0.7000416031063653, "grad_norm": 0.2567191421985626, "learning_rate": 0.00025559210526315785, "loss": 0.5936, "step": 631 }, { "epoch": 0.7011510192761059, "grad_norm": 0.46300792694091797, "learning_rate": 0.00025546875, "loss": 0.6074, "step": 632 }, { "epoch": 0.7022604354458466, "grad_norm": 0.1528376042842865, "learning_rate": 0.0002553453947368421, "loss": 0.4779, "step": 633 }, { "epoch": 0.7033698516155873, "grad_norm": 0.3135516941547394, "learning_rate": 0.0002552220394736842, "loss": 0.8674, "step": 634 }, { "epoch": 0.704479267785328, "grad_norm": 0.22676752507686615, "learning_rate": 0.0002550986842105263, "loss": 0.5182, "step": 635 }, { "epoch": 0.7055886839550687, "grad_norm": 0.21783347427845, "learning_rate": 0.00025497532894736837, "loss": 0.4951, "step": 636 }, { "epoch": 0.7066981001248093, "grad_norm": 0.2860846221446991, "learning_rate": 0.0002548519736842105, "loss": 0.4363, "step": 637 }, { "epoch": 0.70780751629455, "grad_norm": 0.281086266040802, "learning_rate": 0.0002547286184210526, "loss": 0.5309, "step": 638 }, { "epoch": 0.7089169324642907, "grad_norm": 0.30188825726509094, "learning_rate": 0.0002546052631578947, "loss": 0.5422, "step": 639 }, { "epoch": 0.7100263486340314, "grad_norm": 0.26086941361427307, "learning_rate": 0.0002544819078947368, "loss": 0.4605, "step": 640 }, { "epoch": 0.711135764803772, "grad_norm": 0.3494928777217865, "learning_rate": 0.00025435855263157895, "loss": 0.4638, "step": 641 }, { "epoch": 0.7122451809735126, "grad_norm": 0.282701313495636, "learning_rate": 0.00025423519736842104, "loss": 0.6283, "step": 642 }, { "epoch": 0.7133545971432533, "grad_norm": 0.16015778481960297, "learning_rate": 0.00025411184210526313, "loss": 0.4514, "step": 643 }, { "epoch": 0.714464013312994, "grad_norm": 0.2207580804824829, "learning_rate": 0.0002539884868421052, "loss": 0.7127, "step": 644 }, { "epoch": 0.7155734294827347, "grad_norm": 0.2917775511741638, "learning_rate": 0.0002538651315789473, "loss": 0.7254, "step": 645 }, { "epoch": 0.7166828456524754, "grad_norm": 0.26504382491111755, "learning_rate": 0.0002537417763157894, "loss": 0.5335, "step": 646 }, { "epoch": 0.717792261822216, "grad_norm": 0.3495447635650635, "learning_rate": 0.00025361842105263156, "loss": 0.8425, "step": 647 }, { "epoch": 0.7189016779919567, "grad_norm": 0.18636609613895416, "learning_rate": 0.00025349506578947365, "loss": 0.5437, "step": 648 }, { "epoch": 0.7200110941616974, "grad_norm": 0.18877021968364716, "learning_rate": 0.0002533717105263158, "loss": 0.5643, "step": 649 }, { "epoch": 0.7211205103314381, "grad_norm": 0.15186652541160583, "learning_rate": 0.0002532483552631579, "loss": 0.532, "step": 650 }, { "epoch": 0.7222299265011788, "grad_norm": 0.14779016375541687, "learning_rate": 0.000253125, "loss": 0.566, "step": 651 }, { "epoch": 0.7233393426709195, "grad_norm": 0.19088061153888702, "learning_rate": 0.0002530016447368421, "loss": 0.5351, "step": 652 }, { "epoch": 0.7244487588406601, "grad_norm": 0.17555399239063263, "learning_rate": 0.00025287828947368417, "loss": 0.4758, "step": 653 }, { "epoch": 0.7255581750104008, "grad_norm": 0.2535383999347687, "learning_rate": 0.00025275493421052627, "loss": 0.6948, "step": 654 }, { "epoch": 0.7266675911801415, "grad_norm": 0.2938152551651001, "learning_rate": 0.00025263157894736836, "loss": 0.626, "step": 655 }, { "epoch": 0.7277770073498822, "grad_norm": 0.2159254252910614, "learning_rate": 0.0002525082236842105, "loss": 0.471, "step": 656 }, { "epoch": 0.7288864235196229, "grad_norm": 0.20253120362758636, "learning_rate": 0.0002523848684210526, "loss": 0.4693, "step": 657 }, { "epoch": 0.7299958396893634, "grad_norm": 0.49963316321372986, "learning_rate": 0.00025226151315789475, "loss": 0.5185, "step": 658 }, { "epoch": 0.7311052558591041, "grad_norm": 0.2259654551744461, "learning_rate": 0.00025213815789473684, "loss": 0.7249, "step": 659 }, { "epoch": 0.7322146720288448, "grad_norm": 0.21344606578350067, "learning_rate": 0.00025201480263157893, "loss": 0.4977, "step": 660 }, { "epoch": 0.7333240881985855, "grad_norm": 0.2689608037471771, "learning_rate": 0.000251891447368421, "loss": 0.3908, "step": 661 }, { "epoch": 0.7344335043683262, "grad_norm": 0.18120594322681427, "learning_rate": 0.0002517680921052631, "loss": 0.4518, "step": 662 }, { "epoch": 0.7355429205380668, "grad_norm": 0.3393332064151764, "learning_rate": 0.0002516447368421052, "loss": 0.6439, "step": 663 }, { "epoch": 0.7366523367078075, "grad_norm": 0.21560847759246826, "learning_rate": 0.00025152138157894736, "loss": 0.6041, "step": 664 }, { "epoch": 0.7377617528775482, "grad_norm": 0.4047819972038269, "learning_rate": 0.00025139802631578945, "loss": 0.7444, "step": 665 }, { "epoch": 0.7388711690472889, "grad_norm": 0.17038540542125702, "learning_rate": 0.00025127467105263155, "loss": 0.4562, "step": 666 }, { "epoch": 0.7399805852170296, "grad_norm": 0.1971050649881363, "learning_rate": 0.0002511513157894737, "loss": 0.4497, "step": 667 }, { "epoch": 0.7410900013867702, "grad_norm": 0.233141228556633, "learning_rate": 0.0002510279605263158, "loss": 0.6446, "step": 668 }, { "epoch": 0.7421994175565109, "grad_norm": 0.22134460508823395, "learning_rate": 0.0002509046052631579, "loss": 0.4393, "step": 669 }, { "epoch": 0.7433088337262516, "grad_norm": 0.18341104686260223, "learning_rate": 0.00025078125, "loss": 0.6756, "step": 670 }, { "epoch": 0.7444182498959923, "grad_norm": 0.2109827697277069, "learning_rate": 0.00025065789473684207, "loss": 0.5686, "step": 671 }, { "epoch": 0.745527666065733, "grad_norm": 0.249485045671463, "learning_rate": 0.00025053453947368416, "loss": 0.5849, "step": 672 }, { "epoch": 0.7466370822354735, "grad_norm": 0.36929988861083984, "learning_rate": 0.0002504111842105263, "loss": 0.5873, "step": 673 }, { "epoch": 0.7477464984052142, "grad_norm": 0.2066950798034668, "learning_rate": 0.0002502878289473684, "loss": 0.4152, "step": 674 }, { "epoch": 0.7488559145749549, "grad_norm": 0.14233893156051636, "learning_rate": 0.0002501644736842105, "loss": 0.4972, "step": 675 }, { "epoch": 0.7499653307446956, "grad_norm": 0.24324600398540497, "learning_rate": 0.00025004111842105264, "loss": 0.4693, "step": 676 }, { "epoch": 0.7510747469144363, "grad_norm": 0.18247400224208832, "learning_rate": 0.00024991776315789473, "loss": 0.6579, "step": 677 }, { "epoch": 0.7521841630841769, "grad_norm": 0.20285794138908386, "learning_rate": 0.00024979440789473683, "loss": 0.3657, "step": 678 }, { "epoch": 0.7532935792539176, "grad_norm": 0.2386598140001297, "learning_rate": 0.0002496710526315789, "loss": 0.5693, "step": 679 }, { "epoch": 0.7544029954236583, "grad_norm": 0.28417715430259705, "learning_rate": 0.000249547697368421, "loss": 0.5416, "step": 680 }, { "epoch": 0.755512411593399, "grad_norm": 0.4812435805797577, "learning_rate": 0.0002494243421052631, "loss": 0.6869, "step": 681 }, { "epoch": 0.7566218277631397, "grad_norm": 0.1943156123161316, "learning_rate": 0.00024930098684210525, "loss": 0.4625, "step": 682 }, { "epoch": 0.7577312439328803, "grad_norm": 0.22697441279888153, "learning_rate": 0.00024917763157894735, "loss": 0.4708, "step": 683 }, { "epoch": 0.758840660102621, "grad_norm": 0.32567355036735535, "learning_rate": 0.00024905427631578944, "loss": 0.4607, "step": 684 }, { "epoch": 0.7599500762723617, "grad_norm": 0.2505040168762207, "learning_rate": 0.0002489309210526316, "loss": 0.5327, "step": 685 }, { "epoch": 0.7610594924421024, "grad_norm": 0.19925042986869812, "learning_rate": 0.0002488075657894737, "loss": 0.7429, "step": 686 }, { "epoch": 0.762168908611843, "grad_norm": 0.19456201791763306, "learning_rate": 0.0002486842105263158, "loss": 0.6922, "step": 687 }, { "epoch": 0.7632783247815836, "grad_norm": 0.2642272412776947, "learning_rate": 0.00024856085526315787, "loss": 0.5104, "step": 688 }, { "epoch": 0.7643877409513243, "grad_norm": 0.22339658439159393, "learning_rate": 0.00024843749999999996, "loss": 0.5151, "step": 689 }, { "epoch": 0.765497157121065, "grad_norm": 0.23145624995231628, "learning_rate": 0.00024831414473684205, "loss": 0.4034, "step": 690 }, { "epoch": 0.7666065732908057, "grad_norm": 0.20969951152801514, "learning_rate": 0.0002481907894736842, "loss": 0.5701, "step": 691 }, { "epoch": 0.7677159894605464, "grad_norm": 0.2449328750371933, "learning_rate": 0.0002480674342105263, "loss": 0.4631, "step": 692 }, { "epoch": 0.768825405630287, "grad_norm": 0.2677520513534546, "learning_rate": 0.0002479440789473684, "loss": 0.5099, "step": 693 }, { "epoch": 0.7699348218000277, "grad_norm": 0.21626238524913788, "learning_rate": 0.00024782072368421053, "loss": 0.5724, "step": 694 }, { "epoch": 0.7710442379697684, "grad_norm": 0.2530820071697235, "learning_rate": 0.00024769736842105263, "loss": 0.4764, "step": 695 }, { "epoch": 0.7721536541395091, "grad_norm": 0.14730204641819, "learning_rate": 0.0002475740131578947, "loss": 0.4657, "step": 696 }, { "epoch": 0.7732630703092498, "grad_norm": 0.21123374998569489, "learning_rate": 0.0002474506578947368, "loss": 0.6438, "step": 697 }, { "epoch": 0.7743724864789905, "grad_norm": 0.2234024703502655, "learning_rate": 0.0002473273026315789, "loss": 0.7459, "step": 698 }, { "epoch": 0.7754819026487311, "grad_norm": 0.25458112359046936, "learning_rate": 0.00024720394736842105, "loss": 0.5767, "step": 699 }, { "epoch": 0.7765913188184718, "grad_norm": 0.19247955083847046, "learning_rate": 0.00024708059210526315, "loss": 0.4738, "step": 700 }, { "epoch": 0.7777007349882125, "grad_norm": 0.3082413673400879, "learning_rate": 0.00024695723684210524, "loss": 0.5998, "step": 701 }, { "epoch": 0.7788101511579532, "grad_norm": 0.21955102682113647, "learning_rate": 0.00024683388157894733, "loss": 0.6763, "step": 702 }, { "epoch": 0.7799195673276939, "grad_norm": 0.21807517111301422, "learning_rate": 0.00024671052631578943, "loss": 0.6072, "step": 703 }, { "epoch": 0.7810289834974344, "grad_norm": 0.2064640372991562, "learning_rate": 0.0002465871710526316, "loss": 0.5148, "step": 704 }, { "epoch": 0.7821383996671751, "grad_norm": 0.19100461900234222, "learning_rate": 0.00024646381578947367, "loss": 0.3789, "step": 705 }, { "epoch": 0.7832478158369158, "grad_norm": 0.4141659140586853, "learning_rate": 0.00024634046052631576, "loss": 0.5838, "step": 706 }, { "epoch": 0.7843572320066565, "grad_norm": 0.260050892829895, "learning_rate": 0.00024621710526315785, "loss": 0.6696, "step": 707 }, { "epoch": 0.7854666481763972, "grad_norm": 0.22340181469917297, "learning_rate": 0.00024609375, "loss": 0.7745, "step": 708 }, { "epoch": 0.7865760643461378, "grad_norm": 0.23725625872612, "learning_rate": 0.0002459703947368421, "loss": 0.5473, "step": 709 }, { "epoch": 0.7876854805158785, "grad_norm": 0.2455572783946991, "learning_rate": 0.0002458470394736842, "loss": 0.5893, "step": 710 }, { "epoch": 0.7887948966856192, "grad_norm": 0.2484428882598877, "learning_rate": 0.0002457236842105263, "loss": 0.6448, "step": 711 }, { "epoch": 0.7899043128553599, "grad_norm": 0.1755114644765854, "learning_rate": 0.0002456003289473684, "loss": 0.5489, "step": 712 }, { "epoch": 0.7910137290251006, "grad_norm": 0.18849492073059082, "learning_rate": 0.0002454769736842105, "loss": 0.5535, "step": 713 }, { "epoch": 0.7921231451948412, "grad_norm": 0.2751491665840149, "learning_rate": 0.0002453536184210526, "loss": 0.6029, "step": 714 }, { "epoch": 0.7932325613645819, "grad_norm": 0.2292255163192749, "learning_rate": 0.0002452302631578947, "loss": 0.452, "step": 715 }, { "epoch": 0.7943419775343226, "grad_norm": 0.2448405623435974, "learning_rate": 0.0002451069078947368, "loss": 0.6581, "step": 716 }, { "epoch": 0.7954513937040633, "grad_norm": 0.22489365935325623, "learning_rate": 0.00024498355263157895, "loss": 0.3962, "step": 717 }, { "epoch": 0.796560809873804, "grad_norm": 0.31602340936660767, "learning_rate": 0.00024486019736842104, "loss": 0.6768, "step": 718 }, { "epoch": 0.7976702260435445, "grad_norm": 0.3364429175853729, "learning_rate": 0.00024473684210526314, "loss": 0.7009, "step": 719 }, { "epoch": 0.7987796422132852, "grad_norm": 0.2554149031639099, "learning_rate": 0.00024461348684210523, "loss": 0.7788, "step": 720 }, { "epoch": 0.7998890583830259, "grad_norm": 0.3130899667739868, "learning_rate": 0.0002444901315789473, "loss": 0.698, "step": 721 }, { "epoch": 0.8009984745527666, "grad_norm": 0.17528441548347473, "learning_rate": 0.0002443667763157894, "loss": 0.5581, "step": 722 }, { "epoch": 0.8021078907225073, "grad_norm": 0.18897710740566254, "learning_rate": 0.00024424342105263156, "loss": 0.4447, "step": 723 }, { "epoch": 0.8032173068922479, "grad_norm": 0.2809373438358307, "learning_rate": 0.00024412006578947368, "loss": 0.4655, "step": 724 }, { "epoch": 0.8043267230619886, "grad_norm": 0.16998441517353058, "learning_rate": 0.00024399671052631578, "loss": 0.568, "step": 725 }, { "epoch": 0.8054361392317293, "grad_norm": 0.21042829751968384, "learning_rate": 0.00024387335526315787, "loss": 0.581, "step": 726 }, { "epoch": 0.80654555540147, "grad_norm": 0.18260133266448975, "learning_rate": 0.00024375, "loss": 0.4479, "step": 727 }, { "epoch": 0.8076549715712107, "grad_norm": 0.323022723197937, "learning_rate": 0.00024362664473684208, "loss": 0.6448, "step": 728 }, { "epoch": 0.8087643877409513, "grad_norm": 0.2532772123813629, "learning_rate": 0.00024350328947368418, "loss": 0.6414, "step": 729 }, { "epoch": 0.809873803910692, "grad_norm": 0.186369851231575, "learning_rate": 0.0002433799342105263, "loss": 0.5935, "step": 730 }, { "epoch": 0.8109832200804327, "grad_norm": 0.22981056571006775, "learning_rate": 0.0002432565789473684, "loss": 0.4538, "step": 731 }, { "epoch": 0.8120926362501734, "grad_norm": 0.2305302619934082, "learning_rate": 0.0002431332236842105, "loss": 0.3573, "step": 732 }, { "epoch": 0.8132020524199141, "grad_norm": 0.19394156336784363, "learning_rate": 0.00024300986842105263, "loss": 0.7218, "step": 733 }, { "epoch": 0.8143114685896546, "grad_norm": 0.26196351647377014, "learning_rate": 0.00024288651315789472, "loss": 0.5631, "step": 734 }, { "epoch": 0.8154208847593953, "grad_norm": 0.21519336104393005, "learning_rate": 0.00024276315789473682, "loss": 0.7168, "step": 735 }, { "epoch": 0.816530300929136, "grad_norm": 0.14451566338539124, "learning_rate": 0.00024263980263157894, "loss": 0.4952, "step": 736 }, { "epoch": 0.8176397170988767, "grad_norm": 0.19086521863937378, "learning_rate": 0.00024251644736842103, "loss": 0.441, "step": 737 }, { "epoch": 0.8187491332686174, "grad_norm": 0.2369484156370163, "learning_rate": 0.00024239309210526312, "loss": 0.6517, "step": 738 }, { "epoch": 0.819858549438358, "grad_norm": 0.22705571353435516, "learning_rate": 0.00024226973684210524, "loss": 0.5155, "step": 739 }, { "epoch": 0.8209679656080987, "grad_norm": 0.16014251112937927, "learning_rate": 0.00024214638157894734, "loss": 0.6391, "step": 740 }, { "epoch": 0.8220773817778394, "grad_norm": 0.24363452196121216, "learning_rate": 0.00024202302631578943, "loss": 0.6092, "step": 741 }, { "epoch": 0.8231867979475801, "grad_norm": 0.16901741921901703, "learning_rate": 0.00024189967105263158, "loss": 0.394, "step": 742 }, { "epoch": 0.8242962141173208, "grad_norm": 0.15532980859279633, "learning_rate": 0.00024177631578947367, "loss": 0.516, "step": 743 }, { "epoch": 0.8254056302870615, "grad_norm": 0.1833130270242691, "learning_rate": 0.00024165296052631576, "loss": 0.4056, "step": 744 }, { "epoch": 0.8265150464568021, "grad_norm": 0.23910647630691528, "learning_rate": 0.00024152960526315788, "loss": 0.5056, "step": 745 }, { "epoch": 0.8276244626265428, "grad_norm": 0.4546511769294739, "learning_rate": 0.00024140624999999998, "loss": 0.701, "step": 746 }, { "epoch": 0.8287338787962835, "grad_norm": 0.20680895447731018, "learning_rate": 0.00024128289473684207, "loss": 0.5841, "step": 747 }, { "epoch": 0.8298432949660242, "grad_norm": 0.29819783568382263, "learning_rate": 0.0002411595394736842, "loss": 0.5503, "step": 748 }, { "epoch": 0.8309527111357649, "grad_norm": 0.18847742676734924, "learning_rate": 0.00024103618421052628, "loss": 0.5588, "step": 749 }, { "epoch": 0.8320621273055054, "grad_norm": 0.2954421937465668, "learning_rate": 0.00024091282894736838, "loss": 0.5611, "step": 750 }, { "epoch": 0.8331715434752461, "grad_norm": 0.2604624927043915, "learning_rate": 0.00024078947368421052, "loss": 0.6853, "step": 751 }, { "epoch": 0.8342809596449868, "grad_norm": 0.31594741344451904, "learning_rate": 0.00024066611842105262, "loss": 0.383, "step": 752 }, { "epoch": 0.8353903758147275, "grad_norm": 0.2093072086572647, "learning_rate": 0.00024054276315789474, "loss": 0.6134, "step": 753 }, { "epoch": 0.8364997919844682, "grad_norm": 0.22096338868141174, "learning_rate": 0.00024041940789473683, "loss": 0.4454, "step": 754 }, { "epoch": 0.8376092081542088, "grad_norm": 0.31209510564804077, "learning_rate": 0.00024029605263157892, "loss": 0.8408, "step": 755 }, { "epoch": 0.8387186243239495, "grad_norm": 0.36502930521965027, "learning_rate": 0.00024017269736842104, "loss": 0.5053, "step": 756 }, { "epoch": 0.8398280404936902, "grad_norm": 0.24059328436851501, "learning_rate": 0.00024004934210526314, "loss": 0.6181, "step": 757 }, { "epoch": 0.8409374566634309, "grad_norm": 0.202326238155365, "learning_rate": 0.00023992598684210523, "loss": 0.4569, "step": 758 }, { "epoch": 0.8420468728331716, "grad_norm": 0.14637093245983124, "learning_rate": 0.00023980263157894732, "loss": 0.438, "step": 759 }, { "epoch": 0.8431562890029122, "grad_norm": 0.15636491775512695, "learning_rate": 0.00023967927631578944, "loss": 0.6549, "step": 760 }, { "epoch": 0.8442657051726529, "grad_norm": 0.25059443712234497, "learning_rate": 0.00023955592105263156, "loss": 0.5497, "step": 761 }, { "epoch": 0.8453751213423936, "grad_norm": 0.1603458672761917, "learning_rate": 0.00023943256578947368, "loss": 0.5023, "step": 762 }, { "epoch": 0.8464845375121343, "grad_norm": 0.202356219291687, "learning_rate": 0.00023930921052631578, "loss": 0.6076, "step": 763 }, { "epoch": 0.847593953681875, "grad_norm": 0.2810531258583069, "learning_rate": 0.00023918585526315787, "loss": 0.4029, "step": 764 }, { "epoch": 0.8487033698516155, "grad_norm": 0.21425937116146088, "learning_rate": 0.0002390625, "loss": 0.4594, "step": 765 }, { "epoch": 0.8498127860213562, "grad_norm": 0.29210686683654785, "learning_rate": 0.00023893914473684208, "loss": 0.4246, "step": 766 }, { "epoch": 0.8509222021910969, "grad_norm": 0.483568400144577, "learning_rate": 0.00023881578947368418, "loss": 0.6333, "step": 767 }, { "epoch": 0.8520316183608376, "grad_norm": 0.22136186063289642, "learning_rate": 0.0002386924342105263, "loss": 0.6211, "step": 768 }, { "epoch": 0.8531410345305783, "grad_norm": 0.29133930802345276, "learning_rate": 0.0002385690789473684, "loss": 0.5317, "step": 769 }, { "epoch": 0.8542504507003189, "grad_norm": 0.20380742847919464, "learning_rate": 0.0002384457236842105, "loss": 0.6699, "step": 770 }, { "epoch": 0.8553598668700596, "grad_norm": 0.16621272265911102, "learning_rate": 0.00023832236842105263, "loss": 0.8654, "step": 771 }, { "epoch": 0.8564692830398003, "grad_norm": 0.19278384745121002, "learning_rate": 0.00023819901315789472, "loss": 0.5077, "step": 772 }, { "epoch": 0.857578699209541, "grad_norm": 0.18274825811386108, "learning_rate": 0.00023807565789473682, "loss": 0.5807, "step": 773 }, { "epoch": 0.8586881153792817, "grad_norm": 0.36240458488464355, "learning_rate": 0.00023795230263157894, "loss": 0.5612, "step": 774 }, { "epoch": 0.8597975315490223, "grad_norm": 0.2410973459482193, "learning_rate": 0.00023782894736842103, "loss": 0.6891, "step": 775 }, { "epoch": 0.860906947718763, "grad_norm": 0.2783324718475342, "learning_rate": 0.00023770559210526312, "loss": 0.4547, "step": 776 }, { "epoch": 0.8620163638885037, "grad_norm": 0.7285773754119873, "learning_rate": 0.00023758223684210524, "loss": 0.4894, "step": 777 }, { "epoch": 0.8631257800582444, "grad_norm": 0.16931070387363434, "learning_rate": 0.00023745888157894734, "loss": 0.5592, "step": 778 }, { "epoch": 0.8642351962279851, "grad_norm": 0.24053402245044708, "learning_rate": 0.00023733552631578943, "loss": 0.4662, "step": 779 }, { "epoch": 0.8653446123977256, "grad_norm": 0.42136144638061523, "learning_rate": 0.00023721217105263158, "loss": 0.617, "step": 780 }, { "epoch": 0.8664540285674663, "grad_norm": 0.16937246918678284, "learning_rate": 0.00023708881578947367, "loss": 0.4531, "step": 781 }, { "epoch": 0.867563444737207, "grad_norm": 0.25334227085113525, "learning_rate": 0.00023696546052631576, "loss": 0.4013, "step": 782 }, { "epoch": 0.8686728609069477, "grad_norm": 0.18785522878170013, "learning_rate": 0.00023684210526315788, "loss": 0.5154, "step": 783 }, { "epoch": 0.8697822770766884, "grad_norm": 0.21429309248924255, "learning_rate": 0.00023671874999999998, "loss": 0.4044, "step": 784 }, { "epoch": 0.8708916932464291, "grad_norm": 0.1965511292219162, "learning_rate": 0.00023659539473684207, "loss": 0.6695, "step": 785 }, { "epoch": 0.8720011094161697, "grad_norm": 0.2309243083000183, "learning_rate": 0.0002364720394736842, "loss": 0.5465, "step": 786 }, { "epoch": 0.8731105255859104, "grad_norm": 0.2777848243713379, "learning_rate": 0.00023634868421052628, "loss": 0.6245, "step": 787 }, { "epoch": 0.8742199417556511, "grad_norm": 0.34535712003707886, "learning_rate": 0.00023622532894736838, "loss": 0.6035, "step": 788 }, { "epoch": 0.8753293579253918, "grad_norm": 0.24912825226783752, "learning_rate": 0.00023610197368421052, "loss": 0.5369, "step": 789 }, { "epoch": 0.8764387740951325, "grad_norm": 0.23429974913597107, "learning_rate": 0.00023597861842105262, "loss": 0.3689, "step": 790 }, { "epoch": 0.8775481902648731, "grad_norm": 0.25908464193344116, "learning_rate": 0.00023585526315789474, "loss": 0.6301, "step": 791 }, { "epoch": 0.8786576064346138, "grad_norm": 0.3178803324699402, "learning_rate": 0.00023573190789473683, "loss": 0.707, "step": 792 }, { "epoch": 0.8797670226043545, "grad_norm": 0.23064696788787842, "learning_rate": 0.00023560855263157892, "loss": 0.5725, "step": 793 }, { "epoch": 0.8808764387740952, "grad_norm": 0.2530830502510071, "learning_rate": 0.00023548519736842102, "loss": 0.5131, "step": 794 }, { "epoch": 0.8819858549438359, "grad_norm": 0.21092426776885986, "learning_rate": 0.00023536184210526314, "loss": 0.6452, "step": 795 }, { "epoch": 0.8830952711135764, "grad_norm": 0.21597221493721008, "learning_rate": 0.00023523848684210523, "loss": 0.5438, "step": 796 }, { "epoch": 0.8842046872833171, "grad_norm": 0.21937009692192078, "learning_rate": 0.00023511513157894732, "loss": 0.4943, "step": 797 }, { "epoch": 0.8853141034530578, "grad_norm": 0.2742394506931305, "learning_rate": 0.00023499177631578944, "loss": 0.7567, "step": 798 }, { "epoch": 0.8864235196227985, "grad_norm": 0.3151918053627014, "learning_rate": 0.00023486842105263156, "loss": 0.7817, "step": 799 }, { "epoch": 0.8875329357925392, "grad_norm": 0.23402948677539825, "learning_rate": 0.00023474506578947368, "loss": 0.3204, "step": 800 }, { "epoch": 0.8886423519622798, "grad_norm": 0.25392022728919983, "learning_rate": 0.00023462171052631578, "loss": 0.6914, "step": 801 }, { "epoch": 0.8897517681320205, "grad_norm": 0.22734297811985016, "learning_rate": 0.00023449835526315787, "loss": 0.5779, "step": 802 }, { "epoch": 0.8908611843017612, "grad_norm": 0.2071351408958435, "learning_rate": 0.000234375, "loss": 0.6024, "step": 803 }, { "epoch": 0.8919706004715019, "grad_norm": 0.2632406949996948, "learning_rate": 0.00023425164473684208, "loss": 0.5668, "step": 804 }, { "epoch": 0.8930800166412426, "grad_norm": 0.27089810371398926, "learning_rate": 0.00023412828947368418, "loss": 0.6043, "step": 805 }, { "epoch": 0.8941894328109832, "grad_norm": 0.1918395459651947, "learning_rate": 0.0002340049342105263, "loss": 0.3305, "step": 806 }, { "epoch": 0.8952988489807239, "grad_norm": 0.183834046125412, "learning_rate": 0.0002338815789473684, "loss": 0.603, "step": 807 }, { "epoch": 0.8964082651504646, "grad_norm": 0.29509904980659485, "learning_rate": 0.0002337582236842105, "loss": 0.6681, "step": 808 }, { "epoch": 0.8975176813202053, "grad_norm": 0.24960756301879883, "learning_rate": 0.00023363486842105263, "loss": 0.5601, "step": 809 }, { "epoch": 0.898627097489946, "grad_norm": 0.5940669775009155, "learning_rate": 0.00023351151315789472, "loss": 0.5074, "step": 810 }, { "epoch": 0.8997365136596865, "grad_norm": 0.21877194941043854, "learning_rate": 0.00023338815789473682, "loss": 0.6044, "step": 811 }, { "epoch": 0.9008459298294272, "grad_norm": 0.24642789363861084, "learning_rate": 0.00023326480263157894, "loss": 0.5249, "step": 812 }, { "epoch": 0.9019553459991679, "grad_norm": 0.21799951791763306, "learning_rate": 0.00023314144736842103, "loss": 0.5438, "step": 813 }, { "epoch": 0.9030647621689086, "grad_norm": 0.2310633808374405, "learning_rate": 0.00023301809210526312, "loss": 0.5451, "step": 814 }, { "epoch": 0.9041741783386493, "grad_norm": 0.1848413199186325, "learning_rate": 0.00023289473684210524, "loss": 0.4699, "step": 815 }, { "epoch": 0.9052835945083899, "grad_norm": 0.282272607088089, "learning_rate": 0.00023277138157894734, "loss": 0.5288, "step": 816 }, { "epoch": 0.9063930106781306, "grad_norm": 0.19741901755332947, "learning_rate": 0.00023264802631578943, "loss": 0.4977, "step": 817 }, { "epoch": 0.9075024268478713, "grad_norm": 0.2287929505109787, "learning_rate": 0.00023252467105263158, "loss": 0.5804, "step": 818 }, { "epoch": 0.908611843017612, "grad_norm": 0.2509765326976776, "learning_rate": 0.00023240131578947367, "loss": 0.4381, "step": 819 }, { "epoch": 0.9097212591873527, "grad_norm": 0.2717498540878296, "learning_rate": 0.00023227796052631576, "loss": 0.6095, "step": 820 }, { "epoch": 0.9108306753570933, "grad_norm": 0.38568127155303955, "learning_rate": 0.00023215460526315789, "loss": 0.512, "step": 821 }, { "epoch": 0.911940091526834, "grad_norm": 0.22532710433006287, "learning_rate": 0.00023203124999999998, "loss": 0.6333, "step": 822 }, { "epoch": 0.9130495076965747, "grad_norm": 0.2616422176361084, "learning_rate": 0.00023190789473684207, "loss": 0.4955, "step": 823 }, { "epoch": 0.9141589238663154, "grad_norm": 0.30875271558761597, "learning_rate": 0.0002317845394736842, "loss": 0.6645, "step": 824 }, { "epoch": 0.9152683400360561, "grad_norm": 0.22438234090805054, "learning_rate": 0.00023166118421052629, "loss": 0.6209, "step": 825 }, { "epoch": 0.9163777562057966, "grad_norm": 0.30306366086006165, "learning_rate": 0.00023153782894736838, "loss": 0.6832, "step": 826 }, { "epoch": 0.9174871723755373, "grad_norm": 0.19736707210540771, "learning_rate": 0.00023141447368421053, "loss": 0.5076, "step": 827 }, { "epoch": 0.918596588545278, "grad_norm": 0.37344890832901, "learning_rate": 0.00023129111842105262, "loss": 0.5337, "step": 828 }, { "epoch": 0.9197060047150187, "grad_norm": 0.22570458054542542, "learning_rate": 0.0002311677631578947, "loss": 0.4477, "step": 829 }, { "epoch": 0.9208154208847594, "grad_norm": 0.2158828228712082, "learning_rate": 0.00023104440789473683, "loss": 0.5738, "step": 830 }, { "epoch": 0.9219248370545001, "grad_norm": 0.17967133224010468, "learning_rate": 0.00023092105263157893, "loss": 0.6245, "step": 831 }, { "epoch": 0.9230342532242407, "grad_norm": 0.23400144279003143, "learning_rate": 0.00023079769736842102, "loss": 0.5328, "step": 832 }, { "epoch": 0.9241436693939814, "grad_norm": 0.20684117078781128, "learning_rate": 0.00023067434210526314, "loss": 0.4556, "step": 833 }, { "epoch": 0.9252530855637221, "grad_norm": 0.17495577037334442, "learning_rate": 0.00023055098684210523, "loss": 0.4022, "step": 834 }, { "epoch": 0.9263625017334628, "grad_norm": 0.3661905527114868, "learning_rate": 0.00023042763157894733, "loss": 0.8714, "step": 835 }, { "epoch": 0.9274719179032035, "grad_norm": 0.20915554463863373, "learning_rate": 0.00023030427631578945, "loss": 0.5164, "step": 836 }, { "epoch": 0.9285813340729441, "grad_norm": 0.2413186877965927, "learning_rate": 0.00023018092105263157, "loss": 0.515, "step": 837 }, { "epoch": 0.9296907502426848, "grad_norm": 0.23484086990356445, "learning_rate": 0.00023005756578947369, "loss": 0.5425, "step": 838 }, { "epoch": 0.9308001664124255, "grad_norm": 0.269280344247818, "learning_rate": 0.00022993421052631578, "loss": 0.5335, "step": 839 }, { "epoch": 0.9319095825821662, "grad_norm": 0.3550933003425598, "learning_rate": 0.00022981085526315787, "loss": 0.4588, "step": 840 }, { "epoch": 0.9330189987519069, "grad_norm": 0.33913251757621765, "learning_rate": 0.0002296875, "loss": 0.6615, "step": 841 }, { "epoch": 0.9341284149216474, "grad_norm": 0.31911739706993103, "learning_rate": 0.00022956414473684209, "loss": 0.6322, "step": 842 }, { "epoch": 0.9352378310913881, "grad_norm": 0.33628326654434204, "learning_rate": 0.00022944078947368418, "loss": 0.5983, "step": 843 }, { "epoch": 0.9363472472611288, "grad_norm": 0.1979479342699051, "learning_rate": 0.00022931743421052627, "loss": 0.4327, "step": 844 }, { "epoch": 0.9374566634308695, "grad_norm": 0.23927690088748932, "learning_rate": 0.0002291940789473684, "loss": 0.5675, "step": 845 }, { "epoch": 0.9385660796006102, "grad_norm": 0.23125141859054565, "learning_rate": 0.0002290707236842105, "loss": 0.5135, "step": 846 }, { "epoch": 0.9396754957703508, "grad_norm": 0.2613430917263031, "learning_rate": 0.00022894736842105263, "loss": 0.4361, "step": 847 }, { "epoch": 0.9407849119400915, "grad_norm": 0.28109273314476013, "learning_rate": 0.00022882401315789473, "loss": 0.5697, "step": 848 }, { "epoch": 0.9418943281098322, "grad_norm": 0.25478866696357727, "learning_rate": 0.00022870065789473682, "loss": 0.5453, "step": 849 }, { "epoch": 0.9430037442795729, "grad_norm": 0.2179301530122757, "learning_rate": 0.00022857730263157894, "loss": 0.5588, "step": 850 }, { "epoch": 0.9441131604493136, "grad_norm": 0.20109961926937103, "learning_rate": 0.00022845394736842103, "loss": 0.5843, "step": 851 }, { "epoch": 0.9452225766190542, "grad_norm": 0.18201051652431488, "learning_rate": 0.00022833059210526313, "loss": 0.5031, "step": 852 }, { "epoch": 0.9463319927887949, "grad_norm": 0.25168418884277344, "learning_rate": 0.00022820723684210525, "loss": 0.7, "step": 853 }, { "epoch": 0.9474414089585356, "grad_norm": 0.2502383291721344, "learning_rate": 0.00022808388157894734, "loss": 0.6002, "step": 854 }, { "epoch": 0.9485508251282763, "grad_norm": 0.2077435404062271, "learning_rate": 0.00022796052631578943, "loss": 0.632, "step": 855 }, { "epoch": 0.949660241298017, "grad_norm": 0.28856661915779114, "learning_rate": 0.00022783717105263158, "loss": 0.5973, "step": 856 }, { "epoch": 0.9507696574677575, "grad_norm": 0.18906134366989136, "learning_rate": 0.00022771381578947367, "loss": 0.5271, "step": 857 }, { "epoch": 0.9518790736374982, "grad_norm": 0.2617158591747284, "learning_rate": 0.00022759046052631577, "loss": 0.5909, "step": 858 }, { "epoch": 0.9529884898072389, "grad_norm": 0.42665448784828186, "learning_rate": 0.0002274671052631579, "loss": 0.5476, "step": 859 }, { "epoch": 0.9540979059769796, "grad_norm": 0.2901977598667145, "learning_rate": 0.00022734374999999998, "loss": 0.6033, "step": 860 }, { "epoch": 0.9552073221467203, "grad_norm": 0.26080241799354553, "learning_rate": 0.00022722039473684207, "loss": 0.6149, "step": 861 }, { "epoch": 0.9563167383164609, "grad_norm": 0.2692500948905945, "learning_rate": 0.0002270970394736842, "loss": 0.6035, "step": 862 }, { "epoch": 0.9574261544862016, "grad_norm": 0.17691655457019806, "learning_rate": 0.00022697368421052629, "loss": 0.5072, "step": 863 }, { "epoch": 0.9585355706559423, "grad_norm": 0.4164085388183594, "learning_rate": 0.00022685032894736838, "loss": 0.5342, "step": 864 }, { "epoch": 0.959644986825683, "grad_norm": 0.18131154775619507, "learning_rate": 0.00022672697368421053, "loss": 0.5121, "step": 865 }, { "epoch": 0.9607544029954237, "grad_norm": 0.22192832827568054, "learning_rate": 0.00022660361842105262, "loss": 0.5307, "step": 866 }, { "epoch": 0.9618638191651643, "grad_norm": 0.350583016872406, "learning_rate": 0.0002264802631578947, "loss": 0.4176, "step": 867 }, { "epoch": 0.962973235334905, "grad_norm": 0.2533394396305084, "learning_rate": 0.00022635690789473683, "loss": 0.5289, "step": 868 }, { "epoch": 0.9640826515046457, "grad_norm": 0.19999092817306519, "learning_rate": 0.00022623355263157893, "loss": 0.5624, "step": 869 }, { "epoch": 0.9651920676743864, "grad_norm": 0.17564158141613007, "learning_rate": 0.00022611019736842102, "loss": 0.4738, "step": 870 }, { "epoch": 0.9663014838441271, "grad_norm": 0.35863691568374634, "learning_rate": 0.00022598684210526314, "loss": 0.7332, "step": 871 }, { "epoch": 0.9674109000138676, "grad_norm": 0.20389032363891602, "learning_rate": 0.00022586348684210523, "loss": 0.4931, "step": 872 }, { "epoch": 0.9685203161836083, "grad_norm": 0.19918256998062134, "learning_rate": 0.00022574013157894733, "loss": 0.5961, "step": 873 }, { "epoch": 0.969629732353349, "grad_norm": 0.25909268856048584, "learning_rate": 0.00022561677631578945, "loss": 0.5575, "step": 874 }, { "epoch": 0.9707391485230897, "grad_norm": 0.18549631536006927, "learning_rate": 0.00022549342105263157, "loss": 0.5181, "step": 875 }, { "epoch": 0.9718485646928304, "grad_norm": 0.2645319998264313, "learning_rate": 0.0002253700657894737, "loss": 0.6302, "step": 876 }, { "epoch": 0.9729579808625711, "grad_norm": 0.36148592829704285, "learning_rate": 0.00022524671052631578, "loss": 0.4178, "step": 877 }, { "epoch": 0.9740673970323117, "grad_norm": 0.2523496150970459, "learning_rate": 0.00022512335526315787, "loss": 0.5333, "step": 878 }, { "epoch": 0.9751768132020524, "grad_norm": 0.20175378024578094, "learning_rate": 0.000225, "loss": 0.4555, "step": 879 }, { "epoch": 0.9762862293717931, "grad_norm": 0.7862651348114014, "learning_rate": 0.0002248766447368421, "loss": 0.492, "step": 880 }, { "epoch": 0.9773956455415338, "grad_norm": 0.27353614568710327, "learning_rate": 0.00022475328947368418, "loss": 0.6768, "step": 881 }, { "epoch": 0.9785050617112745, "grad_norm": 0.27447402477264404, "learning_rate": 0.00022462993421052627, "loss": 0.7058, "step": 882 }, { "epoch": 0.9796144778810151, "grad_norm": 0.1933760941028595, "learning_rate": 0.0002245065789473684, "loss": 0.6976, "step": 883 }, { "epoch": 0.9807238940507558, "grad_norm": 0.18439841270446777, "learning_rate": 0.00022438322368421051, "loss": 0.403, "step": 884 }, { "epoch": 0.9818333102204965, "grad_norm": 0.27743586897850037, "learning_rate": 0.00022425986842105263, "loss": 0.7012, "step": 885 }, { "epoch": 0.9829427263902372, "grad_norm": 0.26009905338287354, "learning_rate": 0.00022413651315789473, "loss": 0.5342, "step": 886 }, { "epoch": 0.9840521425599779, "grad_norm": 0.30932557582855225, "learning_rate": 0.00022401315789473682, "loss": 0.6027, "step": 887 }, { "epoch": 0.9851615587297184, "grad_norm": 0.21047377586364746, "learning_rate": 0.00022388980263157894, "loss": 0.4078, "step": 888 }, { "epoch": 0.9862709748994591, "grad_norm": 0.17896795272827148, "learning_rate": 0.00022376644736842103, "loss": 0.5645, "step": 889 }, { "epoch": 0.9873803910691998, "grad_norm": 0.31928160786628723, "learning_rate": 0.00022364309210526313, "loss": 0.6017, "step": 890 }, { "epoch": 0.9884898072389405, "grad_norm": 0.2661576271057129, "learning_rate": 0.00022351973684210525, "loss": 0.4079, "step": 891 }, { "epoch": 0.9895992234086812, "grad_norm": 0.17466966807842255, "learning_rate": 0.00022339638157894734, "loss": 0.4499, "step": 892 }, { "epoch": 0.9907086395784218, "grad_norm": 0.29928284883499146, "learning_rate": 0.00022327302631578943, "loss": 0.6738, "step": 893 }, { "epoch": 0.9918180557481625, "grad_norm": 0.22055590152740479, "learning_rate": 0.00022314967105263158, "loss": 0.4979, "step": 894 }, { "epoch": 0.9929274719179032, "grad_norm": 0.20703110098838806, "learning_rate": 0.00022302631578947367, "loss": 0.34, "step": 895 }, { "epoch": 0.9940368880876439, "grad_norm": 0.23895259201526642, "learning_rate": 0.00022290296052631577, "loss": 0.6151, "step": 896 }, { "epoch": 0.9951463042573846, "grad_norm": 0.23790138959884644, "learning_rate": 0.0002227796052631579, "loss": 0.5713, "step": 897 }, { "epoch": 0.9962557204271252, "grad_norm": 0.18377721309661865, "learning_rate": 0.00022265624999999998, "loss": 0.5082, "step": 898 }, { "epoch": 0.9973651365968659, "grad_norm": 0.23333214223384857, "learning_rate": 0.00022253289473684207, "loss": 0.4654, "step": 899 }, { "epoch": 0.9984745527666066, "grad_norm": 0.22151075303554535, "learning_rate": 0.0002224095394736842, "loss": 0.5104, "step": 900 }, { "epoch": 0.9995839689363473, "grad_norm": 0.2722238302230835, "learning_rate": 0.0002222861842105263, "loss": 0.5298, "step": 901 }, { "epoch": 1.000693385106088, "grad_norm": 0.3227219879627228, "learning_rate": 0.00022216282894736838, "loss": 0.6207, "step": 902 }, { "epoch": 1.0018028012758287, "grad_norm": 0.21921642124652863, "learning_rate": 0.00022203947368421053, "loss": 0.4741, "step": 903 }, { "epoch": 1.0029122174455694, "grad_norm": 0.18044739961624146, "learning_rate": 0.00022191611842105262, "loss": 0.2975, "step": 904 }, { "epoch": 1.00402163361531, "grad_norm": 0.20503199100494385, "learning_rate": 0.00022179276315789471, "loss": 0.4205, "step": 905 }, { "epoch": 1.0051310497850505, "grad_norm": 0.18091997504234314, "learning_rate": 0.00022166940789473683, "loss": 0.4393, "step": 906 }, { "epoch": 1.0062404659547912, "grad_norm": 0.2026747614145279, "learning_rate": 0.00022154605263157893, "loss": 0.6194, "step": 907 }, { "epoch": 1.007349882124532, "grad_norm": 0.20425230264663696, "learning_rate": 0.00022142269736842102, "loss": 0.7142, "step": 908 }, { "epoch": 1.0084592982942726, "grad_norm": 0.22876769304275513, "learning_rate": 0.00022129934210526314, "loss": 0.5316, "step": 909 }, { "epoch": 1.0095687144640133, "grad_norm": 0.3051469624042511, "learning_rate": 0.00022117598684210523, "loss": 0.526, "step": 910 }, { "epoch": 1.010678130633754, "grad_norm": 0.31023165583610535, "learning_rate": 0.00022105263157894733, "loss": 0.6281, "step": 911 }, { "epoch": 1.0117875468034947, "grad_norm": 0.2776643633842468, "learning_rate": 0.00022092927631578945, "loss": 0.6222, "step": 912 }, { "epoch": 1.0128969629732354, "grad_norm": 0.17718107998371124, "learning_rate": 0.00022080592105263157, "loss": 0.497, "step": 913 }, { "epoch": 1.014006379142976, "grad_norm": 0.18926644325256348, "learning_rate": 0.0002206825657894737, "loss": 0.3907, "step": 914 }, { "epoch": 1.0151157953127168, "grad_norm": 0.21219785511493683, "learning_rate": 0.00022055921052631578, "loss": 0.7063, "step": 915 }, { "epoch": 1.0162252114824573, "grad_norm": 0.30529090762138367, "learning_rate": 0.00022043585526315787, "loss": 0.5273, "step": 916 }, { "epoch": 1.017334627652198, "grad_norm": 0.2160457968711853, "learning_rate": 0.00022031249999999997, "loss": 0.6503, "step": 917 }, { "epoch": 1.0184440438219386, "grad_norm": 0.19193992018699646, "learning_rate": 0.0002201891447368421, "loss": 0.5096, "step": 918 }, { "epoch": 1.0195534599916793, "grad_norm": 0.1873706579208374, "learning_rate": 0.00022006578947368418, "loss": 0.5354, "step": 919 }, { "epoch": 1.02066287616142, "grad_norm": 0.183074951171875, "learning_rate": 0.00021994243421052627, "loss": 0.4163, "step": 920 }, { "epoch": 1.0217722923311607, "grad_norm": 0.15584836900234222, "learning_rate": 0.0002198190789473684, "loss": 0.3087, "step": 921 }, { "epoch": 1.0228817085009014, "grad_norm": 0.1514248251914978, "learning_rate": 0.00021969572368421052, "loss": 0.492, "step": 922 }, { "epoch": 1.0239911246706421, "grad_norm": 0.11994735896587372, "learning_rate": 0.00021957236842105264, "loss": 0.4445, "step": 923 }, { "epoch": 1.0251005408403828, "grad_norm": 0.23181642591953278, "learning_rate": 0.00021944901315789473, "loss": 0.5837, "step": 924 }, { "epoch": 1.0262099570101235, "grad_norm": 0.24097567796707153, "learning_rate": 0.00021932565789473682, "loss": 0.5709, "step": 925 }, { "epoch": 1.027319373179864, "grad_norm": 0.2901909649372101, "learning_rate": 0.00021920230263157894, "loss": 0.6171, "step": 926 }, { "epoch": 1.0284287893496047, "grad_norm": 0.2848079800605774, "learning_rate": 0.00021907894736842104, "loss": 0.5066, "step": 927 }, { "epoch": 1.0295382055193454, "grad_norm": 0.26711946725845337, "learning_rate": 0.00021895559210526313, "loss": 0.4874, "step": 928 }, { "epoch": 1.030647621689086, "grad_norm": 0.20230190455913544, "learning_rate": 0.00021883223684210525, "loss": 0.4349, "step": 929 }, { "epoch": 1.0317570378588268, "grad_norm": 0.2531988322734833, "learning_rate": 0.00021870888157894734, "loss": 0.5349, "step": 930 }, { "epoch": 1.0328664540285675, "grad_norm": 0.19993358850479126, "learning_rate": 0.00021858552631578944, "loss": 0.4689, "step": 931 }, { "epoch": 1.0339758701983082, "grad_norm": 0.36045175790786743, "learning_rate": 0.00021846217105263158, "loss": 0.5803, "step": 932 }, { "epoch": 1.0350852863680489, "grad_norm": 0.2449137419462204, "learning_rate": 0.00021833881578947368, "loss": 0.5587, "step": 933 }, { "epoch": 1.0361947025377896, "grad_norm": 0.21992360055446625, "learning_rate": 0.00021821546052631577, "loss": 0.4949, "step": 934 }, { "epoch": 1.0373041187075303, "grad_norm": 0.22803427278995514, "learning_rate": 0.0002180921052631579, "loss": 0.3884, "step": 935 }, { "epoch": 1.0384135348772707, "grad_norm": 0.31868723034858704, "learning_rate": 0.00021796874999999998, "loss": 0.4716, "step": 936 }, { "epoch": 1.0395229510470114, "grad_norm": 0.2443029284477234, "learning_rate": 0.00021784539473684208, "loss": 0.4942, "step": 937 }, { "epoch": 1.0406323672167521, "grad_norm": 0.16727162897586823, "learning_rate": 0.0002177220394736842, "loss": 0.5104, "step": 938 }, { "epoch": 1.0417417833864928, "grad_norm": 0.2707812488079071, "learning_rate": 0.0002175986842105263, "loss": 0.7545, "step": 939 }, { "epoch": 1.0428511995562335, "grad_norm": 0.20524704456329346, "learning_rate": 0.00021747532894736838, "loss": 0.4782, "step": 940 }, { "epoch": 1.0439606157259742, "grad_norm": 0.2047165483236313, "learning_rate": 0.00021735197368421053, "loss": 0.6799, "step": 941 }, { "epoch": 1.045070031895715, "grad_norm": 0.21171070635318756, "learning_rate": 0.00021722861842105262, "loss": 0.6063, "step": 942 }, { "epoch": 1.0461794480654556, "grad_norm": 0.2965310215950012, "learning_rate": 0.00021710526315789472, "loss": 0.8489, "step": 943 }, { "epoch": 1.0472888642351963, "grad_norm": 0.18332551419734955, "learning_rate": 0.00021698190789473684, "loss": 0.5516, "step": 944 }, { "epoch": 1.048398280404937, "grad_norm": 0.21715614199638367, "learning_rate": 0.00021685855263157893, "loss": 0.5219, "step": 945 }, { "epoch": 1.0495076965746777, "grad_norm": 0.2923019230365753, "learning_rate": 0.00021673519736842102, "loss": 0.7848, "step": 946 }, { "epoch": 1.0506171127444182, "grad_norm": 0.259674072265625, "learning_rate": 0.00021661184210526314, "loss": 0.3492, "step": 947 }, { "epoch": 1.0517265289141589, "grad_norm": 0.158514603972435, "learning_rate": 0.00021648848684210524, "loss": 0.5425, "step": 948 }, { "epoch": 1.0528359450838995, "grad_norm": 0.1703483611345291, "learning_rate": 0.00021636513157894733, "loss": 0.4018, "step": 949 }, { "epoch": 1.0539453612536402, "grad_norm": 0.1840825378894806, "learning_rate": 0.00021624177631578945, "loss": 0.3122, "step": 950 }, { "epoch": 1.055054777423381, "grad_norm": 0.3050348162651062, "learning_rate": 0.00021611842105263157, "loss": 0.6332, "step": 951 }, { "epoch": 1.0561641935931216, "grad_norm": 0.2472129911184311, "learning_rate": 0.00021599506578947366, "loss": 0.4852, "step": 952 }, { "epoch": 1.0572736097628623, "grad_norm": 0.18661095201969147, "learning_rate": 0.00021587171052631578, "loss": 0.3293, "step": 953 }, { "epoch": 1.058383025932603, "grad_norm": 0.3015456795692444, "learning_rate": 0.00021574835526315788, "loss": 0.5981, "step": 954 }, { "epoch": 1.0594924421023437, "grad_norm": 0.24642759561538696, "learning_rate": 0.00021562499999999997, "loss": 0.4401, "step": 955 }, { "epoch": 1.0606018582720844, "grad_norm": 0.23688514530658722, "learning_rate": 0.0002155016447368421, "loss": 0.6831, "step": 956 }, { "epoch": 1.061711274441825, "grad_norm": 0.20036476850509644, "learning_rate": 0.00021537828947368418, "loss": 0.4985, "step": 957 }, { "epoch": 1.0628206906115656, "grad_norm": 0.21736058592796326, "learning_rate": 0.00021525493421052628, "loss": 0.7388, "step": 958 }, { "epoch": 1.0639301067813063, "grad_norm": 0.16539451479911804, "learning_rate": 0.0002151315789473684, "loss": 0.6139, "step": 959 }, { "epoch": 1.065039522951047, "grad_norm": 0.2896292209625244, "learning_rate": 0.00021500822368421052, "loss": 0.6531, "step": 960 }, { "epoch": 1.0661489391207877, "grad_norm": 0.2285917103290558, "learning_rate": 0.00021488486842105264, "loss": 0.4059, "step": 961 }, { "epoch": 1.0672583552905284, "grad_norm": 0.2950262725353241, "learning_rate": 0.00021476151315789473, "loss": 0.6509, "step": 962 }, { "epoch": 1.068367771460269, "grad_norm": 0.30050045251846313, "learning_rate": 0.00021463815789473682, "loss": 0.513, "step": 963 }, { "epoch": 1.0694771876300098, "grad_norm": 0.22528204321861267, "learning_rate": 0.00021451480263157894, "loss": 0.4885, "step": 964 }, { "epoch": 1.0705866037997505, "grad_norm": 0.18981274962425232, "learning_rate": 0.00021439144736842104, "loss": 0.4872, "step": 965 }, { "epoch": 1.0716960199694912, "grad_norm": 0.17064198851585388, "learning_rate": 0.00021426809210526313, "loss": 0.4601, "step": 966 }, { "epoch": 1.0728054361392316, "grad_norm": 0.2478228509426117, "learning_rate": 0.00021414473684210522, "loss": 0.4115, "step": 967 }, { "epoch": 1.0739148523089723, "grad_norm": 0.22317509353160858, "learning_rate": 0.00021402138157894734, "loss": 0.4975, "step": 968 }, { "epoch": 1.075024268478713, "grad_norm": 0.26215845346450806, "learning_rate": 0.00021389802631578944, "loss": 0.5119, "step": 969 }, { "epoch": 1.0761336846484537, "grad_norm": 0.2661817967891693, "learning_rate": 0.00021377467105263158, "loss": 0.4886, "step": 970 }, { "epoch": 1.0772431008181944, "grad_norm": 0.3064764738082886, "learning_rate": 0.00021365131578947368, "loss": 0.4684, "step": 971 }, { "epoch": 1.078352516987935, "grad_norm": 0.1887666881084442, "learning_rate": 0.00021352796052631577, "loss": 0.5324, "step": 972 }, { "epoch": 1.0794619331576758, "grad_norm": 0.446283221244812, "learning_rate": 0.0002134046052631579, "loss": 0.4313, "step": 973 }, { "epoch": 1.0805713493274165, "grad_norm": 0.33540961146354675, "learning_rate": 0.00021328124999999998, "loss": 0.6241, "step": 974 }, { "epoch": 1.0816807654971572, "grad_norm": 0.234646275639534, "learning_rate": 0.00021315789473684208, "loss": 0.3787, "step": 975 }, { "epoch": 1.0827901816668979, "grad_norm": 0.21899907290935516, "learning_rate": 0.0002130345394736842, "loss": 0.5201, "step": 976 }, { "epoch": 1.0838995978366386, "grad_norm": 0.18094147741794586, "learning_rate": 0.0002129111842105263, "loss": 0.44, "step": 977 }, { "epoch": 1.085009014006379, "grad_norm": 0.1795743703842163, "learning_rate": 0.00021278782894736838, "loss": 0.4502, "step": 978 }, { "epoch": 1.0861184301761198, "grad_norm": 0.24622996151447296, "learning_rate": 0.00021266447368421053, "loss": 0.4467, "step": 979 }, { "epoch": 1.0872278463458604, "grad_norm": 0.32457393407821655, "learning_rate": 0.00021254111842105262, "loss": 0.5973, "step": 980 }, { "epoch": 1.0883372625156011, "grad_norm": 0.18922576308250427, "learning_rate": 0.00021241776315789472, "loss": 0.2973, "step": 981 }, { "epoch": 1.0894466786853418, "grad_norm": 0.2579767405986786, "learning_rate": 0.00021229440789473684, "loss": 0.4825, "step": 982 }, { "epoch": 1.0905560948550825, "grad_norm": 0.26494085788726807, "learning_rate": 0.00021217105263157893, "loss": 0.7183, "step": 983 }, { "epoch": 1.0916655110248232, "grad_norm": 0.26830682158470154, "learning_rate": 0.00021204769736842102, "loss": 0.6421, "step": 984 }, { "epoch": 1.092774927194564, "grad_norm": 0.21053458750247955, "learning_rate": 0.00021192434210526314, "loss": 0.3367, "step": 985 }, { "epoch": 1.0938843433643046, "grad_norm": 0.27601686120033264, "learning_rate": 0.00021180098684210524, "loss": 0.5145, "step": 986 }, { "epoch": 1.094993759534045, "grad_norm": 0.28960034251213074, "learning_rate": 0.00021167763157894733, "loss": 0.548, "step": 987 }, { "epoch": 1.0961031757037858, "grad_norm": 0.1937156766653061, "learning_rate": 0.00021155427631578945, "loss": 0.3746, "step": 988 }, { "epoch": 1.0972125918735265, "grad_norm": 0.22715970873832703, "learning_rate": 0.00021143092105263157, "loss": 0.6459, "step": 989 }, { "epoch": 1.0983220080432672, "grad_norm": 0.2094552218914032, "learning_rate": 0.00021130756578947366, "loss": 0.4155, "step": 990 }, { "epoch": 1.0994314242130079, "grad_norm": 0.23469318449497223, "learning_rate": 0.00021118421052631578, "loss": 0.6588, "step": 991 }, { "epoch": 1.1005408403827486, "grad_norm": 0.18322822451591492, "learning_rate": 0.00021106085526315788, "loss": 0.4636, "step": 992 }, { "epoch": 1.1016502565524893, "grad_norm": 0.2863262891769409, "learning_rate": 0.00021093749999999997, "loss": 0.4921, "step": 993 }, { "epoch": 1.10275967272223, "grad_norm": 0.2982548177242279, "learning_rate": 0.0002108141447368421, "loss": 0.6896, "step": 994 }, { "epoch": 1.1038690888919707, "grad_norm": 0.35214439034461975, "learning_rate": 0.00021069078947368418, "loss": 0.4343, "step": 995 }, { "epoch": 1.1049785050617114, "grad_norm": 0.19073866307735443, "learning_rate": 0.00021056743421052628, "loss": 0.4724, "step": 996 }, { "epoch": 1.106087921231452, "grad_norm": 0.24945704638957977, "learning_rate": 0.0002104440789473684, "loss": 0.3815, "step": 997 }, { "epoch": 1.1071973374011925, "grad_norm": 0.27915990352630615, "learning_rate": 0.00021032072368421052, "loss": 0.5862, "step": 998 }, { "epoch": 1.1083067535709332, "grad_norm": 0.3030833899974823, "learning_rate": 0.00021019736842105264, "loss": 0.4567, "step": 999 }, { "epoch": 1.109416169740674, "grad_norm": 0.22396822273731232, "learning_rate": 0.00021007401315789473, "loss": 0.4252, "step": 1000 }, { "epoch": 1.1105255859104146, "grad_norm": 0.285520076751709, "learning_rate": 0.00020995065789473682, "loss": 0.3036, "step": 1001 }, { "epoch": 1.1116350020801553, "grad_norm": 0.2318202704191208, "learning_rate": 0.00020982730263157892, "loss": 0.44, "step": 1002 }, { "epoch": 1.112744418249896, "grad_norm": 0.24701035022735596, "learning_rate": 0.00020970394736842104, "loss": 0.5542, "step": 1003 }, { "epoch": 1.1138538344196367, "grad_norm": 0.25920355319976807, "learning_rate": 0.00020958059210526313, "loss": 0.5066, "step": 1004 }, { "epoch": 1.1149632505893774, "grad_norm": 0.6409327387809753, "learning_rate": 0.00020945723684210522, "loss": 0.6025, "step": 1005 }, { "epoch": 1.116072666759118, "grad_norm": 0.469032883644104, "learning_rate": 0.00020933388157894734, "loss": 0.4099, "step": 1006 }, { "epoch": 1.1171820829288588, "grad_norm": 0.181956484913826, "learning_rate": 0.00020921052631578944, "loss": 0.353, "step": 1007 }, { "epoch": 1.1182914990985993, "grad_norm": 0.2923588752746582, "learning_rate": 0.00020908717105263158, "loss": 0.5331, "step": 1008 }, { "epoch": 1.11940091526834, "grad_norm": 0.29170429706573486, "learning_rate": 0.00020896381578947368, "loss": 0.6123, "step": 1009 }, { "epoch": 1.1205103314380807, "grad_norm": 0.23297952115535736, "learning_rate": 0.00020884046052631577, "loss": 0.3093, "step": 1010 }, { "epoch": 1.1216197476078213, "grad_norm": 0.3802937865257263, "learning_rate": 0.0002087171052631579, "loss": 0.5309, "step": 1011 }, { "epoch": 1.122729163777562, "grad_norm": 0.21260103583335876, "learning_rate": 0.00020859374999999998, "loss": 0.5288, "step": 1012 }, { "epoch": 1.1238385799473027, "grad_norm": 0.22543269395828247, "learning_rate": 0.00020847039473684208, "loss": 0.4474, "step": 1013 }, { "epoch": 1.1249479961170434, "grad_norm": 0.23437976837158203, "learning_rate": 0.0002083470394736842, "loss": 0.622, "step": 1014 }, { "epoch": 1.1260574122867841, "grad_norm": 0.22682055830955505, "learning_rate": 0.0002082236842105263, "loss": 0.4028, "step": 1015 }, { "epoch": 1.1271668284565248, "grad_norm": 0.27149879932403564, "learning_rate": 0.00020810032894736838, "loss": 0.3926, "step": 1016 }, { "epoch": 1.1282762446262655, "grad_norm": 0.21635794639587402, "learning_rate": 0.00020797697368421053, "loss": 0.5204, "step": 1017 }, { "epoch": 1.129385660796006, "grad_norm": 0.29137322306632996, "learning_rate": 0.00020785361842105263, "loss": 0.5625, "step": 1018 }, { "epoch": 1.1304950769657467, "grad_norm": 0.2547966241836548, "learning_rate": 0.00020773026315789472, "loss": 0.7025, "step": 1019 }, { "epoch": 1.1316044931354874, "grad_norm": 0.34155234694480896, "learning_rate": 0.00020760690789473684, "loss": 0.5504, "step": 1020 }, { "epoch": 1.132713909305228, "grad_norm": 0.30655375123023987, "learning_rate": 0.00020748355263157893, "loss": 0.8426, "step": 1021 }, { "epoch": 1.1338233254749688, "grad_norm": 0.24423359334468842, "learning_rate": 0.00020736019736842102, "loss": 0.4466, "step": 1022 }, { "epoch": 1.1349327416447095, "grad_norm": 0.24093151092529297, "learning_rate": 0.00020723684210526315, "loss": 0.6558, "step": 1023 }, { "epoch": 1.1360421578144502, "grad_norm": 0.2106793373823166, "learning_rate": 0.00020711348684210524, "loss": 0.4964, "step": 1024 }, { "epoch": 1.1371515739841909, "grad_norm": 0.27556413412094116, "learning_rate": 0.00020699013157894733, "loss": 0.4517, "step": 1025 }, { "epoch": 1.1382609901539316, "grad_norm": 0.28604790568351746, "learning_rate": 0.00020686677631578945, "loss": 0.606, "step": 1026 }, { "epoch": 1.1393704063236723, "grad_norm": 0.2911072075366974, "learning_rate": 0.00020674342105263157, "loss": 0.4299, "step": 1027 }, { "epoch": 1.140479822493413, "grad_norm": 0.19676262140274048, "learning_rate": 0.00020662006578947367, "loss": 0.3269, "step": 1028 }, { "epoch": 1.1415892386631534, "grad_norm": 0.2523801624774933, "learning_rate": 0.00020649671052631579, "loss": 0.3888, "step": 1029 }, { "epoch": 1.1426986548328941, "grad_norm": 0.2896265387535095, "learning_rate": 0.00020637335526315788, "loss": 0.4757, "step": 1030 }, { "epoch": 1.1438080710026348, "grad_norm": 0.24346299469470978, "learning_rate": 0.00020624999999999997, "loss": 0.5065, "step": 1031 }, { "epoch": 1.1449174871723755, "grad_norm": 0.2244485467672348, "learning_rate": 0.0002061266447368421, "loss": 0.4212, "step": 1032 }, { "epoch": 1.1460269033421162, "grad_norm": 0.2615904211997986, "learning_rate": 0.00020600328947368419, "loss": 0.3755, "step": 1033 }, { "epoch": 1.147136319511857, "grad_norm": 0.2203037440776825, "learning_rate": 0.00020587993421052628, "loss": 0.7468, "step": 1034 }, { "epoch": 1.1482457356815976, "grad_norm": 0.2502787709236145, "learning_rate": 0.0002057565789473684, "loss": 0.4717, "step": 1035 }, { "epoch": 1.1493551518513383, "grad_norm": 0.23755724728107452, "learning_rate": 0.00020563322368421052, "loss": 0.4367, "step": 1036 }, { "epoch": 1.150464568021079, "grad_norm": 0.2541312575340271, "learning_rate": 0.00020550986842105264, "loss": 0.3824, "step": 1037 }, { "epoch": 1.1515739841908195, "grad_norm": 0.2297431081533432, "learning_rate": 0.00020538651315789473, "loss": 0.5744, "step": 1038 }, { "epoch": 1.1526834003605602, "grad_norm": 0.25546327233314514, "learning_rate": 0.00020526315789473683, "loss": 0.5484, "step": 1039 }, { "epoch": 1.1537928165303009, "grad_norm": 0.27455171942710876, "learning_rate": 0.00020513980263157892, "loss": 0.4961, "step": 1040 }, { "epoch": 1.1549022327000416, "grad_norm": 0.221107617020607, "learning_rate": 0.00020501644736842104, "loss": 0.6225, "step": 1041 }, { "epoch": 1.1560116488697822, "grad_norm": 0.3334260582923889, "learning_rate": 0.00020489309210526313, "loss": 0.5827, "step": 1042 }, { "epoch": 1.157121065039523, "grad_norm": 0.2542990446090698, "learning_rate": 0.00020476973684210523, "loss": 0.4444, "step": 1043 }, { "epoch": 1.1582304812092636, "grad_norm": 0.24342231452465057, "learning_rate": 0.00020464638157894735, "loss": 0.4379, "step": 1044 }, { "epoch": 1.1593398973790043, "grad_norm": 0.27247416973114014, "learning_rate": 0.00020452302631578944, "loss": 0.5547, "step": 1045 }, { "epoch": 1.160449313548745, "grad_norm": 0.21573393046855927, "learning_rate": 0.00020439967105263159, "loss": 0.4349, "step": 1046 }, { "epoch": 1.1615587297184857, "grad_norm": 0.2523028552532196, "learning_rate": 0.00020427631578947368, "loss": 0.4237, "step": 1047 }, { "epoch": 1.1626681458882264, "grad_norm": 0.30524906516075134, "learning_rate": 0.00020415296052631577, "loss": 0.3916, "step": 1048 }, { "epoch": 1.163777562057967, "grad_norm": 0.3323182463645935, "learning_rate": 0.0002040296052631579, "loss": 0.6462, "step": 1049 }, { "epoch": 1.1648869782277076, "grad_norm": 0.31420886516571045, "learning_rate": 0.00020390624999999999, "loss": 0.4049, "step": 1050 }, { "epoch": 1.1659963943974483, "grad_norm": 0.32108211517333984, "learning_rate": 0.00020378289473684208, "loss": 0.4562, "step": 1051 }, { "epoch": 1.167105810567189, "grad_norm": 0.1855727732181549, "learning_rate": 0.00020365953947368417, "loss": 0.4673, "step": 1052 }, { "epoch": 1.1682152267369297, "grad_norm": 0.3211022615432739, "learning_rate": 0.0002035361842105263, "loss": 0.3942, "step": 1053 }, { "epoch": 1.1693246429066704, "grad_norm": 0.25351840257644653, "learning_rate": 0.00020341282894736839, "loss": 0.5996, "step": 1054 }, { "epoch": 1.170434059076411, "grad_norm": 0.3350581228733063, "learning_rate": 0.00020328947368421053, "loss": 0.9422, "step": 1055 }, { "epoch": 1.1715434752461518, "grad_norm": 0.28251969814300537, "learning_rate": 0.00020316611842105263, "loss": 0.5418, "step": 1056 }, { "epoch": 1.1726528914158925, "grad_norm": 0.25546523928642273, "learning_rate": 0.00020304276315789472, "loss": 0.5415, "step": 1057 }, { "epoch": 1.173762307585633, "grad_norm": 0.2648818790912628, "learning_rate": 0.00020291940789473684, "loss": 0.5584, "step": 1058 }, { "epoch": 1.1748717237553739, "grad_norm": 0.3260975480079651, "learning_rate": 0.00020279605263157893, "loss": 0.5427, "step": 1059 }, { "epoch": 1.1759811399251143, "grad_norm": 0.2364053875207901, "learning_rate": 0.00020267269736842103, "loss": 0.5468, "step": 1060 }, { "epoch": 1.177090556094855, "grad_norm": 0.3164118230342865, "learning_rate": 0.00020254934210526315, "loss": 0.5242, "step": 1061 }, { "epoch": 1.1781999722645957, "grad_norm": 0.22556494176387787, "learning_rate": 0.00020242598684210524, "loss": 0.6513, "step": 1062 }, { "epoch": 1.1793093884343364, "grad_norm": 0.27951955795288086, "learning_rate": 0.00020230263157894733, "loss": 0.6117, "step": 1063 }, { "epoch": 1.180418804604077, "grad_norm": 0.271635502576828, "learning_rate": 0.00020217927631578945, "loss": 0.3996, "step": 1064 }, { "epoch": 1.1815282207738178, "grad_norm": 0.23386971652507782, "learning_rate": 0.00020205592105263157, "loss": 0.4866, "step": 1065 }, { "epoch": 1.1826376369435585, "grad_norm": 0.2693704664707184, "learning_rate": 0.00020193256578947367, "loss": 0.6313, "step": 1066 }, { "epoch": 1.1837470531132992, "grad_norm": 0.2502618432044983, "learning_rate": 0.0002018092105263158, "loss": 0.5445, "step": 1067 }, { "epoch": 1.18485646928304, "grad_norm": 0.22942642867565155, "learning_rate": 0.00020168585526315788, "loss": 0.4443, "step": 1068 }, { "epoch": 1.1859658854527804, "grad_norm": 0.23630082607269287, "learning_rate": 0.00020156249999999997, "loss": 0.5264, "step": 1069 }, { "epoch": 1.187075301622521, "grad_norm": 0.29252779483795166, "learning_rate": 0.0002014391447368421, "loss": 0.5528, "step": 1070 }, { "epoch": 1.1881847177922618, "grad_norm": 0.2584315538406372, "learning_rate": 0.0002013157894736842, "loss": 0.5017, "step": 1071 }, { "epoch": 1.1892941339620025, "grad_norm": 0.28517627716064453, "learning_rate": 0.00020119243421052628, "loss": 0.4536, "step": 1072 }, { "epoch": 1.1904035501317431, "grad_norm": 0.432355672121048, "learning_rate": 0.0002010690789473684, "loss": 0.592, "step": 1073 }, { "epoch": 1.1915129663014838, "grad_norm": 0.3467410206794739, "learning_rate": 0.00020094572368421052, "loss": 0.7285, "step": 1074 }, { "epoch": 1.1926223824712245, "grad_norm": 0.29943886399269104, "learning_rate": 0.0002008223684210526, "loss": 0.7591, "step": 1075 }, { "epoch": 1.1937317986409652, "grad_norm": 0.17552253603935242, "learning_rate": 0.00020069901315789473, "loss": 0.2609, "step": 1076 }, { "epoch": 1.194841214810706, "grad_norm": 0.20625296235084534, "learning_rate": 0.00020057565789473683, "loss": 0.3158, "step": 1077 }, { "epoch": 1.1959506309804466, "grad_norm": 0.27414992451667786, "learning_rate": 0.00020045230263157892, "loss": 0.3384, "step": 1078 }, { "epoch": 1.1970600471501873, "grad_norm": 0.2584741711616516, "learning_rate": 0.00020032894736842104, "loss": 0.3626, "step": 1079 }, { "epoch": 1.1981694633199278, "grad_norm": 0.2000085860490799, "learning_rate": 0.00020020559210526313, "loss": 0.3534, "step": 1080 }, { "epoch": 1.1992788794896685, "grad_norm": 0.4106660783290863, "learning_rate": 0.00020008223684210523, "loss": 0.4994, "step": 1081 }, { "epoch": 1.2003882956594092, "grad_norm": 0.3149929940700531, "learning_rate": 0.00019995888157894735, "loss": 0.5106, "step": 1082 }, { "epoch": 1.2014977118291499, "grad_norm": 0.27537012100219727, "learning_rate": 0.00019983552631578944, "loss": 0.403, "step": 1083 }, { "epoch": 1.2026071279988906, "grad_norm": 0.27250340580940247, "learning_rate": 0.0001997121710526316, "loss": 0.4476, "step": 1084 }, { "epoch": 1.2037165441686313, "grad_norm": 0.3000098168849945, "learning_rate": 0.00019958881578947368, "loss": 0.5365, "step": 1085 }, { "epoch": 1.204825960338372, "grad_norm": 0.2616446912288666, "learning_rate": 0.00019946546052631577, "loss": 0.4966, "step": 1086 }, { "epoch": 1.2059353765081127, "grad_norm": 0.3749598264694214, "learning_rate": 0.0001993421052631579, "loss": 0.4181, "step": 1087 }, { "epoch": 1.2070447926778534, "grad_norm": 0.22599004209041595, "learning_rate": 0.00019921875, "loss": 0.4952, "step": 1088 }, { "epoch": 1.2081542088475938, "grad_norm": 0.3596106469631195, "learning_rate": 0.00019909539473684208, "loss": 0.4713, "step": 1089 }, { "epoch": 1.2092636250173345, "grad_norm": 0.32484951615333557, "learning_rate": 0.00019897203947368417, "loss": 0.5465, "step": 1090 }, { "epoch": 1.2103730411870752, "grad_norm": 0.3571338951587677, "learning_rate": 0.0001988486842105263, "loss": 0.6788, "step": 1091 }, { "epoch": 1.211482457356816, "grad_norm": 0.34311506152153015, "learning_rate": 0.0001987253289473684, "loss": 0.5347, "step": 1092 }, { "epoch": 1.2125918735265566, "grad_norm": 0.32343587279319763, "learning_rate": 0.00019860197368421053, "loss": 0.3366, "step": 1093 }, { "epoch": 1.2137012896962973, "grad_norm": 0.22507165372371674, "learning_rate": 0.00019847861842105263, "loss": 0.5374, "step": 1094 }, { "epoch": 1.214810705866038, "grad_norm": 0.25680041313171387, "learning_rate": 0.00019835526315789472, "loss": 0.3549, "step": 1095 }, { "epoch": 1.2159201220357787, "grad_norm": 0.2538761496543884, "learning_rate": 0.00019823190789473684, "loss": 0.4335, "step": 1096 }, { "epoch": 1.2170295382055194, "grad_norm": 0.2690007984638214, "learning_rate": 0.00019810855263157893, "loss": 0.7695, "step": 1097 }, { "epoch": 1.21813895437526, "grad_norm": 0.19778668880462646, "learning_rate": 0.00019798519736842103, "loss": 0.3248, "step": 1098 }, { "epoch": 1.2192483705450008, "grad_norm": 0.23934458196163177, "learning_rate": 0.00019786184210526315, "loss": 0.3444, "step": 1099 }, { "epoch": 1.2203577867147413, "grad_norm": 0.2791898846626282, "learning_rate": 0.00019773848684210524, "loss": 0.4964, "step": 1100 }, { "epoch": 1.221467202884482, "grad_norm": 0.24607348442077637, "learning_rate": 0.00019761513157894733, "loss": 0.59, "step": 1101 }, { "epoch": 1.2225766190542227, "grad_norm": 0.27295032143592834, "learning_rate": 0.00019749177631578943, "loss": 0.4905, "step": 1102 }, { "epoch": 1.2236860352239634, "grad_norm": 0.2771230936050415, "learning_rate": 0.00019736842105263157, "loss": 0.66, "step": 1103 }, { "epoch": 1.224795451393704, "grad_norm": 0.30468347668647766, "learning_rate": 0.00019724506578947367, "loss": 0.4982, "step": 1104 }, { "epoch": 1.2259048675634447, "grad_norm": 0.2552284300327301, "learning_rate": 0.0001971217105263158, "loss": 0.3414, "step": 1105 }, { "epoch": 1.2270142837331854, "grad_norm": 0.3029981255531311, "learning_rate": 0.00019699835526315788, "loss": 0.7422, "step": 1106 }, { "epoch": 1.2281236999029261, "grad_norm": 0.27999430894851685, "learning_rate": 0.00019687499999999997, "loss": 0.5393, "step": 1107 }, { "epoch": 1.2292331160726668, "grad_norm": 0.25855761766433716, "learning_rate": 0.0001967516447368421, "loss": 0.4018, "step": 1108 }, { "epoch": 1.2303425322424075, "grad_norm": 0.4557599425315857, "learning_rate": 0.0001966282894736842, "loss": 0.5082, "step": 1109 }, { "epoch": 1.2314519484121482, "grad_norm": 0.2822202742099762, "learning_rate": 0.00019650493421052628, "loss": 0.6512, "step": 1110 }, { "epoch": 1.2325613645818887, "grad_norm": 0.31011104583740234, "learning_rate": 0.0001963815789473684, "loss": 0.3538, "step": 1111 }, { "epoch": 1.2336707807516294, "grad_norm": 0.23474127054214478, "learning_rate": 0.00019625822368421052, "loss": 0.5368, "step": 1112 }, { "epoch": 1.23478019692137, "grad_norm": 0.2853519320487976, "learning_rate": 0.00019613486842105261, "loss": 0.5207, "step": 1113 }, { "epoch": 1.2358896130911108, "grad_norm": 0.32621482014656067, "learning_rate": 0.00019601151315789473, "loss": 0.555, "step": 1114 }, { "epoch": 1.2369990292608515, "grad_norm": 0.2757001519203186, "learning_rate": 0.00019588815789473683, "loss": 0.6936, "step": 1115 }, { "epoch": 1.2381084454305922, "grad_norm": 0.2345856875181198, "learning_rate": 0.00019576480263157892, "loss": 0.6683, "step": 1116 }, { "epoch": 1.2392178616003329, "grad_norm": 0.2641811668872833, "learning_rate": 0.00019564144736842104, "loss": 0.4647, "step": 1117 }, { "epoch": 1.2403272777700736, "grad_norm": 0.2421552538871765, "learning_rate": 0.00019551809210526313, "loss": 0.8658, "step": 1118 }, { "epoch": 1.2414366939398143, "grad_norm": 0.3254922032356262, "learning_rate": 0.00019539473684210523, "loss": 0.4028, "step": 1119 }, { "epoch": 1.2425461101095547, "grad_norm": 0.27477791905403137, "learning_rate": 0.00019527138157894735, "loss": 0.4876, "step": 1120 }, { "epoch": 1.2436555262792954, "grad_norm": 0.37873339653015137, "learning_rate": 0.00019514802631578944, "loss": 0.5121, "step": 1121 }, { "epoch": 1.2447649424490361, "grad_norm": 0.20323941111564636, "learning_rate": 0.0001950246710526316, "loss": 0.4327, "step": 1122 }, { "epoch": 1.2458743586187768, "grad_norm": 0.2840193510055542, "learning_rate": 0.00019490131578947368, "loss": 0.3922, "step": 1123 }, { "epoch": 1.2469837747885175, "grad_norm": 0.24474340677261353, "learning_rate": 0.00019477796052631578, "loss": 0.5334, "step": 1124 }, { "epoch": 1.2480931909582582, "grad_norm": 0.25993865728378296, "learning_rate": 0.00019465460526315787, "loss": 0.3783, "step": 1125 }, { "epoch": 1.249202607127999, "grad_norm": 0.3326314687728882, "learning_rate": 0.00019453125, "loss": 0.4944, "step": 1126 }, { "epoch": 1.2503120232977396, "grad_norm": 0.26182398200035095, "learning_rate": 0.00019440789473684208, "loss": 0.8275, "step": 1127 }, { "epoch": 1.2514214394674803, "grad_norm": 0.26601549983024597, "learning_rate": 0.00019428453947368417, "loss": 0.4694, "step": 1128 }, { "epoch": 1.2525308556372208, "grad_norm": 0.27001309394836426, "learning_rate": 0.0001941611842105263, "loss": 0.4887, "step": 1129 }, { "epoch": 1.2536402718069617, "grad_norm": 0.21264316141605377, "learning_rate": 0.0001940378289473684, "loss": 0.5031, "step": 1130 }, { "epoch": 1.2547496879767022, "grad_norm": 0.1987910121679306, "learning_rate": 0.00019391447368421054, "loss": 0.3877, "step": 1131 }, { "epoch": 1.2558591041464429, "grad_norm": 0.33749696612358093, "learning_rate": 0.00019379111842105263, "loss": 0.4784, "step": 1132 }, { "epoch": 1.2569685203161836, "grad_norm": 0.26460304856300354, "learning_rate": 0.00019366776315789472, "loss": 0.5015, "step": 1133 }, { "epoch": 1.2580779364859243, "grad_norm": 0.2513628602027893, "learning_rate": 0.00019354440789473684, "loss": 0.3158, "step": 1134 }, { "epoch": 1.259187352655665, "grad_norm": 0.23956595361232758, "learning_rate": 0.00019342105263157894, "loss": 0.4248, "step": 1135 }, { "epoch": 1.2602967688254056, "grad_norm": 0.33675435185432434, "learning_rate": 0.00019329769736842103, "loss": 0.5479, "step": 1136 }, { "epoch": 1.2614061849951463, "grad_norm": 0.2596627473831177, "learning_rate": 0.00019317434210526315, "loss": 0.4415, "step": 1137 }, { "epoch": 1.262515601164887, "grad_norm": 0.3592386543750763, "learning_rate": 0.00019305098684210524, "loss": 0.5607, "step": 1138 }, { "epoch": 1.2636250173346277, "grad_norm": 0.3122079074382782, "learning_rate": 0.00019292763157894734, "loss": 0.891, "step": 1139 }, { "epoch": 1.2647344335043682, "grad_norm": 0.31016677618026733, "learning_rate": 0.00019280427631578943, "loss": 0.5598, "step": 1140 }, { "epoch": 1.2658438496741091, "grad_norm": 0.28511670231819153, "learning_rate": 0.00019268092105263158, "loss": 0.5666, "step": 1141 }, { "epoch": 1.2669532658438496, "grad_norm": 0.3194934129714966, "learning_rate": 0.00019255756578947367, "loss": 0.6659, "step": 1142 }, { "epoch": 1.2680626820135903, "grad_norm": 0.2699849307537079, "learning_rate": 0.0001924342105263158, "loss": 0.3064, "step": 1143 }, { "epoch": 1.269172098183331, "grad_norm": 0.28152981400489807, "learning_rate": 0.00019231085526315788, "loss": 0.5019, "step": 1144 }, { "epoch": 1.2702815143530717, "grad_norm": 0.34371238946914673, "learning_rate": 0.00019218749999999998, "loss": 0.4843, "step": 1145 }, { "epoch": 1.2713909305228124, "grad_norm": 0.2585979402065277, "learning_rate": 0.0001920641447368421, "loss": 0.4984, "step": 1146 }, { "epoch": 1.272500346692553, "grad_norm": 0.32947319746017456, "learning_rate": 0.0001919407894736842, "loss": 0.5213, "step": 1147 }, { "epoch": 1.2736097628622938, "grad_norm": 0.22969017922878265, "learning_rate": 0.00019181743421052628, "loss": 0.429, "step": 1148 }, { "epoch": 1.2747191790320345, "grad_norm": 0.22548457980155945, "learning_rate": 0.0001916940789473684, "loss": 0.3659, "step": 1149 }, { "epoch": 1.2758285952017752, "grad_norm": 0.2924930155277252, "learning_rate": 0.00019157072368421052, "loss": 0.4644, "step": 1150 }, { "epoch": 1.2769380113715156, "grad_norm": 0.31746160984039307, "learning_rate": 0.00019144736842105262, "loss": 0.5486, "step": 1151 }, { "epoch": 1.2780474275412563, "grad_norm": 0.3976684808731079, "learning_rate": 0.00019132401315789474, "loss": 0.6632, "step": 1152 }, { "epoch": 1.279156843710997, "grad_norm": 0.31601187586784363, "learning_rate": 0.00019120065789473683, "loss": 0.5185, "step": 1153 }, { "epoch": 1.2802662598807377, "grad_norm": 0.2981434762477875, "learning_rate": 0.00019107730263157892, "loss": 0.4975, "step": 1154 }, { "epoch": 1.2813756760504784, "grad_norm": 0.2698228657245636, "learning_rate": 0.00019095394736842104, "loss": 0.6271, "step": 1155 }, { "epoch": 1.2824850922202191, "grad_norm": 0.3439400792121887, "learning_rate": 0.00019083059210526314, "loss": 0.6612, "step": 1156 }, { "epoch": 1.2835945083899598, "grad_norm": 0.27007582783699036, "learning_rate": 0.00019070723684210523, "loss": 0.5057, "step": 1157 }, { "epoch": 1.2847039245597005, "grad_norm": 0.29291701316833496, "learning_rate": 0.00019058388157894735, "loss": 0.5788, "step": 1158 }, { "epoch": 1.2858133407294412, "grad_norm": 0.41224008798599243, "learning_rate": 0.00019046052631578944, "loss": 0.6633, "step": 1159 }, { "epoch": 1.2869227568991817, "grad_norm": 0.3880039155483246, "learning_rate": 0.00019033717105263156, "loss": 0.5065, "step": 1160 }, { "epoch": 1.2880321730689226, "grad_norm": 0.3320450186729431, "learning_rate": 0.00019021381578947368, "loss": 0.6414, "step": 1161 }, { "epoch": 1.289141589238663, "grad_norm": 0.2895980477333069, "learning_rate": 0.00019009046052631578, "loss": 0.842, "step": 1162 }, { "epoch": 1.2902510054084038, "grad_norm": 0.2721669673919678, "learning_rate": 0.00018996710526315787, "loss": 0.4492, "step": 1163 }, { "epoch": 1.2913604215781445, "grad_norm": 0.28013283014297485, "learning_rate": 0.00018984375, "loss": 0.3661, "step": 1164 }, { "epoch": 1.2924698377478852, "grad_norm": 0.22734491527080536, "learning_rate": 0.00018972039473684208, "loss": 0.5751, "step": 1165 }, { "epoch": 1.2935792539176258, "grad_norm": 0.2372591197490692, "learning_rate": 0.00018959703947368418, "loss": 0.4572, "step": 1166 }, { "epoch": 1.2946886700873665, "grad_norm": 0.3541167080402374, "learning_rate": 0.0001894736842105263, "loss": 0.6481, "step": 1167 }, { "epoch": 1.2957980862571072, "grad_norm": 0.23032468557357788, "learning_rate": 0.0001893503289473684, "loss": 0.3583, "step": 1168 }, { "epoch": 1.296907502426848, "grad_norm": 0.20932866632938385, "learning_rate": 0.00018922697368421054, "loss": 0.3898, "step": 1169 }, { "epoch": 1.2980169185965886, "grad_norm": 0.2942456603050232, "learning_rate": 0.00018910361842105263, "loss": 0.4076, "step": 1170 }, { "epoch": 1.299126334766329, "grad_norm": 0.28456342220306396, "learning_rate": 0.00018898026315789472, "loss": 0.2953, "step": 1171 }, { "epoch": 1.30023575093607, "grad_norm": 0.39009782671928406, "learning_rate": 0.00018885690789473684, "loss": 0.511, "step": 1172 }, { "epoch": 1.3013451671058105, "grad_norm": 0.22601839900016785, "learning_rate": 0.00018873355263157894, "loss": 0.5164, "step": 1173 }, { "epoch": 1.3024545832755512, "grad_norm": 0.23257453739643097, "learning_rate": 0.00018861019736842103, "loss": 0.4081, "step": 1174 }, { "epoch": 1.3035639994452919, "grad_norm": 0.2568961977958679, "learning_rate": 0.00018848684210526312, "loss": 0.4208, "step": 1175 }, { "epoch": 1.3046734156150326, "grad_norm": 0.33718129992485046, "learning_rate": 0.00018836348684210524, "loss": 0.3738, "step": 1176 }, { "epoch": 1.3057828317847733, "grad_norm": 0.24113743007183075, "learning_rate": 0.00018824013157894734, "loss": 0.4668, "step": 1177 }, { "epoch": 1.306892247954514, "grad_norm": 0.2619990110397339, "learning_rate": 0.00018811677631578943, "loss": 0.6427, "step": 1178 }, { "epoch": 1.3080016641242547, "grad_norm": 0.24512606859207153, "learning_rate": 0.00018799342105263158, "loss": 0.4494, "step": 1179 }, { "epoch": 1.3091110802939954, "grad_norm": 0.23559564352035522, "learning_rate": 0.00018787006578947367, "loss": 0.4123, "step": 1180 }, { "epoch": 1.310220496463736, "grad_norm": 0.24036440253257751, "learning_rate": 0.0001877467105263158, "loss": 0.4689, "step": 1181 }, { "epoch": 1.3113299126334765, "grad_norm": 0.29898273944854736, "learning_rate": 0.00018762335526315788, "loss": 0.2823, "step": 1182 }, { "epoch": 1.3124393288032172, "grad_norm": 0.2730746567249298, "learning_rate": 0.00018749999999999998, "loss": 0.7834, "step": 1183 }, { "epoch": 1.313548744972958, "grad_norm": 0.30577996373176575, "learning_rate": 0.0001873766447368421, "loss": 0.6518, "step": 1184 }, { "epoch": 1.3146581611426986, "grad_norm": 0.1852562427520752, "learning_rate": 0.0001872532894736842, "loss": 0.3995, "step": 1185 }, { "epoch": 1.3157675773124393, "grad_norm": 0.3734980821609497, "learning_rate": 0.00018712993421052628, "loss": 0.8202, "step": 1186 }, { "epoch": 1.31687699348218, "grad_norm": 0.1997169852256775, "learning_rate": 0.0001870065789473684, "loss": 0.5287, "step": 1187 }, { "epoch": 1.3179864096519207, "grad_norm": 0.29843178391456604, "learning_rate": 0.00018688322368421052, "loss": 0.4611, "step": 1188 }, { "epoch": 1.3190958258216614, "grad_norm": 0.24156174063682556, "learning_rate": 0.00018675986842105262, "loss": 0.7034, "step": 1189 }, { "epoch": 1.320205241991402, "grad_norm": 0.3248981237411499, "learning_rate": 0.00018663651315789474, "loss": 0.562, "step": 1190 }, { "epoch": 1.3213146581611426, "grad_norm": 0.22303232550621033, "learning_rate": 0.00018651315789473683, "loss": 0.428, "step": 1191 }, { "epoch": 1.3224240743308835, "grad_norm": 0.32691171765327454, "learning_rate": 0.00018638980263157892, "loss": 0.5038, "step": 1192 }, { "epoch": 1.323533490500624, "grad_norm": 0.23463557660579681, "learning_rate": 0.00018626644736842104, "loss": 0.5723, "step": 1193 }, { "epoch": 1.3246429066703647, "grad_norm": 0.23432673513889313, "learning_rate": 0.00018614309210526314, "loss": 0.2767, "step": 1194 }, { "epoch": 1.3257523228401054, "grad_norm": 0.30433669686317444, "learning_rate": 0.00018601973684210523, "loss": 0.5931, "step": 1195 }, { "epoch": 1.326861739009846, "grad_norm": 0.2979690432548523, "learning_rate": 0.00018589638157894735, "loss": 0.6678, "step": 1196 }, { "epoch": 1.3279711551795867, "grad_norm": 0.279066801071167, "learning_rate": 0.00018577302631578944, "loss": 0.6302, "step": 1197 }, { "epoch": 1.3290805713493274, "grad_norm": 0.28714966773986816, "learning_rate": 0.00018564967105263156, "loss": 0.494, "step": 1198 }, { "epoch": 1.3301899875190681, "grad_norm": 0.21045270562171936, "learning_rate": 0.00018552631578947368, "loss": 0.5388, "step": 1199 }, { "epoch": 1.3312994036888088, "grad_norm": 0.27514612674713135, "learning_rate": 0.00018540296052631578, "loss": 0.3849, "step": 1200 }, { "epoch": 1.3324088198585495, "grad_norm": 0.3071988523006439, "learning_rate": 0.00018527960526315787, "loss": 0.4609, "step": 1201 }, { "epoch": 1.33351823602829, "grad_norm": 0.251871794462204, "learning_rate": 0.00018515625, "loss": 0.7988, "step": 1202 }, { "epoch": 1.334627652198031, "grad_norm": 0.273266077041626, "learning_rate": 0.00018503289473684208, "loss": 0.3886, "step": 1203 }, { "epoch": 1.3357370683677714, "grad_norm": 0.28145653009414673, "learning_rate": 0.00018490953947368418, "loss": 0.556, "step": 1204 }, { "epoch": 1.336846484537512, "grad_norm": 0.23886194825172424, "learning_rate": 0.0001847861842105263, "loss": 0.4633, "step": 1205 }, { "epoch": 1.3379559007072528, "grad_norm": 0.2713840901851654, "learning_rate": 0.0001846628289473684, "loss": 0.6669, "step": 1206 }, { "epoch": 1.3390653168769935, "grad_norm": 0.19628193974494934, "learning_rate": 0.00018453947368421054, "loss": 0.5268, "step": 1207 }, { "epoch": 1.3401747330467342, "grad_norm": 0.32730063796043396, "learning_rate": 0.00018441611842105263, "loss": 0.4295, "step": 1208 }, { "epoch": 1.3412841492164749, "grad_norm": 0.244260773062706, "learning_rate": 0.00018429276315789472, "loss": 0.4108, "step": 1209 }, { "epoch": 1.3423935653862156, "grad_norm": 0.30566221475601196, "learning_rate": 0.00018416940789473684, "loss": 0.8457, "step": 1210 }, { "epoch": 1.343502981555956, "grad_norm": 0.2607499659061432, "learning_rate": 0.00018404605263157894, "loss": 0.4235, "step": 1211 }, { "epoch": 1.344612397725697, "grad_norm": 0.359625905752182, "learning_rate": 0.00018392269736842103, "loss": 0.5669, "step": 1212 }, { "epoch": 1.3457218138954374, "grad_norm": 0.2476588487625122, "learning_rate": 0.00018379934210526312, "loss": 0.3539, "step": 1213 }, { "epoch": 1.3468312300651781, "grad_norm": 0.20041054487228394, "learning_rate": 0.00018367598684210524, "loss": 0.3843, "step": 1214 }, { "epoch": 1.3479406462349188, "grad_norm": 0.4561375677585602, "learning_rate": 0.00018355263157894734, "loss": 0.6868, "step": 1215 }, { "epoch": 1.3490500624046595, "grad_norm": 0.24837137758731842, "learning_rate": 0.00018342927631578943, "loss": 0.4685, "step": 1216 }, { "epoch": 1.3501594785744002, "grad_norm": 0.3139680027961731, "learning_rate": 0.00018330592105263158, "loss": 0.5152, "step": 1217 }, { "epoch": 1.351268894744141, "grad_norm": 0.2819393277168274, "learning_rate": 0.00018318256578947367, "loss": 0.4204, "step": 1218 }, { "epoch": 1.3523783109138816, "grad_norm": 0.2980377972126007, "learning_rate": 0.0001830592105263158, "loss": 0.4413, "step": 1219 }, { "epoch": 1.3534877270836223, "grad_norm": 0.29253172874450684, "learning_rate": 0.00018293585526315788, "loss": 0.5169, "step": 1220 }, { "epoch": 1.354597143253363, "grad_norm": 0.3686385154724121, "learning_rate": 0.00018281249999999998, "loss": 0.4958, "step": 1221 }, { "epoch": 1.3557065594231035, "grad_norm": 0.29562124609947205, "learning_rate": 0.0001826891447368421, "loss": 0.5406, "step": 1222 }, { "epoch": 1.3568159755928444, "grad_norm": 0.21728160977363586, "learning_rate": 0.0001825657894736842, "loss": 0.412, "step": 1223 }, { "epoch": 1.3579253917625849, "grad_norm": 0.23864157497882843, "learning_rate": 0.00018244243421052628, "loss": 0.4752, "step": 1224 }, { "epoch": 1.3590348079323256, "grad_norm": 0.24741685390472412, "learning_rate": 0.00018231907894736838, "loss": 0.5245, "step": 1225 }, { "epoch": 1.3601442241020663, "grad_norm": 0.2409430295228958, "learning_rate": 0.00018219572368421053, "loss": 0.4216, "step": 1226 }, { "epoch": 1.361253640271807, "grad_norm": 0.25885629653930664, "learning_rate": 0.00018207236842105262, "loss": 0.4052, "step": 1227 }, { "epoch": 1.3623630564415476, "grad_norm": 0.28579777479171753, "learning_rate": 0.00018194901315789474, "loss": 0.4137, "step": 1228 }, { "epoch": 1.3634724726112883, "grad_norm": 0.23780061304569244, "learning_rate": 0.00018182565789473683, "loss": 0.4971, "step": 1229 }, { "epoch": 1.364581888781029, "grad_norm": 0.32252663373947144, "learning_rate": 0.00018170230263157893, "loss": 0.5721, "step": 1230 }, { "epoch": 1.3656913049507697, "grad_norm": 0.3731588125228882, "learning_rate": 0.00018157894736842105, "loss": 0.4581, "step": 1231 }, { "epoch": 1.3668007211205104, "grad_norm": 0.15041133761405945, "learning_rate": 0.00018145559210526314, "loss": 0.3275, "step": 1232 }, { "epoch": 1.367910137290251, "grad_norm": 0.4989398419857025, "learning_rate": 0.00018133223684210523, "loss": 0.3791, "step": 1233 }, { "epoch": 1.3690195534599916, "grad_norm": 0.4500264525413513, "learning_rate": 0.00018120888157894735, "loss": 0.4119, "step": 1234 }, { "epoch": 1.3701289696297323, "grad_norm": 0.2674502432346344, "learning_rate": 0.00018108552631578945, "loss": 0.6938, "step": 1235 }, { "epoch": 1.371238385799473, "grad_norm": 0.363046258687973, "learning_rate": 0.00018096217105263157, "loss": 0.4881, "step": 1236 }, { "epoch": 1.3723478019692137, "grad_norm": 0.38243699073791504, "learning_rate": 0.00018083881578947369, "loss": 0.6211, "step": 1237 }, { "epoch": 1.3734572181389544, "grad_norm": 0.25102144479751587, "learning_rate": 0.00018071546052631578, "loss": 0.4659, "step": 1238 }, { "epoch": 1.374566634308695, "grad_norm": 0.546259343624115, "learning_rate": 0.00018059210526315787, "loss": 0.4187, "step": 1239 }, { "epoch": 1.3756760504784358, "grad_norm": 0.3232324719429016, "learning_rate": 0.00018046875, "loss": 0.457, "step": 1240 }, { "epoch": 1.3767854666481765, "grad_norm": 0.2647280693054199, "learning_rate": 0.00018034539473684209, "loss": 0.5801, "step": 1241 }, { "epoch": 1.377894882817917, "grad_norm": 0.43696561455726624, "learning_rate": 0.00018022203947368418, "loss": 0.5091, "step": 1242 }, { "epoch": 1.3790042989876579, "grad_norm": 0.25493887066841125, "learning_rate": 0.0001800986842105263, "loss": 0.4163, "step": 1243 }, { "epoch": 1.3801137151573983, "grad_norm": 0.32590964436531067, "learning_rate": 0.0001799753289473684, "loss": 0.4611, "step": 1244 }, { "epoch": 1.381223131327139, "grad_norm": 0.28104108572006226, "learning_rate": 0.00017985197368421054, "loss": 0.3906, "step": 1245 }, { "epoch": 1.3823325474968797, "grad_norm": 0.3454284369945526, "learning_rate": 0.00017972861842105263, "loss": 0.5055, "step": 1246 }, { "epoch": 1.3834419636666204, "grad_norm": 0.3004135489463806, "learning_rate": 0.00017960526315789473, "loss": 0.5906, "step": 1247 }, { "epoch": 1.3845513798363611, "grad_norm": 0.3798172175884247, "learning_rate": 0.00017948190789473682, "loss": 0.3812, "step": 1248 }, { "epoch": 1.3856607960061018, "grad_norm": 0.23392631113529205, "learning_rate": 0.00017935855263157894, "loss": 0.3541, "step": 1249 }, { "epoch": 1.3867702121758425, "grad_norm": 0.28611382842063904, "learning_rate": 0.00017923519736842103, "loss": 0.3994, "step": 1250 }, { "epoch": 1.3878796283455832, "grad_norm": 0.23118913173675537, "learning_rate": 0.00017911184210526313, "loss": 0.5034, "step": 1251 }, { "epoch": 1.388989044515324, "grad_norm": 0.3240826725959778, "learning_rate": 0.00017898848684210525, "loss": 0.7616, "step": 1252 }, { "epoch": 1.3900984606850644, "grad_norm": 0.35286498069763184, "learning_rate": 0.00017886513157894734, "loss": 0.7485, "step": 1253 }, { "epoch": 1.3912078768548053, "grad_norm": 0.23060156404972076, "learning_rate": 0.00017874177631578943, "loss": 0.4549, "step": 1254 }, { "epoch": 1.3923172930245458, "grad_norm": 0.3174906075000763, "learning_rate": 0.00017861842105263158, "loss": 0.4251, "step": 1255 }, { "epoch": 1.3934267091942865, "grad_norm": 0.24730369448661804, "learning_rate": 0.00017849506578947367, "loss": 0.4357, "step": 1256 }, { "epoch": 1.3945361253640272, "grad_norm": 0.2686748206615448, "learning_rate": 0.0001783717105263158, "loss": 0.3928, "step": 1257 }, { "epoch": 1.3956455415337679, "grad_norm": 0.2948950231075287, "learning_rate": 0.00017824835526315789, "loss": 0.6208, "step": 1258 }, { "epoch": 1.3967549577035085, "grad_norm": 0.30887314677238464, "learning_rate": 0.00017812499999999998, "loss": 0.4876, "step": 1259 }, { "epoch": 1.3978643738732492, "grad_norm": 0.3009176552295685, "learning_rate": 0.0001780016447368421, "loss": 0.3689, "step": 1260 }, { "epoch": 1.39897379004299, "grad_norm": 0.24447450041770935, "learning_rate": 0.0001778782894736842, "loss": 0.3824, "step": 1261 }, { "epoch": 1.4000832062127304, "grad_norm": 0.2673259973526001, "learning_rate": 0.00017775493421052629, "loss": 0.5425, "step": 1262 }, { "epoch": 1.4011926223824713, "grad_norm": 0.273612916469574, "learning_rate": 0.00017763157894736838, "loss": 0.3522, "step": 1263 }, { "epoch": 1.4023020385522118, "grad_norm": 0.3372279405593872, "learning_rate": 0.00017750822368421053, "loss": 0.4351, "step": 1264 }, { "epoch": 1.4034114547219525, "grad_norm": 0.23103949427604675, "learning_rate": 0.00017738486842105262, "loss": 0.4426, "step": 1265 }, { "epoch": 1.4045208708916932, "grad_norm": 0.21246463060379028, "learning_rate": 0.00017726151315789474, "loss": 0.5233, "step": 1266 }, { "epoch": 1.405630287061434, "grad_norm": 0.2304743230342865, "learning_rate": 0.00017713815789473683, "loss": 0.3163, "step": 1267 }, { "epoch": 1.4067397032311746, "grad_norm": 0.35149502754211426, "learning_rate": 0.00017701480263157893, "loss": 0.6481, "step": 1268 }, { "epoch": 1.4078491194009153, "grad_norm": 0.2509346604347229, "learning_rate": 0.00017689144736842105, "loss": 0.6448, "step": 1269 }, { "epoch": 1.408958535570656, "grad_norm": 0.25470322370529175, "learning_rate": 0.00017676809210526314, "loss": 0.5486, "step": 1270 }, { "epoch": 1.4100679517403967, "grad_norm": 0.24675153195858002, "learning_rate": 0.00017664473684210523, "loss": 0.4237, "step": 1271 }, { "epoch": 1.4111773679101374, "grad_norm": 0.38935887813568115, "learning_rate": 0.00017652138157894735, "loss": 0.5769, "step": 1272 }, { "epoch": 1.4122867840798778, "grad_norm": 0.22933362424373627, "learning_rate": 0.00017639802631578945, "loss": 0.4627, "step": 1273 }, { "epoch": 1.4133962002496188, "grad_norm": 0.4441911280155182, "learning_rate": 0.00017627467105263157, "loss": 0.5397, "step": 1274 }, { "epoch": 1.4145056164193592, "grad_norm": 0.24430730938911438, "learning_rate": 0.0001761513157894737, "loss": 0.4419, "step": 1275 }, { "epoch": 1.4156150325891, "grad_norm": 0.280831515789032, "learning_rate": 0.00017602796052631578, "loss": 0.3993, "step": 1276 }, { "epoch": 1.4167244487588406, "grad_norm": 0.32151225209236145, "learning_rate": 0.00017590460526315787, "loss": 0.5654, "step": 1277 }, { "epoch": 1.4178338649285813, "grad_norm": 0.3048468828201294, "learning_rate": 0.00017578125, "loss": 0.4373, "step": 1278 }, { "epoch": 1.418943281098322, "grad_norm": 0.44277939200401306, "learning_rate": 0.0001756578947368421, "loss": 0.4243, "step": 1279 }, { "epoch": 1.4200526972680627, "grad_norm": 0.3722161054611206, "learning_rate": 0.00017553453947368418, "loss": 0.5524, "step": 1280 }, { "epoch": 1.4211621134378034, "grad_norm": 0.24039240181446075, "learning_rate": 0.0001754111842105263, "loss": 0.4882, "step": 1281 }, { "epoch": 1.422271529607544, "grad_norm": 0.34307026863098145, "learning_rate": 0.0001752878289473684, "loss": 0.5102, "step": 1282 }, { "epoch": 1.4233809457772848, "grad_norm": 0.2590845227241516, "learning_rate": 0.00017516447368421051, "loss": 0.4354, "step": 1283 }, { "epoch": 1.4244903619470253, "grad_norm": 0.25644242763519287, "learning_rate": 0.00017504111842105263, "loss": 0.4385, "step": 1284 }, { "epoch": 1.425599778116766, "grad_norm": 0.25579833984375, "learning_rate": 0.00017491776315789473, "loss": 0.5522, "step": 1285 }, { "epoch": 1.4267091942865067, "grad_norm": 0.26802870631217957, "learning_rate": 0.00017479440789473682, "loss": 0.369, "step": 1286 }, { "epoch": 1.4278186104562474, "grad_norm": 0.28027182817459106, "learning_rate": 0.00017467105263157894, "loss": 0.485, "step": 1287 }, { "epoch": 1.428928026625988, "grad_norm": 0.5787771344184875, "learning_rate": 0.00017454769736842103, "loss": 0.5496, "step": 1288 }, { "epoch": 1.4300374427957288, "grad_norm": 0.24002310633659363, "learning_rate": 0.00017442434210526313, "loss": 0.4172, "step": 1289 }, { "epoch": 1.4311468589654694, "grad_norm": 0.34913745522499084, "learning_rate": 0.00017430098684210525, "loss": 0.6889, "step": 1290 }, { "epoch": 1.4322562751352101, "grad_norm": 0.2884618639945984, "learning_rate": 0.00017417763157894734, "loss": 0.4579, "step": 1291 }, { "epoch": 1.4333656913049508, "grad_norm": 0.2817135751247406, "learning_rate": 0.00017405427631578943, "loss": 0.4721, "step": 1292 }, { "epoch": 1.4344751074746913, "grad_norm": 0.2580573558807373, "learning_rate": 0.00017393092105263158, "loss": 0.4966, "step": 1293 }, { "epoch": 1.4355845236444322, "grad_norm": 0.3415181338787079, "learning_rate": 0.00017380756578947367, "loss": 0.4475, "step": 1294 }, { "epoch": 1.4366939398141727, "grad_norm": 0.2799202799797058, "learning_rate": 0.0001736842105263158, "loss": 0.4047, "step": 1295 }, { "epoch": 1.4378033559839134, "grad_norm": 0.3765754699707031, "learning_rate": 0.0001735608552631579, "loss": 0.4161, "step": 1296 }, { "epoch": 1.438912772153654, "grad_norm": 0.2828143835067749, "learning_rate": 0.00017343749999999998, "loss": 0.544, "step": 1297 }, { "epoch": 1.4400221883233948, "grad_norm": 0.4222780466079712, "learning_rate": 0.00017331414473684207, "loss": 0.4053, "step": 1298 }, { "epoch": 1.4411316044931355, "grad_norm": 0.2224740982055664, "learning_rate": 0.0001731907894736842, "loss": 0.343, "step": 1299 }, { "epoch": 1.4422410206628762, "grad_norm": 0.301512748003006, "learning_rate": 0.0001730674342105263, "loss": 0.5385, "step": 1300 }, { "epoch": 1.4433504368326169, "grad_norm": 0.24199537932872772, "learning_rate": 0.00017294407894736838, "loss": 0.4618, "step": 1301 }, { "epoch": 1.4444598530023576, "grad_norm": 0.2662793695926666, "learning_rate": 0.00017282072368421053, "loss": 0.3893, "step": 1302 }, { "epoch": 1.4455692691720983, "grad_norm": 0.6027406454086304, "learning_rate": 0.00017269736842105262, "loss": 0.7918, "step": 1303 }, { "epoch": 1.4466786853418387, "grad_norm": 0.33173203468322754, "learning_rate": 0.00017257401315789474, "loss": 0.3696, "step": 1304 }, { "epoch": 1.4477881015115797, "grad_norm": 0.42119306325912476, "learning_rate": 0.00017245065789473683, "loss": 0.3504, "step": 1305 }, { "epoch": 1.4488975176813201, "grad_norm": 0.33368048071861267, "learning_rate": 0.00017232730263157893, "loss": 0.543, "step": 1306 }, { "epoch": 1.4500069338510608, "grad_norm": 0.3724362850189209, "learning_rate": 0.00017220394736842105, "loss": 0.483, "step": 1307 }, { "epoch": 1.4511163500208015, "grad_norm": 0.28803780674934387, "learning_rate": 0.00017208059210526314, "loss": 0.6044, "step": 1308 }, { "epoch": 1.4522257661905422, "grad_norm": 0.29041630029678345, "learning_rate": 0.00017195723684210523, "loss": 0.4351, "step": 1309 }, { "epoch": 1.453335182360283, "grad_norm": 0.27980196475982666, "learning_rate": 0.00017183388157894735, "loss": 0.2806, "step": 1310 }, { "epoch": 1.4544445985300236, "grad_norm": 0.5144875049591064, "learning_rate": 0.00017171052631578945, "loss": 0.6004, "step": 1311 }, { "epoch": 1.4555540146997643, "grad_norm": 0.41391658782958984, "learning_rate": 0.00017158717105263157, "loss": 0.3426, "step": 1312 }, { "epoch": 1.456663430869505, "grad_norm": 0.38778960704803467, "learning_rate": 0.0001714638157894737, "loss": 0.5716, "step": 1313 }, { "epoch": 1.4577728470392457, "grad_norm": 0.29660889506340027, "learning_rate": 0.00017134046052631578, "loss": 0.6084, "step": 1314 }, { "epoch": 1.4588822632089862, "grad_norm": 0.24347136914730072, "learning_rate": 0.00017121710526315787, "loss": 0.6538, "step": 1315 }, { "epoch": 1.4599916793787269, "grad_norm": 0.2733883857727051, "learning_rate": 0.00017109375, "loss": 0.5797, "step": 1316 }, { "epoch": 1.4611010955484676, "grad_norm": 0.33641162514686584, "learning_rate": 0.0001709703947368421, "loss": 0.4837, "step": 1317 }, { "epoch": 1.4622105117182083, "grad_norm": 0.24747195839881897, "learning_rate": 0.00017084703947368418, "loss": 0.4617, "step": 1318 }, { "epoch": 1.463319927887949, "grad_norm": 0.2453019767999649, "learning_rate": 0.0001707236842105263, "loss": 0.5242, "step": 1319 }, { "epoch": 1.4644293440576897, "grad_norm": 0.2680438160896301, "learning_rate": 0.0001706003289473684, "loss": 0.3985, "step": 1320 }, { "epoch": 1.4655387602274303, "grad_norm": 0.22616985440254211, "learning_rate": 0.00017047697368421051, "loss": 0.5428, "step": 1321 }, { "epoch": 1.466648176397171, "grad_norm": 0.25485649704933167, "learning_rate": 0.00017035361842105264, "loss": 0.4411, "step": 1322 }, { "epoch": 1.4677575925669117, "grad_norm": 0.19676417112350464, "learning_rate": 0.00017023026315789473, "loss": 0.4987, "step": 1323 }, { "epoch": 1.4688670087366522, "grad_norm": 0.23980014026165009, "learning_rate": 0.00017010690789473682, "loss": 0.2853, "step": 1324 }, { "epoch": 1.4699764249063931, "grad_norm": 0.31723615527153015, "learning_rate": 0.00016998355263157894, "loss": 0.3271, "step": 1325 }, { "epoch": 1.4710858410761336, "grad_norm": 0.2783150374889374, "learning_rate": 0.00016986019736842103, "loss": 0.4648, "step": 1326 }, { "epoch": 1.4721952572458743, "grad_norm": 0.3233512341976166, "learning_rate": 0.00016973684210526313, "loss": 0.4557, "step": 1327 }, { "epoch": 1.473304673415615, "grad_norm": 0.2481250911951065, "learning_rate": 0.00016961348684210525, "loss": 0.5224, "step": 1328 }, { "epoch": 1.4744140895853557, "grad_norm": 0.317909300327301, "learning_rate": 0.00016949013157894734, "loss": 0.642, "step": 1329 }, { "epoch": 1.4755235057550964, "grad_norm": 0.25988101959228516, "learning_rate": 0.00016936677631578943, "loss": 0.3956, "step": 1330 }, { "epoch": 1.476632921924837, "grad_norm": 0.2583842873573303, "learning_rate": 0.00016924342105263158, "loss": 0.4031, "step": 1331 }, { "epoch": 1.4777423380945778, "grad_norm": 0.2636638283729553, "learning_rate": 0.00016912006578947368, "loss": 0.4351, "step": 1332 }, { "epoch": 1.4788517542643185, "grad_norm": 0.2658294439315796, "learning_rate": 0.00016899671052631577, "loss": 0.4895, "step": 1333 }, { "epoch": 1.4799611704340592, "grad_norm": 0.192026287317276, "learning_rate": 0.0001688733552631579, "loss": 0.5236, "step": 1334 }, { "epoch": 1.4810705866037996, "grad_norm": 0.2988239824771881, "learning_rate": 0.00016874999999999998, "loss": 0.4434, "step": 1335 }, { "epoch": 1.4821800027735406, "grad_norm": 0.32316145300865173, "learning_rate": 0.00016862664473684207, "loss": 0.5385, "step": 1336 }, { "epoch": 1.483289418943281, "grad_norm": 0.3037776052951813, "learning_rate": 0.0001685032894736842, "loss": 0.3864, "step": 1337 }, { "epoch": 1.4843988351130217, "grad_norm": 0.3819033205509186, "learning_rate": 0.0001683799342105263, "loss": 0.4825, "step": 1338 }, { "epoch": 1.4855082512827624, "grad_norm": 0.346760630607605, "learning_rate": 0.00016825657894736838, "loss": 0.5131, "step": 1339 }, { "epoch": 1.4866176674525031, "grad_norm": 0.27673614025115967, "learning_rate": 0.00016813322368421053, "loss": 0.4316, "step": 1340 }, { "epoch": 1.4877270836222438, "grad_norm": 0.21999450027942657, "learning_rate": 0.00016800986842105262, "loss": 0.4106, "step": 1341 }, { "epoch": 1.4888364997919845, "grad_norm": 0.30723118782043457, "learning_rate": 0.00016788651315789474, "loss": 0.5362, "step": 1342 }, { "epoch": 1.4899459159617252, "grad_norm": 0.4165399372577667, "learning_rate": 0.00016776315789473684, "loss": 0.6185, "step": 1343 }, { "epoch": 1.4910553321314657, "grad_norm": 0.2928377091884613, "learning_rate": 0.00016763980263157893, "loss": 0.4006, "step": 1344 }, { "epoch": 1.4921647483012066, "grad_norm": 0.405435711145401, "learning_rate": 0.00016751644736842105, "loss": 0.4493, "step": 1345 }, { "epoch": 1.493274164470947, "grad_norm": 0.2776191830635071, "learning_rate": 0.00016739309210526314, "loss": 0.4531, "step": 1346 }, { "epoch": 1.4943835806406878, "grad_norm": 0.19967693090438843, "learning_rate": 0.00016726973684210524, "loss": 0.5397, "step": 1347 }, { "epoch": 1.4954929968104285, "grad_norm": 0.22307896614074707, "learning_rate": 0.00016714638157894733, "loss": 0.6485, "step": 1348 }, { "epoch": 1.4966024129801692, "grad_norm": 0.34726205468177795, "learning_rate": 0.00016702302631578945, "loss": 0.7418, "step": 1349 }, { "epoch": 1.4977118291499099, "grad_norm": 0.2557240128517151, "learning_rate": 0.00016689967105263157, "loss": 0.2897, "step": 1350 }, { "epoch": 1.4988212453196506, "grad_norm": 0.2983255386352539, "learning_rate": 0.0001667763157894737, "loss": 0.4885, "step": 1351 }, { "epoch": 1.4999306614893912, "grad_norm": 0.2455969750881195, "learning_rate": 0.00016665296052631578, "loss": 0.5027, "step": 1352 }, { "epoch": 1.5010400776591317, "grad_norm": 0.2705221474170685, "learning_rate": 0.00016652960526315788, "loss": 0.5434, "step": 1353 }, { "epoch": 1.5021494938288726, "grad_norm": 0.27649638056755066, "learning_rate": 0.00016640625, "loss": 0.5844, "step": 1354 }, { "epoch": 1.5032589099986131, "grad_norm": 0.28695183992385864, "learning_rate": 0.0001662828947368421, "loss": 0.4698, "step": 1355 }, { "epoch": 1.504368326168354, "grad_norm": 0.3684578537940979, "learning_rate": 0.00016615953947368418, "loss": 0.5555, "step": 1356 }, { "epoch": 1.5054777423380945, "grad_norm": 0.1808445006608963, "learning_rate": 0.0001660361842105263, "loss": 0.4829, "step": 1357 }, { "epoch": 1.5065871585078352, "grad_norm": 0.37919580936431885, "learning_rate": 0.0001659128289473684, "loss": 0.4, "step": 1358 }, { "epoch": 1.507696574677576, "grad_norm": 0.3226320743560791, "learning_rate": 0.00016578947368421052, "loss": 0.6081, "step": 1359 }, { "epoch": 1.5088059908473166, "grad_norm": 0.378694623708725, "learning_rate": 0.00016566611842105264, "loss": 0.5211, "step": 1360 }, { "epoch": 1.5099154070170573, "grad_norm": 0.418722003698349, "learning_rate": 0.00016554276315789473, "loss": 0.67, "step": 1361 }, { "epoch": 1.511024823186798, "grad_norm": 0.3537873327732086, "learning_rate": 0.00016541940789473682, "loss": 0.6805, "step": 1362 }, { "epoch": 1.5121342393565387, "grad_norm": 0.2202320247888565, "learning_rate": 0.00016529605263157894, "loss": 0.3873, "step": 1363 }, { "epoch": 1.5132436555262792, "grad_norm": 0.36664825677871704, "learning_rate": 0.00016517269736842104, "loss": 0.4605, "step": 1364 }, { "epoch": 1.51435307169602, "grad_norm": 0.25324442982673645, "learning_rate": 0.00016504934210526313, "loss": 0.6109, "step": 1365 }, { "epoch": 1.5154624878657605, "grad_norm": 0.19155505299568176, "learning_rate": 0.00016492598684210525, "loss": 0.7548, "step": 1366 }, { "epoch": 1.5165719040355015, "grad_norm": 0.3349843919277191, "learning_rate": 0.00016480263157894734, "loss": 0.3317, "step": 1367 }, { "epoch": 1.517681320205242, "grad_norm": 0.26652729511260986, "learning_rate": 0.00016467927631578944, "loss": 0.6128, "step": 1368 }, { "epoch": 1.5187907363749826, "grad_norm": 0.30250903964042664, "learning_rate": 0.00016455592105263158, "loss": 0.6961, "step": 1369 }, { "epoch": 1.5199001525447233, "grad_norm": 0.4225883185863495, "learning_rate": 0.00016443256578947368, "loss": 0.5264, "step": 1370 }, { "epoch": 1.521009568714464, "grad_norm": 0.331551730632782, "learning_rate": 0.00016430921052631577, "loss": 0.4245, "step": 1371 }, { "epoch": 1.5221189848842047, "grad_norm": 0.2630516588687897, "learning_rate": 0.0001641858552631579, "loss": 0.4204, "step": 1372 }, { "epoch": 1.5232284010539454, "grad_norm": 0.2230089157819748, "learning_rate": 0.00016406249999999998, "loss": 0.4686, "step": 1373 }, { "epoch": 1.524337817223686, "grad_norm": 0.33370235562324524, "learning_rate": 0.00016393914473684208, "loss": 0.69, "step": 1374 }, { "epoch": 1.5254472333934266, "grad_norm": 0.3383263349533081, "learning_rate": 0.0001638157894736842, "loss": 0.5046, "step": 1375 }, { "epoch": 1.5265566495631675, "grad_norm": 0.3292944133281708, "learning_rate": 0.0001636924342105263, "loss": 0.5178, "step": 1376 }, { "epoch": 1.527666065732908, "grad_norm": 0.2474631369113922, "learning_rate": 0.00016356907894736838, "loss": 0.3783, "step": 1377 }, { "epoch": 1.528775481902649, "grad_norm": 0.306476354598999, "learning_rate": 0.00016344572368421053, "loss": 0.3773, "step": 1378 }, { "epoch": 1.5298848980723894, "grad_norm": 0.2079583704471588, "learning_rate": 0.00016332236842105262, "loss": 0.4554, "step": 1379 }, { "epoch": 1.53099431424213, "grad_norm": 0.4730626940727234, "learning_rate": 0.00016319901315789474, "loss": 0.3571, "step": 1380 }, { "epoch": 1.5321037304118708, "grad_norm": 0.22660337388515472, "learning_rate": 0.00016307565789473684, "loss": 0.4086, "step": 1381 }, { "epoch": 1.5332131465816115, "grad_norm": 0.444742888212204, "learning_rate": 0.00016295230263157893, "loss": 0.4406, "step": 1382 }, { "epoch": 1.5343225627513521, "grad_norm": 0.34402474761009216, "learning_rate": 0.00016282894736842102, "loss": 0.6871, "step": 1383 }, { "epoch": 1.5354319789210926, "grad_norm": 0.31494930386543274, "learning_rate": 0.00016270559210526314, "loss": 0.5023, "step": 1384 }, { "epoch": 1.5365413950908335, "grad_norm": 0.24129949510097504, "learning_rate": 0.00016258223684210524, "loss": 0.3006, "step": 1385 }, { "epoch": 1.537650811260574, "grad_norm": 0.37455177307128906, "learning_rate": 0.00016245888157894733, "loss": 0.3357, "step": 1386 }, { "epoch": 1.538760227430315, "grad_norm": 0.22518782317638397, "learning_rate": 0.00016233552631578945, "loss": 0.4212, "step": 1387 }, { "epoch": 1.5398696436000554, "grad_norm": 0.3566707372665405, "learning_rate": 0.00016221217105263157, "loss": 0.432, "step": 1388 }, { "epoch": 1.540979059769796, "grad_norm": 0.3053068518638611, "learning_rate": 0.0001620888157894737, "loss": 0.3547, "step": 1389 }, { "epoch": 1.5420884759395368, "grad_norm": 0.26762375235557556, "learning_rate": 0.00016196546052631578, "loss": 0.4258, "step": 1390 }, { "epoch": 1.5431978921092775, "grad_norm": 0.44275879859924316, "learning_rate": 0.00016184210526315788, "loss": 0.5698, "step": 1391 }, { "epoch": 1.5443073082790182, "grad_norm": 0.3341034948825836, "learning_rate": 0.00016171875, "loss": 0.6197, "step": 1392 }, { "epoch": 1.5454167244487589, "grad_norm": 0.21536946296691895, "learning_rate": 0.0001615953947368421, "loss": 0.586, "step": 1393 }, { "epoch": 1.5465261406184996, "grad_norm": 0.518312394618988, "learning_rate": 0.00016147203947368418, "loss": 0.663, "step": 1394 }, { "epoch": 1.54763555678824, "grad_norm": 0.31936115026474, "learning_rate": 0.0001613486842105263, "loss": 0.5038, "step": 1395 }, { "epoch": 1.548744972957981, "grad_norm": 0.4910357892513275, "learning_rate": 0.0001612253289473684, "loss": 0.5321, "step": 1396 }, { "epoch": 1.5498543891277214, "grad_norm": 0.35702967643737793, "learning_rate": 0.00016110197368421052, "loss": 0.5254, "step": 1397 }, { "epoch": 1.5509638052974624, "grad_norm": 0.46834543347358704, "learning_rate": 0.00016097861842105264, "loss": 0.5789, "step": 1398 }, { "epoch": 1.5520732214672028, "grad_norm": 0.48472335934638977, "learning_rate": 0.00016085526315789473, "loss": 0.4901, "step": 1399 }, { "epoch": 1.5531826376369435, "grad_norm": 0.28571802377700806, "learning_rate": 0.00016073190789473682, "loss": 0.458, "step": 1400 }, { "epoch": 1.5542920538066842, "grad_norm": 0.30422210693359375, "learning_rate": 0.00016060855263157894, "loss": 0.4369, "step": 1401 }, { "epoch": 1.555401469976425, "grad_norm": 0.34759703278541565, "learning_rate": 0.00016048519736842104, "loss": 0.5257, "step": 1402 }, { "epoch": 1.5565108861461656, "grad_norm": 0.305867999792099, "learning_rate": 0.00016036184210526313, "loss": 0.5571, "step": 1403 }, { "epoch": 1.557620302315906, "grad_norm": 0.2919771075248718, "learning_rate": 0.00016023848684210525, "loss": 0.6567, "step": 1404 }, { "epoch": 1.558729718485647, "grad_norm": 0.24828073382377625, "learning_rate": 0.00016011513157894734, "loss": 0.409, "step": 1405 }, { "epoch": 1.5598391346553875, "grad_norm": 0.48059597611427307, "learning_rate": 0.00015999177631578944, "loss": 0.5028, "step": 1406 }, { "epoch": 1.5609485508251284, "grad_norm": 0.28389695286750793, "learning_rate": 0.00015986842105263158, "loss": 0.369, "step": 1407 }, { "epoch": 1.5620579669948689, "grad_norm": 0.505401074886322, "learning_rate": 0.00015974506578947368, "loss": 0.6917, "step": 1408 }, { "epoch": 1.5631673831646096, "grad_norm": 0.24662017822265625, "learning_rate": 0.00015962171052631577, "loss": 0.3777, "step": 1409 }, { "epoch": 1.5642767993343503, "grad_norm": 0.38750240206718445, "learning_rate": 0.0001594983552631579, "loss": 0.6647, "step": 1410 }, { "epoch": 1.565386215504091, "grad_norm": 0.41798150539398193, "learning_rate": 0.00015937499999999998, "loss": 0.5661, "step": 1411 }, { "epoch": 1.5664956316738317, "grad_norm": 0.24084101617336273, "learning_rate": 0.00015925164473684208, "loss": 0.5066, "step": 1412 }, { "epoch": 1.5676050478435724, "grad_norm": 0.3744387924671173, "learning_rate": 0.0001591282894736842, "loss": 0.4722, "step": 1413 }, { "epoch": 1.568714464013313, "grad_norm": 0.2724044919013977, "learning_rate": 0.0001590049342105263, "loss": 0.5259, "step": 1414 }, { "epoch": 1.5698238801830535, "grad_norm": 0.2745903432369232, "learning_rate": 0.00015888157894736838, "loss": 0.4011, "step": 1415 }, { "epoch": 1.5709332963527944, "grad_norm": 0.20273329317569733, "learning_rate": 0.00015875822368421053, "loss": 0.5408, "step": 1416 }, { "epoch": 1.572042712522535, "grad_norm": 0.38695916533470154, "learning_rate": 0.00015863486842105262, "loss": 0.6258, "step": 1417 }, { "epoch": 1.5731521286922758, "grad_norm": 0.26426389813423157, "learning_rate": 0.00015851151315789474, "loss": 0.4185, "step": 1418 }, { "epoch": 1.5742615448620163, "grad_norm": 0.2685663104057312, "learning_rate": 0.00015838815789473684, "loss": 1.0182, "step": 1419 }, { "epoch": 1.575370961031757, "grad_norm": 0.3128146231174469, "learning_rate": 0.00015826480263157893, "loss": 0.4687, "step": 1420 }, { "epoch": 1.5764803772014977, "grad_norm": 0.24564802646636963, "learning_rate": 0.00015814144736842102, "loss": 0.4918, "step": 1421 }, { "epoch": 1.5775897933712384, "grad_norm": 0.3619256913661957, "learning_rate": 0.00015801809210526314, "loss": 0.4946, "step": 1422 }, { "epoch": 1.578699209540979, "grad_norm": 0.4339245557785034, "learning_rate": 0.00015789473684210524, "loss": 0.5112, "step": 1423 }, { "epoch": 1.5798086257107198, "grad_norm": 0.18522101640701294, "learning_rate": 0.00015777138157894733, "loss": 0.3891, "step": 1424 }, { "epoch": 1.5809180418804605, "grad_norm": 0.23903268575668335, "learning_rate": 0.00015764802631578945, "loss": 0.3706, "step": 1425 }, { "epoch": 1.582027458050201, "grad_norm": 0.6427960991859436, "learning_rate": 0.00015752467105263157, "loss": 0.6339, "step": 1426 }, { "epoch": 1.5831368742199419, "grad_norm": 0.2820015847682953, "learning_rate": 0.0001574013157894737, "loss": 0.4439, "step": 1427 }, { "epoch": 1.5842462903896823, "grad_norm": 0.26673081517219543, "learning_rate": 0.00015727796052631579, "loss": 0.5116, "step": 1428 }, { "epoch": 1.5853557065594233, "grad_norm": 0.36776989698410034, "learning_rate": 0.00015715460526315788, "loss": 0.4931, "step": 1429 }, { "epoch": 1.5864651227291637, "grad_norm": 0.3894679844379425, "learning_rate": 0.00015703125, "loss": 0.478, "step": 1430 }, { "epoch": 1.5875745388989044, "grad_norm": 0.27240705490112305, "learning_rate": 0.0001569078947368421, "loss": 0.3737, "step": 1431 }, { "epoch": 1.5886839550686451, "grad_norm": 0.32550275325775146, "learning_rate": 0.00015678453947368418, "loss": 0.5774, "step": 1432 }, { "epoch": 1.5897933712383858, "grad_norm": 0.32884177565574646, "learning_rate": 0.00015666118421052628, "loss": 0.5408, "step": 1433 }, { "epoch": 1.5909027874081265, "grad_norm": 0.45859280228614807, "learning_rate": 0.0001565378289473684, "loss": 0.7546, "step": 1434 }, { "epoch": 1.592012203577867, "grad_norm": 0.29456886649131775, "learning_rate": 0.00015641447368421052, "loss": 0.3665, "step": 1435 }, { "epoch": 1.593121619747608, "grad_norm": 0.22899344563484192, "learning_rate": 0.00015629111842105264, "loss": 0.4733, "step": 1436 }, { "epoch": 1.5942310359173484, "grad_norm": 0.3687454164028168, "learning_rate": 0.00015616776315789473, "loss": 0.4508, "step": 1437 }, { "epoch": 1.5953404520870893, "grad_norm": 0.2763974964618683, "learning_rate": 0.00015604440789473683, "loss": 0.5613, "step": 1438 }, { "epoch": 1.5964498682568298, "grad_norm": 0.4642561376094818, "learning_rate": 0.00015592105263157895, "loss": 0.5261, "step": 1439 }, { "epoch": 1.5975592844265705, "grad_norm": 0.46116307377815247, "learning_rate": 0.00015579769736842104, "loss": 0.5183, "step": 1440 }, { "epoch": 1.5986687005963112, "grad_norm": 0.42349570989608765, "learning_rate": 0.00015567434210526313, "loss": 0.4358, "step": 1441 }, { "epoch": 1.5997781167660519, "grad_norm": 0.5927262902259827, "learning_rate": 0.00015555098684210525, "loss": 0.4545, "step": 1442 }, { "epoch": 1.6008875329357926, "grad_norm": 0.2763030230998993, "learning_rate": 0.00015542763157894735, "loss": 0.598, "step": 1443 }, { "epoch": 1.6019969491055333, "grad_norm": 0.2903679311275482, "learning_rate": 0.00015530427631578944, "loss": 0.4782, "step": 1444 }, { "epoch": 1.603106365275274, "grad_norm": 0.31723347306251526, "learning_rate": 0.00015518092105263159, "loss": 0.4793, "step": 1445 }, { "epoch": 1.6042157814450144, "grad_norm": 0.3059498965740204, "learning_rate": 0.00015505756578947368, "loss": 0.5086, "step": 1446 }, { "epoch": 1.6053251976147553, "grad_norm": 0.5435683131217957, "learning_rate": 0.00015493421052631577, "loss": 0.5502, "step": 1447 }, { "epoch": 1.6064346137844958, "grad_norm": 0.28451740741729736, "learning_rate": 0.0001548108552631579, "loss": 0.3415, "step": 1448 }, { "epoch": 1.6075440299542367, "grad_norm": 0.27565130591392517, "learning_rate": 0.00015468749999999999, "loss": 0.4961, "step": 1449 }, { "epoch": 1.6086534461239772, "grad_norm": 0.38885173201560974, "learning_rate": 0.00015456414473684208, "loss": 0.6244, "step": 1450 }, { "epoch": 1.609762862293718, "grad_norm": 0.3009326457977295, "learning_rate": 0.0001544407894736842, "loss": 0.3191, "step": 1451 }, { "epoch": 1.6108722784634586, "grad_norm": 0.2374914437532425, "learning_rate": 0.0001543174342105263, "loss": 0.4029, "step": 1452 }, { "epoch": 1.6119816946331993, "grad_norm": 0.24939504265785217, "learning_rate": 0.00015419407894736839, "loss": 0.6444, "step": 1453 }, { "epoch": 1.61309111080294, "grad_norm": 0.2710380554199219, "learning_rate": 0.00015407072368421053, "loss": 0.5412, "step": 1454 }, { "epoch": 1.6142005269726805, "grad_norm": 0.32970866560935974, "learning_rate": 0.00015394736842105263, "loss": 0.4512, "step": 1455 }, { "epoch": 1.6153099431424214, "grad_norm": 0.4523448944091797, "learning_rate": 0.00015382401315789472, "loss": 0.5883, "step": 1456 }, { "epoch": 1.6164193593121619, "grad_norm": 0.25891023874282837, "learning_rate": 0.00015370065789473684, "loss": 0.5516, "step": 1457 }, { "epoch": 1.6175287754819028, "grad_norm": 0.37495937943458557, "learning_rate": 0.00015357730263157893, "loss": 0.5841, "step": 1458 }, { "epoch": 1.6186381916516432, "grad_norm": 0.33875572681427, "learning_rate": 0.00015345394736842103, "loss": 0.6034, "step": 1459 }, { "epoch": 1.6197476078213842, "grad_norm": 0.2283666431903839, "learning_rate": 0.00015333059210526315, "loss": 0.3142, "step": 1460 }, { "epoch": 1.6208570239911246, "grad_norm": 0.35166046023368835, "learning_rate": 0.00015320723684210524, "loss": 0.5688, "step": 1461 }, { "epoch": 1.6219664401608653, "grad_norm": 0.3472052812576294, "learning_rate": 0.00015308388157894733, "loss": 0.2508, "step": 1462 }, { "epoch": 1.623075856330606, "grad_norm": 0.30597633123397827, "learning_rate": 0.00015296052631578945, "loss": 0.5829, "step": 1463 }, { "epoch": 1.6241852725003467, "grad_norm": 0.7042750716209412, "learning_rate": 0.00015283717105263157, "loss": 0.5834, "step": 1464 }, { "epoch": 1.6252946886700874, "grad_norm": 0.3710060119628906, "learning_rate": 0.0001527138157894737, "loss": 0.4427, "step": 1465 }, { "epoch": 1.626404104839828, "grad_norm": 0.3646557927131653, "learning_rate": 0.00015259046052631579, "loss": 0.5418, "step": 1466 }, { "epoch": 1.6275135210095688, "grad_norm": 0.357334703207016, "learning_rate": 0.00015246710526315788, "loss": 0.5551, "step": 1467 }, { "epoch": 1.6286229371793093, "grad_norm": 0.2936727702617645, "learning_rate": 0.00015234375, "loss": 0.5441, "step": 1468 }, { "epoch": 1.6297323533490502, "grad_norm": 0.24167072772979736, "learning_rate": 0.0001522203947368421, "loss": 0.6756, "step": 1469 }, { "epoch": 1.6308417695187907, "grad_norm": 0.3860384523868561, "learning_rate": 0.00015209703947368419, "loss": 0.6466, "step": 1470 }, { "epoch": 1.6319511856885314, "grad_norm": 0.20689241588115692, "learning_rate": 0.00015197368421052628, "loss": 0.28, "step": 1471 }, { "epoch": 1.633060601858272, "grad_norm": 0.2413812130689621, "learning_rate": 0.0001518503289473684, "loss": 0.5198, "step": 1472 }, { "epoch": 1.6341700180280128, "grad_norm": 0.2979881167411804, "learning_rate": 0.00015172697368421052, "loss": 0.5016, "step": 1473 }, { "epoch": 1.6352794341977535, "grad_norm": 0.22364138066768646, "learning_rate": 0.00015160361842105264, "loss": 0.3996, "step": 1474 }, { "epoch": 1.6363888503674942, "grad_norm": 0.27691081166267395, "learning_rate": 0.00015148026315789473, "loss": 0.4038, "step": 1475 }, { "epoch": 1.6374982665372348, "grad_norm": 0.300599604845047, "learning_rate": 0.00015135690789473683, "loss": 0.4522, "step": 1476 }, { "epoch": 1.6386076827069753, "grad_norm": 0.2906956672668457, "learning_rate": 0.00015123355263157895, "loss": 0.5978, "step": 1477 }, { "epoch": 1.6397170988767162, "grad_norm": 0.3278878629207611, "learning_rate": 0.00015111019736842104, "loss": 0.445, "step": 1478 }, { "epoch": 1.6408265150464567, "grad_norm": 0.25733959674835205, "learning_rate": 0.00015098684210526313, "loss": 0.4896, "step": 1479 }, { "epoch": 1.6419359312161976, "grad_norm": 0.33139604330062866, "learning_rate": 0.00015086348684210525, "loss": 0.54, "step": 1480 }, { "epoch": 1.643045347385938, "grad_norm": 0.6023613810539246, "learning_rate": 0.00015074013157894735, "loss": 0.6114, "step": 1481 }, { "epoch": 1.6441547635556788, "grad_norm": 0.22838066518306732, "learning_rate": 0.00015061677631578944, "loss": 0.426, "step": 1482 }, { "epoch": 1.6452641797254195, "grad_norm": 0.26457536220550537, "learning_rate": 0.0001504934210526316, "loss": 0.4963, "step": 1483 }, { "epoch": 1.6463735958951602, "grad_norm": 0.30254966020584106, "learning_rate": 0.00015037006578947368, "loss": 0.5112, "step": 1484 }, { "epoch": 1.6474830120649009, "grad_norm": 0.23207102715969086, "learning_rate": 0.00015024671052631577, "loss": 0.4926, "step": 1485 }, { "epoch": 1.6485924282346414, "grad_norm": 0.25486233830451965, "learning_rate": 0.0001501233552631579, "loss": 0.4387, "step": 1486 }, { "epoch": 1.6497018444043823, "grad_norm": 0.3489553928375244, "learning_rate": 0.00015, "loss": 0.6869, "step": 1487 }, { "epoch": 1.6508112605741228, "grad_norm": 0.28267863392829895, "learning_rate": 0.00014987664473684208, "loss": 0.4191, "step": 1488 }, { "epoch": 1.6519206767438637, "grad_norm": 0.2585887014865875, "learning_rate": 0.0001497532894736842, "loss": 0.5599, "step": 1489 }, { "epoch": 1.6530300929136041, "grad_norm": 0.27951672673225403, "learning_rate": 0.00014962993421052632, "loss": 0.6091, "step": 1490 }, { "epoch": 1.6541395090833448, "grad_norm": 0.33680734038352966, "learning_rate": 0.00014950657894736841, "loss": 0.5986, "step": 1491 }, { "epoch": 1.6552489252530855, "grad_norm": 0.33777835965156555, "learning_rate": 0.0001493832236842105, "loss": 0.6051, "step": 1492 }, { "epoch": 1.6563583414228262, "grad_norm": 0.29178673028945923, "learning_rate": 0.00014925986842105263, "loss": 0.3462, "step": 1493 }, { "epoch": 1.657467757592567, "grad_norm": 0.23857684433460236, "learning_rate": 0.00014913651315789472, "loss": 0.3521, "step": 1494 }, { "epoch": 1.6585771737623076, "grad_norm": 0.33080053329467773, "learning_rate": 0.00014901315789473684, "loss": 0.4185, "step": 1495 }, { "epoch": 1.6596865899320483, "grad_norm": 0.35032498836517334, "learning_rate": 0.00014888980263157893, "loss": 0.6305, "step": 1496 }, { "epoch": 1.6607960061017888, "grad_norm": 0.2609749138355255, "learning_rate": 0.00014876644736842103, "loss": 0.4313, "step": 1497 }, { "epoch": 1.6619054222715297, "grad_norm": 0.3178192675113678, "learning_rate": 0.00014864309210526315, "loss": 0.6381, "step": 1498 }, { "epoch": 1.6630148384412702, "grad_norm": 0.3444008231163025, "learning_rate": 0.00014851973684210527, "loss": 0.6142, "step": 1499 }, { "epoch": 1.664124254611011, "grad_norm": 0.24173513054847717, "learning_rate": 0.00014839638157894736, "loss": 0.5568, "step": 1500 }, { "epoch": 1.6652336707807516, "grad_norm": 0.23622675240039825, "learning_rate": 0.00014827302631578945, "loss": 0.4888, "step": 1501 }, { "epoch": 1.6663430869504923, "grad_norm": 0.24951039254665375, "learning_rate": 0.00014814967105263157, "loss": 0.5554, "step": 1502 }, { "epoch": 1.667452503120233, "grad_norm": 0.20648309588432312, "learning_rate": 0.00014802631578947367, "loss": 0.3302, "step": 1503 }, { "epoch": 1.6685619192899737, "grad_norm": 0.2786915898323059, "learning_rate": 0.0001479029605263158, "loss": 0.3773, "step": 1504 }, { "epoch": 1.6696713354597144, "grad_norm": 0.35018453001976013, "learning_rate": 0.00014777960526315788, "loss": 0.5903, "step": 1505 }, { "epoch": 1.670780751629455, "grad_norm": 0.34121614694595337, "learning_rate": 0.00014765624999999997, "loss": 0.4813, "step": 1506 }, { "epoch": 1.6718901677991957, "grad_norm": 0.41687941551208496, "learning_rate": 0.0001475328947368421, "loss": 0.7389, "step": 1507 }, { "epoch": 1.6729995839689362, "grad_norm": 0.27919813990592957, "learning_rate": 0.0001474095394736842, "loss": 0.3984, "step": 1508 }, { "epoch": 1.6741090001386771, "grad_norm": 0.2847552001476288, "learning_rate": 0.0001472861842105263, "loss": 0.3091, "step": 1509 }, { "epoch": 1.6752184163084176, "grad_norm": 0.3107469379901886, "learning_rate": 0.0001471628289473684, "loss": 0.4399, "step": 1510 }, { "epoch": 1.6763278324781585, "grad_norm": 0.21151646971702576, "learning_rate": 0.00014703947368421052, "loss": 0.3691, "step": 1511 }, { "epoch": 1.677437248647899, "grad_norm": 0.27195483446121216, "learning_rate": 0.00014691611842105261, "loss": 0.5531, "step": 1512 }, { "epoch": 1.6785466648176397, "grad_norm": 0.2964016795158386, "learning_rate": 0.0001467927631578947, "loss": 0.4978, "step": 1513 }, { "epoch": 1.6796560809873804, "grad_norm": 0.39583608508110046, "learning_rate": 0.00014666940789473683, "loss": 0.6652, "step": 1514 }, { "epoch": 1.680765497157121, "grad_norm": 0.23227983713150024, "learning_rate": 0.00014654605263157895, "loss": 0.5401, "step": 1515 }, { "epoch": 1.6818749133268618, "grad_norm": 0.2583910822868347, "learning_rate": 0.00014642269736842104, "loss": 0.5246, "step": 1516 }, { "epoch": 1.6829843294966023, "grad_norm": 0.32901066541671753, "learning_rate": 0.00014629934210526313, "loss": 0.4348, "step": 1517 }, { "epoch": 1.6840937456663432, "grad_norm": 0.2796122133731842, "learning_rate": 0.00014617598684210525, "loss": 0.2749, "step": 1518 }, { "epoch": 1.6852031618360837, "grad_norm": 0.2737559378147125, "learning_rate": 0.00014605263157894735, "loss": 0.3413, "step": 1519 }, { "epoch": 1.6863125780058246, "grad_norm": 0.5847452282905579, "learning_rate": 0.00014592927631578947, "loss": 0.5046, "step": 1520 }, { "epoch": 1.687421994175565, "grad_norm": 0.42608511447906494, "learning_rate": 0.00014580592105263156, "loss": 0.511, "step": 1521 }, { "epoch": 1.6885314103453057, "grad_norm": 0.3839153051376343, "learning_rate": 0.00014568256578947365, "loss": 0.4812, "step": 1522 }, { "epoch": 1.6896408265150464, "grad_norm": 0.2144833356142044, "learning_rate": 0.00014555921052631577, "loss": 0.479, "step": 1523 }, { "epoch": 1.6907502426847871, "grad_norm": 0.34472957253456116, "learning_rate": 0.0001454358552631579, "loss": 0.356, "step": 1524 }, { "epoch": 1.6918596588545278, "grad_norm": 0.29683181643486023, "learning_rate": 0.0001453125, "loss": 0.385, "step": 1525 }, { "epoch": 1.6929690750242685, "grad_norm": 0.3433467447757721, "learning_rate": 0.00014518914473684208, "loss": 0.4606, "step": 1526 }, { "epoch": 1.6940784911940092, "grad_norm": 0.3034195601940155, "learning_rate": 0.0001450657894736842, "loss": 0.5594, "step": 1527 }, { "epoch": 1.6951879073637497, "grad_norm": 0.49384185671806335, "learning_rate": 0.00014494243421052632, "loss": 0.6995, "step": 1528 }, { "epoch": 1.6962973235334906, "grad_norm": 0.5246427655220032, "learning_rate": 0.00014481907894736842, "loss": 0.6432, "step": 1529 }, { "epoch": 1.697406739703231, "grad_norm": 0.26392480731010437, "learning_rate": 0.0001446957236842105, "loss": 0.5626, "step": 1530 }, { "epoch": 1.698516155872972, "grad_norm": 0.27015626430511475, "learning_rate": 0.0001445723684210526, "loss": 0.4244, "step": 1531 }, { "epoch": 1.6996255720427125, "grad_norm": 0.3427369296550751, "learning_rate": 0.00014444901315789472, "loss": 0.4015, "step": 1532 }, { "epoch": 1.7007349882124532, "grad_norm": 0.4389760196208954, "learning_rate": 0.00014432565789473684, "loss": 0.654, "step": 1533 }, { "epoch": 1.7018444043821939, "grad_norm": 0.36939921975135803, "learning_rate": 0.00014420230263157894, "loss": 0.6009, "step": 1534 }, { "epoch": 1.7029538205519346, "grad_norm": 0.2916509211063385, "learning_rate": 0.00014407894736842103, "loss": 0.4624, "step": 1535 }, { "epoch": 1.7040632367216753, "grad_norm": 0.5189476013183594, "learning_rate": 0.00014395559210526315, "loss": 0.5838, "step": 1536 }, { "epoch": 1.7051726528914157, "grad_norm": 0.2686052620410919, "learning_rate": 0.00014383223684210527, "loss": 0.9601, "step": 1537 }, { "epoch": 1.7062820690611566, "grad_norm": 0.28845494985580444, "learning_rate": 0.00014370888157894736, "loss": 0.3672, "step": 1538 }, { "epoch": 1.7073914852308971, "grad_norm": 0.21178792417049408, "learning_rate": 0.00014358552631578946, "loss": 0.3064, "step": 1539 }, { "epoch": 1.708500901400638, "grad_norm": 0.2538648843765259, "learning_rate": 0.00014346217105263158, "loss": 0.4975, "step": 1540 }, { "epoch": 1.7096103175703785, "grad_norm": 0.4006761908531189, "learning_rate": 0.00014333881578947367, "loss": 0.5619, "step": 1541 }, { "epoch": 1.7107197337401192, "grad_norm": 0.3077350854873657, "learning_rate": 0.0001432154605263158, "loss": 0.6509, "step": 1542 }, { "epoch": 1.71182914990986, "grad_norm": 0.29142218828201294, "learning_rate": 0.00014309210526315788, "loss": 0.4765, "step": 1543 }, { "epoch": 1.7129385660796006, "grad_norm": 0.3905639350414276, "learning_rate": 0.00014296874999999998, "loss": 0.8906, "step": 1544 }, { "epoch": 1.7140479822493413, "grad_norm": 0.32861366868019104, "learning_rate": 0.0001428453947368421, "loss": 0.6136, "step": 1545 }, { "epoch": 1.715157398419082, "grad_norm": 0.20155826210975647, "learning_rate": 0.0001427220394736842, "loss": 0.4392, "step": 1546 }, { "epoch": 1.7162668145888227, "grad_norm": 0.35804450511932373, "learning_rate": 0.0001425986842105263, "loss": 0.6582, "step": 1547 }, { "epoch": 1.7173762307585632, "grad_norm": 0.26054689288139343, "learning_rate": 0.0001424753289473684, "loss": 0.4493, "step": 1548 }, { "epoch": 1.718485646928304, "grad_norm": 0.25189530849456787, "learning_rate": 0.00014235197368421052, "loss": 0.4806, "step": 1549 }, { "epoch": 1.7195950630980446, "grad_norm": 0.3394787609577179, "learning_rate": 0.00014222861842105262, "loss": 0.5681, "step": 1550 }, { "epoch": 1.7207044792677855, "grad_norm": 0.3084029257297516, "learning_rate": 0.0001421052631578947, "loss": 0.6309, "step": 1551 }, { "epoch": 1.721813895437526, "grad_norm": 0.3268156945705414, "learning_rate": 0.00014198190789473683, "loss": 0.4937, "step": 1552 }, { "epoch": 1.7229233116072666, "grad_norm": 0.2543306350708008, "learning_rate": 0.00014185855263157895, "loss": 0.3895, "step": 1553 }, { "epoch": 1.7240327277770073, "grad_norm": 0.2428501844406128, "learning_rate": 0.00014173519736842104, "loss": 0.4724, "step": 1554 }, { "epoch": 1.725142143946748, "grad_norm": 0.34834590554237366, "learning_rate": 0.00014161184210526314, "loss": 0.4686, "step": 1555 }, { "epoch": 1.7262515601164887, "grad_norm": 0.2403583824634552, "learning_rate": 0.00014148848684210526, "loss": 0.3847, "step": 1556 }, { "epoch": 1.7273609762862294, "grad_norm": 0.38176214694976807, "learning_rate": 0.00014136513157894735, "loss": 0.4869, "step": 1557 }, { "epoch": 1.7284703924559701, "grad_norm": 0.2659490406513214, "learning_rate": 0.00014124177631578947, "loss": 0.4449, "step": 1558 }, { "epoch": 1.7295798086257106, "grad_norm": 0.2423594892024994, "learning_rate": 0.00014111842105263156, "loss": 0.5206, "step": 1559 }, { "epoch": 1.7306892247954515, "grad_norm": 0.3293440639972687, "learning_rate": 0.00014099506578947366, "loss": 0.621, "step": 1560 }, { "epoch": 1.731798640965192, "grad_norm": 0.27292686700820923, "learning_rate": 0.00014087171052631578, "loss": 0.4955, "step": 1561 }, { "epoch": 1.732908057134933, "grad_norm": 0.3719004690647125, "learning_rate": 0.0001407483552631579, "loss": 0.4081, "step": 1562 }, { "epoch": 1.7340174733046734, "grad_norm": 0.3784489631652832, "learning_rate": 0.000140625, "loss": 0.4212, "step": 1563 }, { "epoch": 1.735126889474414, "grad_norm": 0.24494099617004395, "learning_rate": 0.00014050164473684208, "loss": 0.4698, "step": 1564 }, { "epoch": 1.7362363056441548, "grad_norm": 0.2339191883802414, "learning_rate": 0.0001403782894736842, "loss": 0.3486, "step": 1565 }, { "epoch": 1.7373457218138955, "grad_norm": 0.321445107460022, "learning_rate": 0.00014025493421052632, "loss": 0.8323, "step": 1566 }, { "epoch": 1.7384551379836362, "grad_norm": 0.3625154495239258, "learning_rate": 0.00014013157894736842, "loss": 0.3676, "step": 1567 }, { "epoch": 1.7395645541533766, "grad_norm": 0.30214935541152954, "learning_rate": 0.0001400082236842105, "loss": 0.611, "step": 1568 }, { "epoch": 1.7406739703231175, "grad_norm": 0.3197210133075714, "learning_rate": 0.0001398848684210526, "loss": 0.5491, "step": 1569 }, { "epoch": 1.741783386492858, "grad_norm": 0.31939029693603516, "learning_rate": 0.00013976151315789472, "loss": 0.5426, "step": 1570 }, { "epoch": 1.742892802662599, "grad_norm": 0.470907986164093, "learning_rate": 0.00013963815789473684, "loss": 0.6622, "step": 1571 }, { "epoch": 1.7440022188323394, "grad_norm": 0.2162821739912033, "learning_rate": 0.00013951480263157894, "loss": 0.5728, "step": 1572 }, { "epoch": 1.74511163500208, "grad_norm": 0.24964164197444916, "learning_rate": 0.00013939144736842103, "loss": 0.6929, "step": 1573 }, { "epoch": 1.7462210511718208, "grad_norm": 0.34951767325401306, "learning_rate": 0.00013926809210526315, "loss": 0.3177, "step": 1574 }, { "epoch": 1.7473304673415615, "grad_norm": 0.2987998425960541, "learning_rate": 0.00013914473684210527, "loss": 0.4462, "step": 1575 }, { "epoch": 1.7484398835113022, "grad_norm": 0.362047016620636, "learning_rate": 0.00013902138157894736, "loss": 0.4171, "step": 1576 }, { "epoch": 1.749549299681043, "grad_norm": 0.2592370808124542, "learning_rate": 0.00013889802631578946, "loss": 0.2222, "step": 1577 }, { "epoch": 1.7506587158507836, "grad_norm": 0.26247555017471313, "learning_rate": 0.00013877467105263158, "loss": 0.5498, "step": 1578 }, { "epoch": 1.751768132020524, "grad_norm": 0.28997063636779785, "learning_rate": 0.00013865131578947367, "loss": 0.537, "step": 1579 }, { "epoch": 1.752877548190265, "grad_norm": 0.31275662779808044, "learning_rate": 0.0001385279605263158, "loss": 0.5331, "step": 1580 }, { "epoch": 1.7539869643600055, "grad_norm": 0.3327484130859375, "learning_rate": 0.00013840460526315788, "loss": 0.4344, "step": 1581 }, { "epoch": 1.7550963805297464, "grad_norm": 0.31380587816238403, "learning_rate": 0.00013828124999999998, "loss": 0.641, "step": 1582 }, { "epoch": 1.7562057966994868, "grad_norm": 0.2786813974380493, "learning_rate": 0.0001381578947368421, "loss": 0.3636, "step": 1583 }, { "epoch": 1.7573152128692275, "grad_norm": 0.33598342537879944, "learning_rate": 0.0001380345394736842, "loss": 0.5949, "step": 1584 }, { "epoch": 1.7584246290389682, "grad_norm": 0.28291746973991394, "learning_rate": 0.0001379111842105263, "loss": 0.3358, "step": 1585 }, { "epoch": 1.759534045208709, "grad_norm": 0.22912530601024628, "learning_rate": 0.0001377878289473684, "loss": 0.4407, "step": 1586 }, { "epoch": 1.7606434613784496, "grad_norm": 0.3488161861896515, "learning_rate": 0.00013766447368421052, "loss": 0.4591, "step": 1587 }, { "epoch": 1.76175287754819, "grad_norm": 0.31319087743759155, "learning_rate": 0.00013754111842105262, "loss": 0.496, "step": 1588 }, { "epoch": 1.762862293717931, "grad_norm": 0.31536537408828735, "learning_rate": 0.0001374177631578947, "loss": 0.4566, "step": 1589 }, { "epoch": 1.7639717098876715, "grad_norm": 0.38071408867836, "learning_rate": 0.00013729440789473683, "loss": 0.4728, "step": 1590 }, { "epoch": 1.7650811260574124, "grad_norm": 0.25146248936653137, "learning_rate": 0.00013717105263157895, "loss": 0.3651, "step": 1591 }, { "epoch": 1.7661905422271529, "grad_norm": 0.262510746717453, "learning_rate": 0.00013704769736842104, "loss": 0.3982, "step": 1592 }, { "epoch": 1.7672999583968938, "grad_norm": 0.2857152819633484, "learning_rate": 0.00013692434210526314, "loss": 0.3916, "step": 1593 }, { "epoch": 1.7684093745666343, "grad_norm": 0.5477368831634521, "learning_rate": 0.00013680098684210526, "loss": 0.5775, "step": 1594 }, { "epoch": 1.769518790736375, "grad_norm": 0.30496469140052795, "learning_rate": 0.00013667763157894735, "loss": 0.6863, "step": 1595 }, { "epoch": 1.7706282069061157, "grad_norm": 0.32225456833839417, "learning_rate": 0.00013655427631578947, "loss": 0.4028, "step": 1596 }, { "epoch": 1.7717376230758564, "grad_norm": 0.25836458802223206, "learning_rate": 0.00013643092105263156, "loss": 0.4638, "step": 1597 }, { "epoch": 1.772847039245597, "grad_norm": 0.24989454448223114, "learning_rate": 0.00013630756578947366, "loss": 0.5425, "step": 1598 }, { "epoch": 1.7739564554153375, "grad_norm": 0.46502557396888733, "learning_rate": 0.00013618421052631578, "loss": 0.5317, "step": 1599 }, { "epoch": 1.7750658715850784, "grad_norm": 0.32870060205459595, "learning_rate": 0.0001360608552631579, "loss": 0.5497, "step": 1600 }, { "epoch": 1.776175287754819, "grad_norm": 0.24722667038440704, "learning_rate": 0.0001359375, "loss": 0.5929, "step": 1601 }, { "epoch": 1.7772847039245598, "grad_norm": 0.3317899703979492, "learning_rate": 0.00013581414473684208, "loss": 0.4435, "step": 1602 }, { "epoch": 1.7783941200943003, "grad_norm": 0.36830225586891174, "learning_rate": 0.0001356907894736842, "loss": 0.5876, "step": 1603 }, { "epoch": 1.779503536264041, "grad_norm": 0.23982636630535126, "learning_rate": 0.0001355674342105263, "loss": 0.4906, "step": 1604 }, { "epoch": 1.7806129524337817, "grad_norm": 0.3808034062385559, "learning_rate": 0.00013544407894736842, "loss": 0.6202, "step": 1605 }, { "epoch": 1.7817223686035224, "grad_norm": 0.3208853006362915, "learning_rate": 0.0001353207236842105, "loss": 0.4002, "step": 1606 }, { "epoch": 1.782831784773263, "grad_norm": 0.42497456073760986, "learning_rate": 0.0001351973684210526, "loss": 0.6453, "step": 1607 }, { "epoch": 1.7839412009430038, "grad_norm": 0.5558460354804993, "learning_rate": 0.00013507401315789472, "loss": 0.424, "step": 1608 }, { "epoch": 1.7850506171127445, "grad_norm": 0.38764357566833496, "learning_rate": 0.00013495065789473684, "loss": 0.4115, "step": 1609 }, { "epoch": 1.786160033282485, "grad_norm": 0.3829018771648407, "learning_rate": 0.00013482730263157894, "loss": 0.4814, "step": 1610 }, { "epoch": 1.7872694494522259, "grad_norm": 0.4661031663417816, "learning_rate": 0.00013470394736842103, "loss": 0.6416, "step": 1611 }, { "epoch": 1.7883788656219664, "grad_norm": 0.2798513174057007, "learning_rate": 0.00013458059210526315, "loss": 0.4603, "step": 1612 }, { "epoch": 1.7894882817917073, "grad_norm": 0.3700726330280304, "learning_rate": 0.00013445723684210527, "loss": 0.4821, "step": 1613 }, { "epoch": 1.7905976979614477, "grad_norm": 0.35398468375205994, "learning_rate": 0.00013433388157894736, "loss": 0.5852, "step": 1614 }, { "epoch": 1.7917071141311884, "grad_norm": 0.37176424264907837, "learning_rate": 0.00013421052631578946, "loss": 0.605, "step": 1615 }, { "epoch": 1.7928165303009291, "grad_norm": 0.2966163754463196, "learning_rate": 0.00013408717105263158, "loss": 0.4163, "step": 1616 }, { "epoch": 1.7939259464706698, "grad_norm": 0.3742397129535675, "learning_rate": 0.00013396381578947367, "loss": 0.4507, "step": 1617 }, { "epoch": 1.7950353626404105, "grad_norm": 0.36498209834098816, "learning_rate": 0.0001338404605263158, "loss": 0.8315, "step": 1618 }, { "epoch": 1.796144778810151, "grad_norm": 0.4069786071777344, "learning_rate": 0.00013371710526315788, "loss": 0.5758, "step": 1619 }, { "epoch": 1.797254194979892, "grad_norm": 0.30974453687667847, "learning_rate": 0.00013359374999999998, "loss": 0.7786, "step": 1620 }, { "epoch": 1.7983636111496324, "grad_norm": 0.5354030728340149, "learning_rate": 0.0001334703947368421, "loss": 0.5513, "step": 1621 }, { "epoch": 1.7994730273193733, "grad_norm": 0.24419055879116058, "learning_rate": 0.0001333470394736842, "loss": 0.4483, "step": 1622 }, { "epoch": 1.8005824434891138, "grad_norm": 0.33314335346221924, "learning_rate": 0.0001332236842105263, "loss": 0.5992, "step": 1623 }, { "epoch": 1.8016918596588545, "grad_norm": 0.22179794311523438, "learning_rate": 0.0001331003289473684, "loss": 0.382, "step": 1624 }, { "epoch": 1.8028012758285952, "grad_norm": 0.26683637499809265, "learning_rate": 0.00013297697368421052, "loss": 0.5549, "step": 1625 }, { "epoch": 1.8039106919983359, "grad_norm": 0.2577199339866638, "learning_rate": 0.00013285361842105262, "loss": 0.5963, "step": 1626 }, { "epoch": 1.8050201081680766, "grad_norm": 0.30272090435028076, "learning_rate": 0.0001327302631578947, "loss": 0.4767, "step": 1627 }, { "epoch": 1.8061295243378173, "grad_norm": 0.4484618306159973, "learning_rate": 0.00013260690789473683, "loss": 0.4336, "step": 1628 }, { "epoch": 1.807238940507558, "grad_norm": 0.3869413733482361, "learning_rate": 0.00013248355263157892, "loss": 0.3799, "step": 1629 }, { "epoch": 1.8083483566772984, "grad_norm": 0.26756906509399414, "learning_rate": 0.00013236019736842104, "loss": 0.5128, "step": 1630 }, { "epoch": 1.8094577728470393, "grad_norm": 0.21581970155239105, "learning_rate": 0.00013223684210526314, "loss": 0.5776, "step": 1631 }, { "epoch": 1.8105671890167798, "grad_norm": 0.5925078392028809, "learning_rate": 0.00013211348684210526, "loss": 0.4828, "step": 1632 }, { "epoch": 1.8116766051865207, "grad_norm": 0.29944899678230286, "learning_rate": 0.00013199013157894735, "loss": 0.5219, "step": 1633 }, { "epoch": 1.8127860213562612, "grad_norm": 0.3090478479862213, "learning_rate": 0.00013186677631578947, "loss": 0.7766, "step": 1634 }, { "epoch": 1.813895437526002, "grad_norm": 0.27250027656555176, "learning_rate": 0.00013174342105263156, "loss": 0.5283, "step": 1635 }, { "epoch": 1.8150048536957426, "grad_norm": 0.30475983023643494, "learning_rate": 0.00013162006578947366, "loss": 0.4438, "step": 1636 }, { "epoch": 1.8161142698654833, "grad_norm": 0.4503616690635681, "learning_rate": 0.00013149671052631578, "loss": 0.6286, "step": 1637 }, { "epoch": 1.817223686035224, "grad_norm": 0.3719213306903839, "learning_rate": 0.0001313733552631579, "loss": 0.4938, "step": 1638 }, { "epoch": 1.8183331022049647, "grad_norm": 0.2590722143650055, "learning_rate": 0.00013125, "loss": 0.3978, "step": 1639 }, { "epoch": 1.8194425183747054, "grad_norm": 0.4789052903652191, "learning_rate": 0.00013112664473684209, "loss": 0.4542, "step": 1640 }, { "epoch": 1.8205519345444459, "grad_norm": 0.3678234815597534, "learning_rate": 0.0001310032894736842, "loss": 0.4437, "step": 1641 }, { "epoch": 1.8216613507141868, "grad_norm": 0.21813832223415375, "learning_rate": 0.0001308799342105263, "loss": 0.3382, "step": 1642 }, { "epoch": 1.8227707668839273, "grad_norm": 0.2665456235408783, "learning_rate": 0.00013075657894736842, "loss": 0.3803, "step": 1643 }, { "epoch": 1.8238801830536682, "grad_norm": 0.26693305373191833, "learning_rate": 0.0001306332236842105, "loss": 0.4494, "step": 1644 }, { "epoch": 1.8249895992234086, "grad_norm": 0.22977307438850403, "learning_rate": 0.0001305098684210526, "loss": 0.3472, "step": 1645 }, { "epoch": 1.8260990153931493, "grad_norm": 0.38384175300598145, "learning_rate": 0.00013038651315789473, "loss": 0.7401, "step": 1646 }, { "epoch": 1.82720843156289, "grad_norm": 0.26645827293395996, "learning_rate": 0.00013026315789473685, "loss": 0.5021, "step": 1647 }, { "epoch": 1.8283178477326307, "grad_norm": 0.28554990887641907, "learning_rate": 0.00013013980263157894, "loss": 0.3429, "step": 1648 }, { "epoch": 1.8294272639023714, "grad_norm": 0.23377835750579834, "learning_rate": 0.00013001644736842103, "loss": 0.4639, "step": 1649 }, { "epoch": 1.830536680072112, "grad_norm": 0.323998361825943, "learning_rate": 0.00012989309210526315, "loss": 0.5517, "step": 1650 }, { "epoch": 1.8316460962418528, "grad_norm": 0.24397112429141998, "learning_rate": 0.00012976973684210527, "loss": 0.4443, "step": 1651 }, { "epoch": 1.8327555124115933, "grad_norm": 0.23545107245445251, "learning_rate": 0.00012964638157894737, "loss": 0.6529, "step": 1652 }, { "epoch": 1.8338649285813342, "grad_norm": 0.3781031668186188, "learning_rate": 0.00012952302631578946, "loss": 0.3811, "step": 1653 }, { "epoch": 1.8349743447510747, "grad_norm": 0.3218782842159271, "learning_rate": 0.00012939967105263155, "loss": 0.5818, "step": 1654 }, { "epoch": 1.8360837609208154, "grad_norm": 0.31816890835762024, "learning_rate": 0.00012927631578947367, "loss": 0.515, "step": 1655 }, { "epoch": 1.837193177090556, "grad_norm": 0.3660028278827667, "learning_rate": 0.0001291529605263158, "loss": 0.6154, "step": 1656 }, { "epoch": 1.8383025932602968, "grad_norm": 0.25920218229293823, "learning_rate": 0.00012902960526315789, "loss": 0.5614, "step": 1657 }, { "epoch": 1.8394120094300375, "grad_norm": 0.3921451270580292, "learning_rate": 0.00012890624999999998, "loss": 0.5109, "step": 1658 }, { "epoch": 1.8405214255997782, "grad_norm": 0.30347323417663574, "learning_rate": 0.0001287828947368421, "loss": 0.5139, "step": 1659 }, { "epoch": 1.8416308417695189, "grad_norm": 0.36900901794433594, "learning_rate": 0.0001286595394736842, "loss": 0.3862, "step": 1660 }, { "epoch": 1.8427402579392593, "grad_norm": 0.23627950251102448, "learning_rate": 0.0001285361842105263, "loss": 0.3962, "step": 1661 }, { "epoch": 1.8438496741090002, "grad_norm": 0.3626163601875305, "learning_rate": 0.0001284128289473684, "loss": 0.5779, "step": 1662 }, { "epoch": 1.8449590902787407, "grad_norm": 0.3031785488128662, "learning_rate": 0.00012828947368421053, "loss": 0.4422, "step": 1663 }, { "epoch": 1.8460685064484816, "grad_norm": 0.27116191387176514, "learning_rate": 0.00012816611842105262, "loss": 0.4825, "step": 1664 }, { "epoch": 1.8471779226182221, "grad_norm": 0.23863159120082855, "learning_rate": 0.0001280427631578947, "loss": 0.6105, "step": 1665 }, { "epoch": 1.8482873387879628, "grad_norm": 0.3026638329029083, "learning_rate": 0.00012791940789473683, "loss": 0.5599, "step": 1666 }, { "epoch": 1.8493967549577035, "grad_norm": 0.2904566526412964, "learning_rate": 0.00012779605263157893, "loss": 0.4156, "step": 1667 }, { "epoch": 1.8505061711274442, "grad_norm": 0.2892657518386841, "learning_rate": 0.00012767269736842105, "loss": 0.3779, "step": 1668 }, { "epoch": 1.851615587297185, "grad_norm": 0.27468252182006836, "learning_rate": 0.00012754934210526314, "loss": 0.4811, "step": 1669 }, { "epoch": 1.8527250034669254, "grad_norm": 0.33178287744522095, "learning_rate": 0.00012742598684210526, "loss": 0.6331, "step": 1670 }, { "epoch": 1.8538344196366663, "grad_norm": 0.3048073351383209, "learning_rate": 0.00012730263157894735, "loss": 0.4763, "step": 1671 }, { "epoch": 1.8549438358064068, "grad_norm": 0.2505081593990326, "learning_rate": 0.00012717927631578947, "loss": 0.6395, "step": 1672 }, { "epoch": 1.8560532519761477, "grad_norm": 0.3426123261451721, "learning_rate": 0.00012705592105263157, "loss": 0.4118, "step": 1673 }, { "epoch": 1.8571626681458882, "grad_norm": 0.2770869731903076, "learning_rate": 0.00012693256578947366, "loss": 0.4372, "step": 1674 }, { "epoch": 1.8582720843156288, "grad_norm": 0.28371554613113403, "learning_rate": 0.00012680921052631578, "loss": 0.5352, "step": 1675 }, { "epoch": 1.8593815004853695, "grad_norm": 0.424926221370697, "learning_rate": 0.0001266858552631579, "loss": 0.5825, "step": 1676 }, { "epoch": 1.8604909166551102, "grad_norm": 0.42135924100875854, "learning_rate": 0.0001265625, "loss": 0.54, "step": 1677 }, { "epoch": 1.861600332824851, "grad_norm": 0.35227394104003906, "learning_rate": 0.00012643914473684209, "loss": 0.4149, "step": 1678 }, { "epoch": 1.8627097489945916, "grad_norm": 0.368327796459198, "learning_rate": 0.00012631578947368418, "loss": 0.5194, "step": 1679 }, { "epoch": 1.8638191651643323, "grad_norm": 0.31259453296661377, "learning_rate": 0.0001261924342105263, "loss": 0.5897, "step": 1680 }, { "epoch": 1.8649285813340728, "grad_norm": 0.42234233021736145, "learning_rate": 0.00012606907894736842, "loss": 0.3769, "step": 1681 }, { "epoch": 1.8660379975038137, "grad_norm": 0.258651465177536, "learning_rate": 0.0001259457236842105, "loss": 0.443, "step": 1682 }, { "epoch": 1.8671474136735542, "grad_norm": 0.3242909014225006, "learning_rate": 0.0001258223684210526, "loss": 0.5423, "step": 1683 }, { "epoch": 1.868256829843295, "grad_norm": 0.3746740221977234, "learning_rate": 0.00012569901315789473, "loss": 0.4365, "step": 1684 }, { "epoch": 1.8693662460130356, "grad_norm": 0.2767789363861084, "learning_rate": 0.00012557565789473685, "loss": 0.5011, "step": 1685 }, { "epoch": 1.8704756621827763, "grad_norm": 0.41377684473991394, "learning_rate": 0.00012545230263157894, "loss": 0.4733, "step": 1686 }, { "epoch": 1.871585078352517, "grad_norm": 0.2723773717880249, "learning_rate": 0.00012532894736842103, "loss": 0.5801, "step": 1687 }, { "epoch": 1.8726944945222577, "grad_norm": 0.28274834156036377, "learning_rate": 0.00012520559210526315, "loss": 0.6009, "step": 1688 }, { "epoch": 1.8738039106919984, "grad_norm": 0.3209463655948639, "learning_rate": 0.00012508223684210525, "loss": 0.5851, "step": 1689 }, { "epoch": 1.874913326861739, "grad_norm": 0.24118223786354065, "learning_rate": 0.00012495888157894737, "loss": 0.6186, "step": 1690 }, { "epoch": 1.8760227430314798, "grad_norm": 0.3865971565246582, "learning_rate": 0.00012483552631578946, "loss": 0.5411, "step": 1691 }, { "epoch": 1.8771321592012202, "grad_norm": 0.23888447880744934, "learning_rate": 0.00012471217105263155, "loss": 0.3907, "step": 1692 }, { "epoch": 1.8782415753709611, "grad_norm": 0.290234237909317, "learning_rate": 0.00012458881578947367, "loss": 0.3822, "step": 1693 }, { "epoch": 1.8793509915407016, "grad_norm": 0.2845550775527954, "learning_rate": 0.0001244654605263158, "loss": 0.4248, "step": 1694 }, { "epoch": 1.8804604077104425, "grad_norm": 0.3374759554862976, "learning_rate": 0.0001243421052631579, "loss": 0.7508, "step": 1695 }, { "epoch": 1.881569823880183, "grad_norm": 0.39034581184387207, "learning_rate": 0.00012421874999999998, "loss": 0.4806, "step": 1696 }, { "epoch": 1.8826792400499237, "grad_norm": 0.39774978160858154, "learning_rate": 0.0001240953947368421, "loss": 0.2821, "step": 1697 }, { "epoch": 1.8837886562196644, "grad_norm": 0.31861943006515503, "learning_rate": 0.0001239720394736842, "loss": 0.487, "step": 1698 }, { "epoch": 1.884898072389405, "grad_norm": 0.3267800807952881, "learning_rate": 0.00012384868421052631, "loss": 0.4185, "step": 1699 }, { "epoch": 1.8860074885591458, "grad_norm": 0.28482627868652344, "learning_rate": 0.0001237253289473684, "loss": 0.5111, "step": 1700 }, { "epoch": 1.8871169047288863, "grad_norm": 0.3203260898590088, "learning_rate": 0.00012360197368421053, "loss": 0.465, "step": 1701 }, { "epoch": 1.8882263208986272, "grad_norm": 0.2945539057254791, "learning_rate": 0.00012347861842105262, "loss": 0.615, "step": 1702 }, { "epoch": 1.8893357370683677, "grad_norm": 0.29036056995391846, "learning_rate": 0.00012335526315789471, "loss": 0.5022, "step": 1703 }, { "epoch": 1.8904451532381086, "grad_norm": 0.22323249280452728, "learning_rate": 0.00012323190789473683, "loss": 0.5774, "step": 1704 }, { "epoch": 1.891554569407849, "grad_norm": 0.3879876434803009, "learning_rate": 0.00012310855263157893, "loss": 0.732, "step": 1705 }, { "epoch": 1.8926639855775897, "grad_norm": 0.49169594049453735, "learning_rate": 0.00012298519736842105, "loss": 0.6079, "step": 1706 }, { "epoch": 1.8937734017473304, "grad_norm": 0.32131388783454895, "learning_rate": 0.00012286184210526314, "loss": 0.4357, "step": 1707 }, { "epoch": 1.8948828179170711, "grad_norm": 0.2757743299007416, "learning_rate": 0.00012273848684210526, "loss": 0.4277, "step": 1708 }, { "epoch": 1.8959922340868118, "grad_norm": 0.2627353370189667, "learning_rate": 0.00012261513157894735, "loss": 0.4186, "step": 1709 }, { "epoch": 1.8971016502565525, "grad_norm": 0.24395854771137238, "learning_rate": 0.00012249177631578947, "loss": 0.4422, "step": 1710 }, { "epoch": 1.8982110664262932, "grad_norm": 0.2920277416706085, "learning_rate": 0.00012236842105263157, "loss": 0.3717, "step": 1711 }, { "epoch": 1.8993204825960337, "grad_norm": 0.22231972217559814, "learning_rate": 0.00012224506578947366, "loss": 0.4537, "step": 1712 }, { "epoch": 1.9004298987657746, "grad_norm": 0.2766577899456024, "learning_rate": 0.00012212171052631578, "loss": 0.4264, "step": 1713 }, { "epoch": 1.901539314935515, "grad_norm": 0.30484381318092346, "learning_rate": 0.00012199835526315789, "loss": 0.4421, "step": 1714 }, { "epoch": 1.902648731105256, "grad_norm": 0.36658528447151184, "learning_rate": 0.000121875, "loss": 0.4917, "step": 1715 }, { "epoch": 1.9037581472749965, "grad_norm": 0.38978394865989685, "learning_rate": 0.00012175164473684209, "loss": 0.5901, "step": 1716 }, { "epoch": 1.9048675634447372, "grad_norm": 0.3046998381614685, "learning_rate": 0.0001216282894736842, "loss": 0.4495, "step": 1717 }, { "epoch": 1.9059769796144779, "grad_norm": 0.31667011976242065, "learning_rate": 0.00012150493421052631, "loss": 0.4073, "step": 1718 }, { "epoch": 1.9070863957842186, "grad_norm": 0.3211687207221985, "learning_rate": 0.00012138157894736841, "loss": 0.4975, "step": 1719 }, { "epoch": 1.9081958119539593, "grad_norm": 0.2827535569667816, "learning_rate": 0.00012125822368421051, "loss": 0.4363, "step": 1720 }, { "epoch": 1.9093052281236997, "grad_norm": 0.28672489523887634, "learning_rate": 0.00012113486842105262, "loss": 0.5762, "step": 1721 }, { "epoch": 1.9104146442934407, "grad_norm": 0.3268757462501526, "learning_rate": 0.00012101151315789471, "loss": 0.4126, "step": 1722 }, { "epoch": 1.9115240604631811, "grad_norm": 0.3554566204547882, "learning_rate": 0.00012088815789473683, "loss": 0.5062, "step": 1723 }, { "epoch": 1.912633476632922, "grad_norm": 0.3198055326938629, "learning_rate": 0.00012076480263157894, "loss": 0.4958, "step": 1724 }, { "epoch": 1.9137428928026625, "grad_norm": 0.3657841980457306, "learning_rate": 0.00012064144736842103, "loss": 0.5931, "step": 1725 }, { "epoch": 1.9148523089724034, "grad_norm": 0.45995911955833435, "learning_rate": 0.00012051809210526314, "loss": 0.5862, "step": 1726 }, { "epoch": 1.915961725142144, "grad_norm": 0.4919174015522003, "learning_rate": 0.00012039473684210526, "loss": 0.452, "step": 1727 }, { "epoch": 1.9170711413118846, "grad_norm": 0.3233271539211273, "learning_rate": 0.00012027138157894737, "loss": 0.6559, "step": 1728 }, { "epoch": 1.9181805574816253, "grad_norm": 0.396419495344162, "learning_rate": 0.00012014802631578946, "loss": 0.7268, "step": 1729 }, { "epoch": 1.919289973651366, "grad_norm": 0.2332264930009842, "learning_rate": 0.00012002467105263157, "loss": 0.3874, "step": 1730 }, { "epoch": 1.9203993898211067, "grad_norm": 0.20889733731746674, "learning_rate": 0.00011990131578947366, "loss": 0.5648, "step": 1731 }, { "epoch": 1.9215088059908472, "grad_norm": 0.25143593549728394, "learning_rate": 0.00011977796052631578, "loss": 0.4837, "step": 1732 }, { "epoch": 1.922618222160588, "grad_norm": 0.2032875120639801, "learning_rate": 0.00011965460526315789, "loss": 0.4553, "step": 1733 }, { "epoch": 1.9237276383303286, "grad_norm": 0.2216006964445114, "learning_rate": 0.00011953125, "loss": 0.3256, "step": 1734 }, { "epoch": 1.9248370545000695, "grad_norm": 0.34091660380363464, "learning_rate": 0.00011940789473684209, "loss": 0.534, "step": 1735 }, { "epoch": 1.92594647066981, "grad_norm": 0.35081061720848083, "learning_rate": 0.0001192845394736842, "loss": 0.4549, "step": 1736 }, { "epoch": 1.9270558868395506, "grad_norm": 0.37153178453445435, "learning_rate": 0.00011916118421052632, "loss": 0.507, "step": 1737 }, { "epoch": 1.9281653030092913, "grad_norm": 0.3207988142967224, "learning_rate": 0.00011903782894736841, "loss": 0.6104, "step": 1738 }, { "epoch": 1.929274719179032, "grad_norm": 0.378360390663147, "learning_rate": 0.00011891447368421052, "loss": 0.5371, "step": 1739 }, { "epoch": 1.9303841353487727, "grad_norm": 0.3643793761730194, "learning_rate": 0.00011879111842105262, "loss": 0.506, "step": 1740 }, { "epoch": 1.9314935515185134, "grad_norm": 0.35685864090919495, "learning_rate": 0.00011866776315789471, "loss": 0.6205, "step": 1741 }, { "epoch": 1.9326029676882541, "grad_norm": 0.349833220243454, "learning_rate": 0.00011854440789473684, "loss": 0.587, "step": 1742 }, { "epoch": 1.9337123838579946, "grad_norm": 0.3674916923046112, "learning_rate": 0.00011842105263157894, "loss": 0.5616, "step": 1743 }, { "epoch": 1.9348218000277355, "grad_norm": 0.4197103679180145, "learning_rate": 0.00011829769736842104, "loss": 0.6715, "step": 1744 }, { "epoch": 1.935931216197476, "grad_norm": 0.2582911550998688, "learning_rate": 0.00011817434210526314, "loss": 0.3392, "step": 1745 }, { "epoch": 1.937040632367217, "grad_norm": 0.3199860751628876, "learning_rate": 0.00011805098684210526, "loss": 0.4765, "step": 1746 }, { "epoch": 1.9381500485369574, "grad_norm": 0.28448477387428284, "learning_rate": 0.00011792763157894737, "loss": 0.5821, "step": 1747 }, { "epoch": 1.939259464706698, "grad_norm": 0.4114968180656433, "learning_rate": 0.00011780427631578946, "loss": 0.48, "step": 1748 }, { "epoch": 1.9403688808764388, "grad_norm": 0.3065422773361206, "learning_rate": 0.00011768092105263157, "loss": 0.4095, "step": 1749 }, { "epoch": 1.9414782970461795, "grad_norm": 0.33260229229927063, "learning_rate": 0.00011755756578947366, "loss": 0.5081, "step": 1750 }, { "epoch": 1.9425877132159202, "grad_norm": 0.3044232130050659, "learning_rate": 0.00011743421052631578, "loss": 0.6083, "step": 1751 }, { "epoch": 1.9436971293856606, "grad_norm": 0.3583667278289795, "learning_rate": 0.00011731085526315789, "loss": 0.6418, "step": 1752 }, { "epoch": 1.9448065455554016, "grad_norm": 0.5549653172492981, "learning_rate": 0.0001171875, "loss": 0.8511, "step": 1753 }, { "epoch": 1.945915961725142, "grad_norm": 0.24391904473304749, "learning_rate": 0.00011706414473684209, "loss": 0.4204, "step": 1754 }, { "epoch": 1.947025377894883, "grad_norm": 0.4001742899417877, "learning_rate": 0.0001169407894736842, "loss": 0.4757, "step": 1755 }, { "epoch": 1.9481347940646234, "grad_norm": 0.49259909987449646, "learning_rate": 0.00011681743421052632, "loss": 0.4522, "step": 1756 }, { "epoch": 1.9492442102343641, "grad_norm": 0.3102129399776459, "learning_rate": 0.00011669407894736841, "loss": 0.3831, "step": 1757 }, { "epoch": 1.9503536264041048, "grad_norm": 0.2639727294445038, "learning_rate": 0.00011657072368421052, "loss": 0.6364, "step": 1758 }, { "epoch": 1.9514630425738455, "grad_norm": 0.31318995356559753, "learning_rate": 0.00011644736842105262, "loss": 0.5422, "step": 1759 }, { "epoch": 1.9525724587435862, "grad_norm": 0.2877756357192993, "learning_rate": 0.00011632401315789472, "loss": 0.3984, "step": 1760 }, { "epoch": 1.953681874913327, "grad_norm": 0.36178058385849, "learning_rate": 0.00011620065789473684, "loss": 0.6806, "step": 1761 }, { "epoch": 1.9547912910830676, "grad_norm": 0.36867088079452515, "learning_rate": 0.00011607730263157894, "loss": 0.529, "step": 1762 }, { "epoch": 1.955900707252808, "grad_norm": 0.25498855113983154, "learning_rate": 0.00011595394736842104, "loss": 0.6411, "step": 1763 }, { "epoch": 1.957010123422549, "grad_norm": 0.26043468713760376, "learning_rate": 0.00011583059210526314, "loss": 0.5102, "step": 1764 }, { "epoch": 1.9581195395922895, "grad_norm": 0.40660566091537476, "learning_rate": 0.00011570723684210526, "loss": 0.5582, "step": 1765 }, { "epoch": 1.9592289557620304, "grad_norm": 0.4207366406917572, "learning_rate": 0.00011558388157894736, "loss": 0.7263, "step": 1766 }, { "epoch": 1.9603383719317709, "grad_norm": 0.35944870114326477, "learning_rate": 0.00011546052631578946, "loss": 0.4644, "step": 1767 }, { "epoch": 1.9614477881015115, "grad_norm": 0.2992507219314575, "learning_rate": 0.00011533717105263157, "loss": 0.7683, "step": 1768 }, { "epoch": 1.9625572042712522, "grad_norm": 0.3475952744483948, "learning_rate": 0.00011521381578947366, "loss": 0.6288, "step": 1769 }, { "epoch": 1.963666620440993, "grad_norm": 0.29175207018852234, "learning_rate": 0.00011509046052631578, "loss": 0.355, "step": 1770 }, { "epoch": 1.9647760366107336, "grad_norm": 0.3024480640888214, "learning_rate": 0.00011496710526315789, "loss": 0.5916, "step": 1771 }, { "epoch": 1.9658854527804743, "grad_norm": 0.310245543718338, "learning_rate": 0.00011484375, "loss": 0.6658, "step": 1772 }, { "epoch": 1.966994868950215, "grad_norm": 0.2858862578868866, "learning_rate": 0.00011472039473684209, "loss": 0.5808, "step": 1773 }, { "epoch": 1.9681042851199555, "grad_norm": 0.19843228161334991, "learning_rate": 0.0001145970394736842, "loss": 0.3837, "step": 1774 }, { "epoch": 1.9692137012896964, "grad_norm": 0.37114304304122925, "learning_rate": 0.00011447368421052632, "loss": 0.7252, "step": 1775 }, { "epoch": 1.970323117459437, "grad_norm": 0.3807290196418762, "learning_rate": 0.00011435032894736841, "loss": 0.4429, "step": 1776 }, { "epoch": 1.9714325336291778, "grad_norm": 0.2850121557712555, "learning_rate": 0.00011422697368421052, "loss": 0.3464, "step": 1777 }, { "epoch": 1.9725419497989183, "grad_norm": 0.34873002767562866, "learning_rate": 0.00011410361842105262, "loss": 0.4814, "step": 1778 }, { "epoch": 1.973651365968659, "grad_norm": 0.42871007323265076, "learning_rate": 0.00011398026315789472, "loss": 0.4912, "step": 1779 }, { "epoch": 1.9747607821383997, "grad_norm": 0.3286532163619995, "learning_rate": 0.00011385690789473684, "loss": 0.4862, "step": 1780 }, { "epoch": 1.9758701983081404, "grad_norm": 0.3135276436805725, "learning_rate": 0.00011373355263157894, "loss": 0.3872, "step": 1781 }, { "epoch": 1.976979614477881, "grad_norm": 0.37062501907348633, "learning_rate": 0.00011361019736842104, "loss": 0.5004, "step": 1782 }, { "epoch": 1.9780890306476215, "grad_norm": 0.28763630986213684, "learning_rate": 0.00011348684210526314, "loss": 0.5367, "step": 1783 }, { "epoch": 1.9791984468173625, "grad_norm": 0.34978562593460083, "learning_rate": 0.00011336348684210526, "loss": 0.5041, "step": 1784 }, { "epoch": 1.980307862987103, "grad_norm": 0.2940448820590973, "learning_rate": 0.00011324013157894736, "loss": 0.5121, "step": 1785 }, { "epoch": 1.9814172791568438, "grad_norm": 0.24150650203227997, "learning_rate": 0.00011311677631578946, "loss": 0.4354, "step": 1786 }, { "epoch": 1.9825266953265843, "grad_norm": 0.24752016365528107, "learning_rate": 0.00011299342105263157, "loss": 0.3303, "step": 1787 }, { "epoch": 1.983636111496325, "grad_norm": 0.2988849878311157, "learning_rate": 0.00011287006578947366, "loss": 0.5873, "step": 1788 }, { "epoch": 1.9847455276660657, "grad_norm": 0.548851490020752, "learning_rate": 0.00011274671052631578, "loss": 0.6516, "step": 1789 }, { "epoch": 1.9858549438358064, "grad_norm": 0.3005162477493286, "learning_rate": 0.00011262335526315789, "loss": 0.4203, "step": 1790 }, { "epoch": 1.986964360005547, "grad_norm": 0.3434782922267914, "learning_rate": 0.0001125, "loss": 0.6023, "step": 1791 }, { "epoch": 1.9880737761752878, "grad_norm": 0.27085399627685547, "learning_rate": 0.00011237664473684209, "loss": 0.4336, "step": 1792 }, { "epoch": 1.9891831923450285, "grad_norm": 0.24659699201583862, "learning_rate": 0.0001122532894736842, "loss": 0.4351, "step": 1793 }, { "epoch": 1.990292608514769, "grad_norm": 0.2878054976463318, "learning_rate": 0.00011212993421052632, "loss": 0.3109, "step": 1794 }, { "epoch": 1.9914020246845099, "grad_norm": 0.2754107117652893, "learning_rate": 0.00011200657894736841, "loss": 0.5065, "step": 1795 }, { "epoch": 1.9925114408542504, "grad_norm": 0.31422141194343567, "learning_rate": 0.00011188322368421052, "loss": 0.5294, "step": 1796 }, { "epoch": 1.9936208570239913, "grad_norm": 0.2437220960855484, "learning_rate": 0.00011175986842105262, "loss": 0.4636, "step": 1797 }, { "epoch": 1.9947302731937318, "grad_norm": 0.3113705515861511, "learning_rate": 0.00011163651315789472, "loss": 0.4084, "step": 1798 }, { "epoch": 1.9958396893634724, "grad_norm": 0.2959713935852051, "learning_rate": 0.00011151315789473684, "loss": 0.6161, "step": 1799 }, { "epoch": 1.9969491055332131, "grad_norm": 0.29905256628990173, "learning_rate": 0.00011138980263157894, "loss": 0.3853, "step": 1800 }, { "epoch": 1.9980585217029538, "grad_norm": 0.3135545551776886, "learning_rate": 0.00011126644736842104, "loss": 0.5653, "step": 1801 }, { "epoch": 1.9991679378726945, "grad_norm": 0.3632647395133972, "learning_rate": 0.00011114309210526314, "loss": 0.3835, "step": 1802 }, { "epoch": 2.000277354042435, "grad_norm": 0.3683667480945587, "learning_rate": 0.00011101973684210526, "loss": 0.4497, "step": 1803 }, { "epoch": 2.001386770212176, "grad_norm": 0.26978781819343567, "learning_rate": 0.00011089638157894736, "loss": 0.584, "step": 1804 }, { "epoch": 2.0024961863819164, "grad_norm": 0.2260834127664566, "learning_rate": 0.00011077302631578946, "loss": 0.5175, "step": 1805 }, { "epoch": 2.0036056025516573, "grad_norm": 0.2791745364665985, "learning_rate": 0.00011064967105263157, "loss": 0.5489, "step": 1806 }, { "epoch": 2.004715018721398, "grad_norm": 0.4569042921066284, "learning_rate": 0.00011052631578947366, "loss": 0.4872, "step": 1807 }, { "epoch": 2.0058244348911387, "grad_norm": 0.2634184956550598, "learning_rate": 0.00011040296052631578, "loss": 0.4137, "step": 1808 }, { "epoch": 2.006933851060879, "grad_norm": 0.3725602626800537, "learning_rate": 0.00011027960526315789, "loss": 0.3663, "step": 1809 }, { "epoch": 2.00804326723062, "grad_norm": 0.19589465856552124, "learning_rate": 0.00011015624999999998, "loss": 0.3241, "step": 1810 }, { "epoch": 2.0091526834003606, "grad_norm": 0.2446906864643097, "learning_rate": 0.00011003289473684209, "loss": 0.3818, "step": 1811 }, { "epoch": 2.010262099570101, "grad_norm": 0.2932548224925995, "learning_rate": 0.0001099095394736842, "loss": 0.4009, "step": 1812 }, { "epoch": 2.011371515739842, "grad_norm": 0.23010744154453278, "learning_rate": 0.00010978618421052632, "loss": 0.5868, "step": 1813 }, { "epoch": 2.0124809319095824, "grad_norm": 0.24582666158676147, "learning_rate": 0.00010966282894736841, "loss": 0.2866, "step": 1814 }, { "epoch": 2.0135903480793234, "grad_norm": 0.2688146233558655, "learning_rate": 0.00010953947368421052, "loss": 0.3321, "step": 1815 }, { "epoch": 2.014699764249064, "grad_norm": 0.35448578000068665, "learning_rate": 0.00010941611842105262, "loss": 0.5611, "step": 1816 }, { "epoch": 2.0158091804188047, "grad_norm": 0.3180113732814789, "learning_rate": 0.00010929276315789472, "loss": 0.3527, "step": 1817 }, { "epoch": 2.016918596588545, "grad_norm": 0.27800217270851135, "learning_rate": 0.00010916940789473684, "loss": 0.339, "step": 1818 }, { "epoch": 2.018028012758286, "grad_norm": 0.34227412939071655, "learning_rate": 0.00010904605263157894, "loss": 0.4474, "step": 1819 }, { "epoch": 2.0191374289280266, "grad_norm": 0.3180390000343323, "learning_rate": 0.00010892269736842104, "loss": 0.5412, "step": 1820 }, { "epoch": 2.020246845097767, "grad_norm": 0.4531157314777374, "learning_rate": 0.00010879934210526314, "loss": 0.4999, "step": 1821 }, { "epoch": 2.021356261267508, "grad_norm": 0.3139798045158386, "learning_rate": 0.00010867598684210526, "loss": 0.3725, "step": 1822 }, { "epoch": 2.0224656774372485, "grad_norm": 0.2892252206802368, "learning_rate": 0.00010855263157894736, "loss": 0.462, "step": 1823 }, { "epoch": 2.0235750936069894, "grad_norm": 0.34606751799583435, "learning_rate": 0.00010842927631578946, "loss": 0.2938, "step": 1824 }, { "epoch": 2.02468450977673, "grad_norm": 0.3713940680027008, "learning_rate": 0.00010830592105263157, "loss": 0.3484, "step": 1825 }, { "epoch": 2.025793925946471, "grad_norm": 0.2926501929759979, "learning_rate": 0.00010818256578947366, "loss": 0.3333, "step": 1826 }, { "epoch": 2.0269033421162113, "grad_norm": 0.29994428157806396, "learning_rate": 0.00010805921052631578, "loss": 0.5876, "step": 1827 }, { "epoch": 2.028012758285952, "grad_norm": 0.26852184534072876, "learning_rate": 0.00010793585526315789, "loss": 0.3795, "step": 1828 }, { "epoch": 2.0291221744556927, "grad_norm": 0.3186289072036743, "learning_rate": 0.00010781249999999998, "loss": 0.4261, "step": 1829 }, { "epoch": 2.0302315906254336, "grad_norm": 0.22164680063724518, "learning_rate": 0.00010768914473684209, "loss": 0.4383, "step": 1830 }, { "epoch": 2.031341006795174, "grad_norm": 0.4684840142726898, "learning_rate": 0.0001075657894736842, "loss": 0.4621, "step": 1831 }, { "epoch": 2.0324504229649145, "grad_norm": 0.27373453974723816, "learning_rate": 0.00010744243421052632, "loss": 0.4259, "step": 1832 }, { "epoch": 2.0335598391346554, "grad_norm": 0.3046364188194275, "learning_rate": 0.00010731907894736841, "loss": 0.5675, "step": 1833 }, { "epoch": 2.034669255304396, "grad_norm": 0.2961323857307434, "learning_rate": 0.00010719572368421052, "loss": 0.3838, "step": 1834 }, { "epoch": 2.035778671474137, "grad_norm": 0.3641231656074524, "learning_rate": 0.00010707236842105261, "loss": 0.2031, "step": 1835 }, { "epoch": 2.0368880876438773, "grad_norm": 0.38065147399902344, "learning_rate": 0.00010694901315789472, "loss": 0.3094, "step": 1836 }, { "epoch": 2.037997503813618, "grad_norm": 0.3846987783908844, "learning_rate": 0.00010682565789473684, "loss": 0.4203, "step": 1837 }, { "epoch": 2.0391069199833587, "grad_norm": 0.2999848425388336, "learning_rate": 0.00010670230263157895, "loss": 0.3044, "step": 1838 }, { "epoch": 2.0402163361530996, "grad_norm": 0.4001493453979492, "learning_rate": 0.00010657894736842104, "loss": 0.3758, "step": 1839 }, { "epoch": 2.04132575232284, "grad_norm": 0.42989227175712585, "learning_rate": 0.00010645559210526315, "loss": 0.4854, "step": 1840 }, { "epoch": 2.042435168492581, "grad_norm": 0.3566846549510956, "learning_rate": 0.00010633223684210527, "loss": 0.4105, "step": 1841 }, { "epoch": 2.0435445846623215, "grad_norm": 0.41669943928718567, "learning_rate": 0.00010620888157894736, "loss": 0.4248, "step": 1842 }, { "epoch": 2.044654000832062, "grad_norm": 0.31254488229751587, "learning_rate": 0.00010608552631578947, "loss": 0.3155, "step": 1843 }, { "epoch": 2.045763417001803, "grad_norm": 0.2741456925868988, "learning_rate": 0.00010596217105263157, "loss": 0.4663, "step": 1844 }, { "epoch": 2.0468728331715433, "grad_norm": 0.40784788131713867, "learning_rate": 0.00010583881578947367, "loss": 0.2822, "step": 1845 }, { "epoch": 2.0479822493412843, "grad_norm": 0.3757185935974121, "learning_rate": 0.00010571546052631579, "loss": 0.4443, "step": 1846 }, { "epoch": 2.0490916655110247, "grad_norm": 0.38732078671455383, "learning_rate": 0.00010559210526315789, "loss": 0.4025, "step": 1847 }, { "epoch": 2.0502010816807656, "grad_norm": 0.34661343693733215, "learning_rate": 0.00010546874999999999, "loss": 0.3945, "step": 1848 }, { "epoch": 2.051310497850506, "grad_norm": 0.41781237721443176, "learning_rate": 0.00010534539473684209, "loss": 0.3046, "step": 1849 }, { "epoch": 2.052419914020247, "grad_norm": 0.3018251955509186, "learning_rate": 0.0001052220394736842, "loss": 0.3751, "step": 1850 }, { "epoch": 2.0535293301899875, "grad_norm": 0.2182953655719757, "learning_rate": 0.00010509868421052632, "loss": 0.4354, "step": 1851 }, { "epoch": 2.054638746359728, "grad_norm": 0.48397496342658997, "learning_rate": 0.00010497532894736841, "loss": 0.362, "step": 1852 }, { "epoch": 2.055748162529469, "grad_norm": 0.3845345079898834, "learning_rate": 0.00010485197368421052, "loss": 0.3564, "step": 1853 }, { "epoch": 2.0568575786992094, "grad_norm": 0.2810097932815552, "learning_rate": 0.00010472861842105261, "loss": 0.2759, "step": 1854 }, { "epoch": 2.0579669948689503, "grad_norm": 0.27831992506980896, "learning_rate": 0.00010460526315789472, "loss": 0.4981, "step": 1855 }, { "epoch": 2.0590764110386908, "grad_norm": 0.48267292976379395, "learning_rate": 0.00010448190789473684, "loss": 0.5129, "step": 1856 }, { "epoch": 2.0601858272084317, "grad_norm": 0.3351428508758545, "learning_rate": 0.00010435855263157895, "loss": 0.4921, "step": 1857 }, { "epoch": 2.061295243378172, "grad_norm": 0.3631199598312378, "learning_rate": 0.00010423519736842104, "loss": 0.3983, "step": 1858 }, { "epoch": 2.062404659547913, "grad_norm": 0.369219571352005, "learning_rate": 0.00010411184210526315, "loss": 0.4716, "step": 1859 }, { "epoch": 2.0635140757176536, "grad_norm": 0.43210768699645996, "learning_rate": 0.00010398848684210527, "loss": 0.3501, "step": 1860 }, { "epoch": 2.0646234918873945, "grad_norm": 0.41098493337631226, "learning_rate": 0.00010386513157894736, "loss": 0.4026, "step": 1861 }, { "epoch": 2.065732908057135, "grad_norm": 0.36239397525787354, "learning_rate": 0.00010374177631578947, "loss": 0.297, "step": 1862 }, { "epoch": 2.0668423242268754, "grad_norm": 0.41763097047805786, "learning_rate": 0.00010361842105263157, "loss": 0.3034, "step": 1863 }, { "epoch": 2.0679517403966163, "grad_norm": 0.37006324529647827, "learning_rate": 0.00010349506578947367, "loss": 0.5594, "step": 1864 }, { "epoch": 2.069061156566357, "grad_norm": 0.4518885612487793, "learning_rate": 0.00010337171052631579, "loss": 0.5642, "step": 1865 }, { "epoch": 2.0701705727360977, "grad_norm": 0.3855383098125458, "learning_rate": 0.00010324835526315789, "loss": 0.2811, "step": 1866 }, { "epoch": 2.071279988905838, "grad_norm": 0.3048069477081299, "learning_rate": 0.00010312499999999999, "loss": 0.3574, "step": 1867 }, { "epoch": 2.072389405075579, "grad_norm": 0.28566887974739075, "learning_rate": 0.00010300164473684209, "loss": 0.2174, "step": 1868 }, { "epoch": 2.0734988212453196, "grad_norm": 0.394229918718338, "learning_rate": 0.0001028782894736842, "loss": 0.3636, "step": 1869 }, { "epoch": 2.0746082374150605, "grad_norm": 0.31521254777908325, "learning_rate": 0.00010275493421052632, "loss": 0.5267, "step": 1870 }, { "epoch": 2.075717653584801, "grad_norm": 0.3841816782951355, "learning_rate": 0.00010263157894736841, "loss": 0.4475, "step": 1871 }, { "epoch": 2.0768270697545415, "grad_norm": 0.3173518776893616, "learning_rate": 0.00010250822368421052, "loss": 0.3575, "step": 1872 }, { "epoch": 2.0779364859242824, "grad_norm": 0.4765770435333252, "learning_rate": 0.00010238486842105261, "loss": 0.3616, "step": 1873 }, { "epoch": 2.079045902094023, "grad_norm": 0.3841620683670044, "learning_rate": 0.00010226151315789472, "loss": 0.4769, "step": 1874 }, { "epoch": 2.0801553182637638, "grad_norm": 0.3756863474845886, "learning_rate": 0.00010213815789473684, "loss": 0.3619, "step": 1875 }, { "epoch": 2.0812647344335042, "grad_norm": 0.29783549904823303, "learning_rate": 0.00010201480263157895, "loss": 0.5892, "step": 1876 }, { "epoch": 2.082374150603245, "grad_norm": 0.3997184634208679, "learning_rate": 0.00010189144736842104, "loss": 0.34, "step": 1877 }, { "epoch": 2.0834835667729856, "grad_norm": 0.31789451837539673, "learning_rate": 0.00010176809210526315, "loss": 0.3102, "step": 1878 }, { "epoch": 2.0845929829427265, "grad_norm": 0.3776637017726898, "learning_rate": 0.00010164473684210527, "loss": 0.3004, "step": 1879 }, { "epoch": 2.085702399112467, "grad_norm": 0.3050936162471771, "learning_rate": 0.00010152138157894736, "loss": 0.3768, "step": 1880 }, { "epoch": 2.086811815282208, "grad_norm": 0.33904218673706055, "learning_rate": 0.00010139802631578947, "loss": 0.461, "step": 1881 }, { "epoch": 2.0879212314519484, "grad_norm": 0.5047959089279175, "learning_rate": 0.00010127467105263157, "loss": 0.4089, "step": 1882 }, { "epoch": 2.089030647621689, "grad_norm": 0.3899175822734833, "learning_rate": 0.00010115131578947367, "loss": 0.5338, "step": 1883 }, { "epoch": 2.09014006379143, "grad_norm": 0.5013115406036377, "learning_rate": 0.00010102796052631579, "loss": 0.4203, "step": 1884 }, { "epoch": 2.0912494799611703, "grad_norm": 0.2986677289009094, "learning_rate": 0.0001009046052631579, "loss": 0.3183, "step": 1885 }, { "epoch": 2.092358896130911, "grad_norm": 0.26325130462646484, "learning_rate": 0.00010078124999999999, "loss": 0.3204, "step": 1886 }, { "epoch": 2.0934683123006517, "grad_norm": 0.333397775888443, "learning_rate": 0.0001006578947368421, "loss": 0.292, "step": 1887 }, { "epoch": 2.0945777284703926, "grad_norm": 0.40086430311203003, "learning_rate": 0.0001005345394736842, "loss": 0.3311, "step": 1888 }, { "epoch": 2.095687144640133, "grad_norm": 0.3059875965118408, "learning_rate": 0.0001004111842105263, "loss": 0.3572, "step": 1889 }, { "epoch": 2.096796560809874, "grad_norm": 0.40417563915252686, "learning_rate": 0.00010028782894736841, "loss": 0.3889, "step": 1890 }, { "epoch": 2.0979059769796145, "grad_norm": 0.45205605030059814, "learning_rate": 0.00010016447368421052, "loss": 0.5045, "step": 1891 }, { "epoch": 2.0990153931493554, "grad_norm": 0.28313323855400085, "learning_rate": 0.00010004111842105261, "loss": 0.4171, "step": 1892 }, { "epoch": 2.100124809319096, "grad_norm": 0.41001975536346436, "learning_rate": 9.991776315789472e-05, "loss": 0.2781, "step": 1893 }, { "epoch": 2.1012342254888363, "grad_norm": 0.2708085775375366, "learning_rate": 9.979440789473684e-05, "loss": 0.3806, "step": 1894 }, { "epoch": 2.1023436416585772, "grad_norm": 0.3741215467453003, "learning_rate": 9.967105263157895e-05, "loss": 0.2874, "step": 1895 }, { "epoch": 2.1034530578283177, "grad_norm": 0.2658732831478119, "learning_rate": 9.954769736842104e-05, "loss": 0.3757, "step": 1896 }, { "epoch": 2.1045624739980586, "grad_norm": 0.3492313623428345, "learning_rate": 9.942434210526315e-05, "loss": 0.4064, "step": 1897 }, { "epoch": 2.105671890167799, "grad_norm": 0.34106922149658203, "learning_rate": 9.930098684210527e-05, "loss": 0.2377, "step": 1898 }, { "epoch": 2.10678130633754, "grad_norm": 0.3154791593551636, "learning_rate": 9.917763157894736e-05, "loss": 0.4634, "step": 1899 }, { "epoch": 2.1078907225072805, "grad_norm": 0.3917715549468994, "learning_rate": 9.905427631578947e-05, "loss": 0.4126, "step": 1900 }, { "epoch": 2.1090001386770214, "grad_norm": 0.32718661427497864, "learning_rate": 9.893092105263157e-05, "loss": 0.3976, "step": 1901 }, { "epoch": 2.110109554846762, "grad_norm": 0.2590242028236389, "learning_rate": 9.880756578947367e-05, "loss": 0.3127, "step": 1902 }, { "epoch": 2.1112189710165024, "grad_norm": 0.44344115257263184, "learning_rate": 9.868421052631579e-05, "loss": 0.3864, "step": 1903 }, { "epoch": 2.1123283871862433, "grad_norm": 0.425987184047699, "learning_rate": 9.85608552631579e-05, "loss": 0.4428, "step": 1904 }, { "epoch": 2.1134378033559837, "grad_norm": 0.5364298224449158, "learning_rate": 9.843749999999999e-05, "loss": 0.3975, "step": 1905 }, { "epoch": 2.1145472195257247, "grad_norm": 0.4158439338207245, "learning_rate": 9.83141447368421e-05, "loss": 0.499, "step": 1906 }, { "epoch": 2.115656635695465, "grad_norm": 0.30733615159988403, "learning_rate": 9.81907894736842e-05, "loss": 0.3568, "step": 1907 }, { "epoch": 2.116766051865206, "grad_norm": 0.2557796835899353, "learning_rate": 9.806743421052631e-05, "loss": 0.5255, "step": 1908 }, { "epoch": 2.1178754680349465, "grad_norm": 0.36079320311546326, "learning_rate": 9.794407894736841e-05, "loss": 0.4101, "step": 1909 }, { "epoch": 2.1189848842046874, "grad_norm": 0.3143673837184906, "learning_rate": 9.782072368421052e-05, "loss": 0.4313, "step": 1910 }, { "epoch": 2.120094300374428, "grad_norm": 0.44542989134788513, "learning_rate": 9.769736842105261e-05, "loss": 0.3331, "step": 1911 }, { "epoch": 2.121203716544169, "grad_norm": 0.28402179479599, "learning_rate": 9.757401315789472e-05, "loss": 0.3088, "step": 1912 }, { "epoch": 2.1223131327139093, "grad_norm": 0.4471670687198639, "learning_rate": 9.745065789473684e-05, "loss": 0.4129, "step": 1913 }, { "epoch": 2.12342254888365, "grad_norm": 0.29794514179229736, "learning_rate": 9.732730263157893e-05, "loss": 0.3326, "step": 1914 }, { "epoch": 2.1245319650533907, "grad_norm": 0.37658748030662537, "learning_rate": 9.720394736842104e-05, "loss": 0.3271, "step": 1915 }, { "epoch": 2.125641381223131, "grad_norm": 0.31780439615249634, "learning_rate": 9.708059210526315e-05, "loss": 0.3144, "step": 1916 }, { "epoch": 2.126750797392872, "grad_norm": 0.3516882359981537, "learning_rate": 9.695723684210527e-05, "loss": 0.1996, "step": 1917 }, { "epoch": 2.1278602135626126, "grad_norm": 0.3291231095790863, "learning_rate": 9.683388157894736e-05, "loss": 0.7135, "step": 1918 }, { "epoch": 2.1289696297323535, "grad_norm": 0.4481741189956665, "learning_rate": 9.671052631578947e-05, "loss": 0.2855, "step": 1919 }, { "epoch": 2.130079045902094, "grad_norm": 0.43044859170913696, "learning_rate": 9.658717105263157e-05, "loss": 0.4145, "step": 1920 }, { "epoch": 2.131188462071835, "grad_norm": 0.3299560546875, "learning_rate": 9.646381578947367e-05, "loss": 0.5771, "step": 1921 }, { "epoch": 2.1322978782415754, "grad_norm": 0.4290536940097809, "learning_rate": 9.634046052631579e-05, "loss": 0.492, "step": 1922 }, { "epoch": 2.133407294411316, "grad_norm": 0.344137966632843, "learning_rate": 9.62171052631579e-05, "loss": 0.3803, "step": 1923 }, { "epoch": 2.1345167105810567, "grad_norm": 0.511370062828064, "learning_rate": 9.609374999999999e-05, "loss": 0.3662, "step": 1924 }, { "epoch": 2.135626126750797, "grad_norm": 0.4554339349269867, "learning_rate": 9.59703947368421e-05, "loss": 0.3805, "step": 1925 }, { "epoch": 2.136735542920538, "grad_norm": 0.36867454648017883, "learning_rate": 9.58470394736842e-05, "loss": 0.4913, "step": 1926 }, { "epoch": 2.1378449590902786, "grad_norm": 0.36078619956970215, "learning_rate": 9.572368421052631e-05, "loss": 0.3083, "step": 1927 }, { "epoch": 2.1389543752600195, "grad_norm": 0.369831919670105, "learning_rate": 9.560032894736841e-05, "loss": 0.2677, "step": 1928 }, { "epoch": 2.14006379142976, "grad_norm": 0.4087238013744354, "learning_rate": 9.547697368421052e-05, "loss": 0.4678, "step": 1929 }, { "epoch": 2.141173207599501, "grad_norm": 0.5325888991355896, "learning_rate": 9.535361842105261e-05, "loss": 0.3679, "step": 1930 }, { "epoch": 2.1422826237692414, "grad_norm": 0.281781405210495, "learning_rate": 9.523026315789472e-05, "loss": 0.4423, "step": 1931 }, { "epoch": 2.1433920399389823, "grad_norm": 0.29980483651161194, "learning_rate": 9.510690789473684e-05, "loss": 0.3535, "step": 1932 }, { "epoch": 2.144501456108723, "grad_norm": 0.39666828513145447, "learning_rate": 9.498355263157893e-05, "loss": 0.5298, "step": 1933 }, { "epoch": 2.1456108722784633, "grad_norm": 0.4402129352092743, "learning_rate": 9.486019736842104e-05, "loss": 0.4866, "step": 1934 }, { "epoch": 2.146720288448204, "grad_norm": 0.4045298397541046, "learning_rate": 9.473684210526315e-05, "loss": 0.5098, "step": 1935 }, { "epoch": 2.1478297046179446, "grad_norm": 0.3659813404083252, "learning_rate": 9.461348684210527e-05, "loss": 0.3095, "step": 1936 }, { "epoch": 2.1489391207876856, "grad_norm": 0.3063139021396637, "learning_rate": 9.449013157894736e-05, "loss": 0.2183, "step": 1937 }, { "epoch": 2.150048536957426, "grad_norm": 0.4679979979991913, "learning_rate": 9.436677631578947e-05, "loss": 0.514, "step": 1938 }, { "epoch": 2.151157953127167, "grad_norm": 0.31606557965278625, "learning_rate": 9.424342105263156e-05, "loss": 0.4071, "step": 1939 }, { "epoch": 2.1522673692969074, "grad_norm": 0.3824010193347931, "learning_rate": 9.412006578947367e-05, "loss": 0.3155, "step": 1940 }, { "epoch": 2.1533767854666483, "grad_norm": 0.28109651803970337, "learning_rate": 9.399671052631579e-05, "loss": 0.344, "step": 1941 }, { "epoch": 2.154486201636389, "grad_norm": 0.3306637704372406, "learning_rate": 9.38733552631579e-05, "loss": 0.1947, "step": 1942 }, { "epoch": 2.1555956178061297, "grad_norm": 0.32935014367103577, "learning_rate": 9.374999999999999e-05, "loss": 0.2445, "step": 1943 }, { "epoch": 2.15670503397587, "grad_norm": 0.33338576555252075, "learning_rate": 9.36266447368421e-05, "loss": 0.3996, "step": 1944 }, { "epoch": 2.1578144501456107, "grad_norm": 0.25957128405570984, "learning_rate": 9.35032894736842e-05, "loss": 0.3473, "step": 1945 }, { "epoch": 2.1589238663153516, "grad_norm": 0.48169735074043274, "learning_rate": 9.337993421052631e-05, "loss": 0.329, "step": 1946 }, { "epoch": 2.160033282485092, "grad_norm": 0.4141751527786255, "learning_rate": 9.325657894736842e-05, "loss": 0.2981, "step": 1947 }, { "epoch": 2.161142698654833, "grad_norm": 0.4750854969024658, "learning_rate": 9.313322368421052e-05, "loss": 0.5787, "step": 1948 }, { "epoch": 2.1622521148245735, "grad_norm": 0.4501727819442749, "learning_rate": 9.300986842105262e-05, "loss": 0.2925, "step": 1949 }, { "epoch": 2.1633615309943144, "grad_norm": 0.5955410599708557, "learning_rate": 9.288651315789472e-05, "loss": 0.3785, "step": 1950 }, { "epoch": 2.164470947164055, "grad_norm": 0.36002209782600403, "learning_rate": 9.276315789473684e-05, "loss": 0.4342, "step": 1951 }, { "epoch": 2.1655803633337958, "grad_norm": 0.4657028615474701, "learning_rate": 9.263980263157894e-05, "loss": 0.3671, "step": 1952 }, { "epoch": 2.1666897795035363, "grad_norm": 0.7127841711044312, "learning_rate": 9.251644736842104e-05, "loss": 0.3303, "step": 1953 }, { "epoch": 2.167799195673277, "grad_norm": 0.4676034152507782, "learning_rate": 9.239309210526315e-05, "loss": 0.507, "step": 1954 }, { "epoch": 2.1689086118430176, "grad_norm": 0.6187905073165894, "learning_rate": 9.226973684210527e-05, "loss": 0.3498, "step": 1955 }, { "epoch": 2.170018028012758, "grad_norm": 0.4146270751953125, "learning_rate": 9.214638157894736e-05, "loss": 0.4633, "step": 1956 }, { "epoch": 2.171127444182499, "grad_norm": 0.3486730456352234, "learning_rate": 9.202302631578947e-05, "loss": 0.4834, "step": 1957 }, { "epoch": 2.1722368603522395, "grad_norm": 0.30160388350486755, "learning_rate": 9.189967105263156e-05, "loss": 0.4886, "step": 1958 }, { "epoch": 2.1733462765219804, "grad_norm": 0.571941077709198, "learning_rate": 9.177631578947367e-05, "loss": 0.6495, "step": 1959 }, { "epoch": 2.174455692691721, "grad_norm": 0.2943151295185089, "learning_rate": 9.165296052631579e-05, "loss": 0.4508, "step": 1960 }, { "epoch": 2.175565108861462, "grad_norm": 0.3195703625679016, "learning_rate": 9.15296052631579e-05, "loss": 0.4173, "step": 1961 }, { "epoch": 2.1766745250312023, "grad_norm": 0.3255450427532196, "learning_rate": 9.140624999999999e-05, "loss": 0.2631, "step": 1962 }, { "epoch": 2.177783941200943, "grad_norm": 0.34725460410118103, "learning_rate": 9.12828947368421e-05, "loss": 0.2789, "step": 1963 }, { "epoch": 2.1788933573706837, "grad_norm": 0.284347265958786, "learning_rate": 9.115953947368419e-05, "loss": 0.3208, "step": 1964 }, { "epoch": 2.180002773540424, "grad_norm": 0.3235912322998047, "learning_rate": 9.103618421052631e-05, "loss": 0.5076, "step": 1965 }, { "epoch": 2.181112189710165, "grad_norm": 0.37382572889328003, "learning_rate": 9.091282894736842e-05, "loss": 0.4009, "step": 1966 }, { "epoch": 2.1822216058799055, "grad_norm": 0.3380100727081299, "learning_rate": 9.078947368421052e-05, "loss": 0.3682, "step": 1967 }, { "epoch": 2.1833310220496465, "grad_norm": 0.36504095792770386, "learning_rate": 9.066611842105262e-05, "loss": 0.4735, "step": 1968 }, { "epoch": 2.184440438219387, "grad_norm": 0.44653430581092834, "learning_rate": 9.054276315789472e-05, "loss": 0.8669, "step": 1969 }, { "epoch": 2.185549854389128, "grad_norm": 0.38333860039711, "learning_rate": 9.041940789473684e-05, "loss": 0.2508, "step": 1970 }, { "epoch": 2.1866592705588683, "grad_norm": 0.413216233253479, "learning_rate": 9.029605263157894e-05, "loss": 0.5268, "step": 1971 }, { "epoch": 2.1877686867286092, "grad_norm": 0.4521336257457733, "learning_rate": 9.017269736842104e-05, "loss": 0.62, "step": 1972 }, { "epoch": 2.1888781028983497, "grad_norm": 0.39028438925743103, "learning_rate": 9.004934210526315e-05, "loss": 0.4885, "step": 1973 }, { "epoch": 2.18998751906809, "grad_norm": 0.4401836693286896, "learning_rate": 8.992598684210527e-05, "loss": 0.3722, "step": 1974 }, { "epoch": 2.191096935237831, "grad_norm": 0.4004587233066559, "learning_rate": 8.980263157894736e-05, "loss": 0.2697, "step": 1975 }, { "epoch": 2.1922063514075716, "grad_norm": 0.5189459919929504, "learning_rate": 8.967927631578947e-05, "loss": 0.4762, "step": 1976 }, { "epoch": 2.1933157675773125, "grad_norm": 0.5282573103904724, "learning_rate": 8.955592105263156e-05, "loss": 0.4813, "step": 1977 }, { "epoch": 2.194425183747053, "grad_norm": 0.3748975098133087, "learning_rate": 8.943256578947367e-05, "loss": 0.4131, "step": 1978 }, { "epoch": 2.195534599916794, "grad_norm": 0.3851288855075836, "learning_rate": 8.930921052631579e-05, "loss": 0.2714, "step": 1979 }, { "epoch": 2.1966440160865344, "grad_norm": 0.2807680368423462, "learning_rate": 8.91858552631579e-05, "loss": 0.4121, "step": 1980 }, { "epoch": 2.1977534322562753, "grad_norm": 0.42031747102737427, "learning_rate": 8.906249999999999e-05, "loss": 0.3362, "step": 1981 }, { "epoch": 2.1988628484260158, "grad_norm": 0.33740946650505066, "learning_rate": 8.89391447368421e-05, "loss": 0.2981, "step": 1982 }, { "epoch": 2.1999722645957567, "grad_norm": 0.27107852697372437, "learning_rate": 8.881578947368419e-05, "loss": 0.3691, "step": 1983 }, { "epoch": 2.201081680765497, "grad_norm": 0.36126452684402466, "learning_rate": 8.869243421052631e-05, "loss": 0.4095, "step": 1984 }, { "epoch": 2.2021910969352376, "grad_norm": 0.517387330532074, "learning_rate": 8.856907894736842e-05, "loss": 0.3078, "step": 1985 }, { "epoch": 2.2033005131049785, "grad_norm": 0.5038511157035828, "learning_rate": 8.844572368421052e-05, "loss": 0.2611, "step": 1986 }, { "epoch": 2.204409929274719, "grad_norm": 0.35804232954978943, "learning_rate": 8.832236842105262e-05, "loss": 0.37, "step": 1987 }, { "epoch": 2.20551934544446, "grad_norm": 0.47403684258461, "learning_rate": 8.819901315789472e-05, "loss": 0.4773, "step": 1988 }, { "epoch": 2.2066287616142004, "grad_norm": 0.38205355405807495, "learning_rate": 8.807565789473684e-05, "loss": 0.3112, "step": 1989 }, { "epoch": 2.2077381777839413, "grad_norm": 0.39398112893104553, "learning_rate": 8.795230263157894e-05, "loss": 0.4293, "step": 1990 }, { "epoch": 2.208847593953682, "grad_norm": 0.39993610978126526, "learning_rate": 8.782894736842104e-05, "loss": 0.5623, "step": 1991 }, { "epoch": 2.2099570101234227, "grad_norm": 0.4437258243560791, "learning_rate": 8.770559210526315e-05, "loss": 0.5582, "step": 1992 }, { "epoch": 2.211066426293163, "grad_norm": 0.3316510021686554, "learning_rate": 8.758223684210526e-05, "loss": 0.1147, "step": 1993 }, { "epoch": 2.212175842462904, "grad_norm": 0.4257866442203522, "learning_rate": 8.745888157894736e-05, "loss": 0.2057, "step": 1994 }, { "epoch": 2.2132852586326446, "grad_norm": 0.41767770051956177, "learning_rate": 8.733552631578947e-05, "loss": 0.5289, "step": 1995 }, { "epoch": 2.214394674802385, "grad_norm": 0.3871503472328186, "learning_rate": 8.721217105263156e-05, "loss": 0.4098, "step": 1996 }, { "epoch": 2.215504090972126, "grad_norm": 0.3213239014148712, "learning_rate": 8.708881578947367e-05, "loss": 0.3835, "step": 1997 }, { "epoch": 2.2166135071418664, "grad_norm": 0.5189967155456543, "learning_rate": 8.696546052631579e-05, "loss": 0.473, "step": 1998 }, { "epoch": 2.2177229233116074, "grad_norm": 0.36058536171913147, "learning_rate": 8.68421052631579e-05, "loss": 0.3896, "step": 1999 }, { "epoch": 2.218832339481348, "grad_norm": 0.5485463738441467, "learning_rate": 8.671874999999999e-05, "loss": 0.5556, "step": 2000 }, { "epoch": 2.2199417556510888, "grad_norm": 0.3734520971775055, "learning_rate": 8.65953947368421e-05, "loss": 0.42, "step": 2001 }, { "epoch": 2.2210511718208292, "grad_norm": 0.30071988701820374, "learning_rate": 8.647203947368419e-05, "loss": 0.4252, "step": 2002 }, { "epoch": 2.22216058799057, "grad_norm": 0.4292794167995453, "learning_rate": 8.634868421052631e-05, "loss": 0.2889, "step": 2003 }, { "epoch": 2.2232700041603106, "grad_norm": 0.31171557307243347, "learning_rate": 8.622532894736842e-05, "loss": 0.4064, "step": 2004 }, { "epoch": 2.2243794203300515, "grad_norm": 0.4342403709888458, "learning_rate": 8.610197368421052e-05, "loss": 0.3495, "step": 2005 }, { "epoch": 2.225488836499792, "grad_norm": 0.3274979591369629, "learning_rate": 8.597861842105262e-05, "loss": 0.2234, "step": 2006 }, { "epoch": 2.2265982526695325, "grad_norm": 0.3761701285839081, "learning_rate": 8.585526315789472e-05, "loss": 0.3346, "step": 2007 }, { "epoch": 2.2277076688392734, "grad_norm": 0.3312693238258362, "learning_rate": 8.573190789473684e-05, "loss": 0.4378, "step": 2008 }, { "epoch": 2.228817085009014, "grad_norm": 0.4094376862049103, "learning_rate": 8.560855263157894e-05, "loss": 0.3139, "step": 2009 }, { "epoch": 2.229926501178755, "grad_norm": 0.44111502170562744, "learning_rate": 8.548519736842104e-05, "loss": 0.5182, "step": 2010 }, { "epoch": 2.2310359173484953, "grad_norm": 0.40903040766716003, "learning_rate": 8.536184210526315e-05, "loss": 0.2903, "step": 2011 }, { "epoch": 2.232145333518236, "grad_norm": 0.38946643471717834, "learning_rate": 8.523848684210526e-05, "loss": 0.5439, "step": 2012 }, { "epoch": 2.2332547496879767, "grad_norm": 0.3774378001689911, "learning_rate": 8.511513157894736e-05, "loss": 0.4863, "step": 2013 }, { "epoch": 2.2343641658577176, "grad_norm": 0.2982938289642334, "learning_rate": 8.499177631578947e-05, "loss": 0.3545, "step": 2014 }, { "epoch": 2.235473582027458, "grad_norm": 0.24514225125312805, "learning_rate": 8.486842105263156e-05, "loss": 0.2306, "step": 2015 }, { "epoch": 2.2365829981971985, "grad_norm": 0.4611694812774658, "learning_rate": 8.474506578947367e-05, "loss": 0.3808, "step": 2016 }, { "epoch": 2.2376924143669394, "grad_norm": 0.43070465326309204, "learning_rate": 8.462171052631579e-05, "loss": 0.2849, "step": 2017 }, { "epoch": 2.23880183053668, "grad_norm": 0.45175713300704956, "learning_rate": 8.449835526315788e-05, "loss": 0.2813, "step": 2018 }, { "epoch": 2.239911246706421, "grad_norm": 0.4600198268890381, "learning_rate": 8.437499999999999e-05, "loss": 0.374, "step": 2019 }, { "epoch": 2.2410206628761613, "grad_norm": 0.8133832216262817, "learning_rate": 8.42516447368421e-05, "loss": 0.4012, "step": 2020 }, { "epoch": 2.2421300790459022, "grad_norm": 0.3792482316493988, "learning_rate": 8.412828947368419e-05, "loss": 0.3851, "step": 2021 }, { "epoch": 2.2432394952156427, "grad_norm": 0.38575461506843567, "learning_rate": 8.400493421052631e-05, "loss": 0.3673, "step": 2022 }, { "epoch": 2.2443489113853836, "grad_norm": 0.4712158143520355, "learning_rate": 8.388157894736842e-05, "loss": 0.5148, "step": 2023 }, { "epoch": 2.245458327555124, "grad_norm": 0.4250771105289459, "learning_rate": 8.375822368421052e-05, "loss": 0.3809, "step": 2024 }, { "epoch": 2.2465677437248646, "grad_norm": 0.3241025507450104, "learning_rate": 8.363486842105262e-05, "loss": 0.4474, "step": 2025 }, { "epoch": 2.2476771598946055, "grad_norm": 0.2825429141521454, "learning_rate": 8.351151315789472e-05, "loss": 0.3693, "step": 2026 }, { "epoch": 2.248786576064346, "grad_norm": 0.3415563106536865, "learning_rate": 8.338815789473684e-05, "loss": 0.3112, "step": 2027 }, { "epoch": 2.249895992234087, "grad_norm": 0.4205566346645355, "learning_rate": 8.326480263157894e-05, "loss": 0.4012, "step": 2028 }, { "epoch": 2.2510054084038273, "grad_norm": 0.39186304807662964, "learning_rate": 8.314144736842104e-05, "loss": 0.406, "step": 2029 }, { "epoch": 2.2521148245735683, "grad_norm": 0.44930022954940796, "learning_rate": 8.301809210526315e-05, "loss": 0.3568, "step": 2030 }, { "epoch": 2.2532242407433087, "grad_norm": 0.37317511439323425, "learning_rate": 8.289473684210526e-05, "loss": 0.3248, "step": 2031 }, { "epoch": 2.2543336569130497, "grad_norm": 0.4311521053314209, "learning_rate": 8.277138157894736e-05, "loss": 0.4463, "step": 2032 }, { "epoch": 2.25544307308279, "grad_norm": 0.29921552538871765, "learning_rate": 8.264802631578947e-05, "loss": 0.2457, "step": 2033 }, { "epoch": 2.256552489252531, "grad_norm": 0.3454459011554718, "learning_rate": 8.252467105263156e-05, "loss": 0.505, "step": 2034 }, { "epoch": 2.2576619054222715, "grad_norm": 0.38671955466270447, "learning_rate": 8.240131578947367e-05, "loss": 0.3366, "step": 2035 }, { "epoch": 2.258771321592012, "grad_norm": 0.3811526596546173, "learning_rate": 8.227796052631579e-05, "loss": 0.4974, "step": 2036 }, { "epoch": 2.259880737761753, "grad_norm": 0.3260783851146698, "learning_rate": 8.215460526315788e-05, "loss": 0.2986, "step": 2037 }, { "epoch": 2.2609901539314934, "grad_norm": 0.3846670091152191, "learning_rate": 8.203124999999999e-05, "loss": 0.3801, "step": 2038 }, { "epoch": 2.2620995701012343, "grad_norm": 0.38990718126296997, "learning_rate": 8.19078947368421e-05, "loss": 0.4151, "step": 2039 }, { "epoch": 2.2632089862709748, "grad_norm": 0.36695122718811035, "learning_rate": 8.178453947368419e-05, "loss": 0.4518, "step": 2040 }, { "epoch": 2.2643184024407157, "grad_norm": 0.33559417724609375, "learning_rate": 8.166118421052631e-05, "loss": 0.3286, "step": 2041 }, { "epoch": 2.265427818610456, "grad_norm": 0.3156730830669403, "learning_rate": 8.153782894736842e-05, "loss": 0.3759, "step": 2042 }, { "epoch": 2.266537234780197, "grad_norm": 0.4046294391155243, "learning_rate": 8.141447368421051e-05, "loss": 0.3196, "step": 2043 }, { "epoch": 2.2676466509499376, "grad_norm": 0.24606218934059143, "learning_rate": 8.129111842105262e-05, "loss": 0.2994, "step": 2044 }, { "epoch": 2.2687560671196785, "grad_norm": 0.2781525254249573, "learning_rate": 8.116776315789473e-05, "loss": 0.4763, "step": 2045 }, { "epoch": 2.269865483289419, "grad_norm": 0.3566399812698364, "learning_rate": 8.104440789473685e-05, "loss": 0.2819, "step": 2046 }, { "epoch": 2.2709748994591594, "grad_norm": 0.36124187707901, "learning_rate": 8.092105263157894e-05, "loss": 0.4515, "step": 2047 }, { "epoch": 2.2720843156289003, "grad_norm": 0.42289498448371887, "learning_rate": 8.079769736842105e-05, "loss": 0.2499, "step": 2048 }, { "epoch": 2.273193731798641, "grad_norm": 0.38013771176338196, "learning_rate": 8.067434210526315e-05, "loss": 0.2947, "step": 2049 }, { "epoch": 2.2743031479683817, "grad_norm": 0.44941648840904236, "learning_rate": 8.055098684210526e-05, "loss": 0.5397, "step": 2050 }, { "epoch": 2.275412564138122, "grad_norm": 0.4005190432071686, "learning_rate": 8.042763157894737e-05, "loss": 0.3519, "step": 2051 }, { "epoch": 2.276521980307863, "grad_norm": 0.4730212986469269, "learning_rate": 8.030427631578947e-05, "loss": 0.3143, "step": 2052 }, { "epoch": 2.2776313964776036, "grad_norm": 0.249730184674263, "learning_rate": 8.018092105263157e-05, "loss": 0.288, "step": 2053 }, { "epoch": 2.2787408126473445, "grad_norm": 0.4012918174266815, "learning_rate": 8.005756578947367e-05, "loss": 0.3855, "step": 2054 }, { "epoch": 2.279850228817085, "grad_norm": 0.40905871987342834, "learning_rate": 7.993421052631579e-05, "loss": 0.3615, "step": 2055 }, { "epoch": 2.280959644986826, "grad_norm": 0.34130680561065674, "learning_rate": 7.981085526315789e-05, "loss": 0.4656, "step": 2056 }, { "epoch": 2.2820690611565664, "grad_norm": 0.5201243162155151, "learning_rate": 7.968749999999999e-05, "loss": 0.5578, "step": 2057 }, { "epoch": 2.283178477326307, "grad_norm": 0.3649352490901947, "learning_rate": 7.95641447368421e-05, "loss": 0.5485, "step": 2058 }, { "epoch": 2.2842878934960478, "grad_norm": 0.3599608242511749, "learning_rate": 7.944078947368419e-05, "loss": 0.384, "step": 2059 }, { "epoch": 2.2853973096657882, "grad_norm": 0.35313576459884644, "learning_rate": 7.931743421052631e-05, "loss": 0.3998, "step": 2060 }, { "epoch": 2.286506725835529, "grad_norm": 0.3734196424484253, "learning_rate": 7.919407894736842e-05, "loss": 0.3514, "step": 2061 }, { "epoch": 2.2876161420052696, "grad_norm": 0.4057319462299347, "learning_rate": 7.907072368421051e-05, "loss": 0.5574, "step": 2062 }, { "epoch": 2.2887255581750106, "grad_norm": 0.3745683431625366, "learning_rate": 7.894736842105262e-05, "loss": 0.4718, "step": 2063 }, { "epoch": 2.289834974344751, "grad_norm": 0.2819893956184387, "learning_rate": 7.882401315789473e-05, "loss": 0.3068, "step": 2064 }, { "epoch": 2.2909443905144915, "grad_norm": 0.35861438512802124, "learning_rate": 7.870065789473685e-05, "loss": 0.5004, "step": 2065 }, { "epoch": 2.2920538066842324, "grad_norm": 0.3276369273662567, "learning_rate": 7.857730263157894e-05, "loss": 0.4357, "step": 2066 }, { "epoch": 2.2931632228539733, "grad_norm": 0.6049783229827881, "learning_rate": 7.845394736842105e-05, "loss": 0.2697, "step": 2067 }, { "epoch": 2.294272639023714, "grad_norm": 0.3910931348800659, "learning_rate": 7.833059210526314e-05, "loss": 0.4302, "step": 2068 }, { "epoch": 2.2953820551934543, "grad_norm": 0.5576995611190796, "learning_rate": 7.820723684210526e-05, "loss": 0.3253, "step": 2069 }, { "epoch": 2.296491471363195, "grad_norm": 0.43437716364860535, "learning_rate": 7.808388157894737e-05, "loss": 0.5236, "step": 2070 }, { "epoch": 2.2976008875329357, "grad_norm": 0.3469353914260864, "learning_rate": 7.796052631578947e-05, "loss": 0.391, "step": 2071 }, { "epoch": 2.2987103037026766, "grad_norm": 0.5261640548706055, "learning_rate": 7.783717105263157e-05, "loss": 0.3683, "step": 2072 }, { "epoch": 2.299819719872417, "grad_norm": 0.4726692736148834, "learning_rate": 7.771381578947367e-05, "loss": 0.4087, "step": 2073 }, { "epoch": 2.300929136042158, "grad_norm": 0.3760102391242981, "learning_rate": 7.759046052631579e-05, "loss": 0.424, "step": 2074 }, { "epoch": 2.3020385522118985, "grad_norm": 0.38098374009132385, "learning_rate": 7.746710526315789e-05, "loss": 0.3709, "step": 2075 }, { "epoch": 2.303147968381639, "grad_norm": 0.42381149530410767, "learning_rate": 7.734374999999999e-05, "loss": 0.5073, "step": 2076 }, { "epoch": 2.30425738455138, "grad_norm": 0.27536872029304504, "learning_rate": 7.72203947368421e-05, "loss": 0.339, "step": 2077 }, { "epoch": 2.3053668007211203, "grad_norm": 0.7497661113739014, "learning_rate": 7.709703947368419e-05, "loss": 0.4369, "step": 2078 }, { "epoch": 2.3064762168908612, "grad_norm": 0.3368166387081146, "learning_rate": 7.697368421052631e-05, "loss": 0.4125, "step": 2079 }, { "epoch": 2.3075856330606017, "grad_norm": 0.43875083327293396, "learning_rate": 7.685032894736842e-05, "loss": 0.2639, "step": 2080 }, { "epoch": 2.3086950492303426, "grad_norm": 0.44184234738349915, "learning_rate": 7.672697368421051e-05, "loss": 0.3158, "step": 2081 }, { "epoch": 2.309804465400083, "grad_norm": 0.3724750280380249, "learning_rate": 7.660361842105262e-05, "loss": 0.4184, "step": 2082 }, { "epoch": 2.310913881569824, "grad_norm": 0.37243518233299255, "learning_rate": 7.648026315789473e-05, "loss": 0.3311, "step": 2083 }, { "epoch": 2.3120232977395645, "grad_norm": 0.31451210379600525, "learning_rate": 7.635690789473685e-05, "loss": 0.4138, "step": 2084 }, { "epoch": 2.3131327139093054, "grad_norm": 0.5280860066413879, "learning_rate": 7.623355263157894e-05, "loss": 0.3653, "step": 2085 }, { "epoch": 2.314242130079046, "grad_norm": 0.4472182095050812, "learning_rate": 7.611019736842105e-05, "loss": 0.3394, "step": 2086 }, { "epoch": 2.3153515462487864, "grad_norm": 0.39410504698753357, "learning_rate": 7.598684210526314e-05, "loss": 0.4175, "step": 2087 }, { "epoch": 2.3164609624185273, "grad_norm": 0.30498364567756653, "learning_rate": 7.586348684210526e-05, "loss": 0.2771, "step": 2088 }, { "epoch": 2.3175703785882678, "grad_norm": 0.3658243417739868, "learning_rate": 7.574013157894737e-05, "loss": 0.211, "step": 2089 }, { "epoch": 2.3186797947580087, "grad_norm": 0.27675336599349976, "learning_rate": 7.561677631578947e-05, "loss": 0.4383, "step": 2090 }, { "epoch": 2.319789210927749, "grad_norm": 0.47002074122428894, "learning_rate": 7.549342105263157e-05, "loss": 0.2742, "step": 2091 }, { "epoch": 2.32089862709749, "grad_norm": 0.39124634861946106, "learning_rate": 7.537006578947367e-05, "loss": 0.3856, "step": 2092 }, { "epoch": 2.3220080432672305, "grad_norm": 0.467118501663208, "learning_rate": 7.52467105263158e-05, "loss": 0.4138, "step": 2093 }, { "epoch": 2.3231174594369715, "grad_norm": 0.29437050223350525, "learning_rate": 7.512335526315789e-05, "loss": 0.2429, "step": 2094 }, { "epoch": 2.324226875606712, "grad_norm": 0.29561078548431396, "learning_rate": 7.5e-05, "loss": 0.3395, "step": 2095 }, { "epoch": 2.325336291776453, "grad_norm": 0.5432490706443787, "learning_rate": 7.48766447368421e-05, "loss": 0.6463, "step": 2096 }, { "epoch": 2.3264457079461933, "grad_norm": 0.32837289571762085, "learning_rate": 7.475328947368421e-05, "loss": 0.4072, "step": 2097 }, { "epoch": 2.327555124115934, "grad_norm": 0.3328750729560852, "learning_rate": 7.462993421052631e-05, "loss": 0.6708, "step": 2098 }, { "epoch": 2.3286645402856747, "grad_norm": 0.44809266924858093, "learning_rate": 7.450657894736842e-05, "loss": 0.2939, "step": 2099 }, { "epoch": 2.329773956455415, "grad_norm": 0.4526784121990204, "learning_rate": 7.438322368421051e-05, "loss": 0.4913, "step": 2100 }, { "epoch": 2.330883372625156, "grad_norm": 0.3075268864631653, "learning_rate": 7.425986842105263e-05, "loss": 0.3334, "step": 2101 }, { "epoch": 2.3319927887948966, "grad_norm": 0.33832165598869324, "learning_rate": 7.413651315789473e-05, "loss": 0.3785, "step": 2102 }, { "epoch": 2.3331022049646375, "grad_norm": 0.3870348632335663, "learning_rate": 7.401315789473683e-05, "loss": 0.4643, "step": 2103 }, { "epoch": 2.334211621134378, "grad_norm": 0.39963188767433167, "learning_rate": 7.388980263157894e-05, "loss": 0.296, "step": 2104 }, { "epoch": 2.335321037304119, "grad_norm": 0.40964365005493164, "learning_rate": 7.376644736842105e-05, "loss": 0.3673, "step": 2105 }, { "epoch": 2.3364304534738594, "grad_norm": 0.31597647070884705, "learning_rate": 7.364309210526315e-05, "loss": 0.3001, "step": 2106 }, { "epoch": 2.3375398696436003, "grad_norm": 0.364797443151474, "learning_rate": 7.351973684210526e-05, "loss": 0.3574, "step": 2107 }, { "epoch": 2.3386492858133408, "grad_norm": 0.4309171438217163, "learning_rate": 7.339638157894735e-05, "loss": 0.5573, "step": 2108 }, { "epoch": 2.3397587019830812, "grad_norm": 0.34616556763648987, "learning_rate": 7.327302631578947e-05, "loss": 0.4426, "step": 2109 }, { "epoch": 2.340868118152822, "grad_norm": 0.6331126689910889, "learning_rate": 7.314967105263157e-05, "loss": 0.7939, "step": 2110 }, { "epoch": 2.3419775343225626, "grad_norm": 0.43343913555145264, "learning_rate": 7.302631578947367e-05, "loss": 0.3552, "step": 2111 }, { "epoch": 2.3430869504923035, "grad_norm": 0.42529061436653137, "learning_rate": 7.290296052631578e-05, "loss": 0.5603, "step": 2112 }, { "epoch": 2.344196366662044, "grad_norm": 0.32142138481140137, "learning_rate": 7.277960526315789e-05, "loss": 0.2149, "step": 2113 }, { "epoch": 2.345305782831785, "grad_norm": 0.29198598861694336, "learning_rate": 7.265625e-05, "loss": 0.3719, "step": 2114 }, { "epoch": 2.3464151990015254, "grad_norm": 0.36399996280670166, "learning_rate": 7.25328947368421e-05, "loss": 0.369, "step": 2115 }, { "epoch": 2.347524615171266, "grad_norm": 0.288010835647583, "learning_rate": 7.240953947368421e-05, "loss": 0.4388, "step": 2116 }, { "epoch": 2.348634031341007, "grad_norm": 0.45767742395401, "learning_rate": 7.22861842105263e-05, "loss": 0.5151, "step": 2117 }, { "epoch": 2.3497434475107477, "grad_norm": 0.4015630781650543, "learning_rate": 7.216282894736842e-05, "loss": 0.2633, "step": 2118 }, { "epoch": 2.350852863680488, "grad_norm": 0.4112047553062439, "learning_rate": 7.203947368421051e-05, "loss": 0.4331, "step": 2119 }, { "epoch": 2.3519622798502287, "grad_norm": 0.4492191672325134, "learning_rate": 7.191611842105263e-05, "loss": 0.3771, "step": 2120 }, { "epoch": 2.3530716960199696, "grad_norm": 0.36163103580474854, "learning_rate": 7.179276315789473e-05, "loss": 0.4598, "step": 2121 }, { "epoch": 2.35418111218971, "grad_norm": 0.3377210795879364, "learning_rate": 7.166940789473683e-05, "loss": 0.1919, "step": 2122 }, { "epoch": 2.355290528359451, "grad_norm": 0.6023211479187012, "learning_rate": 7.154605263157894e-05, "loss": 0.4939, "step": 2123 }, { "epoch": 2.3563999445291914, "grad_norm": 0.31167641282081604, "learning_rate": 7.142269736842105e-05, "loss": 0.4228, "step": 2124 }, { "epoch": 2.3575093606989324, "grad_norm": 0.39771780371665955, "learning_rate": 7.129934210526315e-05, "loss": 0.4054, "step": 2125 }, { "epoch": 2.358618776868673, "grad_norm": 0.45539480447769165, "learning_rate": 7.117598684210526e-05, "loss": 0.4532, "step": 2126 }, { "epoch": 2.3597281930384133, "grad_norm": 0.3616165220737457, "learning_rate": 7.105263157894735e-05, "loss": 0.4083, "step": 2127 }, { "epoch": 2.360837609208154, "grad_norm": 0.3542384207248688, "learning_rate": 7.092927631578947e-05, "loss": 0.4871, "step": 2128 }, { "epoch": 2.3619470253778947, "grad_norm": 0.3284272253513336, "learning_rate": 7.080592105263157e-05, "loss": 0.2563, "step": 2129 }, { "epoch": 2.3630564415476356, "grad_norm": 0.48142263293266296, "learning_rate": 7.068256578947367e-05, "loss": 0.3424, "step": 2130 }, { "epoch": 2.364165857717376, "grad_norm": 0.39508047699928284, "learning_rate": 7.055921052631578e-05, "loss": 0.4946, "step": 2131 }, { "epoch": 2.365275273887117, "grad_norm": 0.31679755449295044, "learning_rate": 7.043585526315789e-05, "loss": 0.494, "step": 2132 }, { "epoch": 2.3663846900568575, "grad_norm": 0.43126723170280457, "learning_rate": 7.03125e-05, "loss": 0.3824, "step": 2133 }, { "epoch": 2.3674941062265984, "grad_norm": 0.43794259428977966, "learning_rate": 7.01891447368421e-05, "loss": 0.3238, "step": 2134 }, { "epoch": 2.368603522396339, "grad_norm": 0.3279634714126587, "learning_rate": 7.006578947368421e-05, "loss": 0.3239, "step": 2135 }, { "epoch": 2.36971293856608, "grad_norm": 0.32456138730049133, "learning_rate": 6.99424342105263e-05, "loss": 0.316, "step": 2136 }, { "epoch": 2.3708223547358203, "grad_norm": 0.4760146141052246, "learning_rate": 6.981907894736842e-05, "loss": 0.3338, "step": 2137 }, { "epoch": 2.3719317709055607, "grad_norm": 0.5461307168006897, "learning_rate": 6.969572368421051e-05, "loss": 0.2907, "step": 2138 }, { "epoch": 2.3730411870753017, "grad_norm": 0.3460582494735718, "learning_rate": 6.957236842105264e-05, "loss": 0.3996, "step": 2139 }, { "epoch": 2.374150603245042, "grad_norm": 0.41096046566963196, "learning_rate": 6.944901315789473e-05, "loss": 0.2277, "step": 2140 }, { "epoch": 2.375260019414783, "grad_norm": 0.48936372995376587, "learning_rate": 6.932565789473683e-05, "loss": 0.3666, "step": 2141 }, { "epoch": 2.3763694355845235, "grad_norm": 0.36358651518821716, "learning_rate": 6.920230263157894e-05, "loss": 0.3881, "step": 2142 }, { "epoch": 2.3774788517542644, "grad_norm": 0.45791712403297424, "learning_rate": 6.907894736842105e-05, "loss": 0.3655, "step": 2143 }, { "epoch": 2.378588267924005, "grad_norm": 0.3084203898906708, "learning_rate": 6.895559210526316e-05, "loss": 0.3494, "step": 2144 }, { "epoch": 2.379697684093746, "grad_norm": 0.5072697997093201, "learning_rate": 6.883223684210526e-05, "loss": 0.4923, "step": 2145 }, { "epoch": 2.3808071002634863, "grad_norm": 0.5244908928871155, "learning_rate": 6.870888157894735e-05, "loss": 0.4565, "step": 2146 }, { "epoch": 2.381916516433227, "grad_norm": 0.3606557548046112, "learning_rate": 6.858552631578948e-05, "loss": 0.2892, "step": 2147 }, { "epoch": 2.3830259326029677, "grad_norm": 0.5143904089927673, "learning_rate": 6.846217105263157e-05, "loss": 0.3186, "step": 2148 }, { "epoch": 2.384135348772708, "grad_norm": 0.42753830552101135, "learning_rate": 6.833881578947368e-05, "loss": 0.4308, "step": 2149 }, { "epoch": 2.385244764942449, "grad_norm": 0.32439666986465454, "learning_rate": 6.821546052631578e-05, "loss": 0.3906, "step": 2150 }, { "epoch": 2.3863541811121896, "grad_norm": 0.4810985028743744, "learning_rate": 6.809210526315789e-05, "loss": 0.3133, "step": 2151 }, { "epoch": 2.3874635972819305, "grad_norm": 0.4014139473438263, "learning_rate": 6.796875e-05, "loss": 0.3086, "step": 2152 }, { "epoch": 2.388573013451671, "grad_norm": 0.3334631621837616, "learning_rate": 6.78453947368421e-05, "loss": 0.3678, "step": 2153 }, { "epoch": 2.389682429621412, "grad_norm": 0.6455233097076416, "learning_rate": 6.772203947368421e-05, "loss": 0.3285, "step": 2154 }, { "epoch": 2.3907918457911523, "grad_norm": 0.3901275396347046, "learning_rate": 6.75986842105263e-05, "loss": 0.3597, "step": 2155 }, { "epoch": 2.3919012619608933, "grad_norm": 0.4130385220050812, "learning_rate": 6.747532894736842e-05, "loss": 0.3987, "step": 2156 }, { "epoch": 2.3930106781306337, "grad_norm": 0.4633021652698517, "learning_rate": 6.735197368421052e-05, "loss": 0.3555, "step": 2157 }, { "epoch": 2.3941200943003746, "grad_norm": 0.3566206097602844, "learning_rate": 6.722861842105264e-05, "loss": 0.5682, "step": 2158 }, { "epoch": 2.395229510470115, "grad_norm": 0.37410175800323486, "learning_rate": 6.710526315789473e-05, "loss": 0.5559, "step": 2159 }, { "epoch": 2.3963389266398556, "grad_norm": 0.47398602962493896, "learning_rate": 6.698190789473684e-05, "loss": 0.3974, "step": 2160 }, { "epoch": 2.3974483428095965, "grad_norm": 0.3041347563266754, "learning_rate": 6.685855263157894e-05, "loss": 0.306, "step": 2161 }, { "epoch": 2.398557758979337, "grad_norm": 0.6512690782546997, "learning_rate": 6.673519736842105e-05, "loss": 0.3073, "step": 2162 }, { "epoch": 2.399667175149078, "grad_norm": 0.3038191497325897, "learning_rate": 6.661184210526316e-05, "loss": 0.3592, "step": 2163 }, { "epoch": 2.4007765913188184, "grad_norm": 0.30106794834136963, "learning_rate": 6.648848684210526e-05, "loss": 0.3012, "step": 2164 }, { "epoch": 2.4018860074885593, "grad_norm": 0.33144044876098633, "learning_rate": 6.636513157894736e-05, "loss": 0.4192, "step": 2165 }, { "epoch": 2.4029954236582998, "grad_norm": 0.46323060989379883, "learning_rate": 6.624177631578946e-05, "loss": 0.2883, "step": 2166 }, { "epoch": 2.4041048398280407, "grad_norm": 0.39412403106689453, "learning_rate": 6.611842105263157e-05, "loss": 0.3982, "step": 2167 }, { "epoch": 2.405214255997781, "grad_norm": 0.4328696131706238, "learning_rate": 6.599506578947368e-05, "loss": 0.2954, "step": 2168 }, { "epoch": 2.406323672167522, "grad_norm": 0.4066632390022278, "learning_rate": 6.587171052631578e-05, "loss": 0.5436, "step": 2169 }, { "epoch": 2.4074330883372626, "grad_norm": 0.41099557280540466, "learning_rate": 6.574835526315789e-05, "loss": 0.3294, "step": 2170 }, { "epoch": 2.408542504507003, "grad_norm": 0.44337305426597595, "learning_rate": 6.5625e-05, "loss": 0.4187, "step": 2171 }, { "epoch": 2.409651920676744, "grad_norm": 0.43825507164001465, "learning_rate": 6.55016447368421e-05, "loss": 0.4014, "step": 2172 }, { "epoch": 2.4107613368464844, "grad_norm": 0.37434348464012146, "learning_rate": 6.537828947368421e-05, "loss": 0.3387, "step": 2173 }, { "epoch": 2.4118707530162253, "grad_norm": 0.49352914094924927, "learning_rate": 6.52549342105263e-05, "loss": 0.538, "step": 2174 }, { "epoch": 2.412980169185966, "grad_norm": 0.3878787159919739, "learning_rate": 6.513157894736842e-05, "loss": 0.355, "step": 2175 }, { "epoch": 2.4140895853557067, "grad_norm": 0.4009630084037781, "learning_rate": 6.500822368421052e-05, "loss": 0.2911, "step": 2176 }, { "epoch": 2.415199001525447, "grad_norm": 0.5621581077575684, "learning_rate": 6.488486842105264e-05, "loss": 0.3631, "step": 2177 }, { "epoch": 2.4163084176951877, "grad_norm": 0.5268844366073608, "learning_rate": 6.476151315789473e-05, "loss": 0.1947, "step": 2178 }, { "epoch": 2.4174178338649286, "grad_norm": 0.4071340560913086, "learning_rate": 6.463815789473684e-05, "loss": 0.4349, "step": 2179 }, { "epoch": 2.418527250034669, "grad_norm": 0.5059108734130859, "learning_rate": 6.451480263157894e-05, "loss": 0.4169, "step": 2180 }, { "epoch": 2.41963666620441, "grad_norm": 0.3529709279537201, "learning_rate": 6.439144736842105e-05, "loss": 0.3216, "step": 2181 }, { "epoch": 2.4207460823741505, "grad_norm": 0.3386879563331604, "learning_rate": 6.426809210526316e-05, "loss": 0.3775, "step": 2182 }, { "epoch": 2.4218554985438914, "grad_norm": 0.4721252918243408, "learning_rate": 6.414473684210526e-05, "loss": 0.3298, "step": 2183 }, { "epoch": 2.422964914713632, "grad_norm": 0.58592289686203, "learning_rate": 6.402138157894736e-05, "loss": 0.3353, "step": 2184 }, { "epoch": 2.4240743308833728, "grad_norm": 0.3879697918891907, "learning_rate": 6.389802631578946e-05, "loss": 0.3383, "step": 2185 }, { "epoch": 2.4251837470531132, "grad_norm": 0.5064356327056885, "learning_rate": 6.377467105263157e-05, "loss": 0.2966, "step": 2186 }, { "epoch": 2.426293163222854, "grad_norm": 0.4402575194835663, "learning_rate": 6.365131578947368e-05, "loss": 0.3114, "step": 2187 }, { "epoch": 2.4274025793925946, "grad_norm": 0.5321472883224487, "learning_rate": 6.352796052631578e-05, "loss": 0.4891, "step": 2188 }, { "epoch": 2.428511995562335, "grad_norm": 0.4424992799758911, "learning_rate": 6.340460526315789e-05, "loss": 0.5393, "step": 2189 }, { "epoch": 2.429621411732076, "grad_norm": 0.37487635016441345, "learning_rate": 6.328125e-05, "loss": 0.2881, "step": 2190 }, { "epoch": 2.4307308279018165, "grad_norm": 0.4092381000518799, "learning_rate": 6.315789473684209e-05, "loss": 0.3569, "step": 2191 }, { "epoch": 2.4318402440715574, "grad_norm": 0.43235230445861816, "learning_rate": 6.303453947368421e-05, "loss": 0.3249, "step": 2192 }, { "epoch": 2.432949660241298, "grad_norm": 0.4262683689594269, "learning_rate": 6.29111842105263e-05, "loss": 0.3653, "step": 2193 }, { "epoch": 2.434059076411039, "grad_norm": 0.46941083669662476, "learning_rate": 6.278782894736842e-05, "loss": 0.3763, "step": 2194 }, { "epoch": 2.4351684925807793, "grad_norm": 0.44827941060066223, "learning_rate": 6.266447368421052e-05, "loss": 0.5289, "step": 2195 }, { "epoch": 2.43627790875052, "grad_norm": 0.4619573652744293, "learning_rate": 6.254111842105262e-05, "loss": 0.3311, "step": 2196 }, { "epoch": 2.4373873249202607, "grad_norm": 0.4534587562084198, "learning_rate": 6.241776315789473e-05, "loss": 0.5748, "step": 2197 }, { "epoch": 2.4384967410900016, "grad_norm": 0.5071384906768799, "learning_rate": 6.229440789473684e-05, "loss": 0.436, "step": 2198 }, { "epoch": 2.439606157259742, "grad_norm": 0.31029054522514343, "learning_rate": 6.217105263157894e-05, "loss": 0.4601, "step": 2199 }, { "epoch": 2.4407155734294825, "grad_norm": 0.6883265972137451, "learning_rate": 6.204769736842105e-05, "loss": 0.7862, "step": 2200 }, { "epoch": 2.4418249895992234, "grad_norm": 0.5015659332275391, "learning_rate": 6.192434210526316e-05, "loss": 0.589, "step": 2201 }, { "epoch": 2.442934405768964, "grad_norm": 0.5587977170944214, "learning_rate": 6.180098684210526e-05, "loss": 0.5756, "step": 2202 }, { "epoch": 2.444043821938705, "grad_norm": 0.3763371706008911, "learning_rate": 6.167763157894736e-05, "loss": 0.3825, "step": 2203 }, { "epoch": 2.4451532381084453, "grad_norm": 0.39078426361083984, "learning_rate": 6.155427631578946e-05, "loss": 0.2897, "step": 2204 }, { "epoch": 2.4462626542781862, "grad_norm": 0.2745046615600586, "learning_rate": 6.143092105263157e-05, "loss": 0.3853, "step": 2205 }, { "epoch": 2.4473720704479267, "grad_norm": 0.3368263840675354, "learning_rate": 6.130756578947368e-05, "loss": 0.4171, "step": 2206 }, { "epoch": 2.4484814866176676, "grad_norm": 0.3768693208694458, "learning_rate": 6.118421052631578e-05, "loss": 0.3516, "step": 2207 }, { "epoch": 2.449590902787408, "grad_norm": 0.7670299410820007, "learning_rate": 6.106085526315789e-05, "loss": 0.4042, "step": 2208 }, { "epoch": 2.450700318957149, "grad_norm": 0.332269549369812, "learning_rate": 6.09375e-05, "loss": 0.4697, "step": 2209 }, { "epoch": 2.4518097351268895, "grad_norm": 0.3532228171825409, "learning_rate": 6.08141447368421e-05, "loss": 0.3612, "step": 2210 }, { "epoch": 2.45291915129663, "grad_norm": 0.39647355675697327, "learning_rate": 6.0690789473684204e-05, "loss": 0.3681, "step": 2211 }, { "epoch": 2.454028567466371, "grad_norm": 0.3135978579521179, "learning_rate": 6.056743421052631e-05, "loss": 0.3349, "step": 2212 }, { "epoch": 2.4551379836361114, "grad_norm": 0.3503597378730774, "learning_rate": 6.044407894736842e-05, "loss": 0.4885, "step": 2213 }, { "epoch": 2.4562473998058523, "grad_norm": 0.3332939147949219, "learning_rate": 6.032072368421052e-05, "loss": 0.494, "step": 2214 }, { "epoch": 2.4573568159755927, "grad_norm": 0.24943743646144867, "learning_rate": 6.019736842105263e-05, "loss": 0.2715, "step": 2215 }, { "epoch": 2.4584662321453337, "grad_norm": 0.3770546615123749, "learning_rate": 6.007401315789473e-05, "loss": 0.4246, "step": 2216 }, { "epoch": 2.459575648315074, "grad_norm": 0.474202960729599, "learning_rate": 5.995065789473683e-05, "loss": 0.4433, "step": 2217 }, { "epoch": 2.460685064484815, "grad_norm": 0.5108838081359863, "learning_rate": 5.9827302631578944e-05, "loss": 0.3707, "step": 2218 }, { "epoch": 2.4617944806545555, "grad_norm": 0.3123144507408142, "learning_rate": 5.9703947368421044e-05, "loss": 0.5036, "step": 2219 }, { "epoch": 2.4629038968242964, "grad_norm": 0.5090911984443665, "learning_rate": 5.958059210526316e-05, "loss": 0.2868, "step": 2220 }, { "epoch": 2.464013312994037, "grad_norm": 0.300322562456131, "learning_rate": 5.945723684210526e-05, "loss": 0.4943, "step": 2221 }, { "epoch": 2.4651227291637774, "grad_norm": 0.5102697014808655, "learning_rate": 5.933388157894736e-05, "loss": 0.2345, "step": 2222 }, { "epoch": 2.4662321453335183, "grad_norm": 0.28978249430656433, "learning_rate": 5.921052631578947e-05, "loss": 0.4279, "step": 2223 }, { "epoch": 2.467341561503259, "grad_norm": 0.29028403759002686, "learning_rate": 5.908717105263157e-05, "loss": 0.4429, "step": 2224 }, { "epoch": 2.4684509776729997, "grad_norm": 0.2963179349899292, "learning_rate": 5.8963815789473684e-05, "loss": 0.2816, "step": 2225 }, { "epoch": 2.46956039384274, "grad_norm": 0.45071807503700256, "learning_rate": 5.8840460526315784e-05, "loss": 0.4315, "step": 2226 }, { "epoch": 2.470669810012481, "grad_norm": 0.5235294699668884, "learning_rate": 5.871710526315789e-05, "loss": 0.4252, "step": 2227 }, { "epoch": 2.4717792261822216, "grad_norm": 0.3098270893096924, "learning_rate": 5.859375e-05, "loss": 0.5957, "step": 2228 }, { "epoch": 2.472888642351962, "grad_norm": 0.4504237473011017, "learning_rate": 5.84703947368421e-05, "loss": 0.4309, "step": 2229 }, { "epoch": 2.473998058521703, "grad_norm": 0.3145286738872528, "learning_rate": 5.8347039473684205e-05, "loss": 0.4284, "step": 2230 }, { "epoch": 2.475107474691444, "grad_norm": 0.39920830726623535, "learning_rate": 5.822368421052631e-05, "loss": 0.3452, "step": 2231 }, { "epoch": 2.4762168908611843, "grad_norm": 0.5176841020584106, "learning_rate": 5.810032894736842e-05, "loss": 0.3673, "step": 2232 }, { "epoch": 2.477326307030925, "grad_norm": 0.3197839558124542, "learning_rate": 5.797697368421052e-05, "loss": 0.3608, "step": 2233 }, { "epoch": 2.4784357232006657, "grad_norm": 0.46264639496803284, "learning_rate": 5.785361842105263e-05, "loss": 0.3975, "step": 2234 }, { "epoch": 2.479545139370406, "grad_norm": 0.6301188468933105, "learning_rate": 5.773026315789473e-05, "loss": 0.5299, "step": 2235 }, { "epoch": 2.480654555540147, "grad_norm": 0.34785377979278564, "learning_rate": 5.760690789473683e-05, "loss": 0.3115, "step": 2236 }, { "epoch": 2.4817639717098876, "grad_norm": 0.658812403678894, "learning_rate": 5.7483552631578945e-05, "loss": 0.363, "step": 2237 }, { "epoch": 2.4828733878796285, "grad_norm": 0.5214020013809204, "learning_rate": 5.7360197368421045e-05, "loss": 0.4809, "step": 2238 }, { "epoch": 2.483982804049369, "grad_norm": 0.5607793927192688, "learning_rate": 5.723684210526316e-05, "loss": 0.2884, "step": 2239 }, { "epoch": 2.4850922202191095, "grad_norm": 0.39174753427505493, "learning_rate": 5.711348684210526e-05, "loss": 0.353, "step": 2240 }, { "epoch": 2.4862016363888504, "grad_norm": 0.3475854694843292, "learning_rate": 5.699013157894736e-05, "loss": 0.4008, "step": 2241 }, { "epoch": 2.487311052558591, "grad_norm": 0.5239514708518982, "learning_rate": 5.686677631578947e-05, "loss": 0.4618, "step": 2242 }, { "epoch": 2.488420468728332, "grad_norm": 0.358995646238327, "learning_rate": 5.674342105263157e-05, "loss": 0.4906, "step": 2243 }, { "epoch": 2.4895298848980723, "grad_norm": 0.3828662633895874, "learning_rate": 5.662006578947368e-05, "loss": 0.2941, "step": 2244 }, { "epoch": 2.490639301067813, "grad_norm": 0.43704545497894287, "learning_rate": 5.6496710526315785e-05, "loss": 0.3913, "step": 2245 }, { "epoch": 2.4917487172375536, "grad_norm": 0.29927560687065125, "learning_rate": 5.637335526315789e-05, "loss": 0.2661, "step": 2246 }, { "epoch": 2.4928581334072946, "grad_norm": 0.36267396807670593, "learning_rate": 5.625e-05, "loss": 0.2605, "step": 2247 }, { "epoch": 2.493967549577035, "grad_norm": 0.34938421845436096, "learning_rate": 5.61266447368421e-05, "loss": 0.3623, "step": 2248 }, { "epoch": 2.495076965746776, "grad_norm": 0.4633193016052246, "learning_rate": 5.6003289473684205e-05, "loss": 0.3735, "step": 2249 }, { "epoch": 2.4961863819165164, "grad_norm": 0.3852117359638214, "learning_rate": 5.587993421052631e-05, "loss": 0.5006, "step": 2250 }, { "epoch": 2.497295798086257, "grad_norm": 0.528650164604187, "learning_rate": 5.575657894736842e-05, "loss": 0.4337, "step": 2251 }, { "epoch": 2.498405214255998, "grad_norm": 0.3810504972934723, "learning_rate": 5.563322368421052e-05, "loss": 0.3518, "step": 2252 }, { "epoch": 2.4995146304257383, "grad_norm": 0.35557541251182556, "learning_rate": 5.550986842105263e-05, "loss": 0.3932, "step": 2253 }, { "epoch": 2.500624046595479, "grad_norm": 0.2806094288825989, "learning_rate": 5.538651315789473e-05, "loss": 0.2386, "step": 2254 }, { "epoch": 2.5017334627652197, "grad_norm": 0.42694249749183655, "learning_rate": 5.526315789473683e-05, "loss": 0.4441, "step": 2255 }, { "epoch": 2.5028428789349606, "grad_norm": 0.32019782066345215, "learning_rate": 5.5139802631578945e-05, "loss": 0.3024, "step": 2256 }, { "epoch": 2.503952295104701, "grad_norm": 0.43957844376564026, "learning_rate": 5.5016447368421045e-05, "loss": 0.3916, "step": 2257 }, { "epoch": 2.5050617112744415, "grad_norm": 0.39406758546829224, "learning_rate": 5.489309210526316e-05, "loss": 0.3808, "step": 2258 }, { "epoch": 2.5061711274441825, "grad_norm": 0.36642491817474365, "learning_rate": 5.476973684210526e-05, "loss": 0.4663, "step": 2259 }, { "epoch": 2.5072805436139234, "grad_norm": 0.3601360619068146, "learning_rate": 5.464638157894736e-05, "loss": 0.4061, "step": 2260 }, { "epoch": 2.508389959783664, "grad_norm": 0.406791627407074, "learning_rate": 5.452302631578947e-05, "loss": 0.3205, "step": 2261 }, { "epoch": 2.5094993759534043, "grad_norm": 0.3350258469581604, "learning_rate": 5.439967105263157e-05, "loss": 0.2646, "step": 2262 }, { "epoch": 2.5106087921231452, "grad_norm": 0.48063212633132935, "learning_rate": 5.427631578947368e-05, "loss": 0.4569, "step": 2263 }, { "epoch": 2.5117182082928857, "grad_norm": 0.4414843022823334, "learning_rate": 5.4152960526315786e-05, "loss": 0.3192, "step": 2264 }, { "epoch": 2.5128276244626266, "grad_norm": 0.4843035042285919, "learning_rate": 5.402960526315789e-05, "loss": 0.3547, "step": 2265 }, { "epoch": 2.513937040632367, "grad_norm": 0.30912336707115173, "learning_rate": 5.390624999999999e-05, "loss": 0.4092, "step": 2266 }, { "epoch": 2.515046456802108, "grad_norm": 0.3699786365032196, "learning_rate": 5.37828947368421e-05, "loss": 0.3192, "step": 2267 }, { "epoch": 2.5161558729718485, "grad_norm": 0.6730918884277344, "learning_rate": 5.3659539473684206e-05, "loss": 0.4083, "step": 2268 }, { "epoch": 2.517265289141589, "grad_norm": 0.44793230295181274, "learning_rate": 5.3536184210526306e-05, "loss": 0.3056, "step": 2269 }, { "epoch": 2.51837470531133, "grad_norm": 0.48553967475891113, "learning_rate": 5.341282894736842e-05, "loss": 0.2853, "step": 2270 }, { "epoch": 2.519484121481071, "grad_norm": 0.3934726417064667, "learning_rate": 5.328947368421052e-05, "loss": 0.252, "step": 2271 }, { "epoch": 2.5205935376508113, "grad_norm": 0.44591614603996277, "learning_rate": 5.316611842105263e-05, "loss": 0.5068, "step": 2272 }, { "epoch": 2.5217029538205518, "grad_norm": 0.40541309118270874, "learning_rate": 5.304276315789473e-05, "loss": 0.427, "step": 2273 }, { "epoch": 2.5228123699902927, "grad_norm": 0.4913922846317291, "learning_rate": 5.291940789473683e-05, "loss": 0.3514, "step": 2274 }, { "epoch": 2.523921786160033, "grad_norm": 0.3643013834953308, "learning_rate": 5.2796052631578946e-05, "loss": 0.2301, "step": 2275 }, { "epoch": 2.525031202329774, "grad_norm": 0.389931321144104, "learning_rate": 5.2672697368421046e-05, "loss": 0.5792, "step": 2276 }, { "epoch": 2.5261406184995145, "grad_norm": 0.353927344083786, "learning_rate": 5.254934210526316e-05, "loss": 0.2489, "step": 2277 }, { "epoch": 2.5272500346692555, "grad_norm": 0.6424699425697327, "learning_rate": 5.242598684210526e-05, "loss": 0.4858, "step": 2278 }, { "epoch": 2.528359450838996, "grad_norm": 0.35985687375068665, "learning_rate": 5.230263157894736e-05, "loss": 0.3414, "step": 2279 }, { "epoch": 2.5294688670087364, "grad_norm": 0.4226178228855133, "learning_rate": 5.217927631578947e-05, "loss": 0.4188, "step": 2280 }, { "epoch": 2.5305782831784773, "grad_norm": 0.3257390558719635, "learning_rate": 5.205592105263157e-05, "loss": 0.4046, "step": 2281 }, { "epoch": 2.5316876993482182, "grad_norm": 0.31944599747657776, "learning_rate": 5.193256578947368e-05, "loss": 0.3429, "step": 2282 }, { "epoch": 2.5327971155179587, "grad_norm": 0.3145938515663147, "learning_rate": 5.1809210526315786e-05, "loss": 0.4005, "step": 2283 }, { "epoch": 2.533906531687699, "grad_norm": 0.4402817189693451, "learning_rate": 5.168585526315789e-05, "loss": 0.4823, "step": 2284 }, { "epoch": 2.53501594785744, "grad_norm": 0.43508538603782654, "learning_rate": 5.156249999999999e-05, "loss": 0.3428, "step": 2285 }, { "epoch": 2.5361253640271806, "grad_norm": 0.4584806561470032, "learning_rate": 5.14391447368421e-05, "loss": 0.396, "step": 2286 }, { "epoch": 2.5372347801969215, "grad_norm": 0.454815149307251, "learning_rate": 5.1315789473684206e-05, "loss": 0.4953, "step": 2287 }, { "epoch": 2.538344196366662, "grad_norm": 0.35620346665382385, "learning_rate": 5.1192434210526306e-05, "loss": 0.4026, "step": 2288 }, { "epoch": 2.539453612536403, "grad_norm": 0.34608957171440125, "learning_rate": 5.106907894736842e-05, "loss": 0.5012, "step": 2289 }, { "epoch": 2.5405630287061434, "grad_norm": 0.3685770630836487, "learning_rate": 5.094572368421052e-05, "loss": 0.4459, "step": 2290 }, { "epoch": 2.541672444875884, "grad_norm": 0.34064555168151855, "learning_rate": 5.082236842105263e-05, "loss": 0.206, "step": 2291 }, { "epoch": 2.5427818610456248, "grad_norm": 0.5084649324417114, "learning_rate": 5.069901315789473e-05, "loss": 0.4602, "step": 2292 }, { "epoch": 2.5438912772153657, "grad_norm": 0.4061320126056671, "learning_rate": 5.057565789473683e-05, "loss": 0.5329, "step": 2293 }, { "epoch": 2.545000693385106, "grad_norm": 0.2851752042770386, "learning_rate": 5.045230263157895e-05, "loss": 0.2781, "step": 2294 }, { "epoch": 2.5461101095548466, "grad_norm": 0.2984931170940399, "learning_rate": 5.032894736842105e-05, "loss": 0.3925, "step": 2295 }, { "epoch": 2.5472195257245875, "grad_norm": 0.3785136342048645, "learning_rate": 5.020559210526315e-05, "loss": 0.2643, "step": 2296 }, { "epoch": 2.548328941894328, "grad_norm": 0.4343215823173523, "learning_rate": 5.008223684210526e-05, "loss": 0.4111, "step": 2297 }, { "epoch": 2.549438358064069, "grad_norm": 0.3174603581428528, "learning_rate": 4.995888157894736e-05, "loss": 0.3798, "step": 2298 }, { "epoch": 2.5505477742338094, "grad_norm": 0.30842000246047974, "learning_rate": 4.9835526315789474e-05, "loss": 0.2997, "step": 2299 }, { "epoch": 2.5516571904035503, "grad_norm": 0.42980441451072693, "learning_rate": 4.9712171052631573e-05, "loss": 0.3467, "step": 2300 }, { "epoch": 2.552766606573291, "grad_norm": 0.3155680000782013, "learning_rate": 4.958881578947368e-05, "loss": 0.3337, "step": 2301 }, { "epoch": 2.5538760227430313, "grad_norm": 0.38970938324928284, "learning_rate": 4.946546052631579e-05, "loss": 0.327, "step": 2302 }, { "epoch": 2.554985438912772, "grad_norm": 0.43559134006500244, "learning_rate": 4.9342105263157894e-05, "loss": 0.5547, "step": 2303 }, { "epoch": 2.5560948550825127, "grad_norm": 0.4793894290924072, "learning_rate": 4.9218749999999994e-05, "loss": 0.4121, "step": 2304 }, { "epoch": 2.5572042712522536, "grad_norm": 0.5885838270187378, "learning_rate": 4.90953947368421e-05, "loss": 0.5028, "step": 2305 }, { "epoch": 2.558313687421994, "grad_norm": 0.4064948260784149, "learning_rate": 4.897203947368421e-05, "loss": 0.257, "step": 2306 }, { "epoch": 2.559423103591735, "grad_norm": 0.6676486730575562, "learning_rate": 4.884868421052631e-05, "loss": 0.6808, "step": 2307 }, { "epoch": 2.5605325197614754, "grad_norm": 0.3958938419818878, "learning_rate": 4.872532894736842e-05, "loss": 0.4467, "step": 2308 }, { "epoch": 2.5616419359312164, "grad_norm": 0.5410012602806091, "learning_rate": 4.860197368421052e-05, "loss": 0.4545, "step": 2309 }, { "epoch": 2.562751352100957, "grad_norm": 0.31039872765541077, "learning_rate": 4.8478618421052634e-05, "loss": 0.4389, "step": 2310 }, { "epoch": 2.5638607682706978, "grad_norm": 0.4607661962509155, "learning_rate": 4.8355263157894734e-05, "loss": 0.3213, "step": 2311 }, { "epoch": 2.5649701844404382, "grad_norm": 0.33510398864746094, "learning_rate": 4.8231907894736834e-05, "loss": 0.3981, "step": 2312 }, { "epoch": 2.5660796006101787, "grad_norm": 0.42955508828163147, "learning_rate": 4.810855263157895e-05, "loss": 0.2282, "step": 2313 }, { "epoch": 2.5671890167799196, "grad_norm": 0.5348376035690308, "learning_rate": 4.798519736842105e-05, "loss": 0.4386, "step": 2314 }, { "epoch": 2.56829843294966, "grad_norm": 0.4563250243663788, "learning_rate": 4.7861842105263154e-05, "loss": 0.3593, "step": 2315 }, { "epoch": 2.569407849119401, "grad_norm": 0.38774073123931885, "learning_rate": 4.773848684210526e-05, "loss": 0.3159, "step": 2316 }, { "epoch": 2.5705172652891415, "grad_norm": 0.33762556314468384, "learning_rate": 4.761513157894736e-05, "loss": 0.2853, "step": 2317 }, { "epoch": 2.5716266814588824, "grad_norm": 0.5899233222007751, "learning_rate": 4.749177631578947e-05, "loss": 0.2734, "step": 2318 }, { "epoch": 2.572736097628623, "grad_norm": 0.4161059558391571, "learning_rate": 4.7368421052631574e-05, "loss": 0.442, "step": 2319 }, { "epoch": 2.5738455137983633, "grad_norm": 0.366187185049057, "learning_rate": 4.724506578947368e-05, "loss": 0.2767, "step": 2320 }, { "epoch": 2.5749549299681043, "grad_norm": 0.325467586517334, "learning_rate": 4.712171052631578e-05, "loss": 0.4227, "step": 2321 }, { "epoch": 2.576064346137845, "grad_norm": 0.31031447649002075, "learning_rate": 4.6998355263157894e-05, "loss": 0.3563, "step": 2322 }, { "epoch": 2.5771737623075857, "grad_norm": 0.4601028263568878, "learning_rate": 4.6874999999999994e-05, "loss": 0.3945, "step": 2323 }, { "epoch": 2.578283178477326, "grad_norm": 0.5773911476135254, "learning_rate": 4.67516447368421e-05, "loss": 0.5002, "step": 2324 }, { "epoch": 2.579392594647067, "grad_norm": 0.3071858584880829, "learning_rate": 4.662828947368421e-05, "loss": 0.3305, "step": 2325 }, { "epoch": 2.5805020108168075, "grad_norm": 0.42758241295814514, "learning_rate": 4.650493421052631e-05, "loss": 0.3915, "step": 2326 }, { "epoch": 2.5816114269865484, "grad_norm": 0.4389335811138153, "learning_rate": 4.638157894736842e-05, "loss": 0.41, "step": 2327 }, { "epoch": 2.582720843156289, "grad_norm": 0.4536753296852112, "learning_rate": 4.625822368421052e-05, "loss": 0.552, "step": 2328 }, { "epoch": 2.58383025932603, "grad_norm": 0.4767647981643677, "learning_rate": 4.6134868421052635e-05, "loss": 0.2961, "step": 2329 }, { "epoch": 2.5849396754957703, "grad_norm": 0.33526867628097534, "learning_rate": 4.6011513157894734e-05, "loss": 0.4272, "step": 2330 }, { "epoch": 2.586049091665511, "grad_norm": 0.3391248285770416, "learning_rate": 4.5888157894736834e-05, "loss": 0.4728, "step": 2331 }, { "epoch": 2.5871585078352517, "grad_norm": 0.44053715467453003, "learning_rate": 4.576480263157895e-05, "loss": 0.4216, "step": 2332 }, { "epoch": 2.5882679240049926, "grad_norm": 0.3894062340259552, "learning_rate": 4.564144736842105e-05, "loss": 0.163, "step": 2333 }, { "epoch": 2.589377340174733, "grad_norm": 0.37204012274742126, "learning_rate": 4.5518092105263155e-05, "loss": 0.388, "step": 2334 }, { "epoch": 2.5904867563444736, "grad_norm": 0.3864336907863617, "learning_rate": 4.539473684210526e-05, "loss": 0.2925, "step": 2335 }, { "epoch": 2.5915961725142145, "grad_norm": 0.2916948199272156, "learning_rate": 4.527138157894736e-05, "loss": 0.2073, "step": 2336 }, { "epoch": 2.592705588683955, "grad_norm": 0.8837294578552246, "learning_rate": 4.514802631578947e-05, "loss": 0.2756, "step": 2337 }, { "epoch": 2.593815004853696, "grad_norm": 0.41746076941490173, "learning_rate": 4.5024671052631575e-05, "loss": 0.4903, "step": 2338 }, { "epoch": 2.5949244210234363, "grad_norm": 0.31884750723838806, "learning_rate": 4.490131578947368e-05, "loss": 0.4432, "step": 2339 }, { "epoch": 2.5960338371931773, "grad_norm": 0.30209752917289734, "learning_rate": 4.477796052631578e-05, "loss": 0.3105, "step": 2340 }, { "epoch": 2.5971432533629177, "grad_norm": 0.42073366045951843, "learning_rate": 4.4654605263157895e-05, "loss": 0.2862, "step": 2341 }, { "epoch": 2.598252669532658, "grad_norm": 1.1312992572784424, "learning_rate": 4.4531249999999995e-05, "loss": 0.3452, "step": 2342 }, { "epoch": 2.599362085702399, "grad_norm": 0.35556334257125854, "learning_rate": 4.4407894736842095e-05, "loss": 0.3874, "step": 2343 }, { "epoch": 2.60047150187214, "grad_norm": 0.43012315034866333, "learning_rate": 4.428453947368421e-05, "loss": 0.3205, "step": 2344 }, { "epoch": 2.6015809180418805, "grad_norm": 0.3909721076488495, "learning_rate": 4.416118421052631e-05, "loss": 0.3442, "step": 2345 }, { "epoch": 2.602690334211621, "grad_norm": 0.4908634424209595, "learning_rate": 4.403782894736842e-05, "loss": 0.5702, "step": 2346 }, { "epoch": 2.603799750381362, "grad_norm": 0.6141018867492676, "learning_rate": 4.391447368421052e-05, "loss": 0.4149, "step": 2347 }, { "epoch": 2.6049091665511024, "grad_norm": 0.4573987126350403, "learning_rate": 4.379111842105263e-05, "loss": 0.4007, "step": 2348 }, { "epoch": 2.6060185827208433, "grad_norm": 0.4271261692047119, "learning_rate": 4.3667763157894735e-05, "loss": 0.4075, "step": 2349 }, { "epoch": 2.6071279988905838, "grad_norm": 0.39978379011154175, "learning_rate": 4.3544407894736835e-05, "loss": 0.4654, "step": 2350 }, { "epoch": 2.6082374150603247, "grad_norm": 0.33403047919273376, "learning_rate": 4.342105263157895e-05, "loss": 0.4196, "step": 2351 }, { "epoch": 2.609346831230065, "grad_norm": 0.5256022214889526, "learning_rate": 4.329769736842105e-05, "loss": 0.2752, "step": 2352 }, { "epoch": 2.6104562473998056, "grad_norm": 0.4153258204460144, "learning_rate": 4.3174342105263155e-05, "loss": 0.3468, "step": 2353 }, { "epoch": 2.6115656635695466, "grad_norm": 0.6061992645263672, "learning_rate": 4.305098684210526e-05, "loss": 0.351, "step": 2354 }, { "epoch": 2.6126750797392875, "grad_norm": 0.295296847820282, "learning_rate": 4.292763157894736e-05, "loss": 0.2476, "step": 2355 }, { "epoch": 2.613784495909028, "grad_norm": 0.3813928961753845, "learning_rate": 4.280427631578947e-05, "loss": 0.3138, "step": 2356 }, { "epoch": 2.6148939120787684, "grad_norm": 0.3294810652732849, "learning_rate": 4.2680921052631575e-05, "loss": 0.3273, "step": 2357 }, { "epoch": 2.6160033282485093, "grad_norm": 0.4694172739982605, "learning_rate": 4.255756578947368e-05, "loss": 0.3346, "step": 2358 }, { "epoch": 2.61711274441825, "grad_norm": 0.5706562399864197, "learning_rate": 4.243421052631578e-05, "loss": 0.2241, "step": 2359 }, { "epoch": 2.6182221605879907, "grad_norm": 0.6200342774391174, "learning_rate": 4.2310855263157896e-05, "loss": 0.4129, "step": 2360 }, { "epoch": 2.619331576757731, "grad_norm": 0.4055767059326172, "learning_rate": 4.2187499999999995e-05, "loss": 0.4006, "step": 2361 }, { "epoch": 2.620440992927472, "grad_norm": 0.33380985260009766, "learning_rate": 4.2064144736842095e-05, "loss": 0.3732, "step": 2362 }, { "epoch": 2.6215504090972126, "grad_norm": 0.41517943143844604, "learning_rate": 4.194078947368421e-05, "loss": 0.2458, "step": 2363 }, { "epoch": 2.622659825266953, "grad_norm": 0.3231419324874878, "learning_rate": 4.181743421052631e-05, "loss": 0.5769, "step": 2364 }, { "epoch": 2.623769241436694, "grad_norm": 0.4171946942806244, "learning_rate": 4.169407894736842e-05, "loss": 0.4702, "step": 2365 }, { "epoch": 2.6248786576064345, "grad_norm": 0.3811667263507843, "learning_rate": 4.157072368421052e-05, "loss": 0.3849, "step": 2366 }, { "epoch": 2.6259880737761754, "grad_norm": 0.41539669036865234, "learning_rate": 4.144736842105263e-05, "loss": 0.3845, "step": 2367 }, { "epoch": 2.627097489945916, "grad_norm": 0.39716070890426636, "learning_rate": 4.1324013157894736e-05, "loss": 0.3744, "step": 2368 }, { "epoch": 2.6282069061156568, "grad_norm": 0.38069993257522583, "learning_rate": 4.1200657894736836e-05, "loss": 0.2432, "step": 2369 }, { "epoch": 2.6293163222853972, "grad_norm": 0.3581462502479553, "learning_rate": 4.107730263157894e-05, "loss": 0.6516, "step": 2370 }, { "epoch": 2.6304257384551377, "grad_norm": 0.34660494327545166, "learning_rate": 4.095394736842105e-05, "loss": 0.3599, "step": 2371 }, { "epoch": 2.6315351546248786, "grad_norm": 0.34078100323677063, "learning_rate": 4.0830592105263156e-05, "loss": 0.3611, "step": 2372 }, { "epoch": 2.6326445707946196, "grad_norm": 0.3904211223125458, "learning_rate": 4.0707236842105256e-05, "loss": 0.301, "step": 2373 }, { "epoch": 2.63375398696436, "grad_norm": 0.3696337342262268, "learning_rate": 4.058388157894736e-05, "loss": 0.3051, "step": 2374 }, { "epoch": 2.6348634031341005, "grad_norm": 0.43636754155158997, "learning_rate": 4.046052631578947e-05, "loss": 0.4971, "step": 2375 }, { "epoch": 2.6359728193038414, "grad_norm": 0.5181596279144287, "learning_rate": 4.0337171052631576e-05, "loss": 0.3127, "step": 2376 }, { "epoch": 2.637082235473582, "grad_norm": 0.5476127862930298, "learning_rate": 4.021381578947368e-05, "loss": 0.5049, "step": 2377 }, { "epoch": 2.638191651643323, "grad_norm": 0.5447264909744263, "learning_rate": 4.009046052631578e-05, "loss": 0.5255, "step": 2378 }, { "epoch": 2.6393010678130633, "grad_norm": 0.5430484414100647, "learning_rate": 3.9967105263157896e-05, "loss": 0.3371, "step": 2379 }, { "epoch": 2.640410483982804, "grad_norm": 0.4831867516040802, "learning_rate": 3.9843749999999996e-05, "loss": 0.4452, "step": 2380 }, { "epoch": 2.6415199001525447, "grad_norm": 0.6055355072021484, "learning_rate": 3.9720394736842096e-05, "loss": 0.5317, "step": 2381 }, { "epoch": 2.642629316322285, "grad_norm": 0.4254622757434845, "learning_rate": 3.959703947368421e-05, "loss": 0.3434, "step": 2382 }, { "epoch": 2.643738732492026, "grad_norm": 0.4942837655544281, "learning_rate": 3.947368421052631e-05, "loss": 0.4468, "step": 2383 }, { "epoch": 2.644848148661767, "grad_norm": 0.3226402997970581, "learning_rate": 3.935032894736842e-05, "loss": 0.4929, "step": 2384 }, { "epoch": 2.6459575648315075, "grad_norm": 0.4470663368701935, "learning_rate": 3.922697368421052e-05, "loss": 0.33, "step": 2385 }, { "epoch": 2.647066981001248, "grad_norm": 0.5086075663566589, "learning_rate": 3.910361842105263e-05, "loss": 0.6575, "step": 2386 }, { "epoch": 2.648176397170989, "grad_norm": 0.39487743377685547, "learning_rate": 3.8980263157894736e-05, "loss": 0.3375, "step": 2387 }, { "epoch": 2.6492858133407293, "grad_norm": 0.6295050978660583, "learning_rate": 3.8856907894736836e-05, "loss": 0.398, "step": 2388 }, { "epoch": 2.6503952295104702, "grad_norm": 0.5763193368911743, "learning_rate": 3.873355263157894e-05, "loss": 0.3891, "step": 2389 }, { "epoch": 2.6515046456802107, "grad_norm": 0.33462652564048767, "learning_rate": 3.861019736842105e-05, "loss": 0.428, "step": 2390 }, { "epoch": 2.6526140618499516, "grad_norm": 0.6703096628189087, "learning_rate": 3.8486842105263156e-05, "loss": 0.3148, "step": 2391 }, { "epoch": 2.653723478019692, "grad_norm": 0.3808022141456604, "learning_rate": 3.8363486842105256e-05, "loss": 0.4022, "step": 2392 }, { "epoch": 2.6548328941894326, "grad_norm": 0.42382562160491943, "learning_rate": 3.824013157894736e-05, "loss": 0.2387, "step": 2393 }, { "epoch": 2.6559423103591735, "grad_norm": 0.4965885579586029, "learning_rate": 3.811677631578947e-05, "loss": 0.5139, "step": 2394 }, { "epoch": 2.6570517265289144, "grad_norm": 0.4375743269920349, "learning_rate": 3.799342105263157e-05, "loss": 0.3922, "step": 2395 }, { "epoch": 2.658161142698655, "grad_norm": 0.44008010625839233, "learning_rate": 3.787006578947368e-05, "loss": 0.8074, "step": 2396 }, { "epoch": 2.6592705588683954, "grad_norm": 0.40815237164497375, "learning_rate": 3.774671052631578e-05, "loss": 0.4673, "step": 2397 }, { "epoch": 2.6603799750381363, "grad_norm": 0.3250466287136078, "learning_rate": 3.76233552631579e-05, "loss": 0.3603, "step": 2398 }, { "epoch": 2.6614893912078768, "grad_norm": 0.4003657400608063, "learning_rate": 3.75e-05, "loss": 0.5028, "step": 2399 }, { "epoch": 2.6625988073776177, "grad_norm": 0.475201278924942, "learning_rate": 3.7376644736842103e-05, "loss": 0.2309, "step": 2400 }, { "epoch": 2.663708223547358, "grad_norm": 0.4348124563694, "learning_rate": 3.725328947368421e-05, "loss": 0.3997, "step": 2401 }, { "epoch": 2.664817639717099, "grad_norm": 0.5189718008041382, "learning_rate": 3.712993421052632e-05, "loss": 0.4532, "step": 2402 }, { "epoch": 2.6659270558868395, "grad_norm": 0.293647438287735, "learning_rate": 3.700657894736842e-05, "loss": 0.4344, "step": 2403 }, { "epoch": 2.66703647205658, "grad_norm": 0.4177672564983368, "learning_rate": 3.6883223684210524e-05, "loss": 0.4263, "step": 2404 }, { "epoch": 2.668145888226321, "grad_norm": 0.4328800141811371, "learning_rate": 3.675986842105263e-05, "loss": 0.3869, "step": 2405 }, { "epoch": 2.669255304396062, "grad_norm": 0.3846050202846527, "learning_rate": 3.663651315789474e-05, "loss": 0.4011, "step": 2406 }, { "epoch": 2.6703647205658023, "grad_norm": 0.3619375228881836, "learning_rate": 3.651315789473684e-05, "loss": 0.2928, "step": 2407 }, { "epoch": 2.671474136735543, "grad_norm": 0.5226150155067444, "learning_rate": 3.6389802631578944e-05, "loss": 0.2546, "step": 2408 }, { "epoch": 2.6725835529052837, "grad_norm": 0.3613680303096771, "learning_rate": 3.626644736842105e-05, "loss": 0.3706, "step": 2409 }, { "epoch": 2.673692969075024, "grad_norm": 0.38295650482177734, "learning_rate": 3.614309210526315e-05, "loss": 0.3319, "step": 2410 }, { "epoch": 2.674802385244765, "grad_norm": 0.3644557595252991, "learning_rate": 3.601973684210526e-05, "loss": 0.3344, "step": 2411 }, { "epoch": 2.6759118014145056, "grad_norm": 0.4107860028743744, "learning_rate": 3.5896381578947364e-05, "loss": 0.2881, "step": 2412 }, { "epoch": 2.6770212175842465, "grad_norm": 0.39361506700515747, "learning_rate": 3.577302631578947e-05, "loss": 0.3575, "step": 2413 }, { "epoch": 2.678130633753987, "grad_norm": 0.30774053931236267, "learning_rate": 3.564967105263158e-05, "loss": 0.2718, "step": 2414 }, { "epoch": 2.6792400499237274, "grad_norm": 0.40254542231559753, "learning_rate": 3.552631578947368e-05, "loss": 0.4364, "step": 2415 }, { "epoch": 2.6803494660934684, "grad_norm": 0.39855626225471497, "learning_rate": 3.5402960526315784e-05, "loss": 0.2258, "step": 2416 }, { "epoch": 2.681458882263209, "grad_norm": 0.3063789904117584, "learning_rate": 3.527960526315789e-05, "loss": 0.3248, "step": 2417 }, { "epoch": 2.6825682984329497, "grad_norm": 0.48203665018081665, "learning_rate": 3.515625e-05, "loss": 0.5971, "step": 2418 }, { "epoch": 2.6836777146026902, "grad_norm": 0.40560922026634216, "learning_rate": 3.5032894736842104e-05, "loss": 0.2832, "step": 2419 }, { "epoch": 2.684787130772431, "grad_norm": 0.4571300745010376, "learning_rate": 3.490953947368421e-05, "loss": 0.414, "step": 2420 }, { "epoch": 2.6858965469421716, "grad_norm": 0.29357972741127014, "learning_rate": 3.478618421052632e-05, "loss": 0.4382, "step": 2421 }, { "epoch": 2.687005963111912, "grad_norm": 0.4692830741405487, "learning_rate": 3.466282894736842e-05, "loss": 0.4557, "step": 2422 }, { "epoch": 2.688115379281653, "grad_norm": 0.5453107953071594, "learning_rate": 3.4539473684210524e-05, "loss": 0.456, "step": 2423 }, { "epoch": 2.689224795451394, "grad_norm": 0.8587030172348022, "learning_rate": 3.441611842105263e-05, "loss": 0.5188, "step": 2424 }, { "epoch": 2.6903342116211344, "grad_norm": 0.5260380506515503, "learning_rate": 3.429276315789474e-05, "loss": 0.4475, "step": 2425 }, { "epoch": 2.691443627790875, "grad_norm": 0.2895418405532837, "learning_rate": 3.416940789473684e-05, "loss": 0.245, "step": 2426 }, { "epoch": 2.692553043960616, "grad_norm": 0.44756054878234863, "learning_rate": 3.4046052631578944e-05, "loss": 0.4375, "step": 2427 }, { "epoch": 2.6936624601303563, "grad_norm": 0.39095133543014526, "learning_rate": 3.392269736842105e-05, "loss": 0.405, "step": 2428 }, { "epoch": 2.694771876300097, "grad_norm": 0.6433843374252319, "learning_rate": 3.379934210526315e-05, "loss": 0.2905, "step": 2429 }, { "epoch": 2.6958812924698377, "grad_norm": 0.5514675974845886, "learning_rate": 3.367598684210526e-05, "loss": 0.3651, "step": 2430 }, { "epoch": 2.6969907086395786, "grad_norm": 0.4037233293056488, "learning_rate": 3.3552631578947364e-05, "loss": 0.5534, "step": 2431 }, { "epoch": 2.698100124809319, "grad_norm": 0.3961438238620758, "learning_rate": 3.342927631578947e-05, "loss": 0.4399, "step": 2432 }, { "epoch": 2.6992095409790595, "grad_norm": 0.38111430406570435, "learning_rate": 3.330592105263158e-05, "loss": 0.3532, "step": 2433 }, { "epoch": 2.7003189571488004, "grad_norm": 0.35400980710983276, "learning_rate": 3.318256578947368e-05, "loss": 0.5905, "step": 2434 }, { "epoch": 2.7014283733185414, "grad_norm": 0.469614714384079, "learning_rate": 3.3059210526315785e-05, "loss": 0.4461, "step": 2435 }, { "epoch": 2.702537789488282, "grad_norm": 0.2964523732662201, "learning_rate": 3.293585526315789e-05, "loss": 0.3752, "step": 2436 }, { "epoch": 2.7036472056580223, "grad_norm": 0.4390257000923157, "learning_rate": 3.28125e-05, "loss": 0.384, "step": 2437 }, { "epoch": 2.704756621827763, "grad_norm": 0.6685402393341064, "learning_rate": 3.2689144736842105e-05, "loss": 0.3686, "step": 2438 }, { "epoch": 2.7058660379975037, "grad_norm": 0.42853912711143494, "learning_rate": 3.256578947368421e-05, "loss": 0.3165, "step": 2439 }, { "epoch": 2.7069754541672446, "grad_norm": 0.382541686296463, "learning_rate": 3.244243421052632e-05, "loss": 0.4957, "step": 2440 }, { "epoch": 2.708084870336985, "grad_norm": 0.5440720319747925, "learning_rate": 3.231907894736842e-05, "loss": 0.4417, "step": 2441 }, { "epoch": 2.709194286506726, "grad_norm": 0.4604506492614746, "learning_rate": 3.2195723684210525e-05, "loss": 0.4257, "step": 2442 }, { "epoch": 2.7103037026764665, "grad_norm": 0.45395079255104065, "learning_rate": 3.207236842105263e-05, "loss": 0.2593, "step": 2443 }, { "epoch": 2.711413118846207, "grad_norm": 0.4349961578845978, "learning_rate": 3.194901315789473e-05, "loss": 0.3596, "step": 2444 }, { "epoch": 2.712522535015948, "grad_norm": 0.46334758400917053, "learning_rate": 3.182565789473684e-05, "loss": 0.3347, "step": 2445 }, { "epoch": 2.713631951185689, "grad_norm": 0.4385397136211395, "learning_rate": 3.1702302631578945e-05, "loss": 0.1988, "step": 2446 }, { "epoch": 2.7147413673554293, "grad_norm": 0.6983950734138489, "learning_rate": 3.1578947368421045e-05, "loss": 0.3902, "step": 2447 }, { "epoch": 2.7158507835251697, "grad_norm": 0.46519941091537476, "learning_rate": 3.145559210526315e-05, "loss": 0.9286, "step": 2448 }, { "epoch": 2.7169601996949106, "grad_norm": 0.4739225506782532, "learning_rate": 3.133223684210526e-05, "loss": 0.4028, "step": 2449 }, { "epoch": 2.718069615864651, "grad_norm": 0.45045381784439087, "learning_rate": 3.1208881578947365e-05, "loss": 0.2989, "step": 2450 }, { "epoch": 2.719179032034392, "grad_norm": 0.5387830138206482, "learning_rate": 3.108552631578947e-05, "loss": 0.3876, "step": 2451 }, { "epoch": 2.7202884482041325, "grad_norm": 0.4015057682991028, "learning_rate": 3.096217105263158e-05, "loss": 0.3363, "step": 2452 }, { "epoch": 2.7213978643738734, "grad_norm": 0.9120666980743408, "learning_rate": 3.083881578947368e-05, "loss": 0.3437, "step": 2453 }, { "epoch": 2.722507280543614, "grad_norm": 0.3784855902194977, "learning_rate": 3.0715460526315785e-05, "loss": 0.3015, "step": 2454 }, { "epoch": 2.7236166967133544, "grad_norm": 0.3855441212654114, "learning_rate": 3.059210526315789e-05, "loss": 0.3813, "step": 2455 }, { "epoch": 2.7247261128830953, "grad_norm": 0.8900882601737976, "learning_rate": 3.046875e-05, "loss": 0.6206, "step": 2456 }, { "epoch": 2.725835529052836, "grad_norm": 0.43435898423194885, "learning_rate": 3.0345394736842102e-05, "loss": 0.2936, "step": 2457 }, { "epoch": 2.7269449452225767, "grad_norm": 0.4391900300979614, "learning_rate": 3.022203947368421e-05, "loss": 0.6765, "step": 2458 }, { "epoch": 2.728054361392317, "grad_norm": 0.566494882106781, "learning_rate": 3.0098684210526315e-05, "loss": 0.3304, "step": 2459 }, { "epoch": 2.729163777562058, "grad_norm": 0.35425856709480286, "learning_rate": 2.9975328947368415e-05, "loss": 0.3772, "step": 2460 }, { "epoch": 2.7302731937317986, "grad_norm": 0.3330824673175812, "learning_rate": 2.9851973684210522e-05, "loss": 0.3451, "step": 2461 }, { "epoch": 2.7313826099015395, "grad_norm": 0.5032577514648438, "learning_rate": 2.972861842105263e-05, "loss": 0.4958, "step": 2462 }, { "epoch": 2.73249202607128, "grad_norm": 0.46107593178749084, "learning_rate": 2.9605263157894735e-05, "loss": 0.5935, "step": 2463 }, { "epoch": 2.733601442241021, "grad_norm": 0.4279358983039856, "learning_rate": 2.9481907894736842e-05, "loss": 0.352, "step": 2464 }, { "epoch": 2.7347108584107613, "grad_norm": 0.35816699266433716, "learning_rate": 2.9358552631578946e-05, "loss": 0.4058, "step": 2465 }, { "epoch": 2.735820274580502, "grad_norm": 0.3129696249961853, "learning_rate": 2.923519736842105e-05, "loss": 0.2704, "step": 2466 }, { "epoch": 2.7369296907502427, "grad_norm": 0.5344390869140625, "learning_rate": 2.9111842105263156e-05, "loss": 0.3926, "step": 2467 }, { "epoch": 2.738039106919983, "grad_norm": 0.3743567168712616, "learning_rate": 2.898848684210526e-05, "loss": 0.3455, "step": 2468 }, { "epoch": 2.739148523089724, "grad_norm": 0.4825611412525177, "learning_rate": 2.8865131578947366e-05, "loss": 0.4537, "step": 2469 }, { "epoch": 2.7402579392594646, "grad_norm": 0.47796013951301575, "learning_rate": 2.8741776315789472e-05, "loss": 0.4268, "step": 2470 }, { "epoch": 2.7413673554292055, "grad_norm": 0.38644805550575256, "learning_rate": 2.861842105263158e-05, "loss": 0.3989, "step": 2471 }, { "epoch": 2.742476771598946, "grad_norm": 0.41950738430023193, "learning_rate": 2.849506578947368e-05, "loss": 0.4983, "step": 2472 }, { "epoch": 2.7435861877686865, "grad_norm": 0.34149548411369324, "learning_rate": 2.8371710526315786e-05, "loss": 0.4519, "step": 2473 }, { "epoch": 2.7446956039384274, "grad_norm": 0.4213709831237793, "learning_rate": 2.8248355263157893e-05, "loss": 0.4212, "step": 2474 }, { "epoch": 2.7458050201081683, "grad_norm": 0.3141988217830658, "learning_rate": 2.8125e-05, "loss": 0.3161, "step": 2475 }, { "epoch": 2.7469144362779088, "grad_norm": 0.3500676155090332, "learning_rate": 2.8001644736842103e-05, "loss": 0.3603, "step": 2476 }, { "epoch": 2.7480238524476492, "grad_norm": 0.4086303114891052, "learning_rate": 2.787828947368421e-05, "loss": 0.5926, "step": 2477 }, { "epoch": 2.74913326861739, "grad_norm": 0.5967698097229004, "learning_rate": 2.7754934210526316e-05, "loss": 0.237, "step": 2478 }, { "epoch": 2.7502426847871306, "grad_norm": 0.4627840220928192, "learning_rate": 2.7631578947368416e-05, "loss": 0.3304, "step": 2479 }, { "epoch": 2.7513521009568715, "grad_norm": 0.37266799807548523, "learning_rate": 2.7508223684210523e-05, "loss": 0.5739, "step": 2480 }, { "epoch": 2.752461517126612, "grad_norm": 0.42507204413414, "learning_rate": 2.738486842105263e-05, "loss": 0.4707, "step": 2481 }, { "epoch": 2.753570933296353, "grad_norm": 0.4577075242996216, "learning_rate": 2.7261513157894736e-05, "loss": 0.5197, "step": 2482 }, { "epoch": 2.7546803494660934, "grad_norm": 0.6312010884284973, "learning_rate": 2.713815789473684e-05, "loss": 0.5129, "step": 2483 }, { "epoch": 2.755789765635834, "grad_norm": 0.5095051527023315, "learning_rate": 2.7014802631578946e-05, "loss": 0.329, "step": 2484 }, { "epoch": 2.756899181805575, "grad_norm": 0.5552304983139038, "learning_rate": 2.689144736842105e-05, "loss": 0.516, "step": 2485 }, { "epoch": 2.7580085979753157, "grad_norm": 0.4446256458759308, "learning_rate": 2.6768092105263153e-05, "loss": 0.4809, "step": 2486 }, { "epoch": 2.759118014145056, "grad_norm": 0.45802775025367737, "learning_rate": 2.664473684210526e-05, "loss": 0.2507, "step": 2487 }, { "epoch": 2.7602274303147967, "grad_norm": 1.002074956893921, "learning_rate": 2.6521381578947366e-05, "loss": 0.3177, "step": 2488 }, { "epoch": 2.7613368464845376, "grad_norm": 0.37677767872810364, "learning_rate": 2.6398026315789473e-05, "loss": 0.3994, "step": 2489 }, { "epoch": 2.762446262654278, "grad_norm": 0.5203359723091125, "learning_rate": 2.627467105263158e-05, "loss": 0.3421, "step": 2490 }, { "epoch": 2.763555678824019, "grad_norm": 0.41637536883354187, "learning_rate": 2.615131578947368e-05, "loss": 0.4927, "step": 2491 }, { "epoch": 2.7646650949937595, "grad_norm": 0.44756412506103516, "learning_rate": 2.6027960526315786e-05, "loss": 0.3734, "step": 2492 }, { "epoch": 2.7657745111635004, "grad_norm": 0.3564557731151581, "learning_rate": 2.5904605263157893e-05, "loss": 0.3779, "step": 2493 }, { "epoch": 2.766883927333241, "grad_norm": 0.3852544128894806, "learning_rate": 2.5781249999999996e-05, "loss": 0.6018, "step": 2494 }, { "epoch": 2.7679933435029813, "grad_norm": 0.3930635452270508, "learning_rate": 2.5657894736842103e-05, "loss": 0.5023, "step": 2495 }, { "epoch": 2.7691027596727222, "grad_norm": 0.34579432010650635, "learning_rate": 2.553453947368421e-05, "loss": 0.5322, "step": 2496 }, { "epoch": 2.770212175842463, "grad_norm": 0.38263997435569763, "learning_rate": 2.5411184210526317e-05, "loss": 0.3458, "step": 2497 }, { "epoch": 2.7713215920122036, "grad_norm": 0.34970882534980774, "learning_rate": 2.5287828947368417e-05, "loss": 0.4036, "step": 2498 }, { "epoch": 2.772431008181944, "grad_norm": 0.36101511120796204, "learning_rate": 2.5164473684210523e-05, "loss": 0.2912, "step": 2499 }, { "epoch": 2.773540424351685, "grad_norm": 0.39122408628463745, "learning_rate": 2.504111842105263e-05, "loss": 0.3174, "step": 2500 }, { "epoch": 2.7746498405214255, "grad_norm": 0.3180815875530243, "learning_rate": 2.4917763157894737e-05, "loss": 0.4215, "step": 2501 }, { "epoch": 2.7757592566911664, "grad_norm": 0.4079410433769226, "learning_rate": 2.479440789473684e-05, "loss": 0.5662, "step": 2502 }, { "epoch": 2.776868672860907, "grad_norm": 0.35038089752197266, "learning_rate": 2.4671052631578947e-05, "loss": 0.4216, "step": 2503 }, { "epoch": 2.777978089030648, "grad_norm": 0.34112969040870667, "learning_rate": 2.454769736842105e-05, "loss": 0.3627, "step": 2504 }, { "epoch": 2.7790875052003883, "grad_norm": 0.38411781191825867, "learning_rate": 2.4424342105263153e-05, "loss": 0.3333, "step": 2505 }, { "epoch": 2.7801969213701287, "grad_norm": 0.4743475615978241, "learning_rate": 2.430098684210526e-05, "loss": 0.4561, "step": 2506 }, { "epoch": 2.7813063375398697, "grad_norm": 0.3875446915626526, "learning_rate": 2.4177631578947367e-05, "loss": 0.5246, "step": 2507 }, { "epoch": 2.7824157537096106, "grad_norm": 0.3834385573863983, "learning_rate": 2.4054276315789474e-05, "loss": 0.4338, "step": 2508 }, { "epoch": 2.783525169879351, "grad_norm": 0.4199206233024597, "learning_rate": 2.3930921052631577e-05, "loss": 0.3568, "step": 2509 }, { "epoch": 2.7846345860490915, "grad_norm": 0.3700524568557739, "learning_rate": 2.380756578947368e-05, "loss": 0.4707, "step": 2510 }, { "epoch": 2.7857440022188324, "grad_norm": 0.3416059911251068, "learning_rate": 2.3684210526315787e-05, "loss": 0.3664, "step": 2511 }, { "epoch": 2.786853418388573, "grad_norm": 0.39632654190063477, "learning_rate": 2.356085526315789e-05, "loss": 0.5128, "step": 2512 }, { "epoch": 2.787962834558314, "grad_norm": 0.5743038654327393, "learning_rate": 2.3437499999999997e-05, "loss": 0.4739, "step": 2513 }, { "epoch": 2.7890722507280543, "grad_norm": 0.3902910649776459, "learning_rate": 2.3314144736842104e-05, "loss": 0.4639, "step": 2514 }, { "epoch": 2.7901816668977952, "grad_norm": 0.3890102803707123, "learning_rate": 2.319078947368421e-05, "loss": 0.4134, "step": 2515 }, { "epoch": 2.7912910830675357, "grad_norm": 0.5704060196876526, "learning_rate": 2.3067434210526317e-05, "loss": 0.4693, "step": 2516 }, { "epoch": 2.792400499237276, "grad_norm": 0.37423625588417053, "learning_rate": 2.2944078947368417e-05, "loss": 0.3357, "step": 2517 }, { "epoch": 2.793509915407017, "grad_norm": 0.33532601594924927, "learning_rate": 2.2820723684210524e-05, "loss": 0.4696, "step": 2518 }, { "epoch": 2.7946193315767576, "grad_norm": 0.3831970989704132, "learning_rate": 2.269736842105263e-05, "loss": 0.3145, "step": 2519 }, { "epoch": 2.7957287477464985, "grad_norm": 0.312404602766037, "learning_rate": 2.2574013157894734e-05, "loss": 0.408, "step": 2520 }, { "epoch": 2.796838163916239, "grad_norm": 0.396106094121933, "learning_rate": 2.245065789473684e-05, "loss": 0.33, "step": 2521 }, { "epoch": 2.79794758008598, "grad_norm": 0.3440202474594116, "learning_rate": 2.2327302631578947e-05, "loss": 0.3087, "step": 2522 }, { "epoch": 2.7990569962557204, "grad_norm": 0.3287065625190735, "learning_rate": 2.2203947368421047e-05, "loss": 0.2821, "step": 2523 }, { "epoch": 2.800166412425461, "grad_norm": 0.5490508675575256, "learning_rate": 2.2080592105263154e-05, "loss": 0.4684, "step": 2524 }, { "epoch": 2.8012758285952017, "grad_norm": 0.560986340045929, "learning_rate": 2.195723684210526e-05, "loss": 0.2953, "step": 2525 }, { "epoch": 2.8023852447649427, "grad_norm": 0.4068681001663208, "learning_rate": 2.1833881578947368e-05, "loss": 0.4226, "step": 2526 }, { "epoch": 2.803494660934683, "grad_norm": 0.3921424448490143, "learning_rate": 2.1710526315789474e-05, "loss": 0.3098, "step": 2527 }, { "epoch": 2.8046040771044236, "grad_norm": 0.40971678495407104, "learning_rate": 2.1587171052631578e-05, "loss": 0.3498, "step": 2528 }, { "epoch": 2.8057134932741645, "grad_norm": 0.2933211326599121, "learning_rate": 2.146381578947368e-05, "loss": 0.4814, "step": 2529 }, { "epoch": 2.806822909443905, "grad_norm": 0.5020395517349243, "learning_rate": 2.1340460526315788e-05, "loss": 0.4159, "step": 2530 }, { "epoch": 2.807932325613646, "grad_norm": 0.3937400281429291, "learning_rate": 2.121710526315789e-05, "loss": 0.3239, "step": 2531 }, { "epoch": 2.8090417417833864, "grad_norm": 0.47247281670570374, "learning_rate": 2.1093749999999998e-05, "loss": 0.4367, "step": 2532 }, { "epoch": 2.8101511579531273, "grad_norm": 0.44965869188308716, "learning_rate": 2.0970394736842104e-05, "loss": 0.3051, "step": 2533 }, { "epoch": 2.811260574122868, "grad_norm": 0.5918648838996887, "learning_rate": 2.084703947368421e-05, "loss": 0.3252, "step": 2534 }, { "epoch": 2.8123699902926083, "grad_norm": 0.4035295844078064, "learning_rate": 2.0723684210526315e-05, "loss": 0.3419, "step": 2535 }, { "epoch": 2.813479406462349, "grad_norm": 0.43882495164871216, "learning_rate": 2.0600328947368418e-05, "loss": 0.3963, "step": 2536 }, { "epoch": 2.81458882263209, "grad_norm": 0.3037029206752777, "learning_rate": 2.0476973684210525e-05, "loss": 0.2939, "step": 2537 }, { "epoch": 2.8156982388018306, "grad_norm": 0.42646703124046326, "learning_rate": 2.0353618421052628e-05, "loss": 0.376, "step": 2538 }, { "epoch": 2.816807654971571, "grad_norm": 0.3901432156562805, "learning_rate": 2.0230263157894735e-05, "loss": 0.3588, "step": 2539 }, { "epoch": 2.817917071141312, "grad_norm": 0.4185813367366791, "learning_rate": 2.010690789473684e-05, "loss": 0.4512, "step": 2540 }, { "epoch": 2.8190264873110524, "grad_norm": 0.46253278851509094, "learning_rate": 1.9983552631578948e-05, "loss": 0.3559, "step": 2541 }, { "epoch": 2.8201359034807933, "grad_norm": 0.49084243178367615, "learning_rate": 1.9860197368421048e-05, "loss": 0.4087, "step": 2542 }, { "epoch": 2.821245319650534, "grad_norm": 0.4082900285720825, "learning_rate": 1.9736842105263155e-05, "loss": 0.3204, "step": 2543 }, { "epoch": 2.8223547358202747, "grad_norm": 0.4727761149406433, "learning_rate": 1.961348684210526e-05, "loss": 0.5809, "step": 2544 }, { "epoch": 2.823464151990015, "grad_norm": 0.37872323393821716, "learning_rate": 1.9490131578947368e-05, "loss": 0.2695, "step": 2545 }, { "epoch": 2.8245735681597557, "grad_norm": 0.4297656714916229, "learning_rate": 1.936677631578947e-05, "loss": 0.2949, "step": 2546 }, { "epoch": 2.8256829843294966, "grad_norm": 0.5141124725341797, "learning_rate": 1.9243421052631578e-05, "loss": 0.475, "step": 2547 }, { "epoch": 2.8267924004992375, "grad_norm": 0.4390595853328705, "learning_rate": 1.912006578947368e-05, "loss": 0.4907, "step": 2548 }, { "epoch": 2.827901816668978, "grad_norm": 0.5395392179489136, "learning_rate": 1.8996710526315785e-05, "loss": 0.3074, "step": 2549 }, { "epoch": 2.8290112328387185, "grad_norm": 0.5195388197898865, "learning_rate": 1.887335526315789e-05, "loss": 0.3901, "step": 2550 }, { "epoch": 2.8301206490084594, "grad_norm": 0.5643588304519653, "learning_rate": 1.875e-05, "loss": 0.528, "step": 2551 }, { "epoch": 2.8312300651782, "grad_norm": 0.6140674352645874, "learning_rate": 1.8626644736842105e-05, "loss": 0.404, "step": 2552 }, { "epoch": 2.832339481347941, "grad_norm": 1.3323380947113037, "learning_rate": 1.850328947368421e-05, "loss": 0.1839, "step": 2553 }, { "epoch": 2.8334488975176813, "grad_norm": 0.3624992072582245, "learning_rate": 1.8379934210526315e-05, "loss": 0.5518, "step": 2554 }, { "epoch": 2.834558313687422, "grad_norm": 0.493804395198822, "learning_rate": 1.825657894736842e-05, "loss": 0.2108, "step": 2555 }, { "epoch": 2.8356677298571626, "grad_norm": 0.41342535614967346, "learning_rate": 1.8133223684210525e-05, "loss": 0.5485, "step": 2556 }, { "epoch": 2.836777146026903, "grad_norm": 0.37890028953552246, "learning_rate": 1.800986842105263e-05, "loss": 0.2336, "step": 2557 }, { "epoch": 2.837886562196644, "grad_norm": 0.5276500582695007, "learning_rate": 1.7886513157894735e-05, "loss": 0.3295, "step": 2558 }, { "epoch": 2.838995978366385, "grad_norm": 0.8836992979049683, "learning_rate": 1.776315789473684e-05, "loss": 0.4195, "step": 2559 }, { "epoch": 2.8401053945361254, "grad_norm": 0.36474689841270447, "learning_rate": 1.7639802631578945e-05, "loss": 0.5335, "step": 2560 }, { "epoch": 2.841214810705866, "grad_norm": 0.3125799298286438, "learning_rate": 1.7516447368421052e-05, "loss": 0.3293, "step": 2561 }, { "epoch": 2.842324226875607, "grad_norm": 0.4086284041404724, "learning_rate": 1.739309210526316e-05, "loss": 0.8532, "step": 2562 }, { "epoch": 2.8434336430453473, "grad_norm": 0.5705865025520325, "learning_rate": 1.7269736842105262e-05, "loss": 0.7003, "step": 2563 }, { "epoch": 2.844543059215088, "grad_norm": 0.3946170508861542, "learning_rate": 1.714638157894737e-05, "loss": 0.3741, "step": 2564 }, { "epoch": 2.8456524753848287, "grad_norm": 0.5471513271331787, "learning_rate": 1.7023026315789472e-05, "loss": 0.3288, "step": 2565 }, { "epoch": 2.8467618915545696, "grad_norm": 0.43899303674697876, "learning_rate": 1.6899671052631575e-05, "loss": 0.5189, "step": 2566 }, { "epoch": 2.84787130772431, "grad_norm": 0.6397573351860046, "learning_rate": 1.6776315789473682e-05, "loss": 0.3055, "step": 2567 }, { "epoch": 2.8489807238940505, "grad_norm": 0.3993845582008362, "learning_rate": 1.665296052631579e-05, "loss": 0.3922, "step": 2568 }, { "epoch": 2.8500901400637915, "grad_norm": 0.27621200680732727, "learning_rate": 1.6529605263157892e-05, "loss": 0.3626, "step": 2569 }, { "epoch": 2.851199556233532, "grad_norm": 0.721236526966095, "learning_rate": 1.640625e-05, "loss": 0.5003, "step": 2570 }, { "epoch": 2.852308972403273, "grad_norm": 0.46138882637023926, "learning_rate": 1.6282894736842106e-05, "loss": 0.3559, "step": 2571 }, { "epoch": 2.8534183885730133, "grad_norm": 0.3132235109806061, "learning_rate": 1.615953947368421e-05, "loss": 0.3119, "step": 2572 }, { "epoch": 2.8545278047427542, "grad_norm": 0.4404788911342621, "learning_rate": 1.6036184210526316e-05, "loss": 0.3208, "step": 2573 }, { "epoch": 2.8556372209124947, "grad_norm": 0.5489991307258606, "learning_rate": 1.591282894736842e-05, "loss": 0.3501, "step": 2574 }, { "epoch": 2.856746637082235, "grad_norm": 0.5425270199775696, "learning_rate": 1.5789473684210522e-05, "loss": 0.3652, "step": 2575 }, { "epoch": 2.857856053251976, "grad_norm": 0.3924858570098877, "learning_rate": 1.566611842105263e-05, "loss": 0.3799, "step": 2576 }, { "epoch": 2.858965469421717, "grad_norm": 0.381849080324173, "learning_rate": 1.5542763157894736e-05, "loss": 0.4581, "step": 2577 }, { "epoch": 2.8600748855914575, "grad_norm": 0.38210979104042053, "learning_rate": 1.541940789473684e-05, "loss": 0.4339, "step": 2578 }, { "epoch": 2.861184301761198, "grad_norm": 0.49273112416267395, "learning_rate": 1.5296052631578946e-05, "loss": 0.3847, "step": 2579 }, { "epoch": 2.862293717930939, "grad_norm": 0.45563986897468567, "learning_rate": 1.5172697368421051e-05, "loss": 0.3684, "step": 2580 }, { "epoch": 2.8634031341006794, "grad_norm": 0.5015951991081238, "learning_rate": 1.5049342105263158e-05, "loss": 0.5179, "step": 2581 }, { "epoch": 2.8645125502704203, "grad_norm": 0.6307615041732788, "learning_rate": 1.4925986842105261e-05, "loss": 0.486, "step": 2582 }, { "epoch": 2.8656219664401608, "grad_norm": 0.36506178975105286, "learning_rate": 1.4802631578947368e-05, "loss": 0.5584, "step": 2583 }, { "epoch": 2.8667313826099017, "grad_norm": 0.39168620109558105, "learning_rate": 1.4679276315789473e-05, "loss": 0.3033, "step": 2584 }, { "epoch": 2.867840798779642, "grad_norm": 0.5694892406463623, "learning_rate": 1.4555921052631578e-05, "loss": 0.4762, "step": 2585 }, { "epoch": 2.8689502149493826, "grad_norm": 0.3743266463279724, "learning_rate": 1.4432565789473683e-05, "loss": 0.2169, "step": 2586 }, { "epoch": 2.8700596311191235, "grad_norm": 0.32552504539489746, "learning_rate": 1.430921052631579e-05, "loss": 0.5623, "step": 2587 }, { "epoch": 2.8711690472888645, "grad_norm": 0.4949817359447479, "learning_rate": 1.4185855263157893e-05, "loss": 0.5808, "step": 2588 }, { "epoch": 2.872278463458605, "grad_norm": 0.5175449848175049, "learning_rate": 1.40625e-05, "loss": 0.4294, "step": 2589 }, { "epoch": 2.8733878796283454, "grad_norm": 0.4763440191745758, "learning_rate": 1.3939144736842105e-05, "loss": 0.3579, "step": 2590 }, { "epoch": 2.8744972957980863, "grad_norm": 0.3480813205242157, "learning_rate": 1.3815789473684208e-05, "loss": 0.3258, "step": 2591 }, { "epoch": 2.875606711967827, "grad_norm": 0.33044132590293884, "learning_rate": 1.3692434210526315e-05, "loss": 0.4225, "step": 2592 }, { "epoch": 2.8767161281375677, "grad_norm": 0.4161834418773651, "learning_rate": 1.356907894736842e-05, "loss": 0.3972, "step": 2593 }, { "epoch": 2.877825544307308, "grad_norm": 0.42129024863243103, "learning_rate": 1.3445723684210525e-05, "loss": 0.4956, "step": 2594 }, { "epoch": 2.878934960477049, "grad_norm": 0.3572520613670349, "learning_rate": 1.332236842105263e-05, "loss": 0.7549, "step": 2595 }, { "epoch": 2.8800443766467896, "grad_norm": 0.3236415684223175, "learning_rate": 1.3199013157894737e-05, "loss": 0.7084, "step": 2596 }, { "epoch": 2.88115379281653, "grad_norm": 0.42282864451408386, "learning_rate": 1.307565789473684e-05, "loss": 0.439, "step": 2597 }, { "epoch": 2.882263208986271, "grad_norm": 0.3626263737678528, "learning_rate": 1.2952302631578947e-05, "loss": 0.36, "step": 2598 }, { "epoch": 2.883372625156012, "grad_norm": 0.4625903367996216, "learning_rate": 1.2828947368421052e-05, "loss": 0.3951, "step": 2599 }, { "epoch": 2.8844820413257524, "grad_norm": 0.30453893542289734, "learning_rate": 1.2705592105263158e-05, "loss": 0.3333, "step": 2600 }, { "epoch": 2.885591457495493, "grad_norm": 0.43295082449913025, "learning_rate": 1.2582236842105262e-05, "loss": 0.4729, "step": 2601 }, { "epoch": 2.8867008736652338, "grad_norm": 0.3720959424972534, "learning_rate": 1.2458881578947368e-05, "loss": 0.3683, "step": 2602 }, { "epoch": 2.8878102898349742, "grad_norm": 0.2515357434749603, "learning_rate": 1.2335526315789473e-05, "loss": 0.2588, "step": 2603 }, { "epoch": 2.888919706004715, "grad_norm": 0.41880831122398376, "learning_rate": 1.2212171052631577e-05, "loss": 0.4133, "step": 2604 }, { "epoch": 2.8900291221744556, "grad_norm": 0.4307885766029358, "learning_rate": 1.2088815789473683e-05, "loss": 0.3466, "step": 2605 }, { "epoch": 2.8911385383441965, "grad_norm": 0.5042724609375, "learning_rate": 1.1965460526315788e-05, "loss": 0.3241, "step": 2606 }, { "epoch": 2.892247954513937, "grad_norm": 0.4066576063632965, "learning_rate": 1.1842105263157894e-05, "loss": 0.4199, "step": 2607 }, { "epoch": 2.8933573706836775, "grad_norm": 0.43350422382354736, "learning_rate": 1.1718749999999999e-05, "loss": 0.4903, "step": 2608 }, { "epoch": 2.8944667868534184, "grad_norm": 0.5719215273857117, "learning_rate": 1.1595394736842105e-05, "loss": 0.4011, "step": 2609 }, { "epoch": 2.8955762030231593, "grad_norm": 0.44598355889320374, "learning_rate": 1.1472039473684209e-05, "loss": 0.3502, "step": 2610 }, { "epoch": 2.8966856191929, "grad_norm": 0.3669666051864624, "learning_rate": 1.1348684210526315e-05, "loss": 0.5565, "step": 2611 }, { "epoch": 2.8977950353626403, "grad_norm": 0.3861677348613739, "learning_rate": 1.122532894736842e-05, "loss": 0.4093, "step": 2612 }, { "epoch": 2.898904451532381, "grad_norm": 0.3836744427680969, "learning_rate": 1.1101973684210524e-05, "loss": 0.5552, "step": 2613 }, { "epoch": 2.9000138677021217, "grad_norm": 0.5049236416816711, "learning_rate": 1.097861842105263e-05, "loss": 0.3947, "step": 2614 }, { "epoch": 2.9011232838718626, "grad_norm": 0.3340136706829071, "learning_rate": 1.0855263157894737e-05, "loss": 0.3379, "step": 2615 }, { "epoch": 2.902232700041603, "grad_norm": 0.41469404101371765, "learning_rate": 1.073190789473684e-05, "loss": 0.4554, "step": 2616 }, { "epoch": 2.903342116211344, "grad_norm": 0.34765610098838806, "learning_rate": 1.0608552631578946e-05, "loss": 0.287, "step": 2617 }, { "epoch": 2.9044515323810844, "grad_norm": 0.3712421655654907, "learning_rate": 1.0485197368421052e-05, "loss": 0.3694, "step": 2618 }, { "epoch": 2.905560948550825, "grad_norm": 0.37911391258239746, "learning_rate": 1.0361842105263157e-05, "loss": 0.3963, "step": 2619 }, { "epoch": 2.906670364720566, "grad_norm": 0.3563274145126343, "learning_rate": 1.0238486842105262e-05, "loss": 0.4382, "step": 2620 }, { "epoch": 2.9077797808903068, "grad_norm": 0.535892128944397, "learning_rate": 1.0115131578947367e-05, "loss": 0.4363, "step": 2621 }, { "epoch": 2.9088891970600472, "grad_norm": 0.4795096218585968, "learning_rate": 9.991776315789474e-06, "loss": 0.5396, "step": 2622 }, { "epoch": 2.9099986132297877, "grad_norm": 0.5538339614868164, "learning_rate": 9.868421052631577e-06, "loss": 0.3635, "step": 2623 }, { "epoch": 2.9111080293995286, "grad_norm": 0.3773338198661804, "learning_rate": 9.745065789473684e-06, "loss": 0.4025, "step": 2624 }, { "epoch": 2.912217445569269, "grad_norm": 0.6125039458274841, "learning_rate": 9.621710526315789e-06, "loss": 0.4946, "step": 2625 }, { "epoch": 2.91332686173901, "grad_norm": 0.3754761219024658, "learning_rate": 9.498355263157892e-06, "loss": 0.3312, "step": 2626 }, { "epoch": 2.9144362779087505, "grad_norm": 0.5250300168991089, "learning_rate": 9.375e-06, "loss": 0.4738, "step": 2627 }, { "epoch": 2.9155456940784914, "grad_norm": 0.42315995693206787, "learning_rate": 9.251644736842104e-06, "loss": 0.3803, "step": 2628 }, { "epoch": 2.916655110248232, "grad_norm": 0.5445030927658081, "learning_rate": 9.12828947368421e-06, "loss": 0.5786, "step": 2629 }, { "epoch": 2.9177645264179723, "grad_norm": 0.6609618067741394, "learning_rate": 9.004934210526314e-06, "loss": 0.386, "step": 2630 }, { "epoch": 2.9188739425877133, "grad_norm": 0.36153900623321533, "learning_rate": 8.88157894736842e-06, "loss": 0.3705, "step": 2631 }, { "epoch": 2.9199833587574537, "grad_norm": 0.39644429087638855, "learning_rate": 8.758223684210526e-06, "loss": 0.4203, "step": 2632 }, { "epoch": 2.9210927749271947, "grad_norm": 0.44478243589401245, "learning_rate": 8.634868421052631e-06, "loss": 0.5676, "step": 2633 }, { "epoch": 2.922202191096935, "grad_norm": 0.3410852551460266, "learning_rate": 8.511513157894736e-06, "loss": 0.29, "step": 2634 }, { "epoch": 2.923311607266676, "grad_norm": 0.38531428575515747, "learning_rate": 8.388157894736841e-06, "loss": 0.4727, "step": 2635 }, { "epoch": 2.9244210234364165, "grad_norm": 0.40856409072875977, "learning_rate": 8.264802631578946e-06, "loss": 0.2994, "step": 2636 }, { "epoch": 2.925530439606157, "grad_norm": 0.3600262999534607, "learning_rate": 8.141447368421053e-06, "loss": 0.3595, "step": 2637 }, { "epoch": 2.926639855775898, "grad_norm": 0.4171159863471985, "learning_rate": 8.018092105263158e-06, "loss": 0.2841, "step": 2638 }, { "epoch": 2.927749271945639, "grad_norm": 0.35529500246047974, "learning_rate": 7.894736842105261e-06, "loss": 0.278, "step": 2639 }, { "epoch": 2.9288586881153793, "grad_norm": 0.3548045754432678, "learning_rate": 7.771381578947368e-06, "loss": 0.6584, "step": 2640 }, { "epoch": 2.92996810428512, "grad_norm": 0.8075834512710571, "learning_rate": 7.648026315789473e-06, "loss": 0.2551, "step": 2641 }, { "epoch": 2.9310775204548607, "grad_norm": 0.4155935049057007, "learning_rate": 7.524671052631579e-06, "loss": 0.3003, "step": 2642 }, { "epoch": 2.932186936624601, "grad_norm": 0.4321109652519226, "learning_rate": 7.401315789473684e-06, "loss": 0.343, "step": 2643 }, { "epoch": 2.933296352794342, "grad_norm": 0.3877204358577728, "learning_rate": 7.277960526315789e-06, "loss": 0.3259, "step": 2644 }, { "epoch": 2.9344057689640826, "grad_norm": 0.30750367045402527, "learning_rate": 7.154605263157895e-06, "loss": 0.4062, "step": 2645 }, { "epoch": 2.9355151851338235, "grad_norm": 0.6012676954269409, "learning_rate": 7.03125e-06, "loss": 0.4992, "step": 2646 }, { "epoch": 2.936624601303564, "grad_norm": 0.3799775242805481, "learning_rate": 6.907894736842104e-06, "loss": 0.4411, "step": 2647 }, { "epoch": 2.9377340174733044, "grad_norm": 0.49381592869758606, "learning_rate": 6.78453947368421e-06, "loss": 0.3694, "step": 2648 }, { "epoch": 2.9388434336430453, "grad_norm": 0.7122082114219666, "learning_rate": 6.661184210526315e-06, "loss": 0.4398, "step": 2649 }, { "epoch": 2.9399528498127863, "grad_norm": 0.48016512393951416, "learning_rate": 6.53782894736842e-06, "loss": 0.3111, "step": 2650 }, { "epoch": 2.9410622659825267, "grad_norm": 0.449733167886734, "learning_rate": 6.414473684210526e-06, "loss": 0.3113, "step": 2651 }, { "epoch": 2.942171682152267, "grad_norm": 0.33274686336517334, "learning_rate": 6.291118421052631e-06, "loss": 0.4131, "step": 2652 }, { "epoch": 2.943281098322008, "grad_norm": 0.37493860721588135, "learning_rate": 6.167763157894737e-06, "loss": 0.2557, "step": 2653 }, { "epoch": 2.9443905144917486, "grad_norm": 0.37679123878479004, "learning_rate": 6.044407894736842e-06, "loss": 0.4252, "step": 2654 }, { "epoch": 2.9454999306614895, "grad_norm": 0.40936678647994995, "learning_rate": 5.921052631578947e-06, "loss": 0.4795, "step": 2655 }, { "epoch": 2.94660934683123, "grad_norm": 0.4506048560142517, "learning_rate": 5.797697368421053e-06, "loss": 0.5137, "step": 2656 }, { "epoch": 2.947718763000971, "grad_norm": 0.39844781160354614, "learning_rate": 5.674342105263158e-06, "loss": 0.3271, "step": 2657 }, { "epoch": 2.9488281791707114, "grad_norm": 0.39184948801994324, "learning_rate": 5.550986842105262e-06, "loss": 0.3885, "step": 2658 }, { "epoch": 2.949937595340452, "grad_norm": 0.3266217112541199, "learning_rate": 5.4276315789473686e-06, "loss": 0.3644, "step": 2659 }, { "epoch": 2.9510470115101928, "grad_norm": 0.4527483284473419, "learning_rate": 5.304276315789473e-06, "loss": 0.3945, "step": 2660 }, { "epoch": 2.9521564276799337, "grad_norm": 0.5377230644226074, "learning_rate": 5.180921052631579e-06, "loss": 0.458, "step": 2661 }, { "epoch": 2.953265843849674, "grad_norm": 0.39653611183166504, "learning_rate": 5.057565789473684e-06, "loss": 0.3541, "step": 2662 }, { "epoch": 2.9543752600194146, "grad_norm": 0.2904629409313202, "learning_rate": 4.934210526315789e-06, "loss": 0.1772, "step": 2663 }, { "epoch": 2.9554846761891556, "grad_norm": 0.3323127329349518, "learning_rate": 4.8108552631578946e-06, "loss": 0.5255, "step": 2664 }, { "epoch": 2.956594092358896, "grad_norm": 0.4561123847961426, "learning_rate": 4.6875e-06, "loss": 0.3503, "step": 2665 }, { "epoch": 2.957703508528637, "grad_norm": 0.37283483147621155, "learning_rate": 4.564144736842105e-06, "loss": 0.4867, "step": 2666 }, { "epoch": 2.9588129246983774, "grad_norm": 0.5536327362060547, "learning_rate": 4.44078947368421e-06, "loss": 0.3599, "step": 2667 }, { "epoch": 2.9599223408681183, "grad_norm": 0.42849066853523254, "learning_rate": 4.3174342105263155e-06, "loss": 0.4078, "step": 2668 }, { "epoch": 2.961031757037859, "grad_norm": 0.4770295023918152, "learning_rate": 4.1940789473684206e-06, "loss": 0.5948, "step": 2669 }, { "epoch": 2.9621411732075993, "grad_norm": 0.4041629433631897, "learning_rate": 4.0707236842105264e-06, "loss": 0.3036, "step": 2670 }, { "epoch": 2.96325058937734, "grad_norm": 0.4390498697757721, "learning_rate": 3.947368421052631e-06, "loss": 0.373, "step": 2671 }, { "epoch": 2.964360005547081, "grad_norm": 0.31613367795944214, "learning_rate": 3.8240131578947365e-06, "loss": 0.3119, "step": 2672 }, { "epoch": 2.9654694217168216, "grad_norm": 0.4405744969844818, "learning_rate": 3.700657894736842e-06, "loss": 0.4571, "step": 2673 }, { "epoch": 2.966578837886562, "grad_norm": 0.4392789900302887, "learning_rate": 3.5773026315789474e-06, "loss": 0.4928, "step": 2674 }, { "epoch": 2.967688254056303, "grad_norm": 0.4602966606616974, "learning_rate": 3.453947368421052e-06, "loss": 0.345, "step": 2675 }, { "epoch": 2.9687976702260435, "grad_norm": 0.447457492351532, "learning_rate": 3.3305921052631574e-06, "loss": 0.4556, "step": 2676 }, { "epoch": 2.9699070863957844, "grad_norm": 0.38211768865585327, "learning_rate": 3.207236842105263e-06, "loss": 0.4022, "step": 2677 }, { "epoch": 2.971016502565525, "grad_norm": 0.35372307896614075, "learning_rate": 3.0838815789473684e-06, "loss": 0.3703, "step": 2678 }, { "epoch": 2.9721259187352658, "grad_norm": 0.44302472472190857, "learning_rate": 2.9605263157894734e-06, "loss": 0.4514, "step": 2679 }, { "epoch": 2.9732353349050062, "grad_norm": 0.5012083053588867, "learning_rate": 2.837171052631579e-06, "loss": 0.4311, "step": 2680 }, { "epoch": 2.9743447510747467, "grad_norm": 0.4370708763599396, "learning_rate": 2.7138157894736843e-06, "loss": 0.3342, "step": 2681 }, { "epoch": 2.9754541672444876, "grad_norm": 0.37920454144477844, "learning_rate": 2.5904605263157893e-06, "loss": 0.5212, "step": 2682 }, { "epoch": 2.976563583414228, "grad_norm": 0.2939812242984772, "learning_rate": 2.4671052631578943e-06, "loss": 0.3169, "step": 2683 }, { "epoch": 2.977672999583969, "grad_norm": 0.354949414730072, "learning_rate": 2.34375e-06, "loss": 0.2588, "step": 2684 }, { "epoch": 2.9787824157537095, "grad_norm": 0.334835022687912, "learning_rate": 2.220394736842105e-06, "loss": 0.4058, "step": 2685 }, { "epoch": 2.9798918319234504, "grad_norm": 0.4206099212169647, "learning_rate": 2.0970394736842103e-06, "loss": 0.3528, "step": 2686 }, { "epoch": 2.981001248093191, "grad_norm": 0.4646962583065033, "learning_rate": 1.9736842105263153e-06, "loss": 0.5455, "step": 2687 }, { "epoch": 2.9821106642629314, "grad_norm": 0.4603005647659302, "learning_rate": 1.850328947368421e-06, "loss": 0.3451, "step": 2688 }, { "epoch": 2.9832200804326723, "grad_norm": 0.4108537435531616, "learning_rate": 1.726973684210526e-06, "loss": 0.5365, "step": 2689 }, { "epoch": 2.984329496602413, "grad_norm": 0.43144112825393677, "learning_rate": 1.6036184210526314e-06, "loss": 0.3908, "step": 2690 }, { "epoch": 2.9854389127721537, "grad_norm": 0.5529698133468628, "learning_rate": 1.4802631578947367e-06, "loss": 0.3784, "step": 2691 }, { "epoch": 2.986548328941894, "grad_norm": 0.37649285793304443, "learning_rate": 1.3569078947368421e-06, "loss": 0.4073, "step": 2692 }, { "epoch": 2.987657745111635, "grad_norm": 0.3004691004753113, "learning_rate": 1.2335526315789472e-06, "loss": 0.4305, "step": 2693 }, { "epoch": 2.9887671612813755, "grad_norm": 0.4551496207714081, "learning_rate": 1.1101973684210524e-06, "loss": 0.3474, "step": 2694 }, { "epoch": 2.9898765774511165, "grad_norm": 0.3873107135295868, "learning_rate": 9.868421052631577e-07, "loss": 0.3239, "step": 2695 }, { "epoch": 2.990985993620857, "grad_norm": 0.483542263507843, "learning_rate": 8.63486842105263e-07, "loss": 0.3812, "step": 2696 }, { "epoch": 2.992095409790598, "grad_norm": 0.484693318605423, "learning_rate": 7.401315789473683e-07, "loss": 0.4768, "step": 2697 }, { "epoch": 2.9932048259603383, "grad_norm": 0.3889079988002777, "learning_rate": 6.167763157894736e-07, "loss": 0.5391, "step": 2698 }, { "epoch": 2.994314242130079, "grad_norm": 0.39534711837768555, "learning_rate": 4.934210526315788e-07, "loss": 0.2693, "step": 2699 }, { "epoch": 2.9954236582998197, "grad_norm": 0.28255271911621094, "learning_rate": 3.7006578947368417e-07, "loss": 0.496, "step": 2700 }, { "epoch": 2.9965330744695606, "grad_norm": 0.39558374881744385, "learning_rate": 2.467105263157894e-07, "loss": 0.3213, "step": 2701 }, { "epoch": 2.997642490639301, "grad_norm": 0.39010322093963623, "learning_rate": 1.233552631578947e-07, "loss": 0.3245, "step": 2702 }, { "epoch": 2.9987519068090416, "grad_norm": 0.33332499861717224, "learning_rate": 0.0, "loss": 0.3557, "step": 2703 }, { "epoch": 2.9987519068090416, "step": 2703, "total_flos": 1.203022932864172e+18, "train_loss": 0.49336877679696933, "train_runtime": 13859.7262, "train_samples_per_second": 1.561, "train_steps_per_second": 0.195 } ], "logging_steps": 1.0, "max_steps": 2703, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.203022932864172e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }