{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002717391304347826, "grad_norm": 0.0, "learning_rate": 0, "loss": 0.5829, "step": 1 }, { "epoch": 0.005434782608695652, "grad_norm": 0.0, "learning_rate": 0, "loss": 0.685, "step": 2 }, { "epoch": 0.008152173913043478, "grad_norm": 3.0868945121765137, "learning_rate": 0.0, "loss": 0.7212, "step": 3 }, { "epoch": 0.010869565217391304, "grad_norm": 3.4650824069976807, "learning_rate": 1.5e-05, "loss": 0.6836, "step": 4 }, { "epoch": 0.01358695652173913, "grad_norm": 3.4650824069976807, "learning_rate": 1.5e-05, "loss": 0.5367, "step": 5 }, { "epoch": 0.016304347826086956, "grad_norm": 2.637908935546875, "learning_rate": 2.3774437510817346e-05, "loss": 0.6148, "step": 6 }, { "epoch": 0.019021739130434784, "grad_norm": 1.4185562133789062, "learning_rate": 3e-05, "loss": 0.562, "step": 7 }, { "epoch": 0.021739130434782608, "grad_norm": 3.299220323562622, "learning_rate": 3e-05, "loss": 0.6763, "step": 8 }, { "epoch": 0.024456521739130436, "grad_norm": 3.9320366382598877, "learning_rate": 3e-05, "loss": 0.4243, "step": 9 }, { "epoch": 0.02717391304347826, "grad_norm": 4.625139236450195, "learning_rate": 3e-05, "loss": 0.6627, "step": 10 }, { "epoch": 0.029891304347826088, "grad_norm": 4.016164779663086, "learning_rate": 3e-05, "loss": 0.5826, "step": 11 }, { "epoch": 0.03260869565217391, "grad_norm": 3.5085506439208984, "learning_rate": 3e-05, "loss": 0.7494, "step": 12 }, { "epoch": 0.035326086956521736, "grad_norm": 3.135777235031128, "learning_rate": 3e-05, "loss": 0.5124, "step": 13 }, { "epoch": 0.03804347826086957, "grad_norm": 2.25691294670105, "learning_rate": 3e-05, "loss": 0.5156, "step": 14 }, { "epoch": 0.04076086956521739, "grad_norm": 2.6235625743865967, "learning_rate": 3e-05, "loss": 0.5773, "step": 15 }, { "epoch": 0.043478260869565216, "grad_norm": 1.977939248085022, "learning_rate": 3e-05, "loss": 0.5534, "step": 16 }, { "epoch": 0.04619565217391304, "grad_norm": 3.1098241806030273, "learning_rate": 3e-05, "loss": 0.7569, "step": 17 }, { "epoch": 0.04891304347826087, "grad_norm": 3.5367255210876465, "learning_rate": 3e-05, "loss": 0.6384, "step": 18 }, { "epoch": 0.051630434782608696, "grad_norm": 2.057596445083618, "learning_rate": 3e-05, "loss": 0.5811, "step": 19 }, { "epoch": 0.05434782608695652, "grad_norm": 1.8991059064865112, "learning_rate": 3e-05, "loss": 0.4392, "step": 20 }, { "epoch": 0.057065217391304345, "grad_norm": 2.431248188018799, "learning_rate": 3e-05, "loss": 0.5292, "step": 21 }, { "epoch": 0.059782608695652176, "grad_norm": 2.1876354217529297, "learning_rate": 3e-05, "loss": 0.4795, "step": 22 }, { "epoch": 0.0625, "grad_norm": 2.018975257873535, "learning_rate": 3e-05, "loss": 0.5681, "step": 23 }, { "epoch": 0.06521739130434782, "grad_norm": 1.5382078886032104, "learning_rate": 3e-05, "loss": 0.4643, "step": 24 }, { "epoch": 0.06793478260869565, "grad_norm": 2.18375563621521, "learning_rate": 3e-05, "loss": 0.4675, "step": 25 }, { "epoch": 0.07065217391304347, "grad_norm": 1.810476303100586, "learning_rate": 3e-05, "loss": 0.5259, "step": 26 }, { "epoch": 0.07336956521739131, "grad_norm": 2.9333322048187256, "learning_rate": 3e-05, "loss": 0.6795, "step": 27 }, { "epoch": 0.07608695652173914, "grad_norm": 1.963409185409546, "learning_rate": 3e-05, "loss": 0.5022, "step": 28 }, { "epoch": 0.07880434782608696, "grad_norm": 1.5288830995559692, "learning_rate": 3e-05, "loss": 0.5481, "step": 29 }, { "epoch": 0.08152173913043478, "grad_norm": 2.095547676086426, "learning_rate": 3e-05, "loss": 0.5427, "step": 30 }, { "epoch": 0.08423913043478261, "grad_norm": 2.3924880027770996, "learning_rate": 3e-05, "loss": 0.5936, "step": 31 }, { "epoch": 0.08695652173913043, "grad_norm": 2.6183862686157227, "learning_rate": 3e-05, "loss": 0.5452, "step": 32 }, { "epoch": 0.08967391304347826, "grad_norm": 1.9050800800323486, "learning_rate": 3e-05, "loss": 0.4507, "step": 33 }, { "epoch": 0.09239130434782608, "grad_norm": 2.0894551277160645, "learning_rate": 3e-05, "loss": 0.6293, "step": 34 }, { "epoch": 0.09510869565217392, "grad_norm": 2.201720952987671, "learning_rate": 3e-05, "loss": 0.4793, "step": 35 }, { "epoch": 0.09782608695652174, "grad_norm": 2.516624927520752, "learning_rate": 3e-05, "loss": 0.5658, "step": 36 }, { "epoch": 0.10054347826086957, "grad_norm": 1.703904628753662, "learning_rate": 3e-05, "loss": 0.4968, "step": 37 }, { "epoch": 0.10326086956521739, "grad_norm": 2.972416400909424, "learning_rate": 3e-05, "loss": 0.7267, "step": 38 }, { "epoch": 0.10597826086956522, "grad_norm": 1.9167985916137695, "learning_rate": 3e-05, "loss": 0.4971, "step": 39 }, { "epoch": 0.10869565217391304, "grad_norm": 1.7569814920425415, "learning_rate": 3e-05, "loss": 0.5191, "step": 40 }, { "epoch": 0.11141304347826086, "grad_norm": 2.4087071418762207, "learning_rate": 3e-05, "loss": 0.5737, "step": 41 }, { "epoch": 0.11413043478260869, "grad_norm": 3.517263650894165, "learning_rate": 3e-05, "loss": 0.5688, "step": 42 }, { "epoch": 0.11684782608695653, "grad_norm": 3.5090434551239014, "learning_rate": 3e-05, "loss": 0.4961, "step": 43 }, { "epoch": 0.11956521739130435, "grad_norm": 3.181145668029785, "learning_rate": 3e-05, "loss": 0.8124, "step": 44 }, { "epoch": 0.12228260869565218, "grad_norm": 3.588538885116577, "learning_rate": 3e-05, "loss": 0.5766, "step": 45 }, { "epoch": 0.125, "grad_norm": 3.123659610748291, "learning_rate": 3e-05, "loss": 0.7286, "step": 46 }, { "epoch": 0.12771739130434784, "grad_norm": 2.6902639865875244, "learning_rate": 3e-05, "loss": 0.6418, "step": 47 }, { "epoch": 0.13043478260869565, "grad_norm": 1.6611530780792236, "learning_rate": 3e-05, "loss": 0.5357, "step": 48 }, { "epoch": 0.1331521739130435, "grad_norm": 2.4058802127838135, "learning_rate": 3e-05, "loss": 0.5909, "step": 49 }, { "epoch": 0.1358695652173913, "grad_norm": 2.4058802127838135, "learning_rate": 3e-05, "loss": 0.7068, "step": 50 }, { "epoch": 0.13858695652173914, "grad_norm": 2.513307809829712, "learning_rate": 3e-05, "loss": 0.5384, "step": 51 }, { "epoch": 0.14130434782608695, "grad_norm": 2.7813773155212402, "learning_rate": 3e-05, "loss": 0.5887, "step": 52 }, { "epoch": 0.14402173913043478, "grad_norm": 1.950292706489563, "learning_rate": 3e-05, "loss": 0.5399, "step": 53 }, { "epoch": 0.14673913043478262, "grad_norm": 1.7724437713623047, "learning_rate": 3e-05, "loss": 0.4234, "step": 54 }, { "epoch": 0.14945652173913043, "grad_norm": 3.3404788970947266, "learning_rate": 3e-05, "loss": 0.7648, "step": 55 }, { "epoch": 0.15217391304347827, "grad_norm": 2.424994468688965, "learning_rate": 3e-05, "loss": 0.4959, "step": 56 }, { "epoch": 0.15489130434782608, "grad_norm": 2.12221097946167, "learning_rate": 3e-05, "loss": 0.6107, "step": 57 }, { "epoch": 0.15760869565217392, "grad_norm": 3.7846357822418213, "learning_rate": 3e-05, "loss": 0.6991, "step": 58 }, { "epoch": 0.16032608695652173, "grad_norm": 1.7821156978607178, "learning_rate": 3e-05, "loss": 0.4864, "step": 59 }, { "epoch": 0.16304347826086957, "grad_norm": 1.4435001611709595, "learning_rate": 3e-05, "loss": 0.4782, "step": 60 }, { "epoch": 0.16576086956521738, "grad_norm": 2.097719669342041, "learning_rate": 3e-05, "loss": 0.4761, "step": 61 }, { "epoch": 0.16847826086956522, "grad_norm": 2.082066774368286, "learning_rate": 3e-05, "loss": 0.4565, "step": 62 }, { "epoch": 0.17119565217391305, "grad_norm": 1.8222287893295288, "learning_rate": 3e-05, "loss": 0.5573, "step": 63 }, { "epoch": 0.17391304347826086, "grad_norm": 2.819382667541504, "learning_rate": 3e-05, "loss": 0.5613, "step": 64 }, { "epoch": 0.1766304347826087, "grad_norm": 1.8486182689666748, "learning_rate": 3e-05, "loss": 0.5446, "step": 65 }, { "epoch": 0.1793478260869565, "grad_norm": 1.4942399263381958, "learning_rate": 3e-05, "loss": 0.5744, "step": 66 }, { "epoch": 0.18206521739130435, "grad_norm": 1.9929646253585815, "learning_rate": 3e-05, "loss": 0.4258, "step": 67 }, { "epoch": 0.18478260869565216, "grad_norm": 1.7978485822677612, "learning_rate": 3e-05, "loss": 0.4694, "step": 68 }, { "epoch": 0.1875, "grad_norm": 2.185476064682007, "learning_rate": 3e-05, "loss": 0.5109, "step": 69 }, { "epoch": 0.19021739130434784, "grad_norm": 2.129399538040161, "learning_rate": 3e-05, "loss": 0.4594, "step": 70 }, { "epoch": 0.19293478260869565, "grad_norm": 2.488927125930786, "learning_rate": 3e-05, "loss": 0.5293, "step": 71 }, { "epoch": 0.1956521739130435, "grad_norm": 2.257550001144409, "learning_rate": 3e-05, "loss": 0.5494, "step": 72 }, { "epoch": 0.1983695652173913, "grad_norm": 2.4466147422790527, "learning_rate": 3e-05, "loss": 0.4834, "step": 73 }, { "epoch": 0.20108695652173914, "grad_norm": 2.2242984771728516, "learning_rate": 3e-05, "loss": 0.4992, "step": 74 }, { "epoch": 0.20380434782608695, "grad_norm": 2.867558002471924, "learning_rate": 3e-05, "loss": 0.7131, "step": 75 }, { "epoch": 0.20652173913043478, "grad_norm": 1.8966355323791504, "learning_rate": 3e-05, "loss": 0.5512, "step": 76 }, { "epoch": 0.20923913043478262, "grad_norm": 3.1431808471679688, "learning_rate": 3e-05, "loss": 0.4581, "step": 77 }, { "epoch": 0.21195652173913043, "grad_norm": 3.1641836166381836, "learning_rate": 3e-05, "loss": 0.56, "step": 78 }, { "epoch": 0.21467391304347827, "grad_norm": 1.869903326034546, "learning_rate": 3e-05, "loss": 0.5766, "step": 79 }, { "epoch": 0.21739130434782608, "grad_norm": 2.7795424461364746, "learning_rate": 3e-05, "loss": 0.5911, "step": 80 }, { "epoch": 0.22010869565217392, "grad_norm": 2.3112289905548096, "learning_rate": 3e-05, "loss": 0.5765, "step": 81 }, { "epoch": 0.22282608695652173, "grad_norm": 1.8768959045410156, "learning_rate": 3e-05, "loss": 0.4489, "step": 82 }, { "epoch": 0.22554347826086957, "grad_norm": 1.8480079174041748, "learning_rate": 3e-05, "loss": 0.6242, "step": 83 }, { "epoch": 0.22826086956521738, "grad_norm": 2.04072904586792, "learning_rate": 3e-05, "loss": 0.5061, "step": 84 }, { "epoch": 0.23097826086956522, "grad_norm": 1.778836965560913, "learning_rate": 3e-05, "loss": 0.5375, "step": 85 }, { "epoch": 0.23369565217391305, "grad_norm": 2.0883471965789795, "learning_rate": 3e-05, "loss": 0.5375, "step": 86 }, { "epoch": 0.23641304347826086, "grad_norm": 1.8592745065689087, "learning_rate": 3e-05, "loss": 0.4972, "step": 87 }, { "epoch": 0.2391304347826087, "grad_norm": 1.817451000213623, "learning_rate": 3e-05, "loss": 0.5005, "step": 88 }, { "epoch": 0.2418478260869565, "grad_norm": 3.28531813621521, "learning_rate": 3e-05, "loss": 0.6246, "step": 89 }, { "epoch": 0.24456521739130435, "grad_norm": 1.707846760749817, "learning_rate": 3e-05, "loss": 0.4922, "step": 90 }, { "epoch": 0.24728260869565216, "grad_norm": 2.3756139278411865, "learning_rate": 3e-05, "loss": 0.5826, "step": 91 }, { "epoch": 0.25, "grad_norm": 1.2630126476287842, "learning_rate": 3e-05, "loss": 0.5304, "step": 92 }, { "epoch": 0.25271739130434784, "grad_norm": 2.259389877319336, "learning_rate": 3e-05, "loss": 0.5615, "step": 93 }, { "epoch": 0.2554347826086957, "grad_norm": 1.3903180360794067, "learning_rate": 3e-05, "loss": 0.4406, "step": 94 }, { "epoch": 0.25815217391304346, "grad_norm": 1.9753227233886719, "learning_rate": 3e-05, "loss": 0.4809, "step": 95 }, { "epoch": 0.2608695652173913, "grad_norm": 1.8999621868133545, "learning_rate": 3e-05, "loss": 0.5946, "step": 96 }, { "epoch": 0.26358695652173914, "grad_norm": 1.9928959608078003, "learning_rate": 3e-05, "loss": 0.5532, "step": 97 }, { "epoch": 0.266304347826087, "grad_norm": 1.7581721544265747, "learning_rate": 3e-05, "loss": 0.425, "step": 98 }, { "epoch": 0.26902173913043476, "grad_norm": 3.1840996742248535, "learning_rate": 3e-05, "loss": 0.6684, "step": 99 }, { "epoch": 0.2717391304347826, "grad_norm": 2.4350733757019043, "learning_rate": 3e-05, "loss": 0.5354, "step": 100 }, { "epoch": 0.27445652173913043, "grad_norm": 2.1404078006744385, "learning_rate": 3e-05, "loss": 0.4751, "step": 101 }, { "epoch": 0.27717391304347827, "grad_norm": 3.169942617416382, "learning_rate": 3e-05, "loss": 0.7145, "step": 102 }, { "epoch": 0.2798913043478261, "grad_norm": 1.2857766151428223, "learning_rate": 3e-05, "loss": 0.4536, "step": 103 }, { "epoch": 0.2826086956521739, "grad_norm": 2.7525951862335205, "learning_rate": 3e-05, "loss": 0.5892, "step": 104 }, { "epoch": 0.28532608695652173, "grad_norm": 1.6672691106796265, "learning_rate": 3e-05, "loss": 0.5593, "step": 105 }, { "epoch": 0.28804347826086957, "grad_norm": 1.2421808242797852, "learning_rate": 3e-05, "loss": 0.4584, "step": 106 }, { "epoch": 0.2907608695652174, "grad_norm": 3.0662600994110107, "learning_rate": 3e-05, "loss": 0.564, "step": 107 }, { "epoch": 0.29347826086956524, "grad_norm": 1.9544175863265991, "learning_rate": 3e-05, "loss": 0.5583, "step": 108 }, { "epoch": 0.296195652173913, "grad_norm": 2.374974250793457, "learning_rate": 3e-05, "loss": 0.5546, "step": 109 }, { "epoch": 0.29891304347826086, "grad_norm": 1.730804443359375, "learning_rate": 3e-05, "loss": 0.5021, "step": 110 }, { "epoch": 0.3016304347826087, "grad_norm": 2.308568239212036, "learning_rate": 3e-05, "loss": 0.6143, "step": 111 }, { "epoch": 0.30434782608695654, "grad_norm": 2.2530875205993652, "learning_rate": 3e-05, "loss": 0.5695, "step": 112 }, { "epoch": 0.3070652173913043, "grad_norm": 1.4939433336257935, "learning_rate": 3e-05, "loss": 0.4982, "step": 113 }, { "epoch": 0.30978260869565216, "grad_norm": 1.8290050029754639, "learning_rate": 3e-05, "loss": 0.4945, "step": 114 }, { "epoch": 0.3125, "grad_norm": 2.5083842277526855, "learning_rate": 3e-05, "loss": 0.4693, "step": 115 }, { "epoch": 0.31521739130434784, "grad_norm": 3.2338268756866455, "learning_rate": 3e-05, "loss": 0.6578, "step": 116 }, { "epoch": 0.3179347826086957, "grad_norm": 2.3117527961730957, "learning_rate": 3e-05, "loss": 0.5395, "step": 117 }, { "epoch": 0.32065217391304346, "grad_norm": 1.5958501100540161, "learning_rate": 3e-05, "loss": 0.4975, "step": 118 }, { "epoch": 0.3233695652173913, "grad_norm": 2.4708805084228516, "learning_rate": 3e-05, "loss": 0.4652, "step": 119 }, { "epoch": 0.32608695652173914, "grad_norm": 1.6753262281417847, "learning_rate": 3e-05, "loss": 0.54, "step": 120 }, { "epoch": 0.328804347826087, "grad_norm": 2.4457895755767822, "learning_rate": 3e-05, "loss": 0.5612, "step": 121 }, { "epoch": 0.33152173913043476, "grad_norm": 2.059176445007324, "learning_rate": 3e-05, "loss": 0.6599, "step": 122 }, { "epoch": 0.3342391304347826, "grad_norm": 2.1220273971557617, "learning_rate": 3e-05, "loss": 0.5256, "step": 123 }, { "epoch": 0.33695652173913043, "grad_norm": 1.756909728050232, "learning_rate": 3e-05, "loss": 0.4839, "step": 124 }, { "epoch": 0.33967391304347827, "grad_norm": 2.6595497131347656, "learning_rate": 3e-05, "loss": 0.626, "step": 125 }, { "epoch": 0.3423913043478261, "grad_norm": 2.2510428428649902, "learning_rate": 3e-05, "loss": 0.5029, "step": 126 }, { "epoch": 0.3451086956521739, "grad_norm": 3.559722423553467, "learning_rate": 3e-05, "loss": 0.647, "step": 127 }, { "epoch": 0.34782608695652173, "grad_norm": 2.1882569789886475, "learning_rate": 3e-05, "loss": 0.6679, "step": 128 }, { "epoch": 0.35054347826086957, "grad_norm": 2.5354092121124268, "learning_rate": 3e-05, "loss": 0.6344, "step": 129 }, { "epoch": 0.3532608695652174, "grad_norm": 2.0643749237060547, "learning_rate": 3e-05, "loss": 0.4314, "step": 130 }, { "epoch": 0.35597826086956524, "grad_norm": 1.6011096239089966, "learning_rate": 3e-05, "loss": 0.4533, "step": 131 }, { "epoch": 0.358695652173913, "grad_norm": 2.553201913833618, "learning_rate": 3e-05, "loss": 0.6543, "step": 132 }, { "epoch": 0.36141304347826086, "grad_norm": 2.410280704498291, "learning_rate": 3e-05, "loss": 0.5308, "step": 133 }, { "epoch": 0.3641304347826087, "grad_norm": 1.7064554691314697, "learning_rate": 3e-05, "loss": 0.4105, "step": 134 }, { "epoch": 0.36684782608695654, "grad_norm": 2.6405141353607178, "learning_rate": 3e-05, "loss": 0.5206, "step": 135 }, { "epoch": 0.3695652173913043, "grad_norm": 1.699704885482788, "learning_rate": 3e-05, "loss": 0.6593, "step": 136 }, { "epoch": 0.37228260869565216, "grad_norm": 1.8388729095458984, "learning_rate": 3e-05, "loss": 0.4821, "step": 137 }, { "epoch": 0.375, "grad_norm": 2.7615950107574463, "learning_rate": 3e-05, "loss": 0.5692, "step": 138 }, { "epoch": 0.37771739130434784, "grad_norm": 1.835686206817627, "learning_rate": 3e-05, "loss": 0.5826, "step": 139 }, { "epoch": 0.3804347826086957, "grad_norm": 1.6995174884796143, "learning_rate": 3e-05, "loss": 0.4742, "step": 140 }, { "epoch": 0.38315217391304346, "grad_norm": 1.6208698749542236, "learning_rate": 3e-05, "loss": 0.4437, "step": 141 }, { "epoch": 0.3858695652173913, "grad_norm": 1.2213523387908936, "learning_rate": 3e-05, "loss": 0.4037, "step": 142 }, { "epoch": 0.38858695652173914, "grad_norm": 1.5828511714935303, "learning_rate": 3e-05, "loss": 0.4793, "step": 143 }, { "epoch": 0.391304347826087, "grad_norm": 3.4537761211395264, "learning_rate": 3e-05, "loss": 0.4866, "step": 144 }, { "epoch": 0.39402173913043476, "grad_norm": 1.346982479095459, "learning_rate": 3e-05, "loss": 0.4468, "step": 145 }, { "epoch": 0.3967391304347826, "grad_norm": 3.6017796993255615, "learning_rate": 3e-05, "loss": 0.6744, "step": 146 }, { "epoch": 0.39945652173913043, "grad_norm": 1.896022915840149, "learning_rate": 3e-05, "loss": 0.5816, "step": 147 }, { "epoch": 0.40217391304347827, "grad_norm": 1.970211148262024, "learning_rate": 3e-05, "loss": 0.5449, "step": 148 }, { "epoch": 0.4048913043478261, "grad_norm": 2.315774917602539, "learning_rate": 3e-05, "loss": 0.5042, "step": 149 }, { "epoch": 0.4076086956521739, "grad_norm": 1.7873855829238892, "learning_rate": 3e-05, "loss": 0.5093, "step": 150 }, { "epoch": 0.41032608695652173, "grad_norm": 2.479135036468506, "learning_rate": 3e-05, "loss": 0.6396, "step": 151 }, { "epoch": 0.41304347826086957, "grad_norm": 1.4740537405014038, "learning_rate": 3e-05, "loss": 0.4847, "step": 152 }, { "epoch": 0.4157608695652174, "grad_norm": 1.8202875852584839, "learning_rate": 3e-05, "loss": 0.565, "step": 153 }, { "epoch": 0.41847826086956524, "grad_norm": 2.06355881690979, "learning_rate": 3e-05, "loss": 0.5481, "step": 154 }, { "epoch": 0.421195652173913, "grad_norm": 1.522354245185852, "learning_rate": 3e-05, "loss": 0.4835, "step": 155 }, { "epoch": 0.42391304347826086, "grad_norm": 1.9054198265075684, "learning_rate": 3e-05, "loss": 0.5304, "step": 156 }, { "epoch": 0.4266304347826087, "grad_norm": 1.7483571767807007, "learning_rate": 3e-05, "loss": 0.4882, "step": 157 }, { "epoch": 0.42934782608695654, "grad_norm": 1.8139662742614746, "learning_rate": 3e-05, "loss": 0.6033, "step": 158 }, { "epoch": 0.4320652173913043, "grad_norm": 3.0108718872070312, "learning_rate": 3e-05, "loss": 0.837, "step": 159 }, { "epoch": 0.43478260869565216, "grad_norm": 1.5665092468261719, "learning_rate": 3e-05, "loss": 0.5234, "step": 160 }, { "epoch": 0.4375, "grad_norm": 3.6204946041107178, "learning_rate": 3e-05, "loss": 0.5096, "step": 161 }, { "epoch": 0.44021739130434784, "grad_norm": 2.1590631008148193, "learning_rate": 3e-05, "loss": 0.6107, "step": 162 }, { "epoch": 0.4429347826086957, "grad_norm": 1.8781529664993286, "learning_rate": 3e-05, "loss": 0.5122, "step": 163 }, { "epoch": 0.44565217391304346, "grad_norm": 1.7852369546890259, "learning_rate": 3e-05, "loss": 0.4954, "step": 164 }, { "epoch": 0.4483695652173913, "grad_norm": 1.3662923574447632, "learning_rate": 3e-05, "loss": 0.493, "step": 165 }, { "epoch": 0.45108695652173914, "grad_norm": 2.7863638401031494, "learning_rate": 3e-05, "loss": 0.6855, "step": 166 }, { "epoch": 0.453804347826087, "grad_norm": 2.3714311122894287, "learning_rate": 3e-05, "loss": 0.5858, "step": 167 }, { "epoch": 0.45652173913043476, "grad_norm": 2.097414970397949, "learning_rate": 3e-05, "loss": 0.4895, "step": 168 }, { "epoch": 0.4592391304347826, "grad_norm": 2.0122313499450684, "learning_rate": 3e-05, "loss": 0.4241, "step": 169 }, { "epoch": 0.46195652173913043, "grad_norm": 1.5152671337127686, "learning_rate": 3e-05, "loss": 0.4318, "step": 170 }, { "epoch": 0.46467391304347827, "grad_norm": 2.7088394165039062, "learning_rate": 3e-05, "loss": 0.6514, "step": 171 }, { "epoch": 0.4673913043478261, "grad_norm": 1.5779036283493042, "learning_rate": 3e-05, "loss": 0.4605, "step": 172 }, { "epoch": 0.4701086956521739, "grad_norm": 2.193013906478882, "learning_rate": 3e-05, "loss": 0.6419, "step": 173 }, { "epoch": 0.47282608695652173, "grad_norm": 1.4573429822921753, "learning_rate": 3e-05, "loss": 0.4391, "step": 174 }, { "epoch": 0.47554347826086957, "grad_norm": 1.719489574432373, "learning_rate": 3e-05, "loss": 0.6271, "step": 175 }, { "epoch": 0.4782608695652174, "grad_norm": 1.491647481918335, "learning_rate": 3e-05, "loss": 0.5155, "step": 176 }, { "epoch": 0.48097826086956524, "grad_norm": 2.1597378253936768, "learning_rate": 3e-05, "loss": 0.5132, "step": 177 }, { "epoch": 0.483695652173913, "grad_norm": 1.62423574924469, "learning_rate": 3e-05, "loss": 0.4454, "step": 178 }, { "epoch": 0.48641304347826086, "grad_norm": 2.4392452239990234, "learning_rate": 3e-05, "loss": 0.5913, "step": 179 }, { "epoch": 0.4891304347826087, "grad_norm": 2.407681703567505, "learning_rate": 3e-05, "loss": 0.716, "step": 180 }, { "epoch": 0.49184782608695654, "grad_norm": 2.402573585510254, "learning_rate": 3e-05, "loss": 0.5874, "step": 181 }, { "epoch": 0.4945652173913043, "grad_norm": 1.8457105159759521, "learning_rate": 3e-05, "loss": 0.5029, "step": 182 }, { "epoch": 0.49728260869565216, "grad_norm": 2.4062929153442383, "learning_rate": 3e-05, "loss": 0.6185, "step": 183 }, { "epoch": 0.5, "grad_norm": 2.0781688690185547, "learning_rate": 3e-05, "loss": 0.6093, "step": 184 }, { "epoch": 0.5027173913043478, "grad_norm": 1.5497978925704956, "learning_rate": 3e-05, "loss": 0.5322, "step": 185 }, { "epoch": 0.5054347826086957, "grad_norm": 4.002748489379883, "learning_rate": 3e-05, "loss": 0.7277, "step": 186 }, { "epoch": 0.5081521739130435, "grad_norm": 1.3743958473205566, "learning_rate": 3e-05, "loss": 0.4846, "step": 187 }, { "epoch": 0.5108695652173914, "grad_norm": 1.8466805219650269, "learning_rate": 3e-05, "loss": 0.4154, "step": 188 }, { "epoch": 0.5135869565217391, "grad_norm": 2.3035359382629395, "learning_rate": 3e-05, "loss": 0.5205, "step": 189 }, { "epoch": 0.5163043478260869, "grad_norm": 2.2272605895996094, "learning_rate": 3e-05, "loss": 0.5532, "step": 190 }, { "epoch": 0.5190217391304348, "grad_norm": 1.7287522554397583, "learning_rate": 3e-05, "loss": 0.4787, "step": 191 }, { "epoch": 0.5217391304347826, "grad_norm": 2.0177953243255615, "learning_rate": 3e-05, "loss": 0.4949, "step": 192 }, { "epoch": 0.5244565217391305, "grad_norm": 1.2431449890136719, "learning_rate": 3e-05, "loss": 0.4026, "step": 193 }, { "epoch": 0.5271739130434783, "grad_norm": 1.2249736785888672, "learning_rate": 3e-05, "loss": 0.5167, "step": 194 }, { "epoch": 0.529891304347826, "grad_norm": 1.9964460134506226, "learning_rate": 3e-05, "loss": 0.5396, "step": 195 }, { "epoch": 0.532608695652174, "grad_norm": 1.4250489473342896, "learning_rate": 3e-05, "loss": 0.4755, "step": 196 }, { "epoch": 0.5353260869565217, "grad_norm": 3.141204833984375, "learning_rate": 3e-05, "loss": 0.6342, "step": 197 }, { "epoch": 0.5380434782608695, "grad_norm": 2.3809869289398193, "learning_rate": 3e-05, "loss": 0.5182, "step": 198 }, { "epoch": 0.5407608695652174, "grad_norm": 3.068497896194458, "learning_rate": 3e-05, "loss": 0.789, "step": 199 }, { "epoch": 0.5434782608695652, "grad_norm": 1.3273123502731323, "learning_rate": 3e-05, "loss": 0.5361, "step": 200 }, { "epoch": 0.5461956521739131, "grad_norm": 1.2408298254013062, "learning_rate": 3e-05, "loss": 0.4686, "step": 201 }, { "epoch": 0.5489130434782609, "grad_norm": 2.427406072616577, "learning_rate": 3e-05, "loss": 0.594, "step": 202 }, { "epoch": 0.5516304347826086, "grad_norm": 2.725402593612671, "learning_rate": 3e-05, "loss": 0.5054, "step": 203 }, { "epoch": 0.5543478260869565, "grad_norm": 2.7441318035125732, "learning_rate": 3e-05, "loss": 0.7141, "step": 204 }, { "epoch": 0.5570652173913043, "grad_norm": 1.1885932683944702, "learning_rate": 3e-05, "loss": 0.3824, "step": 205 }, { "epoch": 0.5597826086956522, "grad_norm": 1.8625751733779907, "learning_rate": 3e-05, "loss": 0.5073, "step": 206 }, { "epoch": 0.5625, "grad_norm": 1.8269888162612915, "learning_rate": 3e-05, "loss": 0.5006, "step": 207 }, { "epoch": 0.5652173913043478, "grad_norm": 1.4195995330810547, "learning_rate": 3e-05, "loss": 0.4833, "step": 208 }, { "epoch": 0.5679347826086957, "grad_norm": 1.4971282482147217, "learning_rate": 3e-05, "loss": 0.527, "step": 209 }, { "epoch": 0.5706521739130435, "grad_norm": 2.9385082721710205, "learning_rate": 3e-05, "loss": 0.6193, "step": 210 }, { "epoch": 0.5733695652173914, "grad_norm": 1.6641467809677124, "learning_rate": 3e-05, "loss": 0.6115, "step": 211 }, { "epoch": 0.5760869565217391, "grad_norm": 2.1166892051696777, "learning_rate": 3e-05, "loss": 0.6219, "step": 212 }, { "epoch": 0.5788043478260869, "grad_norm": 2.319002866744995, "learning_rate": 3e-05, "loss": 0.5535, "step": 213 }, { "epoch": 0.5815217391304348, "grad_norm": 2.2194128036499023, "learning_rate": 3e-05, "loss": 0.5677, "step": 214 }, { "epoch": 0.5842391304347826, "grad_norm": 2.779507637023926, "learning_rate": 3e-05, "loss": 0.6392, "step": 215 }, { "epoch": 0.5869565217391305, "grad_norm": 2.3067514896392822, "learning_rate": 3e-05, "loss": 0.6002, "step": 216 }, { "epoch": 0.5896739130434783, "grad_norm": 1.7696046829223633, "learning_rate": 3e-05, "loss": 0.4951, "step": 217 }, { "epoch": 0.592391304347826, "grad_norm": 1.649396538734436, "learning_rate": 3e-05, "loss": 0.4799, "step": 218 }, { "epoch": 0.595108695652174, "grad_norm": 2.204364538192749, "learning_rate": 3e-05, "loss": 0.5045, "step": 219 }, { "epoch": 0.5978260869565217, "grad_norm": 1.8227981328964233, "learning_rate": 3e-05, "loss": 0.5669, "step": 220 }, { "epoch": 0.6005434782608695, "grad_norm": 1.8465766906738281, "learning_rate": 3e-05, "loss": 0.6157, "step": 221 }, { "epoch": 0.6032608695652174, "grad_norm": 2.102964162826538, "learning_rate": 3e-05, "loss": 0.4715, "step": 222 }, { "epoch": 0.6059782608695652, "grad_norm": 1.6074268817901611, "learning_rate": 3e-05, "loss": 0.48, "step": 223 }, { "epoch": 0.6086956521739131, "grad_norm": 1.9385191202163696, "learning_rate": 3e-05, "loss": 0.4727, "step": 224 }, { "epoch": 0.6114130434782609, "grad_norm": 2.639068603515625, "learning_rate": 3e-05, "loss": 0.6696, "step": 225 }, { "epoch": 0.6141304347826086, "grad_norm": 1.9903889894485474, "learning_rate": 3e-05, "loss": 0.5696, "step": 226 }, { "epoch": 0.6168478260869565, "grad_norm": 2.7789058685302734, "learning_rate": 3e-05, "loss": 0.6264, "step": 227 }, { "epoch": 0.6195652173913043, "grad_norm": 2.7294304370880127, "learning_rate": 3e-05, "loss": 0.5185, "step": 228 }, { "epoch": 0.6222826086956522, "grad_norm": 2.1425840854644775, "learning_rate": 3e-05, "loss": 0.4822, "step": 229 }, { "epoch": 0.625, "grad_norm": 2.22175669670105, "learning_rate": 3e-05, "loss": 0.6414, "step": 230 }, { "epoch": 0.6277173913043478, "grad_norm": 2.031067132949829, "learning_rate": 3e-05, "loss": 0.4099, "step": 231 }, { "epoch": 0.6304347826086957, "grad_norm": 2.0958268642425537, "learning_rate": 3e-05, "loss": 0.7023, "step": 232 }, { "epoch": 0.6331521739130435, "grad_norm": 2.353266716003418, "learning_rate": 3e-05, "loss": 0.5861, "step": 233 }, { "epoch": 0.6358695652173914, "grad_norm": 2.579880475997925, "learning_rate": 3e-05, "loss": 0.6487, "step": 234 }, { "epoch": 0.6385869565217391, "grad_norm": 2.1880500316619873, "learning_rate": 3e-05, "loss": 0.6574, "step": 235 }, { "epoch": 0.6413043478260869, "grad_norm": 1.692370891571045, "learning_rate": 3e-05, "loss": 0.443, "step": 236 }, { "epoch": 0.6440217391304348, "grad_norm": 1.253108263015747, "learning_rate": 3e-05, "loss": 0.3879, "step": 237 }, { "epoch": 0.6467391304347826, "grad_norm": 2.0025603771209717, "learning_rate": 3e-05, "loss": 0.5068, "step": 238 }, { "epoch": 0.6494565217391305, "grad_norm": 2.018907070159912, "learning_rate": 3e-05, "loss": 0.5783, "step": 239 }, { "epoch": 0.6521739130434783, "grad_norm": 1.8426876068115234, "learning_rate": 3e-05, "loss": 0.5025, "step": 240 }, { "epoch": 0.654891304347826, "grad_norm": 1.7005494832992554, "learning_rate": 3e-05, "loss": 0.4745, "step": 241 }, { "epoch": 0.657608695652174, "grad_norm": 1.503015398979187, "learning_rate": 3e-05, "loss": 0.5462, "step": 242 }, { "epoch": 0.6603260869565217, "grad_norm": 1.5778555870056152, "learning_rate": 3e-05, "loss": 0.4247, "step": 243 }, { "epoch": 0.6630434782608695, "grad_norm": 2.054659843444824, "learning_rate": 3e-05, "loss": 0.5232, "step": 244 }, { "epoch": 0.6657608695652174, "grad_norm": 1.7915781736373901, "learning_rate": 3e-05, "loss": 0.501, "step": 245 }, { "epoch": 0.6684782608695652, "grad_norm": 2.16675066947937, "learning_rate": 3e-05, "loss": 0.6102, "step": 246 }, { "epoch": 0.6711956521739131, "grad_norm": 2.503446578979492, "learning_rate": 3e-05, "loss": 0.5873, "step": 247 }, { "epoch": 0.6739130434782609, "grad_norm": 1.6269731521606445, "learning_rate": 3e-05, "loss": 0.4125, "step": 248 }, { "epoch": 0.6766304347826086, "grad_norm": 1.795397162437439, "learning_rate": 3e-05, "loss": 0.5117, "step": 249 }, { "epoch": 0.6793478260869565, "grad_norm": 2.1140689849853516, "learning_rate": 3e-05, "loss": 0.494, "step": 250 }, { "epoch": 0.6820652173913043, "grad_norm": 1.8272490501403809, "learning_rate": 3e-05, "loss": 0.506, "step": 251 }, { "epoch": 0.6847826086956522, "grad_norm": 1.8032358884811401, "learning_rate": 3e-05, "loss": 0.5329, "step": 252 }, { "epoch": 0.6875, "grad_norm": 2.1028482913970947, "learning_rate": 3e-05, "loss": 0.5306, "step": 253 }, { "epoch": 0.6902173913043478, "grad_norm": 3.13128399848938, "learning_rate": 3e-05, "loss": 0.6008, "step": 254 }, { "epoch": 0.6929347826086957, "grad_norm": 1.6769640445709229, "learning_rate": 3e-05, "loss": 0.5309, "step": 255 }, { "epoch": 0.6956521739130435, "grad_norm": 1.5554113388061523, "learning_rate": 3e-05, "loss": 0.5298, "step": 256 }, { "epoch": 0.6983695652173914, "grad_norm": 1.408714771270752, "learning_rate": 3e-05, "loss": 0.4415, "step": 257 }, { "epoch": 0.7010869565217391, "grad_norm": 1.4659717082977295, "learning_rate": 3e-05, "loss": 0.3927, "step": 258 }, { "epoch": 0.7038043478260869, "grad_norm": 1.5185580253601074, "learning_rate": 3e-05, "loss": 0.5113, "step": 259 }, { "epoch": 0.7065217391304348, "grad_norm": 1.2095264196395874, "learning_rate": 3e-05, "loss": 0.4989, "step": 260 }, { "epoch": 0.7092391304347826, "grad_norm": 1.7118961811065674, "learning_rate": 3e-05, "loss": 0.5711, "step": 261 }, { "epoch": 0.7119565217391305, "grad_norm": 1.268684983253479, "learning_rate": 3e-05, "loss": 0.4141, "step": 262 }, { "epoch": 0.7146739130434783, "grad_norm": 1.5639039278030396, "learning_rate": 3e-05, "loss": 0.3954, "step": 263 }, { "epoch": 0.717391304347826, "grad_norm": 1.5051435232162476, "learning_rate": 3e-05, "loss": 0.5565, "step": 264 }, { "epoch": 0.720108695652174, "grad_norm": 2.289475679397583, "learning_rate": 3e-05, "loss": 0.6248, "step": 265 }, { "epoch": 0.7228260869565217, "grad_norm": 2.8152410984039307, "learning_rate": 3e-05, "loss": 0.5782, "step": 266 }, { "epoch": 0.7255434782608695, "grad_norm": 1.3830969333648682, "learning_rate": 3e-05, "loss": 0.3414, "step": 267 }, { "epoch": 0.7282608695652174, "grad_norm": 1.6137558221817017, "learning_rate": 3e-05, "loss": 0.5093, "step": 268 }, { "epoch": 0.7309782608695652, "grad_norm": 2.456444025039673, "learning_rate": 3e-05, "loss": 0.5382, "step": 269 }, { "epoch": 0.7336956521739131, "grad_norm": 2.212475061416626, "learning_rate": 3e-05, "loss": 0.4898, "step": 270 }, { "epoch": 0.7364130434782609, "grad_norm": 2.4707953929901123, "learning_rate": 3e-05, "loss": 0.7499, "step": 271 }, { "epoch": 0.7391304347826086, "grad_norm": 1.7205860614776611, "learning_rate": 3e-05, "loss": 0.4294, "step": 272 }, { "epoch": 0.7418478260869565, "grad_norm": 2.055875062942505, "learning_rate": 3e-05, "loss": 0.6336, "step": 273 }, { "epoch": 0.7445652173913043, "grad_norm": 2.388607978820801, "learning_rate": 3e-05, "loss": 0.5929, "step": 274 }, { "epoch": 0.7472826086956522, "grad_norm": 2.509526491165161, "learning_rate": 3e-05, "loss": 0.6704, "step": 275 }, { "epoch": 0.75, "grad_norm": 1.3426049947738647, "learning_rate": 3e-05, "loss": 0.4698, "step": 276 }, { "epoch": 0.7527173913043478, "grad_norm": 1.8016018867492676, "learning_rate": 3e-05, "loss": 0.5052, "step": 277 }, { "epoch": 0.7554347826086957, "grad_norm": 1.7007917165756226, "learning_rate": 3e-05, "loss": 0.5279, "step": 278 }, { "epoch": 0.7581521739130435, "grad_norm": 2.104093074798584, "learning_rate": 3e-05, "loss": 0.5401, "step": 279 }, { "epoch": 0.7608695652173914, "grad_norm": 2.2383487224578857, "learning_rate": 3e-05, "loss": 0.5503, "step": 280 }, { "epoch": 0.7635869565217391, "grad_norm": 1.8349882364273071, "learning_rate": 3e-05, "loss": 0.539, "step": 281 }, { "epoch": 0.7663043478260869, "grad_norm": 1.6838781833648682, "learning_rate": 3e-05, "loss": 0.4281, "step": 282 }, { "epoch": 0.7690217391304348, "grad_norm": 2.3261828422546387, "learning_rate": 3e-05, "loss": 0.507, "step": 283 }, { "epoch": 0.7717391304347826, "grad_norm": 1.6605132818222046, "learning_rate": 3e-05, "loss": 0.6602, "step": 284 }, { "epoch": 0.7744565217391305, "grad_norm": 2.164030075073242, "learning_rate": 3e-05, "loss": 0.4894, "step": 285 }, { "epoch": 0.7771739130434783, "grad_norm": 1.8546051979064941, "learning_rate": 3e-05, "loss": 0.44, "step": 286 }, { "epoch": 0.779891304347826, "grad_norm": 2.498770236968994, "learning_rate": 3e-05, "loss": 0.627, "step": 287 }, { "epoch": 0.782608695652174, "grad_norm": 2.179408550262451, "learning_rate": 3e-05, "loss": 0.6078, "step": 288 }, { "epoch": 0.7853260869565217, "grad_norm": 1.6444724798202515, "learning_rate": 3e-05, "loss": 0.56, "step": 289 }, { "epoch": 0.7880434782608695, "grad_norm": 1.3116724491119385, "learning_rate": 3e-05, "loss": 0.4838, "step": 290 }, { "epoch": 0.7907608695652174, "grad_norm": 2.142730951309204, "learning_rate": 3e-05, "loss": 0.5892, "step": 291 }, { "epoch": 0.7934782608695652, "grad_norm": 1.947159767150879, "learning_rate": 3e-05, "loss": 0.4796, "step": 292 }, { "epoch": 0.7961956521739131, "grad_norm": 1.7489418983459473, "learning_rate": 3e-05, "loss": 0.6132, "step": 293 }, { "epoch": 0.7989130434782609, "grad_norm": 1.7919461727142334, "learning_rate": 3e-05, "loss": 0.4928, "step": 294 }, { "epoch": 0.8016304347826086, "grad_norm": 2.547961711883545, "learning_rate": 3e-05, "loss": 0.491, "step": 295 }, { "epoch": 0.8043478260869565, "grad_norm": 2.5002658367156982, "learning_rate": 3e-05, "loss": 0.6737, "step": 296 }, { "epoch": 0.8070652173913043, "grad_norm": 1.5400561094284058, "learning_rate": 3e-05, "loss": 0.4632, "step": 297 }, { "epoch": 0.8097826086956522, "grad_norm": 1.2512913942337036, "learning_rate": 3e-05, "loss": 0.563, "step": 298 }, { "epoch": 0.8125, "grad_norm": 1.7560540437698364, "learning_rate": 3e-05, "loss": 0.5273, "step": 299 }, { "epoch": 0.8152173913043478, "grad_norm": 2.1025469303131104, "learning_rate": 3e-05, "loss": 0.5813, "step": 300 }, { "epoch": 0.8179347826086957, "grad_norm": 1.748838186264038, "learning_rate": 3e-05, "loss": 0.5182, "step": 301 }, { "epoch": 0.8206521739130435, "grad_norm": 1.7875301837921143, "learning_rate": 3e-05, "loss": 0.5248, "step": 302 }, { "epoch": 0.8233695652173914, "grad_norm": 1.6547335386276245, "learning_rate": 3e-05, "loss": 0.4525, "step": 303 }, { "epoch": 0.8260869565217391, "grad_norm": 2.5749781131744385, "learning_rate": 3e-05, "loss": 0.5864, "step": 304 }, { "epoch": 0.8288043478260869, "grad_norm": 2.010406255722046, "learning_rate": 3e-05, "loss": 0.4704, "step": 305 }, { "epoch": 0.8315217391304348, "grad_norm": 1.61479914188385, "learning_rate": 3e-05, "loss": 0.4981, "step": 306 }, { "epoch": 0.8342391304347826, "grad_norm": 2.3102943897247314, "learning_rate": 3e-05, "loss": 0.4854, "step": 307 }, { "epoch": 0.8369565217391305, "grad_norm": 4.297898769378662, "learning_rate": 3e-05, "loss": 0.6969, "step": 308 }, { "epoch": 0.8396739130434783, "grad_norm": 1.56645667552948, "learning_rate": 3e-05, "loss": 0.4675, "step": 309 }, { "epoch": 0.842391304347826, "grad_norm": 2.9031646251678467, "learning_rate": 3e-05, "loss": 0.713, "step": 310 }, { "epoch": 0.845108695652174, "grad_norm": 1.9214662313461304, "learning_rate": 3e-05, "loss": 0.5379, "step": 311 }, { "epoch": 0.8478260869565217, "grad_norm": 2.371910810470581, "learning_rate": 3e-05, "loss": 0.5174, "step": 312 }, { "epoch": 0.8505434782608695, "grad_norm": 2.3552019596099854, "learning_rate": 3e-05, "loss": 0.4784, "step": 313 }, { "epoch": 0.8532608695652174, "grad_norm": 1.3287854194641113, "learning_rate": 3e-05, "loss": 0.5026, "step": 314 }, { "epoch": 0.8559782608695652, "grad_norm": 3.1460154056549072, "learning_rate": 3e-05, "loss": 0.6437, "step": 315 }, { "epoch": 0.8586956521739131, "grad_norm": 3.9493658542633057, "learning_rate": 3e-05, "loss": 0.7079, "step": 316 }, { "epoch": 0.8614130434782609, "grad_norm": 1.8890721797943115, "learning_rate": 3e-05, "loss": 0.4789, "step": 317 }, { "epoch": 0.8641304347826086, "grad_norm": 2.397585153579712, "learning_rate": 3e-05, "loss": 0.5358, "step": 318 }, { "epoch": 0.8668478260869565, "grad_norm": 3.6481263637542725, "learning_rate": 3e-05, "loss": 0.6699, "step": 319 }, { "epoch": 0.8695652173913043, "grad_norm": 3.643709421157837, "learning_rate": 3e-05, "loss": 0.5464, "step": 320 }, { "epoch": 0.8722826086956522, "grad_norm": 1.8755005598068237, "learning_rate": 3e-05, "loss": 0.5231, "step": 321 }, { "epoch": 0.875, "grad_norm": 1.2588874101638794, "learning_rate": 3e-05, "loss": 0.3987, "step": 322 }, { "epoch": 0.8777173913043478, "grad_norm": 1.8710525035858154, "learning_rate": 3e-05, "loss": 0.5137, "step": 323 }, { "epoch": 0.8804347826086957, "grad_norm": 1.8760285377502441, "learning_rate": 3e-05, "loss": 0.5162, "step": 324 }, { "epoch": 0.8831521739130435, "grad_norm": 2.302496910095215, "learning_rate": 3e-05, "loss": 0.6338, "step": 325 }, { "epoch": 0.8858695652173914, "grad_norm": 2.173513650894165, "learning_rate": 3e-05, "loss": 0.5525, "step": 326 }, { "epoch": 0.8885869565217391, "grad_norm": 3.2215616703033447, "learning_rate": 3e-05, "loss": 0.4746, "step": 327 }, { "epoch": 0.8913043478260869, "grad_norm": 1.402948021888733, "learning_rate": 3e-05, "loss": 0.4551, "step": 328 }, { "epoch": 0.8940217391304348, "grad_norm": 2.177567958831787, "learning_rate": 3e-05, "loss": 0.5937, "step": 329 }, { "epoch": 0.8967391304347826, "grad_norm": 1.9073940515518188, "learning_rate": 3e-05, "loss": 0.6849, "step": 330 }, { "epoch": 0.8994565217391305, "grad_norm": 2.078620195388794, "learning_rate": 3e-05, "loss": 0.4851, "step": 331 }, { "epoch": 0.9021739130434783, "grad_norm": 1.666050672531128, "learning_rate": 3e-05, "loss": 0.5361, "step": 332 }, { "epoch": 0.904891304347826, "grad_norm": 1.8900854587554932, "learning_rate": 3e-05, "loss": 0.5878, "step": 333 }, { "epoch": 0.907608695652174, "grad_norm": 2.4157278537750244, "learning_rate": 3e-05, "loss": 0.583, "step": 334 }, { "epoch": 0.9103260869565217, "grad_norm": 1.9325672388076782, "learning_rate": 3e-05, "loss": 0.5036, "step": 335 }, { "epoch": 0.9130434782608695, "grad_norm": 2.91580867767334, "learning_rate": 3e-05, "loss": 0.6958, "step": 336 }, { "epoch": 0.9157608695652174, "grad_norm": 2.5430386066436768, "learning_rate": 3e-05, "loss": 0.5496, "step": 337 }, { "epoch": 0.9184782608695652, "grad_norm": 2.0265727043151855, "learning_rate": 3e-05, "loss": 0.4997, "step": 338 }, { "epoch": 0.9211956521739131, "grad_norm": 1.583471417427063, "learning_rate": 3e-05, "loss": 0.4641, "step": 339 }, { "epoch": 0.9239130434782609, "grad_norm": 2.3555848598480225, "learning_rate": 3e-05, "loss": 0.4989, "step": 340 }, { "epoch": 0.9266304347826086, "grad_norm": 2.229189395904541, "learning_rate": 3e-05, "loss": 0.5665, "step": 341 }, { "epoch": 0.9293478260869565, "grad_norm": 1.3699840307235718, "learning_rate": 3e-05, "loss": 0.523, "step": 342 }, { "epoch": 0.9320652173913043, "grad_norm": 1.9070311784744263, "learning_rate": 3e-05, "loss": 0.565, "step": 343 }, { "epoch": 0.9347826086956522, "grad_norm": 2.588691234588623, "learning_rate": 3e-05, "loss": 0.5814, "step": 344 }, { "epoch": 0.9375, "grad_norm": 1.7156322002410889, "learning_rate": 3e-05, "loss": 0.5074, "step": 345 }, { "epoch": 0.9402173913043478, "grad_norm": 2.289538860321045, "learning_rate": 3e-05, "loss": 0.5744, "step": 346 }, { "epoch": 0.9429347826086957, "grad_norm": 1.0715011358261108, "learning_rate": 3e-05, "loss": 0.4612, "step": 347 }, { "epoch": 0.9456521739130435, "grad_norm": 1.5660399198532104, "learning_rate": 3e-05, "loss": 0.4354, "step": 348 }, { "epoch": 0.9483695652173914, "grad_norm": 2.2513458728790283, "learning_rate": 3e-05, "loss": 0.5702, "step": 349 }, { "epoch": 0.9510869565217391, "grad_norm": 2.5733985900878906, "learning_rate": 3e-05, "loss": 0.6115, "step": 350 }, { "epoch": 0.9538043478260869, "grad_norm": 1.1676104068756104, "learning_rate": 3e-05, "loss": 0.4158, "step": 351 }, { "epoch": 0.9565217391304348, "grad_norm": 2.1852831840515137, "learning_rate": 3e-05, "loss": 0.5267, "step": 352 }, { "epoch": 0.9592391304347826, "grad_norm": 1.339505910873413, "learning_rate": 3e-05, "loss": 0.4204, "step": 353 }, { "epoch": 0.9619565217391305, "grad_norm": 1.5837697982788086, "learning_rate": 3e-05, "loss": 0.4722, "step": 354 }, { "epoch": 0.9646739130434783, "grad_norm": 1.5127207040786743, "learning_rate": 3e-05, "loss": 0.4782, "step": 355 }, { "epoch": 0.967391304347826, "grad_norm": 0.9464153051376343, "learning_rate": 3e-05, "loss": 0.3675, "step": 356 }, { "epoch": 0.970108695652174, "grad_norm": 3.507087469100952, "learning_rate": 3e-05, "loss": 0.5947, "step": 357 }, { "epoch": 0.9728260869565217, "grad_norm": 2.1700286865234375, "learning_rate": 3e-05, "loss": 0.5949, "step": 358 }, { "epoch": 0.9755434782608695, "grad_norm": 1.070860505104065, "learning_rate": 3e-05, "loss": 0.4209, "step": 359 }, { "epoch": 0.9782608695652174, "grad_norm": 1.8626935482025146, "learning_rate": 3e-05, "loss": 0.4979, "step": 360 }, { "epoch": 0.9809782608695652, "grad_norm": 1.2900367975234985, "learning_rate": 3e-05, "loss": 0.4778, "step": 361 }, { "epoch": 0.9836956521739131, "grad_norm": 1.2917206287384033, "learning_rate": 3e-05, "loss": 0.4561, "step": 362 }, { "epoch": 0.9864130434782609, "grad_norm": 2.3133273124694824, "learning_rate": 3e-05, "loss": 0.5618, "step": 363 }, { "epoch": 0.9891304347826086, "grad_norm": 1.5823960304260254, "learning_rate": 3e-05, "loss": 0.5549, "step": 364 }, { "epoch": 0.9918478260869565, "grad_norm": 1.6155842542648315, "learning_rate": 3e-05, "loss": 0.4442, "step": 365 }, { "epoch": 0.9945652173913043, "grad_norm": 1.7502015829086304, "learning_rate": 3e-05, "loss": 0.5179, "step": 366 }, { "epoch": 0.9972826086956522, "grad_norm": 2.5052552223205566, "learning_rate": 3e-05, "loss": 0.5613, "step": 367 }, { "epoch": 1.0, "grad_norm": 0.9979060888290405, "learning_rate": 3e-05, "loss": 0.3232, "step": 368 }, { "epoch": 1.0, "step": 368, "total_flos": 6.339170402304e+16, "train_loss": 0.5430356652840324, "train_runtime": 908.8402, "train_samples_per_second": 3.234, "train_steps_per_second": 0.405 } ], "logging_steps": 1.0, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.339170402304e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }