{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9969257795344753, "eval_steps": 500, "global_step": 4552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00878348704435661, "grad_norm": 148.59868319057273, "learning_rate": 9.980228471001756e-07, "logits/chosen": 0.748828113079071, "logits/rejected": 0.792675793170929, "logps/chosen": -280.1000061035156, "logps/rejected": -385.6499938964844, "loss": 0.6644, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.3549255430698395, "rewards/margins": 0.095367431640625, "rewards/rejected": -0.45028382539749146, "step": 10 }, { "epoch": 0.01756697408871322, "grad_norm": 119.59249238844725, "learning_rate": 9.958260105448154e-07, "logits/chosen": 0.4925537109375, "logits/rejected": 0.670880138874054, "logps/chosen": -291.2749938964844, "logps/rejected": -406.3500061035156, "loss": 0.6599, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0167968273162842, "rewards/margins": 0.40953367948532104, "rewards/rejected": -1.426171898841858, "step": 20 }, { "epoch": 0.026350461133069828, "grad_norm": 91.67051348870137, "learning_rate": 9.936291739894551e-07, "logits/chosen": 0.40980833768844604, "logits/rejected": 0.6268554925918579, "logps/chosen": -277.45001220703125, "logps/rejected": -418.8999938964844, "loss": 0.6225, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.879150390625, "rewards/margins": 0.38004761934280396, "rewards/rejected": -1.2589843273162842, "step": 30 }, { "epoch": 0.03513394817742644, "grad_norm": 148.89670477517555, "learning_rate": 9.91432337434095e-07, "logits/chosen": 0.4572509825229645, "logits/rejected": 0.47172850370407104, "logps/chosen": -316.6499938964844, "logps/rejected": -431.25, "loss": 0.6497, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.749218761920929, "rewards/margins": 0.24895019829273224, "rewards/rejected": -0.9981445074081421, "step": 40 }, { "epoch": 0.04391743522178305, "grad_norm": 148.18838678912184, "learning_rate": 9.892355008787344e-07, "logits/chosen": 0.4545349180698395, "logits/rejected": 0.685656726360321, "logps/chosen": -324.04998779296875, "logps/rejected": -448.70001220703125, "loss": 0.5888, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6964355707168579, "rewards/margins": 0.4854980409145355, "rewards/rejected": -1.181249976158142, "step": 50 }, { "epoch": 0.052700922266139656, "grad_norm": 144.3663058470471, "learning_rate": 9.870386643233744e-07, "logits/chosen": 0.5623840093612671, "logits/rejected": 0.726855456829071, "logps/chosen": -293.375, "logps/rejected": -398.0, "loss": 0.611, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4938110411167145, "rewards/margins": 0.4091552793979645, "rewards/rejected": -0.903369128704071, "step": 60 }, { "epoch": 0.061484409310496264, "grad_norm": 154.75226533038918, "learning_rate": 9.84841827768014e-07, "logits/chosen": 0.6094726324081421, "logits/rejected": 0.720874011516571, "logps/chosen": -283.3500061035156, "logps/rejected": -382.92498779296875, "loss": 0.6125, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.14274902641773224, "rewards/margins": 0.33563232421875, "rewards/rejected": -0.478759765625, "step": 70 }, { "epoch": 0.07026789635485288, "grad_norm": 141.91847063266113, "learning_rate": 9.826449912126537e-07, "logits/chosen": 0.7362304925918579, "logits/rejected": 0.8065429925918579, "logps/chosen": -295.6499938964844, "logps/rejected": -405.29998779296875, "loss": 0.6634, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4532470703125, "rewards/margins": 0.3498901426792145, "rewards/rejected": -0.802294909954071, "step": 80 }, { "epoch": 0.07905138339920949, "grad_norm": 117.93076078675757, "learning_rate": 9.804481546572935e-07, "logits/chosen": 0.6463867425918579, "logits/rejected": 0.8156982660293579, "logps/chosen": -267.2749938964844, "logps/rejected": -372.75, "loss": 0.5913, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3519760072231293, "rewards/margins": 0.5559326410293579, "rewards/rejected": -0.9078124761581421, "step": 90 }, { "epoch": 0.0878348704435661, "grad_norm": 117.7944036722301, "learning_rate": 9.782513181019332e-07, "logits/chosen": 0.801562488079071, "logits/rejected": 0.814746081829071, "logps/chosen": -292.79998779296875, "logps/rejected": -371.45001220703125, "loss": 0.6021, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19708862900733948, "rewards/margins": 0.49360960721969604, "rewards/rejected": -0.6912597417831421, "step": 100 }, { "epoch": 0.0966183574879227, "grad_norm": 132.37426399802177, "learning_rate": 9.760544815465728e-07, "logits/chosen": 0.636303722858429, "logits/rejected": 0.750927746295929, "logps/chosen": -279.3999938964844, "logps/rejected": -447.79998779296875, "loss": 0.4913, "rewards/accuracies": 0.75, "rewards/chosen": -0.12165527045726776, "rewards/margins": 0.983105480670929, "rewards/rejected": -1.1046874523162842, "step": 110 }, { "epoch": 0.10540184453227931, "grad_norm": 104.80639485347126, "learning_rate": 9.738576449912126e-07, "logits/chosen": 0.553802490234375, "logits/rejected": 0.6413818597793579, "logps/chosen": -304.92498779296875, "logps/rejected": -426.79998779296875, "loss": 0.574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.694995105266571, "rewards/margins": 0.810742199420929, "rewards/rejected": -1.5056641101837158, "step": 120 }, { "epoch": 0.11418533157663592, "grad_norm": 111.77146148396716, "learning_rate": 9.716608084358523e-07, "logits/chosen": 0.5098632574081421, "logits/rejected": 0.6168014407157898, "logps/chosen": -318.79998779296875, "logps/rejected": -460.5, "loss": 0.5105, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6123291254043579, "rewards/margins": 1.1168944835662842, "rewards/rejected": -1.7296874523162842, "step": 130 }, { "epoch": 0.12296881862099253, "grad_norm": 134.31416617256178, "learning_rate": 9.69463971880492e-07, "logits/chosen": 0.749707043170929, "logits/rejected": 0.8342040777206421, "logps/chosen": -327.29998779296875, "logps/rejected": -431.25, "loss": 0.5307, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.594250500202179, "rewards/margins": 0.8269408941268921, "rewards/rejected": -1.4216797351837158, "step": 140 }, { "epoch": 0.13175230566534915, "grad_norm": 138.33794946025216, "learning_rate": 9.672671353251316e-07, "logits/chosen": 0.5429443120956421, "logits/rejected": 0.632006824016571, "logps/chosen": -279.6499938964844, "logps/rejected": -381.6000061035156, "loss": 0.6009, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.717395007610321, "rewards/margins": 0.840502917766571, "rewards/rejected": -1.558691382408142, "step": 150 }, { "epoch": 0.14053579270970576, "grad_norm": 111.73186019255812, "learning_rate": 9.650702987697716e-07, "logits/chosen": 0.5735839605331421, "logits/rejected": 0.6058715581893921, "logps/chosen": -314.32501220703125, "logps/rejected": -476.45001220703125, "loss": 0.5217, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6839233636856079, "rewards/margins": 1.0401122570037842, "rewards/rejected": -1.7257812023162842, "step": 160 }, { "epoch": 0.14931927975406237, "grad_norm": 87.1134985107385, "learning_rate": 9.628734622144111e-07, "logits/chosen": 0.7718750238418579, "logits/rejected": 0.76953125, "logps/chosen": -294.20001220703125, "logps/rejected": -405.3500061035156, "loss": 0.5194, "rewards/accuracies": 0.75, "rewards/chosen": -0.126007080078125, "rewards/margins": 0.9696289300918579, "rewards/rejected": -1.0961425304412842, "step": 170 }, { "epoch": 0.15810276679841898, "grad_norm": 138.87036248904136, "learning_rate": 9.60676625659051e-07, "logits/chosen": 1.000878930091858, "logits/rejected": 0.9208984375, "logps/chosen": -301.45001220703125, "logps/rejected": -428.25, "loss": 0.548, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3602844178676605, "rewards/margins": 0.886035144329071, "rewards/rejected": -1.246118187904358, "step": 180 }, { "epoch": 0.16688625384277558, "grad_norm": 150.31628647236872, "learning_rate": 9.584797891036907e-07, "logits/chosen": 0.71240234375, "logits/rejected": 0.780834972858429, "logps/chosen": -379.1000061035156, "logps/rejected": -475.8999938964844, "loss": 0.5256, "rewards/accuracies": 0.6875, "rewards/chosen": -0.534716784954071, "rewards/margins": 1.0634276866912842, "rewards/rejected": -1.5994141101837158, "step": 190 }, { "epoch": 0.1756697408871322, "grad_norm": 110.19554404708569, "learning_rate": 9.562829525483304e-07, "logits/chosen": 0.71240234375, "logits/rejected": 0.736621081829071, "logps/chosen": -318.0, "logps/rejected": -453.29998779296875, "loss": 0.4658, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.404348760843277, "rewards/margins": 1.2839844226837158, "rewards/rejected": -1.686621069908142, "step": 200 }, { "epoch": 0.1844532279314888, "grad_norm": 108.95623699498367, "learning_rate": 9.5408611599297e-07, "logits/chosen": 0.7989501953125, "logits/rejected": 0.8464111089706421, "logps/chosen": -328.6000061035156, "logps/rejected": -418.6000061035156, "loss": 0.6104, "rewards/accuracies": 0.65625, "rewards/chosen": -0.38177490234375, "rewards/margins": 0.9276367425918579, "rewards/rejected": -1.30926513671875, "step": 210 }, { "epoch": 0.1932367149758454, "grad_norm": 180.43373718069262, "learning_rate": 9.518892794376097e-07, "logits/chosen": 0.7186523675918579, "logits/rejected": 0.787402331829071, "logps/chosen": -295.54998779296875, "logps/rejected": -409.25, "loss": 0.6252, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.32822877168655396, "rewards/margins": 0.858105480670929, "rewards/rejected": -1.185815453529358, "step": 220 }, { "epoch": 0.20202020202020202, "grad_norm": 156.7016000041957, "learning_rate": 9.496924428822495e-07, "logits/chosen": 0.721142590045929, "logits/rejected": 0.853466808795929, "logps/chosen": -299.6499938964844, "logps/rejected": -397.3999938964844, "loss": 0.6013, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3378539979457855, "rewards/margins": 0.980609118938446, "rewards/rejected": -1.3193237781524658, "step": 230 }, { "epoch": 0.21080368906455862, "grad_norm": 107.17060152240023, "learning_rate": 9.474956063268892e-07, "logits/chosen": 0.659912109375, "logits/rejected": 0.829785168170929, "logps/chosen": -318.3999938964844, "logps/rejected": -435.95001220703125, "loss": 0.501, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02102050743997097, "rewards/margins": 1.2213134765625, "rewards/rejected": -1.2421875, "step": 240 }, { "epoch": 0.21958717610891523, "grad_norm": 91.68073549937458, "learning_rate": 9.45298769771529e-07, "logits/chosen": 0.6968749761581421, "logits/rejected": 0.683544933795929, "logps/chosen": -299.75, "logps/rejected": -381.20001220703125, "loss": 0.616, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01799316331744194, "rewards/margins": 1.0184814929962158, "rewards/rejected": -1.037451148033142, "step": 250 }, { "epoch": 0.22837066315327184, "grad_norm": 116.42358024093633, "learning_rate": 9.431019332161687e-07, "logits/chosen": 0.9339843988418579, "logits/rejected": 1.0828125476837158, "logps/chosen": -275.70001220703125, "logps/rejected": -406.70001220703125, "loss": 0.5268, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3944946229457855, "rewards/margins": 1.147363305091858, "rewards/rejected": -0.7533935308456421, "step": 260 }, { "epoch": 0.23715415019762845, "grad_norm": 172.58911816016837, "learning_rate": 9.409050966608084e-07, "logits/chosen": 0.789794921875, "logits/rejected": 0.840527355670929, "logps/chosen": -307.5249938964844, "logps/rejected": -420.75, "loss": 0.58, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09282226860523224, "rewards/margins": 0.984082043170929, "rewards/rejected": -0.8904174566268921, "step": 270 }, { "epoch": 0.24593763724198506, "grad_norm": 115.82490427642993, "learning_rate": 9.387082601054481e-07, "logits/chosen": 0.722363293170929, "logits/rejected": 0.801928699016571, "logps/chosen": -333.3999938964844, "logps/rejected": -438.5, "loss": 0.4808, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.005969238467514515, "rewards/margins": 1.157812476158142, "rewards/rejected": -1.151611328125, "step": 280 }, { "epoch": 0.2547211242863417, "grad_norm": 117.07853482960266, "learning_rate": 9.365114235500879e-07, "logits/chosen": 0.6036132574081421, "logits/rejected": 0.551953136920929, "logps/chosen": -297.8999938964844, "logps/rejected": -427.6000061035156, "loss": 0.4949, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5401366949081421, "rewards/margins": 1.3249022960662842, "rewards/rejected": -1.863867163658142, "step": 290 }, { "epoch": 0.2635046113306983, "grad_norm": 150.65333319518325, "learning_rate": 9.343145869947275e-07, "logits/chosen": 0.5726073980331421, "logits/rejected": 0.568835437297821, "logps/chosen": -269.54998779296875, "logps/rejected": -435.1499938964844, "loss": 0.4774, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8287597894668579, "rewards/margins": 1.4607422351837158, "rewards/rejected": -2.2894530296325684, "step": 300 }, { "epoch": 0.2722880983750549, "grad_norm": 198.3395393458289, "learning_rate": 9.321177504393673e-07, "logits/chosen": 0.6517578363418579, "logits/rejected": 0.69580078125, "logps/chosen": -291.6499938964844, "logps/rejected": -428.29998779296875, "loss": 0.6168, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6890197992324829, "rewards/margins": 1.459191918373108, "rewards/rejected": -2.149707078933716, "step": 310 }, { "epoch": 0.2810715854194115, "grad_norm": 105.86219392786265, "learning_rate": 9.299209138840069e-07, "logits/chosen": 0.766772449016571, "logits/rejected": 0.836352527141571, "logps/chosen": -316.29998779296875, "logps/rejected": -427.1000061035156, "loss": 0.4892, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24748535454273224, "rewards/margins": 1.2219727039337158, "rewards/rejected": -1.469824194908142, "step": 320 }, { "epoch": 0.2898550724637681, "grad_norm": 111.95994348769372, "learning_rate": 9.277240773286467e-07, "logits/chosen": 0.7369140386581421, "logits/rejected": 0.8018554449081421, "logps/chosen": -292.45001220703125, "logps/rejected": -401.75, "loss": 0.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04793701320886612, "rewards/margins": 1.031591773033142, "rewards/rejected": -1.079095482826233, "step": 330 }, { "epoch": 0.29863855950812473, "grad_norm": 143.5508892676723, "learning_rate": 9.255272407732864e-07, "logits/chosen": 0.940869152545929, "logits/rejected": 0.942187488079071, "logps/chosen": -299.1000061035156, "logps/rejected": -423.8999938964844, "loss": 0.5835, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19556884467601776, "rewards/margins": 1.1557495594024658, "rewards/rejected": -0.9599364995956421, "step": 340 }, { "epoch": 0.30742204655248134, "grad_norm": 112.05862120974595, "learning_rate": 9.233304042179262e-07, "logits/chosen": 0.9351562261581421, "logits/rejected": 0.99609375, "logps/chosen": -319.3999938964844, "logps/rejected": -408.75, "loss": 0.5287, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05678100511431694, "rewards/margins": 0.903430163860321, "rewards/rejected": -0.846484363079071, "step": 350 }, { "epoch": 0.31620553359683795, "grad_norm": 148.9146841988132, "learning_rate": 9.211335676625659e-07, "logits/chosen": 0.692523181438446, "logits/rejected": 0.8204101324081421, "logps/chosen": -333.6499938964844, "logps/rejected": -465.1000061035156, "loss": 0.5657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08364257961511612, "rewards/margins": 1.1494140625, "rewards/rejected": -1.2324707508087158, "step": 360 }, { "epoch": 0.32498902064119456, "grad_norm": 113.90097897447896, "learning_rate": 9.189367311072056e-07, "logits/chosen": 0.747265636920929, "logits/rejected": 0.856249988079071, "logps/chosen": -309.20001220703125, "logps/rejected": -403.45001220703125, "loss": 0.5394, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23229369521141052, "rewards/margins": 1.006250023841858, "rewards/rejected": -1.2392089366912842, "step": 370 }, { "epoch": 0.33377250768555117, "grad_norm": 117.29165263327252, "learning_rate": 9.167398945518453e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.712841808795929, "logps/chosen": -314.8999938964844, "logps/rejected": -448.0, "loss": 0.5505, "rewards/accuracies": 0.75, "rewards/chosen": 0.04265747219324112, "rewards/margins": 1.3502929210662842, "rewards/rejected": -1.3093383312225342, "step": 380 }, { "epoch": 0.3425559947299078, "grad_norm": 97.93389851669994, "learning_rate": 9.14543057996485e-07, "logits/chosen": 0.84228515625, "logits/rejected": 0.858593761920929, "logps/chosen": -264.2749938964844, "logps/rejected": -385.79998779296875, "loss": 0.5534, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02700195275247097, "rewards/margins": 1.077276587486267, "rewards/rejected": -1.104394555091858, "step": 390 }, { "epoch": 0.3513394817742644, "grad_norm": 164.8140384036127, "learning_rate": 9.123462214411247e-07, "logits/chosen": 0.601611316204071, "logits/rejected": 0.6729980707168579, "logps/chosen": -335.6499938964844, "logps/rejected": -445.8500061035156, "loss": 0.5424, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4661865234375, "rewards/margins": 1.217675805091858, "rewards/rejected": -1.684472680091858, "step": 400 }, { "epoch": 0.360122968818621, "grad_norm": 85.33204706100605, "learning_rate": 9.101493848857645e-07, "logits/chosen": 0.602343738079071, "logits/rejected": 0.6869140863418579, "logps/chosen": -315.95001220703125, "logps/rejected": -413.70001220703125, "loss": 0.5244, "rewards/accuracies": 0.6875, "rewards/chosen": -0.274169921875, "rewards/margins": 1.255224585533142, "rewards/rejected": -1.52978515625, "step": 410 }, { "epoch": 0.3689064558629776, "grad_norm": 100.26200679561148, "learning_rate": 9.079525483304041e-07, "logits/chosen": 0.789355456829071, "logits/rejected": 0.809863269329071, "logps/chosen": -291.04998779296875, "logps/rejected": -384.5, "loss": 0.5871, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.09244994819164276, "rewards/margins": 0.889843761920929, "rewards/rejected": -0.797436535358429, "step": 420 }, { "epoch": 0.3776899429073342, "grad_norm": 95.0391660743147, "learning_rate": 9.057557117750439e-07, "logits/chosen": 0.9410156011581421, "logits/rejected": 1.0525391101837158, "logps/chosen": -289.25, "logps/rejected": -384.1000061035156, "loss": 0.6174, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.4427856504917145, "rewards/margins": 0.6827636957168579, "rewards/rejected": -0.24020996689796448, "step": 430 }, { "epoch": 0.3864734299516908, "grad_norm": 122.75962457937774, "learning_rate": 9.035588752196836e-07, "logits/chosen": 0.898730456829071, "logits/rejected": 0.9765625, "logps/chosen": -292.04998779296875, "logps/rejected": -438.20001220703125, "loss": 0.5539, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3314575254917145, "rewards/margins": 0.9189453125, "rewards/rejected": -0.5882323980331421, "step": 440 }, { "epoch": 0.3952569169960474, "grad_norm": 170.0486031415585, "learning_rate": 9.013620386643234e-07, "logits/chosen": 0.737988293170929, "logits/rejected": 0.848828136920929, "logps/chosen": -354.70001220703125, "logps/rejected": -492.70001220703125, "loss": 0.536, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.58990478515625, "rewards/margins": 1.1292724609375, "rewards/rejected": -1.7185547351837158, "step": 450 }, { "epoch": 0.40404040404040403, "grad_norm": 68.994102871273, "learning_rate": 8.99165202108963e-07, "logits/chosen": 0.635845959186554, "logits/rejected": 0.722302258014679, "logps/chosen": -285.79998779296875, "logps/rejected": -406.54998779296875, "loss": 0.4573, "rewards/accuracies": 0.75, "rewards/chosen": -0.551513671875, "rewards/margins": 1.3182861804962158, "rewards/rejected": -1.8701171875, "step": 460 }, { "epoch": 0.41282389108476064, "grad_norm": 101.87774340246148, "learning_rate": 8.969683655536028e-07, "logits/chosen": 0.3896484375, "logits/rejected": 0.4637451171875, "logps/chosen": -300.54998779296875, "logps/rejected": -399.70001220703125, "loss": 0.5283, "rewards/accuracies": 0.75, "rewards/chosen": -0.7864745855331421, "rewards/margins": 1.4294922351837158, "rewards/rejected": -2.2168946266174316, "step": 470 }, { "epoch": 0.42160737812911725, "grad_norm": 115.33449602602022, "learning_rate": 8.947715289982425e-07, "logits/chosen": 0.4829345643520355, "logits/rejected": 0.5821288824081421, "logps/chosen": -328.54998779296875, "logps/rejected": -443.3500061035156, "loss": 0.6009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.014013648033142, "rewards/margins": 1.34716796875, "rewards/rejected": -2.360546827316284, "step": 480 }, { "epoch": 0.43039086517347386, "grad_norm": 140.32454124308526, "learning_rate": 8.925746924428822e-07, "logits/chosen": 0.7076171636581421, "logits/rejected": 0.807324230670929, "logps/chosen": -309.125, "logps/rejected": -387.54998779296875, "loss": 0.4962, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.23006591200828552, "rewards/margins": 1.241796851158142, "rewards/rejected": -1.4727661609649658, "step": 490 }, { "epoch": 0.43917435221783047, "grad_norm": 83.64061038822028, "learning_rate": 8.903778558875219e-07, "logits/chosen": 0.773144543170929, "logits/rejected": 0.9498046636581421, "logps/chosen": -312.67498779296875, "logps/rejected": -467.8999938964844, "loss": 0.498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.16492919623851776, "rewards/margins": 1.4518554210662842, "rewards/rejected": -1.2869141101837158, "step": 500 }, { "epoch": 0.4479578392621871, "grad_norm": 106.57953177731393, "learning_rate": 8.881810193321616e-07, "logits/chosen": 0.901660144329071, "logits/rejected": 0.9853515625, "logps/chosen": -287.0, "logps/rejected": -406.29998779296875, "loss": 0.5492, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.32499998807907104, "rewards/margins": 1.066015601158142, "rewards/rejected": -0.740893542766571, "step": 510 }, { "epoch": 0.4567413263065437, "grad_norm": 125.35984365713507, "learning_rate": 8.859841827768013e-07, "logits/chosen": 1.01416015625, "logits/rejected": 1.12451171875, "logps/chosen": -285.29998779296875, "logps/rejected": -394.5, "loss": 0.5067, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.3230041563510895, "rewards/margins": 1.230859398841858, "rewards/rejected": -0.908459484577179, "step": 520 }, { "epoch": 0.4655248133509003, "grad_norm": 128.6573741988705, "learning_rate": 8.837873462214412e-07, "logits/chosen": 0.7294921875, "logits/rejected": 0.861523449420929, "logps/chosen": -306.07501220703125, "logps/rejected": -429.1000061035156, "loss": 0.5538, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.08203125, "rewards/margins": 1.1415526866912842, "rewards/rejected": -1.223535180091858, "step": 530 }, { "epoch": 0.4743083003952569, "grad_norm": 130.98272531263515, "learning_rate": 8.815905096660808e-07, "logits/chosen": 0.755566418170929, "logits/rejected": 0.817333996295929, "logps/chosen": -270.20001220703125, "logps/rejected": -384.29998779296875, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02019042894244194, "rewards/margins": 1.0377686023712158, "rewards/rejected": -1.05712890625, "step": 540 }, { "epoch": 0.4830917874396135, "grad_norm": 116.67300037792285, "learning_rate": 8.793936731107206e-07, "logits/chosen": 0.5877929925918579, "logits/rejected": 0.71044921875, "logps/chosen": -265.6000061035156, "logps/rejected": -397.3999938964844, "loss": 0.4744, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21799011528491974, "rewards/margins": 1.4042236804962158, "rewards/rejected": -1.185937523841858, "step": 550 }, { "epoch": 0.4918752744839701, "grad_norm": 173.96594225287942, "learning_rate": 8.771968365553602e-07, "logits/chosen": 0.6884765625, "logits/rejected": 0.777722179889679, "logps/chosen": -309.5, "logps/rejected": -411.8999938964844, "loss": 0.5833, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2979263365268707, "rewards/margins": 1.1783447265625, "rewards/rejected": -1.4763062000274658, "step": 560 }, { "epoch": 0.5006587615283268, "grad_norm": 94.32494092943736, "learning_rate": 8.75e-07, "logits/chosen": 0.573437511920929, "logits/rejected": 0.675976574420929, "logps/chosen": -315.3500061035156, "logps/rejected": -409.3999938964844, "loss": 0.5119, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.17556151747703552, "rewards/margins": 1.103247046470642, "rewards/rejected": -1.2805297374725342, "step": 570 }, { "epoch": 0.5094422485726834, "grad_norm": 117.86295847693233, "learning_rate": 8.728031634446396e-07, "logits/chosen": 0.5972656011581421, "logits/rejected": 0.6826171875, "logps/chosen": -317.54998779296875, "logps/rejected": -447.04998779296875, "loss": 0.5508, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.01083984412252903, "rewards/margins": 1.3397948741912842, "rewards/rejected": -1.3296630382537842, "step": 580 }, { "epoch": 0.51822573561704, "grad_norm": 89.46765640380897, "learning_rate": 8.706063268892794e-07, "logits/chosen": 0.7608886957168579, "logits/rejected": 0.8517090082168579, "logps/chosen": -257.1000061035156, "logps/rejected": -421.1000061035156, "loss": 0.4825, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21221618354320526, "rewards/margins": 1.6037108898162842, "rewards/rejected": -1.392309546470642, "step": 590 }, { "epoch": 0.5270092226613966, "grad_norm": 215.8298784737252, "learning_rate": 8.684094903339191e-07, "logits/chosen": 0.612060546875, "logits/rejected": 0.6400390863418579, "logps/chosen": -321.82501220703125, "logps/rejected": -451.29998779296875, "loss": 0.5393, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.17647704482078552, "rewards/margins": 1.4263184070587158, "rewards/rejected": -1.6028320789337158, "step": 600 }, { "epoch": 0.5357927097057532, "grad_norm": 145.69023775871486, "learning_rate": 8.662126537785588e-07, "logits/chosen": 0.5292724370956421, "logits/rejected": 0.6452270746231079, "logps/chosen": -243.9499969482422, "logps/rejected": -349.04998779296875, "loss": 0.5741, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03246460109949112, "rewards/margins": 1.0711669921875, "rewards/rejected": -1.103295922279358, "step": 610 }, { "epoch": 0.5445761967501098, "grad_norm": 104.79012505694247, "learning_rate": 8.640158172231986e-07, "logits/chosen": 0.5938720703125, "logits/rejected": 0.6392730474472046, "logps/chosen": -276.125, "logps/rejected": -428.8500061035156, "loss": 0.4934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13970032334327698, "rewards/margins": 1.850000023841858, "rewards/rejected": -1.9906737804412842, "step": 620 }, { "epoch": 0.5533596837944664, "grad_norm": 88.90175510156608, "learning_rate": 8.618189806678383e-07, "logits/chosen": 0.629589855670929, "logits/rejected": 0.5913330316543579, "logps/chosen": -311.0, "logps/rejected": -428.0, "loss": 0.5202, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2864013612270355, "rewards/margins": 1.3152344226837158, "rewards/rejected": -1.6003906726837158, "step": 630 }, { "epoch": 0.562143170838823, "grad_norm": 92.31234049952907, "learning_rate": 8.59622144112478e-07, "logits/chosen": 0.6262451410293579, "logits/rejected": 0.6946655511856079, "logps/chosen": -276.5, "logps/rejected": -409.8500061035156, "loss": 0.5335, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08116455376148224, "rewards/margins": 1.4308593273162842, "rewards/rejected": -1.349023461341858, "step": 640 }, { "epoch": 0.5709266578831796, "grad_norm": 78.17409546464724, "learning_rate": 8.574253075571178e-07, "logits/chosen": 0.805621325969696, "logits/rejected": 0.945117175579071, "logps/chosen": -277.6000061035156, "logps/rejected": -439.04998779296875, "loss": 0.5553, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20489501953125, "rewards/margins": 1.2216308116912842, "rewards/rejected": -1.0167404413223267, "step": 650 }, { "epoch": 0.5797101449275363, "grad_norm": 80.75411992024749, "learning_rate": 8.552284710017574e-07, "logits/chosen": 0.774707019329071, "logits/rejected": 0.823486328125, "logps/chosen": -322.82501220703125, "logps/rejected": -459.95001220703125, "loss": 0.4297, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.22116699814796448, "rewards/margins": 1.6272461414337158, "rewards/rejected": -1.405004858970642, "step": 660 }, { "epoch": 0.5884936319718929, "grad_norm": 122.71228764012359, "learning_rate": 8.530316344463972e-07, "logits/chosen": 0.847949206829071, "logits/rejected": 0.850781261920929, "logps/chosen": -294.95001220703125, "logps/rejected": -395.75, "loss": 0.5844, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.24362793564796448, "rewards/margins": 1.0568115711212158, "rewards/rejected": -0.813891589641571, "step": 670 }, { "epoch": 0.5972771190162495, "grad_norm": 130.78785809796443, "learning_rate": 8.508347978910368e-07, "logits/chosen": 0.7472168207168579, "logits/rejected": 0.761645495891571, "logps/chosen": -279.6499938964844, "logps/rejected": -395.29998779296875, "loss": 0.5086, "rewards/accuracies": 0.71875, "rewards/chosen": 0.33723145723342896, "rewards/margins": 1.2617919445037842, "rewards/rejected": -0.9244140386581421, "step": 680 }, { "epoch": 0.6060606060606061, "grad_norm": 125.87919323687957, "learning_rate": 8.486379613356766e-07, "logits/chosen": 0.836621105670929, "logits/rejected": 0.8564453125, "logps/chosen": -297.75, "logps/rejected": -416.20001220703125, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4564575254917145, "rewards/margins": 1.046044945716858, "rewards/rejected": -0.5906738042831421, "step": 690 }, { "epoch": 0.6148440931049627, "grad_norm": 145.1749143987932, "learning_rate": 8.464411247803162e-07, "logits/chosen": 0.736804187297821, "logits/rejected": 0.748339831829071, "logps/chosen": -325.0, "logps/rejected": -449.54998779296875, "loss": 0.5203, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3740478456020355, "rewards/margins": 1.3361084461212158, "rewards/rejected": -0.9620727300643921, "step": 700 }, { "epoch": 0.6236275801493193, "grad_norm": 101.26990827392123, "learning_rate": 8.442442882249561e-07, "logits/chosen": 0.824902355670929, "logits/rejected": 0.804003894329071, "logps/chosen": -281.32501220703125, "logps/rejected": -385.5, "loss": 0.5027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3863525390625, "rewards/margins": 1.312890648841858, "rewards/rejected": -0.926708996295929, "step": 710 }, { "epoch": 0.6324110671936759, "grad_norm": 141.590728085437, "learning_rate": 8.420474516695958e-07, "logits/chosen": 0.64227294921875, "logits/rejected": 0.776318371295929, "logps/chosen": -295.875, "logps/rejected": -450.04998779296875, "loss": 0.5693, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01694335974752903, "rewards/margins": 1.3640625476837158, "rewards/rejected": -1.380273461341858, "step": 720 }, { "epoch": 0.6411945542380325, "grad_norm": 90.65903441067857, "learning_rate": 8.398506151142355e-07, "logits/chosen": 0.7447143793106079, "logits/rejected": 0.7179199457168579, "logps/chosen": -311.1000061035156, "logps/rejected": -419.6000061035156, "loss": 0.5344, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.14219360053539276, "rewards/margins": 1.3075683116912842, "rewards/rejected": -1.165771484375, "step": 730 }, { "epoch": 0.6499780412823891, "grad_norm": 96.32131222535625, "learning_rate": 8.376537785588752e-07, "logits/chosen": 0.7132323980331421, "logits/rejected": 0.746289074420929, "logps/chosen": -312.1000061035156, "logps/rejected": -416.20001220703125, "loss": 0.5019, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.13435058295726776, "rewards/margins": 1.3684570789337158, "rewards/rejected": -1.2356445789337158, "step": 740 }, { "epoch": 0.6587615283267457, "grad_norm": 62.70520081441064, "learning_rate": 8.354569420035149e-07, "logits/chosen": 0.7081054449081421, "logits/rejected": 0.7386718988418579, "logps/chosen": -332.3999938964844, "logps/rejected": -412.1000061035156, "loss": 0.5895, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06796874850988388, "rewards/margins": 1.259545922279358, "rewards/rejected": -1.326171875, "step": 750 }, { "epoch": 0.6675450153711023, "grad_norm": 80.92080846993106, "learning_rate": 8.332601054481546e-07, "logits/chosen": 0.8075195550918579, "logits/rejected": 0.758471667766571, "logps/chosen": -320.75, "logps/rejected": -451.29998779296875, "loss": 0.509, "rewards/accuracies": 0.75, "rewards/chosen": 0.0556640625, "rewards/margins": 1.432519555091858, "rewards/rejected": -1.3771483898162842, "step": 760 }, { "epoch": 0.6763285024154589, "grad_norm": 168.423836175994, "learning_rate": 8.310632688927944e-07, "logits/chosen": 0.7854248285293579, "logits/rejected": 0.8138183355331421, "logps/chosen": -286.5, "logps/rejected": -393.0, "loss": 0.5017, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.201385498046875, "rewards/margins": 1.3284180164337158, "rewards/rejected": -1.125732421875, "step": 770 }, { "epoch": 0.6851119894598156, "grad_norm": 97.37652027871536, "learning_rate": 8.28866432337434e-07, "logits/chosen": 0.690722644329071, "logits/rejected": 0.7738281488418579, "logps/chosen": -283.1000061035156, "logps/rejected": -430.25, "loss": 0.5766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.33113402128219604, "rewards/margins": 1.5031249523162842, "rewards/rejected": -1.1706054210662842, "step": 780 }, { "epoch": 0.6938954765041722, "grad_norm": 82.59517022761915, "learning_rate": 8.266695957820738e-07, "logits/chosen": 0.6347106695175171, "logits/rejected": 0.656689465045929, "logps/chosen": -272.8999938964844, "logps/rejected": -375.04998779296875, "loss": 0.5215, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27496337890625, "rewards/margins": 1.17578125, "rewards/rejected": -0.9000488519668579, "step": 790 }, { "epoch": 0.7026789635485288, "grad_norm": 58.06619967430277, "learning_rate": 8.244727592267134e-07, "logits/chosen": 0.6208648681640625, "logits/rejected": 0.6363769769668579, "logps/chosen": -283.8999938964844, "logps/rejected": -391.5, "loss": 0.5225, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3349243104457855, "rewards/margins": 1.3944823741912842, "rewards/rejected": -1.059667944908142, "step": 800 }, { "epoch": 0.7114624505928854, "grad_norm": 159.52374812620673, "learning_rate": 8.222759226713533e-07, "logits/chosen": 0.6533569097518921, "logits/rejected": 0.67822265625, "logps/chosen": -272.5, "logps/rejected": -391.75, "loss": 0.53, "rewards/accuracies": 0.75, "rewards/chosen": 0.27149659395217896, "rewards/margins": 1.366601586341858, "rewards/rejected": -1.09539794921875, "step": 810 }, { "epoch": 0.720245937637242, "grad_norm": 71.9754897725476, "learning_rate": 8.200790861159929e-07, "logits/chosen": 0.611480712890625, "logits/rejected": 0.7861328125, "logps/chosen": -323.625, "logps/rejected": -471.20001220703125, "loss": 0.4653, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07088012993335724, "rewards/margins": 1.6259765625, "rewards/rejected": -1.697998046875, "step": 820 }, { "epoch": 0.7290294246815986, "grad_norm": 141.80317625762706, "learning_rate": 8.178822495606327e-07, "logits/chosen": 0.6155029535293579, "logits/rejected": 0.59228515625, "logps/chosen": -266.45001220703125, "logps/rejected": -382.29998779296875, "loss": 0.5168, "rewards/accuracies": 0.71875, "rewards/chosen": 0.26719969511032104, "rewards/margins": 1.516210913658142, "rewards/rejected": -1.2477538585662842, "step": 830 }, { "epoch": 0.7378129117259552, "grad_norm": 140.75487086152734, "learning_rate": 8.156854130052724e-07, "logits/chosen": 0.690777599811554, "logits/rejected": 0.74102783203125, "logps/chosen": -292.5249938964844, "logps/rejected": -391.54998779296875, "loss": 0.552, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27392578125, "rewards/margins": 0.9893554449081421, "rewards/rejected": -0.7154868841171265, "step": 840 }, { "epoch": 0.7465963987703118, "grad_norm": 84.10282689700186, "learning_rate": 8.134885764499121e-07, "logits/chosen": 0.804980456829071, "logits/rejected": 0.902050793170929, "logps/chosen": -278.54998779296875, "logps/rejected": -371.8500061035156, "loss": 0.5193, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3196350038051605, "rewards/margins": 1.3000977039337158, "rewards/rejected": -0.98095703125, "step": 850 }, { "epoch": 0.7553798858146684, "grad_norm": 111.51843498146174, "learning_rate": 8.112917398945518e-07, "logits/chosen": 0.6046844720840454, "logits/rejected": 0.714160144329071, "logps/chosen": -269.3500061035156, "logps/rejected": -389.20001220703125, "loss": 0.5148, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.02806396409869194, "rewards/margins": 1.4083983898162842, "rewards/rejected": -1.38092041015625, "step": 860 }, { "epoch": 0.764163372859025, "grad_norm": 160.493646287475, "learning_rate": 8.090949033391915e-07, "logits/chosen": 0.4907470643520355, "logits/rejected": 0.5363403558731079, "logps/chosen": -297.8500061035156, "logps/rejected": -400.04998779296875, "loss": 0.6569, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.41941529512405396, "rewards/margins": 1.1093871593475342, "rewards/rejected": -1.527929663658142, "step": 870 }, { "epoch": 0.7729468599033816, "grad_norm": 91.54816973758152, "learning_rate": 8.068980667838312e-07, "logits/chosen": 0.5309082269668579, "logits/rejected": 0.628125011920929, "logps/chosen": -257.6000061035156, "logps/rejected": -406.70001220703125, "loss": 0.6181, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3598876893520355, "rewards/margins": 1.5540039539337158, "rewards/rejected": -1.912500023841858, "step": 880 }, { "epoch": 0.7817303469477382, "grad_norm": 163.2456726452387, "learning_rate": 8.04701230228471e-07, "logits/chosen": 0.59716796875, "logits/rejected": 0.63507080078125, "logps/chosen": -290.04998779296875, "logps/rejected": -404.1000061035156, "loss": 0.4996, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20415039360523224, "rewards/margins": 1.4846680164337158, "rewards/rejected": -1.687890648841858, "step": 890 }, { "epoch": 0.7905138339920948, "grad_norm": 176.04241519982924, "learning_rate": 8.025043936731107e-07, "logits/chosen": 0.542236328125, "logits/rejected": 0.5885375738143921, "logps/chosen": -293.8999938964844, "logps/rejected": -384.79998779296875, "loss": 0.5738, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.30535888671875, "rewards/margins": 1.263085961341858, "rewards/rejected": -1.5675780773162842, "step": 900 }, { "epoch": 0.7992973210364515, "grad_norm": 134.34451946848753, "learning_rate": 8.003075571177505e-07, "logits/chosen": 0.646923840045929, "logits/rejected": 0.7884765863418579, "logps/chosen": -303.3999938964844, "logps/rejected": -402.29998779296875, "loss": 0.5167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.15415039658546448, "rewards/margins": 1.2509887218475342, "rewards/rejected": -1.40576171875, "step": 910 }, { "epoch": 0.8080808080808081, "grad_norm": 144.4429707139809, "learning_rate": 7.981107205623901e-07, "logits/chosen": 0.775390625, "logits/rejected": 0.8140624761581421, "logps/chosen": -326.8500061035156, "logps/rejected": -398.45001220703125, "loss": 0.5027, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25385743379592896, "rewards/margins": 1.197167992591858, "rewards/rejected": -0.9429076910018921, "step": 920 }, { "epoch": 0.8168642951251647, "grad_norm": 117.16375050781849, "learning_rate": 7.959138840070299e-07, "logits/chosen": 0.759082019329071, "logits/rejected": 0.722460925579071, "logps/chosen": -315.8500061035156, "logps/rejected": -445.8500061035156, "loss": 0.5108, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24257811903953552, "rewards/margins": 1.4621093273162842, "rewards/rejected": -1.219763159751892, "step": 930 }, { "epoch": 0.8256477821695213, "grad_norm": 56.45715843516822, "learning_rate": 7.937170474516695e-07, "logits/chosen": 0.678759753704071, "logits/rejected": 0.732617199420929, "logps/chosen": -275.8999938964844, "logps/rejected": -390.04998779296875, "loss": 0.441, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21145018935203552, "rewards/margins": 1.5173828601837158, "rewards/rejected": -1.305151343345642, "step": 940 }, { "epoch": 0.8344312692138779, "grad_norm": 90.41553029965456, "learning_rate": 7.915202108963093e-07, "logits/chosen": 0.681396484375, "logits/rejected": 0.807568371295929, "logps/chosen": -308.54998779296875, "logps/rejected": -427.70001220703125, "loss": 0.5556, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.08505859225988388, "rewards/margins": 1.252966284751892, "rewards/rejected": -1.337792992591858, "step": 950 }, { "epoch": 0.8432147562582345, "grad_norm": 104.86276839761041, "learning_rate": 7.89323374340949e-07, "logits/chosen": 0.557055652141571, "logits/rejected": 0.641772449016571, "logps/chosen": -307.1000061035156, "logps/rejected": -425.0, "loss": 0.58, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12518310546875, "rewards/margins": 1.2295410633087158, "rewards/rejected": -1.3542633056640625, "step": 960 }, { "epoch": 0.8519982433025911, "grad_norm": 105.76850237365005, "learning_rate": 7.871265377855887e-07, "logits/chosen": 0.6954101324081421, "logits/rejected": 0.7448974847793579, "logps/chosen": -280.3500061035156, "logps/rejected": -431.1000061035156, "loss": 0.5476, "rewards/accuracies": 0.75, "rewards/chosen": -0.30583494901657104, "rewards/margins": 1.4187500476837158, "rewards/rejected": -1.72412109375, "step": 970 }, { "epoch": 0.8607817303469477, "grad_norm": 137.65030968412748, "learning_rate": 7.849297012302284e-07, "logits/chosen": 0.536669909954071, "logits/rejected": 0.6874023675918579, "logps/chosen": -325.54998779296875, "logps/rejected": -416.75, "loss": 0.4741, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4579406678676605, "rewards/margins": 1.3713867664337158, "rewards/rejected": -1.8303711414337158, "step": 980 }, { "epoch": 0.8695652173913043, "grad_norm": 159.43256013350114, "learning_rate": 7.827328646748682e-07, "logits/chosen": 0.5479491949081421, "logits/rejected": 0.6419311761856079, "logps/chosen": -327.8999938964844, "logps/rejected": -447.45001220703125, "loss": 0.5508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5188934206962585, "rewards/margins": 1.26019287109375, "rewards/rejected": -1.777734398841858, "step": 990 }, { "epoch": 0.8783487044356609, "grad_norm": 105.76813775204988, "learning_rate": 7.805360281195079e-07, "logits/chosen": 0.6756836175918579, "logits/rejected": 0.706250011920929, "logps/chosen": -323.70001220703125, "logps/rejected": -469.54998779296875, "loss": 0.4205, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.35002440214157104, "rewards/margins": 1.718164086341858, "rewards/rejected": -2.068066358566284, "step": 1000 }, { "epoch": 0.8871321914800175, "grad_norm": 96.77724555396985, "learning_rate": 7.783391915641477e-07, "logits/chosen": 0.721630871295929, "logits/rejected": 0.7696288824081421, "logps/chosen": -294.29998779296875, "logps/rejected": -377.95001220703125, "loss": 0.5437, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.005389404483139515, "rewards/margins": 1.2545897960662842, "rewards/rejected": -1.2490234375, "step": 1010 }, { "epoch": 0.8959156785243741, "grad_norm": 83.13608387276506, "learning_rate": 7.761423550087873e-07, "logits/chosen": 0.777294933795929, "logits/rejected": 0.8108886480331421, "logps/chosen": -324.29998779296875, "logps/rejected": -472.8999938964844, "loss": 0.5618, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2977539002895355, "rewards/margins": 1.4220459461212158, "rewards/rejected": -1.122460961341858, "step": 1020 }, { "epoch": 0.9046991655687308, "grad_norm": 79.83099800335017, "learning_rate": 7.739455184534271e-07, "logits/chosen": 0.656542956829071, "logits/rejected": 0.7041991949081421, "logps/chosen": -345.3500061035156, "logps/rejected": -459.79998779296875, "loss": 0.4753, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11873779445886612, "rewards/margins": 1.51361083984375, "rewards/rejected": -1.3961913585662842, "step": 1030 }, { "epoch": 0.9134826526130874, "grad_norm": 202.4175262262392, "learning_rate": 7.717486818980667e-07, "logits/chosen": 0.6935027837753296, "logits/rejected": 0.703320324420929, "logps/chosen": -271.20001220703125, "logps/rejected": -393.6499938964844, "loss": 0.5334, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.22075195610523224, "rewards/margins": 1.4441406726837158, "rewards/rejected": -1.222680687904358, "step": 1040 }, { "epoch": 0.922266139657444, "grad_norm": 105.9261399987333, "learning_rate": 7.695518453427065e-07, "logits/chosen": 0.6714843511581421, "logits/rejected": 0.829296886920929, "logps/chosen": -259.1000061035156, "logps/rejected": -424.5, "loss": 0.5436, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.30000001192092896, "rewards/margins": 1.4658324718475342, "rewards/rejected": -1.165502905845642, "step": 1050 }, { "epoch": 0.9310496267018006, "grad_norm": 160.92255248113352, "learning_rate": 7.673550087873461e-07, "logits/chosen": 0.6338866949081421, "logits/rejected": 0.662792980670929, "logps/chosen": -329.1499938964844, "logps/rejected": -451.1499938964844, "loss": 0.5436, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04085693508386612, "rewards/margins": 1.477197289466858, "rewards/rejected": -1.4367187023162842, "step": 1060 }, { "epoch": 0.9398331137461572, "grad_norm": 110.59868954095693, "learning_rate": 7.651581722319859e-07, "logits/chosen": 0.6666504144668579, "logits/rejected": 0.785205066204071, "logps/chosen": -261.25, "logps/rejected": -413.6000061035156, "loss": 0.4849, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12421874701976776, "rewards/margins": 1.4919922351837158, "rewards/rejected": -1.3689453601837158, "step": 1070 }, { "epoch": 0.9486166007905138, "grad_norm": 97.30888541427004, "learning_rate": 7.629613356766256e-07, "logits/chosen": 0.6907714605331421, "logits/rejected": 0.796679675579071, "logps/chosen": -287.875, "logps/rejected": -389.8999938964844, "loss": 0.4767, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04188232496380806, "rewards/margins": 1.3427734375, "rewards/rejected": -1.301171898841858, "step": 1080 }, { "epoch": 0.9574000878348704, "grad_norm": 72.55425254402961, "learning_rate": 7.607644991212654e-07, "logits/chosen": 0.8084472417831421, "logits/rejected": 0.820605456829071, "logps/chosen": -300.3500061035156, "logps/rejected": -436.1000061035156, "loss": 0.4383, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.19719238579273224, "rewards/margins": 1.681738257408142, "rewards/rejected": -1.878320336341858, "step": 1090 }, { "epoch": 0.966183574879227, "grad_norm": 76.11242425120201, "learning_rate": 7.585676625659051e-07, "logits/chosen": 0.6378418207168579, "logits/rejected": 0.7088867425918579, "logps/chosen": -316.3999938964844, "logps/rejected": -448.6000061035156, "loss": 0.4727, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.09443970024585724, "rewards/margins": 1.6331298351287842, "rewards/rejected": -1.728173851966858, "step": 1100 }, { "epoch": 0.9749670619235836, "grad_norm": 106.22019667096369, "learning_rate": 7.563708260105448e-07, "logits/chosen": 0.6456543207168579, "logits/rejected": 0.624462902545929, "logps/chosen": -290.45001220703125, "logps/rejected": -431.3999938964844, "loss": 0.4817, "rewards/accuracies": 0.75, "rewards/chosen": -0.0264892578125, "rewards/margins": 1.7017090320587158, "rewards/rejected": -1.727636694908142, "step": 1110 }, { "epoch": 0.9837505489679402, "grad_norm": 131.94325620117536, "learning_rate": 7.541739894551845e-07, "logits/chosen": 0.6218017339706421, "logits/rejected": 0.6457275152206421, "logps/chosen": -294.32501220703125, "logps/rejected": -376.3500061035156, "loss": 0.5143, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.13709716498851776, "rewards/margins": 1.227148413658142, "rewards/rejected": -1.363775610923767, "step": 1120 }, { "epoch": 0.9925340360122968, "grad_norm": 181.4041368424247, "learning_rate": 7.519771528998243e-07, "logits/chosen": 0.4788574278354645, "logits/rejected": 0.566912829875946, "logps/chosen": -342.79998779296875, "logps/rejected": -451.3999938964844, "loss": 0.593, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.523120105266571, "rewards/margins": 1.548486351966858, "rewards/rejected": -2.06982421875, "step": 1130 }, { "epoch": 1.0008783487044357, "grad_norm": 29.406577346154506, "learning_rate": 7.497803163444639e-07, "logits/chosen": 0.7065172791481018, "logits/rejected": 0.6998869180679321, "logps/chosen": -312.631591796875, "logps/rejected": -424.0526428222656, "loss": 0.4487, "rewards/accuracies": 0.7565789222717285, "rewards/chosen": 0.0021025005262345076, "rewards/margins": 2.0403988361358643, "rewards/rejected": -2.03752064704895, "step": 1140 }, { "epoch": 1.0096618357487923, "grad_norm": 22.320348250378917, "learning_rate": 7.475834797891037e-07, "logits/chosen": 0.652050793170929, "logits/rejected": 0.658154308795929, "logps/chosen": -276.2749938964844, "logps/rejected": -411.8500061035156, "loss": 0.1263, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.023657202720642, "rewards/margins": 3.7289061546325684, "rewards/rejected": -2.704296827316284, "step": 1150 }, { "epoch": 1.018445322793149, "grad_norm": 40.206614844076576, "learning_rate": 7.453866432337433e-07, "logits/chosen": 0.407235711812973, "logits/rejected": 0.510632336139679, "logps/chosen": -252.3000030517578, "logps/rejected": -410.95001220703125, "loss": 0.107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.535351574420929, "rewards/margins": 3.8375000953674316, "rewards/rejected": -3.3031249046325684, "step": 1160 }, { "epoch": 1.0272288098375055, "grad_norm": 45.09213401971073, "learning_rate": 7.431898066783831e-07, "logits/chosen": 0.19450683891773224, "logits/rejected": 0.2222900390625, "logps/chosen": -301.3500061035156, "logps/rejected": -434.3500061035156, "loss": 0.0914, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.23833923041820526, "rewards/margins": 5.067187309265137, "rewards/rejected": -5.302343845367432, "step": 1170 }, { "epoch": 1.0360122968818621, "grad_norm": 83.38878161974407, "learning_rate": 7.409929701230228e-07, "logits/chosen": 0.04458465427160263, "logits/rejected": 0.08873596042394638, "logps/chosen": -304.79998779296875, "logps/rejected": -468.6000061035156, "loss": 0.109, "rewards/accuracies": 0.96875, "rewards/chosen": -0.813037097454071, "rewards/margins": 5.446875095367432, "rewards/rejected": -6.267187595367432, "step": 1180 }, { "epoch": 1.0447957839262187, "grad_norm": 14.27539965827894, "learning_rate": 7.387961335676626e-07, "logits/chosen": 0.13530120253562927, "logits/rejected": 0.095916748046875, "logps/chosen": -303.04998779296875, "logps/rejected": -461.8999938964844, "loss": 0.0916, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5640869140625, "rewards/margins": 5.236718654632568, "rewards/rejected": -5.805468559265137, "step": 1190 }, { "epoch": 1.0535792709705754, "grad_norm": 37.89861330947791, "learning_rate": 7.365992970123023e-07, "logits/chosen": 0.06229095533490181, "logits/rejected": 0.23023681342601776, "logps/chosen": -310.3500061035156, "logps/rejected": -485.95001220703125, "loss": 0.0977, "rewards/accuracies": 0.96875, "rewards/chosen": -0.03321533277630806, "rewards/margins": 5.484375, "rewards/rejected": -5.517968654632568, "step": 1200 }, { "epoch": 1.062362758014932, "grad_norm": 18.351115344539316, "learning_rate": 7.34402460456942e-07, "logits/chosen": 0.18747863173484802, "logits/rejected": 0.21021728217601776, "logps/chosen": -297.8999938964844, "logps/rejected": -414.1000061035156, "loss": 0.0672, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.06614379584789276, "rewards/margins": 5.051562309265137, "rewards/rejected": -5.114843845367432, "step": 1210 }, { "epoch": 1.0711462450592886, "grad_norm": 11.048720915808161, "learning_rate": 7.322056239015817e-07, "logits/chosen": 0.23781737685203552, "logits/rejected": 0.17812499403953552, "logps/chosen": -285.04998779296875, "logps/rejected": -447.75, "loss": 0.1209, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.1641998291015625, "rewards/margins": 5.384375095367432, "rewards/rejected": -5.5546875, "step": 1220 }, { "epoch": 1.0799297321036452, "grad_norm": 25.937116835418735, "learning_rate": 7.300087873462214e-07, "logits/chosen": 0.10886230319738388, "logits/rejected": 0.18171386420726776, "logps/chosen": -321.6000061035156, "logps/rejected": -466.75, "loss": 0.0869, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.27086180448532104, "rewards/margins": 5.165625095367432, "rewards/rejected": -5.440625190734863, "step": 1230 }, { "epoch": 1.0887132191480018, "grad_norm": 41.39760099901198, "learning_rate": 7.278119507908611e-07, "logits/chosen": 0.11679687350988388, "logits/rejected": 0.09876708686351776, "logps/chosen": -317.3500061035156, "logps/rejected": -453.79998779296875, "loss": 0.091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6776367425918579, "rewards/margins": 5.074999809265137, "rewards/rejected": -5.755468845367432, "step": 1240 }, { "epoch": 1.0974967061923584, "grad_norm": 17.286696251152424, "learning_rate": 7.256151142355009e-07, "logits/chosen": -0.0053955079056322575, "logits/rejected": 0.02189941331744194, "logps/chosen": -299.70001220703125, "logps/rejected": -493.5, "loss": 0.0795, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6955810785293579, "rewards/margins": 5.810156345367432, "rewards/rejected": -6.5078125, "step": 1250 }, { "epoch": 1.106280193236715, "grad_norm": 12.826727376244106, "learning_rate": 7.234182776801405e-07, "logits/chosen": 0.11990203708410263, "logits/rejected": 0.1573486328125, "logps/chosen": -292.79998779296875, "logps/rejected": -479.0, "loss": 0.0796, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9017578363418579, "rewards/margins": 5.938281059265137, "rewards/rejected": -6.835156440734863, "step": 1260 }, { "epoch": 1.1150636802810716, "grad_norm": 26.152674154929368, "learning_rate": 7.212214411247804e-07, "logits/chosen": 0.02687988243997097, "logits/rejected": 0.0037292479537427425, "logps/chosen": -297.1000061035156, "logps/rejected": -461.3999938964844, "loss": 0.0841, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.013159155845642, "rewards/margins": 6.346093654632568, "rewards/rejected": -7.359375, "step": 1270 }, { "epoch": 1.1238471673254282, "grad_norm": 25.42338480790078, "learning_rate": 7.1902460456942e-07, "logits/chosen": -0.07500000298023224, "logits/rejected": -0.03376464918255806, "logps/chosen": -334.5, "logps/rejected": -450.0, "loss": 0.0946, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9002441167831421, "rewards/margins": 5.729687690734863, "rewards/rejected": -6.629687309265137, "step": 1280 }, { "epoch": 1.1326306543697848, "grad_norm": 24.466697095730623, "learning_rate": 7.168277680140598e-07, "logits/chosen": 0.10602416843175888, "logits/rejected": 0.05716552585363388, "logps/chosen": -315.9750061035156, "logps/rejected": -446.45001220703125, "loss": 0.0962, "rewards/accuracies": 0.96875, "rewards/chosen": -0.756640613079071, "rewards/margins": 4.598437309265137, "rewards/rejected": -5.353125095367432, "step": 1290 }, { "epoch": 1.1414141414141414, "grad_norm": 33.75630299524539, "learning_rate": 7.146309314586994e-07, "logits/chosen": 0.15330810844898224, "logits/rejected": 0.13568115234375, "logps/chosen": -286.25, "logps/rejected": -421.8999938964844, "loss": 0.1012, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.20374146103858948, "rewards/margins": 5.221093654632568, "rewards/rejected": -5.422656059265137, "step": 1300 }, { "epoch": 1.150197628458498, "grad_norm": 24.33976283837031, "learning_rate": 7.124340949033392e-07, "logits/chosen": 0.16413573920726776, "logits/rejected": 0.18144531548023224, "logps/chosen": -267.6000061035156, "logps/rejected": -397.1499938964844, "loss": 0.096, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.04319458082318306, "rewards/margins": 4.983593940734863, "rewards/rejected": -5.032031059265137, "step": 1310 }, { "epoch": 1.1589811155028547, "grad_norm": 42.72385616856955, "learning_rate": 7.102372583479789e-07, "logits/chosen": 0.096405029296875, "logits/rejected": 0.13773193955421448, "logps/chosen": -337.25, "logps/rejected": -524.7000122070312, "loss": 0.0883, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.8704773187637329, "rewards/margins": 5.954687595367432, "rewards/rejected": -6.824999809265137, "step": 1320 }, { "epoch": 1.1677646025472113, "grad_norm": 33.455309057722786, "learning_rate": 7.080404217926186e-07, "logits/chosen": -0.143218994140625, "logits/rejected": -0.04854736477136612, "logps/chosen": -289.625, "logps/rejected": -453.3999938964844, "loss": 0.0672, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.06982421875, "rewards/margins": 5.639843940734863, "rewards/rejected": -6.7109375, "step": 1330 }, { "epoch": 1.1765480895915679, "grad_norm": 4.197746410025961, "learning_rate": 7.058435852372583e-07, "logits/chosen": -0.22938232123851776, "logits/rejected": -0.18323364853858948, "logps/chosen": -325.6000061035156, "logps/rejected": -514.2000122070312, "loss": 0.0981, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.393310546875, "rewards/margins": 6.366406440734863, "rewards/rejected": -7.755468845367432, "step": 1340 }, { "epoch": 1.1853315766359245, "grad_norm": 23.197391121891183, "learning_rate": 7.03646748681898e-07, "logits/chosen": -0.30302125215530396, "logits/rejected": -0.21962127089500427, "logps/chosen": -301.75, "logps/rejected": -451.75, "loss": 0.0666, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.424414038658142, "rewards/margins": 6.172656059265137, "rewards/rejected": -7.602343559265137, "step": 1350 }, { "epoch": 1.194115063680281, "grad_norm": 44.82811377024226, "learning_rate": 7.014499121265377e-07, "logits/chosen": -0.296914666891098, "logits/rejected": -0.35521239042282104, "logps/chosen": -341.20001220703125, "logps/rejected": -479.70001220703125, "loss": 0.1007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.53857421875, "rewards/margins": 6.357031345367432, "rewards/rejected": -7.893750190734863, "step": 1360 }, { "epoch": 1.2028985507246377, "grad_norm": 30.366274590830006, "learning_rate": 6.992530755711776e-07, "logits/chosen": -0.09590606391429901, "logits/rejected": -0.04508667066693306, "logps/chosen": -266.07501220703125, "logps/rejected": -442.5, "loss": 0.1168, "rewards/accuracies": 0.96875, "rewards/chosen": -0.98028564453125, "rewards/margins": 5.518750190734863, "rewards/rejected": -6.504687309265137, "step": 1370 }, { "epoch": 1.2116820377689943, "grad_norm": 23.08614796264542, "learning_rate": 6.970562390158172e-07, "logits/chosen": -0.04123535007238388, "logits/rejected": 0.00954589806497097, "logps/chosen": -326.8999938964844, "logps/rejected": -482.3999938964844, "loss": 0.0798, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.46101075410842896, "rewards/margins": 5.574999809265137, "rewards/rejected": -6.032812595367432, "step": 1380 }, { "epoch": 1.220465524813351, "grad_norm": 12.81882417875156, "learning_rate": 6.94859402460457e-07, "logits/chosen": 0.09822998195886612, "logits/rejected": 0.23986205458641052, "logps/chosen": -341.1000061035156, "logps/rejected": -550.2000122070312, "loss": 0.0915, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.49250489473342896, "rewards/margins": 5.896093845367432, "rewards/rejected": -6.382031440734863, "step": 1390 }, { "epoch": 1.2292490118577075, "grad_norm": 22.581524316483865, "learning_rate": 6.926625659050966e-07, "logits/chosen": -0.10782776027917862, "logits/rejected": -0.006579590030014515, "logps/chosen": -299.3500061035156, "logps/rejected": -438.20001220703125, "loss": 0.0821, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6231018304824829, "rewards/margins": 5.3515625, "rewards/rejected": -5.978125095367432, "step": 1400 }, { "epoch": 1.2380324989020641, "grad_norm": 22.887229083501676, "learning_rate": 6.904657293497364e-07, "logits/chosen": -0.09979858249425888, "logits/rejected": -0.1475830078125, "logps/chosen": -315.8999938964844, "logps/rejected": -487.54998779296875, "loss": 0.0921, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.139428734779358, "rewards/margins": 6.382031440734863, "rewards/rejected": -7.522656440734863, "step": 1410 }, { "epoch": 1.2468159859464207, "grad_norm": 8.498974531549697, "learning_rate": 6.88268892794376e-07, "logits/chosen": -0.27006834745407104, "logits/rejected": -0.172526553273201, "logps/chosen": -325.6000061035156, "logps/rejected": -487.8999938964844, "loss": 0.0818, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.462915062904358, "rewards/margins": 6.350781440734863, "rewards/rejected": -7.805468559265137, "step": 1420 }, { "epoch": 1.2555994729907773, "grad_norm": 72.48319653576353, "learning_rate": 6.860720562390158e-07, "logits/chosen": -0.13785400986671448, "logits/rejected": -0.14661864936351776, "logps/chosen": -315.5, "logps/rejected": -432.6000061035156, "loss": 0.0955, "rewards/accuracies": 0.96875, "rewards/chosen": -1.141845703125, "rewards/margins": 6.037499904632568, "rewards/rejected": -7.1796875, "step": 1430 }, { "epoch": 1.264382960035134, "grad_norm": 39.174641640963245, "learning_rate": 6.838752196836555e-07, "logits/chosen": -0.12259521335363388, "logits/rejected": -0.09263916313648224, "logps/chosen": -296.70001220703125, "logps/rejected": -442.1000061035156, "loss": 0.0885, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9246902465820312, "rewards/margins": 5.868750095367432, "rewards/rejected": -6.783593654632568, "step": 1440 }, { "epoch": 1.2731664470794906, "grad_norm": 26.192167987037877, "learning_rate": 6.816783831282952e-07, "logits/chosen": -0.11909179389476776, "logits/rejected": -0.02045898512005806, "logps/chosen": -305.98748779296875, "logps/rejected": -470.0, "loss": 0.0925, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.84405517578125, "rewards/margins": 5.938281059265137, "rewards/rejected": -6.78125, "step": 1450 }, { "epoch": 1.2819499341238472, "grad_norm": 36.85669530727992, "learning_rate": 6.79481546572935e-07, "logits/chosen": -0.06085815280675888, "logits/rejected": -0.03992919996380806, "logps/chosen": -277.75, "logps/rejected": -437.95001220703125, "loss": 0.1168, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.977856457233429, "rewards/margins": 5.481249809265137, "rewards/rejected": -6.4609375, "step": 1460 }, { "epoch": 1.2907334211682038, "grad_norm": 40.7871195105309, "learning_rate": 6.772847100175747e-07, "logits/chosen": -0.34234619140625, "logits/rejected": -0.33674317598342896, "logps/chosen": -300.3500061035156, "logps/rejected": -453.29998779296875, "loss": 0.1009, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.5572509765625, "rewards/margins": 5.637499809265137, "rewards/rejected": -7.196875095367432, "step": 1470 }, { "epoch": 1.2995169082125604, "grad_norm": 11.572653992999776, "learning_rate": 6.750878734622144e-07, "logits/chosen": -0.123931884765625, "logits/rejected": -0.17093506455421448, "logps/chosen": -346.3999938964844, "logps/rejected": -540.2999877929688, "loss": 0.055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5871093273162842, "rewards/margins": 6.801562309265137, "rewards/rejected": -8.385937690734863, "step": 1480 }, { "epoch": 1.308300395256917, "grad_norm": 20.430299685595873, "learning_rate": 6.728910369068542e-07, "logits/chosen": -0.04848632961511612, "logits/rejected": -0.1207427978515625, "logps/chosen": -309.1000061035156, "logps/rejected": -484.6000061035156, "loss": 0.0714, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.4919922351837158, "rewards/margins": 6.565625190734863, "rewards/rejected": -8.056249618530273, "step": 1490 }, { "epoch": 1.3170838823012736, "grad_norm": 41.67606779213329, "learning_rate": 6.706942003514938e-07, "logits/chosen": -0.0581207275390625, "logits/rejected": -0.07653198391199112, "logps/chosen": -303.8999938964844, "logps/rejected": -432.3999938964844, "loss": 0.0806, "rewards/accuracies": 0.96875, "rewards/chosen": -1.149560570716858, "rewards/margins": 6.154687404632568, "rewards/rejected": -7.301562309265137, "step": 1500 }, { "epoch": 1.3258673693456302, "grad_norm": 30.205562951546153, "learning_rate": 6.684973637961336e-07, "logits/chosen": -0.02061157301068306, "logits/rejected": 0.03819580003619194, "logps/chosen": -308.8999938964844, "logps/rejected": -529.25, "loss": 0.0628, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3117187023162842, "rewards/margins": 6.530468940734863, "rewards/rejected": -7.845312595367432, "step": 1510 }, { "epoch": 1.3346508563899868, "grad_norm": 40.26126731262252, "learning_rate": 6.663005272407732e-07, "logits/chosen": 0.0064331055618822575, "logits/rejected": 0.018768310546875, "logps/chosen": -338.3500061035156, "logps/rejected": -516.2999877929688, "loss": 0.0834, "rewards/accuracies": 0.96875, "rewards/chosen": -1.207373023033142, "rewards/margins": 6.135937690734863, "rewards/rejected": -7.346875190734863, "step": 1520 }, { "epoch": 1.3434343434343434, "grad_norm": 2.273513081374984, "learning_rate": 6.64103690685413e-07, "logits/chosen": -0.02860107459127903, "logits/rejected": -0.02980346605181694, "logps/chosen": -293.04998779296875, "logps/rejected": -466.3999938964844, "loss": 0.0887, "rewards/accuracies": 0.96875, "rewards/chosen": -0.650708019733429, "rewards/margins": 5.749218940734863, "rewards/rejected": -6.403124809265137, "step": 1530 }, { "epoch": 1.3522178304787, "grad_norm": 39.55844965075696, "learning_rate": 6.619068541300526e-07, "logits/chosen": 0.01328125037252903, "logits/rejected": -0.01910400390625, "logps/chosen": -330.5, "logps/rejected": -510.70001220703125, "loss": 0.0963, "rewards/accuracies": 0.96875, "rewards/chosen": -0.751983642578125, "rewards/margins": 6.418749809265137, "rewards/rejected": -7.170312404632568, "step": 1540 }, { "epoch": 1.3610013175230566, "grad_norm": 15.703584734813845, "learning_rate": 6.597100175746925e-07, "logits/chosen": 0.02508544921875, "logits/rejected": 0.02389373816549778, "logps/chosen": -309.75, "logps/rejected": -509.29998779296875, "loss": 0.062, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.786328136920929, "rewards/margins": 6.303906440734863, "rewards/rejected": -7.087500095367432, "step": 1550 }, { "epoch": 1.3697848045674132, "grad_norm": 13.81014984288943, "learning_rate": 6.575131810193322e-07, "logits/chosen": -0.26025390625, "logits/rejected": -0.24281616508960724, "logps/chosen": -299.8999938964844, "logps/rejected": -482.70001220703125, "loss": 0.083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8814941644668579, "rewards/margins": 6.403124809265137, "rewards/rejected": -7.2890625, "step": 1560 }, { "epoch": 1.3785682916117699, "grad_norm": 140.99532440848787, "learning_rate": 6.553163444639719e-07, "logits/chosen": -0.147796630859375, "logits/rejected": -0.06104583665728569, "logps/chosen": -275.32501220703125, "logps/rejected": -433.5, "loss": 0.1215, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2014038562774658, "rewards/margins": 5.592187404632568, "rewards/rejected": -6.790625095367432, "step": 1570 }, { "epoch": 1.3873517786561265, "grad_norm": 39.392698774350976, "learning_rate": 6.531195079086116e-07, "logits/chosen": -0.04363403469324112, "logits/rejected": -0.11330566555261612, "logps/chosen": -301.1499938964844, "logps/rejected": -478.3999938964844, "loss": 0.1033, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3847167491912842, "rewards/margins": 6.044531345367432, "rewards/rejected": -7.432812690734863, "step": 1580 }, { "epoch": 1.396135265700483, "grad_norm": 18.064117443594974, "learning_rate": 6.509226713532513e-07, "logits/chosen": -0.09661255031824112, "logits/rejected": -0.08617858588695526, "logps/chosen": -338.3999938964844, "logps/rejected": -509.20001220703125, "loss": 0.071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.420751929283142, "rewards/margins": 5.94921875, "rewards/rejected": -7.368750095367432, "step": 1590 }, { "epoch": 1.4049187527448397, "grad_norm": 26.80636550134301, "learning_rate": 6.48725834797891e-07, "logits/chosen": 0.01853027381002903, "logits/rejected": -0.04599304124712944, "logps/chosen": -314.4750061035156, "logps/rejected": -520.7000122070312, "loss": 0.0897, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.341210961341858, "rewards/margins": 5.896874904632568, "rewards/rejected": -7.239062309265137, "step": 1600 }, { "epoch": 1.4137022397891963, "grad_norm": 31.248682930732706, "learning_rate": 6.465289982425308e-07, "logits/chosen": -0.24051514267921448, "logits/rejected": -0.18806152045726776, "logps/chosen": -285.79998779296875, "logps/rejected": -472.6000061035156, "loss": 0.0742, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5554687976837158, "rewards/margins": 6.047656059265137, "rewards/rejected": -7.59375, "step": 1610 }, { "epoch": 1.422485726833553, "grad_norm": 50.04592639455034, "learning_rate": 6.443321616871704e-07, "logits/chosen": -0.30909425020217896, "logits/rejected": -0.15338134765625, "logps/chosen": -288.0, "logps/rejected": -472.1000061035156, "loss": 0.1134, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.6801025867462158, "rewards/margins": 6.086718559265137, "rewards/rejected": -7.764843940734863, "step": 1620 }, { "epoch": 1.4312692138779095, "grad_norm": 43.77129022403175, "learning_rate": 6.421353251318102e-07, "logits/chosen": -0.0555419921875, "logits/rejected": -0.04697265475988388, "logps/chosen": -351.1499938964844, "logps/rejected": -532.7000122070312, "loss": 0.0854, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.521875023841858, "rewards/margins": 7.017187595367432, "rewards/rejected": -8.546875, "step": 1630 }, { "epoch": 1.4400527009222661, "grad_norm": 52.19870242919901, "learning_rate": 6.399384885764498e-07, "logits/chosen": -0.13608399033546448, "logits/rejected": -0.04797973483800888, "logps/chosen": -302.3500061035156, "logps/rejected": -484.1000061035156, "loss": 0.1136, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.13232421875, "rewards/margins": 6.109375, "rewards/rejected": -7.2421875, "step": 1640 }, { "epoch": 1.4488361879666227, "grad_norm": 10.701986961032219, "learning_rate": 6.377416520210897e-07, "logits/chosen": 0.09787597507238388, "logits/rejected": 0.13475342094898224, "logps/chosen": -287.04998779296875, "logps/rejected": -461.5, "loss": 0.1067, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9332275390625, "rewards/margins": 5.341406345367432, "rewards/rejected": -6.271093845367432, "step": 1650 }, { "epoch": 1.4576196750109793, "grad_norm": 23.88294358993554, "learning_rate": 6.355448154657293e-07, "logits/chosen": 0.10003662109375, "logits/rejected": 0.02080078050494194, "logps/chosen": -320.6000061035156, "logps/rejected": -488.0, "loss": 0.0622, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.574267566204071, "rewards/margins": 6.303124904632568, "rewards/rejected": -6.877343654632568, "step": 1660 }, { "epoch": 1.466403162055336, "grad_norm": 81.97171050960371, "learning_rate": 6.333479789103691e-07, "logits/chosen": -0.00798950158059597, "logits/rejected": 0.04533081129193306, "logps/chosen": -316.2749938964844, "logps/rejected": -474.29998779296875, "loss": 0.1318, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8567870855331421, "rewards/margins": 6.036718845367432, "rewards/rejected": -6.892968654632568, "step": 1670 }, { "epoch": 1.4751866490996925, "grad_norm": 34.99106026435826, "learning_rate": 6.311511423550088e-07, "logits/chosen": 0.0147705078125, "logits/rejected": 0.01856689527630806, "logps/chosen": -292.04998779296875, "logps/rejected": -453.79998779296875, "loss": 0.1172, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.673449695110321, "rewards/margins": 5.646484375, "rewards/rejected": -6.311718940734863, "step": 1680 }, { "epoch": 1.4839701361440492, "grad_norm": 19.189793786334235, "learning_rate": 6.289543057996485e-07, "logits/chosen": 0.05220336839556694, "logits/rejected": 0.0745697021484375, "logps/chosen": -349.8500061035156, "logps/rejected": -523.2000122070312, "loss": 0.0751, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.803509533405304, "rewards/margins": 5.828906059265137, "rewards/rejected": -6.632031440734863, "step": 1690 }, { "epoch": 1.4927536231884058, "grad_norm": 17.575816677073036, "learning_rate": 6.267574692442882e-07, "logits/chosen": 0.04204712063074112, "logits/rejected": -0.05594482272863388, "logps/chosen": -358.6000061035156, "logps/rejected": -502.79998779296875, "loss": 0.071, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.0838501453399658, "rewards/margins": 6.247656345367432, "rewards/rejected": -7.331250190734863, "step": 1700 }, { "epoch": 1.5015371102327624, "grad_norm": 36.31414349069316, "learning_rate": 6.245606326889279e-07, "logits/chosen": -0.03830566257238388, "logits/rejected": -0.01305541954934597, "logps/chosen": -313.3999938964844, "logps/rejected": -459.79998779296875, "loss": 0.0917, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6198180913925171, "rewards/margins": 6.130468845367432, "rewards/rejected": -6.749218940734863, "step": 1710 }, { "epoch": 1.510320597277119, "grad_norm": 31.77463317343136, "learning_rate": 6.223637961335676e-07, "logits/chosen": -0.0306396484375, "logits/rejected": 0.05202026292681694, "logps/chosen": -295.0, "logps/rejected": -466.1000061035156, "loss": 0.0826, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.867749035358429, "rewards/margins": 5.69921875, "rewards/rejected": -6.5703125, "step": 1720 }, { "epoch": 1.5191040843214756, "grad_norm": 30.014341027400945, "learning_rate": 6.201669595782074e-07, "logits/chosen": -0.06491699069738388, "logits/rejected": 0.0181884765625, "logps/chosen": -294.1499938964844, "logps/rejected": -462.0, "loss": 0.0815, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.697070300579071, "rewards/margins": 5.817187309265137, "rewards/rejected": -6.509375095367432, "step": 1730 }, { "epoch": 1.5278875713658322, "grad_norm": 9.785163614435682, "learning_rate": 6.179701230228471e-07, "logits/chosen": 0.01868896558880806, "logits/rejected": 0.01265869103372097, "logps/chosen": -313.54998779296875, "logps/rejected": -482.6000061035156, "loss": 0.0723, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.662158191204071, "rewards/margins": 5.901562690734863, "rewards/rejected": -6.565625190734863, "step": 1740 }, { "epoch": 1.5366710584101888, "grad_norm": 27.858717568090693, "learning_rate": 6.157732864674869e-07, "logits/chosen": -0.03766479343175888, "logits/rejected": -0.11530151218175888, "logps/chosen": -301.6499938964844, "logps/rejected": -438.20001220703125, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -0.6853698492050171, "rewards/margins": 5.970312595367432, "rewards/rejected": -6.654687404632568, "step": 1750 }, { "epoch": 1.5454545454545454, "grad_norm": 6.031599385188138, "learning_rate": 6.135764499121265e-07, "logits/chosen": -0.12230835109949112, "logits/rejected": 0.006335449405014515, "logps/chosen": -299.54998779296875, "logps/rejected": -515.5999755859375, "loss": 0.0915, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.496484398841858, "rewards/margins": 6.317187309265137, "rewards/rejected": -7.814843654632568, "step": 1760 }, { "epoch": 1.554238032498902, "grad_norm": 34.93567947616748, "learning_rate": 6.113796133567663e-07, "logits/chosen": -0.09699096530675888, "logits/rejected": -0.2086944580078125, "logps/chosen": -317.6000061035156, "logps/rejected": -430.70001220703125, "loss": 0.1074, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9927978515625, "rewards/margins": 6.052343845367432, "rewards/rejected": -7.042187690734863, "step": 1770 }, { "epoch": 1.5630215195432586, "grad_norm": 38.57937332823341, "learning_rate": 6.091827768014059e-07, "logits/chosen": -0.05367736890912056, "logits/rejected": -0.02277832105755806, "logps/chosen": -285.29998779296875, "logps/rejected": -426.8500061035156, "loss": 0.1117, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.555145263671875, "rewards/margins": 5.666406154632568, "rewards/rejected": -6.217187404632568, "step": 1780 }, { "epoch": 1.5718050065876152, "grad_norm": 44.13709302827605, "learning_rate": 6.069859402460457e-07, "logits/chosen": 0.03424072265625, "logits/rejected": -0.08402099460363388, "logps/chosen": -303.9750061035156, "logps/rejected": -465.79998779296875, "loss": 0.0666, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.05029296875, "rewards/margins": 6.082812309265137, "rewards/rejected": -7.128125190734863, "step": 1790 }, { "epoch": 1.5805884936319718, "grad_norm": 20.368122928110196, "learning_rate": 6.047891036906854e-07, "logits/chosen": -0.2890625, "logits/rejected": -0.21949462592601776, "logps/chosen": -297.75, "logps/rejected": -510.70001220703125, "loss": 0.104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.641699194908142, "rewards/margins": 6.796875, "rewards/rejected": -8.439062118530273, "step": 1800 }, { "epoch": 1.5893719806763285, "grad_norm": 101.26013950027891, "learning_rate": 6.025922671353251e-07, "logits/chosen": -0.3020690977573395, "logits/rejected": -0.2308197021484375, "logps/chosen": -304.54998779296875, "logps/rejected": -470.1000061035156, "loss": 0.1067, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.429834008216858, "rewards/margins": 5.965624809265137, "rewards/rejected": -7.392187595367432, "step": 1810 }, { "epoch": 1.598155467720685, "grad_norm": 21.083205067537076, "learning_rate": 6.003954305799648e-07, "logits/chosen": -0.2089691162109375, "logits/rejected": -0.19093628227710724, "logps/chosen": -323.75, "logps/rejected": -526.5, "loss": 0.0928, "rewards/accuracies": 0.96875, "rewards/chosen": -1.118066430091858, "rewards/margins": 6.435937404632568, "rewards/rejected": -7.550000190734863, "step": 1820 }, { "epoch": 1.6069389547650417, "grad_norm": 19.076293718173105, "learning_rate": 5.981985940246046e-07, "logits/chosen": 0.02821960486471653, "logits/rejected": 0.02653198316693306, "logps/chosen": -309.25, "logps/rejected": -478.29998779296875, "loss": 0.0692, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7344604730606079, "rewards/margins": 6.1328125, "rewards/rejected": -6.860937595367432, "step": 1830 }, { "epoch": 1.6157224418093983, "grad_norm": 26.940462924266946, "learning_rate": 5.960017574692443e-07, "logits/chosen": -0.03572692722082138, "logits/rejected": 0.02583007887005806, "logps/chosen": -268.82501220703125, "logps/rejected": -457.29998779296875, "loss": 0.1003, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.56201171875, "rewards/margins": 5.69921875, "rewards/rejected": -6.259375095367432, "step": 1840 }, { "epoch": 1.6245059288537549, "grad_norm": 33.75996773138413, "learning_rate": 5.938049209138841e-07, "logits/chosen": -0.07940063625574112, "logits/rejected": -0.09064330905675888, "logps/chosen": -315.6499938964844, "logps/rejected": -444.3500061035156, "loss": 0.095, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6738647222518921, "rewards/margins": 5.549218654632568, "rewards/rejected": -6.223437309265137, "step": 1850 }, { "epoch": 1.6332894158981115, "grad_norm": 22.434054688810456, "learning_rate": 5.916080843585237e-07, "logits/chosen": -0.2037353515625, "logits/rejected": -0.14406737685203552, "logps/chosen": -321.7250061035156, "logps/rejected": -497.25, "loss": 0.0707, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.711474597454071, "rewards/margins": 6.3671875, "rewards/rejected": -7.078125, "step": 1860 }, { "epoch": 1.642072902942468, "grad_norm": 51.94268184236992, "learning_rate": 5.894112478031635e-07, "logits/chosen": -0.10286559909582138, "logits/rejected": -0.10621337592601776, "logps/chosen": -300.20001220703125, "logps/rejected": -490.70001220703125, "loss": 0.11, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9009033441543579, "rewards/margins": 6.412499904632568, "rewards/rejected": -7.3125, "step": 1870 }, { "epoch": 1.6508563899868247, "grad_norm": 12.968537032938697, "learning_rate": 5.872144112478031e-07, "logits/chosen": -0.2662719786167145, "logits/rejected": -0.22523804008960724, "logps/chosen": -313.04998779296875, "logps/rejected": -469.29998779296875, "loss": 0.0584, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1854979991912842, "rewards/margins": 6.384375095367432, "rewards/rejected": -7.573437690734863, "step": 1880 }, { "epoch": 1.6596398770311813, "grad_norm": 17.30185682994802, "learning_rate": 5.850175746924429e-07, "logits/chosen": -0.15960693359375, "logits/rejected": -0.15380859375, "logps/chosen": -295.17498779296875, "logps/rejected": -466.8999938964844, "loss": 0.1076, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4401366710662842, "rewards/margins": 6.296875, "rewards/rejected": -7.729687690734863, "step": 1890 }, { "epoch": 1.668423364075538, "grad_norm": 20.93601442355895, "learning_rate": 5.828207381370825e-07, "logits/chosen": -0.21343994140625, "logits/rejected": -0.17524413764476776, "logps/chosen": -302.0249938964844, "logps/rejected": -494.54998779296875, "loss": 0.0828, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0376465320587158, "rewards/margins": 6.534375190734863, "rewards/rejected": -7.568749904632568, "step": 1900 }, { "epoch": 1.6772068511198945, "grad_norm": 29.65540149424815, "learning_rate": 5.806239015817222e-07, "logits/chosen": -0.15504150092601776, "logits/rejected": -0.11469726264476776, "logps/chosen": -323.0, "logps/rejected": -463.04998779296875, "loss": 0.071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0254395008087158, "rewards/margins": 6.099218845367432, "rewards/rejected": -7.124218940734863, "step": 1910 }, { "epoch": 1.6859903381642511, "grad_norm": 5.250230410143558, "learning_rate": 5.78427065026362e-07, "logits/chosen": -0.14237670600414276, "logits/rejected": -0.16058349609375, "logps/chosen": -296.29998779296875, "logps/rejected": -483.5, "loss": 0.0694, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.197509765625, "rewards/margins": 6.432812690734863, "rewards/rejected": -7.626562595367432, "step": 1920 }, { "epoch": 1.6947738252086078, "grad_norm": 17.0013676480821, "learning_rate": 5.762302284710018e-07, "logits/chosen": -0.19916382431983948, "logits/rejected": -0.2809814512729645, "logps/chosen": -306.95001220703125, "logps/rejected": -480.5, "loss": 0.0655, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.049902319908142, "rewards/margins": 6.931250095367432, "rewards/rejected": -7.979687690734863, "step": 1930 }, { "epoch": 1.7035573122529644, "grad_norm": 5.837699036175044, "learning_rate": 5.740333919156415e-07, "logits/chosen": -0.15673828125, "logits/rejected": -0.21162644028663635, "logps/chosen": -340.0, "logps/rejected": -534.75, "loss": 0.0537, "rewards/accuracies": 0.96875, "rewards/chosen": -1.410009741783142, "rewards/margins": 7.0234375, "rewards/rejected": -8.435937881469727, "step": 1940 }, { "epoch": 1.712340799297321, "grad_norm": 42.723525696751, "learning_rate": 5.718365553602812e-07, "logits/chosen": -0.09373168647289276, "logits/rejected": -0.2821899354457855, "logps/chosen": -312.5, "logps/rejected": -493.25, "loss": 0.0813, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.518310546875, "rewards/margins": 6.853125095367432, "rewards/rejected": -8.375, "step": 1950 }, { "epoch": 1.7211242863416776, "grad_norm": 29.821882896848162, "learning_rate": 5.696397188049209e-07, "logits/chosen": -0.23991699516773224, "logits/rejected": -0.10544433444738388, "logps/chosen": -359.32501220703125, "logps/rejected": -494.0, "loss": 0.1012, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.614477515220642, "rewards/margins": 6.631249904632568, "rewards/rejected": -8.248437881469727, "step": 1960 }, { "epoch": 1.7299077733860342, "grad_norm": 63.18098561145562, "learning_rate": 5.674428822495607e-07, "logits/chosen": -0.136363223195076, "logits/rejected": -0.10057983547449112, "logps/chosen": -299.8999938964844, "logps/rejected": -473.0, "loss": 0.0799, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9446777105331421, "rewards/margins": 6.229687690734863, "rewards/rejected": -7.1796875, "step": 1970 }, { "epoch": 1.7386912604303908, "grad_norm": 99.05515470141863, "learning_rate": 5.652460456942003e-07, "logits/chosen": -0.06154479831457138, "logits/rejected": -0.09433593600988388, "logps/chosen": -287.25, "logps/rejected": -439.6499938964844, "loss": 0.1103, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4610595703125, "rewards/margins": 5.671875, "rewards/rejected": -6.131249904632568, "step": 1980 }, { "epoch": 1.7474747474747474, "grad_norm": 36.625926043047556, "learning_rate": 5.6304920913884e-07, "logits/chosen": -0.02309570275247097, "logits/rejected": -0.0142364501953125, "logps/chosen": -297.1499938964844, "logps/rejected": -490.3999938964844, "loss": 0.0908, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.6238769292831421, "rewards/margins": 5.895312309265137, "rewards/rejected": -6.518750190734863, "step": 1990 }, { "epoch": 1.756258234519104, "grad_norm": 34.12914874744045, "learning_rate": 5.608523725834797e-07, "logits/chosen": -0.04711608961224556, "logits/rejected": -0.05615844577550888, "logps/chosen": -282.6000061035156, "logps/rejected": -444.6499938964844, "loss": 0.1037, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2186279296875, "rewards/margins": 6.103906154632568, "rewards/rejected": -6.327343940734863, "step": 2000 }, { "epoch": 1.7650417215634606, "grad_norm": 9.397946856956624, "learning_rate": 5.586555360281194e-07, "logits/chosen": 0.0036254883743822575, "logits/rejected": 0.117919921875, "logps/chosen": -320.79998779296875, "logps/rejected": -509.0, "loss": 0.0644, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.717822253704071, "rewards/margins": 6.223437309265137, "rewards/rejected": -6.942187309265137, "step": 2010 }, { "epoch": 1.7738252086078172, "grad_norm": 27.734665662012336, "learning_rate": 5.564586994727593e-07, "logits/chosen": -0.19244995713233948, "logits/rejected": -0.09702758491039276, "logps/chosen": -307.70001220703125, "logps/rejected": -436.1000061035156, "loss": 0.1049, "rewards/accuracies": 0.96875, "rewards/chosen": -0.854736328125, "rewards/margins": 5.5078125, "rewards/rejected": -6.364062309265137, "step": 2020 }, { "epoch": 1.7826086956521738, "grad_norm": 82.84739687389144, "learning_rate": 5.54261862917399e-07, "logits/chosen": -0.23595580458641052, "logits/rejected": -0.2042800933122635, "logps/chosen": -306.75, "logps/rejected": -485.0, "loss": 0.0694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.499365210533142, "rewards/margins": 6.946875095367432, "rewards/rejected": -8.442187309265137, "step": 2030 }, { "epoch": 1.7913921826965304, "grad_norm": 40.66563001671908, "learning_rate": 5.520650263620387e-07, "logits/chosen": -0.2695983946323395, "logits/rejected": -0.19211654365062714, "logps/chosen": -318.3500061035156, "logps/rejected": -472.54998779296875, "loss": 0.1045, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.543359398841858, "rewards/margins": 6.353125095367432, "rewards/rejected": -7.890625, "step": 2040 }, { "epoch": 1.800175669740887, "grad_norm": 23.60237891615322, "learning_rate": 5.498681898066783e-07, "logits/chosen": -0.12987060844898224, "logits/rejected": -0.15207596123218536, "logps/chosen": -330.6499938964844, "logps/rejected": -500.5, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -1.2175781726837158, "rewards/margins": 6.675000190734863, "rewards/rejected": -7.890625, "step": 2050 }, { "epoch": 1.8089591567852437, "grad_norm": 15.803422446370408, "learning_rate": 5.476713532513181e-07, "logits/chosen": -0.24332275986671448, "logits/rejected": -0.16697387397289276, "logps/chosen": -299.6000061035156, "logps/rejected": -429.70001220703125, "loss": 0.0789, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.777478039264679, "rewards/margins": 6.024218559265137, "rewards/rejected": -6.801562309265137, "step": 2060 }, { "epoch": 1.8177426438296003, "grad_norm": 30.766787548356177, "learning_rate": 5.454745166959577e-07, "logits/chosen": -0.19248351454734802, "logits/rejected": -0.09450988471508026, "logps/chosen": -339.8999938964844, "logps/rejected": -527.9000244140625, "loss": 0.1033, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.240441918373108, "rewards/margins": 6.920312404632568, "rewards/rejected": -8.155468940734863, "step": 2070 }, { "epoch": 1.8265261308739569, "grad_norm": 12.039363761474991, "learning_rate": 5.432776801405975e-07, "logits/chosen": -0.14935989677906036, "logits/rejected": -0.14826659858226776, "logps/chosen": -321.32501220703125, "logps/rejected": -470.20001220703125, "loss": 0.1489, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9764648675918579, "rewards/margins": 5.357031345367432, "rewards/rejected": -6.329687595367432, "step": 2080 }, { "epoch": 1.8353096179183135, "grad_norm": 31.65237337653492, "learning_rate": 5.410808435852372e-07, "logits/chosen": -0.12595215439796448, "logits/rejected": -0.05571288987994194, "logps/chosen": -304.1499938964844, "logps/rejected": -485.6499938964844, "loss": 0.0871, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.44743043184280396, "rewards/margins": 6.046875, "rewards/rejected": -6.495312690734863, "step": 2090 }, { "epoch": 1.84409310496267, "grad_norm": 27.490366668075296, "learning_rate": 5.388840070298769e-07, "logits/chosen": -0.18271484971046448, "logits/rejected": -0.11206664890050888, "logps/chosen": -304.3500061035156, "logps/rejected": -495.0, "loss": 0.0831, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.765045166015625, "rewards/margins": 6.43359375, "rewards/rejected": -7.198437690734863, "step": 2100 }, { "epoch": 1.8528765920070267, "grad_norm": 14.000245772855038, "learning_rate": 5.366871704745168e-07, "logits/chosen": -0.23728027939796448, "logits/rejected": -0.32062989473342896, "logps/chosen": -307.54998779296875, "logps/rejected": -463.70001220703125, "loss": 0.0825, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.85986328125, "rewards/margins": 6.518750190734863, "rewards/rejected": -7.375, "step": 2110 }, { "epoch": 1.8616600790513833, "grad_norm": 51.3097738063691, "learning_rate": 5.344903339191564e-07, "logits/chosen": -0.18938598036766052, "logits/rejected": -0.18184204399585724, "logps/chosen": -306.2250061035156, "logps/rejected": -489.29998779296875, "loss": 0.0887, "rewards/accuracies": 0.96875, "rewards/chosen": -1.231787085533142, "rewards/margins": 6.875, "rewards/rejected": -8.108593940734863, "step": 2120 }, { "epoch": 1.87044356609574, "grad_norm": 19.761661096166794, "learning_rate": 5.322934973637961e-07, "logits/chosen": -0.23894043266773224, "logits/rejected": -0.18857422471046448, "logps/chosen": -314.70001220703125, "logps/rejected": -503.3999938964844, "loss": 0.0748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.035772681236267, "rewards/margins": 6.818749904632568, "rewards/rejected": -7.860937595367432, "step": 2130 }, { "epoch": 1.8792270531400965, "grad_norm": 30.438383595430253, "learning_rate": 5.300966608084358e-07, "logits/chosen": 0.04300536960363388, "logits/rejected": -0.004803466610610485, "logps/chosen": -325.70001220703125, "logps/rejected": -476.6000061035156, "loss": 0.1145, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.80291748046875, "rewards/margins": 6.453125, "rewards/rejected": -7.254687309265137, "step": 2140 }, { "epoch": 1.8880105401844531, "grad_norm": 63.639150902985186, "learning_rate": 5.278998242530755e-07, "logits/chosen": 0.02180786058306694, "logits/rejected": -0.0143585205078125, "logps/chosen": -334.3500061035156, "logps/rejected": -469.1000061035156, "loss": 0.1083, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.711334228515625, "rewards/margins": 5.938281059265137, "rewards/rejected": -6.6484375, "step": 2150 }, { "epoch": 1.8967940272288097, "grad_norm": 23.68628114874884, "learning_rate": 5.257029876977153e-07, "logits/chosen": 0.11909179389476776, "logits/rejected": 0.16771849989891052, "logps/chosen": -300.54998779296875, "logps/rejected": -492.5, "loss": 0.0719, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3790527284145355, "rewards/margins": 6.072656154632568, "rewards/rejected": -6.451562404632568, "step": 2160 }, { "epoch": 1.9055775142731664, "grad_norm": 26.39124925194465, "learning_rate": 5.235061511423549e-07, "logits/chosen": 0.007229614071547985, "logits/rejected": -0.02583618089556694, "logps/chosen": -299.95001220703125, "logps/rejected": -446.20001220703125, "loss": 0.0841, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5140746831893921, "rewards/margins": 5.712500095367432, "rewards/rejected": -6.224999904632568, "step": 2170 }, { "epoch": 1.914361001317523, "grad_norm": 42.000177429250876, "learning_rate": 5.213093145869947e-07, "logits/chosen": 0.08103332668542862, "logits/rejected": 0.17319336533546448, "logps/chosen": -314.04998779296875, "logps/rejected": -459.8999938964844, "loss": 0.1042, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6610107421875, "rewards/margins": 5.836718559265137, "rewards/rejected": -6.502343654632568, "step": 2180 }, { "epoch": 1.9231444883618796, "grad_norm": 21.758560074591824, "learning_rate": 5.191124780316343e-07, "logits/chosen": 0.02301025390625, "logits/rejected": -0.02350463904440403, "logps/chosen": -286.04998779296875, "logps/rejected": -478.8999938964844, "loss": 0.1002, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.603271484375, "rewards/margins": 5.757031440734863, "rewards/rejected": -6.361718654632568, "step": 2190 }, { "epoch": 1.9319279754062362, "grad_norm": 20.890375889355628, "learning_rate": 5.169156414762741e-07, "logits/chosen": -0.0640869140625, "logits/rejected": -0.007781982421875, "logps/chosen": -374.25, "logps/rejected": -529.0, "loss": 0.0803, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.793261706829071, "rewards/margins": 6.7890625, "rewards/rejected": -7.579687595367432, "step": 2200 }, { "epoch": 1.9407114624505928, "grad_norm": 35.71746188228084, "learning_rate": 5.147188049209139e-07, "logits/chosen": -0.0218658447265625, "logits/rejected": 0.06565093994140625, "logps/chosen": -273.2749938964844, "logps/rejected": -420.6499938964844, "loss": 0.1243, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4482055604457855, "rewards/margins": 5.530468940734863, "rewards/rejected": -5.983593940734863, "step": 2210 }, { "epoch": 1.9494949494949494, "grad_norm": 25.123093372437598, "learning_rate": 5.125219683655536e-07, "logits/chosen": -0.109619140625, "logits/rejected": 0.0008331298595294356, "logps/chosen": -298.75, "logps/rejected": -480.79998779296875, "loss": 0.0821, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.811267077922821, "rewards/margins": 6.22265625, "rewards/rejected": -7.0390625, "step": 2220 }, { "epoch": 1.958278436539306, "grad_norm": 19.941292495173467, "learning_rate": 5.103251318101933e-07, "logits/chosen": -0.08156432956457138, "logits/rejected": -0.075225830078125, "logps/chosen": -281.5, "logps/rejected": -444.70001220703125, "loss": 0.111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8741210699081421, "rewards/margins": 6.090624809265137, "rewards/rejected": -6.966406345367432, "step": 2230 }, { "epoch": 1.9670619235836626, "grad_norm": 23.775021751773505, "learning_rate": 5.08128295254833e-07, "logits/chosen": -0.12982483208179474, "logits/rejected": -0.04172363132238388, "logps/chosen": -316.45001220703125, "logps/rejected": -500.8999938964844, "loss": 0.0968, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.257055640220642, "rewards/margins": 6.764062404632568, "rewards/rejected": -8.0234375, "step": 2240 }, { "epoch": 1.9758454106280192, "grad_norm": 24.26765103576258, "learning_rate": 5.059314586994727e-07, "logits/chosen": -0.26988524198532104, "logits/rejected": -0.2302902191877365, "logps/chosen": -355.20001220703125, "logps/rejected": -536.9000244140625, "loss": 0.0631, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.87347412109375, "rewards/margins": 7.090624809265137, "rewards/rejected": -8.978124618530273, "step": 2250 }, { "epoch": 1.9846288976723758, "grad_norm": 75.55438077858082, "learning_rate": 5.037346221441124e-07, "logits/chosen": -0.1328125, "logits/rejected": -0.042755126953125, "logps/chosen": -275.6000061035156, "logps/rejected": -518.7999877929688, "loss": 0.1198, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.5271484851837158, "rewards/margins": 7.129687309265137, "rewards/rejected": -8.659375190734863, "step": 2260 }, { "epoch": 1.9934123847167324, "grad_norm": 120.69268435207285, "learning_rate": 5.015377855887521e-07, "logits/chosen": -0.25457763671875, "logits/rejected": -0.11226806789636612, "logps/chosen": -285.29998779296875, "logps/rejected": -462.6499938964844, "loss": 0.083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1895630359649658, "rewards/margins": 6.404687404632568, "rewards/rejected": -7.594531059265137, "step": 2270 }, { "epoch": 2.0017566974088714, "grad_norm": 4.859324537679196, "learning_rate": 4.993409490333919e-07, "logits/chosen": -0.1587865799665451, "logits/rejected": -0.1274157017469406, "logps/chosen": -281.9473571777344, "logps/rejected": -446.47369384765625, "loss": 0.0814, "rewards/accuracies": 0.9671052694320679, "rewards/chosen": -1.0663034915924072, "rewards/margins": 6.1134867668151855, "rewards/rejected": -7.1759867668151855, "step": 2280 }, { "epoch": 2.010540184453228, "grad_norm": 12.2365555464515, "learning_rate": 4.971441124780316e-07, "logits/chosen": -0.202545166015625, "logits/rejected": -0.21123047173023224, "logps/chosen": -310.3999938964844, "logps/rejected": -484.1000061035156, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -0.8527282476425171, "rewards/margins": 7.512499809265137, "rewards/rejected": -8.362500190734863, "step": 2290 }, { "epoch": 2.0193236714975846, "grad_norm": 7.410889805166732, "learning_rate": 4.949472759226713e-07, "logits/chosen": -0.4449829161167145, "logits/rejected": -0.455902099609375, "logps/chosen": -336.8500061035156, "logps/rejected": -521.7000122070312, "loss": 0.0117, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.967382788658142, "rewards/margins": 8.279687881469727, "rewards/rejected": -10.239062309265137, "step": 2300 }, { "epoch": 2.0281071585419412, "grad_norm": 2.3894587536281082, "learning_rate": 4.92750439367311e-07, "logits/chosen": -0.4965576231479645, "logits/rejected": -0.5143066644668579, "logps/chosen": -328.20001220703125, "logps/rejected": -536.2999877929688, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -3.2177734375, "rewards/margins": 9.693750381469727, "rewards/rejected": -12.90625, "step": 2310 }, { "epoch": 2.036890645586298, "grad_norm": 22.856897559791378, "learning_rate": 4.905536028119508e-07, "logits/chosen": -0.5282226800918579, "logits/rejected": -0.5304199457168579, "logps/chosen": -354.04998779296875, "logps/rejected": -578.5999755859375, "loss": 0.0275, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.132226467132568, "rewards/margins": 9.471875190734863, "rewards/rejected": -13.598437309265137, "step": 2320 }, { "epoch": 2.0456741326306545, "grad_norm": 16.712558675069666, "learning_rate": 4.883567662565905e-07, "logits/chosen": -0.7026122808456421, "logits/rejected": -0.6421142816543579, "logps/chosen": -337.1000061035156, "logps/rejected": -496.75, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -3.186718702316284, "rewards/margins": 9.154687881469727, "rewards/rejected": -12.353124618530273, "step": 2330 }, { "epoch": 2.054457619675011, "grad_norm": 1.17865844511937, "learning_rate": 4.861599297012302e-07, "logits/chosen": -0.47197264432907104, "logits/rejected": -0.40800780057907104, "logps/chosen": -351.29998779296875, "logps/rejected": -595.0, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.4920897483825684, "rewards/margins": 9.8125, "rewards/rejected": -13.306249618530273, "step": 2340 }, { "epoch": 2.0632411067193677, "grad_norm": 3.580825961111089, "learning_rate": 4.839630931458699e-07, "logits/chosen": -0.3642578125, "logits/rejected": -0.4580078125, "logps/chosen": -314.1000061035156, "logps/rejected": -531.5999755859375, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -2.658831834793091, "rewards/margins": 9.517187118530273, "rewards/rejected": -12.184374809265137, "step": 2350 }, { "epoch": 2.0720245937637243, "grad_norm": 4.18998880895587, "learning_rate": 4.817662565905096e-07, "logits/chosen": -0.5641845464706421, "logits/rejected": -0.49580079317092896, "logps/chosen": -308.29998779296875, "logps/rejected": -464.3999938964844, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -1.9303710460662842, "rewards/margins": 8.1328125, "rewards/rejected": -10.059374809265137, "step": 2360 }, { "epoch": 2.080808080808081, "grad_norm": 14.114833617448781, "learning_rate": 4.795694200351494e-07, "logits/chosen": -0.553149402141571, "logits/rejected": -0.5236114263534546, "logps/chosen": -356.8999938964844, "logps/rejected": -561.7000122070312, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5380859375, "rewards/margins": 9.659375190734863, "rewards/rejected": -12.199999809265137, "step": 2370 }, { "epoch": 2.0895915678524375, "grad_norm": 0.9759060817806015, "learning_rate": 4.77372583479789e-07, "logits/chosen": -0.3748779296875, "logits/rejected": -0.598193347454071, "logps/chosen": -325.95001220703125, "logps/rejected": -527.2999877929688, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -2.6539063453674316, "rewards/margins": 9.765625, "rewards/rejected": -12.418749809265137, "step": 2380 }, { "epoch": 2.098375054896794, "grad_norm": 0.7326575734888836, "learning_rate": 4.751757469244288e-07, "logits/chosen": -0.4909118711948395, "logits/rejected": -0.519726574420929, "logps/chosen": -335.6000061035156, "logps/rejected": -511.70001220703125, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -2.5687499046325684, "rewards/margins": 8.864062309265137, "rewards/rejected": -11.448437690734863, "step": 2390 }, { "epoch": 2.1071585419411507, "grad_norm": 3.4396368038907195, "learning_rate": 4.729789103690685e-07, "logits/chosen": -0.45025634765625, "logits/rejected": -0.536145031452179, "logps/chosen": -284.3500061035156, "logps/rejected": -536.2000122070312, "loss": 0.0197, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7289061546325684, "rewards/margins": 9.731249809265137, "rewards/rejected": -12.454687118530273, "step": 2400 }, { "epoch": 2.1159420289855073, "grad_norm": 5.287505016376614, "learning_rate": 4.707820738137082e-07, "logits/chosen": -0.5482422113418579, "logits/rejected": -0.62890625, "logps/chosen": -324.1000061035156, "logps/rejected": -519.0999755859375, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -3.08203125, "rewards/margins": 9.370312690734863, "rewards/rejected": -12.446874618530273, "step": 2410 }, { "epoch": 2.124725516029864, "grad_norm": 12.171905175587224, "learning_rate": 4.68585237258348e-07, "logits/chosen": -0.44130247831344604, "logits/rejected": -0.4844970703125, "logps/chosen": -304.0, "logps/rejected": -503.70001220703125, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -2.817187547683716, "rewards/margins": 9.253125190734863, "rewards/rejected": -12.0703125, "step": 2420 }, { "epoch": 2.1335090030742205, "grad_norm": 3.398761139866252, "learning_rate": 4.663884007029877e-07, "logits/chosen": -0.4713134765625, "logits/rejected": -0.439697265625, "logps/chosen": -340.3500061035156, "logps/rejected": -537.5, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.498278856277466, "rewards/margins": 9.203125, "rewards/rejected": -11.703125, "step": 2430 }, { "epoch": 2.142292490118577, "grad_norm": 16.45083076055659, "learning_rate": 4.641915641476274e-07, "logits/chosen": -0.42060548067092896, "logits/rejected": -0.3839111328125, "logps/chosen": -318.20001220703125, "logps/rejected": -505.5, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5445313453674316, "rewards/margins": 9.112500190734863, "rewards/rejected": -11.653124809265137, "step": 2440 }, { "epoch": 2.1510759771629338, "grad_norm": 9.628999432695988, "learning_rate": 4.619947275922671e-07, "logits/chosen": -0.4518798887729645, "logits/rejected": -0.49409180879592896, "logps/chosen": -330.3500061035156, "logps/rejected": -524.9000244140625, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.7197265625, "rewards/margins": 8.964062690734863, "rewards/rejected": -11.6875, "step": 2450 }, { "epoch": 2.1598594642072904, "grad_norm": 2.3735176611424387, "learning_rate": 4.5979789103690687e-07, "logits/chosen": -0.30616456270217896, "logits/rejected": -0.4649291932582855, "logps/chosen": -325.8500061035156, "logps/rejected": -521.7999877929688, "loss": 0.0175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6803221702575684, "rewards/margins": 9.190625190734863, "rewards/rejected": -11.868749618530273, "step": 2460 }, { "epoch": 2.168642951251647, "grad_norm": 21.110313586220983, "learning_rate": 4.576010544815466e-07, "logits/chosen": -0.537487804889679, "logits/rejected": -0.49848634004592896, "logps/chosen": -325.6499938964844, "logps/rejected": -541.9000244140625, "loss": 0.018, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5492186546325684, "rewards/margins": 9.385937690734863, "rewards/rejected": -11.928125381469727, "step": 2470 }, { "epoch": 2.1774264382960036, "grad_norm": 4.205063353837868, "learning_rate": 4.554042179261863e-07, "logits/chosen": -0.38898926973342896, "logits/rejected": -0.6607421636581421, "logps/chosen": -351.0, "logps/rejected": -492.3999938964844, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -3.198437452316284, "rewards/margins": 9.2578125, "rewards/rejected": -12.446874618530273, "step": 2480 }, { "epoch": 2.18620992534036, "grad_norm": 4.022963071197421, "learning_rate": 4.53207381370826e-07, "logits/chosen": -0.543774425983429, "logits/rejected": -0.6096435785293579, "logps/chosen": -345.20001220703125, "logps/rejected": -552.7000122070312, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -3.236523389816284, "rewards/margins": 9.7890625, "rewards/rejected": -13.03125, "step": 2490 }, { "epoch": 2.194993412384717, "grad_norm": 2.561524545763578, "learning_rate": 4.510105448154657e-07, "logits/chosen": -0.5821777582168579, "logits/rejected": -0.593701183795929, "logps/chosen": -335.3999938964844, "logps/rejected": -528.5999755859375, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -3.455859422683716, "rewards/margins": 9.115625381469727, "rewards/rejected": -12.582812309265137, "step": 2500 }, { "epoch": 2.2037768994290734, "grad_norm": 41.19241412813892, "learning_rate": 4.4881370826010546e-07, "logits/chosen": -0.7020508050918579, "logits/rejected": -0.5090087652206421, "logps/chosen": -339.1499938964844, "logps/rejected": -547.5999755859375, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -3.757031202316284, "rewards/margins": 9.926562309265137, "rewards/rejected": -13.681249618530273, "step": 2510 }, { "epoch": 2.21256038647343, "grad_norm": 1.2604123239985972, "learning_rate": 4.4661687170474517e-07, "logits/chosen": -0.4627441465854645, "logits/rejected": -0.6253417730331421, "logps/chosen": -337.8999938964844, "logps/rejected": -552.0, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -3.014843702316284, "rewards/margins": 9.796875, "rewards/rejected": -12.8203125, "step": 2520 }, { "epoch": 2.2213438735177866, "grad_norm": 3.665958077774627, "learning_rate": 4.444200351493849e-07, "logits/chosen": -0.642529308795929, "logits/rejected": -0.662890613079071, "logps/chosen": -289.54998779296875, "logps/rejected": -500.8999938964844, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -2.500659227371216, "rewards/margins": 9.065625190734863, "rewards/rejected": -11.553125381469727, "step": 2530 }, { "epoch": 2.2301273605621432, "grad_norm": 1.9105881170134582, "learning_rate": 4.422231985940246e-07, "logits/chosen": -0.49406737089157104, "logits/rejected": -0.6146484613418579, "logps/chosen": -343.1000061035156, "logps/rejected": -538.0999755859375, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -3.2750000953674316, "rewards/margins": 9.4765625, "rewards/rejected": -12.75, "step": 2540 }, { "epoch": 2.2389108476065, "grad_norm": 10.786118304804031, "learning_rate": 4.400263620386643e-07, "logits/chosen": -0.4947753846645355, "logits/rejected": -0.646679699420929, "logps/chosen": -333.5, "logps/rejected": -573.7999877929688, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.4398436546325684, "rewards/margins": 10.387499809265137, "rewards/rejected": -13.831250190734863, "step": 2550 }, { "epoch": 2.2476943346508564, "grad_norm": 7.993914092869045, "learning_rate": 4.3782952548330405e-07, "logits/chosen": -0.5188964605331421, "logits/rejected": -0.6089843511581421, "logps/chosen": -370.6000061035156, "logps/rejected": -572.9000244140625, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -3.565624952316284, "rewards/margins": 10.060937881469727, "rewards/rejected": -13.625, "step": 2560 }, { "epoch": 2.256477821695213, "grad_norm": 0.48086936211827197, "learning_rate": 4.3563268892794376e-07, "logits/chosen": -0.6973876953125, "logits/rejected": -0.665209949016571, "logps/chosen": -382.3500061035156, "logps/rejected": -615.9000244140625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -3.274218797683716, "rewards/margins": 10.754687309265137, "rewards/rejected": -14.03125, "step": 2570 }, { "epoch": 2.2652613087395697, "grad_norm": 2.1498128017844977, "learning_rate": 4.3343585237258347e-07, "logits/chosen": -0.42133790254592896, "logits/rejected": -0.49799805879592896, "logps/chosen": -332.1499938964844, "logps/rejected": -538.5, "loss": 0.0334, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4193358421325684, "rewards/margins": 9.310937881469727, "rewards/rejected": -12.739062309265137, "step": 2580 }, { "epoch": 2.2740447957839263, "grad_norm": 5.690523197645481, "learning_rate": 4.312390158172232e-07, "logits/chosen": -0.390188604593277, "logits/rejected": -0.505126953125, "logps/chosen": -381.8999938964844, "logps/rejected": -582.9000244140625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.696484327316284, "rewards/margins": 10.951562881469727, "rewards/rejected": -14.65625, "step": 2590 }, { "epoch": 2.282828282828283, "grad_norm": 11.618034026710022, "learning_rate": 4.2904217926186293e-07, "logits/chosen": -0.554003894329071, "logits/rejected": -0.523681640625, "logps/chosen": -314.5249938964844, "logps/rejected": -494.04998779296875, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -2.8659424781799316, "rewards/margins": 9.106249809265137, "rewards/rejected": -11.975000381469727, "step": 2600 }, { "epoch": 2.2916117698726395, "grad_norm": 3.193486303064242, "learning_rate": 4.2684534270650264e-07, "logits/chosen": -0.41304320096969604, "logits/rejected": -0.561962902545929, "logps/chosen": -329.70001220703125, "logps/rejected": -516.2999877929688, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.0806641578674316, "rewards/margins": 9.824999809265137, "rewards/rejected": -12.915624618530273, "step": 2610 }, { "epoch": 2.300395256916996, "grad_norm": 0.5838201936742903, "learning_rate": 4.2464850615114235e-07, "logits/chosen": -0.5024169683456421, "logits/rejected": -0.5477050542831421, "logps/chosen": -308.5, "logps/rejected": -521.2000122070312, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.3941407203674316, "rewards/margins": 9.259374618530273, "rewards/rejected": -12.662500381469727, "step": 2620 }, { "epoch": 2.3091787439613527, "grad_norm": 8.909239724012217, "learning_rate": 4.2245166959578206e-07, "logits/chosen": -0.565173327922821, "logits/rejected": -0.652587890625, "logps/chosen": -299.8999938964844, "logps/rejected": -448.20001220703125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -3.1312499046325684, "rewards/margins": 9.092187881469727, "rewards/rejected": -12.209375381469727, "step": 2630 }, { "epoch": 2.3179622310057093, "grad_norm": 19.9424794899944, "learning_rate": 4.2025483304042177e-07, "logits/chosen": -0.6207275390625, "logits/rejected": -0.6966308355331421, "logps/chosen": -343.8500061035156, "logps/rejected": -561.7000122070312, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.653515577316284, "rewards/margins": 10.487500190734863, "rewards/rejected": -14.146875381469727, "step": 2640 }, { "epoch": 2.326745718050066, "grad_norm": 8.138191209101597, "learning_rate": 4.180579964850615e-07, "logits/chosen": -0.5520874261856079, "logits/rejected": -0.783984363079071, "logps/chosen": -334.5249938964844, "logps/rejected": -542.2000122070312, "loss": 0.0213, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.515820264816284, "rewards/margins": 10.160937309265137, "rewards/rejected": -13.675000190734863, "step": 2650 }, { "epoch": 2.3355292050944225, "grad_norm": 13.17613739551801, "learning_rate": 4.1586115992970123e-07, "logits/chosen": -0.583251953125, "logits/rejected": -0.6724609136581421, "logps/chosen": -339.8999938964844, "logps/rejected": -514.7999877929688, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.725878953933716, "rewards/margins": 9.962499618530273, "rewards/rejected": -12.693750381469727, "step": 2660 }, { "epoch": 2.344312692138779, "grad_norm": 8.270799969977597, "learning_rate": 4.1366432337434094e-07, "logits/chosen": -0.584301769733429, "logits/rejected": -0.5688720941543579, "logps/chosen": -334.75, "logps/rejected": -507.8999938964844, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.80078125, "rewards/margins": 8.774999618530273, "rewards/rejected": -11.571874618530273, "step": 2670 }, { "epoch": 2.3530961791831357, "grad_norm": 6.879163138813254, "learning_rate": 4.1146748681898065e-07, "logits/chosen": -0.5956268310546875, "logits/rejected": -0.752734363079071, "logps/chosen": -323.3999938964844, "logps/rejected": -528.0999755859375, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.2347655296325684, "rewards/margins": 9.9296875, "rewards/rejected": -13.162500381469727, "step": 2680 }, { "epoch": 2.3618796662274923, "grad_norm": 55.19981593521635, "learning_rate": 4.0927065026362036e-07, "logits/chosen": -0.595263659954071, "logits/rejected": -0.678906261920929, "logps/chosen": -377.1499938964844, "logps/rejected": -606.7000122070312, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8929686546325684, "rewards/margins": 10.329687118530273, "rewards/rejected": -14.234375, "step": 2690 }, { "epoch": 2.370663153271849, "grad_norm": 0.9045914730613386, "learning_rate": 4.070738137082601e-07, "logits/chosen": -0.67767333984375, "logits/rejected": -0.6314697265625, "logps/chosen": -350.54998779296875, "logps/rejected": -533.0999755859375, "loss": 0.0129, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.697265625, "rewards/margins": 10.271875381469727, "rewards/rejected": -13.971875190734863, "step": 2700 }, { "epoch": 2.3794466403162056, "grad_norm": 2.219697158923149, "learning_rate": 4.048769771528998e-07, "logits/chosen": -0.6197754144668579, "logits/rejected": -0.7586914300918579, "logps/chosen": -372.29998779296875, "logps/rejected": -546.5, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.8119139671325684, "rewards/margins": 9.814062118530273, "rewards/rejected": -13.628125190734863, "step": 2710 }, { "epoch": 2.388230127360562, "grad_norm": 15.265649606698613, "learning_rate": 4.0268014059753953e-07, "logits/chosen": -0.4295654296875, "logits/rejected": -0.662524402141571, "logps/chosen": -348.92498779296875, "logps/rejected": -535.2000122070312, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.8423829078674316, "rewards/margins": 10.073437690734863, "rewards/rejected": -13.920312881469727, "step": 2720 }, { "epoch": 2.397013614404919, "grad_norm": 15.49699528496114, "learning_rate": 4.0048330404217924e-07, "logits/chosen": -0.46142578125, "logits/rejected": -0.705761730670929, "logps/chosen": -350.8999938964844, "logps/rejected": -538.9500122070312, "loss": 0.0126, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.400000095367432, "rewards/margins": 10.565625190734863, "rewards/rejected": -14.975000381469727, "step": 2730 }, { "epoch": 2.4057971014492754, "grad_norm": 3.50905984185022, "learning_rate": 3.98286467486819e-07, "logits/chosen": -0.4443359375, "logits/rejected": -0.721142590045929, "logps/chosen": -290.20001220703125, "logps/rejected": -536.2999877929688, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -3.512890577316284, "rewards/margins": 11.153124809265137, "rewards/rejected": -14.668749809265137, "step": 2740 }, { "epoch": 2.414580588493632, "grad_norm": 23.397013739626125, "learning_rate": 3.960896309314587e-07, "logits/chosen": -0.47077637910842896, "logits/rejected": -0.49859619140625, "logps/chosen": -296.54998779296875, "logps/rejected": -526.5, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -3.120312452316284, "rewards/margins": 10.157812118530273, "rewards/rejected": -13.274999618530273, "step": 2750 }, { "epoch": 2.4233640755379886, "grad_norm": 0.3279813296513034, "learning_rate": 3.938927943760984e-07, "logits/chosen": -0.518139660358429, "logits/rejected": -0.63916015625, "logps/chosen": -291.0, "logps/rejected": -490.29998779296875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.4937500953674316, "rewards/margins": 9.251562118530273, "rewards/rejected": -11.753125190734863, "step": 2760 }, { "epoch": 2.432147562582345, "grad_norm": 7.535494797057814, "learning_rate": 3.916959578207381e-07, "logits/chosen": -0.42535400390625, "logits/rejected": -0.5865478515625, "logps/chosen": -338.79998779296875, "logps/rejected": -529.0999755859375, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -3.686718702316284, "rewards/margins": 10.732812881469727, "rewards/rejected": -14.428125381469727, "step": 2770 }, { "epoch": 2.440931049626702, "grad_norm": 3.637485954903328, "learning_rate": 3.8949912126537783e-07, "logits/chosen": -0.565014660358429, "logits/rejected": -0.5870116949081421, "logps/chosen": -336.3500061035156, "logps/rejected": -547.4000244140625, "loss": 0.0152, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9066405296325684, "rewards/margins": 10.84375, "rewards/rejected": -14.753125190734863, "step": 2780 }, { "epoch": 2.4497145366710584, "grad_norm": 0.607355146227388, "learning_rate": 3.873022847100176e-07, "logits/chosen": -0.5799926519393921, "logits/rejected": -0.5981384515762329, "logps/chosen": -294.29998779296875, "logps/rejected": -527.2000122070312, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.576171875, "rewards/margins": 10.387499809265137, "rewards/rejected": -13.965624809265137, "step": 2790 }, { "epoch": 2.458498023715415, "grad_norm": 30.157855647650607, "learning_rate": 3.851054481546573e-07, "logits/chosen": -0.49702149629592896, "logits/rejected": -0.599597156047821, "logps/chosen": -328.29998779296875, "logps/rejected": -522.2999877929688, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.835156202316284, "rewards/margins": 10.201562881469727, "rewards/rejected": -14.040624618530273, "step": 2800 }, { "epoch": 2.4672815107597716, "grad_norm": 16.213048592311562, "learning_rate": 3.82908611599297e-07, "logits/chosen": -0.47050780057907104, "logits/rejected": -0.6095215082168579, "logps/chosen": -369.45001220703125, "logps/rejected": -573.2000122070312, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -3.8199219703674316, "rewards/margins": 10.040624618530273, "rewards/rejected": -13.859375, "step": 2810 }, { "epoch": 2.4760649978041283, "grad_norm": 25.86673537725094, "learning_rate": 3.807117750439367e-07, "logits/chosen": -0.18330231308937073, "logits/rejected": -0.5303955078125, "logps/chosen": -300.45001220703125, "logps/rejected": -550.0999755859375, "loss": 0.0149, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.29296875, "rewards/margins": 10.5859375, "rewards/rejected": -13.876562118530273, "step": 2820 }, { "epoch": 2.484848484848485, "grad_norm": 2.1305328973868174, "learning_rate": 3.785149384885764e-07, "logits/chosen": -0.4776458740234375, "logits/rejected": -0.49378663301467896, "logps/chosen": -370.25, "logps/rejected": -554.4000244140625, "loss": 0.0387, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.339648485183716, "rewards/margins": 9.7421875, "rewards/rejected": -13.081250190734863, "step": 2830 }, { "epoch": 2.4936319718928415, "grad_norm": 11.807458731910856, "learning_rate": 3.763181019332162e-07, "logits/chosen": -0.524169921875, "logits/rejected": -0.615917980670929, "logps/chosen": -315.95001220703125, "logps/rejected": -545.7000122070312, "loss": 0.0156, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3121094703674316, "rewards/margins": 10.262499809265137, "rewards/rejected": -13.568750381469727, "step": 2840 }, { "epoch": 2.502415458937198, "grad_norm": 1.5842353748364417, "learning_rate": 3.741212653778559e-07, "logits/chosen": -0.4516967833042145, "logits/rejected": -0.4715332090854645, "logps/chosen": -345.45001220703125, "logps/rejected": -547.7999877929688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.265625, "rewards/margins": 10.420312881469727, "rewards/rejected": -13.678125381469727, "step": 2850 }, { "epoch": 2.5111989459815547, "grad_norm": 14.129051638988512, "learning_rate": 3.719244288224956e-07, "logits/chosen": -0.46368408203125, "logits/rejected": -0.45376890897750854, "logps/chosen": -340.29998779296875, "logps/rejected": -566.9000244140625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -3.034374952316284, "rewards/margins": 9.287500381469727, "rewards/rejected": -12.3125, "step": 2860 }, { "epoch": 2.5199824330259113, "grad_norm": 6.091639562094377, "learning_rate": 3.697275922671353e-07, "logits/chosen": -0.577392578125, "logits/rejected": -0.49785155057907104, "logps/chosen": -285.54998779296875, "logps/rejected": -512.0999755859375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -2.673828125, "rewards/margins": 9.142187118530273, "rewards/rejected": -11.803125381469727, "step": 2870 }, { "epoch": 2.528765920070268, "grad_norm": 12.121653340633321, "learning_rate": 3.6753075571177507e-07, "logits/chosen": -0.40644532442092896, "logits/rejected": -0.6261841058731079, "logps/chosen": -355.0249938964844, "logps/rejected": -530.5499877929688, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -3.428271532058716, "rewards/margins": 8.7578125, "rewards/rejected": -12.184374809265137, "step": 2880 }, { "epoch": 2.5375494071146245, "grad_norm": 7.163761815927153, "learning_rate": 3.653339191564148e-07, "logits/chosen": -0.2828125059604645, "logits/rejected": -0.5655456781387329, "logps/chosen": -318.95001220703125, "logps/rejected": -511.5, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.655468702316284, "rewards/margins": 9.901562690734863, "rewards/rejected": -13.5625, "step": 2890 }, { "epoch": 2.546332894158981, "grad_norm": 19.70624732269805, "learning_rate": 3.631370826010545e-07, "logits/chosen": -0.6097412109375, "logits/rejected": -0.650439441204071, "logps/chosen": -313.1499938964844, "logps/rejected": -526.9500122070312, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -3.2542967796325684, "rewards/margins": 9.795312881469727, "rewards/rejected": -13.046875, "step": 2900 }, { "epoch": 2.5551163812033377, "grad_norm": 1.116236560505976, "learning_rate": 3.609402460456942e-07, "logits/chosen": -0.2607177793979645, "logits/rejected": -0.560864269733429, "logps/chosen": -340.1499938964844, "logps/rejected": -525.5999755859375, "loss": 0.0205, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.8203125, "rewards/margins": 9.478124618530273, "rewards/rejected": -13.296875, "step": 2910 }, { "epoch": 2.5638998682476943, "grad_norm": 30.696083873432222, "learning_rate": 3.587434094903339e-07, "logits/chosen": -0.531018078327179, "logits/rejected": -0.6320556402206421, "logps/chosen": -336.95001220703125, "logps/rejected": -534.5, "loss": 0.0205, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5718750953674316, "rewards/margins": 10.293749809265137, "rewards/rejected": -13.865625381469727, "step": 2920 }, { "epoch": 2.572683355292051, "grad_norm": 7.849445736291118, "learning_rate": 3.5654657293497366e-07, "logits/chosen": -0.568359375, "logits/rejected": -0.7803710699081421, "logps/chosen": -325.6000061035156, "logps/rejected": -591.2000122070312, "loss": 0.0291, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.852343559265137, "rewards/margins": 10.826562881469727, "rewards/rejected": -15.678125381469727, "step": 2930 }, { "epoch": 2.5814668423364076, "grad_norm": 19.3511872353604, "learning_rate": 3.5434973637961337e-07, "logits/chosen": -0.656298816204071, "logits/rejected": -0.6795898675918579, "logps/chosen": -353.54998779296875, "logps/rejected": -565.7999877929688, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -4.483593940734863, "rewards/margins": 10.795312881469727, "rewards/rejected": -15.262499809265137, "step": 2940 }, { "epoch": 2.590250329380764, "grad_norm": 5.77955773615677, "learning_rate": 3.521528998242531e-07, "logits/chosen": -0.8702148199081421, "logits/rejected": -0.749707043170929, "logps/chosen": -313.54998779296875, "logps/rejected": -504.20001220703125, "loss": 0.0185, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.056250095367432, "rewards/margins": 9.5703125, "rewards/rejected": -13.628125190734863, "step": 2950 }, { "epoch": 2.5990338164251208, "grad_norm": 2.3812371250234907, "learning_rate": 3.499560632688928e-07, "logits/chosen": -0.631542980670929, "logits/rejected": -0.620312511920929, "logps/chosen": -305.45001220703125, "logps/rejected": -489.79998779296875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.3208985328674316, "rewards/margins": 9.3828125, "rewards/rejected": -12.699999809265137, "step": 2960 }, { "epoch": 2.6078173034694774, "grad_norm": 12.208375076600582, "learning_rate": 3.477592267135325e-07, "logits/chosen": -0.605743408203125, "logits/rejected": -0.7142578363418579, "logps/chosen": -309.0, "logps/rejected": -542.0, "loss": 0.0177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1566405296325684, "rewards/margins": 10.571874618530273, "rewards/rejected": -13.737500190734863, "step": 2970 }, { "epoch": 2.616600790513834, "grad_norm": 32.74067594586259, "learning_rate": 3.4556239015817225e-07, "logits/chosen": -0.6337890625, "logits/rejected": -0.662493884563446, "logps/chosen": -316.6499938964844, "logps/rejected": -494.8500061035156, "loss": 0.0116, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.091992139816284, "rewards/margins": 10.003125190734863, "rewards/rejected": -13.090624809265137, "step": 2980 }, { "epoch": 2.6253842775581906, "grad_norm": 4.850041606532472, "learning_rate": 3.4336555360281196e-07, "logits/chosen": -0.5137939453125, "logits/rejected": -0.6543945074081421, "logps/chosen": -377.54998779296875, "logps/rejected": -605.7000122070312, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -4.96875, "rewards/margins": 10.303125381469727, "rewards/rejected": -15.2734375, "step": 2990 }, { "epoch": 2.634167764602547, "grad_norm": 14.302777900083154, "learning_rate": 3.4116871704745167e-07, "logits/chosen": -0.43498533964157104, "logits/rejected": -0.5858398675918579, "logps/chosen": -342.3999938964844, "logps/rejected": -527.4000244140625, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -4.041015625, "rewards/margins": 9.199999809265137, "rewards/rejected": -13.243749618530273, "step": 3000 }, { "epoch": 2.642951251646904, "grad_norm": 7.230066258678824, "learning_rate": 3.389718804920914e-07, "logits/chosen": -0.5193237066268921, "logits/rejected": -0.6883300542831421, "logps/chosen": -334.5, "logps/rejected": -560.4000244140625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.5703125, "rewards/margins": 10.337499618530273, "rewards/rejected": -13.909375190734863, "step": 3010 }, { "epoch": 2.6517347386912604, "grad_norm": 8.034465066106216, "learning_rate": 3.3677504393673114e-07, "logits/chosen": -0.619677722454071, "logits/rejected": -0.7250732183456421, "logps/chosen": -337.75, "logps/rejected": -560.7000122070312, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.5699219703674316, "rewards/margins": 10.171875, "rewards/rejected": -13.734375, "step": 3020 }, { "epoch": 2.660518225735617, "grad_norm": 1.652084985790989, "learning_rate": 3.3457820738137084e-07, "logits/chosen": -0.6778320074081421, "logits/rejected": -0.66845703125, "logps/chosen": -324.8500061035156, "logps/rejected": -533.4000244140625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -3.630859375, "rewards/margins": 10.90625, "rewards/rejected": -14.524999618530273, "step": 3030 }, { "epoch": 2.6693017127799736, "grad_norm": 2.636933645130095, "learning_rate": 3.3238137082601055e-07, "logits/chosen": -0.702587902545929, "logits/rejected": -0.792187511920929, "logps/chosen": -373.25, "logps/rejected": -629.7000122070312, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.8095703125, "rewards/margins": 11.28125, "rewards/rejected": -16.103124618530273, "step": 3040 }, { "epoch": 2.6780851998243302, "grad_norm": 1.0160746220757926, "learning_rate": 3.3018453427065026e-07, "logits/chosen": -0.652539074420929, "logits/rejected": -0.7568359375, "logps/chosen": -379.3999938964844, "logps/rejected": -553.2000122070312, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -4.245312690734863, "rewards/margins": 9.684374809265137, "rewards/rejected": -13.931249618530273, "step": 3050 }, { "epoch": 2.686868686868687, "grad_norm": 36.095188931338576, "learning_rate": 3.2798769771528997e-07, "logits/chosen": -0.6179565191268921, "logits/rejected": -0.786816418170929, "logps/chosen": -358.20001220703125, "logps/rejected": -568.6500244140625, "loss": 0.0152, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.730273485183716, "rewards/margins": 11.342187881469727, "rewards/rejected": -15.068750381469727, "step": 3060 }, { "epoch": 2.6956521739130435, "grad_norm": 13.935636596735444, "learning_rate": 3.2579086115992973e-07, "logits/chosen": -0.599621593952179, "logits/rejected": -0.743945300579071, "logps/chosen": -326.79998779296875, "logps/rejected": -549.5, "loss": 0.008, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.077734470367432, "rewards/margins": 10.487500190734863, "rewards/rejected": -14.568750381469727, "step": 3070 }, { "epoch": 2.7044356609574, "grad_norm": 0.7703202091371882, "learning_rate": 3.2359402460456944e-07, "logits/chosen": -0.791308581829071, "logits/rejected": -0.8277832269668579, "logps/chosen": -376.0, "logps/rejected": -574.0, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -4.195703029632568, "rewards/margins": 10.2734375, "rewards/rejected": -14.478124618530273, "step": 3080 }, { "epoch": 2.7132191480017567, "grad_norm": 50.24418383734732, "learning_rate": 3.2139718804920914e-07, "logits/chosen": -0.73193359375, "logits/rejected": -0.7465575933456421, "logps/chosen": -343.79998779296875, "logps/rejected": -553.9000244140625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -4.198046684265137, "rewards/margins": 10.064062118530273, "rewards/rejected": -14.262499809265137, "step": 3090 }, { "epoch": 2.7220026350461133, "grad_norm": 38.7653817336604, "learning_rate": 3.1920035149384885e-07, "logits/chosen": -0.572235107421875, "logits/rejected": -0.6168457269668579, "logps/chosen": -298.54998779296875, "logps/rejected": -513.5, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -3.764843702316284, "rewards/margins": 9.728124618530273, "rewards/rejected": -13.484375, "step": 3100 }, { "epoch": 2.73078612209047, "grad_norm": 4.874046140139498, "learning_rate": 3.1700351493848856e-07, "logits/chosen": -0.5474853515625, "logits/rejected": -0.5938476324081421, "logps/chosen": -366.45001220703125, "logps/rejected": -605.5999755859375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.900390625, "rewards/margins": 11.484375, "rewards/rejected": -15.390625, "step": 3110 }, { "epoch": 2.7395696091348265, "grad_norm": 71.76118539622341, "learning_rate": 3.148066783831283e-07, "logits/chosen": -0.6941894292831421, "logits/rejected": -0.7733398675918579, "logps/chosen": -351.79998779296875, "logps/rejected": -530.5, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.164843559265137, "rewards/margins": 10.046875, "rewards/rejected": -14.21875, "step": 3120 }, { "epoch": 2.748353096179183, "grad_norm": 10.573269520920325, "learning_rate": 3.1260984182776803e-07, "logits/chosen": -0.544482409954071, "logits/rejected": -0.6775878667831421, "logps/chosen": -329.75, "logps/rejected": -527.7000122070312, "loss": 0.0154, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9765625, "rewards/margins": 10.889062881469727, "rewards/rejected": -14.859375, "step": 3130 }, { "epoch": 2.7571365832235397, "grad_norm": 2.7838540224271524, "learning_rate": 3.1041300527240773e-07, "logits/chosen": -0.26807862520217896, "logits/rejected": -0.6246703863143921, "logps/chosen": -359.04998779296875, "logps/rejected": -601.2000122070312, "loss": 0.0165, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9124999046325684, "rewards/margins": 11.1328125, "rewards/rejected": -15.040624618530273, "step": 3140 }, { "epoch": 2.7659200702678963, "grad_norm": 1.711257641822783, "learning_rate": 3.0821616871704744e-07, "logits/chosen": -0.59765625, "logits/rejected": -0.7366943359375, "logps/chosen": -311.9750061035156, "logps/rejected": -558.5, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.283422946929932, "rewards/margins": 11.053125381469727, "rewards/rejected": -15.337499618530273, "step": 3150 }, { "epoch": 2.774703557312253, "grad_norm": 21.277212384598855, "learning_rate": 3.060193321616872e-07, "logits/chosen": -0.7032226324081421, "logits/rejected": -0.7989257574081421, "logps/chosen": -388.3500061035156, "logps/rejected": -578.0, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.958593845367432, "rewards/margins": 10.365625381469727, "rewards/rejected": -15.321874618530273, "step": 3160 }, { "epoch": 2.7834870443566095, "grad_norm": 11.686522192107264, "learning_rate": 3.038224956063269e-07, "logits/chosen": -0.527050793170929, "logits/rejected": -0.732861340045929, "logps/chosen": -316.6000061035156, "logps/rejected": -541.7999877929688, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -3.907421827316284, "rewards/margins": 11.639062881469727, "rewards/rejected": -15.548437118530273, "step": 3170 }, { "epoch": 2.792270531400966, "grad_norm": 13.965357038169337, "learning_rate": 3.016256590509666e-07, "logits/chosen": -0.5638427734375, "logits/rejected": -0.696765124797821, "logps/chosen": -305.75, "logps/rejected": -545.0999755859375, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -3.4593749046325684, "rewards/margins": 11.0, "rewards/rejected": -14.456250190734863, "step": 3180 }, { "epoch": 2.8010540184453228, "grad_norm": 16.79671072160075, "learning_rate": 2.994288224956063e-07, "logits/chosen": -0.3569580018520355, "logits/rejected": -0.7469238042831421, "logps/chosen": -293.6499938964844, "logps/rejected": -505.3999938964844, "loss": 0.0254, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.590625047683716, "rewards/margins": 8.878125190734863, "rewards/rejected": -12.465624809265137, "step": 3190 }, { "epoch": 2.8098375054896794, "grad_norm": 8.472945806574524, "learning_rate": 2.9723198594024603e-07, "logits/chosen": -0.4981445372104645, "logits/rejected": -0.642016589641571, "logps/chosen": -295.8999938964844, "logps/rejected": -548.2999877929688, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.615234375, "rewards/margins": 10.310937881469727, "rewards/rejected": -13.925000190734863, "step": 3200 }, { "epoch": 2.818620992534036, "grad_norm": 6.131570482025351, "learning_rate": 2.950351493848858e-07, "logits/chosen": -0.7705078125, "logits/rejected": -0.8495117425918579, "logps/chosen": -323.45001220703125, "logps/rejected": -485.20001220703125, "loss": 0.0206, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.075390815734863, "rewards/margins": 9.717187881469727, "rewards/rejected": -13.790624618530273, "step": 3210 }, { "epoch": 2.8274044795783926, "grad_norm": 13.722998744191147, "learning_rate": 2.928383128295255e-07, "logits/chosen": -0.686657726764679, "logits/rejected": -0.7440429925918579, "logps/chosen": -366.70001220703125, "logps/rejected": -587.0999755859375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.455468654632568, "rewards/margins": 10.426562309265137, "rewards/rejected": -14.871874809265137, "step": 3220 }, { "epoch": 2.836187966622749, "grad_norm": 2.5924627436509105, "learning_rate": 2.906414762741652e-07, "logits/chosen": -0.6075683832168579, "logits/rejected": -0.733447253704071, "logps/chosen": -342.75, "logps/rejected": -534.0999755859375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.9638671875, "rewards/margins": 9.839062690734863, "rewards/rejected": -13.8125, "step": 3230 }, { "epoch": 2.844971453667106, "grad_norm": 1.2715945729900175, "learning_rate": 2.884446397188049e-07, "logits/chosen": -0.636761486530304, "logits/rejected": -0.8280273675918579, "logps/chosen": -336.6000061035156, "logps/rejected": -567.5999755859375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -3.834765672683716, "rewards/margins": 11.512499809265137, "rewards/rejected": -15.34375, "step": 3240 }, { "epoch": 2.8537549407114624, "grad_norm": 7.703477061081086, "learning_rate": 2.862478031634446e-07, "logits/chosen": -0.7156738042831421, "logits/rejected": -0.649340808391571, "logps/chosen": -315.8999938964844, "logps/rejected": -542.4000244140625, "loss": 0.0248, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8597655296325684, "rewards/margins": 9.785937309265137, "rewards/rejected": -13.643750190734863, "step": 3250 }, { "epoch": 2.862538427755819, "grad_norm": 1.4148602966885966, "learning_rate": 2.840509666080844e-07, "logits/chosen": -0.650463879108429, "logits/rejected": -0.771484375, "logps/chosen": -375.54998779296875, "logps/rejected": -582.5999755859375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.274121284484863, "rewards/margins": 10.1875, "rewards/rejected": -14.459375381469727, "step": 3260 }, { "epoch": 2.8713219148001756, "grad_norm": 6.616435671981006, "learning_rate": 2.818541300527241e-07, "logits/chosen": -0.603405773639679, "logits/rejected": -0.76953125, "logps/chosen": -287.5, "logps/rejected": -503.3999938964844, "loss": 0.0227, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.359814405441284, "rewards/margins": 9.620312690734863, "rewards/rejected": -12.987500190734863, "step": 3270 }, { "epoch": 2.8801054018445322, "grad_norm": 4.0441110157823275, "learning_rate": 2.796572934973638e-07, "logits/chosen": -0.549755871295929, "logits/rejected": -0.645703136920929, "logps/chosen": -339.45001220703125, "logps/rejected": -508.29998779296875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.2578125, "rewards/margins": 9.939062118530273, "rewards/rejected": -13.199999809265137, "step": 3280 }, { "epoch": 2.888888888888889, "grad_norm": 10.207821017898326, "learning_rate": 2.774604569420035e-07, "logits/chosen": -0.6419402956962585, "logits/rejected": -0.594958484172821, "logps/chosen": -321.45001220703125, "logps/rejected": -513.5, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.494140625, "rewards/margins": 9.270312309265137, "rewards/rejected": -12.762499809265137, "step": 3290 }, { "epoch": 2.8976723759332454, "grad_norm": 12.151714162828808, "learning_rate": 2.7526362038664327e-07, "logits/chosen": -0.610699474811554, "logits/rejected": -0.6646972894668579, "logps/chosen": -299.875, "logps/rejected": -491.6000061035156, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.436328172683716, "rewards/margins": 9.639062881469727, "rewards/rejected": -13.071874618530273, "step": 3300 }, { "epoch": 2.906455862977602, "grad_norm": 2.145971240456945, "learning_rate": 2.73066783831283e-07, "logits/chosen": -0.559155285358429, "logits/rejected": -0.7417968511581421, "logps/chosen": -350.1000061035156, "logps/rejected": -577.2999877929688, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -4.077343940734863, "rewards/margins": 10.907812118530273, "rewards/rejected": -14.984375, "step": 3310 }, { "epoch": 2.9152393500219587, "grad_norm": 1.4978296606185773, "learning_rate": 2.708699472759227e-07, "logits/chosen": -0.644091784954071, "logits/rejected": -0.67431640625, "logps/chosen": -364.8999938964844, "logps/rejected": -514.2000122070312, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -4.122656345367432, "rewards/margins": 9.482812881469727, "rewards/rejected": -13.614062309265137, "step": 3320 }, { "epoch": 2.9240228370663153, "grad_norm": 2.7544855423335184, "learning_rate": 2.686731107205624e-07, "logits/chosen": -0.7057129144668579, "logits/rejected": -0.7413085699081421, "logps/chosen": -323.54998779296875, "logps/rejected": -535.7000122070312, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.8094725608825684, "rewards/margins": 10.190625190734863, "rewards/rejected": -13.984375, "step": 3330 }, { "epoch": 2.932806324110672, "grad_norm": 8.77245919472834, "learning_rate": 2.664762741652021e-07, "logits/chosen": -0.648388683795929, "logits/rejected": -0.8115234375, "logps/chosen": -307.20001220703125, "logps/rejected": -508.5, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.6275391578674316, "rewards/margins": 9.770312309265137, "rewards/rejected": -13.399999618530273, "step": 3340 }, { "epoch": 2.9415898111550285, "grad_norm": 6.12344358688435, "learning_rate": 2.6427943760984186e-07, "logits/chosen": -0.527801513671875, "logits/rejected": -0.6234375238418579, "logps/chosen": -351.45001220703125, "logps/rejected": -593.9000244140625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -4.530859470367432, "rewards/margins": 10.2265625, "rewards/rejected": -14.756250381469727, "step": 3350 }, { "epoch": 2.950373298199385, "grad_norm": 9.654673432489632, "learning_rate": 2.6208260105448157e-07, "logits/chosen": -0.6624755859375, "logits/rejected": -0.6791015863418579, "logps/chosen": -343.95001220703125, "logps/rejected": -554.0, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -4.123046875, "rewards/margins": 9.879687309265137, "rewards/rejected": -13.990625381469727, "step": 3360 }, { "epoch": 2.9591567852437417, "grad_norm": 15.720147696101986, "learning_rate": 2.598857644991213e-07, "logits/chosen": -0.564697265625, "logits/rejected": -0.6291534304618835, "logps/chosen": -337.8999938964844, "logps/rejected": -554.7999877929688, "loss": 0.0163, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9320311546325684, "rewards/margins": 10.496874809265137, "rewards/rejected": -14.428125381469727, "step": 3370 }, { "epoch": 2.9679402722880983, "grad_norm": 1.2068090205326016, "learning_rate": 2.57688927943761e-07, "logits/chosen": -0.622753918170929, "logits/rejected": -0.8487304449081421, "logps/chosen": -403.8500061035156, "logps/rejected": -585.5, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -3.7789063453674316, "rewards/margins": 10.564062118530273, "rewards/rejected": -14.346875190734863, "step": 3380 }, { "epoch": 2.976723759332455, "grad_norm": 7.87480607448499, "learning_rate": 2.5549209138840064e-07, "logits/chosen": -0.596923828125, "logits/rejected": -0.697949230670929, "logps/chosen": -326.8500061035156, "logps/rejected": -562.5, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.803515672683716, "rewards/margins": 10.721875190734863, "rewards/rejected": -13.521875381469727, "step": 3390 }, { "epoch": 2.9855072463768115, "grad_norm": 7.235545566669574, "learning_rate": 2.5329525483304045e-07, "logits/chosen": -0.53173828125, "logits/rejected": -0.552539050579071, "logps/chosen": -293.45001220703125, "logps/rejected": -513.3499755859375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -2.9150390625, "rewards/margins": 8.973437309265137, "rewards/rejected": -11.887499809265137, "step": 3400 }, { "epoch": 2.994290733421168, "grad_norm": 8.735564921457842, "learning_rate": 2.5109841827768016e-07, "logits/chosen": -0.5979980230331421, "logits/rejected": -0.6913086175918579, "logps/chosen": -325.45001220703125, "logps/rejected": -493.20001220703125, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2422852516174316, "rewards/margins": 9.392187118530273, "rewards/rejected": -12.6328125, "step": 3410 }, { "epoch": 3.002635046113307, "grad_norm": 1.9698357816589713, "learning_rate": 2.489015817223198e-07, "logits/chosen": -0.6874197125434875, "logits/rejected": -0.6706414222717285, "logps/chosen": -357.1052551269531, "logps/rejected": -635.7894897460938, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.0847039222717285, "rewards/margins": 11.171052932739258, "rewards/rejected": -15.253289222717285, "step": 3420 }, { "epoch": 3.0114185331576637, "grad_norm": 0.4857285903227084, "learning_rate": 2.467047451669596e-07, "logits/chosen": -0.631884753704071, "logits/rejected": -0.7655273675918579, "logps/chosen": -329.6000061035156, "logps/rejected": -548.4000244140625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.804492235183716, "rewards/margins": 10.703125, "rewards/rejected": -14.518750190734863, "step": 3430 }, { "epoch": 3.0202020202020203, "grad_norm": 1.0825610508353438, "learning_rate": 2.445079086115993e-07, "logits/chosen": -0.669506847858429, "logits/rejected": -0.6805419921875, "logps/chosen": -343.5, "logps/rejected": -554.5999755859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.36328125, "rewards/margins": 11.396875381469727, "rewards/rejected": -15.771875381469727, "step": 3440 }, { "epoch": 3.028985507246377, "grad_norm": 0.11627403049276494, "learning_rate": 2.42311072056239e-07, "logits/chosen": -0.798388659954071, "logits/rejected": -0.8189452886581421, "logps/chosen": -374.0, "logps/rejected": -583.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.042187690734863, "rewards/margins": 11.565625190734863, "rewards/rejected": -16.609375, "step": 3450 }, { "epoch": 3.0377689942907335, "grad_norm": 4.581001584226567, "learning_rate": 2.401142355008787e-07, "logits/chosen": -0.553271472454071, "logits/rejected": -0.7668212652206421, "logps/chosen": -356.1499938964844, "logps/rejected": -561.7999877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.986718654632568, "rewards/margins": 12.396875381469727, "rewards/rejected": -17.399999618530273, "step": 3460 }, { "epoch": 3.04655248133509, "grad_norm": 0.40425497082906864, "learning_rate": 2.3791739894551843e-07, "logits/chosen": -0.655322253704071, "logits/rejected": -0.8287109136581421, "logps/chosen": -341.95001220703125, "logps/rejected": -576.7000122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.336718559265137, "rewards/margins": 12.409375190734863, "rewards/rejected": -17.743749618530273, "step": 3470 }, { "epoch": 3.0553359683794468, "grad_norm": 5.070807016039924, "learning_rate": 2.3572056239015817e-07, "logits/chosen": -0.7774902582168579, "logits/rejected": -0.8922363519668579, "logps/chosen": -321.29998779296875, "logps/rejected": -552.0499877929688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.028906345367432, "rewards/margins": 11.975000381469727, "rewards/rejected": -17.0, "step": 3480 }, { "epoch": 3.0641194554238034, "grad_norm": 1.3106382808969026, "learning_rate": 2.3352372583479788e-07, "logits/chosen": -0.678027331829071, "logits/rejected": -0.8067382574081421, "logps/chosen": -318.54998779296875, "logps/rejected": -532.9000244140625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.9296875, "rewards/margins": 11.764062881469727, "rewards/rejected": -16.709375381469727, "step": 3490 }, { "epoch": 3.07290294246816, "grad_norm": 0.4212040571369489, "learning_rate": 2.313268892794376e-07, "logits/chosen": -0.807910144329071, "logits/rejected": -0.9102538824081421, "logps/chosen": -343.1000061035156, "logps/rejected": -579.0499877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.436816215515137, "rewards/margins": 12.357812881469727, "rewards/rejected": -17.806249618530273, "step": 3500 }, { "epoch": 3.0816864295125166, "grad_norm": 0.8531936976492741, "learning_rate": 2.2913005272407732e-07, "logits/chosen": -0.728344738483429, "logits/rejected": -0.796630859375, "logps/chosen": -383.70001220703125, "logps/rejected": -612.9000244140625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -6.4921875, "rewards/margins": 12.140625, "rewards/rejected": -18.621875762939453, "step": 3510 }, { "epoch": 3.090469916556873, "grad_norm": 0.15177516558137835, "learning_rate": 2.2693321616871705e-07, "logits/chosen": -0.589245617389679, "logits/rejected": -0.7318359613418579, "logps/chosen": -345.1000061035156, "logps/rejected": -561.0999755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.523828029632568, "rewards/margins": 11.8125, "rewards/rejected": -16.328125, "step": 3520 }, { "epoch": 3.09925340360123, "grad_norm": 0.120626959793989, "learning_rate": 2.2473637961335676e-07, "logits/chosen": -0.563403308391571, "logits/rejected": -0.6958252191543579, "logps/chosen": -312.8500061035156, "logps/rejected": -581.5999755859375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.671875, "rewards/margins": 12.015625, "rewards/rejected": -16.693750381469727, "step": 3530 }, { "epoch": 3.1080368906455864, "grad_norm": 1.5319111400515755, "learning_rate": 2.2253954305799647e-07, "logits/chosen": -0.6733245849609375, "logits/rejected": -0.642041027545929, "logps/chosen": -343.45001220703125, "logps/rejected": -575.5, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.588671684265137, "rewards/margins": 11.854687690734863, "rewards/rejected": -16.440624237060547, "step": 3540 }, { "epoch": 3.116820377689943, "grad_norm": 0.8666825887040802, "learning_rate": 2.203427065026362e-07, "logits/chosen": -0.659863293170929, "logits/rejected": -0.7002929449081421, "logps/chosen": -313.0, "logps/rejected": -559.2000122070312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -4.561327934265137, "rewards/margins": 11.546875, "rewards/rejected": -16.112499237060547, "step": 3550 }, { "epoch": 3.1256038647342996, "grad_norm": 5.76368477390638, "learning_rate": 2.181458699472759e-07, "logits/chosen": -0.711315929889679, "logits/rejected": -0.8411865234375, "logps/chosen": -290.29998779296875, "logps/rejected": -498.5, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.840234279632568, "rewards/margins": 11.665624618530273, "rewards/rejected": -16.509374618530273, "step": 3560 }, { "epoch": 3.1343873517786562, "grad_norm": 1.634396421988934, "learning_rate": 2.1594903339191564e-07, "logits/chosen": -0.809374988079071, "logits/rejected": -0.9111328125, "logps/chosen": -356.8500061035156, "logps/rejected": -628.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.405468940734863, "rewards/margins": 12.140625, "rewards/rejected": -17.556249618530273, "step": 3570 }, { "epoch": 3.143170838823013, "grad_norm": 3.491692562433568, "learning_rate": 2.1375219683655535e-07, "logits/chosen": -0.61981201171875, "logits/rejected": -0.758544921875, "logps/chosen": -374.04998779296875, "logps/rejected": -587.5999755859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -5.574999809265137, "rewards/margins": 12.175000190734863, "rewards/rejected": -17.743749618530273, "step": 3580 }, { "epoch": 3.1519543258673695, "grad_norm": 8.377854564568725, "learning_rate": 2.1155536028119509e-07, "logits/chosen": -0.6871337890625, "logits/rejected": -0.737811267375946, "logps/chosen": -347.6000061035156, "logps/rejected": -577.5999755859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.086718559265137, "rewards/margins": 12.268750190734863, "rewards/rejected": -17.368749618530273, "step": 3590 }, { "epoch": 3.160737812911726, "grad_norm": 0.39093916096203324, "learning_rate": 2.093585237258348e-07, "logits/chosen": -0.68994140625, "logits/rejected": -0.800048828125, "logps/chosen": -347.25, "logps/rejected": -568.4000244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.254687309265137, "rewards/margins": 11.350000381469727, "rewards/rejected": -16.606250762939453, "step": 3600 }, { "epoch": 3.1695212999560827, "grad_norm": 0.8937712440022854, "learning_rate": 2.071616871704745e-07, "logits/chosen": -0.643627941608429, "logits/rejected": -0.779711902141571, "logps/chosen": -320.25, "logps/rejected": -520.5999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.577734470367432, "rewards/margins": 11.625, "rewards/rejected": -16.190624237060547, "step": 3610 }, { "epoch": 3.1783047870004393, "grad_norm": 1.0013280002118932, "learning_rate": 2.0496485061511424e-07, "logits/chosen": -0.712646484375, "logits/rejected": -0.719042956829071, "logps/chosen": -345.70001220703125, "logps/rejected": -594.7999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.424218654632568, "rewards/margins": 12.524999618530273, "rewards/rejected": -17.946874618530273, "step": 3620 }, { "epoch": 3.187088274044796, "grad_norm": 1.0102112525468798, "learning_rate": 2.0276801405975394e-07, "logits/chosen": -0.760791003704071, "logits/rejected": -0.7755126953125, "logps/chosen": -376.04998779296875, "logps/rejected": -600.2999877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.429638862609863, "rewards/margins": 12.068750381469727, "rewards/rejected": -17.506250381469727, "step": 3630 }, { "epoch": 3.1958717610891525, "grad_norm": 1.0183227522939449, "learning_rate": 2.0057117750439368e-07, "logits/chosen": -0.709057629108429, "logits/rejected": -0.6863647699356079, "logps/chosen": -331.75, "logps/rejected": -569.4000244140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.873827934265137, "rewards/margins": 12.203125, "rewards/rejected": -17.0625, "step": 3640 }, { "epoch": 3.204655248133509, "grad_norm": 1.2792447838504455, "learning_rate": 1.9837434094903339e-07, "logits/chosen": -0.6567627191543579, "logits/rejected": -0.8201049566268921, "logps/chosen": -311.79998779296875, "logps/rejected": -597.2000122070312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.84765625, "rewards/margins": 12.176562309265137, "rewards/rejected": -17.037500381469727, "step": 3650 }, { "epoch": 3.2134387351778657, "grad_norm": 0.12559651035129127, "learning_rate": 1.9617750439367312e-07, "logits/chosen": -0.815722644329071, "logits/rejected": -0.7769531011581421, "logps/chosen": -352.29998779296875, "logps/rejected": -567.4000244140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -5.123046875, "rewards/margins": 11.434374809265137, "rewards/rejected": -16.575000762939453, "step": 3660 }, { "epoch": 3.2222222222222223, "grad_norm": 0.3880835608117375, "learning_rate": 1.9398066783831283e-07, "logits/chosen": -0.8683105707168579, "logits/rejected": -0.775341808795929, "logps/chosen": -348.5, "logps/rejected": -554.0999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.690234184265137, "rewards/margins": 12.390625, "rewards/rejected": -17.081249237060547, "step": 3670 }, { "epoch": 3.231005709266579, "grad_norm": 0.42291518071994627, "learning_rate": 1.9178383128295253e-07, "logits/chosen": -0.813720703125, "logits/rejected": -0.921582043170929, "logps/chosen": -341.3999938964844, "logps/rejected": -558.0999755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.828906059265137, "rewards/margins": 11.709375381469727, "rewards/rejected": -16.524999618530273, "step": 3680 }, { "epoch": 3.2397891963109355, "grad_norm": 0.22249151307047435, "learning_rate": 1.8958699472759227e-07, "logits/chosen": -0.8407958745956421, "logits/rejected": -0.8814452886581421, "logps/chosen": -382.82501220703125, "logps/rejected": -568.0999755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.096875190734863, "rewards/margins": 11.5625, "rewards/rejected": -16.653125762939453, "step": 3690 }, { "epoch": 3.248572683355292, "grad_norm": 6.592911659395918, "learning_rate": 1.8739015817223198e-07, "logits/chosen": -0.62249755859375, "logits/rejected": -0.9051758050918579, "logps/chosen": -343.6499938964844, "logps/rejected": -568.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.244531154632568, "rewards/margins": 12.003125190734863, "rewards/rejected": -17.253124237060547, "step": 3700 }, { "epoch": 3.2573561703996488, "grad_norm": 0.5254110853134186, "learning_rate": 1.851933216168717e-07, "logits/chosen": -0.7266601324081421, "logits/rejected": -0.863964855670929, "logps/chosen": -361.25, "logps/rejected": -589.2000122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.5703125, "rewards/margins": 12.778124809265137, "rewards/rejected": -18.346874237060547, "step": 3710 }, { "epoch": 3.2661396574440054, "grad_norm": 0.6052952126054115, "learning_rate": 1.8299648506151142e-07, "logits/chosen": -0.5437256097793579, "logits/rejected": -0.8807617425918579, "logps/chosen": -369.29998779296875, "logps/rejected": -617.2000122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -6.411718845367432, "rewards/margins": 12.953125, "rewards/rejected": -19.34375, "step": 3720 }, { "epoch": 3.274923144488362, "grad_norm": 0.2866706320831091, "learning_rate": 1.8079964850615115e-07, "logits/chosen": -0.5754241943359375, "logits/rejected": -0.847460925579071, "logps/chosen": -345.75, "logps/rejected": -591.2000122070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.604687690734863, "rewards/margins": 13.087499618530273, "rewards/rejected": -18.6875, "step": 3730 }, { "epoch": 3.2837066315327186, "grad_norm": 0.15271600506682761, "learning_rate": 1.7860281195079086e-07, "logits/chosen": -0.814404308795929, "logits/rejected": -0.8441406488418579, "logps/chosen": -351.54998779296875, "logps/rejected": -599.7999877929688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.288281440734863, "rewards/margins": 12.356249809265137, "rewards/rejected": -18.643749237060547, "step": 3740 }, { "epoch": 3.292490118577075, "grad_norm": 0.1320380706120885, "learning_rate": 1.7640597539543057e-07, "logits/chosen": -0.7835327386856079, "logits/rejected": -0.769970715045929, "logps/chosen": -371.1000061035156, "logps/rejected": -575.4000244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.296093940734863, "rewards/margins": 13.106249809265137, "rewards/rejected": -18.403125762939453, "step": 3750 }, { "epoch": 3.301273605621432, "grad_norm": 0.1373532237907729, "learning_rate": 1.742091388400703e-07, "logits/chosen": -0.8756774663925171, "logits/rejected": -0.9493163824081421, "logps/chosen": -370.79998779296875, "logps/rejected": -656.5999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.409375190734863, "rewards/margins": 13.765625, "rewards/rejected": -20.168750762939453, "step": 3760 }, { "epoch": 3.3100570926657884, "grad_norm": 1.2453718592139729, "learning_rate": 1.7201230228471e-07, "logits/chosen": -0.6209472417831421, "logits/rejected": -0.793652355670929, "logps/chosen": -420.3500061035156, "logps/rejected": -648.2000122070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.819531440734863, "rewards/margins": 12.853124618530273, "rewards/rejected": -18.643749237060547, "step": 3770 }, { "epoch": 3.318840579710145, "grad_norm": 0.1489782648525199, "learning_rate": 1.6981546572934974e-07, "logits/chosen": -0.6748046875, "logits/rejected": -0.804272472858429, "logps/chosen": -352.79998779296875, "logps/rejected": -602.7000122070312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.942968845367432, "rewards/margins": 13.71875, "rewards/rejected": -18.662500381469727, "step": 3780 }, { "epoch": 3.3276240667545016, "grad_norm": 0.19871166227766085, "learning_rate": 1.6761862917398945e-07, "logits/chosen": -0.5995117425918579, "logits/rejected": -0.775683581829071, "logps/chosen": -288.1000061035156, "logps/rejected": -516.0999755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.289843559265137, "rewards/margins": 11.878125190734863, "rewards/rejected": -16.165624618530273, "step": 3790 }, { "epoch": 3.3364075537988582, "grad_norm": 3.2212776464152615, "learning_rate": 1.6542179261862919e-07, "logits/chosen": -0.7139022946357727, "logits/rejected": -0.9049316644668579, "logps/chosen": -300.95001220703125, "logps/rejected": -556.7999877929688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.397265434265137, "rewards/margins": 12.643750190734863, "rewards/rejected": -17.040624618530273, "step": 3800 }, { "epoch": 3.345191040843215, "grad_norm": 0.32888300472844767, "learning_rate": 1.632249560632689e-07, "logits/chosen": -0.68414306640625, "logits/rejected": -0.905957043170929, "logps/chosen": -390.1000061035156, "logps/rejected": -610.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.37109375, "rewards/margins": 12.387499809265137, "rewards/rejected": -17.75, "step": 3810 }, { "epoch": 3.3539745278875714, "grad_norm": 0.6509340399244322, "learning_rate": 1.610281195079086e-07, "logits/chosen": -0.6949462890625, "logits/rejected": -0.996874988079071, "logps/chosen": -337.6000061035156, "logps/rejected": -607.7000122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.23828125, "rewards/margins": 12.928125381469727, "rewards/rejected": -18.174999237060547, "step": 3820 }, { "epoch": 3.362758014931928, "grad_norm": 3.8441839625949963, "learning_rate": 1.5883128295254834e-07, "logits/chosen": -0.811328113079071, "logits/rejected": -0.9325195550918579, "logps/chosen": -367.6000061035156, "logps/rejected": -619.2999877929688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.882031440734863, "rewards/margins": 12.331250190734863, "rewards/rejected": -18.212499618530273, "step": 3830 }, { "epoch": 3.3715415019762847, "grad_norm": 1.113363074039923, "learning_rate": 1.5663444639718804e-07, "logits/chosen": -0.67041015625, "logits/rejected": -0.683886706829071, "logps/chosen": -346.3500061035156, "logps/rejected": -571.2000122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.285546779632568, "rewards/margins": 13.065625190734863, "rewards/rejected": -18.353124618530273, "step": 3840 }, { "epoch": 3.3803249890206413, "grad_norm": 1.4284484181003771, "learning_rate": 1.5443760984182778e-07, "logits/chosen": -0.7538086175918579, "logits/rejected": -0.8604491949081421, "logps/chosen": -365.3500061035156, "logps/rejected": -589.7000122070312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -5.614843845367432, "rewards/margins": 12.59375, "rewards/rejected": -18.206249237060547, "step": 3850 }, { "epoch": 3.389108476064998, "grad_norm": 0.47949983408537433, "learning_rate": 1.5224077328646749e-07, "logits/chosen": -0.788195788860321, "logits/rejected": -0.830371081829071, "logps/chosen": -351.1000061035156, "logps/rejected": -531.5999755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.085156440734863, "rewards/margins": 11.140625, "rewards/rejected": -16.221874237060547, "step": 3860 }, { "epoch": 3.3978919631093545, "grad_norm": 1.8758923073625002, "learning_rate": 1.5004393673110722e-07, "logits/chosen": -0.683154284954071, "logits/rejected": -0.8568359613418579, "logps/chosen": -358.29998779296875, "logps/rejected": -573.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.255468845367432, "rewards/margins": 11.596875190734863, "rewards/rejected": -16.846874237060547, "step": 3870 }, { "epoch": 3.406675450153711, "grad_norm": 0.10004962016699698, "learning_rate": 1.4784710017574693e-07, "logits/chosen": -0.48969727754592896, "logits/rejected": -0.6861327886581421, "logps/chosen": -340.70001220703125, "logps/rejected": -591.2000122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.8046875, "rewards/margins": 12.3125, "rewards/rejected": -17.125, "step": 3880 }, { "epoch": 3.4154589371980677, "grad_norm": 0.8643026986621021, "learning_rate": 1.4565026362038664e-07, "logits/chosen": -0.643627941608429, "logits/rejected": -0.757250964641571, "logps/chosen": -355.1499938964844, "logps/rejected": -574.5999755859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.896874904632568, "rewards/margins": 12.243749618530273, "rewards/rejected": -17.146875381469727, "step": 3890 }, { "epoch": 3.4242424242424243, "grad_norm": 0.519757644240999, "learning_rate": 1.4345342706502637e-07, "logits/chosen": -0.842456042766571, "logits/rejected": -0.970898449420929, "logps/chosen": -385.3500061035156, "logps/rejected": -610.9000244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.62890625, "rewards/margins": 12.346875190734863, "rewards/rejected": -17.96875, "step": 3900 }, { "epoch": 3.433025911286781, "grad_norm": 1.206817105724007, "learning_rate": 1.4125659050966608e-07, "logits/chosen": -0.7891601324081421, "logits/rejected": -0.8911987543106079, "logps/chosen": -382.79998779296875, "logps/rejected": -627.7999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.856249809265137, "rewards/margins": 13.350000381469727, "rewards/rejected": -19.234375, "step": 3910 }, { "epoch": 3.4418093983311375, "grad_norm": 0.02338955786275608, "learning_rate": 1.390597539543058e-07, "logits/chosen": -0.7966552972793579, "logits/rejected": -0.9984375238418579, "logps/chosen": -344.6499938964844, "logps/rejected": -554.5999755859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.6640625, "rewards/margins": 12.715624809265137, "rewards/rejected": -18.365625381469727, "step": 3920 }, { "epoch": 3.450592885375494, "grad_norm": 2.503542135793396, "learning_rate": 1.3686291739894552e-07, "logits/chosen": -0.7371581792831421, "logits/rejected": -0.82421875, "logps/chosen": -301.29998779296875, "logps/rejected": -592.4000244140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.8359375, "rewards/margins": 12.918749809265137, "rewards/rejected": -17.756250381469727, "step": 3930 }, { "epoch": 3.4593763724198507, "grad_norm": 0.31999049829060405, "learning_rate": 1.3466608084358525e-07, "logits/chosen": -0.8643554449081421, "logits/rejected": -0.949902355670929, "logps/chosen": -322.70001220703125, "logps/rejected": -614.4000244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.157422065734863, "rewards/margins": 13.100000381469727, "rewards/rejected": -18.253124237060547, "step": 3940 }, { "epoch": 3.4681598594642074, "grad_norm": 1.3690410036756941, "learning_rate": 1.3246924428822496e-07, "logits/chosen": -0.7431640625, "logits/rejected": -0.8711913824081421, "logps/chosen": -338.95001220703125, "logps/rejected": -552.7999877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.025000095367432, "rewards/margins": 12.321874618530273, "rewards/rejected": -17.325000762939453, "step": 3950 }, { "epoch": 3.476943346508564, "grad_norm": 0.9963866592133116, "learning_rate": 1.3027240773286467e-07, "logits/chosen": -0.767333984375, "logits/rejected": -0.8601654171943665, "logps/chosen": -319.6000061035156, "logps/rejected": -533.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.089062690734863, "rewards/margins": 11.734375, "rewards/rejected": -16.828125, "step": 3960 }, { "epoch": 3.4857268335529206, "grad_norm": 0.6102091502226259, "learning_rate": 1.280755711775044e-07, "logits/chosen": -0.652539074420929, "logits/rejected": -0.692089855670929, "logps/chosen": -382.79998779296875, "logps/rejected": -617.5999755859375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -5.981640815734863, "rewards/margins": 12.293749809265137, "rewards/rejected": -18.284374237060547, "step": 3970 }, { "epoch": 3.494510320597277, "grad_norm": 0.28944721435938897, "learning_rate": 1.258787346221441e-07, "logits/chosen": -0.8193603754043579, "logits/rejected": -0.83154296875, "logps/chosen": -323.8500061035156, "logps/rejected": -599.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.2578125, "rewards/margins": 13.475000381469727, "rewards/rejected": -18.740625381469727, "step": 3980 }, { "epoch": 3.503293807641634, "grad_norm": 1.2699112162598072, "learning_rate": 1.2368189806678382e-07, "logits/chosen": -0.7298034429550171, "logits/rejected": -0.8475097417831421, "logps/chosen": -360.3999938964844, "logps/rejected": -568.2999877929688, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.374218940734863, "rewards/margins": 11.96875, "rewards/rejected": -17.346874237060547, "step": 3990 }, { "epoch": 3.5120772946859904, "grad_norm": 1.21445466513798, "learning_rate": 1.2148506151142355e-07, "logits/chosen": -0.7781006097793579, "logits/rejected": -0.919726550579071, "logps/chosen": -360.25, "logps/rejected": -593.7000122070312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -5.369531154632568, "rewards/margins": 12.309374809265137, "rewards/rejected": -17.668750762939453, "step": 4000 }, { "epoch": 3.520860781730347, "grad_norm": 0.37449217304403204, "learning_rate": 1.1928822495606326e-07, "logits/chosen": -0.6583007574081421, "logits/rejected": -0.76513671875, "logps/chosen": -369.0, "logps/rejected": -595.2000122070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.332812309265137, "rewards/margins": 12.178125381469727, "rewards/rejected": -17.515625, "step": 4010 }, { "epoch": 3.5296442687747036, "grad_norm": 0.18820118692501386, "learning_rate": 1.1709138840070298e-07, "logits/chosen": -0.8780761957168579, "logits/rejected": -0.798046886920929, "logps/chosen": -380.8500061035156, "logps/rejected": -660.7000122070312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.493750095367432, "rewards/margins": 13.190625190734863, "rewards/rejected": -19.668750762939453, "step": 4020 }, { "epoch": 3.53842775581906, "grad_norm": 1.1737776178840431, "learning_rate": 1.148945518453427e-07, "logits/chosen": -0.7520507574081421, "logits/rejected": -0.8872314691543579, "logps/chosen": -388.3500061035156, "logps/rejected": -594.9000244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.37890625, "rewards/margins": 12.509374618530273, "rewards/rejected": -17.887500762939453, "step": 4030 }, { "epoch": 3.547211242863417, "grad_norm": 0.40392707455640753, "learning_rate": 1.1269771528998242e-07, "logits/chosen": -0.659960925579071, "logits/rejected": -0.9271484613418579, "logps/chosen": -390.3999938964844, "logps/rejected": -571.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.2109375, "rewards/margins": 12.640625, "rewards/rejected": -17.850000381469727, "step": 4040 }, { "epoch": 3.5559947299077734, "grad_norm": 0.09860827664203785, "learning_rate": 1.1050087873462213e-07, "logits/chosen": -0.771484375, "logits/rejected": -0.899121105670929, "logps/chosen": -390.29998779296875, "logps/rejected": -637.5999755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -6.043749809265137, "rewards/margins": 12.984375, "rewards/rejected": -19.037500381469727, "step": 4050 }, { "epoch": 3.56477821695213, "grad_norm": 1.534280906919521, "learning_rate": 1.0830404217926185e-07, "logits/chosen": -0.6519531011581421, "logits/rejected": -0.8147948980331421, "logps/chosen": -414.8999938964844, "logps/rejected": -648.5, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -6.534375190734863, "rewards/margins": 12.125, "rewards/rejected": -18.65625, "step": 4060 }, { "epoch": 3.5735617039964866, "grad_norm": 0.8167720803498206, "learning_rate": 1.0610720562390157e-07, "logits/chosen": -0.7841598391532898, "logits/rejected": -0.90966796875, "logps/chosen": -376.95001220703125, "logps/rejected": -610.0999755859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.750390529632568, "rewards/margins": 12.574999809265137, "rewards/rejected": -18.315624237060547, "step": 4070 }, { "epoch": 3.5823451910408433, "grad_norm": 1.0055268508578736, "learning_rate": 1.039103690685413e-07, "logits/chosen": -0.782910168170929, "logits/rejected": -0.9605468511581421, "logps/chosen": -354.6499938964844, "logps/rejected": -591.2000122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.487500190734863, "rewards/margins": 12.565625190734863, "rewards/rejected": -18.056249618530273, "step": 4080 }, { "epoch": 3.5911286780852, "grad_norm": 0.9421711234169581, "learning_rate": 1.0171353251318102e-07, "logits/chosen": -0.6322021484375, "logits/rejected": -0.860546886920929, "logps/chosen": -309.54998779296875, "logps/rejected": -549.7000122070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.800000190734863, "rewards/margins": 12.178125381469727, "rewards/rejected": -16.984375, "step": 4090 }, { "epoch": 3.5999121651295565, "grad_norm": 0.48502348848461296, "learning_rate": 9.951669595782074e-08, "logits/chosen": -0.7259765863418579, "logits/rejected": -0.875781238079071, "logps/chosen": -335.6499938964844, "logps/rejected": -564.0999755859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.364062309265137, "rewards/margins": 11.506250381469727, "rewards/rejected": -16.850000381469727, "step": 4100 }, { "epoch": 3.608695652173913, "grad_norm": 0.3552856600530717, "learning_rate": 9.731985940246046e-08, "logits/chosen": -0.898144543170929, "logits/rejected": -0.887133777141571, "logps/chosen": -357.1000061035156, "logps/rejected": -614.4000244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.7265625, "rewards/margins": 12.753125190734863, "rewards/rejected": -18.487499237060547, "step": 4110 }, { "epoch": 3.6174791392182697, "grad_norm": 7.800181520285459, "learning_rate": 9.512302284710017e-08, "logits/chosen": -0.653076171875, "logits/rejected": -0.7555907964706421, "logps/chosen": -336.1000061035156, "logps/rejected": -568.8499755859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.805761814117432, "rewards/margins": 12.324999809265137, "rewards/rejected": -17.134374618530273, "step": 4120 }, { "epoch": 3.6262626262626263, "grad_norm": 1.5259738413035562, "learning_rate": 9.292618629173989e-08, "logits/chosen": -0.840771496295929, "logits/rejected": -0.8824218511581421, "logps/chosen": -319.3500061035156, "logps/rejected": -574.9000244140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.239843845367432, "rewards/margins": 12.865625381469727, "rewards/rejected": -18.115625381469727, "step": 4130 }, { "epoch": 3.635046113306983, "grad_norm": 0.3277343269235385, "learning_rate": 9.072934973637961e-08, "logits/chosen": -0.6695801019668579, "logits/rejected": -0.82440185546875, "logps/chosen": -323.3500061035156, "logps/rejected": -529.5999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.821679592132568, "rewards/margins": 11.615625381469727, "rewards/rejected": -16.450000762939453, "step": 4140 }, { "epoch": 3.6438296003513395, "grad_norm": 0.5342608770261899, "learning_rate": 8.853251318101933e-08, "logits/chosen": -0.6636718511581421, "logits/rejected": -0.8084961175918579, "logps/chosen": -324.70001220703125, "logps/rejected": -555.5, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.150781154632568, "rewards/margins": 11.481249809265137, "rewards/rejected": -16.621875762939453, "step": 4150 }, { "epoch": 3.652613087395696, "grad_norm": 0.7111916725311979, "learning_rate": 8.633567662565905e-08, "logits/chosen": -0.7704101800918579, "logits/rejected": -0.810253918170929, "logps/chosen": -377.6499938964844, "logps/rejected": -550.7000122070312, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -5.010937690734863, "rewards/margins": 12.34375, "rewards/rejected": -17.362499237060547, "step": 4160 }, { "epoch": 3.6613965744400527, "grad_norm": 0.6218180668882708, "learning_rate": 8.413884007029877e-08, "logits/chosen": -0.853515625, "logits/rejected": -0.909472644329071, "logps/chosen": -363.5, "logps/rejected": -578.5, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.40625, "rewards/margins": 11.956250190734863, "rewards/rejected": -17.365625381469727, "step": 4170 }, { "epoch": 3.6701800614844093, "grad_norm": 1.0150234728257561, "learning_rate": 8.194200351493849e-08, "logits/chosen": -0.720141589641571, "logits/rejected": -0.899218738079071, "logps/chosen": -340.8999938964844, "logps/rejected": -570.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.9609375, "rewards/margins": 12.331250190734863, "rewards/rejected": -17.278125762939453, "step": 4180 }, { "epoch": 3.678963548528766, "grad_norm": 3.9199498593360547, "learning_rate": 7.97451669595782e-08, "logits/chosen": -0.683398425579071, "logits/rejected": -0.871777355670929, "logps/chosen": -307.3999938964844, "logps/rejected": -526.4000244140625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.525000095367432, "rewards/margins": 11.790624618530273, "rewards/rejected": -16.318750381469727, "step": 4190 }, { "epoch": 3.6877470355731226, "grad_norm": 1.0042343709214525, "learning_rate": 7.754833040421792e-08, "logits/chosen": -0.792675793170929, "logits/rejected": -0.878222644329071, "logps/chosen": -370.8999938964844, "logps/rejected": -580.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.015625, "rewards/margins": 13.018750190734863, "rewards/rejected": -18.037500381469727, "step": 4200 }, { "epoch": 3.696530522617479, "grad_norm": 0.05879685602714947, "learning_rate": 7.535149384885764e-08, "logits/chosen": -0.8681640625, "logits/rejected": -0.934765636920929, "logps/chosen": -356.20001220703125, "logps/rejected": -617.2000122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.675000190734863, "rewards/margins": 12.515625, "rewards/rejected": -18.184375762939453, "step": 4210 }, { "epoch": 3.7053140096618358, "grad_norm": 0.7459190921645195, "learning_rate": 7.315465729349736e-08, "logits/chosen": -0.7363525629043579, "logits/rejected": -0.7402588129043579, "logps/chosen": -389.45001220703125, "logps/rejected": -614.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.888281345367432, "rewards/margins": 12.75, "rewards/rejected": -18.649999618530273, "step": 4220 }, { "epoch": 3.7140974967061924, "grad_norm": 0.16791537200406803, "learning_rate": 7.095782073813708e-08, "logits/chosen": -0.628979504108429, "logits/rejected": -0.88623046875, "logps/chosen": -346.54998779296875, "logps/rejected": -581.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.677343845367432, "rewards/margins": 12.793749809265137, "rewards/rejected": -18.475000381469727, "step": 4230 }, { "epoch": 3.722880983750549, "grad_norm": 0.3766722459937703, "learning_rate": 6.87609841827768e-08, "logits/chosen": -0.6266235113143921, "logits/rejected": -0.767041027545929, "logps/chosen": -366.1000061035156, "logps/rejected": -571.4000244140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -5.175000190734863, "rewards/margins": 12.125, "rewards/rejected": -17.290624618530273, "step": 4240 }, { "epoch": 3.7316644707949056, "grad_norm": 0.42901964636548273, "learning_rate": 6.656414762741652e-08, "logits/chosen": -0.635089099407196, "logits/rejected": -0.8465820550918579, "logps/chosen": -361.75, "logps/rejected": -590.2999877929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.26953125, "rewards/margins": 12.15625, "rewards/rejected": -17.434375762939453, "step": 4250 }, { "epoch": 3.740447957839262, "grad_norm": 0.6101971258876042, "learning_rate": 6.436731107205623e-08, "logits/chosen": -0.728222668170929, "logits/rejected": -0.995312511920929, "logps/chosen": -358.95001220703125, "logps/rejected": -629.5999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.238671779632568, "rewards/margins": 13.403124809265137, "rewards/rejected": -18.649999618530273, "step": 4260 }, { "epoch": 3.749231444883619, "grad_norm": 0.18924798324364295, "learning_rate": 6.217047451669595e-08, "logits/chosen": -0.6636718511581421, "logits/rejected": -0.980273425579071, "logps/chosen": -364.20001220703125, "logps/rejected": -528.7000122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.005468845367432, "rewards/margins": 11.637499809265137, "rewards/rejected": -16.640625, "step": 4270 }, { "epoch": 3.7580149319279754, "grad_norm": 0.4252768428152253, "learning_rate": 5.997363796133567e-08, "logits/chosen": -0.705078125, "logits/rejected": -0.895214855670929, "logps/chosen": -431.1000061035156, "logps/rejected": -616.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.664843559265137, "rewards/margins": 12.106249809265137, "rewards/rejected": -17.756250381469727, "step": 4280 }, { "epoch": 3.766798418972332, "grad_norm": 0.24749015058843712, "learning_rate": 5.7776801405975395e-08, "logits/chosen": -0.7414306402206421, "logits/rejected": -0.815625011920929, "logps/chosen": -361.5, "logps/rejected": -621.7999877929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.099999904632568, "rewards/margins": 11.893750190734863, "rewards/rejected": -16.984375, "step": 4290 }, { "epoch": 3.7755819060166886, "grad_norm": 0.34039804259793743, "learning_rate": 5.5579964850615116e-08, "logits/chosen": -0.773974597454071, "logits/rejected": -0.941699206829071, "logps/chosen": -335.04998779296875, "logps/rejected": -593.9000244140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.869140625, "rewards/margins": 12.690625190734863, "rewards/rejected": -17.559375762939453, "step": 4300 }, { "epoch": 3.7843653930610452, "grad_norm": 1.083507956106083, "learning_rate": 5.338312829525484e-08, "logits/chosen": -0.70916748046875, "logits/rejected": -0.8550781011581421, "logps/chosen": -331.45001220703125, "logps/rejected": -552.75, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.585156440734863, "rewards/margins": 12.084375381469727, "rewards/rejected": -16.678125381469727, "step": 4310 }, { "epoch": 3.793148880105402, "grad_norm": 0.18944377089963302, "learning_rate": 5.1186291739894545e-08, "logits/chosen": -0.6427246332168579, "logits/rejected": -0.8628906011581421, "logps/chosen": -359.3999938964844, "logps/rejected": -596.2000122070312, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -5.824999809265137, "rewards/margins": 13.109375, "rewards/rejected": -18.946874618530273, "step": 4320 }, { "epoch": 3.8019323671497585, "grad_norm": 0.12274363944606641, "learning_rate": 4.8989455184534266e-08, "logits/chosen": -0.5781799554824829, "logits/rejected": -0.791796863079071, "logps/chosen": -344.20001220703125, "logps/rejected": -580.2000122070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.828906059265137, "rewards/margins": 12.850000381469727, "rewards/rejected": -17.684375762939453, "step": 4330 }, { "epoch": 3.810715854194115, "grad_norm": 1.8447122642955467, "learning_rate": 4.679261862917399e-08, "logits/chosen": -0.6944824457168579, "logits/rejected": -0.966601550579071, "logps/chosen": -373.54998779296875, "logps/rejected": -588.2000122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.485156059265137, "rewards/margins": 11.865625381469727, "rewards/rejected": -17.359375, "step": 4340 }, { "epoch": 3.8194993412384717, "grad_norm": 1.8040121868134822, "learning_rate": 4.45957820738137e-08, "logits/chosen": -0.820727527141571, "logits/rejected": -0.8787597417831421, "logps/chosen": -366.1000061035156, "logps/rejected": -616.2999877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.779687404632568, "rewards/margins": 12.899999618530273, "rewards/rejected": -18.678125381469727, "step": 4350 }, { "epoch": 3.8282828282828283, "grad_norm": 0.7811412759968733, "learning_rate": 4.239894551845342e-08, "logits/chosen": -0.629986584186554, "logits/rejected": -0.713623046875, "logps/chosen": -368.20001220703125, "logps/rejected": -577.0999755859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.564062595367432, "rewards/margins": 12.324999809265137, "rewards/rejected": -17.896875381469727, "step": 4360 }, { "epoch": 3.837066315327185, "grad_norm": 4.335908812472759, "learning_rate": 4.020210896309314e-08, "logits/chosen": -0.7867187261581421, "logits/rejected": -0.7392944097518921, "logps/chosen": -373.3999938964844, "logps/rejected": -578.7000122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.256249904632568, "rewards/margins": 12.646875381469727, "rewards/rejected": -17.890625, "step": 4370 }, { "epoch": 3.8458498023715415, "grad_norm": 5.462106863372973, "learning_rate": 3.8005272407732864e-08, "logits/chosen": -0.841601550579071, "logits/rejected": -0.962207019329071, "logps/chosen": -372.75, "logps/rejected": -603.7000122070312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.821875095367432, "rewards/margins": 12.90625, "rewards/rejected": -18.71875, "step": 4380 }, { "epoch": 3.854633289415898, "grad_norm": 0.5539964668668165, "learning_rate": 3.580843585237258e-08, "logits/chosen": -0.628857433795929, "logits/rejected": -0.872851550579071, "logps/chosen": -306.32501220703125, "logps/rejected": -564.9000244140625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.810937404632568, "rewards/margins": 12.84375, "rewards/rejected": -17.643749237060547, "step": 4390 }, { "epoch": 3.8634167764602547, "grad_norm": 0.19094031328079536, "learning_rate": 3.36115992970123e-08, "logits/chosen": -0.7801269292831421, "logits/rejected": -0.9029296636581421, "logps/chosen": -330.8500061035156, "logps/rejected": -557.5999755859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.775000095367432, "rewards/margins": 12.770312309265137, "rewards/rejected": -17.543750762939453, "step": 4400 }, { "epoch": 3.8722002635046113, "grad_norm": 1.2718564213054837, "learning_rate": 3.141476274165202e-08, "logits/chosen": -0.730712890625, "logits/rejected": -0.820068359375, "logps/chosen": -406.8999938964844, "logps/rejected": -644.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.0703125, "rewards/margins": 12.834375381469727, "rewards/rejected": -19.896875381469727, "step": 4410 }, { "epoch": 3.880983750548968, "grad_norm": 0.2775425492417625, "learning_rate": 2.9217926186291738e-08, "logits/chosen": -0.7862793207168579, "logits/rejected": -0.856152355670929, "logps/chosen": -396.25, "logps/rejected": -592.2999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.849218845367432, "rewards/margins": 12.5625, "rewards/rejected": -18.412500381469727, "step": 4420 }, { "epoch": 3.8897672375933245, "grad_norm": 0.9255690071346867, "learning_rate": 2.7021089630931456e-08, "logits/chosen": -0.5414794683456421, "logits/rejected": -0.86962890625, "logps/chosen": -329.70001220703125, "logps/rejected": -597.2999877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.1640625, "rewards/margins": 12.887499809265137, "rewards/rejected": -18.046875, "step": 4430 }, { "epoch": 3.898550724637681, "grad_norm": 0.09509219960006751, "learning_rate": 2.4824253075571177e-08, "logits/chosen": -0.733349621295929, "logits/rejected": -0.8858398199081421, "logps/chosen": -347.29998779296875, "logps/rejected": -577.5999755859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.623437404632568, "rewards/margins": 12.990625381469727, "rewards/rejected": -18.609375, "step": 4440 }, { "epoch": 3.9073342116820378, "grad_norm": 0.1492750588536168, "learning_rate": 2.2627416520210894e-08, "logits/chosen": -0.7288147211074829, "logits/rejected": -0.9326171875, "logps/chosen": -320.04998779296875, "logps/rejected": -554.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -4.787499904632568, "rewards/margins": 13.546875, "rewards/rejected": -18.353124618530273, "step": 4450 }, { "epoch": 3.9161176987263944, "grad_norm": 2.821939353521206, "learning_rate": 2.0430579964850612e-08, "logits/chosen": -0.79736328125, "logits/rejected": -0.791369616985321, "logps/chosen": -333.8999938964844, "logps/rejected": -573.0, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.453125, "rewards/margins": 12.649999618530273, "rewards/rejected": -18.100000381469727, "step": 4460 }, { "epoch": 3.924901185770751, "grad_norm": 0.7860293457274615, "learning_rate": 1.8233743409490333e-08, "logits/chosen": -0.5291992425918579, "logits/rejected": -0.8553711175918579, "logps/chosen": -275.125, "logps/rejected": -494.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.118359565734863, "rewards/margins": 12.365625381469727, "rewards/rejected": -16.481250762939453, "step": 4470 }, { "epoch": 3.9336846728151076, "grad_norm": 0.5347462186914494, "learning_rate": 1.603690685413005e-08, "logits/chosen": -0.908709704875946, "logits/rejected": -0.898730456829071, "logps/chosen": -336.45001220703125, "logps/rejected": -563.7000122070312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -5.376562595367432, "rewards/margins": 12.512499809265137, "rewards/rejected": -17.887500762939453, "step": 4480 }, { "epoch": 3.942468159859464, "grad_norm": 0.6051490997330453, "learning_rate": 1.3840070298769772e-08, "logits/chosen": -0.61871337890625, "logits/rejected": -0.8662109375, "logps/chosen": -299.5, "logps/rejected": -521.5999755859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.442187309265137, "rewards/margins": 12.171875, "rewards/rejected": -16.596874237060547, "step": 4490 }, { "epoch": 3.951251646903821, "grad_norm": 0.09922337635807642, "learning_rate": 1.164323374340949e-08, "logits/chosen": -0.6731933355331421, "logits/rejected": -0.9261718988418579, "logps/chosen": -368.1499938964844, "logps/rejected": -579.5, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.251562595367432, "rewards/margins": 13.153124809265137, "rewards/rejected": -18.415624618530273, "step": 4500 }, { "epoch": 3.9600351339481774, "grad_norm": 5.74323605673917, "learning_rate": 9.446397188049209e-09, "logits/chosen": -0.530517578125, "logits/rejected": -0.783935546875, "logps/chosen": -353.25, "logps/rejected": -564.2000122070312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -5.331250190734863, "rewards/margins": 11.571874618530273, "rewards/rejected": -16.896875381469727, "step": 4510 }, { "epoch": 3.968818620992534, "grad_norm": 0.22821694470083217, "learning_rate": 7.249560632688927e-09, "logits/chosen": -0.6523681879043579, "logits/rejected": -1.005957007408142, "logps/chosen": -394.04998779296875, "logps/rejected": -567.5999755859375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -5.9453125, "rewards/margins": 11.743749618530273, "rewards/rejected": -17.690624237060547, "step": 4520 }, { "epoch": 3.9776021080368906, "grad_norm": 5.352910616931071, "learning_rate": 5.0527240773286466e-09, "logits/chosen": -0.699633777141571, "logits/rejected": -0.764208972454071, "logps/chosen": -339.75, "logps/rejected": -611.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.450781345367432, "rewards/margins": 12.853124618530273, "rewards/rejected": -18.315624237060547, "step": 4530 }, { "epoch": 3.9863855950812472, "grad_norm": 0.2362407465685419, "learning_rate": 2.8558875219683655e-09, "logits/chosen": -0.6990722417831421, "logits/rejected": -0.946044921875, "logps/chosen": -363.29998779296875, "logps/rejected": -585.4000244140625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -5.189843654632568, "rewards/margins": 12.456250190734863, "rewards/rejected": -17.640625, "step": 4540 }, { "epoch": 3.995169082125604, "grad_norm": 1.7024768923286342, "learning_rate": 6.590509666080844e-10, "logits/chosen": -0.726208508014679, "logits/rejected": -0.8506835699081421, "logps/chosen": -350.6499938964844, "logps/rejected": -584.7999877929688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.971875190734863, "rewards/margins": 12.178125381469727, "rewards/rejected": -17.162500381469727, "step": 4550 } ], "logging_steps": 10, "max_steps": 4552, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }