{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 515,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 6.964529721458028,
      "learning_rate": 8.653846153846154e-07,
      "loss": 0.9101,
      "step": 10
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 3.566929368993238,
      "learning_rate": 1.826923076923077e-06,
      "loss": 0.785,
      "step": 20
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 2.572419911943158,
      "learning_rate": 2.7884615384615386e-06,
      "loss": 0.6303,
      "step": 30
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.6433805154127846,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5371,
      "step": 40
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.5152897860452502,
      "learning_rate": 4.711538461538462e-06,
      "loss": 0.4788,
      "step": 50
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.7787775149779967,
      "learning_rate": 4.997180564209414e-06,
      "loss": 0.4451,
      "step": 60
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.8091799603982648,
      "learning_rate": 4.9833863897161715e-06,
      "loss": 0.4145,
      "step": 70
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.8426511044153906,
      "learning_rate": 4.95816302585984e-06,
      "loss": 0.3935,
      "step": 80
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.8115389627762832,
      "learning_rate": 4.9216265571140565e-06,
      "loss": 0.3753,
      "step": 90
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.5597270195195797,
      "learning_rate": 4.8739451338003675e-06,
      "loss": 0.3679,
      "step": 100
    },
    {
      "epoch": 1.0679611650485437,
      "grad_norm": 0.41701001823219247,
      "learning_rate": 4.815338198216762e-06,
      "loss": 0.3288,
      "step": 110
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 22.448277335293643,
      "learning_rate": 4.746075474707204e-06,
      "loss": 0.3192,
      "step": 120
    },
    {
      "epoch": 1.262135922330097,
      "grad_norm": 0.4010902102419476,
      "learning_rate": 4.666475728320124e-06,
      "loss": 0.3189,
      "step": 130
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.48930786140814836,
      "learning_rate": 4.576905297768856e-06,
      "loss": 0.3132,
      "step": 140
    },
    {
      "epoch": 1.4563106796116505,
      "grad_norm": 0.3541487037094227,
      "learning_rate": 4.477776409445692e-06,
      "loss": 0.3038,
      "step": 150
    },
    {
      "epoch": 1.5533980582524272,
      "grad_norm": 31.33298474339703,
      "learning_rate": 4.369545280248932e-06,
      "loss": 0.2957,
      "step": 160
    },
    {
      "epoch": 1.650485436893204,
      "grad_norm": 0.3578518918951451,
      "learning_rate": 4.252710017954191e-06,
      "loss": 0.2966,
      "step": 170
    },
    {
      "epoch": 1.7475728155339807,
      "grad_norm": 0.33938843089336523,
      "learning_rate": 4.127808328793e-06,
      "loss": 0.3004,
      "step": 180
    },
    {
      "epoch": 1.8446601941747574,
      "grad_norm": 0.32000682056080126,
      "learning_rate": 3.995415042789034e-06,
      "loss": 0.3086,
      "step": 190
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 0.3829047834599722,
      "learning_rate": 3.856139468240996e-06,
      "loss": 0.2868,
      "step": 200
    },
    {
      "epoch": 2.0388349514563107,
      "grad_norm": 0.4031366118857969,
      "learning_rate": 3.7106225875275257e-06,
      "loss": 0.2779,
      "step": 210
    },
    {
      "epoch": 2.1359223300970873,
      "grad_norm": 0.37263381399379847,
      "learning_rate": 3.5595341071397627e-06,
      "loss": 0.2504,
      "step": 220
    },
    {
      "epoch": 2.233009708737864,
      "grad_norm": 0.3332345121427193,
      "learning_rate": 3.4035693755180817e-06,
      "loss": 0.2466,
      "step": 230
    },
    {
      "epoch": 2.3300970873786406,
      "grad_norm": 0.2989895988940855,
      "learning_rate": 3.2434461828779096e-06,
      "loss": 0.2387,
      "step": 240
    },
    {
      "epoch": 2.4271844660194173,
      "grad_norm": 0.3583335796728048,
      "learning_rate": 3.0799014577526735e-06,
      "loss": 0.2418,
      "step": 250
    },
    {
      "epoch": 2.524271844660194,
      "grad_norm": 0.38853396086729947,
      "learning_rate": 2.9136878754572317e-06,
      "loss": 0.2542,
      "step": 260
    },
    {
      "epoch": 2.6213592233009706,
      "grad_norm": 0.3547407166001253,
      "learning_rate": 2.7455703940805228e-06,
      "loss": 0.2419,
      "step": 270
    },
    {
      "epoch": 2.7184466019417477,
      "grad_norm": 0.33729197409950573,
      "learning_rate": 2.5763227339496984e-06,
      "loss": 0.24,
      "step": 280
    },
    {
      "epoch": 2.8155339805825244,
      "grad_norm": 0.33325471675524143,
      "learning_rate": 2.4067238167681655e-06,
      "loss": 0.2414,
      "step": 290
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 0.3516852631876031,
      "learning_rate": 2.237554180815538e-06,
      "loss": 0.2413,
      "step": 300
    },
    {
      "epoch": 3.0097087378640777,
      "grad_norm": 0.3860131260501091,
      "learning_rate": 2.0695923887076824e-06,
      "loss": 0.241,
      "step": 310
    },
    {
      "epoch": 3.1067961165048543,
      "grad_norm": 0.3824808831772379,
      "learning_rate": 1.9036114442492901e-06,
      "loss": 0.2095,
      "step": 320
    },
    {
      "epoch": 3.203883495145631,
      "grad_norm": 0.39411524156521255,
      "learning_rate": 1.7403752348695296e-06,
      "loss": 0.211,
      "step": 330
    },
    {
      "epoch": 3.3009708737864076,
      "grad_norm": 0.3829443147282107,
      "learning_rate": 1.5806350160136446e-06,
      "loss": 0.2072,
      "step": 340
    },
    {
      "epoch": 3.3980582524271843,
      "grad_norm": 0.36458747976200806,
      "learning_rate": 1.4251259536702078e-06,
      "loss": 0.1964,
      "step": 350
    },
    {
      "epoch": 3.4951456310679614,
      "grad_norm": 0.37575548194010133,
      "learning_rate": 1.2745637409462447e-06,
      "loss": 0.2088,
      "step": 360
    },
    {
      "epoch": 3.592233009708738,
      "grad_norm": 0.3652962786931115,
      "learning_rate": 1.1296413042616115e-06,
      "loss": 0.1994,
      "step": 370
    },
    {
      "epoch": 3.6893203883495147,
      "grad_norm": 0.3712933316004299,
      "learning_rate": 9.910256143215882e-07,
      "loss": 0.2045,
      "step": 380
    },
    {
      "epoch": 3.7864077669902914,
      "grad_norm": 0.3461802470195759,
      "learning_rate": 8.593546165444078e-07,
      "loss": 0.2045,
      "step": 390
    },
    {
      "epoch": 3.883495145631068,
      "grad_norm": 0.35407836747337357,
      "learning_rate": 7.352342950706964e-07,
      "loss": 0.2067,
      "step": 400
    },
    {
      "epoch": 3.9805825242718447,
      "grad_norm": 0.35720973588507643,
      "learning_rate": 6.192358838670293e-07,
      "loss": 0.1983,
      "step": 410
    },
    {
      "epoch": 4.077669902912621,
      "grad_norm": 0.49136621176532774,
      "learning_rate": 5.118932377587984e-07,
      "loss": 0.186,
      "step": 420
    },
    {
      "epoch": 4.174757281553398,
      "grad_norm": 0.3964877004137412,
      "learning_rate": 4.137003754916105e-07,
      "loss": 0.1931,
      "step": 430
    },
    {
      "epoch": 4.271844660194175,
      "grad_norm": 0.34705190574049327,
      "learning_rate": 3.2510920612867284e-07,
      "loss": 0.1775,
      "step": 440
    },
    {
      "epoch": 4.368932038834951,
      "grad_norm": 0.37657649802404225,
      "learning_rate": 2.4652744924787253e-07,
      "loss": 0.1913,
      "step": 450
    },
    {
      "epoch": 4.466019417475728,
      "grad_norm": 0.3797812117921156,
      "learning_rate": 1.7831675851035264e-07,
      "loss": 0.193,
      "step": 460
    },
    {
      "epoch": 4.563106796116505,
      "grad_norm": 0.35942347600355457,
      "learning_rate": 1.207910572364046e-07,
      "loss": 0.1828,
      "step": 470
    },
    {
      "epoch": 4.660194174757281,
      "grad_norm": 0.34568138491238304,
      "learning_rate": 7.421509364878927e-08,
      "loss": 0.1833,
      "step": 480
    },
    {
      "epoch": 4.757281553398058,
      "grad_norm": 0.6159835033744901,
      "learning_rate": 3.8803222432630685e-08,
      "loss": 0.1832,
      "step": 490
    },
    {
      "epoch": 4.854368932038835,
      "grad_norm": 0.38886425151937265,
      "learning_rate": 1.4718418219468178e-08,
      "loss": 0.1827,
      "step": 500
    },
    {
      "epoch": 4.951456310679612,
      "grad_norm": 0.4089928939727998,
      "learning_rate": 2.0715255356559826e-09,
      "loss": 0.1829,
      "step": 510
    },
    {
      "epoch": 5.0,
      "step": 515,
      "total_flos": 944499472728064.0,
      "train_loss": 0.29363987642584494,
      "train_runtime": 38623.7413,
      "train_samples_per_second": 0.426,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 10,
  "max_steps": 515,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 944499472728064.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}