{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02280718426304286, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 295.25, "epoch": 1.9005986885869048e-05, "grad_norm": 2.673111623301322, "kl": 0.0, "learning_rate": 9.999999991087068e-07, "loss": -0.0, "reward": 1.7687500715255737, "reward_std": 0.20764468610286713, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.9750000238418579, "step": 1, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 367.3500061035156, "epoch": 3.8011973771738095e-05, "grad_norm": 2.082116157134833, "kl": 0.000606536865234375, "learning_rate": 9.99999996434827e-07, "loss": 0.0, "reward": 1.5409200191497803, "reward_std": 0.2872665822505951, "rewards/accuracy_reward": 0.46467000246047974, "rewards/format_reward": 1.0, "step": 2, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 279.8999938964844, "epoch": 5.7017960657607147e-05, "grad_norm": 4.104595525624102, "kl": 0.000659942626953125, "learning_rate": 9.99999991978361e-07, "loss": 0.0, "reward": 1.492989420890808, "reward_std": 0.32215064764022827, "rewards/accuracy_reward": 0.4917394816875458, "rewards/format_reward": 0.9750000238418579, "step": 3, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 299.6499938964844, "epoch": 7.602394754347619e-05, "grad_norm": 4.066751398647228, "kl": 0.000591278076171875, "learning_rate": 9.999999857393084e-07, "loss": 0.0, "reward": 1.9667459726333618, "reward_std": 0.2620104253292084, "rewards/accuracy_reward": 0.7704960703849792, "rewards/format_reward": 1.0, "step": 4, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 337.8500061035156, "epoch": 9.502993442934525e-05, "grad_norm": 1.8447963431136554, "kl": 0.0005340576171875, "learning_rate": 9.999999777176696e-07, "loss": 0.0, "reward": 1.6162500381469727, "reward_std": 0.2701939046382904, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.824999988079071, "step": 5, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 214.75, "epoch": 0.00011403592131521429, "grad_norm": 4.865076982742215, "kl": 0.000720977783203125, "learning_rate": 9.999999679134443e-07, "loss": 0.0, "reward": 1.943750023841858, "reward_std": 0.2636513411998749, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 6, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 251.22500610351562, "epoch": 0.00013304190820108335, "grad_norm": 2.734172384623042, "kl": 0.00066375732421875, "learning_rate": 9.999999563266326e-07, "loss": 0.0, "reward": 1.7112499475479126, "reward_std": 0.42926207184791565, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 7, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 285.2749938964844, "epoch": 0.00015204789508695238, "grad_norm": 2.5397443822078283, "kl": 0.000606536865234375, "learning_rate": 9.999999429572349e-07, "loss": 0.0, "reward": 1.7121597528457642, "reward_std": 0.24398386478424072, "rewards/accuracy_reward": 0.6559095978736877, "rewards/format_reward": 0.925000011920929, "step": 8, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 322.875, "epoch": 0.00017105388197282144, "grad_norm": 2.8129541643991116, "kl": 0.000865936279296875, "learning_rate": 9.999999278052507e-07, "loss": 0.0, "reward": 1.6165634393692017, "reward_std": 0.45454278588294983, "rewards/accuracy_reward": 0.5178134441375732, "rewards/format_reward": 0.9750000238418579, "step": 9, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 221.0, "epoch": 0.0001900598688586905, "grad_norm": 1.9687283034904635, "kl": 0.00093841552734375, "learning_rate": 9.999999108706803e-07, "loss": 0.0, "reward": 1.777500033378601, "reward_std": 0.3578013777732849, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 10, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 194.4250030517578, "epoch": 0.00020906585574455953, "grad_norm": 2.3991361721082303, "kl": 0.00106048583984375, "learning_rate": 9.999998921535239e-07, "loss": 0.0, "reward": 1.9177086353302002, "reward_std": 0.1179908737540245, "rewards/accuracy_reward": 0.6414585709571838, "rewards/format_reward": 1.0, "step": 11, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 314.5, "epoch": 0.00022807184263042859, "grad_norm": 1.886022960586048, "kl": 0.00086212158203125, "learning_rate": 9.999998716537811e-07, "loss": 0.0, "reward": 1.2020833492279053, "reward_std": 0.1961066573858261, "rewards/accuracy_reward": 0.3283333480358124, "rewards/format_reward": 0.800000011920929, "step": 12, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 383.6750183105469, "epoch": 0.0002470778295162976, "grad_norm": 2.709438114169319, "kl": 0.0014495849609375, "learning_rate": 9.999998493714527e-07, "loss": 0.0001, "reward": 1.718187689781189, "reward_std": 0.4320615828037262, "rewards/accuracy_reward": 0.6806877255439758, "rewards/format_reward": 0.925000011920929, "step": 13, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 304.45001220703125, "epoch": 0.0002660838164021667, "grad_norm": 3.7270686122717542, "kl": 0.001220703125, "learning_rate": 9.999998253065383e-07, "loss": 0.0001, "reward": 1.809033989906311, "reward_std": 0.04920737445354462, "rewards/accuracy_reward": 0.6952840685844421, "rewards/format_reward": 1.0, "step": 14, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 333.375, "epoch": 0.00028508980328803573, "grad_norm": 4.998729494746825, "kl": 0.0012664794921875, "learning_rate": 9.99999799459038e-07, "loss": 0.0, "reward": 1.8887499570846558, "reward_std": 0.2809317409992218, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.925000011920929, "step": 15, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 286.1750183105469, "epoch": 0.00030409579017390476, "grad_norm": 2.0233101130622044, "kl": 0.00133514404296875, "learning_rate": 9.99999771828952e-07, "loss": 0.0001, "reward": 1.5862499475479126, "reward_std": 0.13032536208629608, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 16, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 250.8000030517578, "epoch": 0.00032310177705977385, "grad_norm": 2.3714583420433533, "kl": 0.0020294189453125, "learning_rate": 9.999997424162806e-07, "loss": 0.0001, "reward": 1.9924728870391846, "reward_std": 0.11796430498361588, "rewards/accuracy_reward": 0.776222825050354, "rewards/format_reward": 1.0, "step": 17, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 303.3999938964844, "epoch": 0.0003421077639456429, "grad_norm": 1.9635571237272242, "kl": 0.00160980224609375, "learning_rate": 9.999997112210234e-07, "loss": 0.0001, "reward": 1.7024999856948853, "reward_std": 0.1976005882024765, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 18, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 292.5249938964844, "epoch": 0.0003611137508315119, "grad_norm": 2.2403687588157464, "kl": 0.0022430419921875, "learning_rate": 9.999996782431807e-07, "loss": 0.0001, "reward": 1.723750114440918, "reward_std": 0.1547650843858719, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 19, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 236.10000610351562, "epoch": 0.000380119737717381, "grad_norm": 2.817841479009199, "kl": 0.0023193359375, "learning_rate": 9.99999643482753e-07, "loss": 0.0001, "reward": 2.0712499618530273, "reward_std": 0.2146197408437729, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 20, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 356.32501220703125, "epoch": 0.00039912572460325, "grad_norm": 2.36363757335534, "kl": 0.00174713134765625, "learning_rate": 9.999996069397399e-07, "loss": 0.0001, "reward": 1.4609562158584595, "reward_std": 0.47292444109916687, "rewards/accuracy_reward": 0.4997062683105469, "rewards/format_reward": 0.949999988079071, "step": 21, "temporal_rewards": 0.29999998211860657 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 291.0500183105469, "epoch": 0.00041813171148911906, "grad_norm": 2.368744313375501, "kl": 0.0030364990234375, "learning_rate": 9.999995686141417e-07, "loss": 0.0001, "reward": 2.262500047683716, "reward_std": 0.12062937021255493, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 22, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 377.82501220703125, "epoch": 0.00043713769837498814, "grad_norm": 1.9474999500583028, "kl": 0.0031585693359375, "learning_rate": 9.999995285059588e-07, "loss": 0.0001, "reward": 1.4155882596969604, "reward_std": 0.29959985613822937, "rewards/accuracy_reward": 0.4380883276462555, "rewards/format_reward": 0.949999988079071, "step": 23, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 297.3999938964844, "epoch": 0.00045614368526085717, "grad_norm": 2.610511274752285, "kl": 0.0042724609375, "learning_rate": 9.999994866151911e-07, "loss": 0.0002, "reward": 1.799687385559082, "reward_std": 0.13027535378932953, "rewards/accuracy_reward": 0.629687488079071, "rewards/format_reward": 0.9750000238418579, "step": 24, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 281.07501220703125, "epoch": 0.0004751496721467262, "grad_norm": 2.114629919066998, "kl": 0.0040283203125, "learning_rate": 9.999994429418386e-07, "loss": 0.0002, "reward": 1.6384057998657227, "reward_std": 0.18284937739372253, "rewards/accuracy_reward": 0.540905773639679, "rewards/format_reward": 1.0, "step": 25, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 324.2749938964844, "epoch": 0.0004941556590325952, "grad_norm": 2.755769295658409, "kl": 0.003692626953125, "learning_rate": 9.99999397485902e-07, "loss": 0.0001, "reward": 1.8387451171875, "reward_std": 0.18429307639598846, "rewards/accuracy_reward": 0.7824951410293579, "rewards/format_reward": 0.9750000238418579, "step": 26, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 234.65000915527344, "epoch": 0.0005131616459184643, "grad_norm": 4.940365643192497, "kl": 0.00787353515625, "learning_rate": 9.999993502473808e-07, "loss": 0.0003, "reward": 1.9099998474121094, "reward_std": 0.25944098830223083, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 27, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 251.65000915527344, "epoch": 0.0005321676328043334, "grad_norm": 2.3419283166589544, "kl": 0.007720947265625, "learning_rate": 9.999993012262756e-07, "loss": 0.0003, "reward": 1.896433711051941, "reward_std": 0.2827633321285248, "rewards/accuracy_reward": 0.6639335751533508, "rewards/format_reward": 1.0, "step": 28, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 316.20001220703125, "epoch": 0.0005511736196902024, "grad_norm": 1.86592975728736, "kl": 0.006134033203125, "learning_rate": 9.999992504225862e-07, "loss": 0.0002, "reward": 1.3874719142913818, "reward_std": 0.2548384666442871, "rewards/accuracy_reward": 0.32372185587882996, "rewards/format_reward": 1.0, "step": 29, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 271.3500061035156, "epoch": 0.0005701796065760715, "grad_norm": 3.968800523270615, "kl": 0.0074462890625, "learning_rate": 9.999991978363134e-07, "loss": 0.0003, "reward": 1.781022310256958, "reward_std": 0.21934275329113007, "rewards/accuracy_reward": 0.6797724962234497, "rewards/format_reward": 1.0, "step": 30, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 391.3999938964844, "epoch": 0.0005891855934619405, "grad_norm": 1.7032289212136063, "kl": 0.00665283203125, "learning_rate": 9.999991434674566e-07, "loss": 0.0003, "reward": 1.7510108947753906, "reward_std": 0.2869863510131836, "rewards/accuracy_reward": 0.7847608923912048, "rewards/format_reward": 0.949999988079071, "step": 31, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.1499938964844, "epoch": 0.0006081915803478095, "grad_norm": 2.358577127026973, "kl": 0.00469970703125, "learning_rate": 9.999990873160167e-07, "loss": 0.0002, "reward": 1.7725000381469727, "reward_std": 0.45565930008888245, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.9000000357627869, "step": 32, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 352.7749938964844, "epoch": 0.0006271975672336786, "grad_norm": 2.056994484616452, "kl": 0.006317138671875, "learning_rate": 9.999990293819936e-07, "loss": 0.0003, "reward": 1.6348778009414673, "reward_std": 0.4777977466583252, "rewards/accuracy_reward": 0.6273777484893799, "rewards/format_reward": 0.925000011920929, "step": 33, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 393.7749938964844, "epoch": 0.0006462035541195477, "grad_norm": 2.221129428967049, "kl": 0.007415771484375, "learning_rate": 9.999989696653875e-07, "loss": 0.0003, "reward": 1.435416579246521, "reward_std": 0.33462318778038025, "rewards/accuracy_reward": 0.416666716337204, "rewards/format_reward": 0.9750000238418579, "step": 34, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 299.4250183105469, "epoch": 0.0006652095410054167, "grad_norm": 2.2926228153550388, "kl": 0.010986328125, "learning_rate": 9.999989081661987e-07, "loss": 0.0004, "reward": 1.2972177267074585, "reward_std": 0.3400118052959442, "rewards/accuracy_reward": 0.1997176855802536, "rewards/format_reward": 1.0, "step": 35, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 247.9250030517578, "epoch": 0.0006842155278912858, "grad_norm": 2.5878857876544337, "kl": 0.0179443359375, "learning_rate": 9.999988448844271e-07, "loss": 0.0007, "reward": 2.221250057220459, "reward_std": 0.17021353542804718, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 36, "temporal_rewards": 1.0 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 283.9250183105469, "epoch": 0.0007032215147771548, "grad_norm": 2.2395132656722803, "kl": 0.0133056640625, "learning_rate": 9.999987798200732e-07, "loss": 0.0005, "reward": 2.171196699142456, "reward_std": 0.0765160396695137, "rewards/accuracy_reward": 0.8761968612670898, "rewards/format_reward": 1.0, "step": 37, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 381.3500061035156, "epoch": 0.0007222275016630238, "grad_norm": 2.1015139286583184, "kl": 0.01080322265625, "learning_rate": 9.999987129731372e-07, "loss": 0.0004, "reward": 1.7929672002792358, "reward_std": 0.17871348559856415, "rewards/accuracy_reward": 0.7979673743247986, "rewards/format_reward": 0.875, "step": 38, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 346.8999938964844, "epoch": 0.0007412334885488929, "grad_norm": 2.133894867879289, "kl": 0.0126953125, "learning_rate": 9.999986443436198e-07, "loss": 0.0005, "reward": 1.6820032596588135, "reward_std": 0.21932058036327362, "rewards/accuracy_reward": 0.5907532572746277, "rewards/format_reward": 0.949999988079071, "step": 39, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 393.2749938964844, "epoch": 0.000760239475434762, "grad_norm": 5.01450145069222, "kl": 0.0133056640625, "learning_rate": 9.999985739315202e-07, "loss": 0.0005, "reward": 1.6507374048233032, "reward_std": 0.3641747832298279, "rewards/accuracy_reward": 0.6694874167442322, "rewards/format_reward": 0.875, "step": 40, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 343.82501220703125, "epoch": 0.000779245462320631, "grad_norm": 2.2408385652997183, "kl": 0.01483154296875, "learning_rate": 9.999985017368396e-07, "loss": 0.0006, "reward": 1.7862499952316284, "reward_std": 0.3176189959049225, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 41, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 272.75, "epoch": 0.0007982514492065, "grad_norm": 2.690457314397398, "kl": 0.018310546875, "learning_rate": 9.999984277595777e-07, "loss": 0.0007, "reward": 1.8350000381469727, "reward_std": 0.37591132521629333, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 42, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 296.1499938964844, "epoch": 0.0008172574360923691, "grad_norm": 2.4773046353194883, "kl": 0.0228271484375, "learning_rate": 9.99998351999735e-07, "loss": 0.0009, "reward": 1.8862498998641968, "reward_std": 0.22204652428627014, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 43, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 374.2749938964844, "epoch": 0.0008362634229782381, "grad_norm": 2.023591117900132, "kl": 0.0155029296875, "learning_rate": 9.999982744573119e-07, "loss": 0.0006, "reward": 1.4919394254684448, "reward_std": 0.2940160036087036, "rewards/accuracy_reward": 0.590689480304718, "rewards/format_reward": 0.9000000357627869, "step": 44, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.6, "completion_length": 354.8999938964844, "epoch": 0.0008552694098641071, "grad_norm": 1.4699723957467639, "kl": 0.0181884765625, "learning_rate": 9.999981951323081e-07, "loss": 0.0007, "reward": 1.3650000095367432, "reward_std": 0.24048756062984467, "rewards/accuracy_reward": 0.3499999940395355, "rewards/format_reward": 0.925000011920929, "step": 45, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 368.5, "epoch": 0.0008742753967499763, "grad_norm": 2.056801555870482, "kl": 0.01324462890625, "learning_rate": 9.999981140247246e-07, "loss": 0.0005, "reward": 1.7650002241134644, "reward_std": 0.35767483711242676, "rewards/accuracy_reward": 0.6462500095367432, "rewards/format_reward": 1.0, "step": 46, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 288.4750061035156, "epoch": 0.0008932813836358453, "grad_norm": 2.983353682348288, "kl": 0.0205078125, "learning_rate": 9.999980311345615e-07, "loss": 0.0008, "reward": 1.6775000095367432, "reward_std": 0.5823832750320435, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 47, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 428.1499938964844, "epoch": 0.0009122873705217143, "grad_norm": 1.6983275803418565, "kl": 0.01104736328125, "learning_rate": 9.999979464618186e-07, "loss": 0.0004, "reward": 1.7168115377426147, "reward_std": 0.23992919921875, "rewards/accuracy_reward": 0.6680614948272705, "rewards/format_reward": 0.9750000238418579, "step": 48, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 357.125, "epoch": 0.0009312933574075834, "grad_norm": 1.5213134378521853, "kl": 0.01953125, "learning_rate": 9.99997860006497e-07, "loss": 0.0008, "reward": 1.7687500715255737, "reward_std": 0.2421073466539383, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.9750000238418579, "step": 49, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 328.95001220703125, "epoch": 0.0009502993442934524, "grad_norm": 2.390613929329034, "kl": 0.020751953125, "learning_rate": 9.999977717685962e-07, "loss": 0.0008, "reward": 1.6322113275527954, "reward_std": 0.42426714301109314, "rewards/accuracy_reward": 0.5122115015983582, "rewards/format_reward": 1.0, "step": 50, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 385.6000061035156, "epoch": 0.0009693053311793214, "grad_norm": 1.5275343267282913, "kl": 0.0181884765625, "learning_rate": 9.99997681748117e-07, "loss": 0.0007, "reward": 1.912500023841858, "reward_std": 0.4116156995296478, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 51, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 332.1750183105469, "epoch": 0.0009883113180651905, "grad_norm": 1.4369320573249387, "kl": 0.0096435546875, "learning_rate": 9.999975899450599e-07, "loss": 0.0004, "reward": 1.9117647409439087, "reward_std": 0.206694558262825, "rewards/accuracy_reward": 0.8617647290229797, "rewards/format_reward": 1.0, "step": 52, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 328.5249938964844, "epoch": 0.0010073173049510596, "grad_norm": 1.8136528430577885, "kl": 0.0179443359375, "learning_rate": 9.999974963594246e-07, "loss": 0.0007, "reward": 1.6705681085586548, "reward_std": 0.23624132573604584, "rewards/accuracy_reward": 0.6318181753158569, "rewards/format_reward": 1.0, "step": 53, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 420.6750183105469, "epoch": 0.0010263232918369285, "grad_norm": 4.1499151215157, "kl": 0.016845703125, "learning_rate": 9.99997400991212e-07, "loss": 0.0007, "reward": 1.2430310249328613, "reward_std": 0.479884535074234, "rewards/accuracy_reward": 0.2567810118198395, "rewards/format_reward": 0.9750000238418579, "step": 54, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 412.9250183105469, "epoch": 0.0010453292787227977, "grad_norm": 1.773815811249763, "kl": 0.01495361328125, "learning_rate": 9.999973038404219e-07, "loss": 0.0006, "reward": 1.3813902139663696, "reward_std": 0.24255304038524628, "rewards/accuracy_reward": 0.5513902306556702, "rewards/format_reward": 0.800000011920929, "step": 55, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 389.6000061035156, "epoch": 0.0010643352656086668, "grad_norm": 2.5237128428586373, "kl": 0.0191650390625, "learning_rate": 9.999972049070554e-07, "loss": 0.0008, "reward": 1.743749976158142, "reward_std": 0.32777532935142517, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.925000011920929, "step": 56, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 314.07501220703125, "epoch": 0.0010833412524945357, "grad_norm": 2.826625934788473, "kl": 0.0262451171875, "learning_rate": 9.99997104191112e-07, "loss": 0.001, "reward": 2.0825002193450928, "reward_std": 0.2573499381542206, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 57, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 422.70001220703125, "epoch": 0.0011023472393804049, "grad_norm": 2.21783790000557, "kl": 0.0223388671875, "learning_rate": 9.999970016925928e-07, "loss": 0.0009, "reward": 1.3523801565170288, "reward_std": 0.24788342416286469, "rewards/accuracy_reward": 0.46488019824028015, "rewards/format_reward": 0.800000011920929, "step": 58, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 387.07501220703125, "epoch": 0.0011213532262662738, "grad_norm": 1.711331367857792, "kl": 0.018798828125, "learning_rate": 9.999968974114975e-07, "loss": 0.0008, "reward": 1.3595693111419678, "reward_std": 0.18932317197322845, "rewards/accuracy_reward": 0.35081931948661804, "rewards/format_reward": 1.0, "step": 59, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 376.82501220703125, "epoch": 0.001140359213152143, "grad_norm": 1.4756213822670716, "kl": 0.01470947265625, "learning_rate": 9.999967913478272e-07, "loss": 0.0006, "reward": 1.5280908346176147, "reward_std": 0.20485416054725647, "rewards/accuracy_reward": 0.5118408203125, "rewards/format_reward": 1.0, "step": 60, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 374.8000183105469, "epoch": 0.001159365200038012, "grad_norm": 1.903452860609033, "kl": 0.02685546875, "learning_rate": 9.999966835015817e-07, "loss": 0.0011, "reward": 1.7850000858306885, "reward_std": 0.024494878947734833, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.800000011920929, "step": 61, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 380.5500183105469, "epoch": 0.001178371186923881, "grad_norm": 1.6696030822325638, "kl": 0.0203857421875, "learning_rate": 9.999965738727617e-07, "loss": 0.0008, "reward": 1.730209231376648, "reward_std": 0.16135059297084808, "rewards/accuracy_reward": 0.5977091193199158, "rewards/format_reward": 1.0, "step": 62, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 380.07501220703125, "epoch": 0.0011973771738097501, "grad_norm": 1.8256827508939328, "kl": 0.01495361328125, "learning_rate": 9.999964624613672e-07, "loss": 0.0006, "reward": 1.389145016670227, "reward_std": 0.1907978653907776, "rewards/accuracy_reward": 0.42789506912231445, "rewards/format_reward": 0.949999988079071, "step": 63, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 364.25, "epoch": 0.001216383160695619, "grad_norm": 3.37428251166175, "kl": 0.0244140625, "learning_rate": 9.999963492673991e-07, "loss": 0.001, "reward": 1.7062500715255737, "reward_std": 0.3639249801635742, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 64, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 343.6000061035156, "epoch": 0.0012353891475814882, "grad_norm": 27.10310867256576, "kl": 0.016357421875, "learning_rate": 9.999962342908576e-07, "loss": 0.0007, "reward": 1.545454978942871, "reward_std": 0.2258632928133011, "rewards/accuracy_reward": 0.5217050313949585, "rewards/format_reward": 1.0, "step": 65, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 426.3000183105469, "epoch": 0.0012543951344673571, "grad_norm": 1.834166770400696, "kl": 0.018310546875, "learning_rate": 9.999961175317429e-07, "loss": 0.0007, "reward": 1.2920359373092651, "reward_std": 0.3662143349647522, "rewards/accuracy_reward": 0.36703595519065857, "rewards/format_reward": 0.949999988079071, "step": 66, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.8500061035156, "epoch": 0.0012734011213532263, "grad_norm": 1.7231419925264362, "kl": 0.0250244140625, "learning_rate": 9.999959989900558e-07, "loss": 0.001, "reward": 1.4859206676483154, "reward_std": 0.5184847712516785, "rewards/accuracy_reward": 0.3896706998348236, "rewards/format_reward": 0.9750000238418579, "step": 67, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 324.125, "epoch": 0.0012924071082390954, "grad_norm": 2.0570183466608007, "kl": 0.02734375, "learning_rate": 9.999958786657963e-07, "loss": 0.0011, "reward": 2.1500000953674316, "reward_std": 0.09946426004171371, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 68, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 332.6750183105469, "epoch": 0.0013114130951249643, "grad_norm": 1.7805400018342927, "kl": 0.0286865234375, "learning_rate": 9.999957565589651e-07, "loss": 0.0011, "reward": 2.137500047683716, "reward_std": 0.1191316768527031, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 69, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 345.8500061035156, "epoch": 0.0013304190820108335, "grad_norm": 1.7616052253473493, "kl": 0.0262451171875, "learning_rate": 9.999956326695626e-07, "loss": 0.0011, "reward": 1.6295082569122314, "reward_std": 0.061908524483442307, "rewards/accuracy_reward": 0.5357584357261658, "rewards/format_reward": 1.0, "step": 70, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 433.6499938964844, "epoch": 0.0013494250688967024, "grad_norm": 6.06299305105756, "kl": 0.02197265625, "learning_rate": 9.999955069975894e-07, "loss": 0.0009, "reward": 1.840000033378601, "reward_std": 0.4016962945461273, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.949999988079071, "step": 71, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 457.3000183105469, "epoch": 0.0013684310557825715, "grad_norm": 3.8644206596649666, "kl": 0.0150146484375, "learning_rate": 9.999953795430456e-07, "loss": 0.0006, "reward": 1.4176387786865234, "reward_std": 0.5008097290992737, "rewards/accuracy_reward": 0.5276389122009277, "rewards/format_reward": 0.875, "step": 72, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 381.5249938964844, "epoch": 0.0013874370426684407, "grad_norm": 1.8633577290688386, "kl": 0.02197265625, "learning_rate": 9.999952503059319e-07, "loss": 0.0009, "reward": 1.5517667531967163, "reward_std": 0.2887406051158905, "rewards/accuracy_reward": 0.4705166816711426, "rewards/format_reward": 1.0, "step": 73, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 408.4750061035156, "epoch": 0.0014064430295543096, "grad_norm": 22.293175208669798, "kl": 0.02685546875, "learning_rate": 9.999951192862486e-07, "loss": 0.0011, "reward": 1.863899827003479, "reward_std": 0.17262814939022064, "rewards/accuracy_reward": 0.7163999080657959, "rewards/format_reward": 1.0, "step": 74, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 427.625, "epoch": 0.0014254490164401787, "grad_norm": 1.8972113246695448, "kl": 0.022216796875, "learning_rate": 9.999949864839963e-07, "loss": 0.0009, "reward": 1.5265105962753296, "reward_std": 0.3422698676586151, "rewards/accuracy_reward": 0.4840105175971985, "rewards/format_reward": 1.0, "step": 75, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 475.4250183105469, "epoch": 0.0014444550033260476, "grad_norm": 1.7901602657202653, "kl": 0.016357421875, "learning_rate": 9.999948518991755e-07, "loss": 0.0007, "reward": 1.3103300333023071, "reward_std": 0.44092264771461487, "rewards/accuracy_reward": 0.5003300309181213, "rewards/format_reward": 0.824999988079071, "step": 76, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 430.3999938964844, "epoch": 0.0014634609902119168, "grad_norm": 1.7444814779855429, "kl": 0.0196533203125, "learning_rate": 9.999947155317865e-07, "loss": 0.0008, "reward": 1.6164630651474, "reward_std": 0.319307804107666, "rewards/accuracy_reward": 0.5464630126953125, "rewards/format_reward": 0.949999988079071, "step": 77, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 420.1000061035156, "epoch": 0.0014824669770977857, "grad_norm": 1.972675507869758, "kl": 0.027099609375, "learning_rate": 9.999945773818298e-07, "loss": 0.0011, "reward": 1.7337499856948853, "reward_std": 0.47605353593826294, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 78, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 406.2250061035156, "epoch": 0.0015014729639836548, "grad_norm": 2.177804655688531, "kl": 0.0267333984375, "learning_rate": 9.99994437449306e-07, "loss": 0.0011, "reward": 1.8521394729614258, "reward_std": 0.324115127325058, "rewards/accuracy_reward": 0.6583895087242126, "rewards/format_reward": 0.949999988079071, "step": 79, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 299.9750061035156, "epoch": 0.001520478950869524, "grad_norm": 2.7777346773266824, "kl": 0.02880859375, "learning_rate": 9.999942957342157e-07, "loss": 0.0012, "reward": 1.299134612083435, "reward_std": 0.29816532135009766, "rewards/accuracy_reward": 0.26538464426994324, "rewards/format_reward": 1.0, "step": 80, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 410.6499938964844, "epoch": 0.001539484937755393, "grad_norm": 1.7769340762731953, "kl": 0.0306396484375, "learning_rate": 9.999941522365595e-07, "loss": 0.0012, "reward": 1.3981298208236694, "reward_std": 0.22073152661323547, "rewards/accuracy_reward": 0.4568799138069153, "rewards/format_reward": 0.824999988079071, "step": 81, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 409.6750183105469, "epoch": 0.001558490924641262, "grad_norm": 1.5432801343755391, "kl": 0.0194091796875, "learning_rate": 9.999940069563375e-07, "loss": 0.0008, "reward": 1.4534090757369995, "reward_std": 0.09737285226583481, "rewards/accuracy_reward": 0.4284090995788574, "rewards/format_reward": 1.0, "step": 82, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 420.7749938964844, "epoch": 0.001577496911527131, "grad_norm": 3.0572783343126035, "kl": 0.0218505859375, "learning_rate": 9.999938598935503e-07, "loss": 0.0009, "reward": 1.3602596521377563, "reward_std": 0.40853962302207947, "rewards/accuracy_reward": 0.5077596306800842, "rewards/format_reward": 0.824999988079071, "step": 83, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 402.6000061035156, "epoch": 0.001596502898413, "grad_norm": 2.17268441438714, "kl": 0.0322265625, "learning_rate": 9.999937110481986e-07, "loss": 0.0013, "reward": 1.5801323652267456, "reward_std": 0.20872293412685394, "rewards/accuracy_reward": 0.7001323103904724, "rewards/format_reward": 0.824999988079071, "step": 84, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 393.1750183105469, "epoch": 0.0016155088852988692, "grad_norm": 2.251746293822135, "kl": 0.0308837890625, "learning_rate": 9.99993560420283e-07, "loss": 0.0012, "reward": 2.0899999141693115, "reward_std": 0.14470690488815308, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 85, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 348.6750183105469, "epoch": 0.0016345148721847382, "grad_norm": 4.678195652996588, "kl": 0.03564453125, "learning_rate": 9.999934080098037e-07, "loss": 0.0014, "reward": 1.906760811805725, "reward_std": 0.17761114239692688, "rewards/accuracy_reward": 0.7080109119415283, "rewards/format_reward": 1.0, "step": 86, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 388.4750061035156, "epoch": 0.0016535208590706073, "grad_norm": 5.109596701914393, "kl": 0.022216796875, "learning_rate": 9.999932538167616e-07, "loss": 0.0009, "reward": 1.5554808378219604, "reward_std": 0.19344615936279297, "rewards/accuracy_reward": 0.4942307472229004, "rewards/format_reward": 1.0, "step": 87, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 455.7749938964844, "epoch": 0.0016725268459564762, "grad_norm": 1.7715382719224089, "kl": 0.0238037109375, "learning_rate": 9.999930978411573e-07, "loss": 0.0009, "reward": 1.2448269128799438, "reward_std": 0.3760250210762024, "rewards/accuracy_reward": 0.34857693314552307, "rewards/format_reward": 0.8500000238418579, "step": 88, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 417.8999938964844, "epoch": 0.0016915328328423454, "grad_norm": 2.718023008251072, "kl": 0.028564453125, "learning_rate": 9.99992940082991e-07, "loss": 0.0011, "reward": 1.9399998188018799, "reward_std": 0.3909613788127899, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.9000000357627869, "step": 89, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 406.9750061035156, "epoch": 0.0017105388197282143, "grad_norm": 1.3912795718135529, "kl": 0.0234375, "learning_rate": 9.999927805422633e-07, "loss": 0.0009, "reward": 1.7662500143051147, "reward_std": 0.19376306235790253, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 90, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 421.875, "epoch": 0.0017295448066140834, "grad_norm": 1.6675486147647514, "kl": 0.0201416015625, "learning_rate": 9.99992619218975e-07, "loss": 0.0008, "reward": 1.8392499685287476, "reward_std": 0.2976999282836914, "rewards/accuracy_reward": 0.7480000853538513, "rewards/format_reward": 1.0, "step": 91, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 422.57501220703125, "epoch": 0.0017485507934999526, "grad_norm": 2.3307118152368393, "kl": 0.0225830078125, "learning_rate": 9.999924561131264e-07, "loss": 0.0009, "reward": 1.7600001096725464, "reward_std": 0.3509061932563782, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 92, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 495.375, "epoch": 0.0017675567803858215, "grad_norm": 3.649647628545239, "kl": 0.0205078125, "learning_rate": 9.999922912247185e-07, "loss": 0.0008, "reward": 1.1277230978012085, "reward_std": 0.5832155346870422, "rewards/accuracy_reward": 0.308973103761673, "rewards/format_reward": 0.824999988079071, "step": 93, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 470.9750061035156, "epoch": 0.0017865627672716906, "grad_norm": 2.4778149390422177, "kl": 0.0206298828125, "learning_rate": 9.999921245537516e-07, "loss": 0.0008, "reward": 1.4761621952056885, "reward_std": 0.29485660791397095, "rewards/accuracy_reward": 0.5574120879173279, "rewards/format_reward": 0.875, "step": 94, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 385.8500061035156, "epoch": 0.0018055687541575596, "grad_norm": 2.01354489856464, "kl": 0.0299072265625, "learning_rate": 9.999919561002262e-07, "loss": 0.0012, "reward": 1.6849998235702515, "reward_std": 0.3496052920818329, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 95, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 423.57501220703125, "epoch": 0.0018245747410434287, "grad_norm": 1.6843216852448375, "kl": 0.0194091796875, "learning_rate": 9.999917858641431e-07, "loss": 0.0008, "reward": 1.464853048324585, "reward_std": 0.305313378572464, "rewards/accuracy_reward": 0.5086029767990112, "rewards/format_reward": 0.925000011920929, "step": 96, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 370.3500061035156, "epoch": 0.0018435807279292978, "grad_norm": 1.7691210684176437, "kl": 0.029296875, "learning_rate": 9.999916138455027e-07, "loss": 0.0012, "reward": 1.7887500524520874, "reward_std": 0.20897598564624786, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 97, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 395.4250183105469, "epoch": 0.0018625867148151668, "grad_norm": 1.6729182059362162, "kl": 0.0311279296875, "learning_rate": 9.99991440044306e-07, "loss": 0.0012, "reward": 1.3479642868041992, "reward_std": 0.25888824462890625, "rewards/accuracy_reward": 0.3267143666744232, "rewards/format_reward": 0.9750000238418579, "step": 98, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 368.9750061035156, "epoch": 0.001881592701701036, "grad_norm": 1.6641565139502756, "kl": 0.03564453125, "learning_rate": 9.999912644605532e-07, "loss": 0.0014, "reward": 1.9850000143051147, "reward_std": 0.5545549392700195, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 99, "temporal_rewards": 1.0 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 386.6499938964844, "epoch": 0.0019005986885869048, "grad_norm": 1.661428182116276, "kl": 0.029052734375, "learning_rate": 9.999910870942452e-07, "loss": 0.0012, "reward": 1.962499976158142, "reward_std": 0.37154579162597656, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 100, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 383.3500061035156, "epoch": 0.001919604675472774, "grad_norm": 1.9380789848090383, "kl": 0.03515625, "learning_rate": 9.999909079453825e-07, "loss": 0.0014, "reward": 1.7787500619888306, "reward_std": 0.37119224667549133, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 101, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 365.0, "epoch": 0.0019386106623586429, "grad_norm": 1.8562862398595406, "kl": 0.031005859375, "learning_rate": 9.999907270139655e-07, "loss": 0.0012, "reward": 1.8941665887832642, "reward_std": 0.15514008700847626, "rewards/accuracy_reward": 0.7266666293144226, "rewards/format_reward": 1.0, "step": 102, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.375, "epoch": 0.001957616649244512, "grad_norm": 1.311548261470929, "kl": 0.0113525390625, "learning_rate": 9.999905442999955e-07, "loss": 0.0005, "reward": 1.5067170858383179, "reward_std": 0.4453655183315277, "rewards/accuracy_reward": 0.4992171823978424, "rewards/format_reward": 1.0, "step": 103, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.4250183105469, "epoch": 0.001976622636130381, "grad_norm": 1.831309505314721, "kl": 0.025390625, "learning_rate": 9.999903598034726e-07, "loss": 0.001, "reward": 1.6432164907455444, "reward_std": 0.38924336433410645, "rewards/accuracy_reward": 0.640716552734375, "rewards/format_reward": 0.925000011920929, "step": 104, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 435.6000061035156, "epoch": 0.0019956286230162503, "grad_norm": 2.3635153149852437, "kl": 0.0206298828125, "learning_rate": 9.999901735243977e-07, "loss": 0.0008, "reward": 1.3154586553573608, "reward_std": 0.29778310656547546, "rewards/accuracy_reward": 0.45920857787132263, "rewards/format_reward": 0.824999988079071, "step": 105, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 477.5249938964844, "epoch": 0.002014634609902119, "grad_norm": 1.958280285952236, "kl": 0.0211181640625, "learning_rate": 9.999899854627713e-07, "loss": 0.0008, "reward": 1.4671143293380737, "reward_std": 0.1926993429660797, "rewards/accuracy_reward": 0.6296143531799316, "rewards/format_reward": 0.800000011920929, "step": 106, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 447.4250183105469, "epoch": 0.002033640596787988, "grad_norm": 1.870154208847281, "kl": 0.021240234375, "learning_rate": 9.999897956185942e-07, "loss": 0.0008, "reward": 1.3505624532699585, "reward_std": 0.4228193461894989, "rewards/accuracy_reward": 0.3843124806880951, "rewards/format_reward": 0.925000011920929, "step": 107, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 455.6750183105469, "epoch": 0.002052646583673857, "grad_norm": 1.74426244908067, "kl": 0.0198974609375, "learning_rate": 9.999896039918671e-07, "loss": 0.0008, "reward": 1.4027106761932373, "reward_std": 0.49836865067481995, "rewards/accuracy_reward": 0.4889605641365051, "rewards/format_reward": 0.9000000357627869, "step": 108, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 447.875, "epoch": 0.0020716525705597264, "grad_norm": 1.4266038092185571, "kl": 0.0252685546875, "learning_rate": 9.999894105825904e-07, "loss": 0.001, "reward": 1.9647228717803955, "reward_std": 0.23401236534118652, "rewards/accuracy_reward": 0.9209728240966797, "rewards/format_reward": 0.9750000238418579, "step": 109, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 440.45001220703125, "epoch": 0.0020906585574455953, "grad_norm": 1.6661206643921196, "kl": 0.0244140625, "learning_rate": 9.999892153907652e-07, "loss": 0.001, "reward": 1.7705137729644775, "reward_std": 0.24715518951416016, "rewards/accuracy_reward": 0.7055138349533081, "rewards/format_reward": 0.949999988079071, "step": 110, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 406.4750061035156, "epoch": 0.0021096645443314643, "grad_norm": 2.858453558713371, "kl": 0.033935546875, "learning_rate": 9.99989018416392e-07, "loss": 0.0014, "reward": 1.6805261373519897, "reward_std": 0.35290464758872986, "rewards/accuracy_reward": 0.5055261850357056, "rewards/format_reward": 1.0, "step": 111, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.375, "epoch": 0.0021286705312173336, "grad_norm": 1.7454017233232972, "kl": 0.033935546875, "learning_rate": 9.999888196594714e-07, "loss": 0.0014, "reward": 1.8360843658447266, "reward_std": 0.2726055085659027, "rewards/accuracy_reward": 0.7735845446586609, "rewards/format_reward": 0.875, "step": 112, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 353.0, "epoch": 0.0021476765181032025, "grad_norm": 2.4722137217847258, "kl": 0.03466796875, "learning_rate": 9.999886191200043e-07, "loss": 0.0014, "reward": 1.5362499952316284, "reward_std": 0.3735165297985077, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 113, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 521.25, "epoch": 0.0021666825049890715, "grad_norm": 1.6236430715967438, "kl": 0.02197265625, "learning_rate": 9.999884167979913e-07, "loss": 0.0009, "reward": 1.2192209959030151, "reward_std": 0.41181594133377075, "rewards/accuracy_reward": 0.4542209804058075, "rewards/format_reward": 0.7250000238418579, "step": 114, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 357.57501220703125, "epoch": 0.002185688491874941, "grad_norm": 2.284632892835743, "kl": 0.041015625, "learning_rate": 9.999882126934332e-07, "loss": 0.0016, "reward": 2.074758291244507, "reward_std": 0.14666441082954407, "rewards/accuracy_reward": 0.8460081219673157, "rewards/format_reward": 1.0, "step": 115, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 389.20001220703125, "epoch": 0.0022046944787608097, "grad_norm": 3.16351175778279, "kl": 0.024169921875, "learning_rate": 9.999880068063305e-07, "loss": 0.001, "reward": 1.2041759490966797, "reward_std": 0.24465619027614594, "rewards/accuracy_reward": 0.2766759991645813, "rewards/format_reward": 0.925000011920929, "step": 116, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 391.5500183105469, "epoch": 0.0022237004656466787, "grad_norm": 2.0304763663719276, "kl": 0.03369140625, "learning_rate": 9.999877991366843e-07, "loss": 0.0014, "reward": 1.879212737083435, "reward_std": 0.2156396210193634, "rewards/accuracy_reward": 0.6679627895355225, "rewards/format_reward": 1.0, "step": 117, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 465.2250061035156, "epoch": 0.0022427064525325476, "grad_norm": 1.6546801011563235, "kl": 0.0203857421875, "learning_rate": 9.99987589684495e-07, "loss": 0.0008, "reward": 1.3929481506347656, "reward_std": 0.28699302673339844, "rewards/accuracy_reward": 0.5216981768608093, "rewards/format_reward": 0.8500000238418579, "step": 118, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 405.7749938964844, "epoch": 0.002261712439418417, "grad_norm": 1.7681545304480941, "kl": 0.0306396484375, "learning_rate": 9.999873784497636e-07, "loss": 0.0012, "reward": 1.6875, "reward_std": 0.29129740595817566, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.9750000238418579, "step": 119, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 517.0, "epoch": 0.002280718426304286, "grad_norm": 2.9999589200376198, "kl": 0.02294921875, "learning_rate": 9.999871654324907e-07, "loss": 0.0009, "reward": 1.3318989276885986, "reward_std": 0.5621644854545593, "rewards/accuracy_reward": 0.4856489598751068, "rewards/format_reward": 0.7750000357627869, "step": 120, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 408.57501220703125, "epoch": 0.002299724413190155, "grad_norm": 1.757897593859272, "kl": 0.0361328125, "learning_rate": 9.999869506326773e-07, "loss": 0.0014, "reward": 1.6224998235702515, "reward_std": 0.32296404242515564, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 121, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 361.375, "epoch": 0.002318730400076024, "grad_norm": 1.8040697104095194, "kl": 0.030517578125, "learning_rate": 9.999867340503238e-07, "loss": 0.0012, "reward": 1.5762500762939453, "reward_std": 0.2644350230693817, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 122, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 394.5249938964844, "epoch": 0.002337736386961893, "grad_norm": 2.125098480736845, "kl": 0.04052734375, "learning_rate": 9.999865156854312e-07, "loss": 0.0016, "reward": 1.9887498617172241, "reward_std": 0.09045388549566269, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 123, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 372.9250183105469, "epoch": 0.002356742373847762, "grad_norm": 3.6327000620063026, "kl": 0.023681640625, "learning_rate": 9.99986295538e-07, "loss": 0.001, "reward": 1.830775499343872, "reward_std": 0.08539687097072601, "rewards/accuracy_reward": 0.785775363445282, "rewards/format_reward": 1.0, "step": 124, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 409.4750061035156, "epoch": 0.002375748360733631, "grad_norm": 1.7642240926650516, "kl": 0.033935546875, "learning_rate": 9.999860736080315e-07, "loss": 0.0014, "reward": 1.7674999237060547, "reward_std": 0.4329659938812256, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.9750000238418579, "step": 125, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 439.20001220703125, "epoch": 0.0023947543476195003, "grad_norm": 1.5360351515377255, "kl": 0.026123046875, "learning_rate": 9.999858498955262e-07, "loss": 0.001, "reward": 1.462499976158142, "reward_std": 0.2535651624202728, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 0.949999988079071, "step": 126, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 442.1000061035156, "epoch": 0.002413760334505369, "grad_norm": 2.3843788079336665, "kl": 0.02587890625, "learning_rate": 9.999856244004847e-07, "loss": 0.001, "reward": 1.4300276041030884, "reward_std": 0.3910476267337799, "rewards/accuracy_reward": 0.47002753615379333, "rewards/format_reward": 0.875, "step": 127, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 401.8999938964844, "epoch": 0.002432766321391238, "grad_norm": 2.056022106306336, "kl": 0.0286865234375, "learning_rate": 9.999853971229081e-07, "loss": 0.0011, "reward": 1.4300159215927124, "reward_std": 0.25307053327560425, "rewards/accuracy_reward": 0.3525159955024719, "rewards/format_reward": 1.0, "step": 128, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 430.5500183105469, "epoch": 0.0024517723082771075, "grad_norm": 1.198429674553318, "kl": 0.02734375, "learning_rate": 9.999851680627973e-07, "loss": 0.0011, "reward": 1.4500000476837158, "reward_std": 0.4522148072719574, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.925000011920929, "step": 129, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 347.6750183105469, "epoch": 0.0024707782951629764, "grad_norm": 1.9389868981099811, "kl": 0.032958984375, "learning_rate": 9.999849372201528e-07, "loss": 0.0013, "reward": 1.8525002002716064, "reward_std": 0.2627958655357361, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 130, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 405.5249938964844, "epoch": 0.0024897842820488453, "grad_norm": 1.875951052181491, "kl": 0.0303955078125, "learning_rate": 9.999847045949754e-07, "loss": 0.0012, "reward": 1.8334811925888062, "reward_std": 0.22262628376483917, "rewards/accuracy_reward": 0.7034812569618225, "rewards/format_reward": 1.0, "step": 131, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 443.95001220703125, "epoch": 0.0025087902689347142, "grad_norm": 1.6541187018826684, "kl": 0.02099609375, "learning_rate": 9.999844701872662e-07, "loss": 0.0008, "reward": 1.7012500762939453, "reward_std": 0.3903408646583557, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.9750000238418579, "step": 132, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 431.8999938964844, "epoch": 0.0025277962558205836, "grad_norm": 2.2437243545249372, "kl": 0.024658203125, "learning_rate": 9.99984233997026e-07, "loss": 0.001, "reward": 1.2993817329406738, "reward_std": 0.41282787919044495, "rewards/accuracy_reward": 0.3556317985057831, "rewards/format_reward": 0.9000000357627869, "step": 133, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 421.95001220703125, "epoch": 0.0025468022427064525, "grad_norm": 1.9312409773493668, "kl": 0.03759765625, "learning_rate": 9.999839960242553e-07, "loss": 0.0015, "reward": 2.2100000381469727, "reward_std": 0.17241929471492767, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 0.9750000238418579, "step": 134, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 375.3000183105469, "epoch": 0.0025658082295923214, "grad_norm": 1.7486348360473247, "kl": 0.03955078125, "learning_rate": 9.999837562689555e-07, "loss": 0.0016, "reward": 2.102916717529297, "reward_std": 0.26096248626708984, "rewards/accuracy_reward": 0.8466667532920837, "rewards/format_reward": 1.0, "step": 135, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 469.70001220703125, "epoch": 0.002584814216478191, "grad_norm": 1.3447144107629372, "kl": 0.020751953125, "learning_rate": 9.999835147311272e-07, "loss": 0.0008, "reward": 1.6257904767990112, "reward_std": 0.25289788842201233, "rewards/accuracy_reward": 0.6632905006408691, "rewards/format_reward": 0.925000011920929, "step": 136, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 442.1750183105469, "epoch": 0.0026038202033640597, "grad_norm": 1.5618984064917965, "kl": 0.0252685546875, "learning_rate": 9.99983271410771e-07, "loss": 0.001, "reward": 1.5629686117172241, "reward_std": 0.4603721797466278, "rewards/accuracy_reward": 0.5992185473442078, "rewards/format_reward": 0.925000011920929, "step": 137, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 416.9250183105469, "epoch": 0.0026228261902499286, "grad_norm": 2.1516783669296142, "kl": 0.03369140625, "learning_rate": 9.99983026307888e-07, "loss": 0.0013, "reward": 1.840000033378601, "reward_std": 0.12689465284347534, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.8500000238418579, "step": 138, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 387.6000061035156, "epoch": 0.002641832177135798, "grad_norm": 1.594655353605103, "kl": 0.0439453125, "learning_rate": 9.999827794224791e-07, "loss": 0.0018, "reward": 1.7712500095367432, "reward_std": 0.4626477360725403, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.9750000238418579, "step": 139, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 391.57501220703125, "epoch": 0.002660838164021667, "grad_norm": 1.8211123826179678, "kl": 0.0322265625, "learning_rate": 9.999825307545453e-07, "loss": 0.0013, "reward": 1.7787498235702515, "reward_std": 0.08908182382583618, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 140, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 453.875, "epoch": 0.002679844150907536, "grad_norm": 1.4377069826682358, "kl": 0.03125, "learning_rate": 9.999822803040872e-07, "loss": 0.0013, "reward": 1.7375000715255737, "reward_std": 0.19416609406471252, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.9750000238418579, "step": 141, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.4750061035156, "epoch": 0.0026988501377934048, "grad_norm": 1.5245498518911158, "kl": 0.0294189453125, "learning_rate": 9.99982028071106e-07, "loss": 0.0012, "reward": 2.0346338748931885, "reward_std": 0.3204995095729828, "rewards/accuracy_reward": 0.8821339011192322, "rewards/format_reward": 0.9750000238418579, "step": 142, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.002717856124679274, "grad_norm": 1.488686670355777, "kl": 0.0311279296875, "learning_rate": 9.999817740556023e-07, "loss": 0.0012, "reward": 1.7937500476837158, "reward_std": 0.15125273168087006, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 143, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 471.75, "epoch": 0.002736862111565143, "grad_norm": 1.8671895767371864, "kl": 0.0284423828125, "learning_rate": 9.99981518257577e-07, "loss": 0.0011, "reward": 1.7274999618530273, "reward_std": 0.3801653981208801, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.949999988079071, "step": 144, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 434.6750183105469, "epoch": 0.002755868098451012, "grad_norm": 1.421053342895394, "kl": 0.02587890625, "learning_rate": 9.999812606770313e-07, "loss": 0.001, "reward": 1.4275000095367432, "reward_std": 0.2674916982650757, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.824999988079071, "step": 145, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 424.875, "epoch": 0.0027748740853368813, "grad_norm": 1.4913815773458112, "kl": 0.0303955078125, "learning_rate": 9.99981001313966e-07, "loss": 0.0012, "reward": 1.5325000286102295, "reward_std": 0.25276699662208557, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 0.949999988079071, "step": 146, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 461.2250061035156, "epoch": 0.0027938800722227502, "grad_norm": 1.7956360796966173, "kl": 0.0201416015625, "learning_rate": 9.999807401683819e-07, "loss": 0.0008, "reward": 1.584999918937683, "reward_std": 0.3503597676753998, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.949999988079071, "step": 147, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 427.6750183105469, "epoch": 0.002812886059108619, "grad_norm": 2.365073143548544, "kl": 0.03857421875, "learning_rate": 9.9998047724028e-07, "loss": 0.0015, "reward": 1.8576242923736572, "reward_std": 0.1773967295885086, "rewards/accuracy_reward": 0.8163743019104004, "rewards/format_reward": 0.9000000357627869, "step": 148, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 413.2250061035156, "epoch": 0.002831892045994488, "grad_norm": 1.7252276542579688, "kl": 0.0289306640625, "learning_rate": 9.999802125296613e-07, "loss": 0.0012, "reward": 1.8983334302902222, "reward_std": 0.11522179841995239, "rewards/accuracy_reward": 0.8183333277702332, "rewards/format_reward": 1.0, "step": 149, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 403.75, "epoch": 0.0028508980328803574, "grad_norm": 2.0589334279901808, "kl": 0.044677734375, "learning_rate": 9.999799460365267e-07, "loss": 0.0018, "reward": 1.9924999475479126, "reward_std": 0.40971073508262634, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 150, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 447.32501220703125, "epoch": 0.0028699040197662264, "grad_norm": 1.6391078667422532, "kl": 0.03466796875, "learning_rate": 9.99979677760877e-07, "loss": 0.0014, "reward": 1.424102783203125, "reward_std": 0.43598437309265137, "rewards/accuracy_reward": 0.5078528523445129, "rewards/format_reward": 0.925000011920929, "step": 151, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 425.4250183105469, "epoch": 0.0028889100066520953, "grad_norm": 9.396291486368417, "kl": 0.0289306640625, "learning_rate": 9.999794077027135e-07, "loss": 0.0012, "reward": 1.6387499570846558, "reward_std": 0.22128906846046448, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9000000357627869, "step": 152, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 400.6750183105469, "epoch": 0.0029079159935379646, "grad_norm": 1.6113288987892758, "kl": 0.03173828125, "learning_rate": 9.99979135862037e-07, "loss": 0.0013, "reward": 1.7445834875106812, "reward_std": 0.24926964938640594, "rewards/accuracy_reward": 0.6333333253860474, "rewards/format_reward": 1.0, "step": 153, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 438.1750183105469, "epoch": 0.0029269219804238336, "grad_norm": 2.0588162325910186, "kl": 0.0272216796875, "learning_rate": 9.999788622388484e-07, "loss": 0.0011, "reward": 1.7337499856948853, "reward_std": 0.566247820854187, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.949999988079071, "step": 154, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 429.7749938964844, "epoch": 0.0029459279673097025, "grad_norm": 1.8617665853698921, "kl": 0.046630859375, "learning_rate": 9.999785868331486e-07, "loss": 0.0019, "reward": 2.2300000190734863, "reward_std": 0.19540588557720184, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 0.9750000238418579, "step": 155, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.95001220703125, "epoch": 0.0029649339541955714, "grad_norm": 1.719956446797739, "kl": 0.041015625, "learning_rate": 9.999783096449389e-07, "loss": 0.0016, "reward": 1.8305953741073608, "reward_std": 0.3304825723171234, "rewards/accuracy_reward": 0.630595326423645, "rewards/format_reward": 0.9750000238418579, "step": 156, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 422.70001220703125, "epoch": 0.0029839399410814408, "grad_norm": 1.9840915198333549, "kl": 0.039306640625, "learning_rate": 9.9997803067422e-07, "loss": 0.0016, "reward": 2.134999990463257, "reward_std": 0.10920828580856323, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 157, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 458.8999938964844, "epoch": 0.0030029459279673097, "grad_norm": 1.7639869601003102, "kl": 0.03271484375, "learning_rate": 9.999777499209927e-07, "loss": 0.0013, "reward": 1.4048618078231812, "reward_std": 0.48996439576148987, "rewards/accuracy_reward": 0.4323618412017822, "rewards/format_reward": 0.949999988079071, "step": 158, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 389.7250061035156, "epoch": 0.0030219519148531786, "grad_norm": 2.3599515456427547, "kl": 0.039794921875, "learning_rate": 9.999774673852586e-07, "loss": 0.0016, "reward": 1.8200000524520874, "reward_std": 0.3282524049282074, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 159, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 405.7749938964844, "epoch": 0.003040957901739048, "grad_norm": 2.2616609557105343, "kl": 0.036376953125, "learning_rate": 9.999771830670182e-07, "loss": 0.0015, "reward": 2.129999876022339, "reward_std": 0.1709517389535904, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 160, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 415.7749938964844, "epoch": 0.003059963888624917, "grad_norm": 1.7102218764435317, "kl": 0.04248046875, "learning_rate": 9.999768969662727e-07, "loss": 0.0017, "reward": 1.5400406122207642, "reward_std": 0.31520184874534607, "rewards/accuracy_reward": 0.41754060983657837, "rewards/format_reward": 1.0, "step": 161, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 372.125, "epoch": 0.003078969875510786, "grad_norm": 1.8026554065712848, "kl": 0.033447265625, "learning_rate": 9.999766090830233e-07, "loss": 0.0013, "reward": 1.5858334302902222, "reward_std": 0.22057271003723145, "rewards/accuracy_reward": 0.5633333325386047, "rewards/format_reward": 1.0, "step": 162, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 392.20001220703125, "epoch": 0.0030979758623966547, "grad_norm": 2.011151309629896, "kl": 0.040771484375, "learning_rate": 9.999763194172708e-07, "loss": 0.0016, "reward": 2.1462502479553223, "reward_std": 0.03894924744963646, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 163, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 375.6750183105469, "epoch": 0.003116981849282524, "grad_norm": 1.8918527334820754, "kl": 0.046875, "learning_rate": 9.999760279690162e-07, "loss": 0.0019, "reward": 1.9532890319824219, "reward_std": 0.22268572449684143, "rewards/accuracy_reward": 0.7332891821861267, "rewards/format_reward": 1.0, "step": 164, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 405.6499938964844, "epoch": 0.003135987836168393, "grad_norm": 1.3711286517136527, "kl": 0.0289306640625, "learning_rate": 9.999757347382606e-07, "loss": 0.0012, "reward": 1.7587499618530273, "reward_std": 0.2259451448917389, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 165, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 383.07501220703125, "epoch": 0.003154993823054262, "grad_norm": 1.9232649231057724, "kl": 0.04931640625, "learning_rate": 9.99975439725005e-07, "loss": 0.002, "reward": 2.1524999141693115, "reward_std": 0.1306147575378418, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 166, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 418.3500061035156, "epoch": 0.0031739998099401313, "grad_norm": 1.861616322884654, "kl": 0.035888671875, "learning_rate": 9.999751429292506e-07, "loss": 0.0014, "reward": 1.2532682418823242, "reward_std": 0.3139711022377014, "rewards/accuracy_reward": 0.3482682406902313, "rewards/format_reward": 0.925000011920929, "step": 167, "temporal_rewards": 0.29999998211860657 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 473.3999938964844, "epoch": 0.003193005796826, "grad_norm": 2.1140474531654054, "kl": 0.0322265625, "learning_rate": 9.999748443509986e-07, "loss": 0.0013, "reward": 1.4529370069503784, "reward_std": 0.4296736419200897, "rewards/accuracy_reward": 0.5854371190071106, "rewards/format_reward": 0.7750000357627869, "step": 168, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 426.6499938964844, "epoch": 0.003212011783711869, "grad_norm": 2.081056245713128, "kl": 0.045166015625, "learning_rate": 9.999745439902495e-07, "loss": 0.0018, "reward": 1.66265070438385, "reward_std": 0.11052091419696808, "rewards/accuracy_reward": 0.590150773525238, "rewards/format_reward": 1.0, "step": 169, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 391.0500183105469, "epoch": 0.0032310177705977385, "grad_norm": 1.557973829500598, "kl": 0.03173828125, "learning_rate": 9.99974241847005e-07, "loss": 0.0013, "reward": 2.132500171661377, "reward_std": 0.1289599984884262, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 170, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.0032500237574836074, "grad_norm": 1.8045856773772166, "kl": 0.0225830078125, "learning_rate": 9.999739379212658e-07, "loss": 0.0009, "reward": 1.7682842016220093, "reward_std": 0.18975917994976044, "rewards/accuracy_reward": 0.7132843136787415, "rewards/format_reward": 1.0, "step": 171, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 480.7749938964844, "epoch": 0.0032690297443694763, "grad_norm": 1.9547616570315738, "kl": 0.03466796875, "learning_rate": 9.999736322130328e-07, "loss": 0.0014, "reward": 1.6181875467300415, "reward_std": 0.32576438784599304, "rewards/accuracy_reward": 0.5969375371932983, "rewards/format_reward": 0.9000000357627869, "step": 172, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 460.3999938964844, "epoch": 0.0032880357312553453, "grad_norm": 1.334266052271473, "kl": 0.03125, "learning_rate": 9.999733247223077e-07, "loss": 0.0013, "reward": 0.9670197367668152, "reward_std": 0.39288732409477234, "rewards/accuracy_reward": 0.1620197594165802, "rewards/format_reward": 0.8500000238418579, "step": 173, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 486.2749938964844, "epoch": 0.0033070417181412146, "grad_norm": 2.0305263629651344, "kl": 0.0286865234375, "learning_rate": 9.999730154490912e-07, "loss": 0.0011, "reward": 1.2086223363876343, "reward_std": 0.40833497047424316, "rewards/accuracy_reward": 0.3061222732067108, "rewards/format_reward": 0.925000011920929, "step": 174, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 492.625, "epoch": 0.0033260477050270835, "grad_norm": 2.170161481168517, "kl": 0.0286865234375, "learning_rate": 9.999727043933842e-07, "loss": 0.0011, "reward": 1.491103172302246, "reward_std": 0.39911141991615295, "rewards/accuracy_reward": 0.4773530960083008, "rewards/format_reward": 0.9000000357627869, "step": 175, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 464.3000183105469, "epoch": 0.0033450536919129525, "grad_norm": 1.6902429526032825, "kl": 0.03173828125, "learning_rate": 9.999723915551882e-07, "loss": 0.0013, "reward": 1.4843275547027588, "reward_std": 0.3326936662197113, "rewards/accuracy_reward": 0.5693275332450867, "rewards/format_reward": 0.824999988079071, "step": 176, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 436.2749938964844, "epoch": 0.003364059678798822, "grad_norm": 1.7250359682167837, "kl": 0.041748046875, "learning_rate": 9.999720769345044e-07, "loss": 0.0017, "reward": 1.776833415031433, "reward_std": 0.2125244438648224, "rewards/accuracy_reward": 0.6368333697319031, "rewards/format_reward": 1.0, "step": 177, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 490.0249938964844, "epoch": 0.0033830656656846907, "grad_norm": 1.278385568657809, "kl": 0.030517578125, "learning_rate": 9.999717605313334e-07, "loss": 0.0012, "reward": 1.463749885559082, "reward_std": 0.4122951626777649, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 0.800000011920929, "step": 178, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 435.82501220703125, "epoch": 0.0034020716525705597, "grad_norm": 3.7786056142158233, "kl": 0.0267333984375, "learning_rate": 9.999714423456768e-07, "loss": 0.0011, "reward": 1.6909410953521729, "reward_std": 0.21448658406734467, "rewards/accuracy_reward": 0.7021910548210144, "rewards/format_reward": 0.8500000238418579, "step": 179, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 444.2250061035156, "epoch": 0.0034210776394564286, "grad_norm": 8.131646771139263, "kl": 0.044677734375, "learning_rate": 9.999711223775355e-07, "loss": 0.0018, "reward": 1.7312500476837158, "reward_std": 0.34000539779663086, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.875, "step": 180, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 399.8999938964844, "epoch": 0.003440083626342298, "grad_norm": 2.2282777402698746, "kl": 0.046142578125, "learning_rate": 9.999708006269108e-07, "loss": 0.0018, "reward": 1.5961250066757202, "reward_std": 0.3732485771179199, "rewards/accuracy_reward": 0.48487502336502075, "rewards/format_reward": 1.0, "step": 181, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 422.6499938964844, "epoch": 0.003459089613228167, "grad_norm": 1.643446544121657, "kl": 0.031494140625, "learning_rate": 9.999704770938035e-07, "loss": 0.0013, "reward": 1.6674998998641968, "reward_std": 0.24944470822811127, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 182, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 395.8000183105469, "epoch": 0.0034780956001140358, "grad_norm": 2.3874501163374577, "kl": 0.03369140625, "learning_rate": 9.999701517782153e-07, "loss": 0.0013, "reward": 1.4371393918991089, "reward_std": 0.4773987829685211, "rewards/accuracy_reward": 0.38963934779167175, "rewards/format_reward": 1.0, "step": 183, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 415.125, "epoch": 0.003497101586999905, "grad_norm": 4.498598361526847, "kl": 0.047607421875, "learning_rate": 9.99969824680147e-07, "loss": 0.0019, "reward": 1.6487499475479126, "reward_std": 0.31695321202278137, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 184, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 433.57501220703125, "epoch": 0.003516107573885774, "grad_norm": 1.6196430695974566, "kl": 0.023681640625, "learning_rate": 9.999694957995997e-07, "loss": 0.0009, "reward": 1.499170184135437, "reward_std": 0.36921173334121704, "rewards/accuracy_reward": 0.5179200172424316, "rewards/format_reward": 0.9750000238418579, "step": 185, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 445.3500061035156, "epoch": 0.003535113560771643, "grad_norm": 1.6563067051311027, "kl": 0.0302734375, "learning_rate": 9.99969165136575e-07, "loss": 0.0012, "reward": 1.5774999856948853, "reward_std": 0.18491069972515106, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 0.9750000238418579, "step": 186, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 408.2749938964844, "epoch": 0.003554119547657512, "grad_norm": 1.6860997944834948, "kl": 0.0264892578125, "learning_rate": 9.999688326910734e-07, "loss": 0.0011, "reward": 1.8612499237060547, "reward_std": 0.38037070631980896, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9750000238418579, "step": 187, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 417.7250061035156, "epoch": 0.0035731255345433813, "grad_norm": 1.549050333294932, "kl": 0.03369140625, "learning_rate": 9.999684984630967e-07, "loss": 0.0013, "reward": 1.4499999284744263, "reward_std": 0.2958005368709564, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 188, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 398.45001220703125, "epoch": 0.00359213152142925, "grad_norm": 2.4893560926809686, "kl": 0.03857421875, "learning_rate": 9.99968162452646e-07, "loss": 0.0015, "reward": 1.869797945022583, "reward_std": 0.31480318307876587, "rewards/accuracy_reward": 0.7172979712486267, "rewards/format_reward": 0.949999988079071, "step": 189, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 339.125, "epoch": 0.003611137508315119, "grad_norm": 4.690755256141441, "kl": 0.0279541015625, "learning_rate": 9.999678246597221e-07, "loss": 0.0011, "reward": 1.8263648748397827, "reward_std": 0.08638795465230942, "rewards/accuracy_reward": 0.7126147150993347, "rewards/format_reward": 1.0, "step": 190, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 360.1000061035156, "epoch": 0.0036301434952009885, "grad_norm": 2.0870351156232627, "kl": 0.035888671875, "learning_rate": 9.999674850843264e-07, "loss": 0.0014, "reward": 1.9257738590240479, "reward_std": 0.3620302975177765, "rewards/accuracy_reward": 0.7782737612724304, "rewards/format_reward": 1.0, "step": 191, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 409.1750183105469, "epoch": 0.0036491494820868574, "grad_norm": 1.724990929171834, "kl": 0.043212890625, "learning_rate": 9.999671437264604e-07, "loss": 0.0017, "reward": 1.4066804647445679, "reward_std": 0.23692777752876282, "rewards/accuracy_reward": 0.4429304599761963, "rewards/format_reward": 0.8500000238418579, "step": 192, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 440.1499938964844, "epoch": 0.0036681554689727263, "grad_norm": 1.8795830935742788, "kl": 0.037841796875, "learning_rate": 9.99966800586125e-07, "loss": 0.0015, "reward": 1.964198112487793, "reward_std": 0.2989325225353241, "rewards/accuracy_reward": 0.8041982650756836, "rewards/format_reward": 0.925000011920929, "step": 193, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 362.4750061035156, "epoch": 0.0036871614558585957, "grad_norm": 14.906225680594083, "kl": 0.03564453125, "learning_rate": 9.999664556633216e-07, "loss": 0.0014, "reward": 1.687585473060608, "reward_std": 0.1823313981294632, "rewards/accuracy_reward": 0.5925855040550232, "rewards/format_reward": 1.0, "step": 194, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 433.875, "epoch": 0.0037061674427444646, "grad_norm": 1.8383321669606432, "kl": 0.029296875, "learning_rate": 9.999661089580513e-07, "loss": 0.0012, "reward": 1.6870087385177612, "reward_std": 0.3377555012702942, "rewards/accuracy_reward": 0.6270086765289307, "rewards/format_reward": 0.9750000238418579, "step": 195, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 444.4750061035156, "epoch": 0.0037251734296303335, "grad_norm": 1.5536260862831237, "kl": 0.02880859375, "learning_rate": 9.999657604703153e-07, "loss": 0.0012, "reward": 1.7862499952316284, "reward_std": 0.2919498085975647, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.949999988079071, "step": 196, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 463.1499938964844, "epoch": 0.0037441794165162024, "grad_norm": 1.9298679368934535, "kl": 0.0272216796875, "learning_rate": 9.99965410200115e-07, "loss": 0.0011, "reward": 1.9523206949234009, "reward_std": 0.20473122596740723, "rewards/accuracy_reward": 0.8385707139968872, "rewards/format_reward": 0.9750000238418579, "step": 197, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 411.4750061035156, "epoch": 0.003763185403402072, "grad_norm": 1.6415641870464832, "kl": 0.0303955078125, "learning_rate": 9.999650581474515e-07, "loss": 0.0012, "reward": 1.9512499570846558, "reward_std": 0.20175762474536896, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 198, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 383.1000061035156, "epoch": 0.0037821913902879407, "grad_norm": 1.5556281838875161, "kl": 0.0311279296875, "learning_rate": 9.99964704312326e-07, "loss": 0.0012, "reward": 1.8170557022094727, "reward_std": 0.036341097205877304, "rewards/accuracy_reward": 0.6770557761192322, "rewards/format_reward": 1.0, "step": 199, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 434.3500061035156, "epoch": 0.0038011973771738096, "grad_norm": 1.9466362755351019, "kl": 0.0223388671875, "learning_rate": 9.999643486947402e-07, "loss": 0.0009, "reward": 1.576269507408142, "reward_std": 0.4818333089351654, "rewards/accuracy_reward": 0.6600195169448853, "rewards/format_reward": 0.8500000238418579, "step": 200, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 401.9250183105469, "epoch": 0.003820203364059679, "grad_norm": 1.789056343317186, "kl": 0.0301513671875, "learning_rate": 9.99963991294695e-07, "loss": 0.0012, "reward": 1.6654579639434814, "reward_std": 0.3330591320991516, "rewards/accuracy_reward": 0.6129579544067383, "rewards/format_reward": 0.9750000238418579, "step": 201, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 433.125, "epoch": 0.003839209350945548, "grad_norm": 2.076335731507936, "kl": 0.025390625, "learning_rate": 9.999636321121916e-07, "loss": 0.001, "reward": 1.6747560501098633, "reward_std": 0.22126667201519012, "rewards/accuracy_reward": 0.646006166934967, "rewards/format_reward": 0.949999988079071, "step": 202, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 432.95001220703125, "epoch": 0.003858215337831417, "grad_norm": 1.4563151581298903, "kl": 0.0194091796875, "learning_rate": 9.999632711472315e-07, "loss": 0.0008, "reward": 1.5349998474121094, "reward_std": 0.43499547243118286, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.875, "step": 203, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 460.25, "epoch": 0.0038772213247172858, "grad_norm": 1.9965813028927424, "kl": 0.023193359375, "learning_rate": 9.999629083998157e-07, "loss": 0.0009, "reward": 1.3198680877685547, "reward_std": 0.44186311960220337, "rewards/accuracy_reward": 0.4436180591583252, "rewards/format_reward": 0.875, "step": 204, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 396.4750061035156, "epoch": 0.003896227311603155, "grad_norm": 2.177392570358997, "kl": 0.034423828125, "learning_rate": 9.99962543869946e-07, "loss": 0.0014, "reward": 1.7174999713897705, "reward_std": 0.2532403767108917, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 205, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 415.5500183105469, "epoch": 0.003915233298489024, "grad_norm": 2.6421269200066275, "kl": 0.036865234375, "learning_rate": 9.999621775576233e-07, "loss": 0.0015, "reward": 1.6265392303466797, "reward_std": 0.03958814591169357, "rewards/accuracy_reward": 0.47903934121131897, "rewards/format_reward": 1.0, "step": 206, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 411.625, "epoch": 0.003934239285374893, "grad_norm": 1.8699388357712072, "kl": 0.029296875, "learning_rate": 9.999618094628489e-07, "loss": 0.0012, "reward": 1.4309269189834595, "reward_std": 0.2140554040670395, "rewards/accuracy_reward": 0.559677004814148, "rewards/format_reward": 0.824999988079071, "step": 207, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 391.6499938964844, "epoch": 0.003953245272260762, "grad_norm": 2.265716581167074, "kl": 0.03662109375, "learning_rate": 9.999614395856241e-07, "loss": 0.0015, "reward": 1.9126160144805908, "reward_std": 0.1788838654756546, "rewards/accuracy_reward": 0.7213660478591919, "rewards/format_reward": 1.0, "step": 208, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 391.20001220703125, "epoch": 0.003972251259146631, "grad_norm": 1.9098996418145857, "kl": 0.0302734375, "learning_rate": 9.999610679259507e-07, "loss": 0.0012, "reward": 1.8287500143051147, "reward_std": 0.19593499600887299, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 209, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 382.82501220703125, "epoch": 0.003991257246032501, "grad_norm": 1.846439228788096, "kl": 0.0556640625, "learning_rate": 9.999606944838293e-07, "loss": 0.0022, "reward": 1.8587497472763062, "reward_std": 0.163147434592247, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 210, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 362.7250061035156, "epoch": 0.0040102632329183695, "grad_norm": 2.0967587806330403, "kl": 0.0361328125, "learning_rate": 9.999603192592619e-07, "loss": 0.0015, "reward": 1.8357433080673218, "reward_std": 0.17262892425060272, "rewards/accuracy_reward": 0.6807434558868408, "rewards/format_reward": 1.0, "step": 211, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 398.32501220703125, "epoch": 0.004029269219804238, "grad_norm": 1.9610268313868995, "kl": 0.038330078125, "learning_rate": 9.999599422522493e-07, "loss": 0.0015, "reward": 1.9411624670028687, "reward_std": 0.3060757517814636, "rewards/accuracy_reward": 0.7724127769470215, "rewards/format_reward": 0.9750000238418579, "step": 212, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 342.7749938964844, "epoch": 0.004048275206690107, "grad_norm": 2.1030485166399076, "kl": 0.037841796875, "learning_rate": 9.99959563462793e-07, "loss": 0.0015, "reward": 1.8831208944320679, "reward_std": 0.2587890028953552, "rewards/accuracy_reward": 0.7806208729743958, "rewards/format_reward": 1.0, "step": 213, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 338.1499938964844, "epoch": 0.004067281193575976, "grad_norm": 1.9004382975827243, "kl": 0.033935546875, "learning_rate": 9.999591828908945e-07, "loss": 0.0014, "reward": 1.693750023841858, "reward_std": 0.3791329860687256, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 214, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 389.7749938964844, "epoch": 0.004086287180461845, "grad_norm": 1.4921861618965029, "kl": 0.03515625, "learning_rate": 9.999588005365551e-07, "loss": 0.0014, "reward": 1.7537498474121094, "reward_std": 0.27417823672294617, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 215, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 445.1750183105469, "epoch": 0.004105293167347714, "grad_norm": 1.9721176529898625, "kl": 0.03564453125, "learning_rate": 9.999584163997761e-07, "loss": 0.0014, "reward": 1.7214065790176392, "reward_std": 0.22224795818328857, "rewards/accuracy_reward": 0.6701565384864807, "rewards/format_reward": 0.9000000357627869, "step": 216, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 345.82501220703125, "epoch": 0.004124299154233584, "grad_norm": 1.6072084220040581, "kl": 0.045654296875, "learning_rate": 9.99958030480559e-07, "loss": 0.0018, "reward": 1.5512501001358032, "reward_std": 0.1395757645368576, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 217, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 402.0500183105469, "epoch": 0.004143305141119453, "grad_norm": 4.347914787153495, "kl": 0.04248046875, "learning_rate": 9.99957642778905e-07, "loss": 0.0017, "reward": 1.8440383672714233, "reward_std": 0.20403912663459778, "rewards/accuracy_reward": 0.7202884554862976, "rewards/format_reward": 1.0, "step": 218, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.4250183105469, "epoch": 0.004162311128005322, "grad_norm": 4.993495433573057, "kl": 0.042236328125, "learning_rate": 9.999572532948155e-07, "loss": 0.0017, "reward": 1.6324999332427979, "reward_std": 0.5957102179527283, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 0.9750000238418579, "step": 219, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 479.6750183105469, "epoch": 0.004181317114891191, "grad_norm": 1.7391347502476673, "kl": 0.02587890625, "learning_rate": 9.999568620282921e-07, "loss": 0.001, "reward": 1.6527748107910156, "reward_std": 0.3692838251590729, "rewards/accuracy_reward": 0.7015247344970703, "rewards/format_reward": 0.925000011920929, "step": 220, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 469.0500183105469, "epoch": 0.00420032310177706, "grad_norm": 1.6241228943350263, "kl": 0.0255126953125, "learning_rate": 9.999564689793361e-07, "loss": 0.001, "reward": 1.6034713983535767, "reward_std": 0.17141413688659668, "rewards/accuracy_reward": 0.5809712409973145, "rewards/format_reward": 0.9750000238418579, "step": 221, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 449.375, "epoch": 0.0042193290886629285, "grad_norm": 1.7685131823196605, "kl": 0.0233154296875, "learning_rate": 9.999560741479488e-07, "loss": 0.0009, "reward": 1.6205015182495117, "reward_std": 0.2021985799074173, "rewards/accuracy_reward": 0.5430015325546265, "rewards/format_reward": 0.9750000238418579, "step": 222, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 435.6750183105469, "epoch": 0.0042383350755487974, "grad_norm": 1.6842644341593895, "kl": 0.039794921875, "learning_rate": 9.999556775341314e-07, "loss": 0.0016, "reward": 1.3455833196640015, "reward_std": 0.11606644839048386, "rewards/accuracy_reward": 0.48433348536491394, "rewards/format_reward": 0.800000011920929, "step": 223, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 373.4250183105469, "epoch": 0.004257341062434667, "grad_norm": 10.91069205223911, "kl": 0.033203125, "learning_rate": 9.999552791378858e-07, "loss": 0.0013, "reward": 1.625, "reward_std": 0.31769201159477234, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 224, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 415.1499938964844, "epoch": 0.004276347049320536, "grad_norm": 2.0543386020637833, "kl": 0.0498046875, "learning_rate": 9.999548789592131e-07, "loss": 0.002, "reward": 1.9660313129425049, "reward_std": 0.18239329755306244, "rewards/accuracy_reward": 0.7422811388969421, "rewards/format_reward": 1.0, "step": 225, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 408.7749938964844, "epoch": 0.004295353036206405, "grad_norm": 1.9001147314987352, "kl": 0.05078125, "learning_rate": 9.99954476998115e-07, "loss": 0.002, "reward": 1.7498661279678345, "reward_std": 0.03813100978732109, "rewards/accuracy_reward": 0.6148661375045776, "rewards/format_reward": 1.0, "step": 226, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 471.5500183105469, "epoch": 0.004314359023092274, "grad_norm": 2.0480415066524924, "kl": 0.02587890625, "learning_rate": 9.999540732545926e-07, "loss": 0.001, "reward": 1.2433445453643799, "reward_std": 0.140644833445549, "rewards/accuracy_reward": 0.2945944666862488, "rewards/format_reward": 0.9750000238418579, "step": 227, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 354.32501220703125, "epoch": 0.004333365009978143, "grad_norm": 1.443703893431704, "kl": 0.037109375, "learning_rate": 9.999536677286475e-07, "loss": 0.0015, "reward": 1.7266666889190674, "reward_std": 0.20770369470119476, "rewards/accuracy_reward": 0.5916666984558105, "rewards/format_reward": 1.0, "step": 228, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 414.1499938964844, "epoch": 0.004352370996864012, "grad_norm": 1.4456257892300983, "kl": 0.041015625, "learning_rate": 9.999532604202813e-07, "loss": 0.0016, "reward": 1.662500023841858, "reward_std": 0.33464279770851135, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.9750000238418579, "step": 229, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 473.8500061035156, "epoch": 0.004371376983749882, "grad_norm": 1.6367581525951993, "kl": 0.021728515625, "learning_rate": 9.99952851329495e-07, "loss": 0.0009, "reward": 1.346039056777954, "reward_std": 0.23499193787574768, "rewards/accuracy_reward": 0.3635389506816864, "rewards/format_reward": 0.949999988079071, "step": 230, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 449.6000061035156, "epoch": 0.0043903829706357506, "grad_norm": 1.7578070517860367, "kl": 0.053466796875, "learning_rate": 9.999524404562905e-07, "loss": 0.0021, "reward": 2.1212499141693115, "reward_std": 0.1882200539112091, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 0.9750000238418579, "step": 231, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 410.4250183105469, "epoch": 0.0044093889575216195, "grad_norm": 3.449030092917484, "kl": 0.04931640625, "learning_rate": 9.99952027800669e-07, "loss": 0.002, "reward": 1.7983547449111938, "reward_std": 0.260799378156662, "rewards/accuracy_reward": 0.7121047377586365, "rewards/format_reward": 0.949999988079071, "step": 232, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 455.25, "epoch": 0.004428394944407488, "grad_norm": 2.692992906063105, "kl": 0.0361328125, "learning_rate": 9.999516133626323e-07, "loss": 0.0014, "reward": 1.5289610624313354, "reward_std": 0.2862740457057953, "rewards/accuracy_reward": 0.5789610743522644, "rewards/format_reward": 0.800000011920929, "step": 233, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 423.4250183105469, "epoch": 0.004447400931293357, "grad_norm": 1.9270645096913896, "kl": 0.030517578125, "learning_rate": 9.999511971421815e-07, "loss": 0.0012, "reward": 1.8414939641952515, "reward_std": 0.09297298640012741, "rewards/accuracy_reward": 0.686493992805481, "rewards/format_reward": 1.0, "step": 234, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 462.8000183105469, "epoch": 0.004466406918179226, "grad_norm": 1.2439532476556123, "kl": 0.0169677734375, "learning_rate": 9.999507791393183e-07, "loss": 0.0007, "reward": 1.376250147819519, "reward_std": 0.35701727867126465, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 0.949999988079071, "step": 235, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 443.9250183105469, "epoch": 0.004485412905065095, "grad_norm": 4.492013529506022, "kl": 0.03271484375, "learning_rate": 9.99950359354044e-07, "loss": 0.0013, "reward": 1.83798086643219, "reward_std": 0.21238887310028076, "rewards/accuracy_reward": 0.7792307734489441, "rewards/format_reward": 0.949999988079071, "step": 236, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 364.2250061035156, "epoch": 0.004504418891950965, "grad_norm": 3.5382877146896314, "kl": 0.0537109375, "learning_rate": 9.999499377863605e-07, "loss": 0.0021, "reward": 2.152939558029175, "reward_std": 0.06586786359548569, "rewards/accuracy_reward": 0.9216896295547485, "rewards/format_reward": 1.0, "step": 237, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 425.5, "epoch": 0.004523424878836834, "grad_norm": 2.5140456355627796, "kl": 0.039794921875, "learning_rate": 9.999495144362688e-07, "loss": 0.0016, "reward": 1.8570069074630737, "reward_std": 0.19184057414531708, "rewards/accuracy_reward": 0.7457568049430847, "rewards/format_reward": 1.0, "step": 238, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 462.32501220703125, "epoch": 0.004542430865722703, "grad_norm": 1.7425844949320046, "kl": 0.03173828125, "learning_rate": 9.999490893037708e-07, "loss": 0.0013, "reward": 1.7546519041061401, "reward_std": 0.3788732588291168, "rewards/accuracy_reward": 0.6721518635749817, "rewards/format_reward": 0.9750000238418579, "step": 239, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 464.6499938964844, "epoch": 0.004561436852608572, "grad_norm": 1.870313907201393, "kl": 0.0196533203125, "learning_rate": 9.999486623888678e-07, "loss": 0.0008, "reward": 1.6782845258712769, "reward_std": 0.30399638414382935, "rewards/accuracy_reward": 0.7795344591140747, "rewards/format_reward": 0.9000000357627869, "step": 240, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 395.1000061035156, "epoch": 0.004580442839494441, "grad_norm": 1.7342872853087963, "kl": 0.046630859375, "learning_rate": 9.999482336915612e-07, "loss": 0.0019, "reward": 1.2387466430664062, "reward_std": 0.13596639037132263, "rewards/accuracy_reward": 0.2249966412782669, "rewards/format_reward": 1.0, "step": 241, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 411.45001220703125, "epoch": 0.00459944882638031, "grad_norm": 1.7048656705859881, "kl": 0.03125, "learning_rate": 9.99947803211853e-07, "loss": 0.0012, "reward": 1.830570101737976, "reward_std": 0.28210732340812683, "rewards/accuracy_reward": 0.7455701231956482, "rewards/format_reward": 1.0, "step": 242, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.2749938964844, "epoch": 0.0046184548132661785, "grad_norm": 1.7703257921397861, "kl": 0.0291748046875, "learning_rate": 9.999473709497444e-07, "loss": 0.0012, "reward": 1.7837803363800049, "reward_std": 0.12731657922267914, "rewards/accuracy_reward": 0.6737803220748901, "rewards/format_reward": 0.9750000238418579, "step": 243, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 414.5, "epoch": 0.004637460800152048, "grad_norm": 2.0279619453881312, "kl": 0.03955078125, "learning_rate": 9.999469369052368e-07, "loss": 0.0016, "reward": 2.035435438156128, "reward_std": 0.17900130152702332, "rewards/accuracy_reward": 0.7754355072975159, "rewards/format_reward": 1.0, "step": 244, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 392.4250183105469, "epoch": 0.004656466787037917, "grad_norm": 2.18718915370095, "kl": 0.045654296875, "learning_rate": 9.99946501078332e-07, "loss": 0.0018, "reward": 1.938750147819519, "reward_std": 0.24295035004615784, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 245, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 420.9750061035156, "epoch": 0.004675472773923786, "grad_norm": 4.047967176597088, "kl": 0.040283203125, "learning_rate": 9.999460634690316e-07, "loss": 0.0016, "reward": 1.6499645709991455, "reward_std": 0.2919858396053314, "rewards/accuracy_reward": 0.538714587688446, "rewards/format_reward": 1.0, "step": 246, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 397.45001220703125, "epoch": 0.004694478760809655, "grad_norm": 1.5683094599419327, "kl": 0.0213623046875, "learning_rate": 9.99945624077337e-07, "loss": 0.0009, "reward": 1.68316650390625, "reward_std": 0.40017709136009216, "rewards/accuracy_reward": 0.6481666564941406, "rewards/format_reward": 1.0, "step": 247, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 397.95001220703125, "epoch": 0.004713484747695524, "grad_norm": 2.258023600731953, "kl": 0.0289306640625, "learning_rate": 9.999451829032496e-07, "loss": 0.0012, "reward": 1.6057461500167847, "reward_std": 0.16116632521152496, "rewards/accuracy_reward": 0.5182459950447083, "rewards/format_reward": 0.949999988079071, "step": 248, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 365.0, "epoch": 0.004732490734581393, "grad_norm": 1.5050973691188916, "kl": 0.043212890625, "learning_rate": 9.999447399467716e-07, "loss": 0.0017, "reward": 1.7637500762939453, "reward_std": 0.23460076749324799, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 249, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 372.875, "epoch": 0.004751496721467262, "grad_norm": 2.943327256962536, "kl": 0.0546875, "learning_rate": 9.999442952079038e-07, "loss": 0.0022, "reward": 1.9262498617172241, "reward_std": 0.22076304256916046, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 250, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 365.625, "epoch": 0.004770502708353132, "grad_norm": 2.3868015907290046, "kl": 0.037841796875, "learning_rate": 9.999438486866483e-07, "loss": 0.0015, "reward": 1.5969265699386597, "reward_std": 0.08501073718070984, "rewards/accuracy_reward": 0.5581764578819275, "rewards/format_reward": 1.0, "step": 251, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 426.1499938964844, "epoch": 0.0047895086952390005, "grad_norm": 2.7695669849171662, "kl": 0.03076171875, "learning_rate": 9.999434003830065e-07, "loss": 0.0012, "reward": 1.4927480220794678, "reward_std": 0.15079958736896515, "rewards/accuracy_reward": 0.4289979934692383, "rewards/format_reward": 1.0, "step": 252, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 380.6750183105469, "epoch": 0.0048085146821248695, "grad_norm": 2.9327343872552443, "kl": 0.033935546875, "learning_rate": 9.9994295029698e-07, "loss": 0.0014, "reward": 1.4994280338287354, "reward_std": 0.33042779564857483, "rewards/accuracy_reward": 0.42817798256874084, "rewards/format_reward": 1.0, "step": 253, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 372.8999938964844, "epoch": 0.004827520669010738, "grad_norm": 1.746185537369398, "kl": 0.044189453125, "learning_rate": 9.999424984285707e-07, "loss": 0.0018, "reward": 2.1137502193450928, "reward_std": 0.13382381200790405, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 254, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 408.7749938964844, "epoch": 0.004846526655896607, "grad_norm": 3.387153629636754, "kl": 0.046630859375, "learning_rate": 9.999420447777797e-07, "loss": 0.0019, "reward": 1.6637500524520874, "reward_std": 0.2163672000169754, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 255, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 429.20001220703125, "epoch": 0.004865532642782476, "grad_norm": 2.1413556603365933, "kl": 0.04296875, "learning_rate": 9.99941589344609e-07, "loss": 0.0017, "reward": 1.8574990034103394, "reward_std": 0.15932835638523102, "rewards/accuracy_reward": 0.7074990272521973, "rewards/format_reward": 1.0, "step": 256, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 406.7749938964844, "epoch": 0.004884538629668345, "grad_norm": 2.3319646144845403, "kl": 0.032470703125, "learning_rate": 9.9994113212906e-07, "loss": 0.0013, "reward": 1.8343689441680908, "reward_std": 0.18212370574474335, "rewards/accuracy_reward": 0.6893689632415771, "rewards/format_reward": 0.9750000238418579, "step": 257, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 389.7250061035156, "epoch": 0.004903544616554215, "grad_norm": 1.8436358357332203, "kl": 0.043212890625, "learning_rate": 9.999406731311345e-07, "loss": 0.0017, "reward": 2.137500047683716, "reward_std": 0.11913169920444489, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 258, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 428.0500183105469, "epoch": 0.004922550603440084, "grad_norm": 1.4617709353194248, "kl": 0.03369140625, "learning_rate": 9.99940212350834e-07, "loss": 0.0013, "reward": 1.7537500858306885, "reward_std": 0.08835282176733017, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 259, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 410.375, "epoch": 0.004941556590325953, "grad_norm": 2.2530722269993717, "kl": 0.0322265625, "learning_rate": 9.999397497881602e-07, "loss": 0.0013, "reward": 1.462499976158142, "reward_std": 0.2487768679857254, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 1.0, "step": 260, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 422.375, "epoch": 0.004960562577211822, "grad_norm": 1.5428129906627623, "kl": 0.0308837890625, "learning_rate": 9.999392854431147e-07, "loss": 0.0012, "reward": 1.8400497436523438, "reward_std": 0.18627618253231049, "rewards/accuracy_reward": 0.7262999415397644, "rewards/format_reward": 0.9750000238418579, "step": 261, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 352.7749938964844, "epoch": 0.004979568564097691, "grad_norm": 1.821662983092403, "kl": 0.034423828125, "learning_rate": 9.999388193156994e-07, "loss": 0.0014, "reward": 1.8456611633300781, "reward_std": 0.1655697375535965, "rewards/accuracy_reward": 0.6794113516807556, "rewards/format_reward": 1.0, "step": 262, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 383.70001220703125, "epoch": 0.0049985745509835595, "grad_norm": 1.6262339594971065, "kl": 0.0341796875, "learning_rate": 9.999383514059156e-07, "loss": 0.0014, "reward": 2.0712499618530273, "reward_std": 0.20316505432128906, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 263, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.2749938964844, "epoch": 0.0050175805378694285, "grad_norm": 2.032651744539549, "kl": 0.037109375, "learning_rate": 9.999378817137653e-07, "loss": 0.0015, "reward": 1.4592880010604858, "reward_std": 0.4818175435066223, "rewards/accuracy_reward": 0.41178807616233826, "rewards/format_reward": 0.9750000238418579, "step": 264, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 451.9750061035156, "epoch": 0.005036586524755298, "grad_norm": 1.874956085494121, "kl": 0.031982421875, "learning_rate": 9.999374102392499e-07, "loss": 0.0013, "reward": 1.7299950122833252, "reward_std": 0.3897639214992523, "rewards/accuracy_reward": 0.6899950504302979, "rewards/format_reward": 0.925000011920929, "step": 265, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 410.5249938964844, "epoch": 0.005055592511641167, "grad_norm": 2.5142976106624997, "kl": 0.0284423828125, "learning_rate": 9.999369369823713e-07, "loss": 0.0011, "reward": 1.472083330154419, "reward_std": 0.2645128667354584, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 1.0, "step": 266, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 364.2749938964844, "epoch": 0.005074598498527036, "grad_norm": 5.5873798269407855, "kl": 0.048583984375, "learning_rate": 9.99936461943131e-07, "loss": 0.0019, "reward": 2.2362499237060547, "reward_std": 0.03061859868466854, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 267, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 392.6000061035156, "epoch": 0.005093604485412905, "grad_norm": 2.1393830938553564, "kl": 0.0400390625, "learning_rate": 9.999359851215307e-07, "loss": 0.0016, "reward": 1.9487498998641968, "reward_std": 0.22993634641170502, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 268, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 362.625, "epoch": 0.005112610472298774, "grad_norm": 1.8288490575949787, "kl": 0.039794921875, "learning_rate": 9.999355065175725e-07, "loss": 0.0016, "reward": 1.5055413246154785, "reward_std": 0.16168683767318726, "rewards/accuracy_reward": 0.4117913246154785, "rewards/format_reward": 1.0, "step": 269, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 436.6499938964844, "epoch": 0.005131616459184643, "grad_norm": 1.7773413687933584, "kl": 0.03125, "learning_rate": 9.999350261312574e-07, "loss": 0.0013, "reward": 1.3650546073913574, "reward_std": 0.46858763694763184, "rewards/accuracy_reward": 0.44380465149879456, "rewards/format_reward": 0.9000000357627869, "step": 270, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.7749938964844, "epoch": 0.005150622446070512, "grad_norm": 2.1996420793665163, "kl": 0.044189453125, "learning_rate": 9.999345439625877e-07, "loss": 0.0018, "reward": 1.6639299392700195, "reward_std": 0.4508049190044403, "rewards/accuracy_reward": 0.5039300322532654, "rewards/format_reward": 1.0, "step": 271, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 450.6750183105469, "epoch": 0.005169628432956382, "grad_norm": 3.840479202326024, "kl": 0.0191650390625, "learning_rate": 9.999340600115648e-07, "loss": 0.0008, "reward": 1.4488624334335327, "reward_std": 0.2074168175458908, "rewards/accuracy_reward": 0.4713623523712158, "rewards/format_reward": 0.9750000238418579, "step": 272, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 421.75, "epoch": 0.0051886344198422505, "grad_norm": 1.7917107119442928, "kl": 0.041748046875, "learning_rate": 9.999335742781908e-07, "loss": 0.0017, "reward": 2.077500104904175, "reward_std": 0.13986968994140625, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 273, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 385.8000183105469, "epoch": 0.005207640406728119, "grad_norm": 1.9068560667666392, "kl": 0.036376953125, "learning_rate": 9.99933086762467e-07, "loss": 0.0015, "reward": 1.9535537958145142, "reward_std": 0.07865867763757706, "rewards/accuracy_reward": 0.7960537075996399, "rewards/format_reward": 1.0, "step": 274, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 446.75, "epoch": 0.005226646393613988, "grad_norm": 1.686649728486627, "kl": 0.04150390625, "learning_rate": 9.999325974643953e-07, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.22701247036457062, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.9000000357627869, "step": 275, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 386.1499938964844, "epoch": 0.005245652380499857, "grad_norm": 1.9749736422183155, "kl": 0.049560546875, "learning_rate": 9.999321063839773e-07, "loss": 0.002, "reward": 1.8651264905929565, "reward_std": 0.1253584325313568, "rewards/accuracy_reward": 0.692626416683197, "rewards/format_reward": 1.0, "step": 276, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 390.82501220703125, "epoch": 0.005264658367385726, "grad_norm": 2.0171568395282455, "kl": 0.057861328125, "learning_rate": 9.999316135212151e-07, "loss": 0.0023, "reward": 2.0, "reward_std": 0.3786541521549225, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 277, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 429.6499938964844, "epoch": 0.005283664354271596, "grad_norm": 1.9974755299257028, "kl": 0.046142578125, "learning_rate": 9.9993111887611e-07, "loss": 0.0018, "reward": 1.4188908338546753, "reward_std": 0.2939603924751282, "rewards/accuracy_reward": 0.49389082193374634, "rewards/format_reward": 0.875, "step": 278, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 449.9250183105469, "epoch": 0.005302670341157465, "grad_norm": 2.516161294244043, "kl": 0.033447265625, "learning_rate": 9.999306224486645e-07, "loss": 0.0013, "reward": 1.4664536714553833, "reward_std": 0.26865848898887634, "rewards/accuracy_reward": 0.4177037179470062, "rewards/format_reward": 1.0, "step": 279, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 420.6000061035156, "epoch": 0.005321676328043334, "grad_norm": 1.8123336510918249, "kl": 0.0361328125, "learning_rate": 9.999301242388796e-07, "loss": 0.0014, "reward": 1.6839393377304077, "reward_std": 0.21438370645046234, "rewards/accuracy_reward": 0.625189483165741, "rewards/format_reward": 0.9750000238418579, "step": 280, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 405.82501220703125, "epoch": 0.005340682314929203, "grad_norm": 2.202812449318228, "kl": 0.033935546875, "learning_rate": 9.999296242467575e-07, "loss": 0.0014, "reward": 1.7349998950958252, "reward_std": 0.3030186593532562, "rewards/accuracy_reward": 0.6649999618530273, "rewards/format_reward": 1.0, "step": 281, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 427.5500183105469, "epoch": 0.005359688301815072, "grad_norm": 2.5372341454729423, "kl": 0.0272216796875, "learning_rate": 9.999291224722996e-07, "loss": 0.0011, "reward": 1.4512499570846558, "reward_std": 0.37611016631126404, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 0.9750000238418579, "step": 282, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.82501220703125, "epoch": 0.005378694288700941, "grad_norm": 1.8983014932514142, "kl": 0.0294189453125, "learning_rate": 9.999286189155084e-07, "loss": 0.0012, "reward": 1.4568980932235718, "reward_std": 0.2850346863269806, "rewards/accuracy_reward": 0.4418979585170746, "rewards/format_reward": 1.0, "step": 283, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.0053977002755868095, "grad_norm": 2.1893349041774774, "kl": 0.031494140625, "learning_rate": 9.99928113576385e-07, "loss": 0.0013, "reward": 1.5455259084701538, "reward_std": 0.3938310146331787, "rewards/accuracy_reward": 0.5380258560180664, "rewards/format_reward": 0.949999988079071, "step": 284, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 445.07501220703125, "epoch": 0.005416706262472679, "grad_norm": 1.3788351111737587, "kl": 0.0308837890625, "learning_rate": 9.999276064549312e-07, "loss": 0.0012, "reward": 1.3413587808609009, "reward_std": 0.025265518575906754, "rewards/accuracy_reward": 0.5576087236404419, "rewards/format_reward": 0.800000011920929, "step": 285, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 433.3000183105469, "epoch": 0.005435712249358548, "grad_norm": 1.7270606628152778, "kl": 0.05029296875, "learning_rate": 9.999270975511492e-07, "loss": 0.002, "reward": 1.8880033493041992, "reward_std": 0.12568174302577972, "rewards/accuracy_reward": 0.7255033850669861, "rewards/format_reward": 1.0, "step": 286, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 432.9750061035156, "epoch": 0.005454718236244417, "grad_norm": 1.7570532941872483, "kl": 0.03564453125, "learning_rate": 9.999265868650407e-07, "loss": 0.0014, "reward": 1.8522926568984985, "reward_std": 0.23886366188526154, "rewards/accuracy_reward": 0.6947928071022034, "rewards/format_reward": 0.9750000238418579, "step": 287, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 398.25, "epoch": 0.005473724223130286, "grad_norm": 1.2257385015768534, "kl": 0.0286865234375, "learning_rate": 9.999260743966076e-07, "loss": 0.0011, "reward": 1.6737499237060547, "reward_std": 0.033717554062604904, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 288, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 340.32501220703125, "epoch": 0.005492730210016155, "grad_norm": 1.9855940068002134, "kl": 0.03662109375, "learning_rate": 9.999255601458514e-07, "loss": 0.0015, "reward": 1.905234694480896, "reward_std": 0.2063770741224289, "rewards/accuracy_reward": 0.7927348017692566, "rewards/format_reward": 1.0, "step": 289, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 396.95001220703125, "epoch": 0.005511736196902024, "grad_norm": 1.9685183838556808, "kl": 0.055419921875, "learning_rate": 9.999250441127741e-07, "loss": 0.0022, "reward": 1.527500033378601, "reward_std": 0.35372811555862427, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 290, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 372.375, "epoch": 0.005530742183787893, "grad_norm": 1.9848425205936178, "kl": 0.038818359375, "learning_rate": 9.999245262973778e-07, "loss": 0.0016, "reward": 1.6812708377838135, "reward_std": 0.09697379171848297, "rewards/accuracy_reward": 0.5825207829475403, "rewards/format_reward": 1.0, "step": 291, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 407.625, "epoch": 0.005549748170673763, "grad_norm": 1.9446097606410466, "kl": 0.03662109375, "learning_rate": 9.999240066996642e-07, "loss": 0.0015, "reward": 1.5091886520385742, "reward_std": 0.41227608919143677, "rewards/accuracy_reward": 0.47543859481811523, "rewards/format_reward": 1.0, "step": 292, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 452.125, "epoch": 0.0055687541575596316, "grad_norm": 1.5105927127067236, "kl": 0.0269775390625, "learning_rate": 9.99923485319635e-07, "loss": 0.0011, "reward": 1.42807137966156, "reward_std": 0.49917203187942505, "rewards/accuracy_reward": 0.4680713713169098, "rewards/format_reward": 0.949999988079071, "step": 293, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 385.0249938964844, "epoch": 0.0055877601444455005, "grad_norm": 2.7735985161630983, "kl": 0.03662109375, "learning_rate": 9.99922962157292e-07, "loss": 0.0015, "reward": 1.9042307138442993, "reward_std": 0.1640632450580597, "rewards/accuracy_reward": 0.8192307353019714, "rewards/format_reward": 0.949999988079071, "step": 294, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.005606766131331369, "grad_norm": 1.6919466482631782, "kl": 0.0361328125, "learning_rate": 9.999224372126374e-07, "loss": 0.0014, "reward": 1.9099998474121094, "reward_std": 0.23988580703735352, "rewards/accuracy_reward": 0.7550000548362732, "rewards/format_reward": 1.0, "step": 295, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 459.0500183105469, "epoch": 0.005625772118217238, "grad_norm": 1.5434745574990358, "kl": 0.0201416015625, "learning_rate": 9.999219104856726e-07, "loss": 0.0008, "reward": 1.4433367252349854, "reward_std": 0.4637759327888489, "rewards/accuracy_reward": 0.5270866751670837, "rewards/format_reward": 0.925000011920929, "step": 296, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 398.70001220703125, "epoch": 0.005644778105103107, "grad_norm": 1.7890713273018475, "kl": 0.046875, "learning_rate": 9.999213819764e-07, "loss": 0.0019, "reward": 1.53458833694458, "reward_std": 0.27791649103164673, "rewards/accuracy_reward": 0.44333839416503906, "rewards/format_reward": 1.0, "step": 297, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 381.07501220703125, "epoch": 0.005663784091988976, "grad_norm": 2.094499926828871, "kl": 0.04150390625, "learning_rate": 9.999208516848211e-07, "loss": 0.0017, "reward": 1.8804165124893188, "reward_std": 0.23431554436683655, "rewards/accuracy_reward": 0.7166666984558105, "rewards/format_reward": 1.0, "step": 298, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 478.1000061035156, "epoch": 0.005682790078874846, "grad_norm": 1.6197449172994878, "kl": 0.03076171875, "learning_rate": 9.99920319610938e-07, "loss": 0.0012, "reward": 1.382501482963562, "reward_std": 0.11060315370559692, "rewards/accuracy_reward": 0.4325014054775238, "rewards/format_reward": 0.800000011920929, "step": 299, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 418.0, "epoch": 0.005701796065760715, "grad_norm": 1.5217130478601588, "kl": 0.03955078125, "learning_rate": 9.999197857547526e-07, "loss": 0.0016, "reward": 1.8587499856948853, "reward_std": 0.22571049630641937, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 300, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 422.0500183105469, "epoch": 0.005720802052646584, "grad_norm": 2.332531459916934, "kl": 0.04736328125, "learning_rate": 9.999192501162666e-07, "loss": 0.0019, "reward": 1.9512500762939453, "reward_std": 0.2596333920955658, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 301, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.6, "completion_length": 444.9750061035156, "epoch": 0.005739808039532453, "grad_norm": 1.6562509499100286, "kl": 0.03759765625, "learning_rate": 9.999187126954823e-07, "loss": 0.0015, "reward": 1.3737499713897705, "reward_std": 0.17272552847862244, "rewards/accuracy_reward": 0.32500001788139343, "rewards/format_reward": 1.0, "step": 302, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 400.5500183105469, "epoch": 0.005758814026418322, "grad_norm": 2.407287220935292, "kl": 0.043212890625, "learning_rate": 9.999181734924011e-07, "loss": 0.0017, "reward": 1.7440416812896729, "reward_std": 0.2270553857088089, "rewards/accuracy_reward": 0.6052916646003723, "rewards/format_reward": 1.0, "step": 303, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 395.20001220703125, "epoch": 0.0057778200133041906, "grad_norm": 1.5804410452974618, "kl": 0.04052734375, "learning_rate": 9.999176325070252e-07, "loss": 0.0016, "reward": 1.1881250143051147, "reward_std": 0.31124377250671387, "rewards/accuracy_reward": 0.17812500894069672, "rewards/format_reward": 1.0, "step": 304, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 384.4750061035156, "epoch": 0.0057968260001900595, "grad_norm": 1.7580727061972712, "kl": 0.043212890625, "learning_rate": 9.999170897393564e-07, "loss": 0.0017, "reward": 1.7383193969726562, "reward_std": 0.28262844681739807, "rewards/accuracy_reward": 0.6033194661140442, "rewards/format_reward": 1.0, "step": 305, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 456.32501220703125, "epoch": 0.005815831987075929, "grad_norm": 1.7699612076645639, "kl": 0.037841796875, "learning_rate": 9.99916545189397e-07, "loss": 0.0015, "reward": 1.5950278043746948, "reward_std": 0.3412419855594635, "rewards/accuracy_reward": 0.5387776494026184, "rewards/format_reward": 0.9750000238418579, "step": 306, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 388.1499938964844, "epoch": 0.005834837973961798, "grad_norm": 2.9426697811317077, "kl": 0.0439453125, "learning_rate": 9.999159988571486e-07, "loss": 0.0018, "reward": 1.4479975700378418, "reward_std": 0.3414693772792816, "rewards/accuracy_reward": 0.4167475700378418, "rewards/format_reward": 1.0, "step": 307, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 405.45001220703125, "epoch": 0.005853843960847667, "grad_norm": 3.8830806162209903, "kl": 0.04296875, "learning_rate": 9.999154507426131e-07, "loss": 0.0017, "reward": 1.7688621282577515, "reward_std": 0.29286906123161316, "rewards/accuracy_reward": 0.6538621783256531, "rewards/format_reward": 0.9750000238418579, "step": 308, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 386.3500061035156, "epoch": 0.005872849947733536, "grad_norm": 1.781044153110498, "kl": 0.040771484375, "learning_rate": 9.999149008457927e-07, "loss": 0.0016, "reward": 1.8679708242416382, "reward_std": 0.13607355952262878, "rewards/accuracy_reward": 0.8142208456993103, "rewards/format_reward": 1.0, "step": 309, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 363.7749938964844, "epoch": 0.005891855934619405, "grad_norm": 2.2107186154410865, "kl": 0.053466796875, "learning_rate": 9.999143491666893e-07, "loss": 0.0021, "reward": 1.7591158151626587, "reward_std": 0.22363010048866272, "rewards/accuracy_reward": 0.5991159081459045, "rewards/format_reward": 1.0, "step": 310, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 437.6750183105469, "epoch": 0.005910861921505274, "grad_norm": 1.7226129105690773, "kl": 0.0400390625, "learning_rate": 9.999137957053048e-07, "loss": 0.0016, "reward": 1.323028802871704, "reward_std": 0.4296836853027344, "rewards/accuracy_reward": 0.30802878737449646, "rewards/format_reward": 0.949999988079071, "step": 311, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 380.1499938964844, "epoch": 0.005929867908391143, "grad_norm": 4.031065740331065, "kl": 0.041748046875, "learning_rate": 9.999132404616411e-07, "loss": 0.0017, "reward": 1.590849757194519, "reward_std": 0.23907624185085297, "rewards/accuracy_reward": 0.4983498752117157, "rewards/format_reward": 1.0, "step": 312, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 434.1499938964844, "epoch": 0.005948873895277013, "grad_norm": 1.9627437006929076, "kl": 0.04736328125, "learning_rate": 9.999126834357003e-07, "loss": 0.0019, "reward": 1.938750147819519, "reward_std": 0.1775941550731659, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 313, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 389.5500183105469, "epoch": 0.0059678798821628815, "grad_norm": 2.697616212323388, "kl": 0.04443359375, "learning_rate": 9.999121246274844e-07, "loss": 0.0018, "reward": 1.9043397903442383, "reward_std": 0.24692252278327942, "rewards/accuracy_reward": 0.7355899214744568, "rewards/format_reward": 1.0, "step": 314, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 418.5249938964844, "epoch": 0.0059868858690487504, "grad_norm": 1.9088638488853837, "kl": 0.040283203125, "learning_rate": 9.999115640369952e-07, "loss": 0.0016, "reward": 1.7094920873641968, "reward_std": 0.06522587686777115, "rewards/accuracy_reward": 0.5194922685623169, "rewards/format_reward": 1.0, "step": 315, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 390.1750183105469, "epoch": 0.006005891855934619, "grad_norm": 3.3724263207938017, "kl": 0.048828125, "learning_rate": 9.99911001664235e-07, "loss": 0.002, "reward": 1.6162500381469727, "reward_std": 0.36210280656814575, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 316, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 437.9250183105469, "epoch": 0.006024897842820488, "grad_norm": 2.61414351147298, "kl": 0.03515625, "learning_rate": 9.999104375092055e-07, "loss": 0.0014, "reward": 1.5493055582046509, "reward_std": 0.32617637515068054, "rewards/accuracy_reward": 0.605555534362793, "rewards/format_reward": 0.9000000357627869, "step": 317, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 405.0500183105469, "epoch": 0.006043903829706357, "grad_norm": 2.2876864698009736, "kl": 0.041015625, "learning_rate": 9.999098715719089e-07, "loss": 0.0016, "reward": 1.7958471775054932, "reward_std": 0.1639290750026703, "rewards/accuracy_reward": 0.5970970988273621, "rewards/format_reward": 1.0, "step": 318, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 404.4750061035156, "epoch": 0.006062909816592226, "grad_norm": 1.8550519555730431, "kl": 0.0498046875, "learning_rate": 9.999093038523474e-07, "loss": 0.002, "reward": 1.9999569654464722, "reward_std": 0.10818469524383545, "rewards/accuracy_reward": 0.8362069129943848, "rewards/format_reward": 1.0, "step": 319, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 402.20001220703125, "epoch": 0.006081915803478096, "grad_norm": 2.988112669247468, "kl": 0.033203125, "learning_rate": 9.999087343505226e-07, "loss": 0.0013, "reward": 1.8866890668869019, "reward_std": 0.12124647945165634, "rewards/accuracy_reward": 0.7766891121864319, "rewards/format_reward": 1.0, "step": 320, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 429.45001220703125, "epoch": 0.006100921790363965, "grad_norm": 1.6852191045663105, "kl": 0.03662109375, "learning_rate": 9.999081630664368e-07, "loss": 0.0015, "reward": 1.6287879943847656, "reward_std": 0.30897006392478943, "rewards/accuracy_reward": 0.527538001537323, "rewards/format_reward": 1.0, "step": 321, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 399.3000183105469, "epoch": 0.006119927777249834, "grad_norm": 2.145423119385929, "kl": 0.037841796875, "learning_rate": 9.999075900000919e-07, "loss": 0.0015, "reward": 1.7036985158920288, "reward_std": 0.14790479838848114, "rewards/accuracy_reward": 0.6086985468864441, "rewards/format_reward": 1.0, "step": 322, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 377.7250061035156, "epoch": 0.006138933764135703, "grad_norm": 1.6399069447510508, "kl": 0.04052734375, "learning_rate": 9.9990701515149e-07, "loss": 0.0016, "reward": 1.9162501096725464, "reward_std": 0.28791746497154236, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 0.9750000238418579, "step": 323, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.3500061035156, "epoch": 0.006157939751021572, "grad_norm": 1.9349427211289458, "kl": 0.03515625, "learning_rate": 9.999064385206333e-07, "loss": 0.0014, "reward": 1.5591973066329956, "reward_std": 0.4124367833137512, "rewards/accuracy_reward": 0.6341972351074219, "rewards/format_reward": 0.925000011920929, "step": 324, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 480.0, "epoch": 0.0061769457379074405, "grad_norm": 1.4796136349327658, "kl": 0.031005859375, "learning_rate": 9.999058601075234e-07, "loss": 0.0012, "reward": 1.5225000381469727, "reward_std": 0.6301305890083313, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.7750000357627869, "step": 325, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 422.625, "epoch": 0.0061959517247933095, "grad_norm": 1.953144297505596, "kl": 0.041015625, "learning_rate": 9.99905279912163e-07, "loss": 0.0016, "reward": 2.0349998474121094, "reward_std": 0.14549851417541504, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 326, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 393.0500183105469, "epoch": 0.006214957711679179, "grad_norm": 1.8833621265410123, "kl": 0.046875, "learning_rate": 9.999046979345538e-07, "loss": 0.0019, "reward": 1.9712499380111694, "reward_std": 0.2521544098854065, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 327, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 427.875, "epoch": 0.006233963698565048, "grad_norm": 1.6375846005801433, "kl": 0.041015625, "learning_rate": 9.999041141746978e-07, "loss": 0.0016, "reward": 1.520936369895935, "reward_std": 0.4715319573879242, "rewards/accuracy_reward": 0.49843630194664, "rewards/format_reward": 0.949999988079071, "step": 328, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 394.125, "epoch": 0.006252969685450917, "grad_norm": 1.5972663021511073, "kl": 0.056640625, "learning_rate": 9.999035286325973e-07, "loss": 0.0023, "reward": 2.1449999809265137, "reward_std": 0.10506661981344223, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 329, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 384.9250183105469, "epoch": 0.006271975672336786, "grad_norm": 1.7875585389955349, "kl": 0.036376953125, "learning_rate": 9.999029413082543e-07, "loss": 0.0015, "reward": 1.7178245782852173, "reward_std": 0.1480218768119812, "rewards/accuracy_reward": 0.616574764251709, "rewards/format_reward": 1.0, "step": 330, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 445.4750061035156, "epoch": 0.006290981659222655, "grad_norm": 2.5762864445319504, "kl": 0.02734375, "learning_rate": 9.999023522016707e-07, "loss": 0.0011, "reward": 1.4148296117782593, "reward_std": 0.11809279769659042, "rewards/accuracy_reward": 0.37357956171035767, "rewards/format_reward": 1.0, "step": 331, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 410.5500183105469, "epoch": 0.006309987646108524, "grad_norm": 1.9593416173996685, "kl": 0.051513671875, "learning_rate": 9.999017613128492e-07, "loss": 0.0021, "reward": 2.0071799755096436, "reward_std": 0.2119959443807602, "rewards/accuracy_reward": 0.842180073261261, "rewards/format_reward": 1.0, "step": 332, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 422.0500183105469, "epoch": 0.006328993632994394, "grad_norm": 1.620746774279616, "kl": 0.0322265625, "learning_rate": 9.99901168641791e-07, "loss": 0.0013, "reward": 1.4732033014297485, "reward_std": 0.2191545069217682, "rewards/accuracy_reward": 0.4119533598423004, "rewards/format_reward": 1.0, "step": 333, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 409.3000183105469, "epoch": 0.006347999619880263, "grad_norm": 2.103129597124734, "kl": 0.05224609375, "learning_rate": 9.999005741884988e-07, "loss": 0.0021, "reward": 2.04705548286438, "reward_std": 0.03932321071624756, "rewards/accuracy_reward": 0.8808054327964783, "rewards/format_reward": 1.0, "step": 334, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 420.8999938964844, "epoch": 0.0063670056067661315, "grad_norm": 1.7259102939137225, "kl": 0.049072265625, "learning_rate": 9.99899977952975e-07, "loss": 0.002, "reward": 1.906690001487732, "reward_std": 0.15094737708568573, "rewards/accuracy_reward": 0.759190022945404, "rewards/format_reward": 1.0, "step": 335, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 468.7749938964844, "epoch": 0.006386011593652, "grad_norm": 1.6827581107423368, "kl": 0.0458984375, "learning_rate": 9.99899379935221e-07, "loss": 0.0018, "reward": 1.3009380102157593, "reward_std": 0.27229711413383484, "rewards/accuracy_reward": 0.429688036441803, "rewards/format_reward": 0.8500000238418579, "step": 336, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 436.3500061035156, "epoch": 0.006405017580537869, "grad_norm": 1.221869542475631, "kl": 0.05859375, "learning_rate": 9.998987801352394e-07, "loss": 0.0023, "reward": 1.6899999380111694, "reward_std": 0.06650522351264954, "rewards/accuracy_reward": 0.5512499809265137, "rewards/format_reward": 1.0, "step": 337, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 421.8999938964844, "epoch": 0.006424023567423738, "grad_norm": 2.024033674147605, "kl": 0.05126953125, "learning_rate": 9.998981785530324e-07, "loss": 0.002, "reward": 1.6939667463302612, "reward_std": 0.10195688158273697, "rewards/accuracy_reward": 0.6152166128158569, "rewards/format_reward": 1.0, "step": 338, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 416.9750061035156, "epoch": 0.006443029554309607, "grad_norm": 1.6706206833094994, "kl": 0.064453125, "learning_rate": 9.998975751886016e-07, "loss": 0.0026, "reward": 2.112499952316284, "reward_std": 0.18197335302829742, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 339, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 417.4250183105469, "epoch": 0.006462035541195477, "grad_norm": 1.5353881044695634, "kl": 0.0712890625, "learning_rate": 9.998969700419497e-07, "loss": 0.0029, "reward": 1.8312500715255737, "reward_std": 0.10216512531042099, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 340, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 403.4750061035156, "epoch": 0.006481041528081346, "grad_norm": 1.5509881900311082, "kl": 0.039306640625, "learning_rate": 9.998963631130785e-07, "loss": 0.0016, "reward": 1.501399278640747, "reward_std": 0.34407320618629456, "rewards/accuracy_reward": 0.5813993215560913, "rewards/format_reward": 0.875, "step": 341, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 428.375, "epoch": 0.006500047514967215, "grad_norm": 3.207707638276993, "kl": 0.06787109375, "learning_rate": 9.998957544019906e-07, "loss": 0.0027, "reward": 1.7646111249923706, "reward_std": 0.14869549870491028, "rewards/accuracy_reward": 0.6221112608909607, "rewards/format_reward": 1.0, "step": 342, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 414.6499938964844, "epoch": 0.006519053501853084, "grad_norm": 2.228689409588524, "kl": 0.0400390625, "learning_rate": 9.998951439086879e-07, "loss": 0.0016, "reward": 1.3966782093048096, "reward_std": 0.38540342450141907, "rewards/accuracy_reward": 0.3366781771183014, "rewards/format_reward": 0.9750000238418579, "step": 343, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 480.20001220703125, "epoch": 0.006538059488738953, "grad_norm": 1.5924744575492689, "kl": 0.04296875, "learning_rate": 9.998945316331725e-07, "loss": 0.0017, "reward": 1.6147444248199463, "reward_std": 0.2346866875886917, "rewards/accuracy_reward": 0.6559944152832031, "rewards/format_reward": 0.8500000238418579, "step": 344, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 391.5, "epoch": 0.006557065475624822, "grad_norm": 1.845479006981145, "kl": 0.0517578125, "learning_rate": 9.998939175754465e-07, "loss": 0.0021, "reward": 1.8144477605819702, "reward_std": 0.1335175484418869, "rewards/accuracy_reward": 0.6406978964805603, "rewards/format_reward": 1.0, "step": 345, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 413.0249938964844, "epoch": 0.0065760714625106905, "grad_norm": 2.1479510426702806, "kl": 0.0625, "learning_rate": 9.998933017355125e-07, "loss": 0.0025, "reward": 1.7408040761947632, "reward_std": 0.27547261118888855, "rewards/accuracy_reward": 0.5920543074607849, "rewards/format_reward": 1.0, "step": 346, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 423.5249938964844, "epoch": 0.00659507744939656, "grad_norm": 1.9013688449713897, "kl": 0.07763671875, "learning_rate": 9.998926841133723e-07, "loss": 0.0031, "reward": 2.0399999618530273, "reward_std": 0.10641022026538849, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 347, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 436.6000061035156, "epoch": 0.006614083436282429, "grad_norm": 1.6456004700430817, "kl": 0.0458984375, "learning_rate": 9.998920647090284e-07, "loss": 0.0018, "reward": 2.046250104904175, "reward_std": 0.2025504857301712, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 348, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 450.95001220703125, "epoch": 0.006633089423168298, "grad_norm": 1.6585390593393898, "kl": 0.05029296875, "learning_rate": 9.998914435224825e-07, "loss": 0.002, "reward": 1.7212451696395874, "reward_std": 0.1151413694024086, "rewards/accuracy_reward": 0.6524952054023743, "rewards/format_reward": 1.0, "step": 349, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 411.8000183105469, "epoch": 0.006652095410054167, "grad_norm": 1.7439355336313964, "kl": 0.04248046875, "learning_rate": 9.998908205537375e-07, "loss": 0.0017, "reward": 1.680641770362854, "reward_std": 0.06813951581716537, "rewards/accuracy_reward": 0.6068916916847229, "rewards/format_reward": 1.0, "step": 350, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 451.8999938964844, "epoch": 0.006671101396940036, "grad_norm": 1.8796301429933802, "kl": 0.053955078125, "learning_rate": 9.998901958027952e-07, "loss": 0.0022, "reward": 1.6309250593185425, "reward_std": 0.37135452032089233, "rewards/accuracy_reward": 0.6484249830245972, "rewards/format_reward": 0.949999988079071, "step": 351, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 380.625, "epoch": 0.006690107383825905, "grad_norm": 1.6554985002376068, "kl": 0.05419921875, "learning_rate": 9.998895692696581e-07, "loss": 0.0022, "reward": 1.5699999332427979, "reward_std": 0.21335946023464203, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 352, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 440.7749938964844, "epoch": 0.006709113370711774, "grad_norm": 2.6934214757735915, "kl": 0.06005859375, "learning_rate": 9.998889409543282e-07, "loss": 0.0024, "reward": 1.96999990940094, "reward_std": 0.11868967115879059, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9750000238418579, "step": 353, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 450.6750183105469, "epoch": 0.006728119357597644, "grad_norm": 1.8864143064385501, "kl": 0.03173828125, "learning_rate": 9.998883108568076e-07, "loss": 0.0013, "reward": 1.2243750095367432, "reward_std": 0.28353166580200195, "rewards/accuracy_reward": 0.2593750059604645, "rewards/format_reward": 0.925000011920929, "step": 354, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 472.8500061035156, "epoch": 0.0067471253444835125, "grad_norm": 1.608030280986472, "kl": 0.041015625, "learning_rate": 9.998876789770988e-07, "loss": 0.0016, "reward": 1.8496843576431274, "reward_std": 0.06093672662973404, "rewards/accuracy_reward": 0.7346843481063843, "rewards/format_reward": 1.0, "step": 355, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 466.57501220703125, "epoch": 0.0067661313313693815, "grad_norm": 1.2996917671241226, "kl": 0.043701171875, "learning_rate": 9.998870453152041e-07, "loss": 0.0018, "reward": 1.4723917245864868, "reward_std": 0.31451860070228577, "rewards/accuracy_reward": 0.45239168405532837, "rewards/format_reward": 0.9750000238418579, "step": 356, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 405.6000061035156, "epoch": 0.00678513731825525, "grad_norm": 1.6829502773588834, "kl": 0.043701171875, "learning_rate": 9.998864098711256e-07, "loss": 0.0018, "reward": 1.8915386199951172, "reward_std": 0.031106257811188698, "rewards/accuracy_reward": 0.7865384817123413, "rewards/format_reward": 1.0, "step": 357, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 473.45001220703125, "epoch": 0.006804143305141119, "grad_norm": 1.8442333951772039, "kl": 0.0361328125, "learning_rate": 9.998857726448657e-07, "loss": 0.0014, "reward": 1.6287158727645874, "reward_std": 0.24379150569438934, "rewards/accuracy_reward": 0.6162160038948059, "rewards/format_reward": 0.949999988079071, "step": 358, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 400.625, "epoch": 0.006823149292026988, "grad_norm": 2.089704128109271, "kl": 0.06494140625, "learning_rate": 9.998851336364266e-07, "loss": 0.0026, "reward": 1.9037498235702515, "reward_std": 0.2772168815135956, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 359, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 466.4750061035156, "epoch": 0.006842155278912857, "grad_norm": 1.5966099844671438, "kl": 0.03955078125, "learning_rate": 9.998844928458105e-07, "loss": 0.0016, "reward": 1.6224985122680664, "reward_std": 0.19626890122890472, "rewards/accuracy_reward": 0.6537485122680664, "rewards/format_reward": 0.8500000238418579, "step": 360, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 467.2749938964844, "epoch": 0.006861161265798727, "grad_norm": 1.2285476563398856, "kl": 0.0654296875, "learning_rate": 9.998838502730197e-07, "loss": 0.0026, "reward": 1.738376498222351, "reward_std": 0.18706470727920532, "rewards/accuracy_reward": 0.6521264314651489, "rewards/format_reward": 0.925000011920929, "step": 361, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 452.4750061035156, "epoch": 0.006880167252684596, "grad_norm": 2.8371789308331703, "kl": 0.052978515625, "learning_rate": 9.998832059180566e-07, "loss": 0.0021, "reward": 1.8680633306503296, "reward_std": 0.15290091931819916, "rewards/accuracy_reward": 0.7280632853507996, "rewards/format_reward": 1.0, "step": 362, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 426.5500183105469, "epoch": 0.006899173239570465, "grad_norm": 1.8890417363934238, "kl": 0.068359375, "learning_rate": 9.998825597809236e-07, "loss": 0.0027, "reward": 2.134999990463257, "reward_std": 0.10920830816030502, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 363, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 426.70001220703125, "epoch": 0.006918179226456334, "grad_norm": 2.087265422991059, "kl": 0.0517578125, "learning_rate": 9.998819118616225e-07, "loss": 0.0021, "reward": 1.8948386907577515, "reward_std": 0.1319272369146347, "rewards/accuracy_reward": 0.7548387050628662, "rewards/format_reward": 1.0, "step": 364, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 436.25, "epoch": 0.006937185213342203, "grad_norm": 2.655390747868118, "kl": 0.04150390625, "learning_rate": 9.998812621601563e-07, "loss": 0.0017, "reward": 1.2124675512313843, "reward_std": 0.365411639213562, "rewards/accuracy_reward": 0.2399676889181137, "rewards/format_reward": 0.949999988079071, "step": 365, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 462.8500061035156, "epoch": 0.0069561912002280716, "grad_norm": 2.1552617311384537, "kl": 0.058837890625, "learning_rate": 9.998806106765268e-07, "loss": 0.0023, "reward": 1.9191792011260986, "reward_std": 0.2516794204711914, "rewards/accuracy_reward": 0.7091791033744812, "rewards/format_reward": 0.9750000238418579, "step": 366, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 427.2250061035156, "epoch": 0.0069751971871139405, "grad_norm": 1.680821189014191, "kl": 0.046875, "learning_rate": 9.998799574107366e-07, "loss": 0.0019, "reward": 1.8996073007583618, "reward_std": 0.15216723084449768, "rewards/accuracy_reward": 0.749607264995575, "rewards/format_reward": 1.0, "step": 367, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 359.3999938964844, "epoch": 0.00699420317399981, "grad_norm": 1.7035912917544618, "kl": 0.046630859375, "learning_rate": 9.998793023627879e-07, "loss": 0.0019, "reward": 1.6501998901367188, "reward_std": 0.1932554692029953, "rewards/accuracy_reward": 0.6126998066902161, "rewards/format_reward": 1.0, "step": 368, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 457.57501220703125, "epoch": 0.007013209160885679, "grad_norm": 1.8126025432320474, "kl": 0.043212890625, "learning_rate": 9.998786455326828e-07, "loss": 0.0017, "reward": 1.6159191131591797, "reward_std": 0.4298867881298065, "rewards/accuracy_reward": 0.5759193301200867, "rewards/format_reward": 0.949999988079071, "step": 369, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.4750061035156, "epoch": 0.007032215147771548, "grad_norm": 1.8250904494274842, "kl": 0.068359375, "learning_rate": 9.998779869204243e-07, "loss": 0.0027, "reward": 2.0712497234344482, "reward_std": 0.18371863663196564, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 370, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 442.07501220703125, "epoch": 0.007051221134657417, "grad_norm": 3.1025661461638943, "kl": 0.051025390625, "learning_rate": 9.99877326526014e-07, "loss": 0.002, "reward": 1.9512499570846558, "reward_std": 0.16346576809883118, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 371, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 386.20001220703125, "epoch": 0.007070227121543286, "grad_norm": 2.1965569940976493, "kl": 0.05419921875, "learning_rate": 9.998766643494549e-07, "loss": 0.0022, "reward": 1.8899999856948853, "reward_std": 0.16408979892730713, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 372, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 424.125, "epoch": 0.007089233108429155, "grad_norm": 3.503327292169236, "kl": 0.0439453125, "learning_rate": 9.99876000390749e-07, "loss": 0.0018, "reward": 1.4891948699951172, "reward_std": 0.17772071063518524, "rewards/accuracy_reward": 0.5254448652267456, "rewards/format_reward": 0.9000000357627869, "step": 373, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 441.7749938964844, "epoch": 0.007108239095315024, "grad_norm": 1.8971048143789693, "kl": 0.0277099609375, "learning_rate": 9.998753346498988e-07, "loss": 0.0011, "reward": 1.3545420169830322, "reward_std": 0.2112666368484497, "rewards/accuracy_reward": 0.36204198002815247, "rewards/format_reward": 1.0, "step": 374, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 398.3500061035156, "epoch": 0.007127245082200894, "grad_norm": 1.6972808617420034, "kl": 0.0439453125, "learning_rate": 9.998746671269063e-07, "loss": 0.0018, "reward": 1.582077980041504, "reward_std": 0.0559798963367939, "rewards/accuracy_reward": 0.43707799911499023, "rewards/format_reward": 1.0, "step": 375, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 425.32501220703125, "epoch": 0.0071462510690867625, "grad_norm": 1.6154083998539943, "kl": 0.046142578125, "learning_rate": 9.998739978217744e-07, "loss": 0.0018, "reward": 1.5512499809265137, "reward_std": 0.19012121856212616, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 376, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 444.70001220703125, "epoch": 0.0071652570559726314, "grad_norm": 1.790521644083571, "kl": 0.05322265625, "learning_rate": 9.998733267345052e-07, "loss": 0.0021, "reward": 1.4208310842514038, "reward_std": 0.3444187343120575, "rewards/accuracy_reward": 0.45208102464675903, "rewards/format_reward": 0.9750000238418579, "step": 377, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 387.3999938964844, "epoch": 0.0071842630428585, "grad_norm": 2.0289734289878614, "kl": 0.0556640625, "learning_rate": 9.998726538651012e-07, "loss": 0.0022, "reward": 1.8264998197555542, "reward_std": 0.22795183956623077, "rewards/accuracy_reward": 0.727749764919281, "rewards/format_reward": 1.0, "step": 378, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 417.82501220703125, "epoch": 0.007203269029744369, "grad_norm": 2.437693174872833, "kl": 0.042236328125, "learning_rate": 9.998719792135648e-07, "loss": 0.0017, "reward": 1.6114012002944946, "reward_std": 0.17477957904338837, "rewards/accuracy_reward": 0.5289012789726257, "rewards/format_reward": 1.0, "step": 379, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 428.7250061035156, "epoch": 0.007222275016630238, "grad_norm": 1.8435416122800077, "kl": 0.05615234375, "learning_rate": 9.998713027798984e-07, "loss": 0.0022, "reward": 1.7316067218780518, "reward_std": 0.032526444643735886, "rewards/accuracy_reward": 0.5466069579124451, "rewards/format_reward": 1.0, "step": 380, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.375, "epoch": 0.007241281003516108, "grad_norm": 1.9998852169321388, "kl": 0.047607421875, "learning_rate": 9.998706245641044e-07, "loss": 0.0019, "reward": 1.8103069067001343, "reward_std": 0.16898544132709503, "rewards/accuracy_reward": 0.6915570497512817, "rewards/format_reward": 0.9750000238418579, "step": 381, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 398.2250061035156, "epoch": 0.007260286990401977, "grad_norm": 7.520919082948273, "kl": 0.05322265625, "learning_rate": 9.998699445661852e-07, "loss": 0.0021, "reward": 1.8115625381469727, "reward_std": 0.30977484583854675, "rewards/accuracy_reward": 0.714062511920929, "rewards/format_reward": 1.0, "step": 382, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 414.7250061035156, "epoch": 0.007279292977287846, "grad_norm": 1.562333371677345, "kl": 0.05908203125, "learning_rate": 9.99869262786143e-07, "loss": 0.0024, "reward": 1.7900002002716064, "reward_std": 0.14036639034748077, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 383, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 433.2250061035156, "epoch": 0.007298298964173715, "grad_norm": 1.787448528538503, "kl": 0.040771484375, "learning_rate": 9.998685792239805e-07, "loss": 0.0016, "reward": 1.6576591730117798, "reward_std": 0.266426146030426, "rewards/accuracy_reward": 0.5114091038703918, "rewards/format_reward": 1.0, "step": 384, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 456.32501220703125, "epoch": 0.007317304951059584, "grad_norm": 1.893005359468317, "kl": 0.0303955078125, "learning_rate": 9.998678938797003e-07, "loss": 0.0012, "reward": 1.054965615272522, "reward_std": 0.42117562890052795, "rewards/accuracy_reward": 0.28371554613113403, "rewards/format_reward": 0.800000011920929, "step": 385, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 412.20001220703125, "epoch": 0.007336310937945453, "grad_norm": 3.547055383632895, "kl": 0.043212890625, "learning_rate": 9.998672067533045e-07, "loss": 0.0017, "reward": 1.8491076231002808, "reward_std": 0.35944467782974243, "rewards/accuracy_reward": 0.736607551574707, "rewards/format_reward": 1.0, "step": 386, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 427.9750061035156, "epoch": 0.0073553169248313215, "grad_norm": 1.8100462172422234, "kl": 0.054931640625, "learning_rate": 9.998665178447959e-07, "loss": 0.0022, "reward": 1.9475001096725464, "reward_std": 0.21716085076332092, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 387, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 411.20001220703125, "epoch": 0.007374322911717191, "grad_norm": 1.9122638378072812, "kl": 0.05859375, "learning_rate": 9.998658271541765e-07, "loss": 0.0023, "reward": 2.0924999713897705, "reward_std": 0.13992176949977875, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 388, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 442.3999938964844, "epoch": 0.00739332889860306, "grad_norm": 1.648036839441318, "kl": 0.0390625, "learning_rate": 9.998651346814491e-07, "loss": 0.0016, "reward": 1.4970048666000366, "reward_std": 0.155747190117836, "rewards/accuracy_reward": 0.4220047891139984, "rewards/format_reward": 1.0, "step": 389, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 398.8500061035156, "epoch": 0.007412334885488929, "grad_norm": 1.884855440401696, "kl": 0.03515625, "learning_rate": 9.998644404266159e-07, "loss": 0.0014, "reward": 2.0762500762939453, "reward_std": 0.04517301544547081, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 390, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 397.875, "epoch": 0.007431340872374798, "grad_norm": 2.012870740898151, "kl": 0.0732421875, "learning_rate": 9.998637443896798e-07, "loss": 0.0029, "reward": 2.024012327194214, "reward_std": 0.03822634741663933, "rewards/accuracy_reward": 0.8040122985839844, "rewards/format_reward": 1.0, "step": 391, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 437.4750061035156, "epoch": 0.007450346859260667, "grad_norm": 2.136602140886897, "kl": 0.0517578125, "learning_rate": 9.998630465706429e-07, "loss": 0.0021, "reward": 1.887841820716858, "reward_std": 0.29177364706993103, "rewards/accuracy_reward": 0.7940918803215027, "rewards/format_reward": 1.0, "step": 392, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 439.45001220703125, "epoch": 0.007469352846146536, "grad_norm": 1.6753465748215923, "kl": 0.047119140625, "learning_rate": 9.998623469695077e-07, "loss": 0.0019, "reward": 1.6429789066314697, "reward_std": 0.2791181206703186, "rewards/accuracy_reward": 0.5617288947105408, "rewards/format_reward": 0.949999988079071, "step": 393, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 457.0, "epoch": 0.007488358833032405, "grad_norm": 3.4292571857131438, "kl": 0.052978515625, "learning_rate": 9.998616455862769e-07, "loss": 0.0021, "reward": 1.5460175275802612, "reward_std": 0.19698475301265717, "rewards/accuracy_reward": 0.5110175013542175, "rewards/format_reward": 0.9000000357627869, "step": 394, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.0500183105469, "epoch": 0.007507364819918275, "grad_norm": 1.8761488822131187, "kl": 0.05859375, "learning_rate": 9.99860942420953e-07, "loss": 0.0023, "reward": 1.9135173559188843, "reward_std": 0.1456160545349121, "rewards/accuracy_reward": 0.7097675204277039, "rewards/format_reward": 1.0, "step": 395, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 371.375, "epoch": 0.007526370806804144, "grad_norm": 2.4885891300655985, "kl": 0.06689453125, "learning_rate": 9.998602374735382e-07, "loss": 0.0027, "reward": 2.1424999237060547, "reward_std": 0.12062933295965195, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 396, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 423.32501220703125, "epoch": 0.0075453767936900125, "grad_norm": 1.9929969264561822, "kl": 0.0517578125, "learning_rate": 9.998595307440355e-07, "loss": 0.0021, "reward": 1.6995066404342651, "reward_std": 0.31410613656044006, "rewards/accuracy_reward": 0.6020066142082214, "rewards/format_reward": 1.0, "step": 397, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 448.9250183105469, "epoch": 0.007564382780575881, "grad_norm": 1.6375051527548983, "kl": 0.0478515625, "learning_rate": 9.99858822232447e-07, "loss": 0.0019, "reward": 1.507171630859375, "reward_std": 0.1280670464038849, "rewards/accuracy_reward": 0.5134217143058777, "rewards/format_reward": 1.0, "step": 398, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 411.45001220703125, "epoch": 0.00758338876746175, "grad_norm": 2.1896361836524307, "kl": 0.06396484375, "learning_rate": 9.998581119387757e-07, "loss": 0.0026, "reward": 2.0359530448913574, "reward_std": 0.21659286320209503, "rewards/accuracy_reward": 0.8134528994560242, "rewards/format_reward": 1.0, "step": 399, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 367.7250061035156, "epoch": 0.007602394754347619, "grad_norm": 1.84768057076166, "kl": 0.0458984375, "learning_rate": 9.998573998630233e-07, "loss": 0.0018, "reward": 1.475223422050476, "reward_std": 0.44805580377578735, "rewards/accuracy_reward": 0.45022326707839966, "rewards/format_reward": 1.0, "step": 400, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 406.0249938964844, "epoch": 0.007621400741233488, "grad_norm": 1.7659669090674897, "kl": 0.041259765625, "learning_rate": 9.998566860051931e-07, "loss": 0.0016, "reward": 1.8125, "reward_std": 0.34115922451019287, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 401, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 435.7749938964844, "epoch": 0.007640406728119358, "grad_norm": 2.811419109647443, "kl": 0.04443359375, "learning_rate": 9.998559703652875e-07, "loss": 0.0018, "reward": 1.9911285638809204, "reward_std": 0.11318478733301163, "rewards/accuracy_reward": 0.8373786211013794, "rewards/format_reward": 1.0, "step": 402, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 449.6499938964844, "epoch": 0.007659412715005227, "grad_norm": 1.783109772005397, "kl": 0.051513671875, "learning_rate": 9.998552529433087e-07, "loss": 0.0021, "reward": 1.385178565979004, "reward_std": 0.351605087518692, "rewards/accuracy_reward": 0.3514285683631897, "rewards/format_reward": 0.9750000238418579, "step": 403, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 436.0, "epoch": 0.007678418701891096, "grad_norm": 1.7785382428420244, "kl": 0.04931640625, "learning_rate": 9.998545337392597e-07, "loss": 0.002, "reward": 1.9119949340820312, "reward_std": 0.15657897293567657, "rewards/accuracy_reward": 0.7657451033592224, "rewards/format_reward": 1.0, "step": 404, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 451.0, "epoch": 0.007697424688776965, "grad_norm": 3.121557264536802, "kl": 0.042724609375, "learning_rate": 9.998538127531426e-07, "loss": 0.0017, "reward": 1.6889053583145142, "reward_std": 0.2106253206729889, "rewards/accuracy_reward": 0.5889055132865906, "rewards/format_reward": 1.0, "step": 405, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 446.2749938964844, "epoch": 0.007716430675662834, "grad_norm": 2.7189021736733188, "kl": 0.04833984375, "learning_rate": 9.998530899849608e-07, "loss": 0.0019, "reward": 1.7862499952316284, "reward_std": 0.4135487675666809, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 0.9750000238418579, "step": 406, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 433.1000061035156, "epoch": 0.007735436662548703, "grad_norm": 1.5164413881509486, "kl": 0.060546875, "learning_rate": 9.998523654347158e-07, "loss": 0.0024, "reward": 1.7937500476837158, "reward_std": 0.04809223487973213, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 407, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 421.0249938964844, "epoch": 0.0077544426494345715, "grad_norm": 1.7936475114070736, "kl": 0.06396484375, "learning_rate": 9.998516391024107e-07, "loss": 0.0026, "reward": 2.2200000286102295, "reward_std": 0.19642554223537445, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 0.9750000238418579, "step": 408, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 436.5249938964844, "epoch": 0.007773448636320441, "grad_norm": 2.469095547587512, "kl": 0.056396484375, "learning_rate": 9.998509109880482e-07, "loss": 0.0022, "reward": 1.8845361471176147, "reward_std": 0.3339006304740906, "rewards/accuracy_reward": 0.6895362138748169, "rewards/format_reward": 1.0, "step": 409, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 396.1499938964844, "epoch": 0.00779245462320631, "grad_norm": 1.6910365414575763, "kl": 0.038818359375, "learning_rate": 9.99850181091631e-07, "loss": 0.0016, "reward": 1.8951218128204346, "reward_std": 0.03609613701701164, "rewards/accuracy_reward": 0.795121967792511, "rewards/format_reward": 1.0, "step": 410, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 385.6499938964844, "epoch": 0.007811460610092179, "grad_norm": 2.4764594373217266, "kl": 0.050048828125, "learning_rate": 9.998494494131611e-07, "loss": 0.002, "reward": 1.8788856267929077, "reward_std": 0.07852822542190552, "rewards/accuracy_reward": 0.7288856506347656, "rewards/format_reward": 1.0, "step": 411, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 377.875, "epoch": 0.007830466596978048, "grad_norm": 1.6739286220487823, "kl": 0.047119140625, "learning_rate": 9.998487159526416e-07, "loss": 0.0019, "reward": 1.8614593744277954, "reward_std": 0.11115310341119766, "rewards/accuracy_reward": 0.7039594650268555, "rewards/format_reward": 1.0, "step": 412, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 427.8999938964844, "epoch": 0.007849472583863917, "grad_norm": 1.8420906369117143, "kl": 0.036376953125, "learning_rate": 9.998479807100753e-07, "loss": 0.0015, "reward": 1.5246965885162354, "reward_std": 0.22792772948741913, "rewards/accuracy_reward": 0.5071965456008911, "rewards/format_reward": 1.0, "step": 413, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 416.45001220703125, "epoch": 0.007868478570749786, "grad_norm": 1.9987084326326474, "kl": 0.064453125, "learning_rate": 9.99847243685464e-07, "loss": 0.0026, "reward": 2.083038806915283, "reward_std": 0.18066976964473724, "rewards/accuracy_reward": 0.8742886781692505, "rewards/format_reward": 1.0, "step": 414, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 404.2250061035156, "epoch": 0.007887484557635655, "grad_norm": 1.6017550448750113, "kl": 0.05712890625, "learning_rate": 9.998465048788114e-07, "loss": 0.0023, "reward": 1.652500033378601, "reward_std": 0.25705400109291077, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 415, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 421.9250183105469, "epoch": 0.007906490544521524, "grad_norm": 2.317179197968382, "kl": 0.042236328125, "learning_rate": 9.998457642901193e-07, "loss": 0.0017, "reward": 1.4829562902450562, "reward_std": 0.3408910036087036, "rewards/accuracy_reward": 0.5017063617706299, "rewards/format_reward": 0.925000011920929, "step": 416, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 420.625, "epoch": 0.007925496531407393, "grad_norm": 1.7210194138161985, "kl": 0.04541015625, "learning_rate": 9.998450219193906e-07, "loss": 0.0018, "reward": 1.8924999237060547, "reward_std": 0.1583809107542038, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 417, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 404.6000061035156, "epoch": 0.007944502518293262, "grad_norm": 2.5968579347435194, "kl": 0.03955078125, "learning_rate": 9.99844277766628e-07, "loss": 0.0016, "reward": 1.7641693353652954, "reward_std": 0.14600078761577606, "rewards/accuracy_reward": 0.7091692686080933, "rewards/format_reward": 0.9750000238418579, "step": 418, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 418.0249938964844, "epoch": 0.007963508505179132, "grad_norm": 2.1050169840326096, "kl": 0.0556640625, "learning_rate": 9.998435318318344e-07, "loss": 0.0022, "reward": 2.0356645584106445, "reward_std": 0.15891394019126892, "rewards/accuracy_reward": 0.8581647276878357, "rewards/format_reward": 0.9750000238418579, "step": 419, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 387.32501220703125, "epoch": 0.007982514492065001, "grad_norm": 1.9107633229926246, "kl": 0.080078125, "learning_rate": 9.99842784115012e-07, "loss": 0.0032, "reward": 2.1649999618530273, "reward_std": 0.13496284186840057, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 420, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 443.20001220703125, "epoch": 0.00800152047895087, "grad_norm": 1.3999056191462678, "kl": 0.033203125, "learning_rate": 9.998420346161635e-07, "loss": 0.0013, "reward": 1.5151971578598022, "reward_std": 0.3034624755382538, "rewards/accuracy_reward": 0.5676971673965454, "rewards/format_reward": 0.9000000357627869, "step": 421, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.008020526465836739, "grad_norm": 6.772662006510986, "kl": 0.054931640625, "learning_rate": 9.99841283335292e-07, "loss": 0.0022, "reward": 1.835519790649414, "reward_std": 0.06594506651163101, "rewards/accuracy_reward": 0.6855198740959167, "rewards/format_reward": 1.0, "step": 422, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 433.95001220703125, "epoch": 0.008039532452722608, "grad_norm": 3.103936355535458, "kl": 0.050048828125, "learning_rate": 9.998405302723999e-07, "loss": 0.002, "reward": 1.5305408239364624, "reward_std": 0.14203767478466034, "rewards/accuracy_reward": 0.5067909955978394, "rewards/format_reward": 1.0, "step": 423, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 437.75, "epoch": 0.008058538439608477, "grad_norm": 1.751885930886735, "kl": 0.045654296875, "learning_rate": 9.998397754274896e-07, "loss": 0.0018, "reward": 1.7733423709869385, "reward_std": 0.25026729702949524, "rewards/accuracy_reward": 0.6433423161506653, "rewards/format_reward": 0.9750000238418579, "step": 424, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 455.2749938964844, "epoch": 0.008077544426494346, "grad_norm": 1.594292295318808, "kl": 0.040283203125, "learning_rate": 9.998390188005643e-07, "loss": 0.0016, "reward": 1.3378807306289673, "reward_std": 0.2880254089832306, "rewards/accuracy_reward": 0.36163076758384705, "rewards/format_reward": 0.925000011920929, "step": 425, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 456.1750183105469, "epoch": 0.008096550413380215, "grad_norm": 1.5825411019513485, "kl": 0.029296875, "learning_rate": 9.998382603916263e-07, "loss": 0.0012, "reward": 1.5910849571228027, "reward_std": 0.3767656087875366, "rewards/accuracy_reward": 0.5748350620269775, "rewards/format_reward": 0.949999988079071, "step": 426, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 490.4250183105469, "epoch": 0.008115556400266084, "grad_norm": 9.285023650541742, "kl": 0.0546875, "learning_rate": 9.998375002006786e-07, "loss": 0.0022, "reward": 1.625339150428772, "reward_std": 0.37460535764694214, "rewards/accuracy_reward": 0.6265891194343567, "rewards/format_reward": 0.824999988079071, "step": 427, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.8999938964844, "epoch": 0.008134562387151953, "grad_norm": 1.7176171170008445, "kl": 0.05908203125, "learning_rate": 9.998367382277238e-07, "loss": 0.0024, "reward": 1.8588972091674805, "reward_std": 0.12745192646980286, "rewards/accuracy_reward": 0.7801470756530762, "rewards/format_reward": 1.0, "step": 428, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 445.20001220703125, "epoch": 0.008153568374037821, "grad_norm": 3.7194328086291146, "kl": 0.061279296875, "learning_rate": 9.998359744727647e-07, "loss": 0.0025, "reward": 1.848841905593872, "reward_std": 0.16556741297245026, "rewards/accuracy_reward": 0.7400919198989868, "rewards/format_reward": 1.0, "step": 429, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 463.82501220703125, "epoch": 0.00817257436092369, "grad_norm": 1.7967336621898506, "kl": 0.043212890625, "learning_rate": 9.998352089358037e-07, "loss": 0.0017, "reward": 1.481200098991394, "reward_std": 0.19158399105072021, "rewards/accuracy_reward": 0.4224500358104706, "rewards/format_reward": 1.0, "step": 430, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 380.9750061035156, "epoch": 0.00819158034780956, "grad_norm": 8.46157327695237, "kl": 0.06396484375, "learning_rate": 9.99834441616844e-07, "loss": 0.0026, "reward": 2.0628411769866943, "reward_std": 0.053140003234148026, "rewards/accuracy_reward": 0.8340908885002136, "rewards/format_reward": 1.0, "step": 431, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 452.7250061035156, "epoch": 0.008210586334695428, "grad_norm": 3.2258464214695555, "kl": 0.0859375, "learning_rate": 9.99833672515888e-07, "loss": 0.0034, "reward": 2.2587499618530273, "reward_std": 0.16861510276794434, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 432, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 482.5500183105469, "epoch": 0.008229592321581299, "grad_norm": 2.3400375203817796, "kl": 0.04150390625, "learning_rate": 9.998329016329386e-07, "loss": 0.0017, "reward": 1.6458524465560913, "reward_std": 0.4171806275844574, "rewards/accuracy_reward": 0.6333524584770203, "rewards/format_reward": 0.925000011920929, "step": 433, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 464.8999938964844, "epoch": 0.008248598308467168, "grad_norm": 1.996777851806519, "kl": 0.061279296875, "learning_rate": 9.998321289679984e-07, "loss": 0.0025, "reward": 1.9785350561141968, "reward_std": 0.13937710225582123, "rewards/accuracy_reward": 0.8335351943969727, "rewards/format_reward": 0.9750000238418579, "step": 434, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 482.2749938964844, "epoch": 0.008267604295353037, "grad_norm": 6.700843684552652, "kl": 0.052978515625, "learning_rate": 9.998313545210703e-07, "loss": 0.0021, "reward": 1.4458342790603638, "reward_std": 0.40489354729652405, "rewards/accuracy_reward": 0.40333423018455505, "rewards/format_reward": 0.949999988079071, "step": 435, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 451.1499938964844, "epoch": 0.008286610282238906, "grad_norm": 1.3066923042092622, "kl": 0.0576171875, "learning_rate": 9.99830578292157e-07, "loss": 0.0023, "reward": 1.6087499856948853, "reward_std": 0.049174584448337555, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 436, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 444.70001220703125, "epoch": 0.008305616269124775, "grad_norm": 2.2048405462097587, "kl": 0.04345703125, "learning_rate": 9.998298002812611e-07, "loss": 0.0017, "reward": 1.5705558061599731, "reward_std": 0.2630314528942108, "rewards/accuracy_reward": 0.5193058848381042, "rewards/format_reward": 0.9750000238418579, "step": 437, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 445.8000183105469, "epoch": 0.008324622256010644, "grad_norm": 2.556293043683744, "kl": 0.06494140625, "learning_rate": 9.998290204883858e-07, "loss": 0.0026, "reward": 2.2065789699554443, "reward_std": 0.09058817476034164, "rewards/accuracy_reward": 0.9253290295600891, "rewards/format_reward": 1.0, "step": 438, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 451.1000061035156, "epoch": 0.008343628242896512, "grad_norm": 1.6765442463037867, "kl": 0.0654296875, "learning_rate": 9.998282389135336e-07, "loss": 0.0026, "reward": 2.0200002193450928, "reward_std": 0.10454743355512619, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 439, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 450.875, "epoch": 0.008362634229782381, "grad_norm": 1.7624051832032692, "kl": 0.052734375, "learning_rate": 9.998274555567072e-07, "loss": 0.0021, "reward": 1.876250147819519, "reward_std": 0.20211093127727509, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.9750000238418579, "step": 440, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 409.375, "epoch": 0.00838164021666825, "grad_norm": 1.502506444784469, "kl": 0.047607421875, "learning_rate": 9.998266704179095e-07, "loss": 0.0019, "reward": 1.6598774194717407, "reward_std": 0.20364916324615479, "rewards/accuracy_reward": 0.564877450466156, "rewards/format_reward": 1.0, "step": 441, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 412.70001220703125, "epoch": 0.00840064620355412, "grad_norm": 1.2978180991720345, "kl": 0.050537109375, "learning_rate": 9.998258834971435e-07, "loss": 0.002, "reward": 1.5098358392715454, "reward_std": 0.18291102349758148, "rewards/accuracy_reward": 0.5410858392715454, "rewards/format_reward": 0.949999988079071, "step": 442, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 379.5, "epoch": 0.008419652190439988, "grad_norm": 1.598554996381953, "kl": 0.078125, "learning_rate": 9.998250947944114e-07, "loss": 0.0031, "reward": 1.5625, "reward_std": 0.12271541357040405, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 443, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 444.0249938964844, "epoch": 0.008438658177325857, "grad_norm": 1.7640028260705563, "kl": 0.07373046875, "learning_rate": 9.998243043097168e-07, "loss": 0.003, "reward": 1.9037498235702515, "reward_std": 0.12421140819787979, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 444, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 439.125, "epoch": 0.008457664164211726, "grad_norm": 2.1733653590761377, "kl": 0.06787109375, "learning_rate": 9.99823512043062e-07, "loss": 0.0027, "reward": 2.233750104904175, "reward_std": 0.13382382690906525, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 445, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.82501220703125, "epoch": 0.008476670151097595, "grad_norm": 1.769665720019225, "kl": 0.050537109375, "learning_rate": 9.9982271799445e-07, "loss": 0.002, "reward": 1.7072620391845703, "reward_std": 0.2012491673231125, "rewards/accuracy_reward": 0.6085121035575867, "rewards/format_reward": 1.0, "step": 446, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 402.5, "epoch": 0.008495676137983466, "grad_norm": 2.008099728650602, "kl": 0.06201171875, "learning_rate": 9.998219221638836e-07, "loss": 0.0025, "reward": 2.0126137733459473, "reward_std": 0.1306818574666977, "rewards/accuracy_reward": 0.8488636016845703, "rewards/format_reward": 1.0, "step": 447, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 391.4750061035156, "epoch": 0.008514682124869334, "grad_norm": 3.8300353028265888, "kl": 0.06689453125, "learning_rate": 9.998211245513654e-07, "loss": 0.0027, "reward": 2.1112499237060547, "reward_std": 0.2548518776893616, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 448, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 436.70001220703125, "epoch": 0.008533688111755203, "grad_norm": 2.2214874490514793, "kl": 0.06689453125, "learning_rate": 9.998203251568987e-07, "loss": 0.0027, "reward": 1.5724999904632568, "reward_std": 0.29077717661857605, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 449, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 443.7749938964844, "epoch": 0.008552694098641072, "grad_norm": 3.765816352045816, "kl": 0.0439453125, "learning_rate": 9.99819523980486e-07, "loss": 0.0018, "reward": 1.750025749206543, "reward_std": 0.1611539125442505, "rewards/accuracy_reward": 0.6850257515907288, "rewards/format_reward": 0.925000011920929, "step": 450, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 379.3999938964844, "epoch": 0.008571700085526941, "grad_norm": 1.6304799044093234, "kl": 0.07177734375, "learning_rate": 9.998187210221304e-07, "loss": 0.0029, "reward": 1.6189264059066772, "reward_std": 0.10391286760568619, "rewards/accuracy_reward": 0.5301764607429504, "rewards/format_reward": 1.0, "step": 451, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 431.1499938964844, "epoch": 0.00859070607241281, "grad_norm": 6.060644775858865, "kl": 0.06787109375, "learning_rate": 9.998179162818347e-07, "loss": 0.0027, "reward": 2.291249990463257, "reward_std": 0.03622095659375191, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 452, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 416.25, "epoch": 0.008609712059298679, "grad_norm": 2.5573521251097415, "kl": 0.046142578125, "learning_rate": 9.998171097596018e-07, "loss": 0.0018, "reward": 1.8114269971847534, "reward_std": 0.1684807389974594, "rewards/accuracy_reward": 0.6476770639419556, "rewards/format_reward": 1.0, "step": 453, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 414.5500183105469, "epoch": 0.008628718046184548, "grad_norm": 44.9747760791854, "kl": 0.05322265625, "learning_rate": 9.998163014554343e-07, "loss": 0.0021, "reward": 1.611638069152832, "reward_std": 0.28337562084198, "rewards/accuracy_reward": 0.5853880047798157, "rewards/format_reward": 0.949999988079071, "step": 454, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 395.32501220703125, "epoch": 0.008647724033070417, "grad_norm": 1.4439523409893464, "kl": 0.048095703125, "learning_rate": 9.998154913693353e-07, "loss": 0.0019, "reward": 1.662500023841858, "reward_std": 0.3478153645992279, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 455, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 394.6750183105469, "epoch": 0.008666730019956286, "grad_norm": 1.8460985799610978, "kl": 0.044677734375, "learning_rate": 9.998146795013077e-07, "loss": 0.0018, "reward": 1.8881248235702515, "reward_std": 0.2242121696472168, "rewards/accuracy_reward": 0.7906250357627869, "rewards/format_reward": 1.0, "step": 456, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 442.57501220703125, "epoch": 0.008685736006842155, "grad_norm": 1.5509793941472338, "kl": 0.047607421875, "learning_rate": 9.998138658513542e-07, "loss": 0.0019, "reward": 2.0087499618530273, "reward_std": 0.2518826425075531, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 0.9750000238418579, "step": 457, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 391.1750183105469, "epoch": 0.008704741993728024, "grad_norm": 1.8502122141626356, "kl": 0.0546875, "learning_rate": 9.998130504194779e-07, "loss": 0.0022, "reward": 1.9807662963867188, "reward_std": 0.24359726905822754, "rewards/accuracy_reward": 0.8107662200927734, "rewards/format_reward": 1.0, "step": 458, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 413.0, "epoch": 0.008723747980613893, "grad_norm": 1.989755215059745, "kl": 0.051025390625, "learning_rate": 9.998122332056818e-07, "loss": 0.002, "reward": 1.8674999475479126, "reward_std": 0.11793271452188492, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 459, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 405.0249938964844, "epoch": 0.008742753967499763, "grad_norm": 2.0166022130076526, "kl": 0.0703125, "learning_rate": 9.998114142099685e-07, "loss": 0.0028, "reward": 2.0862159729003906, "reward_std": 0.04510483145713806, "rewards/accuracy_reward": 0.8499659895896912, "rewards/format_reward": 1.0, "step": 460, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 450.0, "epoch": 0.008761759954385632, "grad_norm": 2.151680256921219, "kl": 0.0517578125, "learning_rate": 9.998105934323412e-07, "loss": 0.0021, "reward": 1.7433960437774658, "reward_std": 0.2005983144044876, "rewards/accuracy_reward": 0.68464595079422, "rewards/format_reward": 0.925000011920929, "step": 461, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 385.875, "epoch": 0.008780765941271501, "grad_norm": 2.6575912642055526, "kl": 0.044189453125, "learning_rate": 9.998097708728029e-07, "loss": 0.0018, "reward": 1.9312794208526611, "reward_std": 0.0516841895878315, "rewards/accuracy_reward": 0.8750292658805847, "rewards/format_reward": 1.0, "step": 462, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 388.625, "epoch": 0.00879977192815737, "grad_norm": 1.7631730380466322, "kl": 0.06494140625, "learning_rate": 9.99808946531356e-07, "loss": 0.0026, "reward": 2.0160138607025146, "reward_std": 0.058510977774858475, "rewards/accuracy_reward": 0.9047636389732361, "rewards/format_reward": 1.0, "step": 463, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 439.20001220703125, "epoch": 0.008818777915043239, "grad_norm": 2.5066464555266075, "kl": 0.054443359375, "learning_rate": 9.99808120408004e-07, "loss": 0.0022, "reward": 1.8968384265899658, "reward_std": 0.04898768290877342, "rewards/accuracy_reward": 0.740588366985321, "rewards/format_reward": 1.0, "step": 464, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 406.1000061035156, "epoch": 0.008837783901929108, "grad_norm": 2.2478606074374317, "kl": 0.0732421875, "learning_rate": 9.998072925027496e-07, "loss": 0.0029, "reward": 1.5600000619888306, "reward_std": 0.42751264572143555, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 465, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 416.32501220703125, "epoch": 0.008856789888814977, "grad_norm": 1.859802253341565, "kl": 0.06689453125, "learning_rate": 9.998064628155958e-07, "loss": 0.0027, "reward": 1.962499976158142, "reward_std": 0.1362399160861969, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 466, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 402.2250061035156, "epoch": 0.008875795875700846, "grad_norm": 1.5858260461104032, "kl": 0.050537109375, "learning_rate": 9.998056313465455e-07, "loss": 0.002, "reward": 1.6306251287460327, "reward_std": 0.1722022444009781, "rewards/accuracy_reward": 0.590624988079071, "rewards/format_reward": 1.0, "step": 467, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 450.6499938964844, "epoch": 0.008894801862586715, "grad_norm": 1.702762047523296, "kl": 0.05908203125, "learning_rate": 9.998047980956018e-07, "loss": 0.0024, "reward": 1.7095654010772705, "reward_std": 0.3553208112716675, "rewards/accuracy_reward": 0.7445651888847351, "rewards/format_reward": 0.9000000357627869, "step": 468, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 417.0249938964844, "epoch": 0.008913807849472584, "grad_norm": 1.7034703754068596, "kl": 0.0478515625, "learning_rate": 9.998039630627675e-07, "loss": 0.0019, "reward": 1.692050576210022, "reward_std": 0.17455986142158508, "rewards/accuracy_reward": 0.6433005332946777, "rewards/format_reward": 1.0, "step": 469, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.57501220703125, "epoch": 0.008932813836358452, "grad_norm": 1.657649325279252, "kl": 0.0634765625, "learning_rate": 9.998031262480458e-07, "loss": 0.0025, "reward": 1.9054111242294312, "reward_std": 0.2784424424171448, "rewards/accuracy_reward": 0.7979111671447754, "rewards/format_reward": 1.0, "step": 470, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 442.8999938964844, "epoch": 0.008951819823244321, "grad_norm": 1.7373903623286233, "kl": 0.0625, "learning_rate": 9.998022876514394e-07, "loss": 0.0025, "reward": 1.7674999237060547, "reward_std": 0.30718716979026794, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.925000011920929, "step": 471, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.625, "epoch": 0.00897082581013019, "grad_norm": 4.952608861638442, "kl": 0.052001953125, "learning_rate": 9.998014472729515e-07, "loss": 0.0021, "reward": 1.712471604347229, "reward_std": 0.17913781106472015, "rewards/accuracy_reward": 0.5599716305732727, "rewards/format_reward": 1.0, "step": 472, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 447.32501220703125, "epoch": 0.00898983179701606, "grad_norm": 4.860283678424628, "kl": 0.059326171875, "learning_rate": 9.99800605112585e-07, "loss": 0.0024, "reward": 2.067499876022339, "reward_std": 0.15350113809108734, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 473, "temporal_rewards": 0.699999988079071 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 472.2749938964844, "epoch": 0.00900883778390193, "grad_norm": 1.6051640826240865, "kl": 0.072265625, "learning_rate": 9.997997611703428e-07, "loss": 0.0029, "reward": 2.3062500953674316, "reward_std": 0.0686725452542305, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 474, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 434.3500061035156, "epoch": 0.009027843770787799, "grad_norm": 1.8164006005138298, "kl": 0.058349609375, "learning_rate": 9.99798915446228e-07, "loss": 0.0023, "reward": 1.9271538257598877, "reward_std": 0.0653354674577713, "rewards/accuracy_reward": 0.7109037637710571, "rewards/format_reward": 1.0, "step": 475, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 454.8500061035156, "epoch": 0.009046849757673668, "grad_norm": 1.504528415551405, "kl": 0.05419921875, "learning_rate": 9.99798067940244e-07, "loss": 0.0022, "reward": 2.09375, "reward_std": 0.15219198167324066, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 476, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 395.95001220703125, "epoch": 0.009065855744559537, "grad_norm": 2.781745702470083, "kl": 0.0712890625, "learning_rate": 9.997972186523933e-07, "loss": 0.0028, "reward": 1.662500023841858, "reward_std": 0.4719924032688141, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 477, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 431.82501220703125, "epoch": 0.009084861731445406, "grad_norm": 1.7579791943277092, "kl": 0.0751953125, "learning_rate": 9.99796367582679e-07, "loss": 0.003, "reward": 1.8686836957931519, "reward_std": 0.03408441320061684, "rewards/accuracy_reward": 0.6536837816238403, "rewards/format_reward": 1.0, "step": 478, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 427.70001220703125, "epoch": 0.009103867718331275, "grad_norm": 2.554115765918967, "kl": 0.0654296875, "learning_rate": 9.997955147311044e-07, "loss": 0.0026, "reward": 2.195833206176758, "reward_std": 0.19695670902729034, "rewards/accuracy_reward": 0.9333333373069763, "rewards/format_reward": 1.0, "step": 479, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 453.45001220703125, "epoch": 0.009122873705217143, "grad_norm": 1.5587892217213488, "kl": 0.0615234375, "learning_rate": 9.997946600976723e-07, "loss": 0.0025, "reward": 1.9755518436431885, "reward_std": 0.15970341861248016, "rewards/accuracy_reward": 0.7743017673492432, "rewards/format_reward": 1.0, "step": 480, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.6, "completion_length": 417.5, "epoch": 0.009141879692103012, "grad_norm": 1.19688361762643, "kl": 0.051513671875, "learning_rate": 9.997938036823858e-07, "loss": 0.0021, "reward": 1.1349740028381348, "reward_std": 0.11196901649236679, "rewards/accuracy_reward": 0.08122406154870987, "rewards/format_reward": 1.0, "step": 481, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 429.07501220703125, "epoch": 0.009160885678988881, "grad_norm": 1.926439565443606, "kl": 0.078125, "learning_rate": 9.99792945485248e-07, "loss": 0.0031, "reward": 1.7087501287460327, "reward_std": 0.09527556598186493, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 482, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 415.2749938964844, "epoch": 0.00917989166587475, "grad_norm": 1.9094138725860295, "kl": 0.0732421875, "learning_rate": 9.99792085506262e-07, "loss": 0.0029, "reward": 1.5049999952316284, "reward_std": 0.4254470765590668, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 483, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 454.5500183105469, "epoch": 0.00919889765276062, "grad_norm": 1.7604931063216653, "kl": 0.03369140625, "learning_rate": 9.997912237454309e-07, "loss": 0.0013, "reward": 1.5671659708023071, "reward_std": 0.32028064131736755, "rewards/accuracy_reward": 0.6371659636497498, "rewards/format_reward": 0.949999988079071, "step": 484, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 452.3000183105469, "epoch": 0.009217903639646488, "grad_norm": 1.714713292367152, "kl": 0.03466796875, "learning_rate": 9.997903602027574e-07, "loss": 0.0014, "reward": 1.1929138898849487, "reward_std": 0.3162080645561218, "rewards/accuracy_reward": 0.36041373014450073, "rewards/format_reward": 0.8500000238418579, "step": 485, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 410.125, "epoch": 0.009236909626532357, "grad_norm": 2.521064980361963, "kl": 0.06201171875, "learning_rate": 9.99789494878245e-07, "loss": 0.0025, "reward": 1.6861017942428589, "reward_std": 0.15840964019298553, "rewards/accuracy_reward": 0.5286018252372742, "rewards/format_reward": 1.0, "step": 486, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 378.95001220703125, "epoch": 0.009255915613418226, "grad_norm": 2.5559421663142023, "kl": 0.04638671875, "learning_rate": 9.997886277718965e-07, "loss": 0.0019, "reward": 1.625555396080017, "reward_std": 0.3310706317424774, "rewards/accuracy_reward": 0.5305555462837219, "rewards/format_reward": 1.0, "step": 487, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 387.07501220703125, "epoch": 0.009274921600304097, "grad_norm": 2.6883486653837565, "kl": 0.07666015625, "learning_rate": 9.997877588837154e-07, "loss": 0.0031, "reward": 1.517319917678833, "reward_std": 0.18872372806072235, "rewards/accuracy_reward": 0.4960698187351227, "rewards/format_reward": 1.0, "step": 488, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 370.375, "epoch": 0.009293927587189965, "grad_norm": 1.8962347858409532, "kl": 0.064453125, "learning_rate": 9.997868882137043e-07, "loss": 0.0026, "reward": 2.0177972316741943, "reward_std": 0.32380202412605286, "rewards/accuracy_reward": 0.8040472865104675, "rewards/format_reward": 1.0, "step": 489, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 434.2250061035156, "epoch": 0.009312933574075834, "grad_norm": 1.7829801566738295, "kl": 0.0498046875, "learning_rate": 9.997860157618667e-07, "loss": 0.002, "reward": 1.64120614528656, "reward_std": 0.3396809995174408, "rewards/accuracy_reward": 0.6849562525749207, "rewards/format_reward": 0.9000000357627869, "step": 490, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 407.1750183105469, "epoch": 0.009331939560961703, "grad_norm": 1.779304318494529, "kl": 0.049072265625, "learning_rate": 9.997851415282054e-07, "loss": 0.002, "reward": 1.8875000476837158, "reward_std": 0.22840066254138947, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 491, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 379.0500183105469, "epoch": 0.009350945547847572, "grad_norm": 1.625905006496971, "kl": 0.0654296875, "learning_rate": 9.997842655127238e-07, "loss": 0.0026, "reward": 1.7537500858306885, "reward_std": 0.018371161073446274, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 492, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 392.5500183105469, "epoch": 0.009369951534733441, "grad_norm": 2.3368460537892943, "kl": 0.0615234375, "learning_rate": 9.99783387715425e-07, "loss": 0.0025, "reward": 1.647323489189148, "reward_std": 0.10424643009901047, "rewards/accuracy_reward": 0.5535734295845032, "rewards/format_reward": 1.0, "step": 493, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 397.2250061035156, "epoch": 0.00938895752161931, "grad_norm": 2.0508207232179667, "kl": 0.038818359375, "learning_rate": 9.997825081363118e-07, "loss": 0.0015, "reward": 1.7971175909042358, "reward_std": 0.042405229061841965, "rewards/accuracy_reward": 0.680867612361908, "rewards/format_reward": 1.0, "step": 494, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 388.1750183105469, "epoch": 0.009407963508505179, "grad_norm": 1.4153900026035005, "kl": 0.052734375, "learning_rate": 9.997816267753875e-07, "loss": 0.0021, "reward": 1.71875, "reward_std": 0.08908186107873917, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 495, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 447.9250183105469, "epoch": 0.009426969495391048, "grad_norm": 1.7138907316857963, "kl": 0.057861328125, "learning_rate": 9.997807436326553e-07, "loss": 0.0023, "reward": 1.59375, "reward_std": 0.1288391649723053, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.800000011920929, "step": 496, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 384.20001220703125, "epoch": 0.009445975482276917, "grad_norm": 1.8854658576800034, "kl": 0.05908203125, "learning_rate": 9.997798587081186e-07, "loss": 0.0024, "reward": 1.7962499856948853, "reward_std": 0.278097003698349, "rewards/accuracy_reward": 0.6975000500679016, "rewards/format_reward": 1.0, "step": 497, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 411.9750061035156, "epoch": 0.009464981469162786, "grad_norm": 3.4054225772597775, "kl": 0.053466796875, "learning_rate": 9.9977897200178e-07, "loss": 0.0021, "reward": 2.0822036266326904, "reward_std": 0.040723223239183426, "rewards/accuracy_reward": 0.8659538626670837, "rewards/format_reward": 1.0, "step": 498, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.6499938964844, "epoch": 0.009483987456048655, "grad_norm": 2.2012332646042796, "kl": 0.06982421875, "learning_rate": 9.99778083513643e-07, "loss": 0.0028, "reward": 2.231250047683716, "reward_std": 0.03622094541788101, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 499, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 427.6000061035156, "epoch": 0.009502993442934524, "grad_norm": 1.4652239261353879, "kl": 0.035400390625, "learning_rate": 9.99777193243711e-07, "loss": 0.0014, "reward": 1.4185357093811035, "reward_std": 0.2842453420162201, "rewards/accuracy_reward": 0.40603572130203247, "rewards/format_reward": 0.949999988079071, "step": 500, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 442.32501220703125, "epoch": 0.009521999429820393, "grad_norm": 1.8280873574114165, "kl": 0.05859375, "learning_rate": 9.997763011919864e-07, "loss": 0.0023, "reward": 1.863806128501892, "reward_std": 0.196150004863739, "rewards/accuracy_reward": 0.7075561881065369, "rewards/format_reward": 1.0, "step": 501, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 415.5, "epoch": 0.009541005416706263, "grad_norm": 1.8075627828806675, "kl": 0.07470703125, "learning_rate": 9.997754073584732e-07, "loss": 0.003, "reward": 2.078749895095825, "reward_std": 0.2691101133823395, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 502, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 425.0, "epoch": 0.009560011403592132, "grad_norm": 2.5875098453575585, "kl": 0.049560546875, "learning_rate": 9.997745117431743e-07, "loss": 0.002, "reward": 1.2162787914276123, "reward_std": 0.05266318470239639, "rewards/accuracy_reward": 0.23002873361110687, "rewards/format_reward": 1.0, "step": 503, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 444.5, "epoch": 0.009579017390478001, "grad_norm": 6.085291880837437, "kl": 0.0703125, "learning_rate": 9.99773614346093e-07, "loss": 0.0028, "reward": 1.8537498712539673, "reward_std": 0.2420050948858261, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 504, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 471.95001220703125, "epoch": 0.00959802337736387, "grad_norm": 1.648046194070453, "kl": 0.06005859375, "learning_rate": 9.99772715167232e-07, "loss": 0.0024, "reward": 1.6337499618530273, "reward_std": 0.3327481746673584, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 0.9750000238418579, "step": 505, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 430.0249938964844, "epoch": 0.009617029364249739, "grad_norm": 1.5849127897738289, "kl": 0.048828125, "learning_rate": 9.99771814206595e-07, "loss": 0.002, "reward": 1.763901948928833, "reward_std": 0.18851494789123535, "rewards/accuracy_reward": 0.6864018440246582, "rewards/format_reward": 1.0, "step": 506, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 441.32501220703125, "epoch": 0.009636035351135608, "grad_norm": 1.5959921680901905, "kl": 0.0615234375, "learning_rate": 9.99770911464185e-07, "loss": 0.0025, "reward": 2.2049999237060547, "reward_std": 0.09811828285455704, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 507, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 478.5249938964844, "epoch": 0.009655041338021477, "grad_norm": 1.4051461165424177, "kl": 0.03955078125, "learning_rate": 9.997700069400054e-07, "loss": 0.0016, "reward": 1.3736310005187988, "reward_std": 0.38330039381980896, "rewards/accuracy_reward": 0.4586309492588043, "rewards/format_reward": 0.925000011920929, "step": 508, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 439.0, "epoch": 0.009674047324907346, "grad_norm": 2.0685341563787314, "kl": 0.05322265625, "learning_rate": 9.997691006340593e-07, "loss": 0.0021, "reward": 1.561039686203003, "reward_std": 0.12414371222257614, "rewards/accuracy_reward": 0.5460395812988281, "rewards/format_reward": 1.0, "step": 509, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 482.9750061035156, "epoch": 0.009693053311793215, "grad_norm": 1.421114213059028, "kl": 0.044189453125, "learning_rate": 9.9976819254635e-07, "loss": 0.0018, "reward": 1.9362499713897705, "reward_std": 0.23603010177612305, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 510, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 437.95001220703125, "epoch": 0.009712059298679084, "grad_norm": 1.4247342133916885, "kl": 0.062255859375, "learning_rate": 9.997672826768806e-07, "loss": 0.0025, "reward": 1.8970905542373657, "reward_std": 0.08062443137168884, "rewards/accuracy_reward": 0.7470905780792236, "rewards/format_reward": 1.0, "step": 511, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 452.6750183105469, "epoch": 0.009731065285564952, "grad_norm": 1.6229637859363697, "kl": 0.047119140625, "learning_rate": 9.997663710256545e-07, "loss": 0.0019, "reward": 1.6080732345581055, "reward_std": 0.1674114614725113, "rewards/accuracy_reward": 0.5068233013153076, "rewards/format_reward": 0.9750000238418579, "step": 512, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 442.82501220703125, "epoch": 0.009750071272450821, "grad_norm": 1.5770019601527168, "kl": 0.0634765625, "learning_rate": 9.997654575926748e-07, "loss": 0.0025, "reward": 1.933750033378601, "reward_std": 0.18966975808143616, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 513, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 483.2749938964844, "epoch": 0.00976907725933669, "grad_norm": 1.5718725128573952, "kl": 0.04931640625, "learning_rate": 9.99764542377945e-07, "loss": 0.002, "reward": 1.7304449081420898, "reward_std": 0.20716892182826996, "rewards/accuracy_reward": 0.676694929599762, "rewards/format_reward": 0.9750000238418579, "step": 514, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 411.2250061035156, "epoch": 0.009788083246222561, "grad_norm": 1.4474529353934316, "kl": 0.0673828125, "learning_rate": 9.99763625381468e-07, "loss": 0.0027, "reward": 1.3837499618530273, "reward_std": 0.30521318316459656, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 515, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 466.0500183105469, "epoch": 0.00980708923310843, "grad_norm": 2.0818827919319185, "kl": 0.060791015625, "learning_rate": 9.997627066032475e-07, "loss": 0.0024, "reward": 1.9562500715255737, "reward_std": 0.28784844279289246, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 516, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 435.0, "epoch": 0.009826095219994299, "grad_norm": 1.924609376887236, "kl": 0.06494140625, "learning_rate": 9.997617860432864e-07, "loss": 0.0026, "reward": 1.8871879577636719, "reward_std": 0.13538892567157745, "rewards/accuracy_reward": 0.7396878600120544, "rewards/format_reward": 1.0, "step": 517, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 428.5249938964844, "epoch": 0.009845101206880168, "grad_norm": 1.840152705053356, "kl": 0.0576171875, "learning_rate": 9.99760863701588e-07, "loss": 0.0023, "reward": 1.9327281713485718, "reward_std": 0.06402174383401871, "rewards/accuracy_reward": 0.776478111743927, "rewards/format_reward": 1.0, "step": 518, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 529.875, "epoch": 0.009864107193766037, "grad_norm": 1.457960696500777, "kl": 0.031005859375, "learning_rate": 9.997599395781559e-07, "loss": 0.0012, "reward": 1.0476676225662231, "reward_std": 0.3968335688114166, "rewards/accuracy_reward": 0.4464176297187805, "rewards/format_reward": 0.675000011920929, "step": 519, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 488.3999938964844, "epoch": 0.009883113180651906, "grad_norm": 2.0649997556200983, "kl": 0.037109375, "learning_rate": 9.997590136729931e-07, "loss": 0.0015, "reward": 1.631659746170044, "reward_std": 0.22774966061115265, "rewards/accuracy_reward": 0.6929095387458801, "rewards/format_reward": 0.949999988079071, "step": 520, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 447.5, "epoch": 0.009902119167537774, "grad_norm": 1.8737860942870295, "kl": 0.08349609375, "learning_rate": 9.99758085986103e-07, "loss": 0.0033, "reward": 1.8599998950958252, "reward_std": 0.19150128960609436, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 521, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 450.3500061035156, "epoch": 0.009921125154423643, "grad_norm": 1.647439142953048, "kl": 0.06201171875, "learning_rate": 9.997571565174892e-07, "loss": 0.0025, "reward": 1.7257064580917358, "reward_std": 0.31141045689582825, "rewards/accuracy_reward": 0.686956524848938, "rewards/format_reward": 0.9750000238418579, "step": 522, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 401.3000183105469, "epoch": 0.009940131141309512, "grad_norm": 2.161017413640146, "kl": 0.09716796875, "learning_rate": 9.997562252671545e-07, "loss": 0.0039, "reward": 2.2019996643066406, "reward_std": 0.14086316525936127, "rewards/accuracy_reward": 0.9194995760917664, "rewards/format_reward": 1.0, "step": 523, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 452.1000061035156, "epoch": 0.009959137128195381, "grad_norm": 3.977499471645308, "kl": 0.060546875, "learning_rate": 9.997552922351024e-07, "loss": 0.0024, "reward": 1.5689438581466675, "reward_std": 0.06857716292142868, "rewards/accuracy_reward": 0.5039438605308533, "rewards/format_reward": 1.0, "step": 524, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 469.5249938964844, "epoch": 0.00997814311508125, "grad_norm": 1.25780381342993, "kl": 0.059326171875, "learning_rate": 9.997543574213363e-07, "loss": 0.0024, "reward": 1.743749976158142, "reward_std": 0.1468115597963333, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 525, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 442.7749938964844, "epoch": 0.009997149101967119, "grad_norm": 1.5561277581708837, "kl": 0.05029296875, "learning_rate": 9.997534208258596e-07, "loss": 0.002, "reward": 1.5413535833358765, "reward_std": 0.1735600233078003, "rewards/accuracy_reward": 0.4588536322116852, "rewards/format_reward": 1.0, "step": 526, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 448.57501220703125, "epoch": 0.010016155088852988, "grad_norm": 1.8192200758534172, "kl": 0.06103515625, "learning_rate": 9.997524824486754e-07, "loss": 0.0024, "reward": 2.0274999141693115, "reward_std": 0.20530882477760315, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 527, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 416.45001220703125, "epoch": 0.010035161075738857, "grad_norm": 1.7830384198425893, "kl": 0.068359375, "learning_rate": 9.997515422897875e-07, "loss": 0.0027, "reward": 1.969956636428833, "reward_std": 0.12310776859521866, "rewards/accuracy_reward": 0.8174566626548767, "rewards/format_reward": 1.0, "step": 528, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 418.45001220703125, "epoch": 0.010054167062624728, "grad_norm": 1.29074227610966, "kl": 0.062255859375, "learning_rate": 9.997506003491988e-07, "loss": 0.0025, "reward": 1.8600000143051147, "reward_std": 0.20849597454071045, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 529, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 416.3000183105469, "epoch": 0.010073173049510597, "grad_norm": 2.2381431785356507, "kl": 0.057373046875, "learning_rate": 9.997496566269127e-07, "loss": 0.0023, "reward": 1.5626453161239624, "reward_std": 0.3552330434322357, "rewards/accuracy_reward": 0.5088953375816345, "rewards/format_reward": 0.925000011920929, "step": 530, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 421.9750061035156, "epoch": 0.010092179036396465, "grad_norm": 24.485746925275997, "kl": 0.03857421875, "learning_rate": 9.997487111229328e-07, "loss": 0.0015, "reward": 1.503008484840393, "reward_std": 0.04300389438867569, "rewards/accuracy_reward": 0.40800848603248596, "rewards/format_reward": 1.0, "step": 531, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 410.3000183105469, "epoch": 0.010111185023282334, "grad_norm": 1.868644726796586, "kl": 0.059326171875, "learning_rate": 9.997477638372623e-07, "loss": 0.0024, "reward": 1.8612297773361206, "reward_std": 0.2363087683916092, "rewards/accuracy_reward": 0.7599797248840332, "rewards/format_reward": 1.0, "step": 532, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 401.6750183105469, "epoch": 0.010130191010168203, "grad_norm": 2.1563928038223703, "kl": 0.060302734375, "learning_rate": 9.997468147699044e-07, "loss": 0.0024, "reward": 2.0249557495117188, "reward_std": 0.04264111444354057, "rewards/accuracy_reward": 0.8487057089805603, "rewards/format_reward": 1.0, "step": 533, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 421.6000061035156, "epoch": 0.010149196997054072, "grad_norm": 1.6260224977457973, "kl": 0.034912109375, "learning_rate": 9.997458639208628e-07, "loss": 0.0014, "reward": 1.5147907733917236, "reward_std": 0.32387611269950867, "rewards/accuracy_reward": 0.48979073762893677, "rewards/format_reward": 1.0, "step": 534, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 452.4750061035156, "epoch": 0.010168202983939941, "grad_norm": 1.6739947191706201, "kl": 0.059814453125, "learning_rate": 9.99744911290141e-07, "loss": 0.0024, "reward": 1.777500033378601, "reward_std": 0.3292026221752167, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.9750000238418579, "step": 535, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 426.6750183105469, "epoch": 0.01018720897082581, "grad_norm": 2.191671938308414, "kl": 0.053955078125, "learning_rate": 9.99743956877742e-07, "loss": 0.0022, "reward": 1.7059297561645508, "reward_std": 0.31015655398368835, "rewards/accuracy_reward": 0.6221798062324524, "rewards/format_reward": 0.949999988079071, "step": 536, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 410.0, "epoch": 0.010206214957711679, "grad_norm": 2.1192463675187327, "kl": 0.07470703125, "learning_rate": 9.997430006836696e-07, "loss": 0.003, "reward": 2.161249876022339, "reward_std": 0.2553601861000061, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 537, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 410.875, "epoch": 0.010225220944597548, "grad_norm": 1.7509901082763857, "kl": 0.0478515625, "learning_rate": 9.997420427079268e-07, "loss": 0.0019, "reward": 1.5246129035949707, "reward_std": 0.1447751522064209, "rewards/accuracy_reward": 0.4433629512786865, "rewards/format_reward": 1.0, "step": 538, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 427.32501220703125, "epoch": 0.010244226931483417, "grad_norm": 1.9569781491781264, "kl": 0.05810546875, "learning_rate": 9.997410829505174e-07, "loss": 0.0023, "reward": 1.6866406202316284, "reward_std": 0.2554178535938263, "rewards/accuracy_reward": 0.6703906059265137, "rewards/format_reward": 1.0, "step": 539, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 396.1499938964844, "epoch": 0.010263232918369286, "grad_norm": 1.9090177382978537, "kl": 0.0703125, "learning_rate": 9.997401214114444e-07, "loss": 0.0028, "reward": 1.9938347339630127, "reward_std": 0.21185918152332306, "rewards/accuracy_reward": 0.8288349509239197, "rewards/format_reward": 1.0, "step": 540, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 425.4250183105469, "epoch": 0.010282238905255155, "grad_norm": 2.949924736464448, "kl": 0.0693359375, "learning_rate": 9.997391580907118e-07, "loss": 0.0028, "reward": 1.9996589422225952, "reward_std": 0.12238943576812744, "rewards/accuracy_reward": 0.7909091114997864, "rewards/format_reward": 1.0, "step": 541, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 453.6499938964844, "epoch": 0.010301244892141024, "grad_norm": 1.59182327289358, "kl": 0.044921875, "learning_rate": 9.997381929883225e-07, "loss": 0.0018, "reward": 1.8970184326171875, "reward_std": 0.24323752522468567, "rewards/accuracy_reward": 0.7957685589790344, "rewards/format_reward": 0.9750000238418579, "step": 542, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.010320250879026894, "grad_norm": 2.1514497795864083, "kl": 0.058349609375, "learning_rate": 9.9973722610428e-07, "loss": 0.0023, "reward": 1.7581989765167236, "reward_std": 0.21770977973937988, "rewards/accuracy_reward": 0.7731990218162537, "rewards/format_reward": 0.9750000238418579, "step": 543, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 429.6499938964844, "epoch": 0.010339256865912763, "grad_norm": 3.768053852370147, "kl": 0.083984375, "learning_rate": 9.99736257438588e-07, "loss": 0.0033, "reward": 2.1200387477874756, "reward_std": 0.038123708218336105, "rewards/accuracy_reward": 0.8837887644767761, "rewards/format_reward": 1.0, "step": 544, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 412.3500061035156, "epoch": 0.010358262852798632, "grad_norm": 1.8367108151585614, "kl": 0.080078125, "learning_rate": 9.997352869912499e-07, "loss": 0.0032, "reward": 1.7112499475479126, "reward_std": 0.12422802299261093, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 545, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 440.3999938964844, "epoch": 0.010377268839684501, "grad_norm": 2.2196999092149823, "kl": 0.078125, "learning_rate": 9.99734314762269e-07, "loss": 0.0031, "reward": 1.7476776838302612, "reward_std": 0.25022193789482117, "rewards/accuracy_reward": 0.5614275932312012, "rewards/format_reward": 1.0, "step": 546, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 445.7749938964844, "epoch": 0.01039627482657037, "grad_norm": 1.6856817799618924, "kl": 0.052490234375, "learning_rate": 9.99733340751649e-07, "loss": 0.0021, "reward": 1.5888121128082275, "reward_std": 0.27665263414382935, "rewards/accuracy_reward": 0.5363120436668396, "rewards/format_reward": 0.9750000238418579, "step": 547, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 440.45001220703125, "epoch": 0.010415280813456239, "grad_norm": 1.9253001434590515, "kl": 0.055908203125, "learning_rate": 9.997323649593932e-07, "loss": 0.0022, "reward": 1.8682085275650024, "reward_std": 0.16223637759685516, "rewards/accuracy_reward": 0.7207085490226746, "rewards/format_reward": 1.0, "step": 548, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 435.6750183105469, "epoch": 0.010434286800342108, "grad_norm": 2.0731845286020967, "kl": 0.08154296875, "learning_rate": 9.997313873855052e-07, "loss": 0.0033, "reward": 1.549357295036316, "reward_std": 0.30646300315856934, "rewards/accuracy_reward": 0.3893572986125946, "rewards/format_reward": 1.0, "step": 549, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 421.4250183105469, "epoch": 0.010453292787227977, "grad_norm": 3.059206823205699, "kl": 0.08740234375, "learning_rate": 9.997304080299883e-07, "loss": 0.0035, "reward": 1.9794644117355347, "reward_std": 0.18326017260551453, "rewards/accuracy_reward": 0.7357142567634583, "rewards/format_reward": 1.0, "step": 550, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 439.8999938964844, "epoch": 0.010472298774113846, "grad_norm": 1.8211971498851012, "kl": 0.068359375, "learning_rate": 9.99729426892846e-07, "loss": 0.0027, "reward": 1.962499976158142, "reward_std": 0.1277000904083252, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 551, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 435.57501220703125, "epoch": 0.010491304760999715, "grad_norm": 1.930959635006113, "kl": 0.0673828125, "learning_rate": 9.997284439740818e-07, "loss": 0.0027, "reward": 2.1675000190734863, "reward_std": 0.14230495691299438, "rewards/accuracy_reward": 0.9524999856948853, "rewards/format_reward": 1.0, "step": 552, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 429.1750183105469, "epoch": 0.010510310747885583, "grad_norm": 1.5425747829899408, "kl": 0.052001953125, "learning_rate": 9.997274592736995e-07, "loss": 0.0021, "reward": 1.7487499713897705, "reward_std": 0.03251330554485321, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 553, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 419.75, "epoch": 0.010529316734771452, "grad_norm": 3.186369692711284, "kl": 0.07666015625, "learning_rate": 9.997264727917025e-07, "loss": 0.0031, "reward": 1.5479166507720947, "reward_std": 0.3285001814365387, "rewards/accuracy_reward": 0.44166669249534607, "rewards/format_reward": 0.9750000238418579, "step": 554, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.7250061035156, "epoch": 0.010548322721657321, "grad_norm": 2.0793191587907582, "kl": 0.0673828125, "learning_rate": 9.99725484528094e-07, "loss": 0.0027, "reward": 1.929342269897461, "reward_std": 0.23153726756572723, "rewards/accuracy_reward": 0.7868421077728271, "rewards/format_reward": 1.0, "step": 555, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 429.7749938964844, "epoch": 0.010567328708543192, "grad_norm": 2.382159792702014, "kl": 0.055908203125, "learning_rate": 9.99724494482878e-07, "loss": 0.0022, "reward": 1.8536278009414673, "reward_std": 0.16179294884204865, "rewards/accuracy_reward": 0.7536277770996094, "rewards/format_reward": 1.0, "step": 556, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 430.9250183105469, "epoch": 0.010586334695429061, "grad_norm": 2.1033740533698104, "kl": 0.0712890625, "learning_rate": 9.997235026560576e-07, "loss": 0.0028, "reward": 2.291249990463257, "reward_std": 0.036220937967300415, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 557, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 435.8500061035156, "epoch": 0.01060534068231493, "grad_norm": 5.011974977628457, "kl": 0.0712890625, "learning_rate": 9.997225090476364e-07, "loss": 0.0028, "reward": 2.0250000953674316, "reward_std": 0.3077133297920227, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.949999988079071, "step": 558, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 446.0249938964844, "epoch": 0.010624346669200799, "grad_norm": 1.6251668542112612, "kl": 0.07861328125, "learning_rate": 9.997215136576183e-07, "loss": 0.0031, "reward": 1.9399999380111694, "reward_std": 0.14692406356334686, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.9750000238418579, "step": 559, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 461.125, "epoch": 0.010643352656086668, "grad_norm": 1.4719581109155024, "kl": 0.0380859375, "learning_rate": 9.997205164860066e-07, "loss": 0.0015, "reward": 1.6075143814086914, "reward_std": 0.30150026082992554, "rewards/accuracy_reward": 0.6612643599510193, "rewards/format_reward": 0.949999988079071, "step": 560, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 447.82501220703125, "epoch": 0.010662358642972537, "grad_norm": 1.8722311584053601, "kl": 0.07470703125, "learning_rate": 9.997195175328048e-07, "loss": 0.003, "reward": 1.6799999475479126, "reward_std": 0.2983327805995941, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 561, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 493.375, "epoch": 0.010681364629858405, "grad_norm": 2.5175396066040867, "kl": 0.04345703125, "learning_rate": 9.997185167980164e-07, "loss": 0.0017, "reward": 1.2380895614624023, "reward_std": 0.5283424258232117, "rewards/accuracy_reward": 0.3505896329879761, "rewards/format_reward": 0.925000011920929, "step": 562, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 421.6000061035156, "epoch": 0.010700370616744274, "grad_norm": 1.2818517472753972, "kl": 0.03955078125, "learning_rate": 9.997175142816452e-07, "loss": 0.0016, "reward": 1.6775001287460327, "reward_std": 0.12553423643112183, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 563, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 404.625, "epoch": 0.010719376603630143, "grad_norm": 1.7596950112705625, "kl": 0.07275390625, "learning_rate": 9.997165099836945e-07, "loss": 0.0029, "reward": 1.8583303689956665, "reward_std": 0.12296537309885025, "rewards/accuracy_reward": 0.7608304023742676, "rewards/format_reward": 1.0, "step": 564, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 449.70001220703125, "epoch": 0.010738382590516012, "grad_norm": 2.0564729193203313, "kl": 0.06640625, "learning_rate": 9.997155039041684e-07, "loss": 0.0027, "reward": 1.8003486394882202, "reward_std": 0.20200787484645844, "rewards/accuracy_reward": 0.6553487777709961, "rewards/format_reward": 1.0, "step": 565, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 450.95001220703125, "epoch": 0.010757388577401881, "grad_norm": 1.8121228815285055, "kl": 0.08251953125, "learning_rate": 9.9971449604307e-07, "loss": 0.0033, "reward": 1.868749976158142, "reward_std": 0.18496833741664886, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 566, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 432.875, "epoch": 0.01077639456428775, "grad_norm": 3.497682098835307, "kl": 0.0625, "learning_rate": 9.997134864004028e-07, "loss": 0.0025, "reward": 1.529032588005066, "reward_std": 0.06469090282917023, "rewards/accuracy_reward": 0.45028257369995117, "rewards/format_reward": 1.0, "step": 567, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 436.70001220703125, "epoch": 0.010795400551173619, "grad_norm": 2.4668971310469705, "kl": 0.08154296875, "learning_rate": 9.997124749761708e-07, "loss": 0.0033, "reward": 1.6386276483535767, "reward_std": 0.15997079014778137, "rewards/accuracy_reward": 0.5061275362968445, "rewards/format_reward": 1.0, "step": 568, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 455.375, "epoch": 0.010814406538059488, "grad_norm": 1.7766542047831666, "kl": 0.06689453125, "learning_rate": 9.997114617703774e-07, "loss": 0.0027, "reward": 1.590166687965393, "reward_std": 0.3435172140598297, "rewards/accuracy_reward": 0.515166699886322, "rewards/format_reward": 0.9750000238418579, "step": 569, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 451.7250061035156, "epoch": 0.010833412524945359, "grad_norm": 2.0663020555330553, "kl": 0.060302734375, "learning_rate": 9.997104467830264e-07, "loss": 0.0024, "reward": 1.84375, "reward_std": 0.2997171878814697, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 0.925000011920929, "step": 570, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 427.0, "epoch": 0.010852418511831228, "grad_norm": 1.9441534396138849, "kl": 0.061279296875, "learning_rate": 9.99709430014121e-07, "loss": 0.0024, "reward": 1.9108333587646484, "reward_std": 0.1315259039402008, "rewards/accuracy_reward": 0.753333330154419, "rewards/format_reward": 1.0, "step": 571, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 461.0, "epoch": 0.010871424498717096, "grad_norm": 5.2720125799568445, "kl": 0.06640625, "learning_rate": 9.997084114636653e-07, "loss": 0.0027, "reward": 1.712392807006836, "reward_std": 0.18492774665355682, "rewards/accuracy_reward": 0.5636427998542786, "rewards/format_reward": 0.949999988079071, "step": 572, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 454.2749938964844, "epoch": 0.010890430485602965, "grad_norm": 3.7820635582263407, "kl": 0.060791015625, "learning_rate": 9.997073911316625e-07, "loss": 0.0024, "reward": 1.8587497472763062, "reward_std": 0.11446737498044968, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 573, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 394.0, "epoch": 0.010909436472488834, "grad_norm": 2.9854145728478922, "kl": 0.072265625, "learning_rate": 9.997063690181166e-07, "loss": 0.0029, "reward": 1.9924999475479126, "reward_std": 0.12202110141515732, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 574, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 429.1000061035156, "epoch": 0.010928442459374703, "grad_norm": 7.321171036237711, "kl": 0.048583984375, "learning_rate": 9.99705345123031e-07, "loss": 0.0019, "reward": 1.4943394660949707, "reward_std": 0.23887300491333008, "rewards/accuracy_reward": 0.42433950304985046, "rewards/format_reward": 1.0, "step": 575, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 447.7749938964844, "epoch": 0.010947448446260572, "grad_norm": 7.609787721551616, "kl": 0.056884765625, "learning_rate": 9.997043194464097e-07, "loss": 0.0023, "reward": 1.8877452611923218, "reward_std": 0.08956362307071686, "rewards/accuracy_reward": 0.7914954423904419, "rewards/format_reward": 1.0, "step": 576, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 420.6750183105469, "epoch": 0.010966454433146441, "grad_norm": 1.7424394249563804, "kl": 0.0634765625, "learning_rate": 9.997032919882557e-07, "loss": 0.0025, "reward": 2.137741804122925, "reward_std": 0.10617410391569138, "rewards/accuracy_reward": 0.9677419662475586, "rewards/format_reward": 1.0, "step": 577, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.1000061035156, "epoch": 0.01098546042003231, "grad_norm": 2.1309296570794722, "kl": 0.07275390625, "learning_rate": 9.997022627485733e-07, "loss": 0.0029, "reward": 1.8008911609649658, "reward_std": 0.058240581303834915, "rewards/accuracy_reward": 0.6608911752700806, "rewards/format_reward": 1.0, "step": 578, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 469.3500061035156, "epoch": 0.011004466406918179, "grad_norm": 3.070584243478517, "kl": 0.029296875, "learning_rate": 9.99701231727366e-07, "loss": 0.0012, "reward": 1.7209606170654297, "reward_std": 0.3849009871482849, "rewards/accuracy_reward": 0.7397105097770691, "rewards/format_reward": 0.925000011920929, "step": 579, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 444.1750183105469, "epoch": 0.011023472393804048, "grad_norm": 3.4046461154933247, "kl": 0.0634765625, "learning_rate": 9.997001989246375e-07, "loss": 0.0025, "reward": 1.8424999713897705, "reward_std": 0.13863466680049896, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 580, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 451.4250183105469, "epoch": 0.011042478380689917, "grad_norm": 2.120796271990568, "kl": 0.0634765625, "learning_rate": 9.99699164340391e-07, "loss": 0.0025, "reward": 1.9725000858306885, "reward_std": 0.19706712663173676, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 581, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 419.375, "epoch": 0.011061484367575786, "grad_norm": 1.5933497265646501, "kl": 0.058837890625, "learning_rate": 9.996981279746309e-07, "loss": 0.0024, "reward": 1.778942346572876, "reward_std": 0.2655041813850403, "rewards/accuracy_reward": 0.7076923251152039, "rewards/format_reward": 0.9750000238418579, "step": 582, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 463.95001220703125, "epoch": 0.011080490354461655, "grad_norm": 2.068419841779232, "kl": 0.0576171875, "learning_rate": 9.996970898273605e-07, "loss": 0.0023, "reward": 1.7924998998641968, "reward_std": 0.29065975546836853, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.9000000357627869, "step": 583, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 411.1499938964844, "epoch": 0.011099496341347525, "grad_norm": 1.9404332385370395, "kl": 0.0849609375, "learning_rate": 9.996960498985835e-07, "loss": 0.0034, "reward": 2.0962026119232178, "reward_std": 0.042331378906965256, "rewards/accuracy_reward": 0.7999525666236877, "rewards/format_reward": 1.0, "step": 584, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.1750183105469, "epoch": 0.011118502328233394, "grad_norm": 1.5837026435663861, "kl": 0.047119140625, "learning_rate": 9.99695008188304e-07, "loss": 0.0019, "reward": 1.7075976133346558, "reward_std": 0.0701654776930809, "rewards/accuracy_reward": 0.6163474917411804, "rewards/format_reward": 1.0, "step": 585, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 446.45001220703125, "epoch": 0.011137508315119263, "grad_norm": 2.0138325412935068, "kl": 0.053955078125, "learning_rate": 9.99693964696525e-07, "loss": 0.0022, "reward": 1.6336510181427002, "reward_std": 0.18786856532096863, "rewards/accuracy_reward": 0.4961509704589844, "rewards/format_reward": 1.0, "step": 586, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 398.8000183105469, "epoch": 0.011156514302005132, "grad_norm": 1.8654457872792787, "kl": 0.0830078125, "learning_rate": 9.99692919423251e-07, "loss": 0.0033, "reward": 1.6038554906845093, "reward_std": 0.05064338445663452, "rewards/accuracy_reward": 0.5063557028770447, "rewards/format_reward": 1.0, "step": 587, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 469.45001220703125, "epoch": 0.011175520288891001, "grad_norm": 1.739404524855777, "kl": 0.056396484375, "learning_rate": 9.99691872368485e-07, "loss": 0.0023, "reward": 1.8059364557266235, "reward_std": 0.33440202474594116, "rewards/accuracy_reward": 0.6834363341331482, "rewards/format_reward": 1.0, "step": 588, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 427.3500061035156, "epoch": 0.01119452627577687, "grad_norm": 2.541403588992428, "kl": 0.04931640625, "learning_rate": 9.996908235322312e-07, "loss": 0.002, "reward": 1.561975359916687, "reward_std": 0.09330564737319946, "rewards/accuracy_reward": 0.5307253003120422, "rewards/format_reward": 1.0, "step": 589, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 433.125, "epoch": 0.011213532262662739, "grad_norm": 1.7224078447156417, "kl": 0.07421875, "learning_rate": 9.996897729144933e-07, "loss": 0.003, "reward": 1.8650000095367432, "reward_std": 0.3313520848751068, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 590, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 419.0, "epoch": 0.011232538249548608, "grad_norm": 1.804958188799846, "kl": 0.047119140625, "learning_rate": 9.996887205152748e-07, "loss": 0.0019, "reward": 1.6495860815048218, "reward_std": 0.13574181497097015, "rewards/accuracy_reward": 0.5445861220359802, "rewards/format_reward": 0.9750000238418579, "step": 591, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 451.1750183105469, "epoch": 0.011251544236434477, "grad_norm": 1.94029667751913, "kl": 0.060791015625, "learning_rate": 9.9968766633458e-07, "loss": 0.0024, "reward": 1.8676280975341797, "reward_std": 0.0523289330303669, "rewards/accuracy_reward": 0.7163779735565186, "rewards/format_reward": 1.0, "step": 592, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 460.82501220703125, "epoch": 0.011270550223320346, "grad_norm": 2.876341928937013, "kl": 0.0673828125, "learning_rate": 9.996866103724119e-07, "loss": 0.0027, "reward": 1.8318378925323486, "reward_std": 0.13642318546772003, "rewards/accuracy_reward": 0.6943378448486328, "rewards/format_reward": 1.0, "step": 593, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 448.95001220703125, "epoch": 0.011289556210206214, "grad_norm": 3.606542369974204, "kl": 0.0751953125, "learning_rate": 9.996855526287748e-07, "loss": 0.003, "reward": 1.8008333444595337, "reward_std": 0.23959660530090332, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.9750000238418579, "step": 594, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 455.6499938964844, "epoch": 0.011308562197092083, "grad_norm": 1.6976024232220916, "kl": 0.050048828125, "learning_rate": 9.996844931036723e-07, "loss": 0.002, "reward": 1.5475986003875732, "reward_std": 0.24376149475574493, "rewards/accuracy_reward": 0.5800986289978027, "rewards/format_reward": 0.8500000238418579, "step": 595, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 485.57501220703125, "epoch": 0.011327568183977952, "grad_norm": 1.3563633391799217, "kl": 0.0693359375, "learning_rate": 9.99683431797108e-07, "loss": 0.0028, "reward": 1.1493602991104126, "reward_std": 0.09293808788061142, "rewards/accuracy_reward": 0.3168603479862213, "rewards/format_reward": 0.800000011920929, "step": 596, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 434.25, "epoch": 0.011346574170863821, "grad_norm": 1.6313913910403548, "kl": 0.054443359375, "learning_rate": 9.996823687090861e-07, "loss": 0.0022, "reward": 2.0971591472625732, "reward_std": 0.06261853128671646, "rewards/accuracy_reward": 0.9909090995788574, "rewards/format_reward": 1.0, "step": 597, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 452.75, "epoch": 0.011365580157749692, "grad_norm": 1.4532476788156559, "kl": 0.0693359375, "learning_rate": 9.996813038396102e-07, "loss": 0.0028, "reward": 1.875, "reward_std": 0.18700535595417023, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.925000011920929, "step": 598, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 446.1499938964844, "epoch": 0.01138458614463556, "grad_norm": 1.8266604983575412, "kl": 0.06884765625, "learning_rate": 9.99680237188684e-07, "loss": 0.0028, "reward": 1.57846999168396, "reward_std": 0.18595215678215027, "rewards/accuracy_reward": 0.5334700345993042, "rewards/format_reward": 1.0, "step": 599, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.6, "completion_length": 420.6000061035156, "epoch": 0.01140359213152143, "grad_norm": 1.4683311607751286, "kl": 0.0556640625, "learning_rate": 9.99679168756311e-07, "loss": 0.0022, "reward": 1.1973066329956055, "reward_std": 0.17622622847557068, "rewards/accuracy_reward": 0.1673065721988678, "rewards/format_reward": 1.0, "step": 600, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 411.6750183105469, "epoch": 0.011422598118407299, "grad_norm": 4.324589723372153, "kl": 0.0771484375, "learning_rate": 9.99678098542496e-07, "loss": 0.0031, "reward": 2.0297563076019287, "reward_std": 0.17133544385433197, "rewards/accuracy_reward": 0.8635061383247375, "rewards/format_reward": 1.0, "step": 601, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 431.5249938964844, "epoch": 0.011441604105293168, "grad_norm": 2.2166544804561843, "kl": 0.061767578125, "learning_rate": 9.996770265472418e-07, "loss": 0.0025, "reward": 1.6730873584747314, "reward_std": 0.08894231915473938, "rewards/accuracy_reward": 0.5393373370170593, "rewards/format_reward": 1.0, "step": 602, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 463.875, "epoch": 0.011460610092179037, "grad_norm": 1.674720354156424, "kl": 0.064453125, "learning_rate": 9.996759527705526e-07, "loss": 0.0026, "reward": 1.9574998617172241, "reward_std": 0.19369134306907654, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 603, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 445.32501220703125, "epoch": 0.011479616079064905, "grad_norm": 1.6980972175217657, "kl": 0.06787109375, "learning_rate": 9.996748772124324e-07, "loss": 0.0027, "reward": 2.0137500762939453, "reward_std": 0.14856313169002533, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 604, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 436.25, "epoch": 0.011498622065950774, "grad_norm": 2.500974902897863, "kl": 0.0908203125, "learning_rate": 9.996737998728849e-07, "loss": 0.0036, "reward": 2.077589750289917, "reward_std": 0.16722789406776428, "rewards/accuracy_reward": 0.8088399767875671, "rewards/format_reward": 1.0, "step": 605, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 429.75, "epoch": 0.011517628052836643, "grad_norm": 1.947607128316437, "kl": 0.083984375, "learning_rate": 9.996727207519138e-07, "loss": 0.0034, "reward": 2.1837499141693115, "reward_std": 0.1226191446185112, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 606, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 432.9250183105469, "epoch": 0.011536634039722512, "grad_norm": 3.156271591850942, "kl": 0.083984375, "learning_rate": 9.996716398495229e-07, "loss": 0.0034, "reward": 1.9579235315322876, "reward_std": 0.17999373376369476, "rewards/accuracy_reward": 0.730423629283905, "rewards/format_reward": 1.0, "step": 607, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 433.3500061035156, "epoch": 0.011555640026608381, "grad_norm": 1.6373683781484616, "kl": 0.0615234375, "learning_rate": 9.996705571657165e-07, "loss": 0.0025, "reward": 1.7975000143051147, "reward_std": 0.12626336514949799, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 608, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 420.95001220703125, "epoch": 0.01157464601349425, "grad_norm": 1.9431819870080271, "kl": 0.062255859375, "learning_rate": 9.996694727004979e-07, "loss": 0.0025, "reward": 1.6727215051651, "reward_std": 0.2554248869419098, "rewards/accuracy_reward": 0.5839714407920837, "rewards/format_reward": 1.0, "step": 609, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 415.3999938964844, "epoch": 0.011593652000380119, "grad_norm": 1.6287603168584877, "kl": 0.0712890625, "learning_rate": 9.996683864538716e-07, "loss": 0.0028, "reward": 2.0281901359558105, "reward_std": 0.04945867881178856, "rewards/accuracy_reward": 0.8719400763511658, "rewards/format_reward": 1.0, "step": 610, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 471.1499938964844, "epoch": 0.01161265798726599, "grad_norm": 1.925339316876235, "kl": 0.07080078125, "learning_rate": 9.996672984258408e-07, "loss": 0.0028, "reward": 2.042964220046997, "reward_std": 0.1353432983160019, "rewards/accuracy_reward": 0.8554641604423523, "rewards/format_reward": 1.0, "step": 611, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 467.45001220703125, "epoch": 0.011631663974151859, "grad_norm": 1.768305528014443, "kl": 0.051025390625, "learning_rate": 9.996662086164098e-07, "loss": 0.002, "reward": 1.9012501239776611, "reward_std": 0.2875203490257263, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9750000238418579, "step": 612, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.375, "epoch": 0.011650669961037727, "grad_norm": 1.983511940413985, "kl": 0.062255859375, "learning_rate": 9.996651170255822e-07, "loss": 0.0025, "reward": 1.655466914176941, "reward_std": 0.35578984022140503, "rewards/accuracy_reward": 0.5692169070243835, "rewards/format_reward": 1.0, "step": 613, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 433.7749938964844, "epoch": 0.011669675947923596, "grad_norm": 1.4749190160715435, "kl": 0.08056640625, "learning_rate": 9.996640236533624e-07, "loss": 0.0032, "reward": 1.9987499713897705, "reward_std": 0.2163955271244049, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 614, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 443.95001220703125, "epoch": 0.011688681934809465, "grad_norm": 1.918478728157044, "kl": 0.07470703125, "learning_rate": 9.996629284997538e-07, "loss": 0.003, "reward": 2.009999990463257, "reward_std": 0.12332119792699814, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 615, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 469.1000061035156, "epoch": 0.011707687921695334, "grad_norm": 2.346581040758844, "kl": 0.05615234375, "learning_rate": 9.996618315647606e-07, "loss": 0.0023, "reward": 1.4388395547866821, "reward_std": 0.29419374465942383, "rewards/accuracy_reward": 0.37758952379226685, "rewards/format_reward": 1.0, "step": 616, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 459.7749938964844, "epoch": 0.011726693908581203, "grad_norm": 2.5079159276012684, "kl": 0.083984375, "learning_rate": 9.996607328483863e-07, "loss": 0.0034, "reward": 2.0762500762939453, "reward_std": 0.18828822672367096, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 617, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 483.2749938964844, "epoch": 0.011745699895467072, "grad_norm": 1.3953719435878342, "kl": 0.0595703125, "learning_rate": 9.996596323506355e-07, "loss": 0.0024, "reward": 1.5387500524520874, "reward_std": 0.3371405005455017, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 0.9750000238418579, "step": 618, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 462.3000183105469, "epoch": 0.011764705882352941, "grad_norm": 1.4392682872300198, "kl": 0.0908203125, "learning_rate": 9.996585300715115e-07, "loss": 0.0036, "reward": 1.6375000476837158, "reward_std": 0.2294822484254837, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 619, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 457.2250061035156, "epoch": 0.01178371186923881, "grad_norm": 2.6246094174848236, "kl": 0.0703125, "learning_rate": 9.996574260110183e-07, "loss": 0.0028, "reward": 1.6786422729492188, "reward_std": 0.30471470952033997, "rewards/accuracy_reward": 0.6811421513557434, "rewards/format_reward": 0.949999988079071, "step": 620, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 461.1499938964844, "epoch": 0.011802717856124679, "grad_norm": 1.3656036894578758, "kl": 0.08056640625, "learning_rate": 9.996563201691602e-07, "loss": 0.0032, "reward": 1.6598033905029297, "reward_std": 0.14326943457126617, "rewards/accuracy_reward": 0.516053318977356, "rewards/format_reward": 1.0, "step": 621, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 456.8000183105469, "epoch": 0.011821723843010548, "grad_norm": 2.0122664631701905, "kl": 0.076171875, "learning_rate": 9.996552125459408e-07, "loss": 0.003, "reward": 1.9619076251983643, "reward_std": 0.07952728122472763, "rewards/accuracy_reward": 0.745657742023468, "rewards/format_reward": 1.0, "step": 622, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 467.5500183105469, "epoch": 0.011840729829896417, "grad_norm": 1.4686695012801096, "kl": 0.037841796875, "learning_rate": 9.996541031413643e-07, "loss": 0.0015, "reward": 1.6117315292358398, "reward_std": 0.3073624074459076, "rewards/accuracy_reward": 0.6267315149307251, "rewards/format_reward": 0.925000011920929, "step": 623, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 461.20001220703125, "epoch": 0.011859735816782286, "grad_norm": 3.0580021522718965, "kl": 0.07470703125, "learning_rate": 9.996529919554345e-07, "loss": 0.003, "reward": 1.564117670059204, "reward_std": 0.255709707736969, "rewards/accuracy_reward": 0.5378676652908325, "rewards/format_reward": 0.949999988079071, "step": 624, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 444.125, "epoch": 0.011878741803668156, "grad_norm": 1.9487338400428331, "kl": 0.09521484375, "learning_rate": 9.996518789881555e-07, "loss": 0.0038, "reward": 2.1648502349853516, "reward_std": 0.04231090843677521, "rewards/accuracy_reward": 0.9986003041267395, "rewards/format_reward": 1.0, "step": 625, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 406.375, "epoch": 0.011897747790554025, "grad_norm": 1.9128282239923182, "kl": 0.0869140625, "learning_rate": 9.996507642395308e-07, "loss": 0.0035, "reward": 1.518365502357483, "reward_std": 0.27028393745422363, "rewards/accuracy_reward": 0.3833654820919037, "rewards/format_reward": 1.0, "step": 626, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 458.45001220703125, "epoch": 0.011916753777439894, "grad_norm": 1.6452795615978173, "kl": 0.0908203125, "learning_rate": 9.996496477095651e-07, "loss": 0.0036, "reward": 1.7387501001358032, "reward_std": 0.2975967824459076, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 627, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 444.0500183105469, "epoch": 0.011935759764325763, "grad_norm": 1.7520423278402224, "kl": 0.10986328125, "learning_rate": 9.996485293982619e-07, "loss": 0.0044, "reward": 2.1905357837677, "reward_std": 0.0547470822930336, "rewards/accuracy_reward": 0.9642857909202576, "rewards/format_reward": 1.0, "step": 628, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 432.8000183105469, "epoch": 0.011954765751211632, "grad_norm": 1.919723592207281, "kl": 0.06787109375, "learning_rate": 9.996474093056252e-07, "loss": 0.0027, "reward": 1.7476999759674072, "reward_std": 0.16914092004299164, "rewards/accuracy_reward": 0.7076999545097351, "rewards/format_reward": 1.0, "step": 629, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 483.125, "epoch": 0.011973771738097501, "grad_norm": 1.8779415515611733, "kl": 0.09326171875, "learning_rate": 9.996462874316594e-07, "loss": 0.0037, "reward": 1.7400000095367432, "reward_std": 0.15443065762519836, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 630, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 447.6499938964844, "epoch": 0.01199277772498337, "grad_norm": 1.8232683787485553, "kl": 0.08056640625, "learning_rate": 9.99645163776368e-07, "loss": 0.0032, "reward": 1.8219269514083862, "reward_std": 0.31342241168022156, "rewards/accuracy_reward": 0.6906768083572388, "rewards/format_reward": 1.0, "step": 631, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 424.57501220703125, "epoch": 0.012011783711869239, "grad_norm": 1.424304229543566, "kl": 0.0859375, "learning_rate": 9.99644038339755e-07, "loss": 0.0034, "reward": 1.597337245941162, "reward_std": 0.0419875867664814, "rewards/accuracy_reward": 0.453587144613266, "rewards/format_reward": 1.0, "step": 632, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 457.6750183105469, "epoch": 0.012030789698755108, "grad_norm": 1.8217729732219172, "kl": 0.07666015625, "learning_rate": 9.99642911121825e-07, "loss": 0.0031, "reward": 1.590000033378601, "reward_std": 0.25514116883277893, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 633, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 449.9250183105469, "epoch": 0.012049795685640977, "grad_norm": 1.7278090085604565, "kl": 0.076171875, "learning_rate": 9.996417821225816e-07, "loss": 0.0031, "reward": 2.0562500953674316, "reward_std": 0.2243637591600418, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 634, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 445.4750061035156, "epoch": 0.012068801672526846, "grad_norm": 2.0969020755065686, "kl": 0.08203125, "learning_rate": 9.996406513420286e-07, "loss": 0.0033, "reward": 1.8986866474151611, "reward_std": 0.16078971326351166, "rewards/accuracy_reward": 0.6424368023872375, "rewards/format_reward": 1.0, "step": 635, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 417.125, "epoch": 0.012087807659412714, "grad_norm": 2.5056519956756445, "kl": 0.0849609375, "learning_rate": 9.996395187801704e-07, "loss": 0.0034, "reward": 1.6768711805343628, "reward_std": 0.26196733117103577, "rewards/accuracy_reward": 0.616871178150177, "rewards/format_reward": 1.0, "step": 636, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 438.1000061035156, "epoch": 0.012106813646298583, "grad_norm": 1.4188254461964447, "kl": 0.087890625, "learning_rate": 9.99638384437011e-07, "loss": 0.0035, "reward": 2.0, "reward_std": 0.04963252693414688, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 637, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 425.57501220703125, "epoch": 0.012125819633184452, "grad_norm": 1.5734732496071313, "kl": 0.091796875, "learning_rate": 9.996372483125545e-07, "loss": 0.0037, "reward": 1.7487499713897705, "reward_std": 0.02397349290549755, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 638, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 383.0500183105469, "epoch": 0.012144825620070323, "grad_norm": 2.1373953876716247, "kl": 0.083984375, "learning_rate": 9.996361104068046e-07, "loss": 0.0034, "reward": 2.0375001430511475, "reward_std": 0.13431942462921143, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 639, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 454.7250061035156, "epoch": 0.012163831606956192, "grad_norm": 2.934219298783286, "kl": 0.06201171875, "learning_rate": 9.996349707197658e-07, "loss": 0.0025, "reward": 1.9974462985992432, "reward_std": 0.04825620725750923, "rewards/accuracy_reward": 0.8361961245536804, "rewards/format_reward": 1.0, "step": 640, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.0249938964844, "epoch": 0.01218283759384206, "grad_norm": 4.915724140240582, "kl": 0.0703125, "learning_rate": 9.996338292514417e-07, "loss": 0.0028, "reward": 1.905596375465393, "reward_std": 0.28087708353996277, "rewards/accuracy_reward": 0.8243463635444641, "rewards/format_reward": 0.949999988079071, "step": 641, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 436.125, "epoch": 0.01220184358072793, "grad_norm": 1.7731350972977515, "kl": 0.09375, "learning_rate": 9.996326860018367e-07, "loss": 0.0037, "reward": 1.9500000476837158, "reward_std": 0.389931857585907, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 642, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.3500061035156, "epoch": 0.012220849567613799, "grad_norm": 2.0456006068500403, "kl": 0.0654296875, "learning_rate": 9.99631540970955e-07, "loss": 0.0026, "reward": 1.8209278583526611, "reward_std": 0.04710244759917259, "rewards/accuracy_reward": 0.6621779799461365, "rewards/format_reward": 1.0, "step": 643, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 437.0500183105469, "epoch": 0.012239855554499668, "grad_norm": 1.864468504138235, "kl": 0.0869140625, "learning_rate": 9.996303941588001e-07, "loss": 0.0035, "reward": 1.8938648700714111, "reward_std": 0.04810573533177376, "rewards/accuracy_reward": 0.6726149916648865, "rewards/format_reward": 1.0, "step": 644, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 460.2250061035156, "epoch": 0.012258861541385536, "grad_norm": 2.7097819212031315, "kl": 0.0732421875, "learning_rate": 9.996292455653765e-07, "loss": 0.0029, "reward": 1.5883558988571167, "reward_std": 0.03470195457339287, "rewards/accuracy_reward": 0.43960580229759216, "rewards/format_reward": 1.0, "step": 645, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 417.7749938964844, "epoch": 0.012277867528271405, "grad_norm": 2.0277230704655484, "kl": 0.1044921875, "learning_rate": 9.996280951906884e-07, "loss": 0.0042, "reward": 2.0450000762939453, "reward_std": 0.10080788284540176, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 646, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 452.95001220703125, "epoch": 0.012296873515157274, "grad_norm": 1.8869256961406453, "kl": 0.0693359375, "learning_rate": 9.996269430347397e-07, "loss": 0.0028, "reward": 1.651249885559082, "reward_std": 0.24141600728034973, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 647, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.7250061035156, "epoch": 0.012315879502043143, "grad_norm": 5.339584858474898, "kl": 0.06494140625, "learning_rate": 9.996257890975348e-07, "loss": 0.0026, "reward": 1.7776453495025635, "reward_std": 0.08328854292631149, "rewards/accuracy_reward": 0.651395320892334, "rewards/format_reward": 1.0, "step": 648, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 444.5, "epoch": 0.012334885488929012, "grad_norm": 1.9100516400642031, "kl": 0.10302734375, "learning_rate": 9.996246333790773e-07, "loss": 0.0041, "reward": 1.7387501001358032, "reward_std": 0.03230414167046547, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 649, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 453.6000061035156, "epoch": 0.012353891475814881, "grad_norm": 1.8411640564675906, "kl": 0.08203125, "learning_rate": 9.996234758793719e-07, "loss": 0.0033, "reward": 1.9427776336669922, "reward_std": 0.13237065076828003, "rewards/accuracy_reward": 0.7477777004241943, "rewards/format_reward": 1.0, "step": 650, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 470.7250061035156, "epoch": 0.01237289746270075, "grad_norm": 1.457761360473391, "kl": 0.08837890625, "learning_rate": 9.996223165984222e-07, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.28509339690208435, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.9750000238418579, "step": 651, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 440.625, "epoch": 0.012391903449586619, "grad_norm": 2.048107960168898, "kl": 0.087890625, "learning_rate": 9.996211555362323e-07, "loss": 0.0035, "reward": 1.803942084312439, "reward_std": 0.24984395503997803, "rewards/accuracy_reward": 0.6914423108100891, "rewards/format_reward": 1.0, "step": 652, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 437.32501220703125, "epoch": 0.01241090943647249, "grad_norm": 2.2810613413674905, "kl": 0.0849609375, "learning_rate": 9.996199926928071e-07, "loss": 0.0034, "reward": 1.8767356872558594, "reward_std": 0.1472785919904709, "rewards/accuracy_reward": 0.8229856491088867, "rewards/format_reward": 1.0, "step": 653, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 450.1000061035156, "epoch": 0.012429915423358359, "grad_norm": 4.732046469898634, "kl": 0.087890625, "learning_rate": 9.9961882806815e-07, "loss": 0.0035, "reward": 1.9523528814315796, "reward_std": 0.04254266619682312, "rewards/accuracy_reward": 0.8073530197143555, "rewards/format_reward": 1.0, "step": 654, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 413.6000061035156, "epoch": 0.012448921410244227, "grad_norm": 1.6900681576111625, "kl": 0.07958984375, "learning_rate": 9.996176616622653e-07, "loss": 0.0032, "reward": 1.8049999475479126, "reward_std": 0.3436048626899719, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 655, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 461.70001220703125, "epoch": 0.012467927397130096, "grad_norm": 2.5144251096286796, "kl": 0.10009765625, "learning_rate": 9.996164934751575e-07, "loss": 0.004, "reward": 2.023566961288452, "reward_std": 0.13137094676494598, "rewards/accuracy_reward": 0.8110671043395996, "rewards/format_reward": 1.0, "step": 656, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 469.1750183105469, "epoch": 0.012486933384015965, "grad_norm": 1.5470058910709186, "kl": 0.048828125, "learning_rate": 9.996153235068303e-07, "loss": 0.0019, "reward": 1.8545799255371094, "reward_std": 0.12340261787176132, "rewards/accuracy_reward": 0.8495798110961914, "rewards/format_reward": 1.0, "step": 657, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 467.1499938964844, "epoch": 0.012505939370901834, "grad_norm": 2.1737929252088724, "kl": 0.111328125, "learning_rate": 9.996141517572882e-07, "loss": 0.0045, "reward": 2.06499981880188, "reward_std": 0.048381078988313675, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 658, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 418.3000183105469, "epoch": 0.012524945357787703, "grad_norm": 1.737305405881958, "kl": 0.060791015625, "learning_rate": 9.996129782265354e-07, "loss": 0.0024, "reward": 1.751460313796997, "reward_std": 0.026737544685602188, "rewards/accuracy_reward": 0.6464601755142212, "rewards/format_reward": 1.0, "step": 659, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 451.57501220703125, "epoch": 0.012543951344673572, "grad_norm": 2.0080142630810327, "kl": 0.09130859375, "learning_rate": 9.996118029145757e-07, "loss": 0.0036, "reward": 2.202500104904175, "reward_std": 0.12062938511371613, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 660, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 445.95001220703125, "epoch": 0.012562957331559441, "grad_norm": 2.335174355328368, "kl": 0.076171875, "learning_rate": 9.996106258214138e-07, "loss": 0.003, "reward": 1.9196529388427734, "reward_std": 0.14973393082618713, "rewards/accuracy_reward": 0.713403046131134, "rewards/format_reward": 1.0, "step": 661, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 437.07501220703125, "epoch": 0.01258196331844531, "grad_norm": 1.7604136242117012, "kl": 0.09521484375, "learning_rate": 9.996094469470534e-07, "loss": 0.0038, "reward": 1.7487499713897705, "reward_std": 0.0325133316218853, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 662, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 445.5249938964844, "epoch": 0.012600969305331179, "grad_norm": 1.9687321204089707, "kl": 0.0869140625, "learning_rate": 9.99608266291499e-07, "loss": 0.0035, "reward": 1.7225693464279175, "reward_std": 0.11156398057937622, "rewards/accuracy_reward": 0.5750694274902344, "rewards/format_reward": 1.0, "step": 663, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 455.25, "epoch": 0.012619975292217048, "grad_norm": 2.047854068258095, "kl": 0.1005859375, "learning_rate": 9.996070838547548e-07, "loss": 0.004, "reward": 2.004999876022339, "reward_std": 0.05544399842619896, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 664, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 438.1000061035156, "epoch": 0.012638981279102917, "grad_norm": 5.169949131914374, "kl": 0.08203125, "learning_rate": 9.99605899636825e-07, "loss": 0.0033, "reward": 1.960333228111267, "reward_std": 0.162943035364151, "rewards/accuracy_reward": 0.861583411693573, "rewards/format_reward": 1.0, "step": 665, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 461.7250061035156, "epoch": 0.012657987265988787, "grad_norm": 2.463152764732014, "kl": 0.099609375, "learning_rate": 9.99604713637714e-07, "loss": 0.004, "reward": 1.9513797760009766, "reward_std": 0.25893664360046387, "rewards/accuracy_reward": 0.8126299977302551, "rewards/format_reward": 1.0, "step": 666, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 464.3000183105469, "epoch": 0.012676993252874656, "grad_norm": 2.4680220945409133, "kl": 0.06982421875, "learning_rate": 9.996035258574253e-07, "loss": 0.0028, "reward": 1.8019866943359375, "reward_std": 0.18856680393218994, "rewards/accuracy_reward": 0.6719867587089539, "rewards/format_reward": 1.0, "step": 667, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 403.5500183105469, "epoch": 0.012695999239760525, "grad_norm": 1.7906531481507488, "kl": 0.060546875, "learning_rate": 9.99602336295964e-07, "loss": 0.0024, "reward": 1.6636707782745361, "reward_std": 0.24025700986385345, "rewards/accuracy_reward": 0.6261708736419678, "rewards/format_reward": 1.0, "step": 668, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 455.125, "epoch": 0.012715005226646394, "grad_norm": 1.3604911153138315, "kl": 0.08935546875, "learning_rate": 9.99601144953334e-07, "loss": 0.0036, "reward": 2.015000104904175, "reward_std": 0.035699550062417984, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 669, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 442.0500183105469, "epoch": 0.012734011213532263, "grad_norm": 1.6110022902882712, "kl": 0.045654296875, "learning_rate": 9.995999518295395e-07, "loss": 0.0018, "reward": 1.787500023841858, "reward_std": 0.25540170073509216, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 670, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 438.9750061035156, "epoch": 0.012753017200418132, "grad_norm": 1.5515349711701059, "kl": 0.0625, "learning_rate": 9.995987569245848e-07, "loss": 0.0025, "reward": 1.6841137409210205, "reward_std": 0.04847750440239906, "rewards/accuracy_reward": 0.5491136908531189, "rewards/format_reward": 1.0, "step": 671, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 438.0249938964844, "epoch": 0.012772023187304, "grad_norm": 2.391331011104615, "kl": 0.07421875, "learning_rate": 9.99597560238474e-07, "loss": 0.003, "reward": 1.9480408430099487, "reward_std": 0.12720969319343567, "rewards/accuracy_reward": 0.7405409216880798, "rewards/format_reward": 1.0, "step": 672, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 382.6000061035156, "epoch": 0.01279102917418987, "grad_norm": 2.0372452827805874, "kl": 0.0654296875, "learning_rate": 9.995963617712116e-07, "loss": 0.0026, "reward": 1.7572059631347656, "reward_std": 0.07543417066335678, "rewards/accuracy_reward": 0.6547058820724487, "rewards/format_reward": 1.0, "step": 673, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 432.3000183105469, "epoch": 0.012810035161075739, "grad_norm": 1.3425902262789746, "kl": 0.06787109375, "learning_rate": 9.99595161522802e-07, "loss": 0.0027, "reward": 1.813750147819519, "reward_std": 0.09199460595846176, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 674, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 427.75, "epoch": 0.012829041147961608, "grad_norm": 1.6300342526398393, "kl": 0.087890625, "learning_rate": 9.99593959493249e-07, "loss": 0.0035, "reward": 1.9650001525878906, "reward_std": 0.02449488826096058, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 675, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 446.6750183105469, "epoch": 0.012848047134847477, "grad_norm": 2.8352253222956687, "kl": 0.0634765625, "learning_rate": 9.995927556825572e-07, "loss": 0.0025, "reward": 1.5192979574203491, "reward_std": 0.25122472643852234, "rewards/accuracy_reward": 0.40304800868034363, "rewards/format_reward": 1.0, "step": 676, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 446.5249938964844, "epoch": 0.012867053121733345, "grad_norm": 1.5411355002220397, "kl": 0.0703125, "learning_rate": 9.99591550090731e-07, "loss": 0.0028, "reward": 1.7590769529342651, "reward_std": 0.29475030303001404, "rewards/accuracy_reward": 0.719076931476593, "rewards/format_reward": 1.0, "step": 677, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 445.3000183105469, "epoch": 0.012886059108619214, "grad_norm": 2.266135144032741, "kl": 0.0732421875, "learning_rate": 9.995903427177743e-07, "loss": 0.0029, "reward": 1.6436678171157837, "reward_std": 0.09614302963018417, "rewards/accuracy_reward": 0.4949178695678711, "rewards/format_reward": 1.0, "step": 678, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 474.32501220703125, "epoch": 0.012905065095505083, "grad_norm": 1.5861619437714614, "kl": 0.107421875, "learning_rate": 9.99589133563692e-07, "loss": 0.0043, "reward": 1.4812500476837158, "reward_std": 0.35283902287483215, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 0.949999988079071, "step": 679, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 468.7250061035156, "epoch": 0.012924071082390954, "grad_norm": 2.1824498112310957, "kl": 0.1103515625, "learning_rate": 9.995879226284878e-07, "loss": 0.0044, "reward": 2.0587499141693115, "reward_std": 0.13940279185771942, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 680, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 476.8000183105469, "epoch": 0.012943077069276823, "grad_norm": 1.419429883664164, "kl": 0.115234375, "learning_rate": 9.995867099121663e-07, "loss": 0.0046, "reward": 1.7587498426437378, "reward_std": 0.11715694516897202, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 681, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 445.0500183105469, "epoch": 0.012962083056162692, "grad_norm": 4.968383152055369, "kl": 0.111328125, "learning_rate": 9.995854954147318e-07, "loss": 0.0045, "reward": 1.8493388891220093, "reward_std": 0.17193397879600525, "rewards/accuracy_reward": 0.6318390965461731, "rewards/format_reward": 1.0, "step": 682, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 439.1750183105469, "epoch": 0.01298108904304856, "grad_norm": 1.7905287798967389, "kl": 0.08837890625, "learning_rate": 9.995842791361889e-07, "loss": 0.0035, "reward": 1.712389349937439, "reward_std": 0.06312983483076096, "rewards/accuracy_reward": 0.4986395537853241, "rewards/format_reward": 1.0, "step": 683, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 488.0500183105469, "epoch": 0.01300009502993443, "grad_norm": 3.3361165259050787, "kl": 0.10400390625, "learning_rate": 9.995830610765413e-07, "loss": 0.0042, "reward": 2.029280424118042, "reward_std": 0.2020348608493805, "rewards/accuracy_reward": 0.8017805218696594, "rewards/format_reward": 0.9750000238418579, "step": 684, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 455.875, "epoch": 0.013019101016820299, "grad_norm": 1.6627613233285068, "kl": 0.11181640625, "learning_rate": 9.995818412357939e-07, "loss": 0.0045, "reward": 2.078749895095825, "reward_std": 0.12405641376972198, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 685, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 481.5, "epoch": 0.013038107003706167, "grad_norm": 2.147588045489399, "kl": 0.08447265625, "learning_rate": 9.99580619613951e-07, "loss": 0.0034, "reward": 1.9386663436889648, "reward_std": 0.17508693039417267, "rewards/accuracy_reward": 0.8549163937568665, "rewards/format_reward": 1.0, "step": 686, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 486.1750183105469, "epoch": 0.013057112990592036, "grad_norm": 1.6460557367558475, "kl": 0.11474609375, "learning_rate": 9.995793962110164e-07, "loss": 0.0046, "reward": 1.7737499475479126, "reward_std": 0.07152249664068222, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 687, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 487.2749938964844, "epoch": 0.013076118977477905, "grad_norm": 2.9424336084366276, "kl": 0.10498046875, "learning_rate": 9.995781710269952e-07, "loss": 0.0042, "reward": 1.6504096984863281, "reward_std": 0.32433828711509705, "rewards/accuracy_reward": 0.6016597151756287, "rewards/format_reward": 0.949999988079071, "step": 688, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 488.6499938964844, "epoch": 0.013095124964363774, "grad_norm": 1.3859886241595234, "kl": 0.1025390625, "learning_rate": 9.995769440618914e-07, "loss": 0.0041, "reward": 1.5981104373931885, "reward_std": 0.3493271470069885, "rewards/accuracy_reward": 0.5643603801727295, "rewards/format_reward": 0.9750000238418579, "step": 689, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 459.75, "epoch": 0.013114130951249643, "grad_norm": 1.5592672984917082, "kl": 0.1044921875, "learning_rate": 9.995757153157093e-07, "loss": 0.0042, "reward": 2.1262500286102295, "reward_std": 0.14376184344291687, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 690, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 487.5, "epoch": 0.013133136938135512, "grad_norm": 1.9335252465138564, "kl": 0.1279296875, "learning_rate": 9.995744847884535e-07, "loss": 0.0051, "reward": 2.0260372161865234, "reward_std": 0.1518925577402115, "rewards/accuracy_reward": 0.7647872567176819, "rewards/format_reward": 1.0, "step": 691, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 471.3000183105469, "epoch": 0.013152142925021381, "grad_norm": 1.6032090227902767, "kl": 0.1318359375, "learning_rate": 9.995732524801284e-07, "loss": 0.0053, "reward": 1.7048267126083374, "reward_std": 0.19036322832107544, "rewards/accuracy_reward": 0.5773269534111023, "rewards/format_reward": 0.9750000238418579, "step": 692, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 470.375, "epoch": 0.01317114891190725, "grad_norm": 1.7611384344022971, "kl": 0.11474609375, "learning_rate": 9.99572018390738e-07, "loss": 0.0046, "reward": 1.9498538970947266, "reward_std": 0.25558096170425415, "rewards/accuracy_reward": 0.8211038708686829, "rewards/format_reward": 0.9750000238418579, "step": 693, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 434.6750183105469, "epoch": 0.01319015489879312, "grad_norm": 1.4777802098925599, "kl": 0.0947265625, "learning_rate": 9.995707825202873e-07, "loss": 0.0038, "reward": 1.3722171783447266, "reward_std": 0.1442262977361679, "rewards/accuracy_reward": 0.3622172772884369, "rewards/format_reward": 1.0, "step": 694, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 467.625, "epoch": 0.01320916088567899, "grad_norm": 1.5567855319393067, "kl": 0.10009765625, "learning_rate": 9.995695448687803e-07, "loss": 0.004, "reward": 2.016423225402832, "reward_std": 0.05085080862045288, "rewards/accuracy_reward": 0.8601731657981873, "rewards/format_reward": 1.0, "step": 695, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 450.4250183105469, "epoch": 0.013228166872564858, "grad_norm": 1.8308404877685869, "kl": 0.0791015625, "learning_rate": 9.995683054362214e-07, "loss": 0.0032, "reward": 1.8470677137374878, "reward_std": 0.1592775285243988, "rewards/accuracy_reward": 0.6845678687095642, "rewards/format_reward": 1.0, "step": 696, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 430.4250183105469, "epoch": 0.013247172859450727, "grad_norm": 1.8005562681832192, "kl": 0.10888671875, "learning_rate": 9.99567064222615e-07, "loss": 0.0044, "reward": 1.8144510984420776, "reward_std": 0.1525733917951584, "rewards/accuracy_reward": 0.6607011556625366, "rewards/format_reward": 1.0, "step": 697, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 449.8999938964844, "epoch": 0.013266178846336596, "grad_norm": 1.6268096392218965, "kl": 0.0927734375, "learning_rate": 9.99565821227966e-07, "loss": 0.0037, "reward": 1.8695858716964722, "reward_std": 0.03064870275557041, "rewards/accuracy_reward": 0.6445857882499695, "rewards/format_reward": 1.0, "step": 698, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 396.4750061035156, "epoch": 0.013285184833222465, "grad_norm": 1.5353147209699973, "kl": 0.08740234375, "learning_rate": 9.995645764522783e-07, "loss": 0.0035, "reward": 1.558989405632019, "reward_std": 0.033408764749765396, "rewards/accuracy_reward": 0.4802393615245819, "rewards/format_reward": 1.0, "step": 699, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 450.45001220703125, "epoch": 0.013304190820108334, "grad_norm": 8.411435210594698, "kl": 0.08447265625, "learning_rate": 9.995633298955563e-07, "loss": 0.0034, "reward": 1.4234392642974854, "reward_std": 0.09718119353055954, "rewards/accuracy_reward": 0.3634392023086548, "rewards/format_reward": 1.0, "step": 700, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 442.6499938964844, "epoch": 0.013323196806994203, "grad_norm": 1.561761211920665, "kl": 0.0712890625, "learning_rate": 9.99562081557805e-07, "loss": 0.0029, "reward": 1.8637498617172241, "reward_std": 0.11404251307249069, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 701, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 423.1499938964844, "epoch": 0.013342202793880072, "grad_norm": 1.7734278607055687, "kl": 0.068359375, "learning_rate": 9.995608314390282e-07, "loss": 0.0027, "reward": 1.7347859144210815, "reward_std": 0.20102575421333313, "rewards/accuracy_reward": 0.6360358595848083, "rewards/format_reward": 1.0, "step": 702, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 414.7749938964844, "epoch": 0.013361208780765941, "grad_norm": 1.500602105385007, "kl": 0.10888671875, "learning_rate": 9.995595795392309e-07, "loss": 0.0043, "reward": 1.872756004333496, "reward_std": 0.04351817071437836, "rewards/accuracy_reward": 0.6527560949325562, "rewards/format_reward": 1.0, "step": 703, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 401.1750183105469, "epoch": 0.01338021476765181, "grad_norm": 2.457688062196613, "kl": 0.09326171875, "learning_rate": 9.995583258584173e-07, "loss": 0.0037, "reward": 1.9233334064483643, "reward_std": 0.1413118690252304, "rewards/accuracy_reward": 0.7583333849906921, "rewards/format_reward": 1.0, "step": 704, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 441.375, "epoch": 0.013399220754537679, "grad_norm": 2.159684717658479, "kl": 0.056396484375, "learning_rate": 9.995570703965919e-07, "loss": 0.0023, "reward": 1.666308045387268, "reward_std": 0.15317901968955994, "rewards/accuracy_reward": 0.5713080763816833, "rewards/format_reward": 1.0, "step": 705, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 417.1000061035156, "epoch": 0.013418226741423548, "grad_norm": 1.8305413335876235, "kl": 0.10888671875, "learning_rate": 9.99555813153759e-07, "loss": 0.0043, "reward": 1.7286853790283203, "reward_std": 0.15412406623363495, "rewards/accuracy_reward": 0.516185462474823, "rewards/format_reward": 1.0, "step": 706, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 412.6000061035156, "epoch": 0.013437232728309418, "grad_norm": 1.6703440186537377, "kl": 0.087890625, "learning_rate": 9.995545541299234e-07, "loss": 0.0035, "reward": 1.998673677444458, "reward_std": 0.0639389157295227, "rewards/accuracy_reward": 0.8824234008789062, "rewards/format_reward": 1.0, "step": 707, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 467.5, "epoch": 0.013456238715195287, "grad_norm": 1.683539870395896, "kl": 0.09375, "learning_rate": 9.995532933250893e-07, "loss": 0.0037, "reward": 2.0198028087615967, "reward_std": 0.05851763114333153, "rewards/accuracy_reward": 0.8735527396202087, "rewards/format_reward": 1.0, "step": 708, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 444.2250061035156, "epoch": 0.013475244702081156, "grad_norm": 1.7378851528712578, "kl": 0.1123046875, "learning_rate": 9.995520307392617e-07, "loss": 0.0045, "reward": 1.576685905456543, "reward_std": 0.05492706224322319, "rewards/accuracy_reward": 0.4429359436035156, "rewards/format_reward": 1.0, "step": 709, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 413.57501220703125, "epoch": 0.013494250688967025, "grad_norm": 1.6149188920475832, "kl": 0.07666015625, "learning_rate": 9.995507663724445e-07, "loss": 0.0031, "reward": 1.9402374029159546, "reward_std": 0.09124945849180222, "rewards/accuracy_reward": 0.8314873576164246, "rewards/format_reward": 1.0, "step": 710, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 491.9750061035156, "epoch": 0.013513256675852894, "grad_norm": 1.629602526732811, "kl": 0.1337890625, "learning_rate": 9.995495002246424e-07, "loss": 0.0053, "reward": 2.268749952316284, "reward_std": 0.16989779472351074, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 711, "temporal_rewards": 1.0 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 500.5500183105469, "epoch": 0.013532262662738763, "grad_norm": 1.2915342122524358, "kl": 0.07861328125, "learning_rate": 9.9954823229586e-07, "loss": 0.0031, "reward": 1.6649999618530273, "reward_std": 0.4036776125431061, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 0.9000000357627869, "step": 712, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 521.1749877929688, "epoch": 0.013551268649624632, "grad_norm": 1.477959756958054, "kl": 0.1181640625, "learning_rate": 9.99546962586102e-07, "loss": 0.0047, "reward": 1.9762500524520874, "reward_std": 0.3797941505908966, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 0.949999988079071, "step": 713, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 491.6000061035156, "epoch": 0.0135702746365105, "grad_norm": 2.2341639964465068, "kl": 0.10009765625, "learning_rate": 9.995456910953723e-07, "loss": 0.004, "reward": 1.7212860584259033, "reward_std": 0.11174527555704117, "rewards/accuracy_reward": 0.6525360941886902, "rewards/format_reward": 1.0, "step": 714, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 514.7000122070312, "epoch": 0.01358928062339637, "grad_norm": 1.6864523396092914, "kl": 0.0908203125, "learning_rate": 9.99544417823676e-07, "loss": 0.0036, "reward": 1.6670664548873901, "reward_std": 0.26193180680274963, "rewards/accuracy_reward": 0.6383164525032043, "rewards/format_reward": 1.0, "step": 715, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 514.9750366210938, "epoch": 0.013608286610282239, "grad_norm": 1.5579047563085828, "kl": 0.09423828125, "learning_rate": 9.995431427710176e-07, "loss": 0.0038, "reward": 1.6074436902999878, "reward_std": 0.17465007305145264, "rewards/accuracy_reward": 0.6824435591697693, "rewards/format_reward": 0.824999988079071, "step": 716, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 493.9250183105469, "epoch": 0.013627292597168108, "grad_norm": 1.344428379186121, "kl": 0.095703125, "learning_rate": 9.995418659374015e-07, "loss": 0.0038, "reward": 1.6413357257843018, "reward_std": 0.13948741555213928, "rewards/accuracy_reward": 0.5838356018066406, "rewards/format_reward": 1.0, "step": 717, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 475.0249938964844, "epoch": 0.013646298584053976, "grad_norm": 1.5545225116049264, "kl": 0.08984375, "learning_rate": 9.995405873228323e-07, "loss": 0.0036, "reward": 1.6741666793823242, "reward_std": 0.17087028920650482, "rewards/accuracy_reward": 0.6166667342185974, "rewards/format_reward": 1.0, "step": 718, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 465.70001220703125, "epoch": 0.013665304570939845, "grad_norm": 1.7208840275761956, "kl": 0.11279296875, "learning_rate": 9.995393069273145e-07, "loss": 0.0045, "reward": 1.7095832824707031, "reward_std": 0.1775379627943039, "rewards/accuracy_reward": 0.6195833683013916, "rewards/format_reward": 0.949999988079071, "step": 719, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 463.3000183105469, "epoch": 0.013684310557825714, "grad_norm": 1.914289754729987, "kl": 0.10107421875, "learning_rate": 9.995380247508528e-07, "loss": 0.004, "reward": 1.944087266921997, "reward_std": 0.1626613438129425, "rewards/accuracy_reward": 0.7515873312950134, "rewards/format_reward": 1.0, "step": 720, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 471.57501220703125, "epoch": 0.013703316544711585, "grad_norm": 2.187028513443578, "kl": 0.12255859375, "learning_rate": 9.995367407934513e-07, "loss": 0.0049, "reward": 1.9500000476837158, "reward_std": 0.03842785581946373, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 721, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 446.75, "epoch": 0.013722322531597454, "grad_norm": 2.083046307719133, "kl": 0.08203125, "learning_rate": 9.995354550551155e-07, "loss": 0.0033, "reward": 1.7972339391708374, "reward_std": 0.25385886430740356, "rewards/accuracy_reward": 0.6409839987754822, "rewards/format_reward": 0.949999988079071, "step": 722, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 430.9750061035156, "epoch": 0.013741328518483323, "grad_norm": 1.8906994238501933, "kl": 0.0908203125, "learning_rate": 9.99534167535849e-07, "loss": 0.0036, "reward": 1.6831945180892944, "reward_std": 0.06170584633946419, "rewards/accuracy_reward": 0.6194444298744202, "rewards/format_reward": 1.0, "step": 723, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 424.1750183105469, "epoch": 0.013760334505369192, "grad_norm": 2.8193160405494675, "kl": 0.056884765625, "learning_rate": 9.99532878235657e-07, "loss": 0.0023, "reward": 1.7075878381729126, "reward_std": 0.2339305430650711, "rewards/accuracy_reward": 0.6125877499580383, "rewards/format_reward": 1.0, "step": 724, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.3999938964844, "epoch": 0.01377934049225506, "grad_norm": 1.5906748419733205, "kl": 0.06494140625, "learning_rate": 9.99531587154544e-07, "loss": 0.0026, "reward": 1.7350000143051147, "reward_std": 0.2571442723274231, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.949999988079071, "step": 725, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 414.875, "epoch": 0.01379834647914093, "grad_norm": 1.9145714963599305, "kl": 0.1015625, "learning_rate": 9.995302942925143e-07, "loss": 0.0041, "reward": 1.9920570850372314, "reward_std": 0.24692554771900177, "rewards/accuracy_reward": 0.8295570611953735, "rewards/format_reward": 1.0, "step": 726, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 396.5, "epoch": 0.013817352466026799, "grad_norm": 2.034283441537553, "kl": 0.0673828125, "learning_rate": 9.99528999649573e-07, "loss": 0.0027, "reward": 1.5793137550354004, "reward_std": 0.028951624408364296, "rewards/accuracy_reward": 0.48556381464004517, "rewards/format_reward": 1.0, "step": 727, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 409.1499938964844, "epoch": 0.013836358452912667, "grad_norm": 1.9427356544075842, "kl": 0.09716796875, "learning_rate": 9.995277032257243e-07, "loss": 0.0039, "reward": 2.0037500858306885, "reward_std": 0.16830992698669434, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 728, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 395.32501220703125, "epoch": 0.013855364439798536, "grad_norm": 1.5906150597472941, "kl": 0.06689453125, "learning_rate": 9.995264050209728e-07, "loss": 0.0027, "reward": 1.931249976158142, "reward_std": 0.24466462433338165, "rewards/accuracy_reward": 0.8324999809265137, "rewards/format_reward": 1.0, "step": 729, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 446.4250183105469, "epoch": 0.013874370426684405, "grad_norm": 2.359530532053771, "kl": 0.1005859375, "learning_rate": 9.995251050353236e-07, "loss": 0.004, "reward": 1.9478927850723267, "reward_std": 0.04787446931004524, "rewards/accuracy_reward": 0.7928928732872009, "rewards/format_reward": 1.0, "step": 730, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 419.07501220703125, "epoch": 0.013893376413570274, "grad_norm": 1.734433346991532, "kl": 0.09521484375, "learning_rate": 9.995238032687809e-07, "loss": 0.0038, "reward": 1.693750023841858, "reward_std": 0.01837114803493023, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 731, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 470.3500061035156, "epoch": 0.013912382400456143, "grad_norm": 1.536544196243968, "kl": 0.09619140625, "learning_rate": 9.995224997213497e-07, "loss": 0.0038, "reward": 1.7087501287460327, "reward_std": 0.2257985919713974, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 732, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 478.0500183105469, "epoch": 0.013931388387342012, "grad_norm": 1.653609333566446, "kl": 0.13671875, "learning_rate": 9.995211943930342e-07, "loss": 0.0055, "reward": 2.0799999237060547, "reward_std": 0.03009720705449581, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 733, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 475.20001220703125, "epoch": 0.013950394374227881, "grad_norm": 1.5542723449274536, "kl": 0.1298828125, "learning_rate": 9.995198872838392e-07, "loss": 0.0052, "reward": 2.231250047683716, "reward_std": 0.03622090816497803, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 734, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 509.1750183105469, "epoch": 0.013969400361113752, "grad_norm": 1.37641121629186, "kl": 0.11572265625, "learning_rate": 9.995185783937697e-07, "loss": 0.0046, "reward": 1.529911994934082, "reward_std": 0.13772501051425934, "rewards/accuracy_reward": 0.5061619877815247, "rewards/format_reward": 0.9750000238418579, "step": 735, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 502.1000061035156, "epoch": 0.01398840634799962, "grad_norm": 3.8174721207345197, "kl": 0.09765625, "learning_rate": 9.9951726772283e-07, "loss": 0.0039, "reward": 1.6628608703613281, "reward_std": 0.316476970911026, "rewards/accuracy_reward": 0.731610894203186, "rewards/format_reward": 0.9000000357627869, "step": 736, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 496.4250183105469, "epoch": 0.01400741233488549, "grad_norm": 1.7831031299435494, "kl": 0.1298828125, "learning_rate": 9.995159552710246e-07, "loss": 0.0052, "reward": 1.7787500619888306, "reward_std": 0.2259797602891922, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 737, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 452.6000061035156, "epoch": 0.014026418321771358, "grad_norm": 1.544603820218446, "kl": 0.1279296875, "learning_rate": 9.995146410383587e-07, "loss": 0.0051, "reward": 1.5206249952316284, "reward_std": 0.24276505410671234, "rewards/accuracy_reward": 0.4593749940395355, "rewards/format_reward": 1.0, "step": 738, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 531.9500122070312, "epoch": 0.014045424308657227, "grad_norm": 1.788567454252881, "kl": 0.10107421875, "learning_rate": 9.995133250248368e-07, "loss": 0.004, "reward": 1.1032320261001587, "reward_std": 0.49773478507995605, "rewards/accuracy_reward": 0.2544820308685303, "rewards/format_reward": 0.875, "step": 739, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 477.1750183105469, "epoch": 0.014064430295543096, "grad_norm": 1.7441095899233134, "kl": 0.11083984375, "learning_rate": 9.995120072304634e-07, "loss": 0.0044, "reward": 1.3567793369293213, "reward_std": 0.15901394188404083, "rewards/accuracy_reward": 0.30427926778793335, "rewards/format_reward": 1.0, "step": 740, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.6, "completion_length": 488.1750183105469, "epoch": 0.014083436282428965, "grad_norm": 1.3004473374352874, "kl": 0.140625, "learning_rate": 9.995106876552436e-07, "loss": 0.0056, "reward": 1.2024999856948853, "reward_std": 0.2259700745344162, "rewards/accuracy_reward": 0.20000000298023224, "rewards/format_reward": 1.0, "step": 741, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 498.0500183105469, "epoch": 0.014102442269314834, "grad_norm": 1.2445748694317842, "kl": 0.07421875, "learning_rate": 9.995093662991816e-07, "loss": 0.003, "reward": 1.6849998235702515, "reward_std": 0.2943117022514343, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.9750000238418579, "step": 742, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 479.45001220703125, "epoch": 0.014121448256200703, "grad_norm": 6.583813889317297, "kl": 0.1474609375, "learning_rate": 9.995080431622822e-07, "loss": 0.0059, "reward": 2.288750171661377, "reward_std": 0.2106401026248932, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 1.0, "step": 743, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 420.0249938964844, "epoch": 0.014140454243086572, "grad_norm": 2.0836211277274277, "kl": 0.08349609375, "learning_rate": 9.995067182445504e-07, "loss": 0.0033, "reward": 1.5436538457870483, "reward_std": 0.14448438584804535, "rewards/accuracy_reward": 0.4761539101600647, "rewards/format_reward": 1.0, "step": 744, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.5249938964844, "epoch": 0.01415946022997244, "grad_norm": 6.236097500249061, "kl": 0.091796875, "learning_rate": 9.99505391545991e-07, "loss": 0.0037, "reward": 2.132500171661377, "reward_std": 0.1318339854478836, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 745, "temporal_rewards": 0.699999988079071 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 471.07501220703125, "epoch": 0.01417846621685831, "grad_norm": 2.033070771072757, "kl": 0.1220703125, "learning_rate": 9.995040630666083e-07, "loss": 0.0049, "reward": 2.286250114440918, "reward_std": 0.04182327538728714, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 746, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 461.875, "epoch": 0.014197472203744179, "grad_norm": 1.8083703913124065, "kl": 0.09033203125, "learning_rate": 9.995027328064074e-07, "loss": 0.0036, "reward": 1.4150055646896362, "reward_std": 0.2622986137866974, "rewards/accuracy_reward": 0.4250055253505707, "rewards/format_reward": 0.9750000238418579, "step": 747, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 446.8500061035156, "epoch": 0.014216478190630048, "grad_norm": 1.728439739270468, "kl": 0.09033203125, "learning_rate": 9.995014007653929e-07, "loss": 0.0036, "reward": 2.0140626430511475, "reward_std": 0.31676068902015686, "rewards/accuracy_reward": 0.895312488079071, "rewards/format_reward": 0.9750000238418579, "step": 748, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 477.6499938964844, "epoch": 0.014235484177515918, "grad_norm": 2.306479461937606, "kl": 0.06298828125, "learning_rate": 9.995000669435692e-07, "loss": 0.0025, "reward": 1.4972113370895386, "reward_std": 0.24063192307949066, "rewards/accuracy_reward": 0.4259612560272217, "rewards/format_reward": 0.9750000238418579, "step": 749, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 433.0500183105469, "epoch": 0.014254490164401787, "grad_norm": 3.734498611144611, "kl": 0.076171875, "learning_rate": 9.994987313409419e-07, "loss": 0.0031, "reward": 1.8204425573349, "reward_std": 0.236294224858284, "rewards/accuracy_reward": 0.7641925811767578, "rewards/format_reward": 0.9750000238418579, "step": 750, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 451.125, "epoch": 0.014273496151287656, "grad_norm": 2.0203727131221747, "kl": 0.1064453125, "learning_rate": 9.99497393957515e-07, "loss": 0.0043, "reward": 1.5, "reward_std": 0.19880543649196625, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 1.0, "step": 751, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 449.5500183105469, "epoch": 0.014292502138173525, "grad_norm": 1.9808058069268584, "kl": 0.0751953125, "learning_rate": 9.994960547932934e-07, "loss": 0.003, "reward": 1.6224838495254517, "reward_std": 0.29527994990348816, "rewards/accuracy_reward": 0.507483959197998, "rewards/format_reward": 0.9750000238418579, "step": 752, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 441.20001220703125, "epoch": 0.014311508125059394, "grad_norm": 1.4863333316983103, "kl": 0.0966796875, "learning_rate": 9.99494713848282e-07, "loss": 0.0039, "reward": 1.9149998426437378, "reward_std": 0.13809554278850555, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 753, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 453.6499938964844, "epoch": 0.014330514111945263, "grad_norm": 1.5590016955518622, "kl": 0.09423828125, "learning_rate": 9.99493371122486e-07, "loss": 0.0038, "reward": 1.8900002241134644, "reward_std": 0.045196861028671265, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 754, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 474.2749938964844, "epoch": 0.014349520098831132, "grad_norm": 1.499816752825605, "kl": 0.0859375, "learning_rate": 9.994920266159094e-07, "loss": 0.0034, "reward": 1.337070107460022, "reward_std": 0.4382023811340332, "rewards/accuracy_reward": 0.36957013607025146, "rewards/format_reward": 0.9000000357627869, "step": 755, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 460.75, "epoch": 0.014368526085717, "grad_norm": 1.7724661170009917, "kl": 0.0830078125, "learning_rate": 9.994906803285575e-07, "loss": 0.0033, "reward": 1.8309954404830933, "reward_std": 0.32961222529411316, "rewards/accuracy_reward": 0.7172453999519348, "rewards/format_reward": 0.9750000238418579, "step": 756, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 480.2250061035156, "epoch": 0.01438753207260287, "grad_norm": 1.8904261757288219, "kl": 0.08203125, "learning_rate": 9.99489332260435e-07, "loss": 0.0033, "reward": 1.7253230810165405, "reward_std": 0.23734644055366516, "rewards/accuracy_reward": 0.5840731859207153, "rewards/format_reward": 1.0, "step": 757, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 470.6750183105469, "epoch": 0.014406538059488739, "grad_norm": 1.5154542573522882, "kl": 0.142578125, "learning_rate": 9.994879824115466e-07, "loss": 0.0057, "reward": 1.8287498950958252, "reward_std": 0.14298126101493835, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 758, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 461.6499938964844, "epoch": 0.014425544046374607, "grad_norm": 1.8544975874557557, "kl": 0.099609375, "learning_rate": 9.994866307818973e-07, "loss": 0.004, "reward": 1.8991936445236206, "reward_std": 0.19035105407238007, "rewards/accuracy_reward": 0.7004434466362, "rewards/format_reward": 1.0, "step": 759, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 454.82501220703125, "epoch": 0.014444550033260476, "grad_norm": 1.4462065275613183, "kl": 0.10400390625, "learning_rate": 9.994852773714915e-07, "loss": 0.0042, "reward": 1.7649999856948853, "reward_std": 0.1486663967370987, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 760, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 484.125, "epoch": 0.014463556020146345, "grad_norm": 3.1441284308440065, "kl": 0.11767578125, "learning_rate": 9.994839221803346e-07, "loss": 0.0047, "reward": 1.9326947927474976, "reward_std": 0.2921665906906128, "rewards/accuracy_reward": 0.7714447975158691, "rewards/format_reward": 1.0, "step": 761, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 487.8500061035156, "epoch": 0.014482562007032216, "grad_norm": 3.0139285479540927, "kl": 0.0654296875, "learning_rate": 9.994825652084312e-07, "loss": 0.0026, "reward": 1.6638822555541992, "reward_std": 0.26299145817756653, "rewards/accuracy_reward": 0.5501323342323303, "rewards/format_reward": 1.0, "step": 762, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 447.8000183105469, "epoch": 0.014501567993918085, "grad_norm": 1.989444391148717, "kl": 0.09423828125, "learning_rate": 9.994812064557858e-07, "loss": 0.0038, "reward": 2.0299999713897705, "reward_std": 0.13900230824947357, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 763, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 474.75, "epoch": 0.014520573980803954, "grad_norm": 2.6836970393642363, "kl": 0.0791015625, "learning_rate": 9.994798459224038e-07, "loss": 0.0032, "reward": 1.8350870609283447, "reward_std": 0.15132379531860352, "rewards/accuracy_reward": 0.6375870108604431, "rewards/format_reward": 1.0, "step": 764, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 497.5249938964844, "epoch": 0.014539579967689823, "grad_norm": 1.549667656114667, "kl": 0.10546875, "learning_rate": 9.994784836082896e-07, "loss": 0.0042, "reward": 1.7287498712539673, "reward_std": 0.1318923383951187, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 765, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 460.1000061035156, "epoch": 0.014558585954575692, "grad_norm": 1.882961564104107, "kl": 0.10595703125, "learning_rate": 9.994771195134485e-07, "loss": 0.0042, "reward": 1.9885374307632446, "reward_std": 0.07097212225198746, "rewards/accuracy_reward": 0.7822873592376709, "rewards/format_reward": 1.0, "step": 766, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 497.9250183105469, "epoch": 0.01457759194146156, "grad_norm": 1.4813787887514824, "kl": 0.1142578125, "learning_rate": 9.99475753637885e-07, "loss": 0.0046, "reward": 2.026250123977661, "reward_std": 0.2759816348552704, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9750000238418579, "step": 767, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 480.0, "epoch": 0.01459659792834743, "grad_norm": 1.7262944765708559, "kl": 0.1494140625, "learning_rate": 9.99474385981604e-07, "loss": 0.006, "reward": 2.271250009536743, "reward_std": 0.052882224321365356, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 768, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 460.0, "epoch": 0.014615603915233298, "grad_norm": 1.5616740663610267, "kl": 0.0810546875, "learning_rate": 9.994730165446105e-07, "loss": 0.0032, "reward": 1.4993590116500854, "reward_std": 0.20657002925872803, "rewards/accuracy_reward": 0.4493590295314789, "rewards/format_reward": 0.9750000238418579, "step": 769, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 505.8500061035156, "epoch": 0.014634609902119167, "grad_norm": 1.4632730129406921, "kl": 0.134765625, "learning_rate": 9.994716453269095e-07, "loss": 0.0054, "reward": 1.9187500476837158, "reward_std": 0.20924797654151917, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.9750000238418579, "step": 770, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 482.3000183105469, "epoch": 0.014653615889005036, "grad_norm": 2.1424523172082552, "kl": 0.11669921875, "learning_rate": 9.994702723285055e-07, "loss": 0.0047, "reward": 2.063645839691162, "reward_std": 0.050597209483385086, "rewards/accuracy_reward": 0.8036457896232605, "rewards/format_reward": 1.0, "step": 771, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 471.125, "epoch": 0.014672621875890905, "grad_norm": 1.723248117910098, "kl": 0.115234375, "learning_rate": 9.994688975494038e-07, "loss": 0.0046, "reward": 2.108116865158081, "reward_std": 0.05700944736599922, "rewards/accuracy_reward": 0.841866672039032, "rewards/format_reward": 1.0, "step": 772, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 460.45001220703125, "epoch": 0.014691627862776774, "grad_norm": 1.5126150292434897, "kl": 0.115234375, "learning_rate": 9.99467520989609e-07, "loss": 0.0046, "reward": 1.5732500553131104, "reward_std": 0.21708396077156067, "rewards/accuracy_reward": 0.5057500004768372, "rewards/format_reward": 1.0, "step": 773, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 448.3500061035156, "epoch": 0.014710633849662643, "grad_norm": 1.4429782586789055, "kl": 0.10400390625, "learning_rate": 9.994661426491262e-07, "loss": 0.0041, "reward": 1.7385681867599487, "reward_std": 0.292192280292511, "rewards/accuracy_reward": 0.5723182559013367, "rewards/format_reward": 1.0, "step": 774, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 503.4250183105469, "epoch": 0.014729639836548512, "grad_norm": 1.3077908130981213, "kl": 0.1416015625, "learning_rate": 9.994647625279603e-07, "loss": 0.0057, "reward": 2.2612500190734863, "reward_std": 0.0640869066119194, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 775, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 481.95001220703125, "epoch": 0.014748645823434383, "grad_norm": 1.3490194086212917, "kl": 0.1484375, "learning_rate": 9.994633806261162e-07, "loss": 0.0059, "reward": 1.683749794960022, "reward_std": 0.11192484945058823, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 776, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 487.0, "epoch": 0.014767651810320252, "grad_norm": 1.3791951464065508, "kl": 0.13671875, "learning_rate": 9.994619969435986e-07, "loss": 0.0055, "reward": 1.7949999570846558, "reward_std": 0.2829101085662842, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 777, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 468.1750183105469, "epoch": 0.01478665779720612, "grad_norm": 1.5387961756905428, "kl": 0.11572265625, "learning_rate": 9.994606114804128e-07, "loss": 0.0046, "reward": 1.8037500381469727, "reward_std": 0.28276464343070984, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 778, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 473.25, "epoch": 0.01480566378409199, "grad_norm": 2.0734525343624104, "kl": 0.146484375, "learning_rate": 9.994592242365634e-07, "loss": 0.0059, "reward": 1.8637498617172241, "reward_std": 0.15475887060165405, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 779, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 479.6499938964844, "epoch": 0.014824669770977858, "grad_norm": 1.3821247588442285, "kl": 0.1201171875, "learning_rate": 9.994578352120555e-07, "loss": 0.0048, "reward": 1.9819663763046265, "reward_std": 0.06187741085886955, "rewards/accuracy_reward": 0.8507165312767029, "rewards/format_reward": 1.0, "step": 780, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 503.8000183105469, "epoch": 0.014843675757863727, "grad_norm": 1.4582675984327689, "kl": 0.126953125, "learning_rate": 9.994564444068942e-07, "loss": 0.0051, "reward": 2.132500171661377, "reward_std": 0.2329360991716385, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 0.9750000238418579, "step": 781, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 514.6749877929688, "epoch": 0.014862681744749596, "grad_norm": 1.367826239836461, "kl": 0.09521484375, "learning_rate": 9.994550518210843e-07, "loss": 0.0038, "reward": 1.158756136894226, "reward_std": 0.2638576626777649, "rewards/accuracy_reward": 0.16750600934028625, "rewards/format_reward": 0.9750000238418579, "step": 782, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 466.2749938964844, "epoch": 0.014881687731635465, "grad_norm": 1.4429481769775017, "kl": 0.08837890625, "learning_rate": 9.994536574546308e-07, "loss": 0.0035, "reward": 1.7625430822372437, "reward_std": 0.3283025324344635, "rewards/accuracy_reward": 0.7387930750846863, "rewards/format_reward": 1.0, "step": 783, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 443.75, "epoch": 0.014900693718521334, "grad_norm": 1.5136080221349137, "kl": 0.119140625, "learning_rate": 9.994522613075387e-07, "loss": 0.0048, "reward": 2.048386335372925, "reward_std": 0.2058589905500412, "rewards/accuracy_reward": 0.8271364569664001, "rewards/format_reward": 1.0, "step": 784, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 440.57501220703125, "epoch": 0.014919699705407203, "grad_norm": 1.5032876573447234, "kl": 0.068359375, "learning_rate": 9.994508633798128e-07, "loss": 0.0027, "reward": 1.8225001096725464, "reward_std": 0.3319225311279297, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 785, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 461.0249938964844, "epoch": 0.014938705692293072, "grad_norm": 2.6713461994091143, "kl": 0.08349609375, "learning_rate": 9.994494636714583e-07, "loss": 0.0033, "reward": 1.6481415033340454, "reward_std": 0.30230990052223206, "rewards/accuracy_reward": 0.6331415772438049, "rewards/format_reward": 0.949999988079071, "step": 786, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 409.5500183105469, "epoch": 0.01495771167917894, "grad_norm": 2.9009179538967174, "kl": 0.1103515625, "learning_rate": 9.9944806218248e-07, "loss": 0.0044, "reward": 2.1116578578948975, "reward_std": 0.18856696784496307, "rewards/accuracy_reward": 0.8254079222679138, "rewards/format_reward": 1.0, "step": 787, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 396.8999938964844, "epoch": 0.01497671766606481, "grad_norm": 1.3688749861320781, "kl": 0.0869140625, "learning_rate": 9.994466589128832e-07, "loss": 0.0035, "reward": 1.6337502002716064, "reward_std": 0.27336740493774414, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 788, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 412.8500061035156, "epoch": 0.014995723652950679, "grad_norm": 2.2438678795410527, "kl": 0.0966796875, "learning_rate": 9.994452538626726e-07, "loss": 0.0039, "reward": 1.8737499713897705, "reward_std": 0.2121085226535797, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 789, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 473.07501220703125, "epoch": 0.01501472963983655, "grad_norm": 1.749537418576682, "kl": 0.130859375, "learning_rate": 9.994438470318533e-07, "loss": 0.0052, "reward": 2.200000047683716, "reward_std": 0.22416846454143524, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 790, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 405.75, "epoch": 0.015033735626722418, "grad_norm": 1.4448913315052803, "kl": 0.09619140625, "learning_rate": 9.994424384204304e-07, "loss": 0.0038, "reward": 1.5371110439300537, "reward_std": 0.20227237045764923, "rewards/accuracy_reward": 0.4433610439300537, "rewards/format_reward": 1.0, "step": 791, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 497.1000061035156, "epoch": 0.015052741613608287, "grad_norm": 1.4979165703036394, "kl": 0.1171875, "learning_rate": 9.99441028028409e-07, "loss": 0.0047, "reward": 1.630000114440918, "reward_std": 0.286091148853302, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.925000011920929, "step": 792, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 461.4750061035156, "epoch": 0.015071747600494156, "grad_norm": 1.6179945119043353, "kl": 0.1318359375, "learning_rate": 9.994396158557937e-07, "loss": 0.0053, "reward": 1.90500009059906, "reward_std": 0.02449488453567028, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 793, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 391.1000061035156, "epoch": 0.015090753587380025, "grad_norm": 4.376878225461876, "kl": 0.06689453125, "learning_rate": 9.9943820190259e-07, "loss": 0.0027, "reward": 1.518892765045166, "reward_std": 0.16035513579845428, "rewards/accuracy_reward": 0.4276427924633026, "rewards/format_reward": 1.0, "step": 794, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 447.20001220703125, "epoch": 0.015109759574265894, "grad_norm": 1.6290473732172044, "kl": 0.12060546875, "learning_rate": 9.994367861688026e-07, "loss": 0.0048, "reward": 1.7299998998641968, "reward_std": 0.18592557311058044, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 795, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 450.3000183105469, "epoch": 0.015128765561151763, "grad_norm": 1.6960873174202618, "kl": 0.107421875, "learning_rate": 9.994353686544368e-07, "loss": 0.0043, "reward": 1.9645652770996094, "reward_std": 0.14851684868335724, "rewards/accuracy_reward": 0.7620654106140137, "rewards/format_reward": 1.0, "step": 796, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 473.2250061035156, "epoch": 0.015147771548037632, "grad_norm": 1.5568052726387847, "kl": 0.125, "learning_rate": 9.994339493594976e-07, "loss": 0.005, "reward": 2.1659560203552246, "reward_std": 0.04685882478952408, "rewards/accuracy_reward": 0.8847060203552246, "rewards/format_reward": 1.0, "step": 797, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 375.95001220703125, "epoch": 0.0151667775349235, "grad_norm": 2.441515227496874, "kl": 0.0859375, "learning_rate": 9.9943252828399e-07, "loss": 0.0034, "reward": 1.4190090894699097, "reward_std": 0.051840174943208694, "rewards/accuracy_reward": 0.3252590298652649, "rewards/format_reward": 1.0, "step": 798, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 427.25, "epoch": 0.01518578352180937, "grad_norm": 1.68778991778817, "kl": 0.11181640625, "learning_rate": 9.994311054279191e-07, "loss": 0.0045, "reward": 1.5412499904632568, "reward_std": 0.333090215921402, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 1.0, "step": 799, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 458.57501220703125, "epoch": 0.015204789508695239, "grad_norm": 1.748309733250112, "kl": 0.07373046875, "learning_rate": 9.9942968079129e-07, "loss": 0.003, "reward": 1.7813431024551392, "reward_std": 0.3278692662715912, "rewards/accuracy_reward": 0.6700930595397949, "rewards/format_reward": 0.925000011920929, "step": 800, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 470.6000061035156, "epoch": 0.015223795495581107, "grad_norm": 1.462241991884523, "kl": 0.11962890625, "learning_rate": 9.994282543741077e-07, "loss": 0.0048, "reward": 1.90625, "reward_std": 0.2457003891468048, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 801, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 478.125, "epoch": 0.015242801482466976, "grad_norm": 2.082936475391673, "kl": 0.10986328125, "learning_rate": 9.994268261763773e-07, "loss": 0.0044, "reward": 1.5495386123657227, "reward_std": 0.21620170772075653, "rewards/accuracy_reward": 0.4370386302471161, "rewards/format_reward": 1.0, "step": 802, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 437.7749938964844, "epoch": 0.015261807469352845, "grad_norm": 6.520626729244516, "kl": 0.07568359375, "learning_rate": 9.994253961981038e-07, "loss": 0.003, "reward": 1.5650115013122559, "reward_std": 0.06255774945020676, "rewards/accuracy_reward": 0.43251147866249084, "rewards/format_reward": 1.0, "step": 803, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 450.3999938964844, "epoch": 0.015280813456238716, "grad_norm": 3.0711017255804527, "kl": 0.1005859375, "learning_rate": 9.994239644392925e-07, "loss": 0.004, "reward": 1.5605331659317017, "reward_std": 0.05940740182995796, "rewards/accuracy_reward": 0.42428311705589294, "rewards/format_reward": 1.0, "step": 804, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 468.1499938964844, "epoch": 0.015299819443124585, "grad_norm": 1.3510199743183433, "kl": 0.08056640625, "learning_rate": 9.994225308999485e-07, "loss": 0.0032, "reward": 1.7200106382369995, "reward_std": 0.29686713218688965, "rewards/accuracy_reward": 0.7100105881690979, "rewards/format_reward": 1.0, "step": 805, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 476.8999938964844, "epoch": 0.015318825430010454, "grad_norm": 1.386754069916214, "kl": 0.12451171875, "learning_rate": 9.994210955800768e-07, "loss": 0.005, "reward": 1.6737499237060547, "reward_std": 0.04467545449733734, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 806, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 455.5249938964844, "epoch": 0.015337831416896323, "grad_norm": 1.4872206246572202, "kl": 0.1103515625, "learning_rate": 9.994196584796827e-07, "loss": 0.0044, "reward": 2.0174999237060547, "reward_std": 0.3088816702365875, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 807, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 457.8000183105469, "epoch": 0.015356837403782192, "grad_norm": 2.7290521499829854, "kl": 0.1171875, "learning_rate": 9.994182195987708e-07, "loss": 0.0047, "reward": 2.0900001525878906, "reward_std": 0.0994642972946167, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 808, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 420.32501220703125, "epoch": 0.01537584339066806, "grad_norm": 1.5777825182643292, "kl": 0.07275390625, "learning_rate": 9.99416778937347e-07, "loss": 0.0029, "reward": 1.8416575193405151, "reward_std": 0.14976145327091217, "rewards/accuracy_reward": 0.6754074096679688, "rewards/format_reward": 1.0, "step": 809, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 440.3000183105469, "epoch": 0.01539484937755393, "grad_norm": 2.2896139769601613, "kl": 0.0751953125, "learning_rate": 9.994153364954157e-07, "loss": 0.003, "reward": 1.5402681827545166, "reward_std": 0.05909978225827217, "rewards/accuracy_reward": 0.5115181803703308, "rewards/format_reward": 1.0, "step": 810, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 404.4750061035156, "epoch": 0.015413855364439798, "grad_norm": 3.144694858752442, "kl": 0.1171875, "learning_rate": 9.994138922729825e-07, "loss": 0.0047, "reward": 1.9600000381469727, "reward_std": 0.03009721077978611, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 811, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 423.5, "epoch": 0.015432861351325667, "grad_norm": 2.234117683871632, "kl": 0.08154296875, "learning_rate": 9.994124462700525e-07, "loss": 0.0033, "reward": 1.8830076456069946, "reward_std": 0.13665837049484253, "rewards/accuracy_reward": 0.6817578077316284, "rewards/format_reward": 1.0, "step": 812, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 448.6750183105469, "epoch": 0.015451867338211536, "grad_norm": 2.371888037720843, "kl": 0.10791015625, "learning_rate": 9.994109984866308e-07, "loss": 0.0043, "reward": 1.7918812036514282, "reward_std": 0.32109126448631287, "rewards/accuracy_reward": 0.6243812441825867, "rewards/format_reward": 1.0, "step": 813, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 446.1499938964844, "epoch": 0.015470873325097405, "grad_norm": 1.3257033955433293, "kl": 0.1259765625, "learning_rate": 9.994095489227224e-07, "loss": 0.005, "reward": 1.6075000762939453, "reward_std": 0.26312994956970215, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 814, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 411.57501220703125, "epoch": 0.015489879311983274, "grad_norm": 2.288194544489373, "kl": 0.10498046875, "learning_rate": 9.994080975783329e-07, "loss": 0.0042, "reward": 1.839240312576294, "reward_std": 0.29091987013816833, "rewards/accuracy_reward": 0.6767401695251465, "rewards/format_reward": 1.0, "step": 815, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 451.32501220703125, "epoch": 0.015508885298869143, "grad_norm": 2.3909629602223723, "kl": 0.08154296875, "learning_rate": 9.994066444534668e-07, "loss": 0.0033, "reward": 1.6088298559188843, "reward_std": 0.1844092607498169, "rewards/accuracy_reward": 0.458829790353775, "rewards/format_reward": 1.0, "step": 816, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 505.6499938964844, "epoch": 0.015527891285755014, "grad_norm": 1.3999538222360683, "kl": 0.13671875, "learning_rate": 9.994051895481298e-07, "loss": 0.0055, "reward": 1.9837497472763062, "reward_std": 0.350053071975708, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 0.925000011920929, "step": 817, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 465.6499938964844, "epoch": 0.015546897272640883, "grad_norm": 1.877393189163533, "kl": 0.1044921875, "learning_rate": 9.99403732862327e-07, "loss": 0.0042, "reward": 1.436156153678894, "reward_std": 0.15536737442016602, "rewards/accuracy_reward": 0.3149060010910034, "rewards/format_reward": 1.0, "step": 818, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 430.1750183105469, "epoch": 0.015565903259526752, "grad_norm": 2.896829914699599, "kl": 0.09814453125, "learning_rate": 9.994022743960638e-07, "loss": 0.0039, "reward": 1.9121776819229126, "reward_std": 0.18449024856090546, "rewards/accuracy_reward": 0.8334276080131531, "rewards/format_reward": 1.0, "step": 819, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 446.9250183105469, "epoch": 0.01558490924641262, "grad_norm": 1.6359786073151226, "kl": 0.1015625, "learning_rate": 9.994008141493447e-07, "loss": 0.0041, "reward": 1.7087547779083252, "reward_std": 0.19845259189605713, "rewards/accuracy_reward": 0.6412548422813416, "rewards/format_reward": 1.0, "step": 820, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 454.0249938964844, "epoch": 0.01560391523329849, "grad_norm": 2.0659525121154276, "kl": 0.0859375, "learning_rate": 9.993993521221757e-07, "loss": 0.0034, "reward": 1.72882080078125, "reward_std": 0.03259288892149925, "rewards/accuracy_reward": 0.6288207173347473, "rewards/format_reward": 1.0, "step": 821, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 381.45001220703125, "epoch": 0.015622921220184358, "grad_norm": 2.3887967683025733, "kl": 0.1123046875, "learning_rate": 9.993978883145614e-07, "loss": 0.0045, "reward": 1.8892822265625, "reward_std": 0.06339588016271591, "rewards/accuracy_reward": 0.7130321860313416, "rewards/format_reward": 1.0, "step": 822, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 423.45001220703125, "epoch": 0.015641927207070225, "grad_norm": 2.9798839148968446, "kl": 0.07421875, "learning_rate": 9.993964227265074e-07, "loss": 0.003, "reward": 1.58841073513031, "reward_std": 0.1449354737997055, "rewards/accuracy_reward": 0.500910758972168, "rewards/format_reward": 1.0, "step": 823, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 457.1000061035156, "epoch": 0.015660933193956096, "grad_norm": 1.5512224124952236, "kl": 0.1416015625, "learning_rate": 9.99394955358019e-07, "loss": 0.0056, "reward": 1.8575000762939453, "reward_std": 0.1376771777868271, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 824, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 438.125, "epoch": 0.015679939180841967, "grad_norm": 1.5639207551772911, "kl": 0.09130859375, "learning_rate": 9.993934862091009e-07, "loss": 0.0036, "reward": 1.70260751247406, "reward_std": 0.232549786567688, "rewards/accuracy_reward": 0.6038575768470764, "rewards/format_reward": 1.0, "step": 825, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.3999938964844, "epoch": 0.015698945167727834, "grad_norm": 2.3209589552496106, "kl": 0.10205078125, "learning_rate": 9.99392015279759e-07, "loss": 0.0041, "reward": 1.8627609014511108, "reward_std": 0.1048198714852333, "rewards/accuracy_reward": 0.7027609348297119, "rewards/format_reward": 1.0, "step": 826, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 470.7250061035156, "epoch": 0.015717951154613705, "grad_norm": 2.3085009505564713, "kl": 0.1494140625, "learning_rate": 9.99390542569998e-07, "loss": 0.006, "reward": 2.1050000190734863, "reward_std": 0.10080789774656296, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 827, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 451.5249938964844, "epoch": 0.015736957141499572, "grad_norm": 1.7331765865917723, "kl": 0.10791015625, "learning_rate": 9.993890680798234e-07, "loss": 0.0043, "reward": 1.4598677158355713, "reward_std": 0.23452387750148773, "rewards/accuracy_reward": 0.37736770510673523, "rewards/format_reward": 1.0, "step": 828, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 446.70001220703125, "epoch": 0.015755963128385442, "grad_norm": 2.0426688994806304, "kl": 0.078125, "learning_rate": 9.993875918092406e-07, "loss": 0.0031, "reward": 1.8170711994171143, "reward_std": 0.11288689821958542, "rewards/accuracy_reward": 0.7020711302757263, "rewards/format_reward": 1.0, "step": 829, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 451.95001220703125, "epoch": 0.01577496911527131, "grad_norm": 2.0469231909203396, "kl": 0.12255859375, "learning_rate": 9.993861137582544e-07, "loss": 0.0049, "reward": 2.0579166412353516, "reward_std": 0.18174327909946442, "rewards/accuracy_reward": 0.8716667294502258, "rewards/format_reward": 1.0, "step": 830, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 455.625, "epoch": 0.01579397510215718, "grad_norm": 1.9212226372226346, "kl": 0.09765625, "learning_rate": 9.993846339268706e-07, "loss": 0.0039, "reward": 1.7817646265029907, "reward_std": 0.22643624246120453, "rewards/accuracy_reward": 0.6630145907402039, "rewards/format_reward": 1.0, "step": 831, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 485.0500183105469, "epoch": 0.015812981089043048, "grad_norm": 1.7117025823471566, "kl": 0.125, "learning_rate": 9.993831523150941e-07, "loss": 0.005, "reward": 2.002500057220459, "reward_std": 0.3246128559112549, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 832, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 478.6750183105469, "epoch": 0.015831987075928918, "grad_norm": 2.0869795193340797, "kl": 0.107421875, "learning_rate": 9.993816689229306e-07, "loss": 0.0043, "reward": 2.0425000190734863, "reward_std": 0.29715046286582947, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 0.9750000238418579, "step": 833, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 474.375, "epoch": 0.015850993062814785, "grad_norm": 1.4969927994579393, "kl": 0.12109375, "learning_rate": 9.993801837503847e-07, "loss": 0.0048, "reward": 2.1942081451416016, "reward_std": 0.06370045244693756, "rewards/accuracy_reward": 0.9979583621025085, "rewards/format_reward": 1.0, "step": 834, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 439.9250183105469, "epoch": 0.015869999049700656, "grad_norm": 1.6378010011346011, "kl": 0.076171875, "learning_rate": 9.993786967974624e-07, "loss": 0.003, "reward": 1.6663583517074585, "reward_std": 0.2924807071685791, "rewards/accuracy_reward": 0.621358335018158, "rewards/format_reward": 0.925000011920929, "step": 835, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 419.5, "epoch": 0.015889005036586523, "grad_norm": 1.9354578913615843, "kl": 0.08154296875, "learning_rate": 9.993772080641687e-07, "loss": 0.0033, "reward": 1.8075135946273804, "reward_std": 0.27105510234832764, "rewards/accuracy_reward": 0.6525136232376099, "rewards/format_reward": 1.0, "step": 836, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 428.82501220703125, "epoch": 0.015908011023472394, "grad_norm": 1.7627924290890316, "kl": 0.11279296875, "learning_rate": 9.99375717550509e-07, "loss": 0.0045, "reward": 1.7273215055465698, "reward_std": 0.23579041659832, "rewards/accuracy_reward": 0.543571412563324, "rewards/format_reward": 1.0, "step": 837, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 453.5249938964844, "epoch": 0.015927017010358265, "grad_norm": 1.274599125249701, "kl": 0.12890625, "learning_rate": 9.993742252564883e-07, "loss": 0.0052, "reward": 1.4049999713897705, "reward_std": 0.3525925576686859, "rewards/accuracy_reward": 0.3499999940395355, "rewards/format_reward": 1.0, "step": 838, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 427.375, "epoch": 0.01594602299724413, "grad_norm": 1.8369478545790794, "kl": 0.09228515625, "learning_rate": 9.99372731182112e-07, "loss": 0.0037, "reward": 1.7975000143051147, "reward_std": 0.21443481743335724, "rewards/accuracy_reward": 0.7350000739097595, "rewards/format_reward": 0.9750000238418579, "step": 839, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 422.1000061035156, "epoch": 0.015965028984130002, "grad_norm": 1.8753793570007402, "kl": 0.1357421875, "learning_rate": 9.99371235327386e-07, "loss": 0.0054, "reward": 2.102954626083374, "reward_std": 0.09214019030332565, "rewards/accuracy_reward": 0.9279546141624451, "rewards/format_reward": 1.0, "step": 840, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 472.5, "epoch": 0.01598403497101587, "grad_norm": 1.4138033024464294, "kl": 0.11669921875, "learning_rate": 9.99369737692315e-07, "loss": 0.0047, "reward": 1.8562500476837158, "reward_std": 0.21997599303722382, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 841, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 407.70001220703125, "epoch": 0.01600304095790174, "grad_norm": 1.555435218787223, "kl": 0.0986328125, "learning_rate": 9.993682382769045e-07, "loss": 0.0039, "reward": 2.0625, "reward_std": 0.17160551249980927, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 842, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 418.25, "epoch": 0.016022046944787607, "grad_norm": 1.6548112802951471, "kl": 0.130859375, "learning_rate": 9.993667370811598e-07, "loss": 0.0052, "reward": 1.818750023841858, "reward_std": 0.15131191909313202, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 843, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.6000061035156, "epoch": 0.016041052931673478, "grad_norm": 1.7388815142539007, "kl": 0.103515625, "learning_rate": 9.993652341050865e-07, "loss": 0.0041, "reward": 1.7882276773452759, "reward_std": 0.19487141072750092, "rewards/accuracy_reward": 0.6319777369499207, "rewards/format_reward": 1.0, "step": 844, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 441.3999938964844, "epoch": 0.016060058918559345, "grad_norm": 2.3646734653737314, "kl": 0.10205078125, "learning_rate": 9.993637293486897e-07, "loss": 0.0041, "reward": 1.7157411575317383, "reward_std": 0.2658672332763672, "rewards/accuracy_reward": 0.6307411789894104, "rewards/format_reward": 1.0, "step": 845, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 418.2749938964844, "epoch": 0.016079064905445216, "grad_norm": 2.2928230050104363, "kl": 0.09765625, "learning_rate": 9.993622228119752e-07, "loss": 0.0039, "reward": 2.1500000953674316, "reward_std": 0.09946425259113312, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 846, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 449.1000061035156, "epoch": 0.016098070892331083, "grad_norm": 4.330309305614554, "kl": 0.12060546875, "learning_rate": 9.993607144949477e-07, "loss": 0.0048, "reward": 1.993749976158142, "reward_std": 0.2732623517513275, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 847, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 398.2250061035156, "epoch": 0.016117076879216954, "grad_norm": 1.9516245579933782, "kl": 0.142578125, "learning_rate": 9.99359204397613e-07, "loss": 0.0057, "reward": 1.9774999618530273, "reward_std": 0.25878724455833435, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 848, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 446.9750061035156, "epoch": 0.01613608286610282, "grad_norm": 2.0447718957624392, "kl": 0.12890625, "learning_rate": 9.993576925199765e-07, "loss": 0.0052, "reward": 2.2055232524871826, "reward_std": 0.039260316640138626, "rewards/accuracy_reward": 0.9142733812332153, "rewards/format_reward": 1.0, "step": 849, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 387.8000183105469, "epoch": 0.01615508885298869, "grad_norm": 2.2145563974573106, "kl": 0.08251953125, "learning_rate": 9.993561788620434e-07, "loss": 0.0033, "reward": 1.533031702041626, "reward_std": 0.024825790897011757, "rewards/accuracy_reward": 0.4392816126346588, "rewards/format_reward": 1.0, "step": 850, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 467.1499938964844, "epoch": 0.01617409483987456, "grad_norm": 2.981423734250105, "kl": 0.11083984375, "learning_rate": 9.993546634238192e-07, "loss": 0.0044, "reward": 2.0568604469299316, "reward_std": 0.04423154518008232, "rewards/accuracy_reward": 0.8406102061271667, "rewards/format_reward": 1.0, "step": 851, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 424.8999938964844, "epoch": 0.01619310082676043, "grad_norm": 1.4528924750481853, "kl": 0.09765625, "learning_rate": 9.993531462053093e-07, "loss": 0.0039, "reward": 1.5247740745544434, "reward_std": 0.22725442051887512, "rewards/accuracy_reward": 0.43977412581443787, "rewards/format_reward": 1.0, "step": 852, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 478.1750183105469, "epoch": 0.0162121068136463, "grad_norm": 2.236282492619206, "kl": 0.1552734375, "learning_rate": 9.99351627206519e-07, "loss": 0.0062, "reward": 2.28125, "reward_std": 0.047425609081983566, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 853, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 444.8999938964844, "epoch": 0.016231112800532167, "grad_norm": 1.6565299598191192, "kl": 0.115234375, "learning_rate": 9.993501064274539e-07, "loss": 0.0046, "reward": 1.7637265920639038, "reward_std": 0.1827048510313034, "rewards/accuracy_reward": 0.6649765968322754, "rewards/format_reward": 1.0, "step": 854, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 480.95001220703125, "epoch": 0.016250118787418038, "grad_norm": 1.5293667400472255, "kl": 0.1728515625, "learning_rate": 9.993485838681193e-07, "loss": 0.0069, "reward": 2.0875000953674316, "reward_std": 0.1455240547657013, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 855, "temporal_rewards": 0.699999988079071 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 471.8000183105469, "epoch": 0.016269124774303905, "grad_norm": 1.8992269382996032, "kl": 0.1435546875, "learning_rate": 9.993470595285206e-07, "loss": 0.0057, "reward": 2.286249876022339, "reward_std": 0.04182329773902893, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 856, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 461.8000183105469, "epoch": 0.016288130761189776, "grad_norm": 2.212047363842615, "kl": 0.1376953125, "learning_rate": 9.993455334086635e-07, "loss": 0.0055, "reward": 2.2024998664855957, "reward_std": 0.12062934786081314, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 857, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 455.5, "epoch": 0.016307136748075643, "grad_norm": 4.892652987176434, "kl": 0.0693359375, "learning_rate": 9.99344005508553e-07, "loss": 0.0028, "reward": 1.5883082151412964, "reward_std": 0.10912847518920898, "rewards/accuracy_reward": 0.5095581412315369, "rewards/format_reward": 1.0, "step": 858, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 509.0249938964844, "epoch": 0.016326142734961514, "grad_norm": 1.6402165764541399, "kl": 0.08447265625, "learning_rate": 9.99342475828195e-07, "loss": 0.0034, "reward": 1.508105754852295, "reward_std": 0.37558695673942566, "rewards/accuracy_reward": 0.5031058192253113, "rewards/format_reward": 0.9000000357627869, "step": 859, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 486.20001220703125, "epoch": 0.01634514872184738, "grad_norm": 1.6755708474898512, "kl": 0.1279296875, "learning_rate": 9.993409443675947e-07, "loss": 0.0051, "reward": 2.018625020980835, "reward_std": 0.0413166843354702, "rewards/accuracy_reward": 0.8523751497268677, "rewards/format_reward": 1.0, "step": 860, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 473.4250183105469, "epoch": 0.01636415470873325, "grad_norm": 1.5552095725377828, "kl": 0.11181640625, "learning_rate": 9.993394111267577e-07, "loss": 0.0045, "reward": 1.7212499380111694, "reward_std": 0.3069811463356018, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9750000238418579, "step": 861, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 473.2250061035156, "epoch": 0.01638316069561912, "grad_norm": 1.7240589687162269, "kl": 0.1533203125, "learning_rate": 9.993378761056892e-07, "loss": 0.0061, "reward": 2.192929983139038, "reward_std": 0.07727370411157608, "rewards/accuracy_reward": 0.9216799139976501, "rewards/format_reward": 1.0, "step": 862, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 487.07501220703125, "epoch": 0.01640216668250499, "grad_norm": 1.7580497691378385, "kl": 0.056640625, "learning_rate": 9.99336339304395e-07, "loss": 0.0023, "reward": 1.6729110479354858, "reward_std": 0.20493152737617493, "rewards/accuracy_reward": 0.7079111933708191, "rewards/format_reward": 0.9750000238418579, "step": 863, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 459.0249938964844, "epoch": 0.016421172669390856, "grad_norm": 1.9702491120937722, "kl": 0.125, "learning_rate": 9.9933480072288e-07, "loss": 0.005, "reward": 2.115000009536743, "reward_std": 0.22647862136363983, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 864, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 458.1750183105469, "epoch": 0.016440178656276727, "grad_norm": 1.474895194088238, "kl": 0.044677734375, "learning_rate": 9.993332603611506e-07, "loss": 0.0018, "reward": 1.5189027786254883, "reward_std": 0.10458987206220627, "rewards/accuracy_reward": 0.5051528811454773, "rewards/format_reward": 1.0, "step": 865, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 471.6499938964844, "epoch": 0.016459184643162598, "grad_norm": 1.813972251346778, "kl": 0.15234375, "learning_rate": 9.993317182192117e-07, "loss": 0.0061, "reward": 1.7414944171905518, "reward_std": 0.3115447163581848, "rewards/accuracy_reward": 0.6439945101737976, "rewards/format_reward": 0.949999988079071, "step": 866, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 423.8500061035156, "epoch": 0.016478190630048465, "grad_norm": 1.7205660396982019, "kl": 0.1171875, "learning_rate": 9.993301742970685e-07, "loss": 0.0047, "reward": 1.8674999475479126, "reward_std": 0.23858864605426788, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 867, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 417.0, "epoch": 0.016497196616934336, "grad_norm": 2.31565667012534, "kl": 0.1015625, "learning_rate": 9.993286285947272e-07, "loss": 0.0041, "reward": 1.7135841846466064, "reward_std": 0.1332218199968338, "rewards/accuracy_reward": 0.5673341155052185, "rewards/format_reward": 1.0, "step": 868, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 432.5, "epoch": 0.016516202603820203, "grad_norm": 1.8884284497793173, "kl": 0.11181640625, "learning_rate": 9.99327081112193e-07, "loss": 0.0045, "reward": 1.8396732807159424, "reward_std": 0.11203023046255112, "rewards/accuracy_reward": 0.754673421382904, "rewards/format_reward": 1.0, "step": 869, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 489.9750061035156, "epoch": 0.016535208590706073, "grad_norm": 2.104984276769367, "kl": 0.1259765625, "learning_rate": 9.993255318494716e-07, "loss": 0.005, "reward": 2.012500047683716, "reward_std": 0.282286137342453, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.9750000238418579, "step": 870, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 477.95001220703125, "epoch": 0.01655421457759194, "grad_norm": 1.9211525955631754, "kl": 0.10791015625, "learning_rate": 9.993239808065681e-07, "loss": 0.0043, "reward": 1.875353217124939, "reward_std": 0.04591451212763786, "rewards/accuracy_reward": 0.7191033363342285, "rewards/format_reward": 1.0, "step": 871, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 410.82501220703125, "epoch": 0.01657322056447781, "grad_norm": 1.8909875407142658, "kl": 0.1142578125, "learning_rate": 9.993224279834883e-07, "loss": 0.0046, "reward": 1.8530091047286987, "reward_std": 0.16574768722057343, "rewards/accuracy_reward": 0.7055088877677917, "rewards/format_reward": 1.0, "step": 872, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 480.3999938964844, "epoch": 0.01659222655136368, "grad_norm": 1.2802115653241215, "kl": 0.126953125, "learning_rate": 9.993208733802379e-07, "loss": 0.0051, "reward": 1.575060486793518, "reward_std": 0.13075125217437744, "rewards/accuracy_reward": 0.4925605356693268, "rewards/format_reward": 1.0, "step": 873, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 496.7250061035156, "epoch": 0.01661123253824955, "grad_norm": 1.5261699288032977, "kl": 0.162109375, "learning_rate": 9.99319316996822e-07, "loss": 0.0065, "reward": 2.216249942779541, "reward_std": 0.05015389993786812, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 874, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 375.8000183105469, "epoch": 0.016630238525135416, "grad_norm": 3.478005387881262, "kl": 0.1025390625, "learning_rate": 9.993177588332466e-07, "loss": 0.0041, "reward": 1.8293765783309937, "reward_std": 0.15665097534656525, "rewards/accuracy_reward": 0.6006266474723816, "rewards/format_reward": 1.0, "step": 875, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 444.82501220703125, "epoch": 0.016649244512021287, "grad_norm": 1.723714483408541, "kl": 0.1005859375, "learning_rate": 9.99316198889517e-07, "loss": 0.004, "reward": 1.7071765661239624, "reward_std": 0.055387865751981735, "rewards/accuracy_reward": 0.6221765279769897, "rewards/format_reward": 1.0, "step": 876, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 426.6750183105469, "epoch": 0.016668250498907154, "grad_norm": 1.319722363716567, "kl": 0.0693359375, "learning_rate": 9.993146371656387e-07, "loss": 0.0028, "reward": 1.5111616849899292, "reward_std": 0.1265857219696045, "rewards/accuracy_reward": 0.5024116039276123, "rewards/format_reward": 1.0, "step": 877, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 423.8999938964844, "epoch": 0.016687256485793025, "grad_norm": 6.5689295087820065, "kl": 0.1201171875, "learning_rate": 9.993130736616176e-07, "loss": 0.0048, "reward": 1.5787500143051147, "reward_std": 0.1456565111875534, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 878, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 442.5249938964844, "epoch": 0.016706262472678896, "grad_norm": 1.83588452778704, "kl": 0.12158203125, "learning_rate": 9.993115083774588e-07, "loss": 0.0049, "reward": 2.0285227298736572, "reward_std": 0.054203104227781296, "rewards/accuracy_reward": 0.8085227012634277, "rewards/format_reward": 1.0, "step": 879, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 446.0249938964844, "epoch": 0.016725268459564763, "grad_norm": 2.2596994671096984, "kl": 0.099609375, "learning_rate": 9.993099413131685e-07, "loss": 0.004, "reward": 2.103471040725708, "reward_std": 0.07465606927871704, "rewards/accuracy_reward": 0.9172208905220032, "rewards/format_reward": 1.0, "step": 880, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 417.5, "epoch": 0.016744274446450633, "grad_norm": 1.744277750918761, "kl": 0.0888671875, "learning_rate": 9.993083724687516e-07, "loss": 0.0035, "reward": 1.7909082174301147, "reward_std": 0.05641747638583183, "rewards/accuracy_reward": 0.5809082388877869, "rewards/format_reward": 1.0, "step": 881, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 448.2250061035156, "epoch": 0.0167632804333365, "grad_norm": 2.153572328587144, "kl": 0.1123046875, "learning_rate": 9.993068018442141e-07, "loss": 0.0045, "reward": 1.734042763710022, "reward_std": 0.19810186326503754, "rewards/accuracy_reward": 0.6402926445007324, "rewards/format_reward": 1.0, "step": 882, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 466.0, "epoch": 0.01678228642022237, "grad_norm": 1.7170656576305139, "kl": 0.1123046875, "learning_rate": 9.993052294395617e-07, "loss": 0.0045, "reward": 2.1014654636383057, "reward_std": 0.25909802317619324, "rewards/accuracy_reward": 0.9189655184745789, "rewards/format_reward": 1.0, "step": 883, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 424.6750183105469, "epoch": 0.01680129240710824, "grad_norm": 2.373601273643548, "kl": 0.1142578125, "learning_rate": 9.993036552547997e-07, "loss": 0.0046, "reward": 2.0, "reward_std": 0.24250374734401703, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 884, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 432.625, "epoch": 0.01682029839399411, "grad_norm": 1.746439401988913, "kl": 0.09130859375, "learning_rate": 9.993020792899338e-07, "loss": 0.0037, "reward": 1.631177544593811, "reward_std": 0.04874923452734947, "rewards/accuracy_reward": 0.4924275875091553, "rewards/format_reward": 1.0, "step": 885, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 439.20001220703125, "epoch": 0.016839304380879976, "grad_norm": 1.7100199705566455, "kl": 0.14453125, "learning_rate": 9.9930050154497e-07, "loss": 0.0058, "reward": 1.9174998998641968, "reward_std": 0.34676268696784973, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 886, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 404.5249938964844, "epoch": 0.016858310367765847, "grad_norm": 3.0662870955776023, "kl": 0.09228515625, "learning_rate": 9.992989220199131e-07, "loss": 0.0037, "reward": 1.720924735069275, "reward_std": 0.30055004358291626, "rewards/accuracy_reward": 0.5334247350692749, "rewards/format_reward": 1.0, "step": 887, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 384.45001220703125, "epoch": 0.016877316354651714, "grad_norm": 4.733601015078978, "kl": 0.07763671875, "learning_rate": 9.992973407147694e-07, "loss": 0.0031, "reward": 1.5069843530654907, "reward_std": 0.2703723907470703, "rewards/accuracy_reward": 0.48573437333106995, "rewards/format_reward": 1.0, "step": 888, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 414.3000183105469, "epoch": 0.016896322341537585, "grad_norm": 1.9393133107030889, "kl": 0.1279296875, "learning_rate": 9.992957576295444e-07, "loss": 0.0051, "reward": 1.5737501382827759, "reward_std": 0.21649669110774994, "rewards/accuracy_reward": 0.5375000834465027, "rewards/format_reward": 1.0, "step": 889, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 440.5500183105469, "epoch": 0.016915328328423452, "grad_norm": 1.8157109279591936, "kl": 0.126953125, "learning_rate": 9.992941727642437e-07, "loss": 0.0051, "reward": 1.6087499856948853, "reward_std": 0.20222507417201996, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 890, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 433.6000061035156, "epoch": 0.016934334315309323, "grad_norm": 7.040820298091219, "kl": 0.07421875, "learning_rate": 9.99292586118873e-07, "loss": 0.003, "reward": 1.6710137128829956, "reward_std": 0.36479684710502625, "rewards/accuracy_reward": 0.5985136032104492, "rewards/format_reward": 1.0, "step": 891, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 411.5500183105469, "epoch": 0.01695334030219519, "grad_norm": 2.831915634340959, "kl": 0.08447265625, "learning_rate": 9.992909976934379e-07, "loss": 0.0034, "reward": 1.7837499380111694, "reward_std": 0.21281376481056213, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 892, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 461.7749938964844, "epoch": 0.01697234628908106, "grad_norm": 1.869869477482866, "kl": 0.12109375, "learning_rate": 9.992894074879438e-07, "loss": 0.0049, "reward": 1.991864800453186, "reward_std": 0.11864250898361206, "rewards/accuracy_reward": 0.7768649458885193, "rewards/format_reward": 1.0, "step": 893, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 437.8500061035156, "epoch": 0.01699135227596693, "grad_norm": 1.5980319565472654, "kl": 0.1025390625, "learning_rate": 9.99287815502397e-07, "loss": 0.0041, "reward": 2.0072453022003174, "reward_std": 0.3231770694255829, "rewards/accuracy_reward": 0.9172453880310059, "rewards/format_reward": 0.9750000238418579, "step": 894, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 422.875, "epoch": 0.0170103582628528, "grad_norm": 2.8889234715050813, "kl": 0.12109375, "learning_rate": 9.992862217368026e-07, "loss": 0.0048, "reward": 1.8533226251602173, "reward_std": 0.29257944226264954, "rewards/accuracy_reward": 0.6683226823806763, "rewards/format_reward": 1.0, "step": 895, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 421.70001220703125, "epoch": 0.01702936424973867, "grad_norm": 1.8171922834320466, "kl": 0.12890625, "learning_rate": 9.992846261911667e-07, "loss": 0.0052, "reward": 1.6515644788742065, "reward_std": 0.40723466873168945, "rewards/accuracy_reward": 0.5890643000602722, "rewards/format_reward": 0.949999988079071, "step": 896, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 433.7250061035156, "epoch": 0.017048370236624536, "grad_norm": 1.7504183513198104, "kl": 0.1376953125, "learning_rate": 9.992830288654946e-07, "loss": 0.0055, "reward": 2.0028178691864014, "reward_std": 0.13226179778575897, "rewards/accuracy_reward": 0.8440677523612976, "rewards/format_reward": 1.0, "step": 897, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 478.875, "epoch": 0.017067376223510407, "grad_norm": 1.6776428420008855, "kl": 0.107421875, "learning_rate": 9.992814297597921e-07, "loss": 0.0043, "reward": 1.8325001001358032, "reward_std": 0.1587127298116684, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 898, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 405.4750061035156, "epoch": 0.017086382210396274, "grad_norm": 3.391580591063593, "kl": 0.140625, "learning_rate": 9.992798288740652e-07, "loss": 0.0056, "reward": 1.9079521894454956, "reward_std": 0.03444410488009453, "rewards/accuracy_reward": 0.7429521679878235, "rewards/format_reward": 1.0, "step": 899, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 417.6499938964844, "epoch": 0.017105388197282145, "grad_norm": 1.92102547604499, "kl": 0.1630859375, "learning_rate": 9.992782262083192e-07, "loss": 0.0065, "reward": 1.8000000715255737, "reward_std": 0.1320357769727707, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 900, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 416.95001220703125, "epoch": 0.017124394184168012, "grad_norm": 1.5300812122541545, "kl": 0.138671875, "learning_rate": 9.992766217625602e-07, "loss": 0.0055, "reward": 1.4424999952316284, "reward_std": 0.08856045454740524, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 1.0, "step": 901, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 457.20001220703125, "epoch": 0.017143400171053882, "grad_norm": 1.5973634459051322, "kl": 0.07470703125, "learning_rate": 9.992750155367936e-07, "loss": 0.003, "reward": 1.6483417749404907, "reward_std": 0.15044425427913666, "rewards/accuracy_reward": 0.613341748714447, "rewards/format_reward": 1.0, "step": 902, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 381.4750061035156, "epoch": 0.01716240615793975, "grad_norm": 2.8127747506593352, "kl": 0.10107421875, "learning_rate": 9.992734075310252e-07, "loss": 0.004, "reward": 1.9759514331817627, "reward_std": 0.15555985271930695, "rewards/accuracy_reward": 0.8222013711929321, "rewards/format_reward": 1.0, "step": 903, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 379.875, "epoch": 0.01718141214482562, "grad_norm": 1.9238245977549657, "kl": 0.1015625, "learning_rate": 9.992717977452609e-07, "loss": 0.0041, "reward": 1.7420654296875, "reward_std": 0.11957808583974838, "rewards/accuracy_reward": 0.5795655250549316, "rewards/format_reward": 1.0, "step": 904, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 437.0249938964844, "epoch": 0.017200418131711488, "grad_norm": 1.9246966725259174, "kl": 0.1337890625, "learning_rate": 9.992701861795064e-07, "loss": 0.0053, "reward": 1.7537500858306885, "reward_std": 0.018371179699897766, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 905, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 395.2749938964844, "epoch": 0.017219424118597358, "grad_norm": 3.2173579808888024, "kl": 0.09716796875, "learning_rate": 9.992685728337672e-07, "loss": 0.0039, "reward": 1.8408873081207275, "reward_std": 0.17486131191253662, "rewards/accuracy_reward": 0.6183871626853943, "rewards/format_reward": 1.0, "step": 906, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 375.1000061035156, "epoch": 0.01723843010548323, "grad_norm": 1.9334271577006181, "kl": 0.11083984375, "learning_rate": 9.992669577080492e-07, "loss": 0.0044, "reward": 2.0159237384796143, "reward_std": 0.0631365180015564, "rewards/accuracy_reward": 0.8484236001968384, "rewards/format_reward": 1.0, "step": 907, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 423.6750183105469, "epoch": 0.017257436092369096, "grad_norm": 1.7833943522477942, "kl": 0.09521484375, "learning_rate": 9.99265340802358e-07, "loss": 0.0038, "reward": 1.6475000381469727, "reward_std": 0.16724412143230438, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 1.0, "step": 908, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 431.4250183105469, "epoch": 0.017276442079254967, "grad_norm": 2.340604095428615, "kl": 0.14453125, "learning_rate": 9.992637221167e-07, "loss": 0.0058, "reward": 2.0562500953674316, "reward_std": 0.046975038945674896, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 909, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 489.3999938964844, "epoch": 0.017295448066140834, "grad_norm": 2.0548023227236714, "kl": 0.13671875, "learning_rate": 9.992621016510803e-07, "loss": 0.0055, "reward": 1.8840579986572266, "reward_std": 0.061809565871953964, "rewards/accuracy_reward": 0.6840581297874451, "rewards/format_reward": 1.0, "step": 910, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 460.3999938964844, "epoch": 0.017314454053026705, "grad_norm": 2.2033052205495034, "kl": 0.1337890625, "learning_rate": 9.992604794055047e-07, "loss": 0.0054, "reward": 1.9399999380111694, "reward_std": 0.2200370579957962, "rewards/accuracy_reward": 0.8350000381469727, "rewards/format_reward": 1.0, "step": 911, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 477.5249938964844, "epoch": 0.01733346003991257, "grad_norm": 1.8187752496420266, "kl": 0.1376953125, "learning_rate": 9.992588553799794e-07, "loss": 0.0055, "reward": 2.045555591583252, "reward_std": 0.1822027564048767, "rewards/accuracy_reward": 0.9055555462837219, "rewards/format_reward": 1.0, "step": 912, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 490.0249938964844, "epoch": 0.017352466026798442, "grad_norm": 1.5538539437913994, "kl": 0.1416015625, "learning_rate": 9.9925722957451e-07, "loss": 0.0056, "reward": 1.6162500381469727, "reward_std": 0.42723989486694336, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 0.9750000238418579, "step": 913, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 457.125, "epoch": 0.01737147201368431, "grad_norm": 2.1621873194400587, "kl": 0.1201171875, "learning_rate": 9.99255601989102e-07, "loss": 0.0048, "reward": 1.8568352460861206, "reward_std": 0.1532951146364212, "rewards/accuracy_reward": 0.7430852055549622, "rewards/format_reward": 1.0, "step": 914, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 456.1750183105469, "epoch": 0.01739047800057018, "grad_norm": 1.6042727946718174, "kl": 0.1142578125, "learning_rate": 9.992539726237616e-07, "loss": 0.0046, "reward": 1.6074215173721313, "reward_std": 0.18194305896759033, "rewards/accuracy_reward": 0.6086716651916504, "rewards/format_reward": 1.0, "step": 915, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 428.8000183105469, "epoch": 0.017409483987456047, "grad_norm": 1.9681153378432128, "kl": 0.1279296875, "learning_rate": 9.992523414784945e-07, "loss": 0.0051, "reward": 1.6426811218261719, "reward_std": 0.20662398636341095, "rewards/accuracy_reward": 0.5626811385154724, "rewards/format_reward": 1.0, "step": 916, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 441.5249938964844, "epoch": 0.017428489974341918, "grad_norm": 1.6637915253080449, "kl": 0.091796875, "learning_rate": 9.992507085533065e-07, "loss": 0.0037, "reward": 1.8812862634658813, "reward_std": 0.05521545559167862, "rewards/accuracy_reward": 0.7700361609458923, "rewards/format_reward": 1.0, "step": 917, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 463.32501220703125, "epoch": 0.017447495961227785, "grad_norm": 2.2822791510295173, "kl": 0.107421875, "learning_rate": 9.992490738482035e-07, "loss": 0.0043, "reward": 1.5084000825881958, "reward_std": 0.31936535239219666, "rewards/accuracy_reward": 0.4284001290798187, "rewards/format_reward": 1.0, "step": 918, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 452.2749938964844, "epoch": 0.017466501948113656, "grad_norm": 2.0519751011605467, "kl": 0.146484375, "learning_rate": 9.992474373631911e-07, "loss": 0.0059, "reward": 1.8933333158493042, "reward_std": 0.030097205191850662, "rewards/accuracy_reward": 0.7333333492279053, "rewards/format_reward": 1.0, "step": 919, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 462.95001220703125, "epoch": 0.017485507934999527, "grad_norm": 2.518309606045907, "kl": 0.08935546875, "learning_rate": 9.992457990982752e-07, "loss": 0.0036, "reward": 2.0289437770843506, "reward_std": 0.20214155316352844, "rewards/accuracy_reward": 0.8114437460899353, "rewards/format_reward": 1.0, "step": 920, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 394.6750183105469, "epoch": 0.017504513921885394, "grad_norm": 3.374778912898044, "kl": 0.09814453125, "learning_rate": 9.992441590534619e-07, "loss": 0.0039, "reward": 1.776856780052185, "reward_std": 0.20960001647472382, "rewards/accuracy_reward": 0.6406068205833435, "rewards/format_reward": 1.0, "step": 921, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 425.70001220703125, "epoch": 0.017523519908771264, "grad_norm": 2.174622831364653, "kl": 0.134765625, "learning_rate": 9.992425172287568e-07, "loss": 0.0054, "reward": 1.5806818008422852, "reward_std": 0.2188062220811844, "rewards/accuracy_reward": 0.543181836605072, "rewards/format_reward": 1.0, "step": 922, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 420.1000061035156, "epoch": 0.01754252589565713, "grad_norm": 1.752949143156655, "kl": 0.1259765625, "learning_rate": 9.992408736241657e-07, "loss": 0.0051, "reward": 1.7737499475479126, "reward_std": 0.1288391351699829, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 923, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 464.8999938964844, "epoch": 0.017561531882543002, "grad_norm": 1.7384423819909192, "kl": 0.1279296875, "learning_rate": 9.992392282396945e-07, "loss": 0.0051, "reward": 1.5212500095367432, "reward_std": 0.23440158367156982, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 0.9750000238418579, "step": 924, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 432.1750183105469, "epoch": 0.01758053786942887, "grad_norm": 10.388002464218191, "kl": 0.126953125, "learning_rate": 9.992375810753495e-07, "loss": 0.0051, "reward": 1.6720832586288452, "reward_std": 0.026257777586579323, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "step": 925, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 450.95001220703125, "epoch": 0.01759954385631474, "grad_norm": 1.879091653090008, "kl": 0.134765625, "learning_rate": 9.99235932131136e-07, "loss": 0.0054, "reward": 2.0712502002716064, "reward_std": 0.13815949857234955, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 926, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 443.95001220703125, "epoch": 0.017618549843200607, "grad_norm": 1.8656385520671657, "kl": 0.1396484375, "learning_rate": 9.9923428140706e-07, "loss": 0.0056, "reward": 2.1524999141693115, "reward_std": 0.1306147426366806, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 927, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.017637555830086478, "grad_norm": 2.9581923607526366, "kl": 0.09619140625, "learning_rate": 9.992326289031276e-07, "loss": 0.0039, "reward": 1.9236233234405518, "reward_std": 0.23222461342811584, "rewards/accuracy_reward": 0.7536233067512512, "rewards/format_reward": 1.0, "step": 928, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 420.5249938964844, "epoch": 0.017656561816972345, "grad_norm": 2.056712629653743, "kl": 0.140625, "learning_rate": 9.992309746193444e-07, "loss": 0.0056, "reward": 1.837916612625122, "reward_std": 0.17498302459716797, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 1.0, "step": 929, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 487.75, "epoch": 0.017675567803858216, "grad_norm": 4.783569391328312, "kl": 0.146484375, "learning_rate": 9.992293185557167e-07, "loss": 0.0059, "reward": 1.823996901512146, "reward_std": 0.2458491325378418, "rewards/accuracy_reward": 0.6402468681335449, "rewards/format_reward": 1.0, "step": 930, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 464.3000183105469, "epoch": 0.017694573790744083, "grad_norm": 1.6907576700414495, "kl": 0.125, "learning_rate": 9.9922766071225e-07, "loss": 0.005, "reward": 1.8650000095367432, "reward_std": 0.1977672576904297, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 931, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 463.5249938964844, "epoch": 0.017713579777629954, "grad_norm": 2.1710034956058215, "kl": 0.146484375, "learning_rate": 9.992260010889504e-07, "loss": 0.0058, "reward": 1.9315861463546753, "reward_std": 0.30427151918411255, "rewards/accuracy_reward": 0.7565861940383911, "rewards/format_reward": 1.0, "step": 932, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 455.45001220703125, "epoch": 0.01773258576451582, "grad_norm": 2.982162755567636, "kl": 0.12060546875, "learning_rate": 9.992243396858237e-07, "loss": 0.0048, "reward": 1.8697845935821533, "reward_std": 0.0642734169960022, "rewards/accuracy_reward": 0.6847846508026123, "rewards/format_reward": 1.0, "step": 933, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 425.625, "epoch": 0.01775159175140169, "grad_norm": 2.238340090834316, "kl": 0.091796875, "learning_rate": 9.99222676502876e-07, "loss": 0.0037, "reward": 1.5753962993621826, "reward_std": 0.27225151658058167, "rewards/accuracy_reward": 0.5853962898254395, "rewards/format_reward": 0.949999988079071, "step": 934, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 420.8999938964844, "epoch": 0.017770597738287562, "grad_norm": 1.7667811015325243, "kl": 0.1181640625, "learning_rate": 9.99221011540113e-07, "loss": 0.0047, "reward": 1.7462501525878906, "reward_std": 0.21943913400173187, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 935, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 474.20001220703125, "epoch": 0.01778960372517343, "grad_norm": 1.331285004061845, "kl": 0.11083984375, "learning_rate": 9.992193447975412e-07, "loss": 0.0044, "reward": 1.5946146249771118, "reward_std": 0.23505862057209015, "rewards/accuracy_reward": 0.4983646869659424, "rewards/format_reward": 0.9750000238418579, "step": 936, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 434.7250061035156, "epoch": 0.0178086097120593, "grad_norm": 2.292171652878736, "kl": 0.126953125, "learning_rate": 9.99217676275166e-07, "loss": 0.0051, "reward": 1.6480646133422852, "reward_std": 0.35812708735466003, "rewards/accuracy_reward": 0.5080644488334656, "rewards/format_reward": 1.0, "step": 937, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 435.6499938964844, "epoch": 0.017827615698945167, "grad_norm": 1.7570461417337828, "kl": 0.12060546875, "learning_rate": 9.992160059729933e-07, "loss": 0.0048, "reward": 2.0587499141693115, "reward_std": 0.2870778739452362, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 938, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 471.5, "epoch": 0.017846621685831038, "grad_norm": 5.372988577750309, "kl": 0.162109375, "learning_rate": 9.992143338910292e-07, "loss": 0.0065, "reward": 2.0, "reward_std": 0.05981827899813652, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 939, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 464.8500061035156, "epoch": 0.017865627672716905, "grad_norm": 1.6155353055526307, "kl": 0.1171875, "learning_rate": 9.992126600292799e-07, "loss": 0.0047, "reward": 1.9077577590942383, "reward_std": 0.08513978123664856, "rewards/accuracy_reward": 0.8065077066421509, "rewards/format_reward": 1.0, "step": 940, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 448.6499938964844, "epoch": 0.017884633659602776, "grad_norm": 4.703343237748224, "kl": 0.11572265625, "learning_rate": 9.99210984387751e-07, "loss": 0.0046, "reward": 2.0587499141693115, "reward_std": 0.22609496116638184, "rewards/accuracy_reward": 0.8375000357627869, "rewards/format_reward": 1.0, "step": 941, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.4750061035156, "epoch": 0.017903639646488643, "grad_norm": 2.0665089701744535, "kl": 0.11083984375, "learning_rate": 9.992093069664485e-07, "loss": 0.0044, "reward": 1.8625000715255737, "reward_std": 0.2017369121313095, "rewards/accuracy_reward": 0.7550000548362732, "rewards/format_reward": 1.0, "step": 942, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 494.95001220703125, "epoch": 0.017922645633374513, "grad_norm": 2.0539424581902113, "kl": 0.1259765625, "learning_rate": 9.992076277653787e-07, "loss": 0.005, "reward": 1.5616008043289185, "reward_std": 0.2615428566932678, "rewards/accuracy_reward": 0.6441008448600769, "rewards/format_reward": 0.824999988079071, "step": 943, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 462.125, "epoch": 0.01794165162026038, "grad_norm": 2.6413124699177715, "kl": 0.15234375, "learning_rate": 9.992059467845474e-07, "loss": 0.0061, "reward": 1.932103157043457, "reward_std": 0.038227807730436325, "rewards/accuracy_reward": 0.7308531999588013, "rewards/format_reward": 1.0, "step": 944, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 459.5500183105469, "epoch": 0.01796065760714625, "grad_norm": 3.5179101772434915, "kl": 0.361328125, "learning_rate": 9.992042640239606e-07, "loss": 0.0145, "reward": 2.0274999141693115, "reward_std": 0.3652125895023346, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.925000011920929, "step": 945, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 399.7749938964844, "epoch": 0.01797966359403212, "grad_norm": 3.1089886476084816, "kl": 0.373046875, "learning_rate": 9.99202579483624e-07, "loss": 0.0149, "reward": 1.3112766742706299, "reward_std": 0.16472071409225464, "rewards/accuracy_reward": 0.2525266110897064, "rewards/format_reward": 1.0, "step": 946, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 411.1750183105469, "epoch": 0.01799866958091799, "grad_norm": 2.840354424079927, "kl": 0.28125, "learning_rate": 9.992008931635443e-07, "loss": 0.0112, "reward": 1.9889267683029175, "reward_std": 0.476940393447876, "rewards/accuracy_reward": 0.8439265489578247, "rewards/format_reward": 0.949999988079071, "step": 947, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 416.4750061035156, "epoch": 0.01801767556780386, "grad_norm": 2.5975677131780777, "kl": 0.27734375, "learning_rate": 9.99199205063727e-07, "loss": 0.0111, "reward": 1.894955039024353, "reward_std": 0.19314749538898468, "rewards/accuracy_reward": 0.7112049460411072, "rewards/format_reward": 1.0, "step": 948, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 413.7250061035156, "epoch": 0.018036681554689727, "grad_norm": 12.987690832168017, "kl": 0.1220703125, "learning_rate": 9.99197515184178e-07, "loss": 0.0049, "reward": 1.8079410791397095, "reward_std": 0.05968896299600601, "rewards/accuracy_reward": 0.7729408740997314, "rewards/format_reward": 1.0, "step": 949, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 459.8999938964844, "epoch": 0.018055687541575598, "grad_norm": 2.0117058108923933, "kl": 0.1533203125, "learning_rate": 9.991958235249039e-07, "loss": 0.0061, "reward": 1.973196029663086, "reward_std": 0.15524807572364807, "rewards/accuracy_reward": 0.7856961488723755, "rewards/format_reward": 1.0, "step": 950, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 468.375, "epoch": 0.018074693528461465, "grad_norm": 2.7709125727215214, "kl": 0.1259765625, "learning_rate": 9.9919413008591e-07, "loss": 0.005, "reward": 1.6391013860702515, "reward_std": 0.0913296490907669, "rewards/accuracy_reward": 0.5053513646125793, "rewards/format_reward": 1.0, "step": 951, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 418.4750061035156, "epoch": 0.018093699515347336, "grad_norm": 3.0649735468559176, "kl": 0.19140625, "learning_rate": 9.991924348672031e-07, "loss": 0.0077, "reward": 2.096759796142578, "reward_std": 0.06396092474460602, "rewards/accuracy_reward": 0.8667598962783813, "rewards/format_reward": 1.0, "step": 952, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.018112705502233203, "grad_norm": 2.326711194017883, "kl": 0.1220703125, "learning_rate": 9.991907378687886e-07, "loss": 0.0049, "reward": 1.6628551483154297, "reward_std": 0.25205954909324646, "rewards/accuracy_reward": 0.6278550028800964, "rewards/format_reward": 1.0, "step": 953, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 438.875, "epoch": 0.018131711489119073, "grad_norm": 3.17294327314815, "kl": 0.2373046875, "learning_rate": 9.99189039090673e-07, "loss": 0.0095, "reward": 1.751649260520935, "reward_std": 0.29257771372795105, "rewards/accuracy_reward": 0.7066492438316345, "rewards/format_reward": 0.9750000238418579, "step": 954, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 403.375, "epoch": 0.01815071747600494, "grad_norm": 4.620806520620864, "kl": 0.2216796875, "learning_rate": 9.99187338532862e-07, "loss": 0.0089, "reward": 2.1465559005737305, "reward_std": 0.06309028714895248, "rewards/accuracy_reward": 0.9103057980537415, "rewards/format_reward": 1.0, "step": 955, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 427.375, "epoch": 0.01816972346289081, "grad_norm": 4.196303590071475, "kl": 0.3125, "learning_rate": 9.991856361953619e-07, "loss": 0.0125, "reward": 2.0, "reward_std": 0.05275532230734825, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 956, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 458.1499938964844, "epoch": 0.01818872944977668, "grad_norm": 1.5539313885356878, "kl": 0.099609375, "learning_rate": 9.991839320781787e-07, "loss": 0.004, "reward": 1.4706287384033203, "reward_std": 0.326867014169693, "rewards/accuracy_reward": 0.4743788242340088, "rewards/format_reward": 0.9750000238418579, "step": 957, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 473.2250061035156, "epoch": 0.01820773543666255, "grad_norm": 3.3356646775788477, "kl": 0.427734375, "learning_rate": 9.991822261813186e-07, "loss": 0.0171, "reward": 1.7416805028915405, "reward_std": 0.6748544573783875, "rewards/accuracy_reward": 0.7441805601119995, "rewards/format_reward": 0.8500000238418579, "step": 958, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 463.6000061035156, "epoch": 0.018226741423548416, "grad_norm": 3.093438098821877, "kl": 0.294921875, "learning_rate": 9.991805185047873e-07, "loss": 0.0118, "reward": 1.4122883081436157, "reward_std": 0.1356745809316635, "rewards/accuracy_reward": 0.42978811264038086, "rewards/format_reward": 0.9750000238418579, "step": 959, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 428.1499938964844, "epoch": 0.018245747410434287, "grad_norm": 4.588428440906342, "kl": 0.5859375, "learning_rate": 9.991788090485913e-07, "loss": 0.0234, "reward": 1.465193748474121, "reward_std": 0.13266170024871826, "rewards/accuracy_reward": 0.4364437758922577, "rewards/format_reward": 0.949999988079071, "step": 960, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 430.07501220703125, "epoch": 0.018264753397320158, "grad_norm": 2.2541051628772797, "kl": 0.2734375, "learning_rate": 9.991770978127365e-07, "loss": 0.0109, "reward": 1.6100000143051147, "reward_std": 0.22446639835834503, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 961, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 432.4750061035156, "epoch": 0.018283759384206025, "grad_norm": 4.152128087108797, "kl": 0.275390625, "learning_rate": 9.99175384797229e-07, "loss": 0.011, "reward": 1.8690799474716187, "reward_std": 0.19647662341594696, "rewards/accuracy_reward": 0.7065801024436951, "rewards/format_reward": 1.0, "step": 962, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 481.5, "epoch": 0.018302765371091895, "grad_norm": 2.585845757439392, "kl": 0.291015625, "learning_rate": 9.991736700020751e-07, "loss": 0.0117, "reward": 1.7849998474121094, "reward_std": 0.4497860074043274, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.9750000238418579, "step": 963, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 449.32501220703125, "epoch": 0.018321771357977763, "grad_norm": 3.2951027124883185, "kl": 0.2421875, "learning_rate": 9.991719534272806e-07, "loss": 0.0097, "reward": 1.557499885559082, "reward_std": 0.0724850744009018, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 1.0, "step": 964, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 428.1000061035156, "epoch": 0.018340777344863633, "grad_norm": 4.381643439121722, "kl": 0.37890625, "learning_rate": 9.991702350728518e-07, "loss": 0.0152, "reward": 1.87496018409729, "reward_std": 0.05095122382044792, "rewards/accuracy_reward": 0.6649600863456726, "rewards/format_reward": 1.0, "step": 965, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 511.0, "epoch": 0.0183597833317495, "grad_norm": 15.244856557677098, "kl": 0.30859375, "learning_rate": 9.99168514938795e-07, "loss": 0.0123, "reward": 1.810625433921814, "reward_std": 0.25420841574668884, "rewards/accuracy_reward": 0.8168756365776062, "rewards/format_reward": 0.949999988079071, "step": 966, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 427.0500183105469, "epoch": 0.01837878931863537, "grad_norm": 2.1756852110703693, "kl": 0.115234375, "learning_rate": 9.99166793025116e-07, "loss": 0.0046, "reward": 1.7404800653457642, "reward_std": 0.18577569723129272, "rewards/accuracy_reward": 0.6242300868034363, "rewards/format_reward": 1.0, "step": 967, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 475.7250061035156, "epoch": 0.01839779530552124, "grad_norm": 2.113511488614174, "kl": 0.0908203125, "learning_rate": 9.991650693318213e-07, "loss": 0.0036, "reward": 1.9334135055541992, "reward_std": 0.055705346167087555, "rewards/accuracy_reward": 0.8371635675430298, "rewards/format_reward": 1.0, "step": 968, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 496.0500183105469, "epoch": 0.01841680129240711, "grad_norm": 2.628306037529305, "kl": 0.1142578125, "learning_rate": 9.991633438589164e-07, "loss": 0.0046, "reward": 1.7807127237319946, "reward_std": 0.06006943807005882, "rewards/accuracy_reward": 0.6657127141952515, "rewards/format_reward": 1.0, "step": 969, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 507.20001220703125, "epoch": 0.018435807279292976, "grad_norm": 2.8141559042053665, "kl": 0.1328125, "learning_rate": 9.99161616606408e-07, "loss": 0.0053, "reward": 1.6087499856948853, "reward_std": 0.1750754415988922, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 970, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 503.57501220703125, "epoch": 0.018454813266178847, "grad_norm": 1.9481236056538105, "kl": 0.1337890625, "learning_rate": 9.991598875743023e-07, "loss": 0.0053, "reward": 2.0799663066864014, "reward_std": 0.06390620768070221, "rewards/accuracy_reward": 0.8687164187431335, "rewards/format_reward": 1.0, "step": 971, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 482.32501220703125, "epoch": 0.018473819253064714, "grad_norm": 2.1944315347193553, "kl": 0.10986328125, "learning_rate": 9.991581567626053e-07, "loss": 0.0044, "reward": 1.6176159381866455, "reward_std": 0.29334235191345215, "rewards/accuracy_reward": 0.5413660407066345, "rewards/format_reward": 1.0, "step": 972, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 491.95001220703125, "epoch": 0.018492825239950585, "grad_norm": 1.4056729502158087, "kl": 0.1318359375, "learning_rate": 9.99156424171323e-07, "loss": 0.0053, "reward": 1.743749976158142, "reward_std": 0.04665544256567955, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 973, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 447.3999938964844, "epoch": 0.018511831226836452, "grad_norm": 1.8518768973510429, "kl": 0.0908203125, "learning_rate": 9.99154689800462e-07, "loss": 0.0036, "reward": 1.3625515699386597, "reward_std": 0.23056507110595703, "rewards/accuracy_reward": 0.3863014876842499, "rewards/format_reward": 0.9750000238418579, "step": 974, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 481.1000061035156, "epoch": 0.018530837213722322, "grad_norm": 1.1990804074264203, "kl": 0.134765625, "learning_rate": 9.991529536500281e-07, "loss": 0.0054, "reward": 1.8637501001358032, "reward_std": 0.1592256873846054, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 975, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 478.57501220703125, "epoch": 0.018549843200608193, "grad_norm": 1.9793808979708043, "kl": 0.130859375, "learning_rate": 9.991512157200274e-07, "loss": 0.0052, "reward": 1.8549998998641968, "reward_std": 0.13763809204101562, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 976, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 468.3500061035156, "epoch": 0.01856884918749406, "grad_norm": 1.455494972313243, "kl": 0.11328125, "learning_rate": 9.991494760104666e-07, "loss": 0.0045, "reward": 1.954702377319336, "reward_std": 0.24830415844917297, "rewards/accuracy_reward": 0.7997024059295654, "rewards/format_reward": 1.0, "step": 977, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 495.95001220703125, "epoch": 0.01858785517437993, "grad_norm": 1.2845338422412447, "kl": 0.1357421875, "learning_rate": 9.991477345213516e-07, "loss": 0.0054, "reward": 1.7600164413452148, "reward_std": 0.04295359179377556, "rewards/accuracy_reward": 0.6800164580345154, "rewards/format_reward": 1.0, "step": 978, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 470.82501220703125, "epoch": 0.018606861161265798, "grad_norm": 2.524473888507463, "kl": 0.1201171875, "learning_rate": 9.991459912526885e-07, "loss": 0.0048, "reward": 1.9898589849472046, "reward_std": 0.14100247621536255, "rewards/accuracy_reward": 0.7848591208457947, "rewards/format_reward": 1.0, "step": 979, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 421.8500061035156, "epoch": 0.01862586714815167, "grad_norm": 1.5353370725495674, "kl": 0.07763671875, "learning_rate": 9.991442462044838e-07, "loss": 0.0031, "reward": 1.7439583539962769, "reward_std": 0.05997762829065323, "rewards/accuracy_reward": 0.7239583730697632, "rewards/format_reward": 1.0, "step": 980, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 514.6000366210938, "epoch": 0.018644873135037536, "grad_norm": 1.2507029833332797, "kl": 0.08935546875, "learning_rate": 9.991424993767435e-07, "loss": 0.0036, "reward": 1.5458621978759766, "reward_std": 0.2544994652271271, "rewards/accuracy_reward": 0.5408622026443481, "rewards/format_reward": 0.9750000238418579, "step": 981, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 513.2250366210938, "epoch": 0.018663879121923407, "grad_norm": 1.6842281728942927, "kl": 0.1103515625, "learning_rate": 9.991407507694739e-07, "loss": 0.0044, "reward": 1.60509192943573, "reward_std": 0.37781214714050293, "rewards/accuracy_reward": 0.6463419795036316, "rewards/format_reward": 0.9000000357627869, "step": 982, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 483.20001220703125, "epoch": 0.018682885108809274, "grad_norm": 1.7394526175220422, "kl": 0.1259765625, "learning_rate": 9.991390003826811e-07, "loss": 0.005, "reward": 1.96294105052948, "reward_std": 0.06293322145938873, "rewards/accuracy_reward": 0.7779411673545837, "rewards/format_reward": 1.0, "step": 983, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 470.6000061035156, "epoch": 0.018701891095695145, "grad_norm": 1.4538016558032507, "kl": 0.1142578125, "learning_rate": 9.991372482163715e-07, "loss": 0.0046, "reward": 1.256554365158081, "reward_std": 0.22981882095336914, "rewards/accuracy_reward": 0.19280432164669037, "rewards/format_reward": 1.0, "step": 984, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 437.5249938964844, "epoch": 0.01872089708258101, "grad_norm": 1.8900056804728786, "kl": 0.07763671875, "learning_rate": 9.991354942705515e-07, "loss": 0.0031, "reward": 1.4299341440200806, "reward_std": 0.04026506096124649, "rewards/accuracy_reward": 0.36118412017822266, "rewards/format_reward": 1.0, "step": 985, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 468.625, "epoch": 0.018739903069466882, "grad_norm": 1.915245768357843, "kl": 0.0966796875, "learning_rate": 9.99133738545227e-07, "loss": 0.0039, "reward": 1.7253516912460327, "reward_std": 0.060850657522678375, "rewards/accuracy_reward": 0.5203516483306885, "rewards/format_reward": 1.0, "step": 986, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 455.57501220703125, "epoch": 0.01875890905635275, "grad_norm": 1.5382300833377138, "kl": 0.115234375, "learning_rate": 9.991319810404045e-07, "loss": 0.0046, "reward": 1.631250023841858, "reward_std": 0.12575089931488037, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 987, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 464.1499938964844, "epoch": 0.01877791504323862, "grad_norm": 3.1192437339873416, "kl": 0.0791015625, "learning_rate": 9.9913022175609e-07, "loss": 0.0032, "reward": 1.507525086402893, "reward_std": 0.15524284541606903, "rewards/accuracy_reward": 0.43752503395080566, "rewards/format_reward": 1.0, "step": 988, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 449.1499938964844, "epoch": 0.01879692103012449, "grad_norm": 1.54914774404728, "kl": 0.091796875, "learning_rate": 9.9912846069229e-07, "loss": 0.0037, "reward": 1.4654277563095093, "reward_std": 0.17589350044727325, "rewards/accuracy_reward": 0.4516778886318207, "rewards/format_reward": 1.0, "step": 989, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 469.82501220703125, "epoch": 0.018815927017010358, "grad_norm": 1.4213697094478461, "kl": 0.103515625, "learning_rate": 9.991266978490108e-07, "loss": 0.0041, "reward": 1.7612498998641968, "reward_std": 0.2095511257648468, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 990, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 481.45001220703125, "epoch": 0.01883493300389623, "grad_norm": 1.6312655250495025, "kl": 0.1201171875, "learning_rate": 9.991249332262588e-07, "loss": 0.0048, "reward": 1.7371524572372437, "reward_std": 0.032223377376794815, "rewards/accuracy_reward": 0.6384023427963257, "rewards/format_reward": 1.0, "step": 991, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 485.20001220703125, "epoch": 0.018853938990782096, "grad_norm": 1.3182864645673726, "kl": 0.1103515625, "learning_rate": 9.9912316682404e-07, "loss": 0.0044, "reward": 1.933750033378601, "reward_std": 0.13984672725200653, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 992, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 480.5249938964844, "epoch": 0.018872944977667967, "grad_norm": 1.5844503657364932, "kl": 0.1376953125, "learning_rate": 9.991213986423608e-07, "loss": 0.0055, "reward": 1.4237499237060547, "reward_std": 0.2008010596036911, "rewards/accuracy_reward": 0.3499999940395355, "rewards/format_reward": 1.0, "step": 993, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 408.3999938964844, "epoch": 0.018891950964553834, "grad_norm": 1.8887526543539201, "kl": 0.0703125, "learning_rate": 9.991196286812277e-07, "loss": 0.0028, "reward": 1.6798213720321655, "reward_std": 0.1683581918478012, "rewards/accuracy_reward": 0.5910714268684387, "rewards/format_reward": 1.0, "step": 994, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 495.5249938964844, "epoch": 0.018910956951439704, "grad_norm": 1.8087575175397754, "kl": 0.1416015625, "learning_rate": 9.991178569406465e-07, "loss": 0.0056, "reward": 2.006077527999878, "reward_std": 0.07102253288030624, "rewards/accuracy_reward": 0.8798274993896484, "rewards/format_reward": 1.0, "step": 995, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 441.3000183105469, "epoch": 0.01892996293832557, "grad_norm": 1.7108546851866144, "kl": 0.0869140625, "learning_rate": 9.99116083420624e-07, "loss": 0.0035, "reward": 1.56125009059906, "reward_std": 0.27193161845207214, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 996, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 461.1499938964844, "epoch": 0.018948968925211442, "grad_norm": 9.216624129646492, "kl": 0.10400390625, "learning_rate": 9.991143081211663e-07, "loss": 0.0042, "reward": 2.0642831325531006, "reward_std": 0.1092679500579834, "rewards/accuracy_reward": 0.8342830538749695, "rewards/format_reward": 1.0, "step": 997, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.6, "completion_length": 451.125, "epoch": 0.01896797491209731, "grad_norm": 1.4393901320995328, "kl": 0.134765625, "learning_rate": 9.9911253104228e-07, "loss": 0.0054, "reward": 1.4824999570846558, "reward_std": 0.012247446924448013, "rewards/accuracy_reward": 0.4000000059604645, "rewards/format_reward": 1.0, "step": 998, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 441.57501220703125, "epoch": 0.01898698089898318, "grad_norm": 1.8310984839526372, "kl": 0.130859375, "learning_rate": 9.99110752183971e-07, "loss": 0.0052, "reward": 1.5205000638961792, "reward_std": 0.1124877855181694, "rewards/accuracy_reward": 0.43800002336502075, "rewards/format_reward": 1.0, "step": 999, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 442.32501220703125, "epoch": 0.019005986885869047, "grad_norm": 1.9001223645439553, "kl": 0.1240234375, "learning_rate": 9.99108971546246e-07, "loss": 0.005, "reward": 1.680837631225586, "reward_std": 0.2153915911912918, "rewards/accuracy_reward": 0.5345876812934875, "rewards/format_reward": 1.0, "step": 1000, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 432.2749938964844, "epoch": 0.019024992872754918, "grad_norm": 1.9630811609313032, "kl": 0.10107421875, "learning_rate": 9.991071891291114e-07, "loss": 0.004, "reward": 1.687555193901062, "reward_std": 0.05677928403019905, "rewards/accuracy_reward": 0.5300551652908325, "rewards/format_reward": 1.0, "step": 1001, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 456.6750183105469, "epoch": 0.019043998859640785, "grad_norm": 1.3656022773433452, "kl": 0.1376953125, "learning_rate": 9.991054049325731e-07, "loss": 0.0055, "reward": 1.6224998235702515, "reward_std": 0.12271543592214584, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 1.0, "step": 1002, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 457.1499938964844, "epoch": 0.019063004846526656, "grad_norm": 2.318556812619703, "kl": 0.134765625, "learning_rate": 9.991036189566378e-07, "loss": 0.0054, "reward": 1.87375009059906, "reward_std": 0.09894287586212158, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 1003, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 480.9750061035156, "epoch": 0.019082010833412526, "grad_norm": 1.903658302580525, "kl": 0.12255859375, "learning_rate": 9.991018312013118e-07, "loss": 0.0049, "reward": 2.0052125453948975, "reward_std": 0.10790147632360458, "rewards/accuracy_reward": 0.8402124643325806, "rewards/format_reward": 1.0, "step": 1004, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 448.5, "epoch": 0.019101016820298394, "grad_norm": 1.805451294129596, "kl": 0.11279296875, "learning_rate": 9.991000416666015e-07, "loss": 0.0045, "reward": 1.611193060874939, "reward_std": 0.04346662759780884, "rewards/accuracy_reward": 0.5111930966377258, "rewards/format_reward": 1.0, "step": 1005, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 425.07501220703125, "epoch": 0.019120022807184264, "grad_norm": 1.781836315521842, "kl": 0.08642578125, "learning_rate": 9.990982503525134e-07, "loss": 0.0034, "reward": 1.6278969049453735, "reward_std": 0.31013163924217224, "rewards/accuracy_reward": 0.5953971147537231, "rewards/format_reward": 1.0, "step": 1006, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 511.82501220703125, "epoch": 0.01913902879407013, "grad_norm": 1.5531280531862826, "kl": 0.1201171875, "learning_rate": 9.990964572590537e-07, "loss": 0.0048, "reward": 1.590000033378601, "reward_std": 0.2036997526884079, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 0.9750000238418579, "step": 1007, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 451.75, "epoch": 0.019158034780956002, "grad_norm": 1.40396899180214, "kl": 0.09375, "learning_rate": 9.990946623862286e-07, "loss": 0.0037, "reward": 1.6199522018432617, "reward_std": 0.163608580827713, "rewards/accuracy_reward": 0.47745224833488464, "rewards/format_reward": 1.0, "step": 1008, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 489.8999938964844, "epoch": 0.01917704076784187, "grad_norm": 1.834123059099525, "kl": 0.12451171875, "learning_rate": 9.990928657340448e-07, "loss": 0.005, "reward": 1.5168421268463135, "reward_std": 0.41266241669654846, "rewards/accuracy_reward": 0.4555921256542206, "rewards/format_reward": 0.9750000238418579, "step": 1009, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 469.95001220703125, "epoch": 0.01919604675472774, "grad_norm": 1.7488146322306355, "kl": 0.1552734375, "learning_rate": 9.99091067302509e-07, "loss": 0.0062, "reward": 2.257500171661377, "reward_std": 0.12623165547847748, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 1010, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 466.0, "epoch": 0.019215052741613607, "grad_norm": 1.553807296078126, "kl": 0.08642578125, "learning_rate": 9.99089267091627e-07, "loss": 0.0035, "reward": 1.65625, "reward_std": 0.0689966008067131, "rewards/accuracy_reward": 0.5925000309944153, "rewards/format_reward": 1.0, "step": 1011, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 442.1000061035156, "epoch": 0.019234058728499478, "grad_norm": 2.090037041687299, "kl": 0.17578125, "learning_rate": 9.990874651014054e-07, "loss": 0.007, "reward": 1.7426666021347046, "reward_std": 0.294862300157547, "rewards/accuracy_reward": 0.5976666808128357, "rewards/format_reward": 1.0, "step": 1012, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 469.8500061035156, "epoch": 0.019253064715385345, "grad_norm": 3.0827752207802965, "kl": 0.1435546875, "learning_rate": 9.990856613318506e-07, "loss": 0.0057, "reward": 2.1619644165039062, "reward_std": 0.03936680033802986, "rewards/accuracy_reward": 0.9957141876220703, "rewards/format_reward": 1.0, "step": 1013, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 492.32501220703125, "epoch": 0.019272070702271216, "grad_norm": 2.2608813479083283, "kl": 0.119140625, "learning_rate": 9.990838557829694e-07, "loss": 0.0048, "reward": 1.7749525308609009, "reward_std": 0.15113787353038788, "rewards/accuracy_reward": 0.581202507019043, "rewards/format_reward": 1.0, "step": 1014, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 476.7250061035156, "epoch": 0.019291076689157083, "grad_norm": 2.4735848437679904, "kl": 0.181640625, "learning_rate": 9.990820484547677e-07, "loss": 0.0073, "reward": 1.8574999570846558, "reward_std": 0.35503897070884705, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.9750000238418579, "step": 1015, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 471.3500061035156, "epoch": 0.019310082676042954, "grad_norm": 2.0349285847414365, "kl": 0.1611328125, "learning_rate": 9.990802393472527e-07, "loss": 0.0065, "reward": 1.8112499713897705, "reward_std": 0.1496996134519577, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1016, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 438.3500061035156, "epoch": 0.019329088662928824, "grad_norm": 21.869414747166775, "kl": 0.1376953125, "learning_rate": 9.9907842846043e-07, "loss": 0.0055, "reward": 1.9371391534805298, "reward_std": 0.07876028120517731, "rewards/accuracy_reward": 0.7758890390396118, "rewards/format_reward": 1.0, "step": 1017, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 448.32501220703125, "epoch": 0.01934809464981469, "grad_norm": 2.5556945127144792, "kl": 0.142578125, "learning_rate": 9.990766157943063e-07, "loss": 0.0057, "reward": 2.022892951965332, "reward_std": 0.056258928030729294, "rewards/accuracy_reward": 0.7966430187225342, "rewards/format_reward": 1.0, "step": 1018, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 421.0, "epoch": 0.019367100636700562, "grad_norm": 2.118697152745554, "kl": 0.1396484375, "learning_rate": 9.990748013488883e-07, "loss": 0.0056, "reward": 2.049999952316284, "reward_std": 0.09520556777715683, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 1019, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 412.375, "epoch": 0.01938610662358643, "grad_norm": 2.163694047007911, "kl": 0.146484375, "learning_rate": 9.990729851241823e-07, "loss": 0.0059, "reward": 1.8013746738433838, "reward_std": 0.17564113438129425, "rewards/accuracy_reward": 0.6388747692108154, "rewards/format_reward": 1.0, "step": 1020, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 427.6750183105469, "epoch": 0.0194051126104723, "grad_norm": 1.981511956734752, "kl": 0.1279296875, "learning_rate": 9.99071167120195e-07, "loss": 0.0051, "reward": 1.7745332717895508, "reward_std": 0.12141894549131393, "rewards/accuracy_reward": 0.6207833290100098, "rewards/format_reward": 1.0, "step": 1021, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 449.875, "epoch": 0.019424118597358167, "grad_norm": 2.1361217390911644, "kl": 0.07958984375, "learning_rate": 9.990693473369325e-07, "loss": 0.0032, "reward": 1.4644477367401123, "reward_std": 0.3071606457233429, "rewards/accuracy_reward": 0.41444769501686096, "rewards/format_reward": 1.0, "step": 1022, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 410.95001220703125, "epoch": 0.019443124584244038, "grad_norm": 2.2489954767139735, "kl": 0.10546875, "learning_rate": 9.990675257744017e-07, "loss": 0.0042, "reward": 1.7889044284820557, "reward_std": 0.11797785758972168, "rewards/accuracy_reward": 0.690154492855072, "rewards/format_reward": 1.0, "step": 1023, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 404.2250061035156, "epoch": 0.019462130571129905, "grad_norm": 2.653349206098878, "kl": 0.10693359375, "learning_rate": 9.990657024326087e-07, "loss": 0.0043, "reward": 1.4250637292861938, "reward_std": 0.25342002511024475, "rewards/accuracy_reward": 0.3488137125968933, "rewards/format_reward": 1.0, "step": 1024, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 406.95001220703125, "epoch": 0.019481136558015776, "grad_norm": 1.7123506878473547, "kl": 0.1279296875, "learning_rate": 9.990638773115604e-07, "loss": 0.0051, "reward": 1.7537498474121094, "reward_std": 0.01837117038667202, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 1025, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 397.125, "epoch": 0.019500142544901643, "grad_norm": 2.212712015139928, "kl": 0.142578125, "learning_rate": 9.99062050411263e-07, "loss": 0.0057, "reward": 1.9385731220245361, "reward_std": 0.13834629952907562, "rewards/accuracy_reward": 0.7660731673240662, "rewards/format_reward": 1.0, "step": 1026, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 426.75, "epoch": 0.019519148531787513, "grad_norm": 2.1062121885530103, "kl": 0.177734375, "learning_rate": 9.99060221731723e-07, "loss": 0.0071, "reward": 2.0250000953674316, "reward_std": 0.024494869634509087, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1027, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 392.1499938964844, "epoch": 0.01953815451867338, "grad_norm": 2.8179326490289958, "kl": 0.1748046875, "learning_rate": 9.99058391272947e-07, "loss": 0.007, "reward": 1.9591667652130127, "reward_std": 0.13837946951389313, "rewards/accuracy_reward": 0.7866666913032532, "rewards/format_reward": 1.0, "step": 1028, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 406.5500183105469, "epoch": 0.01955716050555925, "grad_norm": 2.4705107867070684, "kl": 0.169921875, "learning_rate": 9.990565590349418e-07, "loss": 0.0068, "reward": 2.1500000953674316, "reward_std": 0.09946427494287491, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 1029, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 405.4750061035156, "epoch": 0.019576166492445122, "grad_norm": 1.7775593490466943, "kl": 0.12109375, "learning_rate": 9.990547250177134e-07, "loss": 0.0048, "reward": 1.681867003440857, "reward_std": 0.10608299821615219, "rewards/accuracy_reward": 0.6531171202659607, "rewards/format_reward": 1.0, "step": 1030, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 387.75, "epoch": 0.01959517247933099, "grad_norm": 1.7065115803590838, "kl": 0.123046875, "learning_rate": 9.990528892212687e-07, "loss": 0.0049, "reward": 1.53125, "reward_std": 0.11581762135028839, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 1.0, "step": 1031, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 388.1750183105469, "epoch": 0.01961417846621686, "grad_norm": 2.7571682695840933, "kl": 0.26171875, "learning_rate": 9.990510516456143e-07, "loss": 0.0105, "reward": 2.093621015548706, "reward_std": 0.05897649750113487, "rewards/accuracy_reward": 0.858620822429657, "rewards/format_reward": 1.0, "step": 1032, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 414.3000183105469, "epoch": 0.019633184453102727, "grad_norm": 3.9120173181912197, "kl": 0.294921875, "learning_rate": 9.990492122907566e-07, "loss": 0.0118, "reward": 2.351250171661377, "reward_std": 0.036220937967300415, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1033, "temporal_rewards": 1.0 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 426.1499938964844, "epoch": 0.019652190439988598, "grad_norm": 2.3178758492640164, "kl": 0.2451171875, "learning_rate": 9.99047371156702e-07, "loss": 0.0098, "reward": 1.8461750745773315, "reward_std": 0.30464860796928406, "rewards/accuracy_reward": 0.78742516040802, "rewards/format_reward": 0.925000011920929, "step": 1034, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 384.7749938964844, "epoch": 0.019671196426874465, "grad_norm": 2.8619659164198388, "kl": 0.1767578125, "learning_rate": 9.990455282434572e-07, "loss": 0.0071, "reward": 1.6730884313583374, "reward_std": 0.10481330007314682, "rewards/accuracy_reward": 0.5693384408950806, "rewards/format_reward": 1.0, "step": 1035, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 394.1499938964844, "epoch": 0.019690202413760335, "grad_norm": 1.8127472887440206, "kl": 0.26953125, "learning_rate": 9.99043683551029e-07, "loss": 0.0108, "reward": 1.7037500143051147, "reward_std": 0.10087790340185165, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1036, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 380.0249938964844, "epoch": 0.019709208400646203, "grad_norm": 2.0569088952554937, "kl": 0.228515625, "learning_rate": 9.990418370794239e-07, "loss": 0.0092, "reward": 1.6823810338974, "reward_std": 0.06980056315660477, "rewards/accuracy_reward": 0.6461309790611267, "rewards/format_reward": 1.0, "step": 1037, "temporal_rewards": 0.5 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 431.95001220703125, "epoch": 0.019728214387532073, "grad_norm": 2.5848940539338154, "kl": 0.296875, "learning_rate": 9.990399888286482e-07, "loss": 0.0118, "reward": 2.2649998664855957, "reward_std": 0.10506659746170044, "rewards/accuracy_reward": 0.9750000238418579, "rewards/format_reward": 1.0, "step": 1038, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 424.5, "epoch": 0.01974722037441794, "grad_norm": 2.7547516533454206, "kl": 0.416015625, "learning_rate": 9.990381387987086e-07, "loss": 0.0166, "reward": 1.741979956626892, "reward_std": 0.41703328490257263, "rewards/accuracy_reward": 0.7407300472259521, "rewards/format_reward": 0.925000011920929, "step": 1039, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 412.8500061035156, "epoch": 0.01976622636130381, "grad_norm": 2.6965592038809496, "kl": 0.32421875, "learning_rate": 9.990362869896119e-07, "loss": 0.013, "reward": 1.6772819757461548, "reward_std": 0.5514366030693054, "rewards/accuracy_reward": 0.7460320591926575, "rewards/format_reward": 0.9000000357627869, "step": 1040, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 449.9250183105469, "epoch": 0.01978523234818968, "grad_norm": 2.1342571278000584, "kl": 0.283203125, "learning_rate": 9.990344334013646e-07, "loss": 0.0113, "reward": 1.9636785984039307, "reward_std": 0.29915428161621094, "rewards/accuracy_reward": 0.7911784648895264, "rewards/format_reward": 0.9750000238418579, "step": 1041, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 407.1750183105469, "epoch": 0.01980423833507555, "grad_norm": 4.819846812951813, "kl": 0.4921875, "learning_rate": 9.99032578033973e-07, "loss": 0.0197, "reward": 1.6124862432479858, "reward_std": 0.1234317421913147, "rewards/accuracy_reward": 0.4924861490726471, "rewards/format_reward": 0.9750000238418579, "step": 1042, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 427.45001220703125, "epoch": 0.019823244321961416, "grad_norm": 1.3106107015759494, "kl": 0.1455078125, "learning_rate": 9.990307208874442e-07, "loss": 0.0058, "reward": 1.9005578756332397, "reward_std": 0.3362431526184082, "rewards/accuracy_reward": 0.839307963848114, "rewards/format_reward": 0.9750000238418579, "step": 1043, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 378.3500061035156, "epoch": 0.019842250308847287, "grad_norm": 3.034639156082052, "kl": 0.375, "learning_rate": 9.990288619617844e-07, "loss": 0.015, "reward": 1.4457237720489502, "reward_std": 0.47860288619995117, "rewards/accuracy_reward": 0.534473717212677, "rewards/format_reward": 0.9000000357627869, "step": 1044, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.25, "epoch": 0.019861256295733157, "grad_norm": 2.7615694846522914, "kl": 0.57421875, "learning_rate": 9.990270012570005e-07, "loss": 0.023, "reward": 1.8250000476837158, "reward_std": 0.4990555942058563, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 0.925000011920929, "step": 1045, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 439.4250183105469, "epoch": 0.019880262282619025, "grad_norm": 1.8898014206055522, "kl": 0.322265625, "learning_rate": 9.990251387730993e-07, "loss": 0.0129, "reward": 1.6197454929351807, "reward_std": 0.22780810296535492, "rewards/accuracy_reward": 0.5297453999519348, "rewards/format_reward": 1.0, "step": 1046, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 427.9750061035156, "epoch": 0.019899268269504895, "grad_norm": 5.50585571663118, "kl": 0.5625, "learning_rate": 9.990232745100869e-07, "loss": 0.0225, "reward": 1.5639976263046265, "reward_std": 0.13982482254505157, "rewards/accuracy_reward": 0.48024773597717285, "rewards/format_reward": 0.9750000238418579, "step": 1047, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 401.5, "epoch": 0.019918274256390762, "grad_norm": 2.7239170576983307, "kl": 0.2890625, "learning_rate": 9.990214084679705e-07, "loss": 0.0116, "reward": 1.4754761457443237, "reward_std": 0.18877726793289185, "rewards/accuracy_reward": 0.4154761731624603, "rewards/format_reward": 0.9750000238418579, "step": 1048, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 372.70001220703125, "epoch": 0.019937280243276633, "grad_norm": 2.047033943669166, "kl": 0.33203125, "learning_rate": 9.990195406467563e-07, "loss": 0.0133, "reward": 1.8841667175292969, "reward_std": 0.04042382910847664, "rewards/accuracy_reward": 0.7991666793823242, "rewards/format_reward": 1.0, "step": 1049, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 367.2250061035156, "epoch": 0.0199562862301625, "grad_norm": 2.7595244558349434, "kl": 0.3515625, "learning_rate": 9.990176710464512e-07, "loss": 0.0141, "reward": 1.8950246572494507, "reward_std": 0.24025297164916992, "rewards/accuracy_reward": 0.7262746691703796, "rewards/format_reward": 1.0, "step": 1050, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 419.5500183105469, "epoch": 0.01997529221704837, "grad_norm": 2.856255799647025, "kl": 0.703125, "learning_rate": 9.990157996670619e-07, "loss": 0.028, "reward": 1.7473324537277222, "reward_std": 0.3358840048313141, "rewards/accuracy_reward": 0.632332444190979, "rewards/format_reward": 0.9750000238418579, "step": 1051, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 443.2749938964844, "epoch": 0.019994298203934238, "grad_norm": 2.267571252125733, "kl": 0.3125, "learning_rate": 9.99013926508595e-07, "loss": 0.0125, "reward": 1.845000147819519, "reward_std": 0.4743429124355316, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.949999988079071, "step": 1052, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 375.8000183105469, "epoch": 0.02001330419082011, "grad_norm": 2.785642060901953, "kl": 0.5234375, "learning_rate": 9.990120515710572e-07, "loss": 0.0209, "reward": 1.6485786437988281, "reward_std": 0.2498089075088501, "rewards/accuracy_reward": 0.5498287081718445, "rewards/format_reward": 1.0, "step": 1053, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 396.95001220703125, "epoch": 0.020032310177705976, "grad_norm": 2.9415979783586526, "kl": 0.435546875, "learning_rate": 9.99010174854455e-07, "loss": 0.0174, "reward": 1.857224702835083, "reward_std": 0.11635222285985947, "rewards/accuracy_reward": 0.7159746289253235, "rewards/format_reward": 0.9750000238418579, "step": 1054, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 383.45001220703125, "epoch": 0.020051316164591847, "grad_norm": 3.452851265646793, "kl": 0.5625, "learning_rate": 9.990082963587954e-07, "loss": 0.0226, "reward": 1.5637085437774658, "reward_std": 0.16516831517219543, "rewards/accuracy_reward": 0.46870848536491394, "rewards/format_reward": 1.0, "step": 1055, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 434.9750061035156, "epoch": 0.020070322151477714, "grad_norm": 4.607732604558867, "kl": 0.349609375, "learning_rate": 9.990064160840848e-07, "loss": 0.014, "reward": 1.7404297590255737, "reward_std": 0.39258453249931335, "rewards/accuracy_reward": 0.6341797709465027, "rewards/format_reward": 0.949999988079071, "step": 1056, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 411.2250061035156, "epoch": 0.020089328138363585, "grad_norm": 1.7609440047723581, "kl": 0.3046875, "learning_rate": 9.990045340303302e-07, "loss": 0.0121, "reward": 1.5140036344528198, "reward_std": 0.4501815736293793, "rewards/accuracy_reward": 0.5265036225318909, "rewards/format_reward": 0.925000011920929, "step": 1057, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 391.3999938964844, "epoch": 0.020108334125249455, "grad_norm": 2.842884566334913, "kl": 0.53515625, "learning_rate": 9.990026501975382e-07, "loss": 0.0215, "reward": 1.821428656578064, "reward_std": 0.2050504982471466, "rewards/accuracy_reward": 0.7214285731315613, "rewards/format_reward": 1.0, "step": 1058, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 402.8999938964844, "epoch": 0.020127340112135322, "grad_norm": 2.9778563623613348, "kl": 0.38671875, "learning_rate": 9.990007645857153e-07, "loss": 0.0155, "reward": 2.0104236602783203, "reward_std": 0.15403418242931366, "rewards/accuracy_reward": 0.8979236483573914, "rewards/format_reward": 1.0, "step": 1059, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 458.20001220703125, "epoch": 0.020146346099021193, "grad_norm": 3.294990752610389, "kl": 0.435546875, "learning_rate": 9.989988771948685e-07, "loss": 0.0174, "reward": 2.356250047683716, "reward_std": 0.03061860240995884, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1060, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 437.0500183105469, "epoch": 0.02016535208590706, "grad_norm": 3.019277322578053, "kl": 0.58984375, "learning_rate": 9.989969880250044e-07, "loss": 0.0235, "reward": 1.6652908325195312, "reward_std": 0.24903574585914612, "rewards/accuracy_reward": 0.5802907943725586, "rewards/format_reward": 0.9750000238418579, "step": 1061, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 402.07501220703125, "epoch": 0.02018435807279293, "grad_norm": 2.6544577749862595, "kl": 0.49609375, "learning_rate": 9.989950970761298e-07, "loss": 0.0198, "reward": 1.5977493524551392, "reward_std": 0.3752979040145874, "rewards/accuracy_reward": 0.5352492928504944, "rewards/format_reward": 0.9750000238418579, "step": 1062, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 456.8000183105469, "epoch": 0.020203364059678798, "grad_norm": 2.35222328332762, "kl": 0.6796875, "learning_rate": 9.989932043482515e-07, "loss": 0.0271, "reward": 1.3567017316818237, "reward_std": 0.4286865293979645, "rewards/accuracy_reward": 0.3717016279697418, "rewards/format_reward": 0.949999988079071, "step": 1063, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 441.125, "epoch": 0.02022237004656467, "grad_norm": 4.283698679859192, "kl": 0.78125, "learning_rate": 9.989913098413758e-07, "loss": 0.0312, "reward": 1.4906439781188965, "reward_std": 0.1723892241716385, "rewards/accuracy_reward": 0.44439396262168884, "rewards/format_reward": 1.0, "step": 1064, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 531.2750244140625, "epoch": 0.020241376033450536, "grad_norm": 4.876568017614198, "kl": 1.5078125, "learning_rate": 9.989894135555102e-07, "loss": 0.0602, "reward": 1.5412499904632568, "reward_std": 1.0078068971633911, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 0.75, "step": 1065, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 541.5499877929688, "epoch": 0.020260382020336407, "grad_norm": 3.2720555968355365, "kl": 1.1875, "learning_rate": 9.989875154906608e-07, "loss": 0.0474, "reward": 1.0928294658660889, "reward_std": 0.7177988886833191, "rewards/accuracy_reward": 0.40657949447631836, "rewards/format_reward": 0.699999988079071, "step": 1066, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 457.6750183105469, "epoch": 0.020279388007222274, "grad_norm": 4.448078878169397, "kl": 0.97265625, "learning_rate": 9.98985615646835e-07, "loss": 0.0388, "reward": 1.4606159925460815, "reward_std": 0.2986356317996979, "rewards/accuracy_reward": 0.5743657946586609, "rewards/format_reward": 0.925000011920929, "step": 1067, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 517.4500122070312, "epoch": 0.020298393994108144, "grad_norm": 2.363348867384236, "kl": 0.2734375, "learning_rate": 9.989837140240389e-07, "loss": 0.011, "reward": 1.1824373006820679, "reward_std": 0.41314831376075745, "rewards/accuracy_reward": 0.36243730783462524, "rewards/format_reward": 0.824999988079071, "step": 1068, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 536.5499877929688, "epoch": 0.02031739998099401, "grad_norm": 5.134794222810242, "kl": 1.1796875, "learning_rate": 9.989818106222795e-07, "loss": 0.0471, "reward": 1.1349999904632568, "reward_std": 0.7050307393074036, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.800000011920929, "step": 1069, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 439.0249938964844, "epoch": 0.020336405967879882, "grad_norm": 2.3327486514463343, "kl": 0.65625, "learning_rate": 9.98979905441564e-07, "loss": 0.0261, "reward": 1.9594318866729736, "reward_std": 0.20904293656349182, "rewards/accuracy_reward": 0.9306818246841431, "rewards/format_reward": 0.9750000238418579, "step": 1070, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 475.4750061035156, "epoch": 0.020355411954765753, "grad_norm": 2.406752671101741, "kl": 0.396484375, "learning_rate": 9.989779984818985e-07, "loss": 0.0158, "reward": 1.6237499713897705, "reward_std": 0.1582767367362976, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9750000238418579, "step": 1071, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 496.4750061035156, "epoch": 0.02037441794165162, "grad_norm": 3.1892577937015436, "kl": 0.41796875, "learning_rate": 9.989760897432903e-07, "loss": 0.0168, "reward": 1.641535997390747, "reward_std": 0.30127832293510437, "rewards/accuracy_reward": 0.5652860999107361, "rewards/format_reward": 0.9750000238418579, "step": 1072, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 510.8000183105469, "epoch": 0.02039342392853749, "grad_norm": 1.5675129417411724, "kl": 0.20703125, "learning_rate": 9.989741792257463e-07, "loss": 0.0083, "reward": 1.8320114612579346, "reward_std": 0.1766887903213501, "rewards/accuracy_reward": 0.7782613635063171, "rewards/format_reward": 1.0, "step": 1073, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 413.3999938964844, "epoch": 0.020412429915423358, "grad_norm": 1.7064393938049844, "kl": 0.25, "learning_rate": 9.98972266929273e-07, "loss": 0.01, "reward": 1.8130369186401367, "reward_std": 0.14584560692310333, "rewards/accuracy_reward": 0.7942870259284973, "rewards/format_reward": 1.0, "step": 1074, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 461.375, "epoch": 0.02043143590230923, "grad_norm": 4.7527380552141265, "kl": 0.265625, "learning_rate": 9.98970352853877e-07, "loss": 0.0106, "reward": 1.9015886783599854, "reward_std": 0.1914331018924713, "rewards/accuracy_reward": 0.8165885806083679, "rewards/format_reward": 0.9750000238418579, "step": 1075, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 497.0, "epoch": 0.020450441889195096, "grad_norm": 4.212975933640076, "kl": 0.2412109375, "learning_rate": 9.989684369995657e-07, "loss": 0.0097, "reward": 2.0439999103546143, "reward_std": 0.11628691107034683, "rewards/accuracy_reward": 0.9739999771118164, "rewards/format_reward": 1.0, "step": 1076, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 490.9250183105469, "epoch": 0.020469447876080966, "grad_norm": 1.1425370556000116, "kl": 0.11083984375, "learning_rate": 9.989665193663456e-07, "loss": 0.0044, "reward": 1.7450001239776611, "reward_std": 0.13763807713985443, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 1077, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 400.9250183105469, "epoch": 0.020488453862966834, "grad_norm": 24.909327096365107, "kl": 0.10595703125, "learning_rate": 9.989645999542236e-07, "loss": 0.0042, "reward": 1.8112499713897705, "reward_std": 0.25130271911621094, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1078, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 477.6000061035156, "epoch": 0.020507459849852704, "grad_norm": 1.6378639672917517, "kl": 0.1064453125, "learning_rate": 9.989626787632066e-07, "loss": 0.0043, "reward": 1.6247367858886719, "reward_std": 0.23722195625305176, "rewards/accuracy_reward": 0.5822367668151855, "rewards/format_reward": 1.0, "step": 1079, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 466.07501220703125, "epoch": 0.02052646583673857, "grad_norm": 2.131040517194741, "kl": 0.07763671875, "learning_rate": 9.989607557933011e-07, "loss": 0.0031, "reward": 1.7458094358444214, "reward_std": 0.14009727537631989, "rewards/accuracy_reward": 0.647059440612793, "rewards/format_reward": 1.0, "step": 1080, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 554.3250122070312, "epoch": 0.020545471823624442, "grad_norm": 3.6728755784435, "kl": 0.1240234375, "learning_rate": 9.989588310445145e-07, "loss": 0.005, "reward": 1.5125000476837158, "reward_std": 0.265155553817749, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 1081, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 519.7999877929688, "epoch": 0.02056447781051031, "grad_norm": 1.9976021894789202, "kl": 0.1201171875, "learning_rate": 9.989569045168534e-07, "loss": 0.0048, "reward": 1.806249976158142, "reward_std": 0.4982987344264984, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.949999988079071, "step": 1082, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 550.8500366210938, "epoch": 0.02058348379739618, "grad_norm": 11.867139146001337, "kl": 0.1328125, "learning_rate": 9.989549762103247e-07, "loss": 0.0053, "reward": 1.0172616243362427, "reward_std": 0.6514346599578857, "rewards/accuracy_reward": 0.35351163148880005, "rewards/format_reward": 0.7250000238418579, "step": 1083, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 484.3500061035156, "epoch": 0.020602489784282047, "grad_norm": 1.5716586463126754, "kl": 0.11865234375, "learning_rate": 9.98953046124935e-07, "loss": 0.0047, "reward": 1.772499918937683, "reward_std": 0.54538893699646, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 0.8500000238418579, "step": 1084, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 439.3500061035156, "epoch": 0.020621495771167918, "grad_norm": 1.4474047969208617, "kl": 0.080078125, "learning_rate": 9.989511142606918e-07, "loss": 0.0032, "reward": 1.5293375253677368, "reward_std": 0.44812893867492676, "rewards/accuracy_reward": 0.5855875015258789, "rewards/format_reward": 0.9000000357627869, "step": 1085, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 387.1499938964844, "epoch": 0.02064050175805379, "grad_norm": 1.5988116883734946, "kl": 0.08984375, "learning_rate": 9.989491806176011e-07, "loss": 0.0036, "reward": 1.8309376239776611, "reward_std": 0.02562147192656994, "rewards/accuracy_reward": 0.785937488079071, "rewards/format_reward": 1.0, "step": 1086, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.020659507744939656, "grad_norm": 2.1259623459317347, "kl": 0.103515625, "learning_rate": 9.989472451956706e-07, "loss": 0.0041, "reward": 2.0172619819641113, "reward_std": 0.11930018663406372, "rewards/accuracy_reward": 0.7922618985176086, "rewards/format_reward": 1.0, "step": 1087, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 395.70001220703125, "epoch": 0.020678513731825526, "grad_norm": 1.465230605082796, "kl": 0.087890625, "learning_rate": 9.989453079949071e-07, "loss": 0.0035, "reward": 1.6762501001358032, "reward_std": 0.3955805003643036, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 0.9750000238418579, "step": 1088, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 394.0249938964844, "epoch": 0.020697519718711394, "grad_norm": 1.9551105424298225, "kl": 0.08203125, "learning_rate": 9.98943369015317e-07, "loss": 0.0033, "reward": 1.9587501287460327, "reward_std": 0.23802022635936737, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 1.0, "step": 1089, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 380.9750061035156, "epoch": 0.020716525705597264, "grad_norm": 1.8469513508132582, "kl": 0.0869140625, "learning_rate": 9.989414282569076e-07, "loss": 0.0035, "reward": 1.7670704126358032, "reward_std": 0.08387457579374313, "rewards/accuracy_reward": 0.7258204817771912, "rewards/format_reward": 1.0, "step": 1090, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 431.75, "epoch": 0.02073553169248313, "grad_norm": 1.920501572608846, "kl": 0.0693359375, "learning_rate": 9.989394857196858e-07, "loss": 0.0028, "reward": 1.5712499618530273, "reward_std": 0.3352877199649811, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 0.9000000357627869, "step": 1091, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 383.0249938964844, "epoch": 0.020754537679369002, "grad_norm": 2.443963178631602, "kl": 0.11328125, "learning_rate": 9.989375414036584e-07, "loss": 0.0045, "reward": 1.7179073095321655, "reward_std": 0.24341915547847748, "rewards/accuracy_reward": 0.640407383441925, "rewards/format_reward": 1.0, "step": 1092, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 445.1000061035156, "epoch": 0.02077354366625487, "grad_norm": 1.7723624196992813, "kl": 0.06884765625, "learning_rate": 9.989355953088326e-07, "loss": 0.0028, "reward": 1.7637499570846558, "reward_std": 0.3484143614768982, "rewards/accuracy_reward": 0.762499988079071, "rewards/format_reward": 0.875, "step": 1093, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 413.25, "epoch": 0.02079254965314074, "grad_norm": 1.4968608366270706, "kl": 0.080078125, "learning_rate": 9.989336474352151e-07, "loss": 0.0032, "reward": 2.1112499237060547, "reward_std": 0.14834749698638916, "rewards/accuracy_reward": 0.9000000357627869, "rewards/format_reward": 1.0, "step": 1094, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 432.375, "epoch": 0.020811555640026607, "grad_norm": 1.7153893013050203, "kl": 0.0830078125, "learning_rate": 9.989316977828126e-07, "loss": 0.0033, "reward": 1.9839897155761719, "reward_std": 0.1398380547761917, "rewards/accuracy_reward": 0.7677397131919861, "rewards/format_reward": 1.0, "step": 1095, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 443.25, "epoch": 0.020830561626912478, "grad_norm": 1.5051392319753347, "kl": 0.08740234375, "learning_rate": 9.989297463516326e-07, "loss": 0.0035, "reward": 1.9412498474121094, "reward_std": 0.13574114441871643, "rewards/accuracy_reward": 0.7250000238418579, "rewards/format_reward": 1.0, "step": 1096, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 423.7749938964844, "epoch": 0.020849567613798345, "grad_norm": 1.9077565998329153, "kl": 0.10009765625, "learning_rate": 9.989277931416817e-07, "loss": 0.004, "reward": 1.8943119049072266, "reward_std": 0.22906899452209473, "rewards/accuracy_reward": 0.7243117690086365, "rewards/format_reward": 1.0, "step": 1097, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 459.8000183105469, "epoch": 0.020868573600684216, "grad_norm": 1.8276528565109738, "kl": 0.11865234375, "learning_rate": 9.98925838152967e-07, "loss": 0.0047, "reward": 1.9516490697860718, "reward_std": 0.19768182933330536, "rewards/accuracy_reward": 0.8328990340232849, "rewards/format_reward": 0.9750000238418579, "step": 1098, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 497.45001220703125, "epoch": 0.020887579587570086, "grad_norm": 2.2909597620079647, "kl": 0.091796875, "learning_rate": 9.989238813854953e-07, "loss": 0.0037, "reward": 1.4409477710723877, "reward_std": 0.5140753388404846, "rewards/accuracy_reward": 0.5121976733207703, "rewards/format_reward": 0.9000000357627869, "step": 1099, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 492.82501220703125, "epoch": 0.020906585574455953, "grad_norm": 2.2944242066760476, "kl": 0.1220703125, "learning_rate": 9.989219228392737e-07, "loss": 0.0049, "reward": 1.9307613372802734, "reward_std": 0.20309896767139435, "rewards/accuracy_reward": 0.8270112872123718, "rewards/format_reward": 1.0, "step": 1100, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 464.45001220703125, "epoch": 0.020925591561341824, "grad_norm": 2.8809098115632863, "kl": 0.126953125, "learning_rate": 9.989199625143094e-07, "loss": 0.0051, "reward": 1.6325000524520874, "reward_std": 0.40920862555503845, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.9750000238418579, "step": 1101, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 501.0, "epoch": 0.02094459754822769, "grad_norm": 1.8804940928958014, "kl": 0.126953125, "learning_rate": 9.989180004106091e-07, "loss": 0.0051, "reward": 1.4704875946044922, "reward_std": 0.42537155747413635, "rewards/accuracy_reward": 0.490487664937973, "rewards/format_reward": 0.9000000357627869, "step": 1102, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 507.625, "epoch": 0.020963603535113562, "grad_norm": 2.7341625579862483, "kl": 0.1484375, "learning_rate": 9.989160365281797e-07, "loss": 0.0059, "reward": 1.8605848550796509, "reward_std": 0.1214105635881424, "rewards/accuracy_reward": 0.7743348479270935, "rewards/format_reward": 1.0, "step": 1103, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 464.375, "epoch": 0.02098260952199943, "grad_norm": 17.81933693019855, "kl": 0.142578125, "learning_rate": 9.989140708670285e-07, "loss": 0.0057, "reward": 1.9149999618530273, "reward_std": 0.06461530178785324, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1104, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 493.3500061035156, "epoch": 0.0210016155088853, "grad_norm": 3.9765937206582684, "kl": 0.1884765625, "learning_rate": 9.989121034271626e-07, "loss": 0.0075, "reward": 1.8371429443359375, "reward_std": 0.4592036306858063, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.875, "step": 1105, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 482.3500061035156, "epoch": 0.021020621495771167, "grad_norm": 3.3013987507746707, "kl": 0.1611328125, "learning_rate": 9.989101342085884e-07, "loss": 0.0064, "reward": 1.9663803577423096, "reward_std": 0.14365266263484955, "rewards/accuracy_reward": 0.8563804626464844, "rewards/format_reward": 0.9750000238418579, "step": 1106, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 474.1000061035156, "epoch": 0.021039627482657038, "grad_norm": 2.1701200783763728, "kl": 0.1845703125, "learning_rate": 9.989081632113135e-07, "loss": 0.0074, "reward": 1.5840139389038086, "reward_std": 0.16018573939800262, "rewards/accuracy_reward": 0.5752639174461365, "rewards/format_reward": 1.0, "step": 1107, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 467.8500061035156, "epoch": 0.021058633469542905, "grad_norm": 3.1589717624970937, "kl": 0.185546875, "learning_rate": 9.989061904353447e-07, "loss": 0.0074, "reward": 1.340881586074829, "reward_std": 0.19195245206356049, "rewards/accuracy_reward": 0.35463154315948486, "rewards/format_reward": 0.949999988079071, "step": 1108, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 482.6000061035156, "epoch": 0.021077639456428775, "grad_norm": 2.8074066428685516, "kl": 0.259765625, "learning_rate": 9.98904215880689e-07, "loss": 0.0104, "reward": 1.7889518737792969, "reward_std": 0.2965729534626007, "rewards/accuracy_reward": 0.6464519500732422, "rewards/format_reward": 0.9750000238418579, "step": 1109, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 447.8999938964844, "epoch": 0.021096645443314643, "grad_norm": 2.530398401920556, "kl": 0.1845703125, "learning_rate": 9.989022395473538e-07, "loss": 0.0074, "reward": 1.8787498474121094, "reward_std": 0.42936381697654724, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 0.949999988079071, "step": 1110, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 458.45001220703125, "epoch": 0.021115651430200513, "grad_norm": 4.252170277531869, "kl": 0.18359375, "learning_rate": 9.989002614353456e-07, "loss": 0.0074, "reward": 2.06000018119812, "reward_std": 0.15157610177993774, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 1111, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 426.1750183105469, "epoch": 0.021134657417086384, "grad_norm": 2.8291493048761858, "kl": 0.275390625, "learning_rate": 9.988982815446717e-07, "loss": 0.011, "reward": 1.8688366413116455, "reward_std": 0.10286872833967209, "rewards/accuracy_reward": 0.708836555480957, "rewards/format_reward": 1.0, "step": 1112, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 480.6750183105469, "epoch": 0.02115366340397225, "grad_norm": 2.656584988188495, "kl": 0.1923828125, "learning_rate": 9.988962998753392e-07, "loss": 0.0077, "reward": 1.5665762424468994, "reward_std": 0.2611367404460907, "rewards/accuracy_reward": 0.572826087474823, "rewards/format_reward": 0.949999988079071, "step": 1113, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 469.0500183105469, "epoch": 0.021172669390858122, "grad_norm": 2.29502916119169, "kl": 0.3984375, "learning_rate": 9.988943164273552e-07, "loss": 0.0159, "reward": 2.1050591468811035, "reward_std": 0.22429104149341583, "rewards/accuracy_reward": 0.8375590443611145, "rewards/format_reward": 0.949999988079071, "step": 1114, "temporal_rewards": 1.0 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 441.6750183105469, "epoch": 0.02119167537774399, "grad_norm": 2.2118718515634046, "kl": 0.494140625, "learning_rate": 9.988923312007268e-07, "loss": 0.0198, "reward": 1.902500033378601, "reward_std": 0.22653710842132568, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 1115, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 470.32501220703125, "epoch": 0.02121068136462986, "grad_norm": 2.5601758006216837, "kl": 0.458984375, "learning_rate": 9.98890344195461e-07, "loss": 0.0184, "reward": 1.5993732213974, "reward_std": 0.42046108841896057, "rewards/accuracy_reward": 0.49437323212623596, "rewards/format_reward": 0.949999988079071, "step": 1116, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 410.4750061035156, "epoch": 0.021229687351515727, "grad_norm": 3.114194843852156, "kl": 0.482421875, "learning_rate": 9.988883554115645e-07, "loss": 0.0193, "reward": 1.6028660535812378, "reward_std": 0.07979714870452881, "rewards/accuracy_reward": 0.4991160035133362, "rewards/format_reward": 1.0, "step": 1117, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 457.20001220703125, "epoch": 0.021248693338401597, "grad_norm": 2.002613964714544, "kl": 0.28125, "learning_rate": 9.98886364849045e-07, "loss": 0.0112, "reward": 1.5487500429153442, "reward_std": 0.4187520146369934, "rewards/accuracy_reward": 0.5250000357627869, "rewards/format_reward": 0.9750000238418579, "step": 1118, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 441.6750183105469, "epoch": 0.021267699325287465, "grad_norm": 3.274407931048261, "kl": 0.484375, "learning_rate": 9.988843725079095e-07, "loss": 0.0194, "reward": 1.6832802295684814, "reward_std": 0.36162883043289185, "rewards/accuracy_reward": 0.603280246257782, "rewards/format_reward": 0.949999988079071, "step": 1119, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 418.8999938964844, "epoch": 0.021286705312173335, "grad_norm": 2.3955242037890514, "kl": 0.373046875, "learning_rate": 9.988823783881648e-07, "loss": 0.0149, "reward": 1.8356670141220093, "reward_std": 0.09992270916700363, "rewards/accuracy_reward": 0.6706671118736267, "rewards/format_reward": 1.0, "step": 1120, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 440.75, "epoch": 0.021305711299059202, "grad_norm": 3.742041206355109, "kl": 0.365234375, "learning_rate": 9.988803824898183e-07, "loss": 0.0146, "reward": 1.7740919589996338, "reward_std": 0.16810846328735352, "rewards/accuracy_reward": 0.6403418779373169, "rewards/format_reward": 0.9750000238418579, "step": 1121, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 428.3000183105469, "epoch": 0.021324717285945073, "grad_norm": 3.0274629931569965, "kl": 0.369140625, "learning_rate": 9.988783848128768e-07, "loss": 0.0148, "reward": 1.9174998998641968, "reward_std": 0.25878721475601196, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 1.0, "step": 1122, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 418.5, "epoch": 0.02134372327283094, "grad_norm": 3.0556221439357283, "kl": 0.609375, "learning_rate": 9.988763853573476e-07, "loss": 0.0243, "reward": 2.1110386848449707, "reward_std": 0.3558754026889801, "rewards/accuracy_reward": 0.859788715839386, "rewards/format_reward": 0.9750000238418579, "step": 1123, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 426.6499938964844, "epoch": 0.02136272925971681, "grad_norm": 2.823012185745552, "kl": 0.5078125, "learning_rate": 9.98874384123238e-07, "loss": 0.0204, "reward": 1.989999771118164, "reward_std": 0.1035362109541893, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9750000238418579, "step": 1124, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 422.3999938964844, "epoch": 0.021381735246602678, "grad_norm": 2.8385232890125316, "kl": 0.484375, "learning_rate": 9.98872381110555e-07, "loss": 0.0194, "reward": 1.7665491104125977, "reward_std": 0.1389819234609604, "rewards/accuracy_reward": 0.6152991056442261, "rewards/format_reward": 1.0, "step": 1125, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 474.70001220703125, "epoch": 0.02140074123348855, "grad_norm": 2.06715848172825, "kl": 0.33203125, "learning_rate": 9.988703763193054e-07, "loss": 0.0133, "reward": 1.8977149724960327, "reward_std": 0.4486086964607239, "rewards/accuracy_reward": 0.865215003490448, "rewards/format_reward": 0.925000011920929, "step": 1126, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 434.625, "epoch": 0.02141974722037442, "grad_norm": 3.558806967338625, "kl": 0.32421875, "learning_rate": 9.98868369749497e-07, "loss": 0.0129, "reward": 2.0890328884124756, "reward_std": 0.03455172851681709, "rewards/accuracy_reward": 0.8527830243110657, "rewards/format_reward": 1.0, "step": 1127, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 453.07501220703125, "epoch": 0.021438753207260287, "grad_norm": 2.194949637184403, "kl": 0.26953125, "learning_rate": 9.988663614011363e-07, "loss": 0.0108, "reward": 2.0461175441741943, "reward_std": 0.04294559359550476, "rewards/accuracy_reward": 0.8898676037788391, "rewards/format_reward": 1.0, "step": 1128, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 448.6750183105469, "epoch": 0.021457759194146157, "grad_norm": 2.5174553767161276, "kl": 0.30078125, "learning_rate": 9.98864351274231e-07, "loss": 0.012, "reward": 1.773172378540039, "reward_std": 0.3384849727153778, "rewards/accuracy_reward": 0.7331724166870117, "rewards/format_reward": 0.949999988079071, "step": 1129, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 469.7250061035156, "epoch": 0.021476765181032025, "grad_norm": 1.4912590548255154, "kl": 0.22265625, "learning_rate": 9.98862339368788e-07, "loss": 0.0089, "reward": 1.8037500381469727, "reward_std": 0.22611112892627716, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.949999988079071, "step": 1130, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 479.3999938964844, "epoch": 0.021495771167917895, "grad_norm": 2.105907871858483, "kl": 0.283203125, "learning_rate": 9.988603256848143e-07, "loss": 0.0113, "reward": 1.7814123630523682, "reward_std": 0.2664862871170044, "rewards/accuracy_reward": 0.6614122986793518, "rewards/format_reward": 1.0, "step": 1131, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 423.45001220703125, "epoch": 0.021514777154803762, "grad_norm": 2.5390173575463844, "kl": 0.3125, "learning_rate": 9.988583102223176e-07, "loss": 0.0125, "reward": 1.850250244140625, "reward_std": 0.04399308189749718, "rewards/accuracy_reward": 0.6902503371238708, "rewards/format_reward": 1.0, "step": 1132, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 404.5249938964844, "epoch": 0.021533783141689633, "grad_norm": 2.281713118447679, "kl": 0.66015625, "learning_rate": 9.988562929813045e-07, "loss": 0.0265, "reward": 1.479461669921875, "reward_std": 0.21519115567207336, "rewards/accuracy_reward": 0.3582117557525635, "rewards/format_reward": 0.9750000238418579, "step": 1133, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 457.1000061035156, "epoch": 0.0215527891285755, "grad_norm": 2.1119535085806445, "kl": 0.33984375, "learning_rate": 9.988542739617827e-07, "loss": 0.0136, "reward": 1.6164772510528564, "reward_std": 0.21216817200183868, "rewards/accuracy_reward": 0.5977272987365723, "rewards/format_reward": 0.949999988079071, "step": 1134, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 468.70001220703125, "epoch": 0.02157179511546137, "grad_norm": 2.4958872968807695, "kl": 0.396484375, "learning_rate": 9.98852253163759e-07, "loss": 0.0159, "reward": 2.032238721847534, "reward_std": 0.1964418888092041, "rewards/accuracy_reward": 0.8522385954856873, "rewards/format_reward": 0.9750000238418579, "step": 1135, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 456.6000061035156, "epoch": 0.021590801102347238, "grad_norm": 3.3600806794631204, "kl": 0.609375, "learning_rate": 9.98850230587241e-07, "loss": 0.0244, "reward": 1.694771647453308, "reward_std": 0.3171394169330597, "rewards/accuracy_reward": 0.6385214924812317, "rewards/format_reward": 0.925000011920929, "step": 1136, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 440.4750061035156, "epoch": 0.02160980708923311, "grad_norm": 3.0949135162270345, "kl": 0.56640625, "learning_rate": 9.988482062322355e-07, "loss": 0.0226, "reward": 1.4600000381469727, "reward_std": 0.47220584750175476, "rewards/accuracy_reward": 0.4000000059604645, "rewards/format_reward": 0.949999988079071, "step": 1137, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 451.20001220703125, "epoch": 0.021628813076118976, "grad_norm": 2.452416135525228, "kl": 0.8515625, "learning_rate": 9.988461800987497e-07, "loss": 0.0341, "reward": 1.3278422355651855, "reward_std": 0.27286624908447266, "rewards/accuracy_reward": 0.35909223556518555, "rewards/format_reward": 0.9750000238418579, "step": 1138, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 449.20001220703125, "epoch": 0.021647819063004847, "grad_norm": 2.1188206224876818, "kl": 0.58203125, "learning_rate": 9.988441521867915e-07, "loss": 0.0232, "reward": 1.7224998474121094, "reward_std": 0.4373147189617157, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 0.949999988079071, "step": 1139, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 478.9750061035156, "epoch": 0.021666825049890717, "grad_norm": 2.2507479390567324, "kl": 0.53125, "learning_rate": 9.988421224963672e-07, "loss": 0.0213, "reward": 1.5777565240859985, "reward_std": 0.43320542573928833, "rewards/accuracy_reward": 0.6177565455436707, "rewards/format_reward": 0.925000011920929, "step": 1140, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 454.3000183105469, "epoch": 0.021685831036776584, "grad_norm": 5.72497736751786, "kl": 0.7890625, "learning_rate": 9.988400910274849e-07, "loss": 0.0316, "reward": 1.3049999475479126, "reward_std": 0.622310221195221, "rewards/accuracy_reward": 0.4000000059604645, "rewards/format_reward": 0.9000000357627869, "step": 1141, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 442.9750061035156, "epoch": 0.021704837023662455, "grad_norm": 2.3336740721342903, "kl": 0.84375, "learning_rate": 9.98838057780151e-07, "loss": 0.0337, "reward": 1.6687500476837158, "reward_std": 0.5652045011520386, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 0.875, "step": 1142, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 447.25, "epoch": 0.021723843010548322, "grad_norm": 2.348345711262493, "kl": 0.625, "learning_rate": 9.988360227543734e-07, "loss": 0.0249, "reward": 1.9391344785690308, "reward_std": 0.4283779263496399, "rewards/accuracy_reward": 0.7903845906257629, "rewards/format_reward": 0.949999988079071, "step": 1143, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 430.6750183105469, "epoch": 0.021742848997434193, "grad_norm": 1.6655662038556005, "kl": 0.3671875, "learning_rate": 9.98833985950159e-07, "loss": 0.0147, "reward": 1.9451178312301636, "reward_std": 0.05423903465270996, "rewards/accuracy_reward": 0.8988677859306335, "rewards/format_reward": 1.0, "step": 1144, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 458.25, "epoch": 0.02176185498432006, "grad_norm": 3.536788262989706, "kl": 1.265625, "learning_rate": 9.988319473675153e-07, "loss": 0.0507, "reward": 1.3060330152511597, "reward_std": 0.5574919581413269, "rewards/accuracy_reward": 0.3922830820083618, "rewards/format_reward": 0.875, "step": 1145, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 446.7749938964844, "epoch": 0.02178086097120593, "grad_norm": 5.042838672476203, "kl": 1.671875, "learning_rate": 9.988299070064496e-07, "loss": 0.0667, "reward": 1.8337500095367432, "reward_std": 0.5980350375175476, "rewards/accuracy_reward": 0.8500000238418579, "rewards/format_reward": 0.8500000238418579, "step": 1146, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 447.3500061035156, "epoch": 0.021799866958091798, "grad_norm": 4.739213882568442, "kl": 3.078125, "learning_rate": 9.988278648669689e-07, "loss": 0.1235, "reward": 0.9837499856948853, "reward_std": 0.7462717890739441, "rewards/accuracy_reward": 0.42500001192092896, "rewards/format_reward": 0.45000001788139343, "step": 1147, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 406.3000183105469, "epoch": 0.02181887294497767, "grad_norm": 6.881088180730933, "kl": 4.21875, "learning_rate": 9.988258209490807e-07, "loss": 0.1693, "reward": 0.7211111187934875, "reward_std": 0.8187532424926758, "rewards/accuracy_reward": 0.28611111640930176, "rewards/format_reward": 0.4000000059604645, "step": 1148, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.7749938964844, "epoch": 0.021837878931863536, "grad_norm": 7.867300199676618, "kl": 4.21875, "learning_rate": 9.988237752527921e-07, "loss": 0.1685, "reward": 0.8793601393699646, "reward_std": 0.9862723350524902, "rewards/accuracy_reward": 0.4081101417541504, "rewards/format_reward": 0.4749999940395355, "step": 1149, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.3999938964844, "epoch": 0.021856884918749406, "grad_norm": 6.863023704504246, "kl": 4.03125, "learning_rate": 9.988217277781105e-07, "loss": 0.1609, "reward": 1.4275000095367432, "reward_std": 0.8714839816093445, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.699999988079071, "step": 1150, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 416.4250183105469, "epoch": 0.021875890905635274, "grad_norm": 2.6568467453200943, "kl": 2.390625, "learning_rate": 9.988196785250432e-07, "loss": 0.0957, "reward": 1.6962499618530273, "reward_std": 0.6650688052177429, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.875, "step": 1151, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 384.2749938964844, "epoch": 0.021894896892521144, "grad_norm": 2.91787701554552, "kl": 2.0625, "learning_rate": 9.988176274935976e-07, "loss": 0.0823, "reward": 1.8666852712631226, "reward_std": 0.32089829444885254, "rewards/accuracy_reward": 0.7391854524612427, "rewards/format_reward": 0.925000011920929, "step": 1152, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 440.0, "epoch": 0.02191390287940701, "grad_norm": 6.356761104993911, "kl": 0.48046875, "learning_rate": 9.98815574683781e-07, "loss": 0.0192, "reward": 1.890345573425293, "reward_std": 0.12729094922542572, "rewards/accuracy_reward": 0.6853455305099487, "rewards/format_reward": 1.0, "step": 1153, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.8, "all_wrong": 0.0, "completion_length": 413.8999938964844, "epoch": 0.021932908866292882, "grad_norm": 1.9157296553068588, "kl": 0.71875, "learning_rate": 9.988135200956004e-07, "loss": 0.0287, "reward": 2.0818288326263428, "reward_std": 0.050610002130270004, "rewards/accuracy_reward": 0.910578727722168, "rewards/format_reward": 1.0, "step": 1154, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 421.32501220703125, "epoch": 0.021951914853178753, "grad_norm": 3.4329642624347874, "kl": 1.3984375, "learning_rate": 9.988114637290635e-07, "loss": 0.056, "reward": 2.1162497997283936, "reward_std": 0.3431072533130646, "rewards/accuracy_reward": 0.949999988079071, "rewards/format_reward": 0.949999988079071, "step": 1155, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 380.5, "epoch": 0.02197092084006462, "grad_norm": 3.3599646843218895, "kl": 2.1875, "learning_rate": 9.988094055841774e-07, "loss": 0.0874, "reward": 1.5845874547958374, "reward_std": 0.26362115144729614, "rewards/accuracy_reward": 0.5158374905586243, "rewards/format_reward": 0.949999988079071, "step": 1156, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 453.7250061035156, "epoch": 0.02198992682695049, "grad_norm": 6.004458777358778, "kl": 2.46875, "learning_rate": 9.988073456609495e-07, "loss": 0.0989, "reward": 1.6548088788986206, "reward_std": 0.368537962436676, "rewards/accuracy_reward": 0.6173087358474731, "rewards/format_reward": 0.875, "step": 1157, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 417.7749938964844, "epoch": 0.022008932813836358, "grad_norm": 12.377396172306666, "kl": 5.84375, "learning_rate": 9.988052839593873e-07, "loss": 0.2342, "reward": 1.0857117176055908, "reward_std": 0.6150609850883484, "rewards/accuracy_reward": 0.2882117033004761, "rewards/format_reward": 0.7250000238418579, "step": 1158, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 347.0, "epoch": 0.02202793880072223, "grad_norm": 14.310417847636268, "kl": 5.65625, "learning_rate": 9.988032204794978e-07, "loss": 0.2258, "reward": 1.738639235496521, "reward_std": 0.41455432772636414, "rewards/accuracy_reward": 0.6486393213272095, "rewards/format_reward": 0.925000011920929, "step": 1159, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 361.6000061035156, "epoch": 0.022046944787608096, "grad_norm": 15.082462895525596, "kl": 6.1875, "learning_rate": 9.988011552212888e-07, "loss": 0.2469, "reward": 1.5087499618530273, "reward_std": 0.41775813698768616, "rewards/accuracy_reward": 0.45000001788139343, "rewards/format_reward": 0.949999988079071, "step": 1160, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.9250183105469, "epoch": 0.022065950774493966, "grad_norm": 5.934688730965424, "kl": 1.6875, "learning_rate": 9.987990881847675e-07, "loss": 0.0675, "reward": 1.6231422424316406, "reward_std": 0.15801259875297546, "rewards/accuracy_reward": 0.4693923592567444, "rewards/format_reward": 1.0, "step": 1161, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 420.3500061035156, "epoch": 0.022084956761379834, "grad_norm": 3.691650891548696, "kl": 2.484375, "learning_rate": 9.98797019369941e-07, "loss": 0.0995, "reward": 1.5292717218399048, "reward_std": 0.1003173366189003, "rewards/accuracy_reward": 0.4755217134952545, "rewards/format_reward": 0.9750000238418579, "step": 1162, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 448.6000061035156, "epoch": 0.022103962748265704, "grad_norm": 2.161277127845887, "kl": 0.62890625, "learning_rate": 9.98794948776817e-07, "loss": 0.0251, "reward": 1.969212532043457, "reward_std": 0.2199476808309555, "rewards/accuracy_reward": 0.8117125630378723, "rewards/format_reward": 1.0, "step": 1163, "temporal_rewards": 0.699999988079071 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 455.8999938964844, "epoch": 0.02212296873515157, "grad_norm": 3.0084445357306175, "kl": 0.38671875, "learning_rate": 9.987928764054027e-07, "loss": 0.0154, "reward": 2.341249942779541, "reward_std": 0.04742559790611267, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 1164, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 459.75, "epoch": 0.022141974722037442, "grad_norm": 2.534339030967863, "kl": 0.380859375, "learning_rate": 9.987908022557057e-07, "loss": 0.0153, "reward": 1.6424999237060547, "reward_std": 0.12985198199748993, "rewards/accuracy_reward": 0.574999988079071, "rewards/format_reward": 1.0, "step": 1165, "temporal_rewards": 0.5 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 445.625, "epoch": 0.02216098070892331, "grad_norm": 5.799787655731888, "kl": 0.16796875, "learning_rate": 9.98788726327733e-07, "loss": 0.0067, "reward": 1.6402705907821655, "reward_std": 0.24621747434139252, "rewards/accuracy_reward": 0.5752705931663513, "rewards/format_reward": 1.0, "step": 1166, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 467.70001220703125, "epoch": 0.02217998669580918, "grad_norm": 1.3333199001500888, "kl": 0.150390625, "learning_rate": 9.987866486214926e-07, "loss": 0.006, "reward": 2.009999990463257, "reward_std": 0.04696769639849663, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1167, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 468.5500183105469, "epoch": 0.02219899268269505, "grad_norm": 1.422607361320117, "kl": 0.1181640625, "learning_rate": 9.987845691369912e-07, "loss": 0.0047, "reward": 2.0349998474121094, "reward_std": 0.10494961589574814, "rewards/accuracy_reward": 0.824999988079071, "rewards/format_reward": 1.0, "step": 1168, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 446.1000061035156, "epoch": 0.022217998669580918, "grad_norm": 2.2260870129899684, "kl": 0.1875, "learning_rate": 9.987824878742369e-07, "loss": 0.0075, "reward": 1.6787575483322144, "reward_std": 0.05346319079399109, "rewards/accuracy_reward": 0.5237575173377991, "rewards/format_reward": 1.0, "step": 1169, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 425.2749938964844, "epoch": 0.02223700465646679, "grad_norm": 1.7441836081400246, "kl": 0.2138671875, "learning_rate": 9.987804048332366e-07, "loss": 0.0086, "reward": 1.505933165550232, "reward_std": 0.24260154366493225, "rewards/accuracy_reward": 0.5046830773353577, "rewards/format_reward": 1.0, "step": 1170, "temporal_rewards": 0.3999999761581421 }, { "all_correct": 0.2, "all_wrong": 0.0, "completion_length": 443.125, "epoch": 0.022256010643352656, "grad_norm": 3.1235419010677044, "kl": 0.212890625, "learning_rate": 9.987783200139978e-07, "loss": 0.0085, "reward": 1.6564706563949585, "reward_std": 0.10165625810623169, "rewards/accuracy_reward": 0.6177206635475159, "rewards/format_reward": 1.0, "step": 1171, "temporal_rewards": 0.5 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 448.875, "epoch": 0.022275016630238526, "grad_norm": 1.5302875316625966, "kl": 0.1748046875, "learning_rate": 9.987762334165284e-07, "loss": 0.007, "reward": 1.8300564289093018, "reward_std": 0.2543344497680664, "rewards/accuracy_reward": 0.7363064885139465, "rewards/format_reward": 1.0, "step": 1172, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 466.2250061035156, "epoch": 0.022294022617124393, "grad_norm": 1.843049217849662, "kl": 0.142578125, "learning_rate": 9.987741450408354e-07, "loss": 0.0057, "reward": 1.6622055768966675, "reward_std": 0.07972613722085953, "rewards/accuracy_reward": 0.5772055983543396, "rewards/format_reward": 1.0, "step": 1173, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.8, "all_wrong": 0.2, "completion_length": 479.5500183105469, "epoch": 0.022313028604010264, "grad_norm": 1.461850296625544, "kl": 0.158203125, "learning_rate": 9.98772054886926e-07, "loss": 0.0063, "reward": 2.049999952316284, "reward_std": 0.06527493894100189, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1174, "temporal_rewards": 0.8999999761581421 }, { "all_correct": 0.0, "all_wrong": 0.6, "completion_length": 475.6000061035156, "epoch": 0.02233203459089613, "grad_norm": 2.6610367192788806, "kl": 0.205078125, "learning_rate": 9.987699629548083e-07, "loss": 0.0082, "reward": 1.1446107625961304, "reward_std": 0.19837354123592377, "rewards/accuracy_reward": 0.13586071133613586, "rewards/format_reward": 1.0, "step": 1175, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 468.75, "epoch": 0.022351040577782002, "grad_norm": 1.8298904258351714, "kl": 0.2060546875, "learning_rate": 9.987678692444894e-07, "loss": 0.0082, "reward": 1.8398265838623047, "reward_std": 0.2500605583190918, "rewards/accuracy_reward": 0.6973266005516052, "rewards/format_reward": 1.0, "step": 1176, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 469.1000061035156, "epoch": 0.02237004656466787, "grad_norm": 1.645741629002132, "kl": 0.1357421875, "learning_rate": 9.98765773755977e-07, "loss": 0.0054, "reward": 1.693355917930603, "reward_std": 0.09225838631391525, "rewards/accuracy_reward": 0.5808559656143188, "rewards/format_reward": 1.0, "step": 1177, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 469.1499938964844, "epoch": 0.02238905255155374, "grad_norm": 1.5206825641172703, "kl": 0.10009765625, "learning_rate": 9.98763676489278e-07, "loss": 0.004, "reward": 1.6968517303466797, "reward_std": 0.07131198048591614, "rewards/accuracy_reward": 0.6518518328666687, "rewards/format_reward": 1.0, "step": 1178, "temporal_rewards": 0.5 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 445.3500061035156, "epoch": 0.022408058538439607, "grad_norm": 1.3005230258076648, "kl": 0.142578125, "learning_rate": 9.987615774444004e-07, "loss": 0.0057, "reward": 1.8824999332427979, "reward_std": 0.14569950103759766, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 1179, "temporal_rewards": 1.0 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 472.3999938964844, "epoch": 0.022427064525325478, "grad_norm": 2.2157074437726774, "kl": 0.15625, "learning_rate": 9.987594766213516e-07, "loss": 0.0063, "reward": 1.5982060432434082, "reward_std": 0.14768338203430176, "rewards/accuracy_reward": 0.5732061266899109, "rewards/format_reward": 0.9750000238418579, "step": 1180, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 483.82501220703125, "epoch": 0.022446070512211348, "grad_norm": 1.7886712817879944, "kl": 0.146484375, "learning_rate": 9.98757374020139e-07, "loss": 0.0058, "reward": 1.8199999332427979, "reward_std": 0.23715201020240784, "rewards/accuracy_reward": 0.699999988079071, "rewards/format_reward": 1.0, "step": 1181, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 464.7250061035156, "epoch": 0.022465076499097215, "grad_norm": 1.8719500152149386, "kl": 0.140625, "learning_rate": 9.987552696407702e-07, "loss": 0.0056, "reward": 2.0943691730499268, "reward_std": 0.04618341848254204, "rewards/accuracy_reward": 0.9281191229820251, "rewards/format_reward": 1.0, "step": 1182, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 474.7250061035156, "epoch": 0.022484082485983086, "grad_norm": 1.3671762467998057, "kl": 0.146484375, "learning_rate": 9.987531634832527e-07, "loss": 0.0059, "reward": 1.8374998569488525, "reward_std": 0.14663709700107574, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 1183, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.0, "completion_length": 501.82501220703125, "epoch": 0.022503088472868953, "grad_norm": 1.8856679504366625, "kl": 0.1298828125, "learning_rate": 9.987510555475938e-07, "loss": 0.0052, "reward": 1.750697374343872, "reward_std": 0.2980468273162842, "rewards/accuracy_reward": 0.6919474005699158, "rewards/format_reward": 0.9750000238418579, "step": 1184, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.2, "completion_length": 466.32501220703125, "epoch": 0.022522094459754824, "grad_norm": 1.1345719205040237, "kl": 0.1748046875, "learning_rate": 9.987489458338013e-07, "loss": 0.007, "reward": 1.8574999570846558, "reward_std": 0.12913735210895538, "rewards/accuracy_reward": 0.6500000357627869, "rewards/format_reward": 1.0, "step": 1185, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.0, "all_wrong": 0.4, "completion_length": 506.875, "epoch": 0.02254110044664069, "grad_norm": 1.7151041252666872, "kl": 0.15234375, "learning_rate": 9.987468343418823e-07, "loss": 0.0061, "reward": 1.1401817798614502, "reward_std": 0.14938972890377045, "rewards/accuracy_reward": 0.19393174350261688, "rewards/format_reward": 1.0, "step": 1186, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 458.5500183105469, "epoch": 0.022560106433526562, "grad_norm": 1.9190999280521972, "kl": 0.224609375, "learning_rate": 9.987447210718449e-07, "loss": 0.009, "reward": 2.083750009536743, "reward_std": 0.14773176610469818, "rewards/accuracy_reward": 0.925000011920929, "rewards/format_reward": 1.0, "step": 1187, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.2, "completion_length": 456.25, "epoch": 0.02257911242041243, "grad_norm": 2.7601744813281375, "kl": 0.39453125, "learning_rate": 9.98742606023696e-07, "loss": 0.0158, "reward": 1.7402499914169312, "reward_std": 0.17064407467842102, "rewards/accuracy_reward": 0.627750039100647, "rewards/format_reward": 0.9750000238418579, "step": 1188, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 468.75, "epoch": 0.0225981184072983, "grad_norm": 1.6961197264926566, "kl": 0.16015625, "learning_rate": 9.987404891974439e-07, "loss": 0.0064, "reward": 1.8000000715255737, "reward_std": 0.09896720945835114, "rewards/accuracy_reward": 0.6850000619888306, "rewards/format_reward": 1.0, "step": 1189, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.4, "completion_length": 457.1499938964844, "epoch": 0.022617124394184167, "grad_norm": 2.299641723237745, "kl": 0.291015625, "learning_rate": 9.987383705930954e-07, "loss": 0.0116, "reward": 1.5035418272018433, "reward_std": 0.10506286472082138, "rewards/accuracy_reward": 0.432291716337204, "rewards/format_reward": 1.0, "step": 1190, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.2, "all_wrong": 0.4, "completion_length": 447.20001220703125, "epoch": 0.022636130381070037, "grad_norm": 3.1215590241380466, "kl": 0.2431640625, "learning_rate": 9.987362502106586e-07, "loss": 0.0097, "reward": 1.6737499237060547, "reward_std": 0.1938600391149521, "rewards/accuracy_reward": 0.550000011920929, "rewards/format_reward": 1.0, "step": 1191, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 457.8500061035156, "epoch": 0.022655136367955905, "grad_norm": 1.5706659326588908, "kl": 0.267578125, "learning_rate": 9.987341280501407e-07, "loss": 0.0107, "reward": 1.491249918937683, "reward_std": 0.22266459465026855, "rewards/accuracy_reward": 0.4749999940395355, "rewards/format_reward": 0.9750000238418579, "step": 1192, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 436.0500183105469, "epoch": 0.022674142354841775, "grad_norm": 4.590543835937289, "kl": 0.306640625, "learning_rate": 9.987320041115495e-07, "loss": 0.0122, "reward": 2.011476993560791, "reward_std": 0.09533931314945221, "rewards/accuracy_reward": 0.7877270579338074, "rewards/format_reward": 1.0, "step": 1193, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.6, "all_wrong": 0.4, "completion_length": 412.2250061035156, "epoch": 0.022693148341727643, "grad_norm": 3.717987877086677, "kl": 0.2099609375, "learning_rate": 9.987298783948923e-07, "loss": 0.0084, "reward": 1.7487499713897705, "reward_std": 0.0325133316218853, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 1.0, "step": 1194, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 445.4250183105469, "epoch": 0.022712154328613513, "grad_norm": 1.550577133119749, "kl": 0.2021484375, "learning_rate": 9.98727750900177e-07, "loss": 0.0081, "reward": 1.651789665222168, "reward_std": 0.12180455029010773, "rewards/accuracy_reward": 0.5142897963523865, "rewards/format_reward": 1.0, "step": 1195, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 417.6750183105469, "epoch": 0.022731160315499384, "grad_norm": 2.6170553217661006, "kl": 0.3046875, "learning_rate": 9.98725621627411e-07, "loss": 0.0122, "reward": 1.8090852499008179, "reward_std": 0.2262856811285019, "rewards/accuracy_reward": 0.6578353643417358, "rewards/format_reward": 1.0, "step": 1196, "temporal_rewards": 0.699999988079071 }, { "all_correct": 0.4, "all_wrong": 0.2, "completion_length": 402.57501220703125, "epoch": 0.02275016630238525, "grad_norm": 3.1478659850181603, "kl": 0.5078125, "learning_rate": 9.98723490576602e-07, "loss": 0.0203, "reward": 1.850000023841858, "reward_std": 0.16654424369335175, "rewards/accuracy_reward": 0.637499988079071, "rewards/format_reward": 1.0, "step": 1197, "temporal_rewards": 0.7999999523162842 }, { "all_correct": 0.6, "all_wrong": 0.0, "completion_length": 398.8000183105469, "epoch": 0.02276917228927112, "grad_norm": 3.9870173336907477, "kl": 0.498046875, "learning_rate": 9.987213577477574e-07, "loss": 0.0199, "reward": 1.9662498235702515, "reward_std": 0.24570035934448242, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 1.0, "step": 1198, "temporal_rewards": 0.5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.82501220703125, "epoch": 0.02278817827615699, "grad_norm": 4.553674308518263, "kl": 0.427734375, "learning_rate": 9.987192231408851e-07, "loss": 0.0171, "reward": 1.947222113609314, "reward_std": 0.2639230191707611, "rewards/accuracy_reward": 0.8472222685813904, "rewards/format_reward": 1.0, "step": 1199, "temporal_rewards": 0.5999999642372131 }, { "all_correct": 0.0, "all_wrong": 0.2, "completion_length": 364.3000183105469, "epoch": 0.02280718426304286, "grad_norm": 3.1491021834644775, "kl": 0.318359375, "learning_rate": 9.987170867559924e-07, "loss": 0.0127, "reward": 1.329876184463501, "reward_std": 0.15625979006290436, "rewards/accuracy_reward": 0.23987610638141632, "rewards/format_reward": 1.0, "step": 1200, "temporal_rewards": 0.699999988079071 } ], "logging_steps": 1.0, "max_steps": 52615, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }