{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.965157329240215, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0061396776669224865, "grad_norm": 15.421392440795898, "learning_rate": 2.94478527607362e-06, "loss": 7.0576, "step": 10 }, { "epoch": 0.012279355333844973, "grad_norm": 9.991676330566406, "learning_rate": 6.625766871165644e-06, "loss": 6.1821, "step": 20 }, { "epoch": 0.01841903300076746, "grad_norm": 8.402695655822754, "learning_rate": 1.030674846625767e-05, "loss": 5.2081, "step": 30 }, { "epoch": 0.024558710667689946, "grad_norm": 8.112366676330566, "learning_rate": 1.3987730061349692e-05, "loss": 4.4492, "step": 40 }, { "epoch": 0.03069838833461243, "grad_norm": 8.540102005004883, "learning_rate": 1.766871165644172e-05, "loss": 3.698, "step": 50 }, { "epoch": 0.03683806600153492, "grad_norm": 11.69017505645752, "learning_rate": 2.1349693251533743e-05, "loss": 2.783, "step": 60 }, { "epoch": 0.042977743668457406, "grad_norm": 2.1856040954589844, "learning_rate": 2.5030674846625765e-05, "loss": 1.5322, "step": 70 }, { "epoch": 0.04911742133537989, "grad_norm": 1.7973095178604126, "learning_rate": 2.8711656441717793e-05, "loss": 0.9891, "step": 80 }, { "epoch": 0.05525709900230238, "grad_norm": 1.1040600538253784, "learning_rate": 3.239263803680982e-05, "loss": 0.7472, "step": 90 }, { "epoch": 0.06139677666922486, "grad_norm": 0.9011868238449097, "learning_rate": 3.607361963190184e-05, "loss": 0.6202, "step": 100 }, { "epoch": 0.06753645433614736, "grad_norm": 0.6866259574890137, "learning_rate": 3.975460122699386e-05, "loss": 0.5531, "step": 110 }, { "epoch": 0.07367613200306984, "grad_norm": 0.7291966676712036, "learning_rate": 4.3435582822085894e-05, "loss": 0.5087, "step": 120 }, { "epoch": 0.07981580966999233, "grad_norm": 0.6475340127944946, "learning_rate": 4.711656441717792e-05, "loss": 0.4658, "step": 130 }, { "epoch": 0.08595548733691481, "grad_norm": 0.8856902122497559, "learning_rate": 5.079754601226994e-05, "loss": 0.4465, "step": 140 }, { "epoch": 0.0920951650038373, "grad_norm": 0.6716117262840271, "learning_rate": 5.4478527607361964e-05, "loss": 0.4154, "step": 150 }, { "epoch": 0.09823484267075978, "grad_norm": 0.6893500089645386, "learning_rate": 5.815950920245399e-05, "loss": 0.4051, "step": 160 }, { "epoch": 0.10437452033768227, "grad_norm": 0.7085395455360413, "learning_rate": 5.9903069466882075e-05, "loss": 0.3903, "step": 170 }, { "epoch": 0.11051419800460476, "grad_norm": 0.5268082618713379, "learning_rate": 5.970920840064621e-05, "loss": 0.3727, "step": 180 }, { "epoch": 0.11665387567152724, "grad_norm": 0.5620772838592529, "learning_rate": 5.951534733441034e-05, "loss": 0.3656, "step": 190 }, { "epoch": 0.12279355333844973, "grad_norm": 0.5411626696586609, "learning_rate": 5.932148626817448e-05, "loss": 0.3527, "step": 200 }, { "epoch": 0.1289332310053722, "grad_norm": 0.5481147170066833, "learning_rate": 5.912762520193861e-05, "loss": 0.3538, "step": 210 }, { "epoch": 0.1350729086722947, "grad_norm": 0.5867046117782593, "learning_rate": 5.8933764135702745e-05, "loss": 0.3412, "step": 220 }, { "epoch": 0.14121258633921718, "grad_norm": 0.5871285200119019, "learning_rate": 5.873990306946688e-05, "loss": 0.3408, "step": 230 }, { "epoch": 0.14735226400613968, "grad_norm": 0.6050668358802795, "learning_rate": 5.8546042003231024e-05, "loss": 0.3352, "step": 240 }, { "epoch": 0.15349194167306215, "grad_norm": 0.5081577897071838, "learning_rate": 5.8352180936995157e-05, "loss": 0.3267, "step": 250 }, { "epoch": 0.15963161933998465, "grad_norm": 0.5677348971366882, "learning_rate": 5.815831987075929e-05, "loss": 0.3313, "step": 260 }, { "epoch": 0.16577129700690713, "grad_norm": 0.487120658159256, "learning_rate": 5.796445880452343e-05, "loss": 0.3229, "step": 270 }, { "epoch": 0.17191097467382963, "grad_norm": 0.58442622423172, "learning_rate": 5.777059773828756e-05, "loss": 0.3199, "step": 280 }, { "epoch": 0.1780506523407521, "grad_norm": 0.5962138772010803, "learning_rate": 5.7576736672051694e-05, "loss": 0.3154, "step": 290 }, { "epoch": 0.1841903300076746, "grad_norm": 0.49995115399360657, "learning_rate": 5.7382875605815834e-05, "loss": 0.3157, "step": 300 }, { "epoch": 0.1903300076745971, "grad_norm": 0.5417408347129822, "learning_rate": 5.718901453957997e-05, "loss": 0.3107, "step": 310 }, { "epoch": 0.19646968534151957, "grad_norm": 0.4579584300518036, "learning_rate": 5.6995153473344106e-05, "loss": 0.3074, "step": 320 }, { "epoch": 0.20260936300844207, "grad_norm": 0.47001534700393677, "learning_rate": 5.680129240710824e-05, "loss": 0.3048, "step": 330 }, { "epoch": 0.20874904067536454, "grad_norm": 0.4944261312484741, "learning_rate": 5.660743134087238e-05, "loss": 0.3056, "step": 340 }, { "epoch": 0.21488871834228704, "grad_norm": 0.504847526550293, "learning_rate": 5.641357027463651e-05, "loss": 0.3071, "step": 350 }, { "epoch": 0.2210283960092095, "grad_norm": 0.5896368622779846, "learning_rate": 5.621970920840064e-05, "loss": 0.2973, "step": 360 }, { "epoch": 0.227168073676132, "grad_norm": 0.4247358739376068, "learning_rate": 5.602584814216478e-05, "loss": 0.2993, "step": 370 }, { "epoch": 0.23330775134305448, "grad_norm": 0.44181105494499207, "learning_rate": 5.583198707592892e-05, "loss": 0.2976, "step": 380 }, { "epoch": 0.23944742900997698, "grad_norm": 0.44813117384910583, "learning_rate": 5.5638126009693055e-05, "loss": 0.2906, "step": 390 }, { "epoch": 0.24558710667689945, "grad_norm": 0.5210517644882202, "learning_rate": 5.544426494345719e-05, "loss": 0.2883, "step": 400 }, { "epoch": 0.25172678434382195, "grad_norm": 0.519637942314148, "learning_rate": 5.525040387722133e-05, "loss": 0.289, "step": 410 }, { "epoch": 0.2578664620107444, "grad_norm": 0.39594411849975586, "learning_rate": 5.505654281098546e-05, "loss": 0.2883, "step": 420 }, { "epoch": 0.2640061396776669, "grad_norm": 0.44631266593933105, "learning_rate": 5.486268174474959e-05, "loss": 0.2836, "step": 430 }, { "epoch": 0.2701458173445894, "grad_norm": 0.40706467628479004, "learning_rate": 5.466882067851374e-05, "loss": 0.2872, "step": 440 }, { "epoch": 0.2762854950115119, "grad_norm": 0.45857444405555725, "learning_rate": 5.447495961227787e-05, "loss": 0.2833, "step": 450 }, { "epoch": 0.28242517267843437, "grad_norm": 0.5002241730690002, "learning_rate": 5.4281098546042004e-05, "loss": 0.2835, "step": 460 }, { "epoch": 0.2885648503453569, "grad_norm": 0.41167810559272766, "learning_rate": 5.408723747980614e-05, "loss": 0.2859, "step": 470 }, { "epoch": 0.29470452801227937, "grad_norm": 0.4499143064022064, "learning_rate": 5.3893376413570276e-05, "loss": 0.283, "step": 480 }, { "epoch": 0.30084420567920184, "grad_norm": 0.4298258125782013, "learning_rate": 5.369951534733441e-05, "loss": 0.278, "step": 490 }, { "epoch": 0.3069838833461243, "grad_norm": 0.47322022914886475, "learning_rate": 5.350565428109855e-05, "loss": 0.2775, "step": 500 }, { "epoch": 0.31312356101304684, "grad_norm": 0.583753228187561, "learning_rate": 5.331179321486269e-05, "loss": 0.277, "step": 510 }, { "epoch": 0.3192632386799693, "grad_norm": 0.4866049885749817, "learning_rate": 5.311793214862682e-05, "loss": 0.2801, "step": 520 }, { "epoch": 0.3254029163468918, "grad_norm": 0.4380209445953369, "learning_rate": 5.2924071082390953e-05, "loss": 0.281, "step": 530 }, { "epoch": 0.33154259401381425, "grad_norm": 0.4283748269081116, "learning_rate": 5.2730210016155086e-05, "loss": 0.2761, "step": 540 }, { "epoch": 0.3376822716807368, "grad_norm": 0.423459529876709, "learning_rate": 5.2536348949919226e-05, "loss": 0.2794, "step": 550 }, { "epoch": 0.34382194934765925, "grad_norm": 0.41094839572906494, "learning_rate": 5.234248788368336e-05, "loss": 0.2729, "step": 560 }, { "epoch": 0.3499616270145817, "grad_norm": 0.43751445412635803, "learning_rate": 5.21486268174475e-05, "loss": 0.2718, "step": 570 }, { "epoch": 0.3561013046815042, "grad_norm": 0.44664233922958374, "learning_rate": 5.195476575121164e-05, "loss": 0.273, "step": 580 }, { "epoch": 0.3622409823484267, "grad_norm": 0.41911059617996216, "learning_rate": 5.176090468497577e-05, "loss": 0.2723, "step": 590 }, { "epoch": 0.3683806600153492, "grad_norm": 0.42424502968788147, "learning_rate": 5.15670436187399e-05, "loss": 0.2723, "step": 600 }, { "epoch": 0.37452033768227166, "grad_norm": 0.4342273473739624, "learning_rate": 5.137318255250404e-05, "loss": 0.27, "step": 610 }, { "epoch": 0.3806600153491942, "grad_norm": 0.4648367166519165, "learning_rate": 5.1179321486268175e-05, "loss": 0.269, "step": 620 }, { "epoch": 0.38679969301611666, "grad_norm": 0.4721224904060364, "learning_rate": 5.098546042003231e-05, "loss": 0.2728, "step": 630 }, { "epoch": 0.39293937068303914, "grad_norm": 0.4093610644340515, "learning_rate": 5.079159935379645e-05, "loss": 0.2692, "step": 640 }, { "epoch": 0.3990790483499616, "grad_norm": 0.41683274507522583, "learning_rate": 5.0597738287560586e-05, "loss": 0.2635, "step": 650 }, { "epoch": 0.40521872601688413, "grad_norm": 0.4177282154560089, "learning_rate": 5.040387722132472e-05, "loss": 0.267, "step": 660 }, { "epoch": 0.4113584036838066, "grad_norm": 0.3698485791683197, "learning_rate": 5.021001615508885e-05, "loss": 0.2701, "step": 670 }, { "epoch": 0.4174980813507291, "grad_norm": 0.3479083478450775, "learning_rate": 5.001615508885299e-05, "loss": 0.2663, "step": 680 }, { "epoch": 0.42363775901765155, "grad_norm": 0.44725146889686584, "learning_rate": 4.9822294022617124e-05, "loss": 0.2649, "step": 690 }, { "epoch": 0.4297774366845741, "grad_norm": 0.3550412952899933, "learning_rate": 4.9628432956381264e-05, "loss": 0.266, "step": 700 }, { "epoch": 0.43591711435149655, "grad_norm": 0.4252576529979706, "learning_rate": 4.9434571890145396e-05, "loss": 0.2643, "step": 710 }, { "epoch": 0.442056792018419, "grad_norm": 0.39061039686203003, "learning_rate": 4.9240710823909536e-05, "loss": 0.2621, "step": 720 }, { "epoch": 0.4481964696853415, "grad_norm": 0.4167734980583191, "learning_rate": 4.904684975767367e-05, "loss": 0.2643, "step": 730 }, { "epoch": 0.454336147352264, "grad_norm": 0.3748829662799835, "learning_rate": 4.88529886914378e-05, "loss": 0.268, "step": 740 }, { "epoch": 0.4604758250191865, "grad_norm": 0.35719943046569824, "learning_rate": 4.865912762520194e-05, "loss": 0.265, "step": 750 }, { "epoch": 0.46661550268610896, "grad_norm": 0.3589203357696533, "learning_rate": 4.846526655896607e-05, "loss": 0.2583, "step": 760 }, { "epoch": 0.4727551803530315, "grad_norm": 0.381046861410141, "learning_rate": 4.827140549273021e-05, "loss": 0.2613, "step": 770 }, { "epoch": 0.47889485801995396, "grad_norm": 0.46023252606391907, "learning_rate": 4.8077544426494345e-05, "loss": 0.2584, "step": 780 }, { "epoch": 0.48503453568687643, "grad_norm": 0.3704688549041748, "learning_rate": 4.7883683360258485e-05, "loss": 0.2573, "step": 790 }, { "epoch": 0.4911742133537989, "grad_norm": 0.3550325036048889, "learning_rate": 4.768982229402262e-05, "loss": 0.2638, "step": 800 }, { "epoch": 0.49731389102072143, "grad_norm": 0.3660726547241211, "learning_rate": 4.749596122778675e-05, "loss": 0.2614, "step": 810 }, { "epoch": 0.5034535686876439, "grad_norm": 0.3629954159259796, "learning_rate": 4.730210016155089e-05, "loss": 0.2589, "step": 820 }, { "epoch": 0.5095932463545664, "grad_norm": 0.3618892431259155, "learning_rate": 4.710823909531502e-05, "loss": 0.2625, "step": 830 }, { "epoch": 0.5157329240214888, "grad_norm": 0.4103853106498718, "learning_rate": 4.691437802907916e-05, "loss": 0.256, "step": 840 }, { "epoch": 0.5218726016884113, "grad_norm": 0.4005793631076813, "learning_rate": 4.67205169628433e-05, "loss": 0.2609, "step": 850 }, { "epoch": 0.5280122793553338, "grad_norm": 0.4035857617855072, "learning_rate": 4.6526655896607434e-05, "loss": 0.2602, "step": 860 }, { "epoch": 0.5341519570222564, "grad_norm": 0.3564916253089905, "learning_rate": 4.633279483037157e-05, "loss": 0.2599, "step": 870 }, { "epoch": 0.5402916346891788, "grad_norm": 0.3407823443412781, "learning_rate": 4.61389337641357e-05, "loss": 0.2566, "step": 880 }, { "epoch": 0.5464313123561013, "grad_norm": 0.42279985547065735, "learning_rate": 4.594507269789984e-05, "loss": 0.2596, "step": 890 }, { "epoch": 0.5525709900230238, "grad_norm": 0.3459627628326416, "learning_rate": 4.575121163166398e-05, "loss": 0.2579, "step": 900 }, { "epoch": 0.5587106676899463, "grad_norm": 0.3690173327922821, "learning_rate": 4.555735056542811e-05, "loss": 0.2555, "step": 910 }, { "epoch": 0.5648503453568687, "grad_norm": 0.3533875346183777, "learning_rate": 4.536348949919225e-05, "loss": 0.256, "step": 920 }, { "epoch": 0.5709900230237912, "grad_norm": 0.34665781259536743, "learning_rate": 4.516962843295638e-05, "loss": 0.2538, "step": 930 }, { "epoch": 0.5771297006907138, "grad_norm": 0.31980058550834656, "learning_rate": 4.4975767366720516e-05, "loss": 0.2581, "step": 940 }, { "epoch": 0.5832693783576363, "grad_norm": 0.35057130455970764, "learning_rate": 4.478190630048465e-05, "loss": 0.2566, "step": 950 }, { "epoch": 0.5894090560245587, "grad_norm": 0.32480597496032715, "learning_rate": 4.458804523424879e-05, "loss": 0.255, "step": 960 }, { "epoch": 0.5955487336914812, "grad_norm": 0.3633149564266205, "learning_rate": 4.439418416801293e-05, "loss": 0.2551, "step": 970 }, { "epoch": 0.6016884113584037, "grad_norm": 0.34456104040145874, "learning_rate": 4.420032310177706e-05, "loss": 0.253, "step": 980 }, { "epoch": 0.6078280890253261, "grad_norm": 0.3510631322860718, "learning_rate": 4.40064620355412e-05, "loss": 0.254, "step": 990 }, { "epoch": 0.6139677666922486, "grad_norm": 0.35537025332450867, "learning_rate": 4.381260096930533e-05, "loss": 0.2547, "step": 1000 }, { "epoch": 0.6201074443591711, "grad_norm": 0.36508992314338684, "learning_rate": 4.3618739903069465e-05, "loss": 0.2517, "step": 1010 }, { "epoch": 0.6262471220260937, "grad_norm": 0.383023202419281, "learning_rate": 4.3424878836833605e-05, "loss": 0.2532, "step": 1020 }, { "epoch": 0.6323867996930161, "grad_norm": 0.3511675298213959, "learning_rate": 4.323101777059774e-05, "loss": 0.2496, "step": 1030 }, { "epoch": 0.6385264773599386, "grad_norm": 0.39103734493255615, "learning_rate": 4.303715670436188e-05, "loss": 0.2502, "step": 1040 }, { "epoch": 0.6446661550268611, "grad_norm": 0.32976630330085754, "learning_rate": 4.284329563812601e-05, "loss": 0.2516, "step": 1050 }, { "epoch": 0.6508058326937836, "grad_norm": 0.3627534508705139, "learning_rate": 4.264943457189015e-05, "loss": 0.2538, "step": 1060 }, { "epoch": 0.656945510360706, "grad_norm": 0.37391820549964905, "learning_rate": 4.245557350565428e-05, "loss": 0.2528, "step": 1070 }, { "epoch": 0.6630851880276285, "grad_norm": 0.3327382802963257, "learning_rate": 4.2261712439418414e-05, "loss": 0.2523, "step": 1080 }, { "epoch": 0.6692248656945511, "grad_norm": 0.3251570463180542, "learning_rate": 4.2067851373182554e-05, "loss": 0.2509, "step": 1090 }, { "epoch": 0.6753645433614736, "grad_norm": 0.39253005385398865, "learning_rate": 4.1873990306946693e-05, "loss": 0.2507, "step": 1100 }, { "epoch": 0.681504221028396, "grad_norm": 0.33449676632881165, "learning_rate": 4.1680129240710826e-05, "loss": 0.2481, "step": 1110 }, { "epoch": 0.6876438986953185, "grad_norm": 0.37863296270370483, "learning_rate": 4.148626817447496e-05, "loss": 0.252, "step": 1120 }, { "epoch": 0.693783576362241, "grad_norm": 0.31935396790504456, "learning_rate": 4.12924071082391e-05, "loss": 0.2466, "step": 1130 }, { "epoch": 0.6999232540291634, "grad_norm": 0.313819020986557, "learning_rate": 4.109854604200323e-05, "loss": 0.251, "step": 1140 }, { "epoch": 0.7060629316960859, "grad_norm": 0.3724668622016907, "learning_rate": 4.0904684975767364e-05, "loss": 0.2518, "step": 1150 }, { "epoch": 0.7122026093630084, "grad_norm": 0.33299896121025085, "learning_rate": 4.07108239095315e-05, "loss": 0.2472, "step": 1160 }, { "epoch": 0.718342287029931, "grad_norm": 0.3538409173488617, "learning_rate": 4.051696284329564e-05, "loss": 0.2486, "step": 1170 }, { "epoch": 0.7244819646968534, "grad_norm": 0.3172655999660492, "learning_rate": 4.0323101777059775e-05, "loss": 0.2501, "step": 1180 }, { "epoch": 0.7306216423637759, "grad_norm": 0.3064083456993103, "learning_rate": 4.012924071082391e-05, "loss": 0.2453, "step": 1190 }, { "epoch": 0.7367613200306984, "grad_norm": 0.3342798352241516, "learning_rate": 3.993537964458805e-05, "loss": 0.2498, "step": 1200 }, { "epoch": 0.7429009976976209, "grad_norm": 0.31463661789894104, "learning_rate": 3.974151857835218e-05, "loss": 0.2446, "step": 1210 }, { "epoch": 0.7490406753645433, "grad_norm": 0.3608093559741974, "learning_rate": 3.954765751211631e-05, "loss": 0.2483, "step": 1220 }, { "epoch": 0.7551803530314658, "grad_norm": 0.3194938898086548, "learning_rate": 3.935379644588046e-05, "loss": 0.2493, "step": 1230 }, { "epoch": 0.7613200306983884, "grad_norm": 0.3166663944721222, "learning_rate": 3.915993537964459e-05, "loss": 0.2481, "step": 1240 }, { "epoch": 0.7674597083653109, "grad_norm": 0.36826014518737793, "learning_rate": 3.8966074313408725e-05, "loss": 0.2451, "step": 1250 }, { "epoch": 0.7735993860322333, "grad_norm": 0.31451407074928284, "learning_rate": 3.8772213247172864e-05, "loss": 0.2455, "step": 1260 }, { "epoch": 0.7797390636991558, "grad_norm": 0.3546365201473236, "learning_rate": 3.8578352180937e-05, "loss": 0.2417, "step": 1270 }, { "epoch": 0.7858787413660783, "grad_norm": 0.37739697098731995, "learning_rate": 3.838449111470113e-05, "loss": 0.2471, "step": 1280 }, { "epoch": 0.7920184190330007, "grad_norm": 0.29908350110054016, "learning_rate": 3.819063004846526e-05, "loss": 0.2502, "step": 1290 }, { "epoch": 0.7981580966999232, "grad_norm": 0.388713538646698, "learning_rate": 3.799676898222941e-05, "loss": 0.2447, "step": 1300 }, { "epoch": 0.8042977743668457, "grad_norm": 0.31080490350723267, "learning_rate": 3.780290791599354e-05, "loss": 0.2458, "step": 1310 }, { "epoch": 0.8104374520337683, "grad_norm": 0.30132901668548584, "learning_rate": 3.7609046849757674e-05, "loss": 0.2455, "step": 1320 }, { "epoch": 0.8165771297006907, "grad_norm": 0.3251103162765503, "learning_rate": 3.741518578352181e-05, "loss": 0.2454, "step": 1330 }, { "epoch": 0.8227168073676132, "grad_norm": 0.33006975054740906, "learning_rate": 3.7221324717285946e-05, "loss": 0.2466, "step": 1340 }, { "epoch": 0.8288564850345357, "grad_norm": 0.3195136785507202, "learning_rate": 3.702746365105008e-05, "loss": 0.2451, "step": 1350 }, { "epoch": 0.8349961627014582, "grad_norm": 0.33961763978004456, "learning_rate": 3.683360258481421e-05, "loss": 0.2451, "step": 1360 }, { "epoch": 0.8411358403683806, "grad_norm": 0.3274971544742584, "learning_rate": 3.663974151857836e-05, "loss": 0.2445, "step": 1370 }, { "epoch": 0.8472755180353031, "grad_norm": 0.3316924273967743, "learning_rate": 3.644588045234249e-05, "loss": 0.2421, "step": 1380 }, { "epoch": 0.8534151957022257, "grad_norm": 0.34864816069602966, "learning_rate": 3.625201938610662e-05, "loss": 0.2432, "step": 1390 }, { "epoch": 0.8595548733691482, "grad_norm": 0.34979552030563354, "learning_rate": 3.605815831987076e-05, "loss": 0.2445, "step": 1400 }, { "epoch": 0.8656945510360706, "grad_norm": 0.3480820655822754, "learning_rate": 3.5864297253634895e-05, "loss": 0.2423, "step": 1410 }, { "epoch": 0.8718342287029931, "grad_norm": 0.3144533038139343, "learning_rate": 3.567043618739903e-05, "loss": 0.2423, "step": 1420 }, { "epoch": 0.8779739063699156, "grad_norm": 0.35407504439353943, "learning_rate": 3.547657512116317e-05, "loss": 0.2453, "step": 1430 }, { "epoch": 0.884113584036838, "grad_norm": 0.3296087980270386, "learning_rate": 3.528271405492731e-05, "loss": 0.2433, "step": 1440 }, { "epoch": 0.8902532617037605, "grad_norm": 0.37147656083106995, "learning_rate": 3.508885298869144e-05, "loss": 0.243, "step": 1450 }, { "epoch": 0.896392939370683, "grad_norm": 0.291891485452652, "learning_rate": 3.489499192245557e-05, "loss": 0.2415, "step": 1460 }, { "epoch": 0.9025326170376056, "grad_norm": 0.28548404574394226, "learning_rate": 3.470113085621971e-05, "loss": 0.244, "step": 1470 }, { "epoch": 0.908672294704528, "grad_norm": 0.35448750853538513, "learning_rate": 3.4507269789983844e-05, "loss": 0.2419, "step": 1480 }, { "epoch": 0.9148119723714505, "grad_norm": 0.3614279329776764, "learning_rate": 3.431340872374798e-05, "loss": 0.2425, "step": 1490 }, { "epoch": 0.920951650038373, "grad_norm": 0.3446723520755768, "learning_rate": 3.411954765751212e-05, "loss": 0.245, "step": 1500 }, { "epoch": 0.9270913277052955, "grad_norm": 0.336374968290329, "learning_rate": 3.3925686591276256e-05, "loss": 0.2411, "step": 1510 }, { "epoch": 0.9332310053722179, "grad_norm": 0.31548479199409485, "learning_rate": 3.373182552504039e-05, "loss": 0.2414, "step": 1520 }, { "epoch": 0.9393706830391404, "grad_norm": 0.320119172334671, "learning_rate": 3.353796445880452e-05, "loss": 0.2397, "step": 1530 }, { "epoch": 0.945510360706063, "grad_norm": 0.3126417100429535, "learning_rate": 3.334410339256866e-05, "loss": 0.2386, "step": 1540 }, { "epoch": 0.9516500383729855, "grad_norm": 0.30536866188049316, "learning_rate": 3.3150242326332794e-05, "loss": 0.2413, "step": 1550 }, { "epoch": 0.9577897160399079, "grad_norm": 0.30234816670417786, "learning_rate": 3.2956381260096926e-05, "loss": 0.2385, "step": 1560 }, { "epoch": 0.9639293937068304, "grad_norm": 0.30176666378974915, "learning_rate": 3.276252019386107e-05, "loss": 0.2424, "step": 1570 }, { "epoch": 0.9700690713737529, "grad_norm": 0.2967808246612549, "learning_rate": 3.2568659127625205e-05, "loss": 0.2403, "step": 1580 }, { "epoch": 0.9762087490406753, "grad_norm": 0.287546843290329, "learning_rate": 3.237479806138934e-05, "loss": 0.2402, "step": 1590 }, { "epoch": 0.9823484267075978, "grad_norm": 0.32006338238716125, "learning_rate": 3.218093699515347e-05, "loss": 0.2416, "step": 1600 }, { "epoch": 0.9884881043745203, "grad_norm": 0.31201934814453125, "learning_rate": 3.198707592891761e-05, "loss": 0.2396, "step": 1610 }, { "epoch": 0.9946277820414429, "grad_norm": 0.34285658597946167, "learning_rate": 3.179321486268174e-05, "loss": 0.2411, "step": 1620 }, { "epoch": 1.0012279355333844, "grad_norm": 0.2902677655220032, "learning_rate": 3.159935379644588e-05, "loss": 0.2588, "step": 1630 }, { "epoch": 1.0073676132003069, "grad_norm": 0.3334469497203827, "learning_rate": 3.140549273021002e-05, "loss": 0.2412, "step": 1640 }, { "epoch": 1.0135072908672296, "grad_norm": 0.34331703186035156, "learning_rate": 3.1211631663974154e-05, "loss": 0.2376, "step": 1650 }, { "epoch": 1.019646968534152, "grad_norm": 0.30712443590164185, "learning_rate": 3.101777059773829e-05, "loss": 0.2382, "step": 1660 }, { "epoch": 1.0257866462010745, "grad_norm": 0.29111242294311523, "learning_rate": 3.082390953150243e-05, "loss": 0.2396, "step": 1670 }, { "epoch": 1.031926323867997, "grad_norm": 0.30055105686187744, "learning_rate": 3.063004846526656e-05, "loss": 0.2374, "step": 1680 }, { "epoch": 1.0380660015349195, "grad_norm": 0.2835744321346283, "learning_rate": 3.0436187399030692e-05, "loss": 0.2391, "step": 1690 }, { "epoch": 1.044205679201842, "grad_norm": 0.3579924404621124, "learning_rate": 3.0242326332794835e-05, "loss": 0.2366, "step": 1700 }, { "epoch": 1.0503453568687644, "grad_norm": 0.31649139523506165, "learning_rate": 3.0048465266558968e-05, "loss": 0.2386, "step": 1710 }, { "epoch": 1.0564850345356869, "grad_norm": 0.3274291157722473, "learning_rate": 2.9854604200323104e-05, "loss": 0.2367, "step": 1720 }, { "epoch": 1.0626247122026093, "grad_norm": 0.35450875759124756, "learning_rate": 2.966074313408724e-05, "loss": 0.2358, "step": 1730 }, { "epoch": 1.0687643898695318, "grad_norm": 0.31164485216140747, "learning_rate": 2.9466882067851372e-05, "loss": 0.2373, "step": 1740 }, { "epoch": 1.0749040675364543, "grad_norm": 0.29177117347717285, "learning_rate": 2.9273021001615512e-05, "loss": 0.2368, "step": 1750 }, { "epoch": 1.0810437452033768, "grad_norm": 0.31044426560401917, "learning_rate": 2.9079159935379645e-05, "loss": 0.236, "step": 1760 }, { "epoch": 1.0871834228702992, "grad_norm": 0.30381840467453003, "learning_rate": 2.888529886914378e-05, "loss": 0.2355, "step": 1770 }, { "epoch": 1.0933231005372217, "grad_norm": 0.3097009062767029, "learning_rate": 2.8691437802907917e-05, "loss": 0.2338, "step": 1780 }, { "epoch": 1.0994627782041442, "grad_norm": 0.2648329436779022, "learning_rate": 2.8497576736672053e-05, "loss": 0.2383, "step": 1790 }, { "epoch": 1.1056024558710669, "grad_norm": 0.29117581248283386, "learning_rate": 2.830371567043619e-05, "loss": 0.2358, "step": 1800 }, { "epoch": 1.1117421335379893, "grad_norm": 0.3057555556297302, "learning_rate": 2.810985460420032e-05, "loss": 0.2362, "step": 1810 }, { "epoch": 1.1178818112049118, "grad_norm": 0.27993032336235046, "learning_rate": 2.791599353796446e-05, "loss": 0.2354, "step": 1820 }, { "epoch": 1.1240214888718343, "grad_norm": 0.2994190752506256, "learning_rate": 2.7722132471728594e-05, "loss": 0.2345, "step": 1830 }, { "epoch": 1.1301611665387568, "grad_norm": 0.2987305223941803, "learning_rate": 2.752827140549273e-05, "loss": 0.2348, "step": 1840 }, { "epoch": 1.1363008442056792, "grad_norm": 0.3114520013332367, "learning_rate": 2.733441033925687e-05, "loss": 0.2362, "step": 1850 }, { "epoch": 1.1424405218726017, "grad_norm": 0.28420695662498474, "learning_rate": 2.7140549273021002e-05, "loss": 0.2371, "step": 1860 }, { "epoch": 1.1485801995395242, "grad_norm": 0.30027127265930176, "learning_rate": 2.6946688206785138e-05, "loss": 0.2387, "step": 1870 }, { "epoch": 1.1547198772064466, "grad_norm": 0.294156551361084, "learning_rate": 2.6752827140549274e-05, "loss": 0.2372, "step": 1880 }, { "epoch": 1.1608595548733691, "grad_norm": 0.3122671842575073, "learning_rate": 2.655896607431341e-05, "loss": 0.2358, "step": 1890 }, { "epoch": 1.1669992325402916, "grad_norm": 0.3033040761947632, "learning_rate": 2.6365105008077543e-05, "loss": 0.2394, "step": 1900 }, { "epoch": 1.173138910207214, "grad_norm": 0.32276102900505066, "learning_rate": 2.617124394184168e-05, "loss": 0.2321, "step": 1910 }, { "epoch": 1.1792785878741365, "grad_norm": 0.29448068141937256, "learning_rate": 2.597738287560582e-05, "loss": 0.2357, "step": 1920 }, { "epoch": 1.185418265541059, "grad_norm": 0.3151733875274658, "learning_rate": 2.578352180936995e-05, "loss": 0.2341, "step": 1930 }, { "epoch": 1.1915579432079815, "grad_norm": 0.28910940885543823, "learning_rate": 2.5589660743134087e-05, "loss": 0.2338, "step": 1940 }, { "epoch": 1.1976976208749042, "grad_norm": 0.3199194371700287, "learning_rate": 2.5395799676898223e-05, "loss": 0.2366, "step": 1950 }, { "epoch": 1.2038372985418266, "grad_norm": 0.31363335251808167, "learning_rate": 2.520193861066236e-05, "loss": 0.2333, "step": 1960 }, { "epoch": 1.209976976208749, "grad_norm": 0.27689889073371887, "learning_rate": 2.5008077544426496e-05, "loss": 0.232, "step": 1970 }, { "epoch": 1.2161166538756716, "grad_norm": 0.27902254462242126, "learning_rate": 2.4814216478190632e-05, "loss": 0.231, "step": 1980 }, { "epoch": 1.222256331542594, "grad_norm": 0.29046565294265747, "learning_rate": 2.4620355411954768e-05, "loss": 0.2343, "step": 1990 }, { "epoch": 1.2283960092095165, "grad_norm": 0.3037092387676239, "learning_rate": 2.44264943457189e-05, "loss": 0.2359, "step": 2000 }, { "epoch": 1.234535686876439, "grad_norm": 0.2904636561870575, "learning_rate": 2.4232633279483037e-05, "loss": 0.2351, "step": 2010 }, { "epoch": 1.2406753645433615, "grad_norm": 0.31070101261138916, "learning_rate": 2.4038772213247173e-05, "loss": 0.2332, "step": 2020 }, { "epoch": 1.246815042210284, "grad_norm": 0.2760045528411865, "learning_rate": 2.384491114701131e-05, "loss": 0.2344, "step": 2030 }, { "epoch": 1.2529547198772064, "grad_norm": 0.3140803575515747, "learning_rate": 2.3651050080775445e-05, "loss": 0.2381, "step": 2040 }, { "epoch": 1.2590943975441289, "grad_norm": 0.30985718965530396, "learning_rate": 2.345718901453958e-05, "loss": 0.2355, "step": 2050 }, { "epoch": 1.2652340752110514, "grad_norm": 0.2748771905899048, "learning_rate": 2.3263327948303717e-05, "loss": 0.2343, "step": 2060 }, { "epoch": 1.2713737528779738, "grad_norm": 0.2836357057094574, "learning_rate": 2.306946688206785e-05, "loss": 0.2355, "step": 2070 }, { "epoch": 1.2775134305448965, "grad_norm": 0.27377593517303467, "learning_rate": 2.287560581583199e-05, "loss": 0.2348, "step": 2080 }, { "epoch": 1.2836531082118188, "grad_norm": 0.29107093811035156, "learning_rate": 2.2681744749596125e-05, "loss": 0.2336, "step": 2090 }, { "epoch": 1.2897927858787415, "grad_norm": 0.29846909642219543, "learning_rate": 2.2487883683360258e-05, "loss": 0.2344, "step": 2100 }, { "epoch": 1.2959324635456637, "grad_norm": 0.3171651065349579, "learning_rate": 2.2294022617124394e-05, "loss": 0.2325, "step": 2110 }, { "epoch": 1.3020721412125864, "grad_norm": 0.3063752353191376, "learning_rate": 2.210016155088853e-05, "loss": 0.233, "step": 2120 }, { "epoch": 1.3082118188795089, "grad_norm": 0.32770147919654846, "learning_rate": 2.1906300484652666e-05, "loss": 0.2329, "step": 2130 }, { "epoch": 1.3143514965464314, "grad_norm": 0.2971561849117279, "learning_rate": 2.1712439418416802e-05, "loss": 0.234, "step": 2140 }, { "epoch": 1.3204911742133538, "grad_norm": 0.2993122935295105, "learning_rate": 2.151857835218094e-05, "loss": 0.2343, "step": 2150 }, { "epoch": 1.3266308518802763, "grad_norm": 0.2786669135093689, "learning_rate": 2.1324717285945075e-05, "loss": 0.2317, "step": 2160 }, { "epoch": 1.3327705295471988, "grad_norm": 0.284007728099823, "learning_rate": 2.1130856219709207e-05, "loss": 0.2313, "step": 2170 }, { "epoch": 1.3389102072141212, "grad_norm": 0.29710453748703003, "learning_rate": 2.0936995153473347e-05, "loss": 0.234, "step": 2180 }, { "epoch": 1.3450498848810437, "grad_norm": 0.28225216269493103, "learning_rate": 2.074313408723748e-05, "loss": 0.2346, "step": 2190 }, { "epoch": 1.3511895625479662, "grad_norm": 0.30834057927131653, "learning_rate": 2.0549273021001615e-05, "loss": 0.2298, "step": 2200 }, { "epoch": 1.3573292402148887, "grad_norm": 0.2734493911266327, "learning_rate": 2.035541195476575e-05, "loss": 0.231, "step": 2210 }, { "epoch": 1.3634689178818111, "grad_norm": 0.3022773265838623, "learning_rate": 2.0161550888529888e-05, "loss": 0.2319, "step": 2220 }, { "epoch": 1.3696085955487338, "grad_norm": 0.2787306606769562, "learning_rate": 1.9967689822294024e-05, "loss": 0.2327, "step": 2230 }, { "epoch": 1.375748273215656, "grad_norm": 0.27380722761154175, "learning_rate": 1.9773828756058156e-05, "loss": 0.2309, "step": 2240 }, { "epoch": 1.3818879508825788, "grad_norm": 0.26936137676239014, "learning_rate": 1.9579967689822296e-05, "loss": 0.2301, "step": 2250 }, { "epoch": 1.388027628549501, "grad_norm": 0.27485784888267517, "learning_rate": 1.9386106623586432e-05, "loss": 0.2313, "step": 2260 }, { "epoch": 1.3941673062164237, "grad_norm": 0.2874917685985565, "learning_rate": 1.9192245557350565e-05, "loss": 0.2301, "step": 2270 }, { "epoch": 1.4003069838833462, "grad_norm": 0.26473861932754517, "learning_rate": 1.8998384491114704e-05, "loss": 0.2302, "step": 2280 }, { "epoch": 1.4064466615502687, "grad_norm": 0.28767651319503784, "learning_rate": 1.8804523424878837e-05, "loss": 0.2321, "step": 2290 }, { "epoch": 1.4125863392171911, "grad_norm": 0.2758727967739105, "learning_rate": 1.8610662358642973e-05, "loss": 0.2344, "step": 2300 }, { "epoch": 1.4187260168841136, "grad_norm": 0.27313369512557983, "learning_rate": 1.8416801292407106e-05, "loss": 0.2321, "step": 2310 }, { "epoch": 1.424865694551036, "grad_norm": 0.2732943594455719, "learning_rate": 1.8222940226171245e-05, "loss": 0.2346, "step": 2320 }, { "epoch": 1.4310053722179585, "grad_norm": 0.28029924631118774, "learning_rate": 1.802907915993538e-05, "loss": 0.2306, "step": 2330 }, { "epoch": 1.437145049884881, "grad_norm": 0.30662721395492554, "learning_rate": 1.7835218093699514e-05, "loss": 0.2303, "step": 2340 }, { "epoch": 1.4432847275518035, "grad_norm": 0.2687036991119385, "learning_rate": 1.7641357027463653e-05, "loss": 0.2305, "step": 2350 }, { "epoch": 1.449424405218726, "grad_norm": 0.27795106172561646, "learning_rate": 1.7447495961227786e-05, "loss": 0.2311, "step": 2360 }, { "epoch": 1.4555640828856484, "grad_norm": 0.27534544467926025, "learning_rate": 1.7253634894991922e-05, "loss": 0.2295, "step": 2370 }, { "epoch": 1.4617037605525711, "grad_norm": 0.2723388671875, "learning_rate": 1.705977382875606e-05, "loss": 0.2332, "step": 2380 }, { "epoch": 1.4678434382194934, "grad_norm": 0.2775721251964569, "learning_rate": 1.6865912762520194e-05, "loss": 0.2287, "step": 2390 }, { "epoch": 1.473983115886416, "grad_norm": 0.2654743194580078, "learning_rate": 1.667205169628433e-05, "loss": 0.2315, "step": 2400 }, { "epoch": 1.4801227935533383, "grad_norm": 0.29545167088508606, "learning_rate": 1.6478190630048463e-05, "loss": 0.2324, "step": 2410 }, { "epoch": 1.486262471220261, "grad_norm": 0.27187272906303406, "learning_rate": 1.6284329563812603e-05, "loss": 0.2319, "step": 2420 }, { "epoch": 1.4924021488871835, "grad_norm": 0.2794046401977539, "learning_rate": 1.6090468497576735e-05, "loss": 0.2289, "step": 2430 }, { "epoch": 1.498541826554106, "grad_norm": 0.2789725959300995, "learning_rate": 1.589660743134087e-05, "loss": 0.2319, "step": 2440 }, { "epoch": 1.5046815042210284, "grad_norm": 0.2820490598678589, "learning_rate": 1.570274636510501e-05, "loss": 0.2305, "step": 2450 }, { "epoch": 1.510821181887951, "grad_norm": 0.292161762714386, "learning_rate": 1.5508885298869144e-05, "loss": 0.2311, "step": 2460 }, { "epoch": 1.5169608595548734, "grad_norm": 0.28120511770248413, "learning_rate": 1.531502423263328e-05, "loss": 0.2268, "step": 2470 }, { "epoch": 1.5231005372217958, "grad_norm": 0.26915067434310913, "learning_rate": 1.5121163166397417e-05, "loss": 0.2278, "step": 2480 }, { "epoch": 1.5292402148887183, "grad_norm": 0.2905220091342926, "learning_rate": 1.4927302100161552e-05, "loss": 0.2313, "step": 2490 }, { "epoch": 1.5353798925556408, "grad_norm": 0.2794709801673889, "learning_rate": 1.4733441033925686e-05, "loss": 0.2294, "step": 2500 }, { "epoch": 1.5415195702225635, "grad_norm": 0.2608950734138489, "learning_rate": 1.4539579967689822e-05, "loss": 0.2288, "step": 2510 }, { "epoch": 1.5476592478894857, "grad_norm": 0.25274738669395447, "learning_rate": 1.4345718901453958e-05, "loss": 0.2252, "step": 2520 }, { "epoch": 1.5537989255564084, "grad_norm": 0.2868223786354065, "learning_rate": 1.4151857835218094e-05, "loss": 0.2297, "step": 2530 }, { "epoch": 1.5599386032233307, "grad_norm": 0.2765255272388458, "learning_rate": 1.395799676898223e-05, "loss": 0.2295, "step": 2540 }, { "epoch": 1.5660782808902534, "grad_norm": 0.23713639378547668, "learning_rate": 1.3764135702746365e-05, "loss": 0.229, "step": 2550 }, { "epoch": 1.5722179585571756, "grad_norm": 0.2997082471847534, "learning_rate": 1.3570274636510501e-05, "loss": 0.227, "step": 2560 }, { "epoch": 1.5783576362240983, "grad_norm": 0.32743728160858154, "learning_rate": 1.3376413570274637e-05, "loss": 0.2277, "step": 2570 }, { "epoch": 1.5844973138910208, "grad_norm": 0.2513977885246277, "learning_rate": 1.3182552504038772e-05, "loss": 0.2285, "step": 2580 }, { "epoch": 1.5906369915579432, "grad_norm": 0.2621912956237793, "learning_rate": 1.298869143780291e-05, "loss": 0.231, "step": 2590 }, { "epoch": 1.5967766692248657, "grad_norm": 0.2549593448638916, "learning_rate": 1.2794830371567044e-05, "loss": 0.2285, "step": 2600 }, { "epoch": 1.6029163468917882, "grad_norm": 0.25694531202316284, "learning_rate": 1.260096930533118e-05, "loss": 0.229, "step": 2610 }, { "epoch": 1.6090560245587107, "grad_norm": 0.278401643037796, "learning_rate": 1.2407108239095316e-05, "loss": 0.231, "step": 2620 }, { "epoch": 1.6151957022256331, "grad_norm": 0.30101412534713745, "learning_rate": 1.221324717285945e-05, "loss": 0.2306, "step": 2630 }, { "epoch": 1.6213353798925556, "grad_norm": 0.25657519698143005, "learning_rate": 1.2019386106623586e-05, "loss": 0.2289, "step": 2640 }, { "epoch": 1.627475057559478, "grad_norm": 0.2672490179538727, "learning_rate": 1.1825525040387722e-05, "loss": 0.2278, "step": 2650 }, { "epoch": 1.6336147352264008, "grad_norm": 0.269938588142395, "learning_rate": 1.1631663974151859e-05, "loss": 0.2268, "step": 2660 }, { "epoch": 1.639754412893323, "grad_norm": 0.27414554357528687, "learning_rate": 1.1437802907915995e-05, "loss": 0.2292, "step": 2670 }, { "epoch": 1.6458940905602457, "grad_norm": 0.2716943323612213, "learning_rate": 1.1243941841680129e-05, "loss": 0.2257, "step": 2680 }, { "epoch": 1.652033768227168, "grad_norm": 0.29680389165878296, "learning_rate": 1.1050080775444265e-05, "loss": 0.2285, "step": 2690 }, { "epoch": 1.6581734458940907, "grad_norm": 0.25835490226745605, "learning_rate": 1.0856219709208401e-05, "loss": 0.2282, "step": 2700 }, { "epoch": 1.664313123561013, "grad_norm": 0.2611675560474396, "learning_rate": 1.0662358642972537e-05, "loss": 0.2265, "step": 2710 }, { "epoch": 1.6704528012279356, "grad_norm": 0.2636869251728058, "learning_rate": 1.0468497576736673e-05, "loss": 0.2262, "step": 2720 }, { "epoch": 1.676592478894858, "grad_norm": 0.276288777589798, "learning_rate": 1.0274636510500808e-05, "loss": 0.2299, "step": 2730 }, { "epoch": 1.6827321565617805, "grad_norm": 0.24938946962356567, "learning_rate": 1.0080775444264944e-05, "loss": 0.2268, "step": 2740 }, { "epoch": 1.688871834228703, "grad_norm": 0.24695712327957153, "learning_rate": 9.886914378029078e-06, "loss": 0.2282, "step": 2750 }, { "epoch": 1.6950115118956255, "grad_norm": 0.25521910190582275, "learning_rate": 9.693053311793216e-06, "loss": 0.2275, "step": 2760 }, { "epoch": 1.701151189562548, "grad_norm": 0.23849350214004517, "learning_rate": 9.499192245557352e-06, "loss": 0.2274, "step": 2770 }, { "epoch": 1.7072908672294704, "grad_norm": 0.2584358751773834, "learning_rate": 9.305331179321486e-06, "loss": 0.2309, "step": 2780 }, { "epoch": 1.713430544896393, "grad_norm": 0.25768372416496277, "learning_rate": 9.111470113085623e-06, "loss": 0.2293, "step": 2790 }, { "epoch": 1.7195702225633154, "grad_norm": 0.2615247070789337, "learning_rate": 8.917609046849757e-06, "loss": 0.225, "step": 2800 }, { "epoch": 1.725709900230238, "grad_norm": 0.2701919674873352, "learning_rate": 8.723747980613893e-06, "loss": 0.226, "step": 2810 }, { "epoch": 1.7318495778971603, "grad_norm": 0.25696951150894165, "learning_rate": 8.52988691437803e-06, "loss": 0.2273, "step": 2820 }, { "epoch": 1.737989255564083, "grad_norm": 0.2802889049053192, "learning_rate": 8.336025848142165e-06, "loss": 0.2269, "step": 2830 }, { "epoch": 1.7441289332310053, "grad_norm": 0.24956634640693665, "learning_rate": 8.142164781906301e-06, "loss": 0.2274, "step": 2840 }, { "epoch": 1.750268610897928, "grad_norm": 0.2676002085208893, "learning_rate": 7.948303715670436e-06, "loss": 0.2264, "step": 2850 }, { "epoch": 1.7564082885648502, "grad_norm": 0.23091745376586914, "learning_rate": 7.754442649434572e-06, "loss": 0.2284, "step": 2860 }, { "epoch": 1.762547966231773, "grad_norm": 0.2684544324874878, "learning_rate": 7.560581583198709e-06, "loss": 0.2259, "step": 2870 }, { "epoch": 1.7686876438986954, "grad_norm": 0.2851001024246216, "learning_rate": 7.366720516962843e-06, "loss": 0.2266, "step": 2880 }, { "epoch": 1.7748273215656178, "grad_norm": 0.25915250182151794, "learning_rate": 7.172859450726979e-06, "loss": 0.2287, "step": 2890 }, { "epoch": 1.7809669992325403, "grad_norm": 0.24940845370292664, "learning_rate": 6.978998384491115e-06, "loss": 0.2275, "step": 2900 }, { "epoch": 1.7871066768994628, "grad_norm": 0.2588519752025604, "learning_rate": 6.7851373182552505e-06, "loss": 0.229, "step": 2910 }, { "epoch": 1.7932463545663853, "grad_norm": 0.24955865740776062, "learning_rate": 6.591276252019386e-06, "loss": 0.2247, "step": 2920 }, { "epoch": 1.7993860322333077, "grad_norm": 0.26434391736984253, "learning_rate": 6.397415185783522e-06, "loss": 0.2269, "step": 2930 }, { "epoch": 1.8055257099002302, "grad_norm": 0.2523840069770813, "learning_rate": 6.203554119547658e-06, "loss": 0.2255, "step": 2940 }, { "epoch": 1.8116653875671527, "grad_norm": 0.26612043380737305, "learning_rate": 6.009693053311793e-06, "loss": 0.2255, "step": 2950 }, { "epoch": 1.8178050652340754, "grad_norm": 0.24991387128829956, "learning_rate": 5.815831987075929e-06, "loss": 0.2267, "step": 2960 }, { "epoch": 1.8239447429009976, "grad_norm": 0.25838539004325867, "learning_rate": 5.6219709208400645e-06, "loss": 0.228, "step": 2970 }, { "epoch": 1.8300844205679203, "grad_norm": 0.25465282797813416, "learning_rate": 5.428109854604201e-06, "loss": 0.2272, "step": 2980 }, { "epoch": 1.8362240982348426, "grad_norm": 0.25871556997299194, "learning_rate": 5.234248788368337e-06, "loss": 0.2278, "step": 2990 }, { "epoch": 1.8423637759017653, "grad_norm": 0.24951577186584473, "learning_rate": 5.040387722132472e-06, "loss": 0.2285, "step": 3000 }, { "epoch": 1.8485034535686875, "grad_norm": 0.25530776381492615, "learning_rate": 4.846526655896608e-06, "loss": 0.2242, "step": 3010 }, { "epoch": 1.8546431312356102, "grad_norm": 0.24456080794334412, "learning_rate": 4.652665589660743e-06, "loss": 0.2277, "step": 3020 }, { "epoch": 1.8607828089025327, "grad_norm": 0.24893827736377716, "learning_rate": 4.4588045234248785e-06, "loss": 0.2251, "step": 3030 }, { "epoch": 1.8669224865694551, "grad_norm": 0.2683740258216858, "learning_rate": 4.264943457189015e-06, "loss": 0.2263, "step": 3040 }, { "epoch": 1.8730621642363776, "grad_norm": 0.2723815143108368, "learning_rate": 4.071082390953151e-06, "loss": 0.2249, "step": 3050 }, { "epoch": 1.8792018419033, "grad_norm": 0.23197263479232788, "learning_rate": 3.877221324717286e-06, "loss": 0.2278, "step": 3060 }, { "epoch": 1.8853415195702226, "grad_norm": 0.24273467063903809, "learning_rate": 3.6833602584814216e-06, "loss": 0.2256, "step": 3070 }, { "epoch": 1.891481197237145, "grad_norm": 0.26177576184272766, "learning_rate": 3.4894991922455576e-06, "loss": 0.2264, "step": 3080 }, { "epoch": 1.8976208749040675, "grad_norm": 0.2266533076763153, "learning_rate": 3.295638126009693e-06, "loss": 0.2242, "step": 3090 }, { "epoch": 1.90376055257099, "grad_norm": 0.24498549103736877, "learning_rate": 3.101777059773829e-06, "loss": 0.2263, "step": 3100 }, { "epoch": 1.9099002302379127, "grad_norm": 0.23406417667865753, "learning_rate": 2.9079159935379646e-06, "loss": 0.2243, "step": 3110 }, { "epoch": 1.916039907904835, "grad_norm": 0.2536128759384155, "learning_rate": 2.7140549273021003e-06, "loss": 0.225, "step": 3120 }, { "epoch": 1.9221795855717576, "grad_norm": 0.22612562775611877, "learning_rate": 2.520193861066236e-06, "loss": 0.2266, "step": 3130 }, { "epoch": 1.9283192632386799, "grad_norm": 0.22622433304786682, "learning_rate": 2.3263327948303716e-06, "loss": 0.2258, "step": 3140 }, { "epoch": 1.9344589409056026, "grad_norm": 0.2482767254114151, "learning_rate": 2.1324717285945077e-06, "loss": 0.2281, "step": 3150 }, { "epoch": 1.9405986185725248, "grad_norm": 0.22862812876701355, "learning_rate": 1.938610662358643e-06, "loss": 0.2254, "step": 3160 }, { "epoch": 1.9467382962394475, "grad_norm": 0.22068631649017334, "learning_rate": 1.7447495961227788e-06, "loss": 0.2242, "step": 3170 }, { "epoch": 1.95287797390637, "grad_norm": 0.2349786013364792, "learning_rate": 1.5508885298869145e-06, "loss": 0.2281, "step": 3180 }, { "epoch": 1.9590176515732924, "grad_norm": 0.24406956136226654, "learning_rate": 1.3570274636510501e-06, "loss": 0.2274, "step": 3190 }, { "epoch": 1.965157329240215, "grad_norm": 0.2341851145029068, "learning_rate": 1.1631663974151858e-06, "loss": 0.2292, "step": 3200 } ], "logging_steps": 10, "max_steps": 3258, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.729291401165181e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }