diff --git "a/checkpoint-11000/trainer_state.json" "b/checkpoint-11000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-11000/trainer_state.json" @@ -0,0 +1,7734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4079098937667989, + "eval_steps": 500, + "global_step": 11000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01278772378516624, + "grad_norm": 83.28244018554688, + "learning_rate": 1.3499999999999998e-05, + "loss": 16.4089, + "step": 10 + }, + { + "epoch": 0.02557544757033248, + "grad_norm": 23.9244327545166, + "learning_rate": 2.8499999999999998e-05, + "loss": 12.7299, + "step": 20 + }, + { + "epoch": 0.03836317135549872, + "grad_norm": 9.106985092163086, + "learning_rate": 4.3499999999999993e-05, + "loss": 12.0294, + "step": 30 + }, + { + "epoch": 0.05115089514066496, + "grad_norm": 26.396886825561523, + "learning_rate": 5.85e-05, + "loss": 10.0869, + "step": 40 + }, + { + "epoch": 0.0639386189258312, + "grad_norm": 1.9576493501663208, + "learning_rate": 7.35e-05, + "loss": 8.6136, + "step": 50 + }, + { + "epoch": 0.07672634271099744, + "grad_norm": 1.866382360458374, + "learning_rate": 8.849999999999998e-05, + "loss": 8.4649, + "step": 60 + }, + { + "epoch": 0.08951406649616368, + "grad_norm": 2.786794662475586, + "learning_rate": 0.00010349999999999998, + "loss": 8.4592, + "step": 70 + }, + { + "epoch": 0.10230179028132992, + "grad_norm": 3.3617353439331055, + "learning_rate": 0.0001185, + "loss": 8.4249, + "step": 80 + }, + { + "epoch": 0.11508951406649616, + "grad_norm": 6.848003387451172, + "learning_rate": 0.0001335, + "loss": 8.2559, + "step": 90 + }, + { + "epoch": 0.1278772378516624, + "grad_norm": 7.901203632354736, + "learning_rate": 0.00014849999999999998, + "loss": 8.1613, + "step": 100 + }, + { + "epoch": 0.14066496163682865, + "grad_norm": 18.165496826171875, + "learning_rate": 0.0001635, + "loss": 7.7662, + "step": 110 + }, + { + "epoch": 0.1534526854219949, + "grad_norm": 8.8899507522583, + "learning_rate": 0.00017849999999999997, + "loss": 7.5454, + "step": 120 + }, + { + "epoch": 0.16624040920716113, + "grad_norm": 23.676000595092773, + "learning_rate": 0.0001935, + "loss": 7.7264, + "step": 130 + }, + { + "epoch": 0.17902813299232737, + "grad_norm": 20.63345718383789, + "learning_rate": 0.00020849999999999997, + "loss": 7.8592, + "step": 140 + }, + { + "epoch": 0.1918158567774936, + "grad_norm": 15.780885696411133, + "learning_rate": 0.00022349999999999998, + "loss": 7.5941, + "step": 150 + }, + { + "epoch": 0.20460358056265984, + "grad_norm": 60.87010955810547, + "learning_rate": 0.0002385, + "loss": 7.4103, + "step": 160 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 1284.3363037109375, + "learning_rate": 0.0002535, + "loss": 7.6876, + "step": 170 + }, + { + "epoch": 0.23017902813299232, + "grad_norm": 12.15369701385498, + "learning_rate": 0.00026849999999999997, + "loss": 7.3406, + "step": 180 + }, + { + "epoch": 0.24296675191815856, + "grad_norm": 155.99559020996094, + "learning_rate": 0.00028349999999999995, + "loss": 6.9872, + "step": 190 + }, + { + "epoch": 0.2557544757033248, + "grad_norm": 30.107227325439453, + "learning_rate": 0.0002985, + "loss": 7.1682, + "step": 200 + }, + { + "epoch": 0.26854219948849106, + "grad_norm": 4.686079502105713, + "learning_rate": 0.0002999997484922834, + "loss": 7.0612, + "step": 210 + }, + { + "epoch": 0.2813299232736573, + "grad_norm": 5.214747905731201, + "learning_rate": 0.0002999988790839756, + "loss": 6.7361, + "step": 220 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.2351861000061035, + "learning_rate": 0.00029999738867364154, + "loss": 6.625, + "step": 230 + }, + { + "epoch": 0.3069053708439898, + "grad_norm": 1.2042266130447388, + "learning_rate": 0.0002999952772674517, + "loss": 6.5884, + "step": 240 + }, + { + "epoch": 0.319693094629156, + "grad_norm": 15.000553131103516, + "learning_rate": 0.00029999254487414717, + "loss": 6.6537, + "step": 250 + }, + { + "epoch": 0.33248081841432225, + "grad_norm": 0.5562590956687927, + "learning_rate": 0.0002999891915050404, + "loss": 6.5825, + "step": 260 + }, + { + "epoch": 0.3452685421994885, + "grad_norm": 0.9475784301757812, + "learning_rate": 0.00029998521717401446, + "loss": 6.5398, + "step": 270 + }, + { + "epoch": 0.35805626598465473, + "grad_norm": 3.0192008018493652, + "learning_rate": 0.0002999806218975231, + "loss": 6.537, + "step": 280 + }, + { + "epoch": 0.37084398976982097, + "grad_norm": 0.6399412751197815, + "learning_rate": 0.0002999754056945911, + "loss": 6.5786, + "step": 290 + }, + { + "epoch": 0.3836317135549872, + "grad_norm": 48.2303466796875, + "learning_rate": 0.00029996956858681373, + "loss": 6.6661, + "step": 300 + }, + { + "epoch": 0.39641943734015345, + "grad_norm": 6.753386497497559, + "learning_rate": 0.00029996311059835696, + "loss": 6.5767, + "step": 310 + }, + { + "epoch": 0.4092071611253197, + "grad_norm": 0.7255383729934692, + "learning_rate": 0.000299956031755957, + "loss": 6.5298, + "step": 320 + }, + { + "epoch": 0.4219948849104859, + "grad_norm": 19.085235595703125, + "learning_rate": 0.0002999483320889207, + "loss": 6.6483, + "step": 330 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 7.604534149169922, + "learning_rate": 0.0002999400116291249, + "loss": 6.5788, + "step": 340 + }, + { + "epoch": 0.4475703324808184, + "grad_norm": 8.935670852661133, + "learning_rate": 0.0002999310704110168, + "loss": 6.5955, + "step": 350 + }, + { + "epoch": 0.46035805626598464, + "grad_norm": 5.167853832244873, + "learning_rate": 0.0002999215084716133, + "loss": 6.5713, + "step": 360 + }, + { + "epoch": 0.4731457800511509, + "grad_norm": 7.617280006408691, + "learning_rate": 0.00029991132585050146, + "loss": 6.5118, + "step": 370 + }, + { + "epoch": 0.4859335038363171, + "grad_norm": 1.1345467567443848, + "learning_rate": 0.00029990052258983764, + "loss": 6.532, + "step": 380 + }, + { + "epoch": 0.49872122762148335, + "grad_norm": 62.21474838256836, + "learning_rate": 0.0002998890987343478, + "loss": 6.8089, + "step": 390 + }, + { + "epoch": 0.5115089514066496, + "grad_norm": 18.671003341674805, + "learning_rate": 0.0002998770543313273, + "loss": 7.5951, + "step": 400 + }, + { + "epoch": 0.5242966751918159, + "grad_norm": 51.78537368774414, + "learning_rate": 0.0002998643894306404, + "loss": 7.2577, + "step": 410 + }, + { + "epoch": 0.5370843989769821, + "grad_norm": 3.8242099285125732, + "learning_rate": 0.00029985110408472046, + "loss": 6.9031, + "step": 420 + }, + { + "epoch": 0.5498721227621484, + "grad_norm": 0.8091524839401245, + "learning_rate": 0.0002998371983485693, + "loss": 6.513, + "step": 430 + }, + { + "epoch": 0.5626598465473146, + "grad_norm": 1.8598475456237793, + "learning_rate": 0.00029982267227975735, + "loss": 6.5526, + "step": 440 + }, + { + "epoch": 0.5754475703324808, + "grad_norm": 0.7141775488853455, + "learning_rate": 0.0002998075259384232, + "loss": 6.5129, + "step": 450 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.31634974479675293, + "learning_rate": 0.0002997917593872733, + "loss": 6.4868, + "step": 460 + }, + { + "epoch": 0.6010230179028133, + "grad_norm": 0.5466232299804688, + "learning_rate": 0.000299775372691582, + "loss": 6.4654, + "step": 470 + }, + { + "epoch": 0.6138107416879796, + "grad_norm": 0.4127891957759857, + "learning_rate": 0.0002997583659191908, + "loss": 6.454, + "step": 480 + }, + { + "epoch": 0.6265984654731458, + "grad_norm": 3.9770538806915283, + "learning_rate": 0.0002997407391405085, + "loss": 6.4464, + "step": 490 + }, + { + "epoch": 0.639386189258312, + "grad_norm": 0.9428657293319702, + "learning_rate": 0.00029972249242851093, + "loss": 6.3952, + "step": 500 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.7024242281913757, + "learning_rate": 0.0002997036258587401, + "loss": 6.3843, + "step": 510 + }, + { + "epoch": 0.6649616368286445, + "grad_norm": 0.7226349115371704, + "learning_rate": 0.0002996841395093045, + "loss": 6.3729, + "step": 520 + }, + { + "epoch": 0.6777493606138107, + "grad_norm": 1.3316643238067627, + "learning_rate": 0.0002996640334608785, + "loss": 6.3582, + "step": 530 + }, + { + "epoch": 0.690537084398977, + "grad_norm": 1.465959906578064, + "learning_rate": 0.000299643307796702, + "loss": 6.4178, + "step": 540 + }, + { + "epoch": 0.7033248081841432, + "grad_norm": 1.373779058456421, + "learning_rate": 0.00029962196260258024, + "loss": 6.3698, + "step": 550 + }, + { + "epoch": 0.7161125319693095, + "grad_norm": 1.058214545249939, + "learning_rate": 0.0002995999979668833, + "loss": 6.3461, + "step": 560 + }, + { + "epoch": 0.7289002557544757, + "grad_norm": 1.4381204843521118, + "learning_rate": 0.00029957741398054557, + "loss": 6.3417, + "step": 570 + }, + { + "epoch": 0.7416879795396419, + "grad_norm": 7.174375057220459, + "learning_rate": 0.0002995542107370659, + "loss": 6.4027, + "step": 580 + }, + { + "epoch": 0.7544757033248082, + "grad_norm": 1.1935175657272339, + "learning_rate": 0.0002995303883325067, + "loss": 6.4132, + "step": 590 + }, + { + "epoch": 0.7672634271099744, + "grad_norm": 0.6950662732124329, + "learning_rate": 0.0002995059468654939, + "loss": 6.3146, + "step": 600 + }, + { + "epoch": 0.7800511508951407, + "grad_norm": 4.71625280380249, + "learning_rate": 0.0002994808864372161, + "loss": 6.3259, + "step": 610 + }, + { + "epoch": 0.7928388746803069, + "grad_norm": 0.7315058708190918, + "learning_rate": 0.00029945520715142455, + "loss": 6.5004, + "step": 620 + }, + { + "epoch": 0.8056265984654731, + "grad_norm": 1.352614164352417, + "learning_rate": 0.00029942890911443284, + "loss": 6.3314, + "step": 630 + }, + { + "epoch": 0.8184143222506394, + "grad_norm": 1.55237877368927, + "learning_rate": 0.0002994019924351159, + "loss": 6.291, + "step": 640 + }, + { + "epoch": 0.8312020460358056, + "grad_norm": 1.7966560125350952, + "learning_rate": 0.0002993744572249099, + "loss": 6.3086, + "step": 650 + }, + { + "epoch": 0.8439897698209718, + "grad_norm": 0.62131667137146, + "learning_rate": 0.00029934630359781196, + "loss": 6.307, + "step": 660 + }, + { + "epoch": 0.8567774936061381, + "grad_norm": 1.4420467615127563, + "learning_rate": 0.00029931753167037933, + "loss": 6.295, + "step": 670 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.6070270538330078, + "learning_rate": 0.000299288141561729, + "loss": 6.2912, + "step": 680 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.7900067567825317, + "learning_rate": 0.00029925813339353737, + "loss": 6.2815, + "step": 690 + }, + { + "epoch": 0.8951406649616368, + "grad_norm": 0.3804131746292114, + "learning_rate": 0.0002992275072900396, + "loss": 6.2626, + "step": 700 + }, + { + "epoch": 0.907928388746803, + "grad_norm": 0.6143403053283691, + "learning_rate": 0.00029919626337802913, + "loss": 6.2581, + "step": 710 + }, + { + "epoch": 0.9207161125319693, + "grad_norm": 0.6377658247947693, + "learning_rate": 0.00029916440178685713, + "loss": 6.2671, + "step": 720 + }, + { + "epoch": 0.9335038363171355, + "grad_norm": 0.5696841478347778, + "learning_rate": 0.00029913192264843195, + "loss": 6.2527, + "step": 730 + }, + { + "epoch": 0.9462915601023018, + "grad_norm": 0.39249590039253235, + "learning_rate": 0.00029909882609721876, + "loss": 6.2406, + "step": 740 + }, + { + "epoch": 0.959079283887468, + "grad_norm": 0.6684032082557678, + "learning_rate": 0.0002990651122702387, + "loss": 6.2444, + "step": 750 + }, + { + "epoch": 0.9718670076726342, + "grad_norm": 0.7203615307807922, + "learning_rate": 0.00029903078130706846, + "loss": 6.2242, + "step": 760 + }, + { + "epoch": 0.9846547314578005, + "grad_norm": 1.0230414867401123, + "learning_rate": 0.00029899583334983985, + "loss": 6.1915, + "step": 770 + }, + { + "epoch": 0.9974424552429667, + "grad_norm": 1.015184760093689, + "learning_rate": 0.00029896026854323894, + "loss": 6.2248, + "step": 780 + }, + { + "epoch": 1.010230179028133, + "grad_norm": 0.6776460409164429, + "learning_rate": 0.0002989240870345056, + "loss": 6.2067, + "step": 790 + }, + { + "epoch": 1.0230179028132993, + "grad_norm": 0.4673589766025543, + "learning_rate": 0.0002988872889734328, + "loss": 6.1699, + "step": 800 + }, + { + "epoch": 1.0358056265984654, + "grad_norm": 0.4850061535835266, + "learning_rate": 0.0002988498745123663, + "loss": 6.1736, + "step": 810 + }, + { + "epoch": 1.0485933503836318, + "grad_norm": 0.6966080665588379, + "learning_rate": 0.0002988118438062036, + "loss": 6.1996, + "step": 820 + }, + { + "epoch": 1.061381074168798, + "grad_norm": 0.7484626770019531, + "learning_rate": 0.0002987731970123934, + "loss": 6.1527, + "step": 830 + }, + { + "epoch": 1.0741687979539642, + "grad_norm": 2.176851511001587, + "learning_rate": 0.0002987339342909352, + "loss": 6.1356, + "step": 840 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.634508490562439, + "learning_rate": 0.0002986940558043785, + "loss": 6.142, + "step": 850 + }, + { + "epoch": 1.0997442455242967, + "grad_norm": 0.6721930503845215, + "learning_rate": 0.00029865356171782196, + "loss": 6.1289, + "step": 860 + }, + { + "epoch": 1.1125319693094629, + "grad_norm": 0.5172439813613892, + "learning_rate": 0.0002986124521989128, + "loss": 6.1097, + "step": 870 + }, + { + "epoch": 1.1253196930946292, + "grad_norm": 0.609614372253418, + "learning_rate": 0.00029857072741784634, + "loss": 6.1196, + "step": 880 + }, + { + "epoch": 1.1381074168797953, + "grad_norm": 0.7491389513015747, + "learning_rate": 0.0002985283875473651, + "loss": 6.0896, + "step": 890 + }, + { + "epoch": 1.1508951406649617, + "grad_norm": 0.5810283422470093, + "learning_rate": 0.0002984854327627578, + "loss": 6.097, + "step": 900 + }, + { + "epoch": 1.1636828644501278, + "grad_norm": 0.691870391368866, + "learning_rate": 0.00029844186324185943, + "loss": 6.0979, + "step": 910 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.5209296345710754, + "learning_rate": 0.00029839767916504955, + "loss": 6.068, + "step": 920 + }, + { + "epoch": 1.1892583120204603, + "grad_norm": 1.1897755861282349, + "learning_rate": 0.0002983528807152523, + "loss": 6.0626, + "step": 930 + }, + { + "epoch": 1.2020460358056266, + "grad_norm": 0.6294477581977844, + "learning_rate": 0.0002983074680779352, + "loss": 6.0441, + "step": 940 + }, + { + "epoch": 1.2148337595907928, + "grad_norm": 0.7823428511619568, + "learning_rate": 0.0002982614414411086, + "loss": 6.0634, + "step": 950 + }, + { + "epoch": 1.227621483375959, + "grad_norm": 0.641527533531189, + "learning_rate": 0.00029821480099532485, + "loss": 6.0567, + "step": 960 + }, + { + "epoch": 1.2404092071611252, + "grad_norm": 1.0938187837600708, + "learning_rate": 0.00029816754693367745, + "loss": 6.0302, + "step": 970 + }, + { + "epoch": 1.2531969309462916, + "grad_norm": 2.3455729484558105, + "learning_rate": 0.0002981196794518003, + "loss": 6.0441, + "step": 980 + }, + { + "epoch": 1.265984654731458, + "grad_norm": 0.5875425934791565, + "learning_rate": 0.0002980711987478669, + "loss": 6.0277, + "step": 990 + }, + { + "epoch": 1.278772378516624, + "grad_norm": 1.0881823301315308, + "learning_rate": 0.0002980221050225896, + "loss": 6.0174, + "step": 1000 + }, + { + "epoch": 1.2915601023017902, + "grad_norm": 0.8842185735702515, + "learning_rate": 0.0002979723984792185, + "loss": 5.9959, + "step": 1010 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.7018517255783081, + "learning_rate": 0.0002979220793235409, + "loss": 6.0133, + "step": 1020 + }, + { + "epoch": 1.317135549872123, + "grad_norm": 1.0132428407669067, + "learning_rate": 0.0002978711477638804, + "loss": 5.9918, + "step": 1030 + }, + { + "epoch": 1.329923273657289, + "grad_norm": 0.8519907593727112, + "learning_rate": 0.00029781960401109597, + "loss": 6.0053, + "step": 1040 + }, + { + "epoch": 1.3427109974424551, + "grad_norm": 1.0255190134048462, + "learning_rate": 0.00029776744827858087, + "loss": 5.999, + "step": 1050 + }, + { + "epoch": 1.3554987212276215, + "grad_norm": 0.6434299945831299, + "learning_rate": 0.0002977146807822623, + "loss": 5.9846, + "step": 1060 + }, + { + "epoch": 1.3682864450127878, + "grad_norm": 0.6655330657958984, + "learning_rate": 0.0002976613017406, + "loss": 5.9611, + "step": 1070 + }, + { + "epoch": 1.381074168797954, + "grad_norm": 0.7621287703514099, + "learning_rate": 0.00029760731137458554, + "loss": 5.9486, + "step": 1080 + }, + { + "epoch": 1.39386189258312, + "grad_norm": 0.8872424960136414, + "learning_rate": 0.00029755270990774145, + "loss": 5.9409, + "step": 1090 + }, + { + "epoch": 1.4066496163682864, + "grad_norm": 0.978261411190033, + "learning_rate": 0.0002974974975661203, + "loss": 5.9434, + "step": 1100 + }, + { + "epoch": 1.4194373401534528, + "grad_norm": 0.9707337617874146, + "learning_rate": 0.0002974416745783035, + "loss": 5.9248, + "step": 1110 + }, + { + "epoch": 1.432225063938619, + "grad_norm": 1.0900273323059082, + "learning_rate": 0.0002973852411754008, + "loss": 5.9105, + "step": 1120 + }, + { + "epoch": 1.445012787723785, + "grad_norm": 3.00311017036438, + "learning_rate": 0.00029732819759104896, + "loss": 5.922, + "step": 1130 + }, + { + "epoch": 1.4578005115089514, + "grad_norm": 0.9423596858978271, + "learning_rate": 0.00029727054406141097, + "loss": 5.9285, + "step": 1140 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.7384818196296692, + "learning_rate": 0.000297212280825175, + "loss": 5.9222, + "step": 1150 + }, + { + "epoch": 1.4833759590792839, + "grad_norm": 1.1171971559524536, + "learning_rate": 0.0002971534081235535, + "loss": 5.9156, + "step": 1160 + }, + { + "epoch": 1.49616368286445, + "grad_norm": 0.7531259059906006, + "learning_rate": 0.00029709392620028195, + "loss": 5.9004, + "step": 1170 + }, + { + "epoch": 1.5089514066496164, + "grad_norm": 0.9441642761230469, + "learning_rate": 0.0002970338353016182, + "loss": 5.9076, + "step": 1180 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 1.2140872478485107, + "learning_rate": 0.00029697313567634133, + "loss": 5.8894, + "step": 1190 + }, + { + "epoch": 1.5345268542199488, + "grad_norm": 1.1109689474105835, + "learning_rate": 0.0002969118275757504, + "loss": 5.8663, + "step": 1200 + }, + { + "epoch": 1.547314578005115, + "grad_norm": 0.8246760964393616, + "learning_rate": 0.00029684991125366364, + "loss": 5.8749, + "step": 1210 + }, + { + "epoch": 1.5601023017902813, + "grad_norm": 0.744144856929779, + "learning_rate": 0.0002967873869664175, + "loss": 5.8384, + "step": 1220 + }, + { + "epoch": 1.5728900255754477, + "grad_norm": 1.149007797241211, + "learning_rate": 0.00029672425497286514, + "loss": 5.8582, + "step": 1230 + }, + { + "epoch": 1.5856777493606138, + "grad_norm": 0.7142267823219299, + "learning_rate": 0.00029666051553437595, + "loss": 5.8428, + "step": 1240 + }, + { + "epoch": 1.59846547314578, + "grad_norm": 1.0163673162460327, + "learning_rate": 0.000296596168914834, + "loss": 5.8399, + "step": 1250 + }, + { + "epoch": 1.6112531969309463, + "grad_norm": 1.5262479782104492, + "learning_rate": 0.0002965312153806371, + "loss": 5.8286, + "step": 1260 + }, + { + "epoch": 1.6240409207161126, + "grad_norm": 1.150258183479309, + "learning_rate": 0.0002964656552006959, + "loss": 5.8294, + "step": 1270 + }, + { + "epoch": 1.6368286445012787, + "grad_norm": 3.7115478515625, + "learning_rate": 0.0002963994886464323, + "loss": 5.8069, + "step": 1280 + }, + { + "epoch": 1.6496163682864449, + "grad_norm": 0.9176708459854126, + "learning_rate": 0.00029633271599177895, + "loss": 5.8503, + "step": 1290 + }, + { + "epoch": 1.6624040920716112, + "grad_norm": 0.7877741456031799, + "learning_rate": 0.00029626533751317744, + "loss": 5.8067, + "step": 1300 + }, + { + "epoch": 1.6751918158567776, + "grad_norm": 0.8420455455780029, + "learning_rate": 0.0002961973534895778, + "loss": 5.8071, + "step": 1310 + }, + { + "epoch": 1.6879795396419437, + "grad_norm": 1.0373153686523438, + "learning_rate": 0.0002961287642024368, + "loss": 5.8232, + "step": 1320 + }, + { + "epoch": 1.7007672634271098, + "grad_norm": 0.7778970003128052, + "learning_rate": 0.0002960595699357172, + "loss": 5.8029, + "step": 1330 + }, + { + "epoch": 1.7135549872122762, + "grad_norm": 0.77557772397995, + "learning_rate": 0.00029598977097588635, + "loss": 5.8092, + "step": 1340 + }, + { + "epoch": 1.7263427109974425, + "grad_norm": 1.099355697631836, + "learning_rate": 0.000295919367611915, + "loss": 5.8076, + "step": 1350 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 1.891553521156311, + "learning_rate": 0.00029584836013527623, + "loss": 5.7752, + "step": 1360 + }, + { + "epoch": 1.7519181585677748, + "grad_norm": 1.6321626901626587, + "learning_rate": 0.00029577674883994414, + "loss": 5.7776, + "step": 1370 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.969916582107544, + "learning_rate": 0.0002957045340223927, + "loss": 5.7882, + "step": 1380 + }, + { + "epoch": 1.7774936061381075, + "grad_norm": 0.7964078187942505, + "learning_rate": 0.0002956317159815944, + "loss": 5.7876, + "step": 1390 + }, + { + "epoch": 1.7902813299232738, + "grad_norm": 1.2642195224761963, + "learning_rate": 0.00029555829501901927, + "loss": 5.7629, + "step": 1400 + }, + { + "epoch": 1.80306905370844, + "grad_norm": 0.8739522099494934, + "learning_rate": 0.0002954842714386333, + "loss": 5.7302, + "step": 1410 + }, + { + "epoch": 1.815856777493606, + "grad_norm": 1.0974680185317993, + "learning_rate": 0.00029540964554689735, + "loss": 5.7658, + "step": 1420 + }, + { + "epoch": 1.8286445012787724, + "grad_norm": 1.1861488819122314, + "learning_rate": 0.000295334417652766, + "loss": 5.7617, + "step": 1430 + }, + { + "epoch": 1.8414322250639388, + "grad_norm": 1.473907470703125, + "learning_rate": 0.0002952585880676861, + "loss": 5.7425, + "step": 1440 + }, + { + "epoch": 1.854219948849105, + "grad_norm": 1.195039987564087, + "learning_rate": 0.0002951821571055954, + "loss": 5.7414, + "step": 1450 + }, + { + "epoch": 1.867007672634271, + "grad_norm": 1.2760419845581055, + "learning_rate": 0.00029510512508292155, + "loss": 5.7445, + "step": 1460 + }, + { + "epoch": 1.8797953964194374, + "grad_norm": 2.53118896484375, + "learning_rate": 0.0002950274923185806, + "loss": 5.7394, + "step": 1470 + }, + { + "epoch": 1.8925831202046037, + "grad_norm": 1.0948902368545532, + "learning_rate": 0.00029494925913397553, + "loss": 5.7318, + "step": 1480 + }, + { + "epoch": 1.9053708439897699, + "grad_norm": 1.546405553817749, + "learning_rate": 0.00029487042585299526, + "loss": 5.7352, + "step": 1490 + }, + { + "epoch": 1.918158567774936, + "grad_norm": 70.01959228515625, + "learning_rate": 0.0002947909928020131, + "loss": 5.9204, + "step": 1500 + }, + { + "epoch": 1.9309462915601023, + "grad_norm": 1859.7359619140625, + "learning_rate": 0.0002947109603098853, + "loss": 5.9433, + "step": 1510 + }, + { + "epoch": 1.9437340153452687, + "grad_norm": 75.3654556274414, + "learning_rate": 0.0002946303287079501, + "loss": 7.0599, + "step": 1520 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 1592.2181396484375, + "learning_rate": 0.00029454909833002583, + "loss": 8.1567, + "step": 1530 + }, + { + "epoch": 1.969309462915601, + "grad_norm": 2557.483154296875, + "learning_rate": 0.0002944672695124098, + "loss": 9.1084, + "step": 1540 + }, + { + "epoch": 1.9820971867007673, + "grad_norm": 564582.5625, + "learning_rate": 0.00029438484259387707, + "loss": 11.1384, + "step": 1550 + }, + { + "epoch": 1.9948849104859336, + "grad_norm": 83855.9765625, + "learning_rate": 0.0002943018179156787, + "loss": 15.2769, + "step": 1560 + }, + { + "epoch": 2.0076726342710995, + "grad_norm": 17.882341384887695, + "learning_rate": 0.0002942181958215405, + "loss": 10.5658, + "step": 1570 + }, + { + "epoch": 2.020460358056266, + "grad_norm": 7614.091796875, + "learning_rate": 0.0002941339766576618, + "loss": 8.1135, + "step": 1580 + }, + { + "epoch": 2.0332480818414322, + "grad_norm": 479.05926513671875, + "learning_rate": 0.0002940491607727135, + "loss": 18.0011, + "step": 1590 + }, + { + "epoch": 2.0460358056265986, + "grad_norm": 233.18682861328125, + "learning_rate": 0.00029396374851783723, + "loss": 8.1656, + "step": 1600 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 7.251436710357666, + "learning_rate": 0.0002938777402466435, + "loss": 9.6865, + "step": 1610 + }, + { + "epoch": 2.071611253196931, + "grad_norm": 13.384018898010254, + "learning_rate": 0.0002937911363152104, + "loss": 7.5508, + "step": 1620 + }, + { + "epoch": 2.084398976982097, + "grad_norm": 64.90263366699219, + "learning_rate": 0.00029370393708208205, + "loss": 6.7978, + "step": 1630 + }, + { + "epoch": 2.0971867007672635, + "grad_norm": 10.841496467590332, + "learning_rate": 0.00029361614290826705, + "loss": 6.589, + "step": 1640 + }, + { + "epoch": 2.10997442455243, + "grad_norm": 31.3606014251709, + "learning_rate": 0.00029352775415723733, + "loss": 6.5118, + "step": 1650 + }, + { + "epoch": 2.122762148337596, + "grad_norm": 35.56670379638672, + "learning_rate": 0.0002934387711949262, + "loss": 6.3938, + "step": 1660 + }, + { + "epoch": 2.135549872122762, + "grad_norm": 9.298479080200195, + "learning_rate": 0.00029334919438972707, + "loss": 6.3621, + "step": 1670 + }, + { + "epoch": 2.1483375959079285, + "grad_norm": 31.193031311035156, + "learning_rate": 0.0002932590241124918, + "loss": 6.3468, + "step": 1680 + }, + { + "epoch": 2.1611253196930944, + "grad_norm": 63.973716735839844, + "learning_rate": 0.0002931682607365296, + "loss": 6.358, + "step": 1690 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 14.983797073364258, + "learning_rate": 0.0002930769046376046, + "loss": 6.3489, + "step": 1700 + }, + { + "epoch": 2.186700767263427, + "grad_norm": 2612.127685546875, + "learning_rate": 0.00029298495619393527, + "loss": 6.3614, + "step": 1710 + }, + { + "epoch": 2.1994884910485935, + "grad_norm": 1995.3865966796875, + "learning_rate": 0.0002928924157861922, + "loss": 6.3641, + "step": 1720 + }, + { + "epoch": 2.21227621483376, + "grad_norm": 9.419896125793457, + "learning_rate": 0.0002927992837974968, + "loss": 6.3839, + "step": 1730 + }, + { + "epoch": 2.2250639386189257, + "grad_norm": 12.694928169250488, + "learning_rate": 0.00029270556061341966, + "loss": 6.395, + "step": 1740 + }, + { + "epoch": 2.237851662404092, + "grad_norm": 1460.1500244140625, + "learning_rate": 0.0002926112466219789, + "loss": 6.6829, + "step": 1750 + }, + { + "epoch": 2.2506393861892584, + "grad_norm": 11581.7744140625, + "learning_rate": 0.0002925163422136386, + "loss": 7.3051, + "step": 1760 + }, + { + "epoch": 2.2634271099744243, + "grad_norm": 1636.6549072265625, + "learning_rate": 0.0002924208477813074, + "loss": 7.1446, + "step": 1770 + }, + { + "epoch": 2.2762148337595907, + "grad_norm": 62.05595016479492, + "learning_rate": 0.0002923247637203362, + "loss": 6.7255, + "step": 1780 + }, + { + "epoch": 2.289002557544757, + "grad_norm": 1366.1842041015625, + "learning_rate": 0.00029222809042851755, + "loss": 7.1959, + "step": 1790 + }, + { + "epoch": 2.3017902813299234, + "grad_norm": 8.141697883605957, + "learning_rate": 0.0002921308283060831, + "loss": 6.8231, + "step": 1800 + }, + { + "epoch": 2.3145780051150897, + "grad_norm": 20.41695785522461, + "learning_rate": 0.0002920329777557023, + "loss": 6.6307, + "step": 1810 + }, + { + "epoch": 2.3273657289002556, + "grad_norm": 5.036224365234375, + "learning_rate": 0.0002919345391824809, + "loss": 6.3748, + "step": 1820 + }, + { + "epoch": 2.340153452685422, + "grad_norm": 1.9425593614578247, + "learning_rate": 0.00029183551299395873, + "loss": 6.3822, + "step": 1830 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 17.239046096801758, + "learning_rate": 0.00029173589960010887, + "loss": 6.4479, + "step": 1840 + }, + { + "epoch": 2.3657289002557547, + "grad_norm": 12.49085521697998, + "learning_rate": 0.00029163569941333513, + "loss": 6.3155, + "step": 1850 + }, + { + "epoch": 2.3785166240409206, + "grad_norm": 1.438551664352417, + "learning_rate": 0.0002915349128484706, + "loss": 6.3356, + "step": 1860 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 10.845519065856934, + "learning_rate": 0.0002914335403227763, + "loss": 6.3907, + "step": 1870 + }, + { + "epoch": 2.4040920716112533, + "grad_norm": 1.9113539457321167, + "learning_rate": 0.0002913315822559389, + "loss": 6.2959, + "step": 1880 + }, + { + "epoch": 2.4168797953964196, + "grad_norm": 1.137809157371521, + "learning_rate": 0.0002912290390700694, + "loss": 6.2159, + "step": 1890 + }, + { + "epoch": 2.4296675191815855, + "grad_norm": 6.75790548324585, + "learning_rate": 0.0002911259111897011, + "loss": 6.1437, + "step": 1900 + }, + { + "epoch": 2.442455242966752, + "grad_norm": 13.505289077758789, + "learning_rate": 0.000291022199041788, + "loss": 6.1807, + "step": 1910 + }, + { + "epoch": 2.455242966751918, + "grad_norm": 1.1891614198684692, + "learning_rate": 0.000290917903055703, + "loss": 6.0738, + "step": 1920 + }, + { + "epoch": 2.4680306905370846, + "grad_norm": 9.028098106384277, + "learning_rate": 0.0002908130236632361, + "loss": 6.0556, + "step": 1930 + }, + { + "epoch": 2.4808184143222505, + "grad_norm": 41.855743408203125, + "learning_rate": 0.0002907075612985928, + "loss": 6.0713, + "step": 1940 + }, + { + "epoch": 2.493606138107417, + "grad_norm": 5.515649795532227, + "learning_rate": 0.00029060151639839174, + "loss": 6.0796, + "step": 1950 + }, + { + "epoch": 2.506393861892583, + "grad_norm": 740.7005004882812, + "learning_rate": 0.0002904948894016637, + "loss": 6.7849, + "step": 1960 + }, + { + "epoch": 2.5191815856777495, + "grad_norm": 100.34882354736328, + "learning_rate": 0.0002903876807498491, + "loss": 7.0059, + "step": 1970 + }, + { + "epoch": 2.531969309462916, + "grad_norm": 105.65509033203125, + "learning_rate": 0.0002902798908867966, + "loss": 6.3334, + "step": 1980 + }, + { + "epoch": 2.544757033248082, + "grad_norm": 20.092199325561523, + "learning_rate": 0.0002901715202587609, + "loss": 6.376, + "step": 1990 + }, + { + "epoch": 2.557544757033248, + "grad_norm": 4.902982711791992, + "learning_rate": 0.0002900625693144013, + "loss": 6.3327, + "step": 2000 + }, + { + "epoch": 2.5703324808184145, + "grad_norm": 4.835528373718262, + "learning_rate": 0.00028995303850477957, + "loss": 6.2468, + "step": 2010 + }, + { + "epoch": 2.5831202046035804, + "grad_norm": 3.2727978229522705, + "learning_rate": 0.00028984292828335807, + "loss": 6.1977, + "step": 2020 + }, + { + "epoch": 2.5959079283887467, + "grad_norm": 0.9278408288955688, + "learning_rate": 0.00028973223910599803, + "loss": 6.0873, + "step": 2030 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.6826826333999634, + "learning_rate": 0.0002896209714309577, + "loss": 5.9931, + "step": 2040 + }, + { + "epoch": 2.6214833759590794, + "grad_norm": 0.9041231274604797, + "learning_rate": 0.0002895091257188899, + "loss": 5.9545, + "step": 2050 + }, + { + "epoch": 2.634271099744246, + "grad_norm": 0.9247362017631531, + "learning_rate": 0.00028939670243284115, + "loss": 5.9662, + "step": 2060 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 1.4400997161865234, + "learning_rate": 0.00028928370203824865, + "loss": 5.9395, + "step": 2070 + }, + { + "epoch": 2.659846547314578, + "grad_norm": 1.0552617311477661, + "learning_rate": 0.00028917012500293915, + "loss": 5.9204, + "step": 2080 + }, + { + "epoch": 2.6726342710997444, + "grad_norm": 0.743399977684021, + "learning_rate": 0.0002890559717971266, + "loss": 5.9061, + "step": 2090 + }, + { + "epoch": 2.6854219948849103, + "grad_norm": 49.769935607910156, + "learning_rate": 0.0002889412428934104, + "loss": 6.8681, + "step": 2100 + }, + { + "epoch": 2.6982097186700766, + "grad_norm": 1.0962203741073608, + "learning_rate": 0.0002888259387667732, + "loss": 6.0047, + "step": 2110 + }, + { + "epoch": 2.710997442455243, + "grad_norm": 0.7588474154472351, + "learning_rate": 0.0002887100598945793, + "loss": 5.946, + "step": 2120 + }, + { + "epoch": 2.7237851662404093, + "grad_norm": 1.135327935218811, + "learning_rate": 0.0002885936067565723, + "loss": 5.917, + "step": 2130 + }, + { + "epoch": 2.7365728900255757, + "grad_norm": 1.0815144777297974, + "learning_rate": 0.00028847657983487344, + "loss": 5.8928, + "step": 2140 + }, + { + "epoch": 2.7493606138107416, + "grad_norm": 0.9049263000488281, + "learning_rate": 0.0002883589796139794, + "loss": 5.8849, + "step": 2150 + }, + { + "epoch": 2.762148337595908, + "grad_norm": 1.9964706897735596, + "learning_rate": 0.00028824080658076024, + "loss": 5.9063, + "step": 2160 + }, + { + "epoch": 2.7749360613810743, + "grad_norm": 1.1962535381317139, + "learning_rate": 0.00028812206122445766, + "loss": 5.8739, + "step": 2170 + }, + { + "epoch": 2.78772378516624, + "grad_norm": 0.9967917203903198, + "learning_rate": 0.0002880027440366826, + "loss": 5.8599, + "step": 2180 + }, + { + "epoch": 2.8005115089514065, + "grad_norm": 1.099144697189331, + "learning_rate": 0.00028788285551141366, + "loss": 5.8559, + "step": 2190 + }, + { + "epoch": 2.813299232736573, + "grad_norm": 0.7207736968994141, + "learning_rate": 0.00028776239614499447, + "loss": 5.8369, + "step": 2200 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 1.000334620475769, + "learning_rate": 0.0002876413664361323, + "loss": 5.8644, + "step": 2210 + }, + { + "epoch": 2.8388746803069056, + "grad_norm": 0.778367280960083, + "learning_rate": 0.0002875197668858956, + "loss": 5.8342, + "step": 2220 + }, + { + "epoch": 2.8516624040920715, + "grad_norm": 1.511242389678955, + "learning_rate": 0.0002873975979977117, + "loss": 5.8202, + "step": 2230 + }, + { + "epoch": 2.864450127877238, + "grad_norm": 1.0907540321350098, + "learning_rate": 0.00028727486027736536, + "loss": 5.8303, + "step": 2240 + }, + { + "epoch": 2.877237851662404, + "grad_norm": 1.5989691019058228, + "learning_rate": 0.00028715155423299617, + "loss": 5.8212, + "step": 2250 + }, + { + "epoch": 2.89002557544757, + "grad_norm": 1.2920646667480469, + "learning_rate": 0.0002870276803750967, + "loss": 5.825, + "step": 2260 + }, + { + "epoch": 2.9028132992327365, + "grad_norm": 0.9930233955383301, + "learning_rate": 0.0002869032392165101, + "loss": 5.8205, + "step": 2270 + }, + { + "epoch": 2.915601023017903, + "grad_norm": 0.9936062693595886, + "learning_rate": 0.0002867782312724284, + "loss": 5.7916, + "step": 2280 + }, + { + "epoch": 2.928388746803069, + "grad_norm": 2.00677490234375, + "learning_rate": 0.00028665265706039, + "loss": 5.8037, + "step": 2290 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.0021674633026123, + "learning_rate": 0.00028652651710027775, + "loss": 5.8161, + "step": 2300 + }, + { + "epoch": 2.9539641943734014, + "grad_norm": 0.845365583896637, + "learning_rate": 0.0002863998119143166, + "loss": 5.787, + "step": 2310 + }, + { + "epoch": 2.9667519181585678, + "grad_norm": 0.663759708404541, + "learning_rate": 0.0002862725420270718, + "loss": 5.8021, + "step": 2320 + }, + { + "epoch": 2.979539641943734, + "grad_norm": 1.7215557098388672, + "learning_rate": 0.0002861447079654462, + "loss": 5.7939, + "step": 2330 + }, + { + "epoch": 2.9923273657289, + "grad_norm": 1.1704254150390625, + "learning_rate": 0.00028601631025867855, + "loss": 5.7715, + "step": 2340 + }, + { + "epoch": 3.0051150895140664, + "grad_norm": 0.7022088170051575, + "learning_rate": 0.00028588734943834113, + "loss": 5.7745, + "step": 2350 + }, + { + "epoch": 3.0179028132992327, + "grad_norm": 1.17882239818573, + "learning_rate": 0.0002857578260383374, + "loss": 5.7668, + "step": 2360 + }, + { + "epoch": 3.030690537084399, + "grad_norm": 1.0343165397644043, + "learning_rate": 0.00028562774059489996, + "loss": 5.7837, + "step": 2370 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 1.3794867992401123, + "learning_rate": 0.00028549709364658843, + "loss": 5.7698, + "step": 2380 + }, + { + "epoch": 3.0562659846547313, + "grad_norm": 1.120381474494934, + "learning_rate": 0.00028536588573428685, + "loss": 5.7644, + "step": 2390 + }, + { + "epoch": 3.0690537084398977, + "grad_norm": 1.0266337394714355, + "learning_rate": 0.0002852341174012019, + "loss": 5.7424, + "step": 2400 + }, + { + "epoch": 3.081841432225064, + "grad_norm": 1.448861002922058, + "learning_rate": 0.0002851017891928604, + "loss": 5.7292, + "step": 2410 + }, + { + "epoch": 3.0946291560102304, + "grad_norm": 1.5596977472305298, + "learning_rate": 0.0002849689016571068, + "loss": 5.7575, + "step": 2420 + }, + { + "epoch": 3.1074168797953963, + "grad_norm": 0.8733810186386108, + "learning_rate": 0.00028483545534410156, + "loss": 5.7362, + "step": 2430 + }, + { + "epoch": 3.1202046035805626, + "grad_norm": 9.138484954833984, + "learning_rate": 0.0002847014508063184, + "loss": 5.7518, + "step": 2440 + }, + { + "epoch": 3.132992327365729, + "grad_norm": 7.065587520599365, + "learning_rate": 0.0002845668885985419, + "loss": 5.9074, + "step": 2450 + }, + { + "epoch": 3.1457800511508953, + "grad_norm": 2.038409471511841, + "learning_rate": 0.0002844317692778657, + "loss": 5.7704, + "step": 2460 + }, + { + "epoch": 3.1585677749360612, + "grad_norm": 0.9123558402061462, + "learning_rate": 0.00028429609340368975, + "loss": 5.7661, + "step": 2470 + }, + { + "epoch": 3.1713554987212276, + "grad_norm": 0.7301475405693054, + "learning_rate": 0.0002841598615377182, + "loss": 5.7469, + "step": 2480 + }, + { + "epoch": 3.184143222506394, + "grad_norm": 0.8775326609611511, + "learning_rate": 0.000284023074243957, + "loss": 5.7485, + "step": 2490 + }, + { + "epoch": 3.1969309462915603, + "grad_norm": 1.6849805116653442, + "learning_rate": 0.00028388573208871175, + "loss": 5.7296, + "step": 2500 + }, + { + "epoch": 3.209718670076726, + "grad_norm": 0.8062297701835632, + "learning_rate": 0.00028374783564058493, + "loss": 5.7297, + "step": 2510 + }, + { + "epoch": 3.2225063938618925, + "grad_norm": 0.9257404804229736, + "learning_rate": 0.00028360938547047405, + "loss": 5.7582, + "step": 2520 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 1.384611964225769, + "learning_rate": 0.00028347038215156893, + "loss": 5.7018, + "step": 2530 + }, + { + "epoch": 3.2480818414322252, + "grad_norm": 1.1805202960968018, + "learning_rate": 0.0002833308262593495, + "loss": 5.7122, + "step": 2540 + }, + { + "epoch": 3.260869565217391, + "grad_norm": 1.3140634298324585, + "learning_rate": 0.00028319071837158336, + "loss": 5.7173, + "step": 2550 + }, + { + "epoch": 3.2736572890025575, + "grad_norm": 0.8321253061294556, + "learning_rate": 0.00028305005906832333, + "loss": 5.7095, + "step": 2560 + }, + { + "epoch": 3.286445012787724, + "grad_norm": 0.866754412651062, + "learning_rate": 0.0002829088489319053, + "loss": 5.7009, + "step": 2570 + }, + { + "epoch": 3.29923273657289, + "grad_norm": 0.97830730676651, + "learning_rate": 0.00028276708854694545, + "loss": 5.6795, + "step": 2580 + }, + { + "epoch": 3.312020460358056, + "grad_norm": 1.2616524696350098, + "learning_rate": 0.00028262477850033807, + "loss": 5.7157, + "step": 2590 + }, + { + "epoch": 3.3248081841432224, + "grad_norm": 0.9064726829528809, + "learning_rate": 0.00028248191938125305, + "loss": 5.7089, + "step": 2600 + }, + { + "epoch": 3.337595907928389, + "grad_norm": 0.9063658714294434, + "learning_rate": 0.0002823385117811335, + "loss": 5.9595, + "step": 2610 + }, + { + "epoch": 3.350383631713555, + "grad_norm": 1.100963830947876, + "learning_rate": 0.00028219455629369334, + "loss": 5.6872, + "step": 2620 + }, + { + "epoch": 3.363171355498721, + "grad_norm": 1.2329927682876587, + "learning_rate": 0.0002820500535149146, + "loss": 5.7029, + "step": 2630 + }, + { + "epoch": 3.3759590792838874, + "grad_norm": 0.8984890580177307, + "learning_rate": 0.00028190500404304524, + "loss": 5.7381, + "step": 2640 + }, + { + "epoch": 3.3887468030690537, + "grad_norm": 0.696719765663147, + "learning_rate": 0.0002817594084785965, + "loss": 5.6881, + "step": 2650 + }, + { + "epoch": 3.40153452685422, + "grad_norm": 1.0669254064559937, + "learning_rate": 0.0002816132674243405, + "loss": 5.6737, + "step": 2660 + }, + { + "epoch": 3.414322250639386, + "grad_norm": 0.8603677153587341, + "learning_rate": 0.0002814665814853077, + "loss": 5.6669, + "step": 2670 + }, + { + "epoch": 3.4271099744245523, + "grad_norm": 0.902407169342041, + "learning_rate": 0.00028131935126878444, + "loss": 5.6782, + "step": 2680 + }, + { + "epoch": 3.4398976982097187, + "grad_norm": 0.8212502598762512, + "learning_rate": 0.00028117157738431027, + "loss": 5.6825, + "step": 2690 + }, + { + "epoch": 3.452685421994885, + "grad_norm": 0.7769032120704651, + "learning_rate": 0.0002810232604436758, + "loss": 5.6798, + "step": 2700 + }, + { + "epoch": 3.4654731457800514, + "grad_norm": 1.140012264251709, + "learning_rate": 0.0002808744010609196, + "loss": 5.6552, + "step": 2710 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 1.1231127977371216, + "learning_rate": 0.00028072499985232614, + "loss": 5.6731, + "step": 2720 + }, + { + "epoch": 3.4910485933503836, + "grad_norm": 0.6917468309402466, + "learning_rate": 0.0002805750574364231, + "loss": 5.6562, + "step": 2730 + }, + { + "epoch": 3.50383631713555, + "grad_norm": 1.1626530885696411, + "learning_rate": 0.00028042457443397883, + "loss": 5.6433, + "step": 2740 + }, + { + "epoch": 3.516624040920716, + "grad_norm": 1.6136029958724976, + "learning_rate": 0.0002802735514679995, + "loss": 5.6741, + "step": 2750 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.9433214068412781, + "learning_rate": 0.00028012198916372704, + "loss": 5.6615, + "step": 2760 + }, + { + "epoch": 3.5421994884910486, + "grad_norm": 0.8657507300376892, + "learning_rate": 0.000279969888148636, + "loss": 5.6417, + "step": 2770 + }, + { + "epoch": 3.554987212276215, + "grad_norm": 1.2951968908309937, + "learning_rate": 0.0002798172490524315, + "loss": 5.6563, + "step": 2780 + }, + { + "epoch": 3.5677749360613813, + "grad_norm": 0.9073137640953064, + "learning_rate": 0.000279664072507046, + "loss": 5.6356, + "step": 2790 + }, + { + "epoch": 3.580562659846547, + "grad_norm": 1.0318324565887451, + "learning_rate": 0.00027951035914663736, + "loss": 5.6399, + "step": 2800 + }, + { + "epoch": 3.5933503836317136, + "grad_norm": 1.2848615646362305, + "learning_rate": 0.00027935610960758557, + "loss": 5.6362, + "step": 2810 + }, + { + "epoch": 3.60613810741688, + "grad_norm": 1.8240704536437988, + "learning_rate": 0.0002792013245284907, + "loss": 5.631, + "step": 2820 + }, + { + "epoch": 3.618925831202046, + "grad_norm": 3.091918468475342, + "learning_rate": 0.00027904600455016984, + "loss": 5.6453, + "step": 2830 + }, + { + "epoch": 3.631713554987212, + "grad_norm": 0.8699560165405273, + "learning_rate": 0.00027889015031565457, + "loss": 5.6166, + "step": 2840 + }, + { + "epoch": 3.6445012787723785, + "grad_norm": 0.9239150285720825, + "learning_rate": 0.0002787337624701883, + "loss": 5.618, + "step": 2850 + }, + { + "epoch": 3.657289002557545, + "grad_norm": 0.8173996806144714, + "learning_rate": 0.00027857684166122384, + "loss": 5.6296, + "step": 2860 + }, + { + "epoch": 3.670076726342711, + "grad_norm": 0.7985453009605408, + "learning_rate": 0.0002784193885384201, + "loss": 5.6385, + "step": 2870 + }, + { + "epoch": 3.682864450127877, + "grad_norm": 1.2572569847106934, + "learning_rate": 0.0002782614037536402, + "loss": 5.6183, + "step": 2880 + }, + { + "epoch": 3.6956521739130435, + "grad_norm": 1.2069624662399292, + "learning_rate": 0.000278102887960948, + "loss": 5.5862, + "step": 2890 + }, + { + "epoch": 3.70843989769821, + "grad_norm": 0.8699544072151184, + "learning_rate": 0.000277943841816606, + "loss": 5.6158, + "step": 2900 + }, + { + "epoch": 3.7212276214833757, + "grad_norm": 1.2560055255889893, + "learning_rate": 0.00027778426597907237, + "loss": 5.6094, + "step": 2910 + }, + { + "epoch": 3.734015345268542, + "grad_norm": 1.0291788578033447, + "learning_rate": 0.0002776241611089981, + "loss": 5.6025, + "step": 2920 + }, + { + "epoch": 3.7468030690537084, + "grad_norm": 0.9255660772323608, + "learning_rate": 0.0002774635278692244, + "loss": 5.6193, + "step": 2930 + }, + { + "epoch": 3.7595907928388748, + "grad_norm": 0.9527194499969482, + "learning_rate": 0.0002773023669247801, + "loss": 5.609, + "step": 2940 + }, + { + "epoch": 3.772378516624041, + "grad_norm": 1.1681740283966064, + "learning_rate": 0.00027714067894287874, + "loss": 5.5862, + "step": 2950 + }, + { + "epoch": 3.785166240409207, + "grad_norm": 0.9430054426193237, + "learning_rate": 0.0002769784645929156, + "loss": 5.601, + "step": 2960 + }, + { + "epoch": 3.7979539641943734, + "grad_norm": 0.7085330486297607, + "learning_rate": 0.0002768157245464655, + "loss": 5.5931, + "step": 2970 + }, + { + "epoch": 3.8107416879795397, + "grad_norm": 0.92620849609375, + "learning_rate": 0.00027665245947727926, + "loss": 5.584, + "step": 2980 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 1.2099533081054688, + "learning_rate": 0.00027648867006128166, + "loss": 5.5962, + "step": 2990 + }, + { + "epoch": 3.836317135549872, + "grad_norm": 0.8536057472229004, + "learning_rate": 0.00027632435697656804, + "loss": 5.575, + "step": 3000 + }, + { + "epoch": 3.8491048593350383, + "grad_norm": 0.6810599565505981, + "learning_rate": 0.000276159520903402, + "loss": 5.575, + "step": 3010 + }, + { + "epoch": 3.8618925831202047, + "grad_norm": 1.2196543216705322, + "learning_rate": 0.00027599416252421213, + "loss": 5.6085, + "step": 3020 + }, + { + "epoch": 3.874680306905371, + "grad_norm": 1.5269405841827393, + "learning_rate": 0.00027582828252358953, + "loss": 5.599, + "step": 3030 + }, + { + "epoch": 3.887468030690537, + "grad_norm": 0.9655811786651611, + "learning_rate": 0.0002756618815882847, + "loss": 5.5786, + "step": 3040 + }, + { + "epoch": 3.9002557544757033, + "grad_norm": 0.9123175144195557, + "learning_rate": 0.000275494960407205, + "loss": 5.5982, + "step": 3050 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 1.5311001539230347, + "learning_rate": 0.0002753275196714114, + "loss": 5.5947, + "step": 3060 + }, + { + "epoch": 3.9258312020460355, + "grad_norm": 0.829192578792572, + "learning_rate": 0.0002751595600741161, + "loss": 5.574, + "step": 3070 + }, + { + "epoch": 3.938618925831202, + "grad_norm": 1.4503875970840454, + "learning_rate": 0.0002749910823106793, + "loss": 5.5739, + "step": 3080 + }, + { + "epoch": 3.9514066496163682, + "grad_norm": 1.0710264444351196, + "learning_rate": 0.0002748220870786064, + "loss": 5.583, + "step": 3090 + }, + { + "epoch": 3.9641943734015346, + "grad_norm": 1.2536656856536865, + "learning_rate": 0.00027465257507754517, + "loss": 5.5767, + "step": 3100 + }, + { + "epoch": 3.976982097186701, + "grad_norm": 0.6235175728797913, + "learning_rate": 0.0002744825470092828, + "loss": 5.5572, + "step": 3110 + }, + { + "epoch": 3.9897698209718673, + "grad_norm": 0.9897062182426453, + "learning_rate": 0.0002743120035777431, + "loss": 5.5586, + "step": 3120 + }, + { + "epoch": 4.002557544757034, + "grad_norm": 1.1203209161758423, + "learning_rate": 0.00027414094548898347, + "loss": 5.5683, + "step": 3130 + }, + { + "epoch": 4.015345268542199, + "grad_norm": 0.6553176045417786, + "learning_rate": 0.0002739693734511919, + "loss": 5.5741, + "step": 3140 + }, + { + "epoch": 4.028132992327365, + "grad_norm": 0.6654773950576782, + "learning_rate": 0.00027379728817468436, + "loss": 5.537, + "step": 3150 + }, + { + "epoch": 4.040920716112532, + "grad_norm": 1.263575792312622, + "learning_rate": 0.0002736246903719015, + "loss": 5.5378, + "step": 3160 + }, + { + "epoch": 4.053708439897698, + "grad_norm": 1.274593710899353, + "learning_rate": 0.0002734515807574059, + "loss": 5.5649, + "step": 3170 + }, + { + "epoch": 4.0664961636828645, + "grad_norm": 0.804013192653656, + "learning_rate": 0.000273277960047879, + "loss": 5.55, + "step": 3180 + }, + { + "epoch": 4.079283887468031, + "grad_norm": 1.15504789352417, + "learning_rate": 0.0002731038289621184, + "loss": 5.5446, + "step": 3190 + }, + { + "epoch": 4.092071611253197, + "grad_norm": 0.9598748087882996, + "learning_rate": 0.0002729291882210344, + "loss": 5.547, + "step": 3200 + }, + { + "epoch": 4.1048593350383635, + "grad_norm": 0.9165511727333069, + "learning_rate": 0.0002727540385476475, + "loss": 5.5288, + "step": 3210 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.8215676546096802, + "learning_rate": 0.00027257838066708503, + "loss": 5.5356, + "step": 3220 + }, + { + "epoch": 4.130434782608695, + "grad_norm": 0.9655877947807312, + "learning_rate": 0.0002724022153065786, + "loss": 5.5553, + "step": 3230 + }, + { + "epoch": 4.143222506393862, + "grad_norm": 1.4203110933303833, + "learning_rate": 0.00027222554319546047, + "loss": 5.5253, + "step": 3240 + }, + { + "epoch": 4.156010230179028, + "grad_norm": 0.8605411052703857, + "learning_rate": 0.0002720483650651611, + "loss": 5.5499, + "step": 3250 + }, + { + "epoch": 4.168797953964194, + "grad_norm": 1.030411720275879, + "learning_rate": 0.00027187068164920584, + "loss": 5.5306, + "step": 3260 + }, + { + "epoch": 4.181585677749361, + "grad_norm": 0.9130074977874756, + "learning_rate": 0.0002716924936832119, + "loss": 5.524, + "step": 3270 + }, + { + "epoch": 4.194373401534527, + "grad_norm": 0.9061768054962158, + "learning_rate": 0.00027151380190488535, + "loss": 5.5197, + "step": 3280 + }, + { + "epoch": 4.207161125319693, + "grad_norm": 0.9149904847145081, + "learning_rate": 0.00027133460705401814, + "loss": 5.5273, + "step": 3290 + }, + { + "epoch": 4.21994884910486, + "grad_norm": 0.8536543846130371, + "learning_rate": 0.00027115490987248493, + "loss": 5.524, + "step": 3300 + }, + { + "epoch": 4.232736572890025, + "grad_norm": 0.8059023022651672, + "learning_rate": 0.00027097471110424005, + "loss": 5.5276, + "step": 3310 + }, + { + "epoch": 4.245524296675192, + "grad_norm": 0.894961953163147, + "learning_rate": 0.00027079401149531434, + "loss": 5.52, + "step": 3320 + }, + { + "epoch": 4.258312020460358, + "grad_norm": 0.907398521900177, + "learning_rate": 0.0002706128117938122, + "loss": 5.5113, + "step": 3330 + }, + { + "epoch": 4.271099744245524, + "grad_norm": 0.701256513595581, + "learning_rate": 0.0002704311127499086, + "loss": 5.514, + "step": 3340 + }, + { + "epoch": 4.283887468030691, + "grad_norm": 0.8591639399528503, + "learning_rate": 0.00027024891511584547, + "loss": 5.5133, + "step": 3350 + }, + { + "epoch": 4.296675191815857, + "grad_norm": 1.0663071870803833, + "learning_rate": 0.0002700662196459292, + "loss": 5.4978, + "step": 3360 + }, + { + "epoch": 4.309462915601023, + "grad_norm": 1.0883492231369019, + "learning_rate": 0.00026988302709652723, + "loss": 5.5314, + "step": 3370 + }, + { + "epoch": 4.322250639386189, + "grad_norm": 1.2101244926452637, + "learning_rate": 0.00026969933822606465, + "loss": 5.4919, + "step": 3380 + }, + { + "epoch": 4.335038363171355, + "grad_norm": 1.37273371219635, + "learning_rate": 0.00026951515379502176, + "loss": 5.5201, + "step": 3390 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.8530544638633728, + "learning_rate": 0.00026933047456593025, + "loss": 5.4963, + "step": 3400 + }, + { + "epoch": 4.360613810741688, + "grad_norm": 1.0042463541030884, + "learning_rate": 0.0002691453013033702, + "loss": 5.5079, + "step": 3410 + }, + { + "epoch": 4.373401534526854, + "grad_norm": 0.9824084639549255, + "learning_rate": 0.00026895963477396726, + "loss": 5.5065, + "step": 3420 + }, + { + "epoch": 4.3861892583120206, + "grad_norm": 0.9359764456748962, + "learning_rate": 0.0002687734757463891, + "loss": 5.497, + "step": 3430 + }, + { + "epoch": 4.398976982097187, + "grad_norm": 0.7871713042259216, + "learning_rate": 0.0002685868249913424, + "loss": 5.5059, + "step": 3440 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 0.9313884377479553, + "learning_rate": 0.00026839968328156957, + "loss": 5.503, + "step": 3450 + }, + { + "epoch": 4.42455242966752, + "grad_norm": 1.0534262657165527, + "learning_rate": 0.0002682120513918455, + "loss": 5.5077, + "step": 3460 + }, + { + "epoch": 4.437340153452685, + "grad_norm": 1.0140694379806519, + "learning_rate": 0.0002680239300989747, + "loss": 5.5238, + "step": 3470 + }, + { + "epoch": 4.450127877237851, + "grad_norm": 0.9650000929832458, + "learning_rate": 0.00026783532018178756, + "loss": 5.4972, + "step": 3480 + }, + { + "epoch": 4.462915601023018, + "grad_norm": 1.0754122734069824, + "learning_rate": 0.0002676462224211376, + "loss": 5.4936, + "step": 3490 + }, + { + "epoch": 4.475703324808184, + "grad_norm": 1.1840262413024902, + "learning_rate": 0.00026745663759989796, + "loss": 5.4895, + "step": 3500 + }, + { + "epoch": 4.4884910485933505, + "grad_norm": 3.5758776664733887, + "learning_rate": 0.00026726656650295815, + "loss": 5.4779, + "step": 3510 + }, + { + "epoch": 4.501278772378517, + "grad_norm": 1.3599027395248413, + "learning_rate": 0.000267076009917221, + "loss": 5.5273, + "step": 3520 + }, + { + "epoch": 4.514066496163683, + "grad_norm": 1.9289357662200928, + "learning_rate": 0.0002668849686315993, + "loss": 5.5244, + "step": 3530 + }, + { + "epoch": 4.526854219948849, + "grad_norm": 2.649031162261963, + "learning_rate": 0.0002666934434370124, + "loss": 5.5419, + "step": 3540 + }, + { + "epoch": 4.539641943734015, + "grad_norm": 1.6334396600723267, + "learning_rate": 0.00026650143512638316, + "loss": 5.5442, + "step": 3550 + }, + { + "epoch": 4.552429667519181, + "grad_norm": 9.740204811096191, + "learning_rate": 0.00026630894449463445, + "loss": 5.57, + "step": 3560 + }, + { + "epoch": 4.565217391304348, + "grad_norm": 5.615604400634766, + "learning_rate": 0.00026611597233868606, + "loss": 5.6171, + "step": 3570 + }, + { + "epoch": 4.578005115089514, + "grad_norm": 10.387894630432129, + "learning_rate": 0.0002659225194574513, + "loss": 5.5957, + "step": 3580 + }, + { + "epoch": 4.59079283887468, + "grad_norm": 6.4273247718811035, + "learning_rate": 0.00026572858665183384, + "loss": 5.5684, + "step": 3590 + }, + { + "epoch": 4.603580562659847, + "grad_norm": 3.7879221439361572, + "learning_rate": 0.0002655341747247239, + "loss": 5.5616, + "step": 3600 + }, + { + "epoch": 4.616368286445013, + "grad_norm": 1.7430964708328247, + "learning_rate": 0.0002653392844809956, + "loss": 5.5481, + "step": 3610 + }, + { + "epoch": 4.629156010230179, + "grad_norm": 1.193642020225525, + "learning_rate": 0.00026514391672750317, + "loss": 5.5487, + "step": 3620 + }, + { + "epoch": 4.641943734015345, + "grad_norm": 0.9543384909629822, + "learning_rate": 0.00026494807227307786, + "loss": 5.5215, + "step": 3630 + }, + { + "epoch": 4.654731457800511, + "grad_norm": 6.988561153411865, + "learning_rate": 0.0002647517519285243, + "loss": 5.5163, + "step": 3640 + }, + { + "epoch": 4.667519181585678, + "grad_norm": 35138.046875, + "learning_rate": 0.00026455495650661763, + "loss": 14.1816, + "step": 3650 + }, + { + "epoch": 4.680306905370844, + "grad_norm": 649.7811889648438, + "learning_rate": 0.00026435768682209947, + "loss": 9.2072, + "step": 3660 + }, + { + "epoch": 4.69309462915601, + "grad_norm": 1.4763447046279907, + "learning_rate": 0.0002641599436916751, + "loss": 6.1727, + "step": 3670 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 1.2908670902252197, + "learning_rate": 0.00026396172793401, + "loss": 5.6292, + "step": 3680 + }, + { + "epoch": 4.718670076726343, + "grad_norm": 1.0045114755630493, + "learning_rate": 0.0002637630403697261, + "loss": 5.5647, + "step": 3690 + }, + { + "epoch": 4.731457800511509, + "grad_norm": 1.0943928956985474, + "learning_rate": 0.0002635638818213988, + "loss": 5.5281, + "step": 3700 + }, + { + "epoch": 4.744245524296675, + "grad_norm": 0.8787861466407776, + "learning_rate": 0.0002633642531135532, + "loss": 5.5262, + "step": 3710 + }, + { + "epoch": 4.757033248081841, + "grad_norm": 0.9018283486366272, + "learning_rate": 0.0002631641550726613, + "loss": 5.5107, + "step": 3720 + }, + { + "epoch": 4.7698209718670075, + "grad_norm": 1.1844482421875, + "learning_rate": 0.0002629635885271376, + "loss": 5.493, + "step": 3730 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 1.2566533088684082, + "learning_rate": 0.0002627625543073367, + "loss": 5.4924, + "step": 3740 + }, + { + "epoch": 4.79539641943734, + "grad_norm": 1.1357030868530273, + "learning_rate": 0.0002625610532455491, + "loss": 5.4784, + "step": 3750 + }, + { + "epoch": 4.8081841432225065, + "grad_norm": 0.8598061800003052, + "learning_rate": 0.00026235908617599824, + "loss": 5.4796, + "step": 3760 + }, + { + "epoch": 4.820971867007673, + "grad_norm": 1.176074743270874, + "learning_rate": 0.0002621566539348367, + "loss": 5.4893, + "step": 3770 + }, + { + "epoch": 4.833759590792839, + "grad_norm": 0.7435081601142883, + "learning_rate": 0.00026195375736014296, + "loss": 5.489, + "step": 3780 + }, + { + "epoch": 4.846547314578006, + "grad_norm": 1.0044056177139282, + "learning_rate": 0.0002617503972919179, + "loss": 5.5022, + "step": 3790 + }, + { + "epoch": 4.859335038363171, + "grad_norm": 0.9446366429328918, + "learning_rate": 0.0002615465745720812, + "loss": 5.4633, + "step": 3800 + }, + { + "epoch": 4.872122762148337, + "grad_norm": 1.2357178926467896, + "learning_rate": 0.00026134229004446796, + "loss": 5.4782, + "step": 3810 + }, + { + "epoch": 4.884910485933504, + "grad_norm": 1.0345720052719116, + "learning_rate": 0.0002611375445548252, + "loss": 5.5007, + "step": 3820 + }, + { + "epoch": 4.89769820971867, + "grad_norm": 1.6297608613967896, + "learning_rate": 0.00026093233895080846, + "loss": 5.8077, + "step": 3830 + }, + { + "epoch": 4.910485933503836, + "grad_norm": 2.633462905883789, + "learning_rate": 0.000260726674081978, + "loss": 5.5718, + "step": 3840 + }, + { + "epoch": 4.923273657289003, + "grad_norm": 1.6603726148605347, + "learning_rate": 0.00026052055079979546, + "loss": 5.5366, + "step": 3850 + }, + { + "epoch": 4.936061381074169, + "grad_norm": 23.486774444580078, + "learning_rate": 0.0002603139699576204, + "loss": 5.6162, + "step": 3860 + }, + { + "epoch": 4.948849104859335, + "grad_norm": 1.7220473289489746, + "learning_rate": 0.0002601069324107067, + "loss": 5.777, + "step": 3870 + }, + { + "epoch": 4.961636828644501, + "grad_norm": 1.3732489347457886, + "learning_rate": 0.0002598994390161991, + "loss": 5.558, + "step": 3880 + }, + { + "epoch": 4.974424552429667, + "grad_norm": 1.398135781288147, + "learning_rate": 0.0002596914906331294, + "loss": 5.5602, + "step": 3890 + }, + { + "epoch": 4.987212276214834, + "grad_norm": 1.0198397636413574, + "learning_rate": 0.0002594830881224131, + "loss": 5.5123, + "step": 3900 + }, + { + "epoch": 5.0, + "grad_norm": 1.7247081995010376, + "learning_rate": 0.0002592742323468459, + "loss": 5.5164, + "step": 3910 + }, + { + "epoch": 5.012787723785166, + "grad_norm": 1.3416905403137207, + "learning_rate": 0.00025906492417110004, + "loss": 5.5093, + "step": 3920 + }, + { + "epoch": 5.025575447570333, + "grad_norm": 0.9129336476325989, + "learning_rate": 0.0002588551644617206, + "loss": 5.4713, + "step": 3930 + }, + { + "epoch": 5.038363171355499, + "grad_norm": 0.8973252177238464, + "learning_rate": 0.0002586449540871221, + "loss": 5.4519, + "step": 3940 + }, + { + "epoch": 5.051150895140665, + "grad_norm": 0.9549187421798706, + "learning_rate": 0.00025843429391758497, + "loss": 5.4634, + "step": 3950 + }, + { + "epoch": 5.063938618925831, + "grad_norm": 7.259679794311523, + "learning_rate": 0.0002582231848252515, + "loss": 5.4512, + "step": 3960 + }, + { + "epoch": 5.076726342710997, + "grad_norm": 0.8443651795387268, + "learning_rate": 0.00025801162768412277, + "loss": 5.45, + "step": 3970 + }, + { + "epoch": 5.089514066496164, + "grad_norm": 1.066373348236084, + "learning_rate": 0.0002577996233700548, + "loss": 5.4714, + "step": 3980 + }, + { + "epoch": 5.10230179028133, + "grad_norm": 0.8174281716346741, + "learning_rate": 0.0002575871727607548, + "loss": 5.4518, + "step": 3990 + }, + { + "epoch": 5.115089514066496, + "grad_norm": 1.1461961269378662, + "learning_rate": 0.0002573742767357778, + "loss": 5.4323, + "step": 4000 + }, + { + "epoch": 5.127877237851663, + "grad_norm": 1.1951566934585571, + "learning_rate": 0.00025716093617652273, + "loss": 5.4453, + "step": 4010 + }, + { + "epoch": 5.140664961636829, + "grad_norm": 1.038336157798767, + "learning_rate": 0.00025694715196622903, + "loss": 5.4727, + "step": 4020 + }, + { + "epoch": 5.153452685421995, + "grad_norm": 1.3700714111328125, + "learning_rate": 0.0002567329249899729, + "loss": 5.508, + "step": 4030 + }, + { + "epoch": 5.166240409207161, + "grad_norm": 0.8140355348587036, + "learning_rate": 0.0002565182561346634, + "loss": 5.4863, + "step": 4040 + }, + { + "epoch": 5.179028132992327, + "grad_norm": 1.4147567749023438, + "learning_rate": 0.00025630314628903925, + "loss": 5.4518, + "step": 4050 + }, + { + "epoch": 5.1918158567774935, + "grad_norm": 0.7016567587852478, + "learning_rate": 0.0002560875963436647, + "loss": 5.4447, + "step": 4060 + }, + { + "epoch": 5.20460358056266, + "grad_norm": 0.7389644980430603, + "learning_rate": 0.0002558716071909262, + "loss": 5.444, + "step": 4070 + }, + { + "epoch": 5.217391304347826, + "grad_norm": 0.9639290571212769, + "learning_rate": 0.0002556551797250283, + "loss": 5.4364, + "step": 4080 + }, + { + "epoch": 5.2301790281329925, + "grad_norm": 0.8074414730072021, + "learning_rate": 0.0002554383148419904, + "loss": 5.4526, + "step": 4090 + }, + { + "epoch": 5.242966751918159, + "grad_norm": 1.0686334371566772, + "learning_rate": 0.0002552210134396429, + "loss": 5.4213, + "step": 4100 + }, + { + "epoch": 5.255754475703325, + "grad_norm": 0.7948906421661377, + "learning_rate": 0.00025500327641762296, + "loss": 5.4414, + "step": 4110 + }, + { + "epoch": 5.268542199488491, + "grad_norm": 3.3121063709259033, + "learning_rate": 0.00025478510467737175, + "loss": 5.4386, + "step": 4120 + }, + { + "epoch": 5.281329923273657, + "grad_norm": 1.0081732273101807, + "learning_rate": 0.0002545664991221299, + "loss": 5.436, + "step": 4130 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 1.1705923080444336, + "learning_rate": 0.0002543474606569342, + "loss": 5.4249, + "step": 4140 + }, + { + "epoch": 5.30690537084399, + "grad_norm": 1.5626429319381714, + "learning_rate": 0.0002541279901886135, + "loss": 5.4149, + "step": 4150 + }, + { + "epoch": 5.319693094629156, + "grad_norm": 1.235137701034546, + "learning_rate": 0.00025390808862578543, + "loss": 5.4067, + "step": 4160 + }, + { + "epoch": 5.332480818414322, + "grad_norm": 1.3060702085494995, + "learning_rate": 0.0002536877568788521, + "loss": 5.4284, + "step": 4170 + }, + { + "epoch": 5.345268542199489, + "grad_norm": 0.7894797325134277, + "learning_rate": 0.00025346699585999683, + "loss": 5.4261, + "step": 4180 + }, + { + "epoch": 5.358056265984655, + "grad_norm": 0.6973442435264587, + "learning_rate": 0.00025324580648318, + "loss": 5.4292, + "step": 4190 + }, + { + "epoch": 5.370843989769821, + "grad_norm": 0.7937166094779968, + "learning_rate": 0.00025302418966413555, + "loss": 5.407, + "step": 4200 + }, + { + "epoch": 5.383631713554987, + "grad_norm": 1.1912145614624023, + "learning_rate": 0.00025280214632036684, + "loss": 5.4215, + "step": 4210 + }, + { + "epoch": 5.396419437340153, + "grad_norm": 1.135903000831604, + "learning_rate": 0.00025257967737114323, + "loss": 5.4018, + "step": 4220 + }, + { + "epoch": 5.40920716112532, + "grad_norm": 1.0384650230407715, + "learning_rate": 0.0002523567837374961, + "loss": 5.4002, + "step": 4230 + }, + { + "epoch": 5.421994884910486, + "grad_norm": 1.1547504663467407, + "learning_rate": 0.00025213346634221493, + "loss": 5.406, + "step": 4240 + }, + { + "epoch": 5.434782608695652, + "grad_norm": 1.5584670305252075, + "learning_rate": 0.0002519097261098437, + "loss": 5.4227, + "step": 4250 + }, + { + "epoch": 5.447570332480819, + "grad_norm": 1.0803555250167847, + "learning_rate": 0.00025168556396667693, + "loss": 5.4294, + "step": 4260 + }, + { + "epoch": 5.460358056265985, + "grad_norm": 1.1373729705810547, + "learning_rate": 0.0002514609808407558, + "loss": 5.401, + "step": 4270 + }, + { + "epoch": 5.4731457800511505, + "grad_norm": 0.9056215882301331, + "learning_rate": 0.00025123597766186454, + "loss": 5.4026, + "step": 4280 + }, + { + "epoch": 5.485933503836317, + "grad_norm": 1.3096917867660522, + "learning_rate": 0.00025101055536152616, + "loss": 5.4131, + "step": 4290 + }, + { + "epoch": 5.498721227621483, + "grad_norm": 0.890828013420105, + "learning_rate": 0.00025078471487299917, + "loss": 5.4063, + "step": 4300 + }, + { + "epoch": 5.5115089514066495, + "grad_norm": 0.9283934235572815, + "learning_rate": 0.00025055845713127306, + "loss": 5.4078, + "step": 4310 + }, + { + "epoch": 5.524296675191816, + "grad_norm": 0.9405550360679626, + "learning_rate": 0.00025033178307306506, + "loss": 5.4147, + "step": 4320 + }, + { + "epoch": 5.537084398976982, + "grad_norm": 0.8457754254341125, + "learning_rate": 0.0002501046936368158, + "loss": 5.408, + "step": 4330 + }, + { + "epoch": 5.549872122762149, + "grad_norm": 1.2794405221939087, + "learning_rate": 0.0002498771897626857, + "loss": 5.4104, + "step": 4340 + }, + { + "epoch": 5.562659846547315, + "grad_norm": 0.8734764456748962, + "learning_rate": 0.00024964927239255074, + "loss": 5.4089, + "step": 4350 + }, + { + "epoch": 5.57544757033248, + "grad_norm": 1.1535333395004272, + "learning_rate": 0.0002494209424699992, + "loss": 5.3985, + "step": 4360 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 0.8653953671455383, + "learning_rate": 0.0002491922009403269, + "loss": 5.3734, + "step": 4370 + }, + { + "epoch": 5.601023017902813, + "grad_norm": 1.14579439163208, + "learning_rate": 0.0002489630487505341, + "loss": 5.3813, + "step": 4380 + }, + { + "epoch": 5.6138107416879794, + "grad_norm": 1.0796849727630615, + "learning_rate": 0.00024873348684932095, + "loss": 5.4106, + "step": 4390 + }, + { + "epoch": 5.626598465473146, + "grad_norm": 1.226593255996704, + "learning_rate": 0.000248503516187084, + "loss": 5.3917, + "step": 4400 + }, + { + "epoch": 5.639386189258312, + "grad_norm": 0.761320173740387, + "learning_rate": 0.00024827313771591183, + "loss": 5.3818, + "step": 4410 + }, + { + "epoch": 5.6521739130434785, + "grad_norm": 1.1446237564086914, + "learning_rate": 0.0002480423523895818, + "loss": 5.3926, + "step": 4420 + }, + { + "epoch": 5.664961636828645, + "grad_norm": 1.49809992313385, + "learning_rate": 0.00024781116116355523, + "loss": 5.4011, + "step": 4430 + }, + { + "epoch": 5.677749360613811, + "grad_norm": 0.9937270879745483, + "learning_rate": 0.00024757956499497414, + "loss": 5.4073, + "step": 4440 + }, + { + "epoch": 5.690537084398977, + "grad_norm": 1.3228845596313477, + "learning_rate": 0.000247347564842657, + "loss": 5.412, + "step": 4450 + }, + { + "epoch": 5.703324808184143, + "grad_norm": 1.146344542503357, + "learning_rate": 0.0002471151616670947, + "loss": 5.4071, + "step": 4460 + }, + { + "epoch": 5.716112531969309, + "grad_norm": 55.619258880615234, + "learning_rate": 0.00024688235643044676, + "loss": 5.7836, + "step": 4470 + }, + { + "epoch": 5.728900255754476, + "grad_norm": 2.3042633533477783, + "learning_rate": 0.0002466491500965372, + "loss": 5.6276, + "step": 4480 + }, + { + "epoch": 5.741687979539642, + "grad_norm": 2.298129081726074, + "learning_rate": 0.00024641554363085056, + "loss": 5.4675, + "step": 4490 + }, + { + "epoch": 5.754475703324808, + "grad_norm": 89.96639251708984, + "learning_rate": 0.00024618153800052796, + "loss": 5.478, + "step": 4500 + }, + { + "epoch": 5.767263427109975, + "grad_norm": 1.2469855546951294, + "learning_rate": 0.0002459471341743633, + "loss": 5.4764, + "step": 4510 + }, + { + "epoch": 5.78005115089514, + "grad_norm": 1.0728439092636108, + "learning_rate": 0.0002457123331227986, + "loss": 5.4405, + "step": 4520 + }, + { + "epoch": 5.792838874680307, + "grad_norm": 1.9817936420440674, + "learning_rate": 0.0002454771358179208, + "loss": 5.4187, + "step": 4530 + }, + { + "epoch": 5.805626598465473, + "grad_norm": 2.8526406288146973, + "learning_rate": 0.0002452415432334571, + "loss": 5.4591, + "step": 4540 + }, + { + "epoch": 5.818414322250639, + "grad_norm": 2.35715651512146, + "learning_rate": 0.0002450055563447714, + "loss": 5.4392, + "step": 4550 + }, + { + "epoch": 5.831202046035806, + "grad_norm": 1.693472981452942, + "learning_rate": 0.00024476917612885975, + "loss": 5.4059, + "step": 4560 + }, + { + "epoch": 5.843989769820972, + "grad_norm": 1.3062572479248047, + "learning_rate": 0.0002445324035643469, + "loss": 5.4088, + "step": 4570 + }, + { + "epoch": 5.856777493606138, + "grad_norm": 1.1204801797866821, + "learning_rate": 0.00024429523963148165, + "loss": 5.3992, + "step": 4580 + }, + { + "epoch": 5.869565217391305, + "grad_norm": 5.129584312438965, + "learning_rate": 0.00024405768531213337, + "loss": 5.3986, + "step": 4590 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 162.03988647460938, + "learning_rate": 0.0002438197415897874, + "loss": 5.489, + "step": 4600 + }, + { + "epoch": 5.8951406649616365, + "grad_norm": 48338.75390625, + "learning_rate": 0.00024358140944954134, + "loss": 6.4544, + "step": 4610 + }, + { + "epoch": 5.907928388746803, + "grad_norm": 309.1040344238281, + "learning_rate": 0.00024334268987810087, + "loss": 6.5122, + "step": 4620 + }, + { + "epoch": 5.920716112531969, + "grad_norm": 91.73091125488281, + "learning_rate": 0.00024310358386377563, + "loss": 6.1042, + "step": 4630 + }, + { + "epoch": 5.9335038363171355, + "grad_norm": 4794.84912109375, + "learning_rate": 0.0002428640923964751, + "loss": 5.9236, + "step": 4640 + }, + { + "epoch": 5.946291560102302, + "grad_norm": 428.7879638671875, + "learning_rate": 0.00024262421646770461, + "loss": 6.3916, + "step": 4650 + }, + { + "epoch": 5.959079283887468, + "grad_norm": 2455.22900390625, + "learning_rate": 0.00024238395707056122, + "loss": 6.2707, + "step": 4660 + }, + { + "epoch": 5.971867007672635, + "grad_norm": 5.497981548309326, + "learning_rate": 0.00024214331519972944, + "loss": 5.9861, + "step": 4670 + }, + { + "epoch": 5.9846547314578, + "grad_norm": 7.887845993041992, + "learning_rate": 0.00024190229185147734, + "loss": 5.6179, + "step": 4680 + }, + { + "epoch": 5.997442455242966, + "grad_norm": 8.13206958770752, + "learning_rate": 0.00024166088802365232, + "loss": 5.5614, + "step": 4690 + }, + { + "epoch": 6.010230179028133, + "grad_norm": 170.85169982910156, + "learning_rate": 0.00024141910471567695, + "loss": 5.703, + "step": 4700 + }, + { + "epoch": 6.023017902813299, + "grad_norm": 8.114191055297852, + "learning_rate": 0.00024117694292854482, + "loss": 5.6643, + "step": 4710 + }, + { + "epoch": 6.035805626598465, + "grad_norm": 1.649259328842163, + "learning_rate": 0.00024093440366481655, + "loss": 5.5548, + "step": 4720 + }, + { + "epoch": 6.048593350383632, + "grad_norm": 3.6291794776916504, + "learning_rate": 0.0002406914879286155, + "loss": 5.5627, + "step": 4730 + }, + { + "epoch": 6.061381074168798, + "grad_norm": 17.16327476501465, + "learning_rate": 0.00024044819672562352, + "loss": 5.5179, + "step": 4740 + }, + { + "epoch": 6.0741687979539645, + "grad_norm": 66.84463500976562, + "learning_rate": 0.00024020453106307705, + "loss": 5.6356, + "step": 4750 + }, + { + "epoch": 6.086956521739131, + "grad_norm": 71.15001678466797, + "learning_rate": 0.00023996049194976272, + "loss": 5.8183, + "step": 4760 + }, + { + "epoch": 6.099744245524296, + "grad_norm": 215.5955352783203, + "learning_rate": 0.00023971608039601342, + "loss": 5.7444, + "step": 4770 + }, + { + "epoch": 6.112531969309463, + "grad_norm": 22.749778747558594, + "learning_rate": 0.00023947129741370372, + "loss": 5.6429, + "step": 4780 + }, + { + "epoch": 6.125319693094629, + "grad_norm": 4.4116387367248535, + "learning_rate": 0.0002392261440162461, + "loss": 5.5824, + "step": 4790 + }, + { + "epoch": 6.138107416879795, + "grad_norm": 21.407913208007812, + "learning_rate": 0.00023898062121858657, + "loss": 5.5583, + "step": 4800 + }, + { + "epoch": 6.150895140664962, + "grad_norm": 2.962198257446289, + "learning_rate": 0.0002387347300372004, + "loss": 5.5477, + "step": 4810 + }, + { + "epoch": 6.163682864450128, + "grad_norm": 2.31837797164917, + "learning_rate": 0.0002384884714900879, + "loss": 5.5348, + "step": 4820 + }, + { + "epoch": 6.176470588235294, + "grad_norm": 1.527478575706482, + "learning_rate": 0.00023824184659677054, + "loss": 5.5314, + "step": 4830 + }, + { + "epoch": 6.189258312020461, + "grad_norm": 61.507530212402344, + "learning_rate": 0.00023799485637828626, + "loss": 5.5663, + "step": 4840 + }, + { + "epoch": 6.202046035805626, + "grad_norm": 2436.508544921875, + "learning_rate": 0.0002377475018571856, + "loss": 5.6175, + "step": 4850 + }, + { + "epoch": 6.2148337595907925, + "grad_norm": 2913507.5, + "learning_rate": 0.00023749978405752722, + "loss": 6.377, + "step": 4860 + }, + { + "epoch": 6.227621483375959, + "grad_norm": 2641881.0, + "learning_rate": 0.00023725170400487386, + "loss": 7.9995, + "step": 4870 + }, + { + "epoch": 6.240409207161125, + "grad_norm": 159218.71875, + "learning_rate": 0.00023700326272628794, + "loss": 9.1041, + "step": 4880 + }, + { + "epoch": 6.253196930946292, + "grad_norm": 11773.8193359375, + "learning_rate": 0.00023675446125032736, + "loss": 8.7907, + "step": 4890 + }, + { + "epoch": 6.265984654731458, + "grad_norm": 1598409.75, + "learning_rate": 0.00023650530060704137, + "loss": 8.2848, + "step": 4900 + }, + { + "epoch": 6.278772378516624, + "grad_norm": 10316.607421875, + "learning_rate": 0.00023625578182796608, + "loss": 7.0858, + "step": 4910 + }, + { + "epoch": 6.291560102301791, + "grad_norm": 4661.24853515625, + "learning_rate": 0.00023600590594612025, + "loss": 6.5619, + "step": 4920 + }, + { + "epoch": 6.304347826086957, + "grad_norm": 50447.3359375, + "learning_rate": 0.00023575567399600124, + "loss": 6.3697, + "step": 4930 + }, + { + "epoch": 6.3171355498721224, + "grad_norm": 11574.3974609375, + "learning_rate": 0.0002355050870135804, + "loss": 6.4702, + "step": 4940 + }, + { + "epoch": 6.329923273657289, + "grad_norm": 1139.9498291015625, + "learning_rate": 0.00023525414603629897, + "loss": 6.3997, + "step": 4950 + }, + { + "epoch": 6.342710997442455, + "grad_norm": 571.5132446289062, + "learning_rate": 0.00023500285210306372, + "loss": 6.0743, + "step": 4960 + }, + { + "epoch": 6.3554987212276215, + "grad_norm": 242.6359100341797, + "learning_rate": 0.0002347512062542428, + "loss": 5.7139, + "step": 4970 + }, + { + "epoch": 6.368286445012788, + "grad_norm": 490.5594787597656, + "learning_rate": 0.00023449920953166114, + "loss": 5.787, + "step": 4980 + }, + { + "epoch": 6.381074168797954, + "grad_norm": 81.9707260131836, + "learning_rate": 0.00023424686297859637, + "loss": 5.8251, + "step": 4990 + }, + { + "epoch": 6.3938618925831205, + "grad_norm": 2.5100646018981934, + "learning_rate": 0.00023399416763977447, + "loss": 5.7931, + "step": 5000 + }, + { + "epoch": 6.406649616368286, + "grad_norm": 4.38476037979126, + "learning_rate": 0.0002337411245613653, + "loss": 5.6809, + "step": 5010 + }, + { + "epoch": 6.419437340153452, + "grad_norm": 0.9684484601020813, + "learning_rate": 0.00023348773479097855, + "loss": 5.6249, + "step": 5020 + }, + { + "epoch": 6.432225063938619, + "grad_norm": 2.0226662158966064, + "learning_rate": 0.00023323399937765906, + "loss": 5.5767, + "step": 5030 + }, + { + "epoch": 6.445012787723785, + "grad_norm": 1.4661270380020142, + "learning_rate": 0.0002329799193718827, + "loss": 5.5135, + "step": 5040 + }, + { + "epoch": 6.457800511508951, + "grad_norm": 1.4130449295043945, + "learning_rate": 0.00023272549582555212, + "loss": 5.5327, + "step": 5050 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 1.2371689081192017, + "learning_rate": 0.00023247072979199192, + "loss": 5.4606, + "step": 5060 + }, + { + "epoch": 6.483375959079284, + "grad_norm": 0.9750532507896423, + "learning_rate": 0.000232215622325945, + "loss": 5.433, + "step": 5070 + }, + { + "epoch": 6.4961636828644505, + "grad_norm": 1.7443705797195435, + "learning_rate": 0.00023196017448356743, + "loss": 5.4655, + "step": 5080 + }, + { + "epoch": 6.508951406649617, + "grad_norm": 1.21770179271698, + "learning_rate": 0.00023170438732242473, + "loss": 5.4349, + "step": 5090 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 1.1340687274932861, + "learning_rate": 0.0002314482619014871, + "loss": 5.4672, + "step": 5100 + }, + { + "epoch": 6.534526854219949, + "grad_norm": 2.9821395874023438, + "learning_rate": 0.0002311917992811252, + "loss": 5.4951, + "step": 5110 + }, + { + "epoch": 6.547314578005115, + "grad_norm": 199.1261749267578, + "learning_rate": 0.00023093500052310558, + "loss": 5.5208, + "step": 5120 + }, + { + "epoch": 6.560102301790281, + "grad_norm": 45.095088958740234, + "learning_rate": 0.0002306778666905866, + "loss": 5.4837, + "step": 5130 + }, + { + "epoch": 6.572890025575448, + "grad_norm": 2.1452138423919678, + "learning_rate": 0.00023042039884811372, + "loss": 5.4714, + "step": 5140 + }, + { + "epoch": 6.585677749360614, + "grad_norm": 61.150856018066406, + "learning_rate": 0.00023016259806161524, + "loss": 5.4805, + "step": 5150 + }, + { + "epoch": 6.59846547314578, + "grad_norm": 29.349306106567383, + "learning_rate": 0.00022990446539839795, + "loss": 5.5292, + "step": 5160 + }, + { + "epoch": 6.611253196930946, + "grad_norm": 51.63431930541992, + "learning_rate": 0.0002296460019271424, + "loss": 5.4861, + "step": 5170 + }, + { + "epoch": 6.624040920716112, + "grad_norm": 271368.5625, + "learning_rate": 0.000229387208717899, + "loss": 5.4857, + "step": 5180 + }, + { + "epoch": 6.6368286445012785, + "grad_norm": 22.639575958251953, + "learning_rate": 0.00022912808684208302, + "loss": 5.5158, + "step": 5190 + }, + { + "epoch": 6.649616368286445, + "grad_norm": 34.41466522216797, + "learning_rate": 0.00022886863737247068, + "loss": 5.477, + "step": 5200 + }, + { + "epoch": 6.662404092071611, + "grad_norm": 4.41986083984375, + "learning_rate": 0.0002286088613831942, + "loss": 5.4337, + "step": 5210 + }, + { + "epoch": 6.675191815856778, + "grad_norm": 3.1884968280792236, + "learning_rate": 0.00022834875994973773, + "loss": 5.5051, + "step": 5220 + }, + { + "epoch": 6.687979539641944, + "grad_norm": 1.9236191511154175, + "learning_rate": 0.00022808833414893282, + "loss": 5.457, + "step": 5230 + }, + { + "epoch": 6.70076726342711, + "grad_norm": 7.70404577255249, + "learning_rate": 0.0002278275850589538, + "loss": 5.4598, + "step": 5240 + }, + { + "epoch": 6.713554987212277, + "grad_norm": 5.475555419921875, + "learning_rate": 0.00022756651375931356, + "loss": 5.5004, + "step": 5250 + }, + { + "epoch": 6.726342710997442, + "grad_norm": 4.682950973510742, + "learning_rate": 0.00022730512133085885, + "loss": 5.4897, + "step": 5260 + }, + { + "epoch": 6.739130434782608, + "grad_norm": 2.2729697227478027, + "learning_rate": 0.00022704340885576588, + "loss": 5.5197, + "step": 5270 + }, + { + "epoch": 6.751918158567775, + "grad_norm": 1.674745798110962, + "learning_rate": 0.00022678137741753604, + "loss": 5.4519, + "step": 5280 + }, + { + "epoch": 6.764705882352941, + "grad_norm": 2.585331439971924, + "learning_rate": 0.00022651902810099107, + "loss": 5.4677, + "step": 5290 + }, + { + "epoch": 6.7774936061381075, + "grad_norm": 1.7059080600738525, + "learning_rate": 0.00022625636199226885, + "loss": 5.4422, + "step": 5300 + }, + { + "epoch": 6.790281329923274, + "grad_norm": 1.875679612159729, + "learning_rate": 0.00022599338017881865, + "loss": 5.4344, + "step": 5310 + }, + { + "epoch": 6.80306905370844, + "grad_norm": 1.07345449924469, + "learning_rate": 0.00022573008374939693, + "loss": 5.424, + "step": 5320 + }, + { + "epoch": 6.8158567774936065, + "grad_norm": 2.112574338912964, + "learning_rate": 0.0002254664737940626, + "loss": 5.392, + "step": 5330 + }, + { + "epoch": 6.828644501278772, + "grad_norm": 2.7668375968933105, + "learning_rate": 0.00022520255140417262, + "loss": 5.4379, + "step": 5340 + }, + { + "epoch": 6.841432225063938, + "grad_norm": 10.602677345275879, + "learning_rate": 0.00022493831767237736, + "loss": 5.4118, + "step": 5350 + }, + { + "epoch": 6.854219948849105, + "grad_norm": 2.519099712371826, + "learning_rate": 0.0002246737736926162, + "loss": 5.3876, + "step": 5360 + }, + { + "epoch": 6.867007672634271, + "grad_norm": 1.2932206392288208, + "learning_rate": 0.00022440892056011308, + "loss": 5.4044, + "step": 5370 + }, + { + "epoch": 6.879795396419437, + "grad_norm": 1.185011863708496, + "learning_rate": 0.00022414375937137164, + "loss": 5.4038, + "step": 5380 + }, + { + "epoch": 6.892583120204604, + "grad_norm": 1.340401530265808, + "learning_rate": 0.00022387829122417103, + "loss": 5.403, + "step": 5390 + }, + { + "epoch": 6.90537084398977, + "grad_norm": 1.3118840456008911, + "learning_rate": 0.0002236125172175612, + "loss": 5.407, + "step": 5400 + }, + { + "epoch": 6.918158567774936, + "grad_norm": 1.2288802862167358, + "learning_rate": 0.0002233464384518583, + "loss": 5.3923, + "step": 5410 + }, + { + "epoch": 6.930946291560103, + "grad_norm": 1.8922580480575562, + "learning_rate": 0.00022308005602864026, + "loss": 5.3871, + "step": 5420 + }, + { + "epoch": 6.943734015345268, + "grad_norm": 7.790588855743408, + "learning_rate": 0.00022281337105074214, + "loss": 5.4146, + "step": 5430 + }, + { + "epoch": 6.956521739130435, + "grad_norm": 4344.03271484375, + "learning_rate": 0.00022254638462225162, + "loss": 5.4527, + "step": 5440 + }, + { + "epoch": 6.969309462915601, + "grad_norm": 140.49502563476562, + "learning_rate": 0.00022227909784850437, + "loss": 5.6473, + "step": 5450 + }, + { + "epoch": 6.982097186700767, + "grad_norm": 130.85202026367188, + "learning_rate": 0.00022201151183607958, + "loss": 5.6701, + "step": 5460 + }, + { + "epoch": 6.994884910485934, + "grad_norm": 126.7076416015625, + "learning_rate": 0.0002217436276927951, + "loss": 5.788, + "step": 5470 + }, + { + "epoch": 7.0076726342711, + "grad_norm": 435.14361572265625, + "learning_rate": 0.00022147544652770332, + "loss": 5.906, + "step": 5480 + }, + { + "epoch": 7.020460358056266, + "grad_norm": 331.05145263671875, + "learning_rate": 0.0002212069694510861, + "loss": 5.7959, + "step": 5490 + }, + { + "epoch": 7.033248081841432, + "grad_norm": 24.31149673461914, + "learning_rate": 0.0002209381975744505, + "loss": 5.7055, + "step": 5500 + }, + { + "epoch": 7.046035805626598, + "grad_norm": 370.831298828125, + "learning_rate": 0.00022066913201052406, + "loss": 5.6893, + "step": 5510 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 80.24853515625, + "learning_rate": 0.00022039977387325012, + "loss": 5.6668, + "step": 5520 + }, + { + "epoch": 7.071611253196931, + "grad_norm": 26.689146041870117, + "learning_rate": 0.00022013012427778337, + "loss": 5.6272, + "step": 5530 + }, + { + "epoch": 7.084398976982097, + "grad_norm": 18.282222747802734, + "learning_rate": 0.00021986018434048515, + "loss": 5.6231, + "step": 5540 + }, + { + "epoch": 7.0971867007672635, + "grad_norm": 71.10526275634766, + "learning_rate": 0.00021958995517891872, + "loss": 5.6329, + "step": 5550 + }, + { + "epoch": 7.10997442455243, + "grad_norm": 66.7136459350586, + "learning_rate": 0.00021931943791184488, + "loss": 5.5997, + "step": 5560 + }, + { + "epoch": 7.122762148337596, + "grad_norm": 13.023746490478516, + "learning_rate": 0.00021904863365921715, + "loss": 5.585, + "step": 5570 + }, + { + "epoch": 7.135549872122763, + "grad_norm": 20.470502853393555, + "learning_rate": 0.0002187775435421771, + "loss": 5.6463, + "step": 5580 + }, + { + "epoch": 7.148337595907928, + "grad_norm": 308.2809753417969, + "learning_rate": 0.00021850616868305, + "loss": 5.6014, + "step": 5590 + }, + { + "epoch": 7.161125319693094, + "grad_norm": 2176.8095703125, + "learning_rate": 0.00021823451020533966, + "loss": 5.601, + "step": 5600 + }, + { + "epoch": 7.173913043478261, + "grad_norm": 14.34654712677002, + "learning_rate": 0.00021796256923372435, + "loss": 5.715, + "step": 5610 + }, + { + "epoch": 7.186700767263427, + "grad_norm": 4.329311847686768, + "learning_rate": 0.00021769034689405174, + "loss": 5.6829, + "step": 5620 + }, + { + "epoch": 7.1994884910485935, + "grad_norm": 6.315857410430908, + "learning_rate": 0.00021741784431333444, + "loss": 5.6157, + "step": 5630 + }, + { + "epoch": 7.21227621483376, + "grad_norm": 7.763189315795898, + "learning_rate": 0.00021714506261974527, + "loss": 5.6019, + "step": 5640 + }, + { + "epoch": 7.225063938618926, + "grad_norm": 375.8602294921875, + "learning_rate": 0.00021687200294261255, + "loss": 5.6288, + "step": 5650 + }, + { + "epoch": 7.2378516624040925, + "grad_norm": 16.852691650390625, + "learning_rate": 0.0002165986664124154, + "loss": 5.6608, + "step": 5660 + }, + { + "epoch": 7.250639386189258, + "grad_norm": 582.8134765625, + "learning_rate": 0.0002163250541607793, + "loss": 5.5801, + "step": 5670 + }, + { + "epoch": 7.263427109974424, + "grad_norm": 13.16250228881836, + "learning_rate": 0.00021605116732047113, + "loss": 5.5558, + "step": 5680 + }, + { + "epoch": 7.276214833759591, + "grad_norm": 81.9183120727539, + "learning_rate": 0.0002157770070253945, + "loss": 5.5759, + "step": 5690 + }, + { + "epoch": 7.289002557544757, + "grad_norm": 34.48521041870117, + "learning_rate": 0.0002155025744105853, + "loss": 5.5714, + "step": 5700 + }, + { + "epoch": 7.301790281329923, + "grad_norm": 5.397050857543945, + "learning_rate": 0.0002152278706122067, + "loss": 5.5447, + "step": 5710 + }, + { + "epoch": 7.31457800511509, + "grad_norm": 7.5026068687438965, + "learning_rate": 0.0002149528967675447, + "loss": 5.5271, + "step": 5720 + }, + { + "epoch": 7.327365728900256, + "grad_norm": 21.039640426635742, + "learning_rate": 0.00021467765401500316, + "loss": 5.5523, + "step": 5730 + }, + { + "epoch": 7.340153452685422, + "grad_norm": 3.5831055641174316, + "learning_rate": 0.00021440214349409937, + "loss": 5.5607, + "step": 5740 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 2.3747684955596924, + "learning_rate": 0.0002141263663454591, + "loss": 5.5686, + "step": 5750 + }, + { + "epoch": 7.365728900255754, + "grad_norm": 3.3695812225341797, + "learning_rate": 0.00021385032371081197, + "loss": 5.5386, + "step": 5760 + }, + { + "epoch": 7.378516624040921, + "grad_norm": 1.6819612979888916, + "learning_rate": 0.00021357401673298686, + "loss": 5.4853, + "step": 5770 + }, + { + "epoch": 7.391304347826087, + "grad_norm": 2.8821001052856445, + "learning_rate": 0.00021329744655590685, + "loss": 5.4891, + "step": 5780 + }, + { + "epoch": 7.404092071611253, + "grad_norm": 10.89107894897461, + "learning_rate": 0.00021302061432458476, + "loss": 5.5159, + "step": 5790 + }, + { + "epoch": 7.41687979539642, + "grad_norm": 8.90987777709961, + "learning_rate": 0.00021274352118511838, + "loss": 5.4912, + "step": 5800 + }, + { + "epoch": 7.429667519181586, + "grad_norm": 2.6703226566314697, + "learning_rate": 0.00021246616828468563, + "loss": 5.4594, + "step": 5810 + }, + { + "epoch": 7.442455242966752, + "grad_norm": 8.18791389465332, + "learning_rate": 0.0002121885567715398, + "loss": 5.4581, + "step": 5820 + }, + { + "epoch": 7.455242966751918, + "grad_norm": 6.304783344268799, + "learning_rate": 0.00021191068779500495, + "loss": 5.4296, + "step": 5830 + }, + { + "epoch": 7.468030690537084, + "grad_norm": 81.46858215332031, + "learning_rate": 0.0002116325625054709, + "loss": 5.4251, + "step": 5840 + }, + { + "epoch": 7.4808184143222505, + "grad_norm": 36.4720344543457, + "learning_rate": 0.00021135418205438879, + "loss": 5.4542, + "step": 5850 + }, + { + "epoch": 7.493606138107417, + "grad_norm": 2.27168345451355, + "learning_rate": 0.00021107554759426601, + "loss": 5.5033, + "step": 5860 + }, + { + "epoch": 7.506393861892583, + "grad_norm": 2.0624887943267822, + "learning_rate": 0.00021079666027866165, + "loss": 5.4607, + "step": 5870 + }, + { + "epoch": 7.5191815856777495, + "grad_norm": 1.2841031551361084, + "learning_rate": 0.0002105175212621816, + "loss": 5.4534, + "step": 5880 + }, + { + "epoch": 7.531969309462916, + "grad_norm": 1.7171348333358765, + "learning_rate": 0.00021023813170047378, + "loss": 5.4251, + "step": 5890 + }, + { + "epoch": 7.544757033248082, + "grad_norm": 2.006608247756958, + "learning_rate": 0.00020995849275022338, + "loss": 5.4303, + "step": 5900 + }, + { + "epoch": 7.557544757033249, + "grad_norm": 1.1459277868270874, + "learning_rate": 0.00020967860556914808, + "loss": 5.4326, + "step": 5910 + }, + { + "epoch": 7.570332480818414, + "grad_norm": 2.0150973796844482, + "learning_rate": 0.00020939847131599325, + "loss": 5.4058, + "step": 5920 + }, + { + "epoch": 7.58312020460358, + "grad_norm": 1.5685169696807861, + "learning_rate": 0.00020911809115052718, + "loss": 5.4017, + "step": 5930 + }, + { + "epoch": 7.595907928388747, + "grad_norm": 4.461862564086914, + "learning_rate": 0.00020883746623353615, + "loss": 5.4142, + "step": 5940 + }, + { + "epoch": 7.608695652173913, + "grad_norm": 3.8895885944366455, + "learning_rate": 0.00020855659772681988, + "loss": 5.4473, + "step": 5950 + }, + { + "epoch": 7.621483375959079, + "grad_norm": 26.799531936645508, + "learning_rate": 0.0002082754867931864, + "loss": 5.3851, + "step": 5960 + }, + { + "epoch": 7.634271099744246, + "grad_norm": 28.458995819091797, + "learning_rate": 0.00020799413459644746, + "loss": 5.404, + "step": 5970 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 5.39988374710083, + "learning_rate": 0.00020771254230141374, + "loss": 5.4142, + "step": 5980 + }, + { + "epoch": 7.659846547314578, + "grad_norm": 24.639158248901367, + "learning_rate": 0.00020743071107388978, + "loss": 5.4414, + "step": 5990 + }, + { + "epoch": 7.672634271099744, + "grad_norm": 31.065847396850586, + "learning_rate": 0.00020714864208066944, + "loss": 5.4642, + "step": 6000 + }, + { + "epoch": 7.68542199488491, + "grad_norm": 3.8758230209350586, + "learning_rate": 0.00020686633648953087, + "loss": 5.4475, + "step": 6010 + }, + { + "epoch": 7.698209718670077, + "grad_norm": 1.0469917058944702, + "learning_rate": 0.00020658379546923173, + "loss": 5.4207, + "step": 6020 + }, + { + "epoch": 7.710997442455243, + "grad_norm": 1.1670767068862915, + "learning_rate": 0.00020630102018950446, + "loss": 5.388, + "step": 6030 + }, + { + "epoch": 7.723785166240409, + "grad_norm": 1.3517999649047852, + "learning_rate": 0.00020601801182105128, + "loss": 5.4072, + "step": 6040 + }, + { + "epoch": 7.736572890025576, + "grad_norm": 1.2495683431625366, + "learning_rate": 0.00020573477153553932, + "loss": 5.363, + "step": 6050 + }, + { + "epoch": 7.749360613810742, + "grad_norm": 2.0349299907684326, + "learning_rate": 0.0002054513005055961, + "loss": 5.3792, + "step": 6060 + }, + { + "epoch": 7.762148337595908, + "grad_norm": 417.0843811035156, + "learning_rate": 0.0002051675999048042, + "loss": 5.4068, + "step": 6070 + }, + { + "epoch": 7.774936061381074, + "grad_norm": 1.2949700355529785, + "learning_rate": 0.0002048836709076967, + "loss": 5.4034, + "step": 6080 + }, + { + "epoch": 7.78772378516624, + "grad_norm": 1.1051993370056152, + "learning_rate": 0.00020459951468975227, + "loss": 5.4037, + "step": 6090 + }, + { + "epoch": 7.8005115089514065, + "grad_norm": 1.6521767377853394, + "learning_rate": 0.00020431513242739034, + "loss": 5.377, + "step": 6100 + }, + { + "epoch": 7.813299232736573, + "grad_norm": 1.1317176818847656, + "learning_rate": 0.00020403052529796605, + "loss": 5.3547, + "step": 6110 + }, + { + "epoch": 7.826086956521739, + "grad_norm": 1.4219192266464233, + "learning_rate": 0.00020374569447976563, + "loss": 5.3507, + "step": 6120 + }, + { + "epoch": 7.838874680306906, + "grad_norm": 154.09230041503906, + "learning_rate": 0.0002034606411520013, + "loss": 5.355, + "step": 6130 + }, + { + "epoch": 7.851662404092072, + "grad_norm": 1.127315878868103, + "learning_rate": 0.00020317536649480645, + "loss": 5.3597, + "step": 6140 + }, + { + "epoch": 7.864450127877237, + "grad_norm": 2.925020933151245, + "learning_rate": 0.0002028898716892309, + "loss": 5.3633, + "step": 6150 + }, + { + "epoch": 7.877237851662404, + "grad_norm": 1.464866280555725, + "learning_rate": 0.00020260415791723582, + "loss": 5.3502, + "step": 6160 + }, + { + "epoch": 7.89002557544757, + "grad_norm": 1.2165534496307373, + "learning_rate": 0.00020231822636168894, + "loss": 5.3366, + "step": 6170 + }, + { + "epoch": 7.9028132992327365, + "grad_norm": 1.153878092765808, + "learning_rate": 0.0002020320782063596, + "loss": 5.373, + "step": 6180 + }, + { + "epoch": 7.915601023017903, + "grad_norm": 3.499431848526001, + "learning_rate": 0.00020174571463591387, + "loss": 5.3212, + "step": 6190 + }, + { + "epoch": 7.928388746803069, + "grad_norm": 1.6055837869644165, + "learning_rate": 0.00020145913683590965, + "loss": 5.3329, + "step": 6200 + }, + { + "epoch": 7.9411764705882355, + "grad_norm": 20.313854217529297, + "learning_rate": 0.00020117234599279177, + "loss": 5.3892, + "step": 6210 + }, + { + "epoch": 7.953964194373402, + "grad_norm": 1047.665283203125, + "learning_rate": 0.00020088534329388712, + "loss": 5.4904, + "step": 6220 + }, + { + "epoch": 7.966751918158568, + "grad_norm": 199.46250915527344, + "learning_rate": 0.00020059812992739956, + "loss": 5.4668, + "step": 6230 + }, + { + "epoch": 7.979539641943734, + "grad_norm": 77.22345733642578, + "learning_rate": 0.00020031070708240525, + "loss": 5.447, + "step": 6240 + }, + { + "epoch": 7.9923273657289, + "grad_norm": 2511.667236328125, + "learning_rate": 0.0002000230759488475, + "loss": 5.4581, + "step": 6250 + }, + { + "epoch": 8.005115089514067, + "grad_norm": 964.1087646484375, + "learning_rate": 0.00019973523771753203, + "loss": 5.4332, + "step": 6260 + }, + { + "epoch": 8.017902813299234, + "grad_norm": 1089.00146484375, + "learning_rate": 0.00019944719358012188, + "loss": 5.4214, + "step": 6270 + }, + { + "epoch": 8.030690537084398, + "grad_norm": 177.26681518554688, + "learning_rate": 0.00019915894472913258, + "loss": 5.4031, + "step": 6280 + }, + { + "epoch": 8.043478260869565, + "grad_norm": 37.300941467285156, + "learning_rate": 0.00019887049235792716, + "loss": 5.3803, + "step": 6290 + }, + { + "epoch": 8.05626598465473, + "grad_norm": 41.14563751220703, + "learning_rate": 0.00019858183766071133, + "loss": 5.4269, + "step": 6300 + }, + { + "epoch": 8.069053708439897, + "grad_norm": 1573.4498291015625, + "learning_rate": 0.00019829298183252832, + "loss": 5.3899, + "step": 6310 + }, + { + "epoch": 8.081841432225064, + "grad_norm": 90.4874038696289, + "learning_rate": 0.00019800392606925409, + "loss": 5.4205, + "step": 6320 + }, + { + "epoch": 8.09462915601023, + "grad_norm": 87.13722229003906, + "learning_rate": 0.00019771467156759235, + "loss": 5.4472, + "step": 6330 + }, + { + "epoch": 8.107416879795396, + "grad_norm": 69.13899230957031, + "learning_rate": 0.0001974252195250696, + "loss": 5.4954, + "step": 6340 + }, + { + "epoch": 8.120204603580563, + "grad_norm": 64.28082275390625, + "learning_rate": 0.00019713557114003018, + "loss": 5.532, + "step": 6350 + }, + { + "epoch": 8.132992327365729, + "grad_norm": 6390.68896484375, + "learning_rate": 0.00019684572761163125, + "loss": 5.4868, + "step": 6360 + }, + { + "epoch": 8.145780051150895, + "grad_norm": 4.564518928527832, + "learning_rate": 0.00019655569013983797, + "loss": 5.4988, + "step": 6370 + }, + { + "epoch": 8.158567774936062, + "grad_norm": 5.2263665199279785, + "learning_rate": 0.00019626545992541824, + "loss": 5.5725, + "step": 6380 + }, + { + "epoch": 8.171355498721228, + "grad_norm": 4.126269340515137, + "learning_rate": 0.00019597503816993808, + "loss": 5.5511, + "step": 6390 + }, + { + "epoch": 8.184143222506394, + "grad_norm": 13.468574523925781, + "learning_rate": 0.00019568442607575655, + "loss": 5.4787, + "step": 6400 + }, + { + "epoch": 8.19693094629156, + "grad_norm": 10.518543243408203, + "learning_rate": 0.00019539362484602058, + "loss": 5.5517, + "step": 6410 + }, + { + "epoch": 8.209718670076727, + "grad_norm": 49.399635314941406, + "learning_rate": 0.00019510263568466016, + "loss": 5.478, + "step": 6420 + }, + { + "epoch": 8.222506393861893, + "grad_norm": 193.64840698242188, + "learning_rate": 0.00019481145979638342, + "loss": 5.457, + "step": 6430 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 317.044189453125, + "learning_rate": 0.00019452009838667142, + "loss": 5.4926, + "step": 6440 + }, + { + "epoch": 8.248081841432224, + "grad_norm": 11.334222793579102, + "learning_rate": 0.00019422855266177337, + "loss": 5.5006, + "step": 6450 + }, + { + "epoch": 8.26086956521739, + "grad_norm": 46.2332649230957, + "learning_rate": 0.00019393682382870151, + "loss": 5.469, + "step": 6460 + }, + { + "epoch": 8.273657289002557, + "grad_norm": 15.786870956420898, + "learning_rate": 0.0001936449130952262, + "loss": 5.4381, + "step": 6470 + }, + { + "epoch": 8.286445012787723, + "grad_norm": 52.73589324951172, + "learning_rate": 0.00019335282166987083, + "loss": 5.4408, + "step": 6480 + }, + { + "epoch": 8.29923273657289, + "grad_norm": 61.6315803527832, + "learning_rate": 0.00019306055076190692, + "loss": 5.4318, + "step": 6490 + }, + { + "epoch": 8.312020460358056, + "grad_norm": 12.02227783203125, + "learning_rate": 0.000192768101581349, + "loss": 5.4227, + "step": 6500 + }, + { + "epoch": 8.324808184143222, + "grad_norm": 15.120413780212402, + "learning_rate": 0.0001924754753389496, + "loss": 5.5355, + "step": 6510 + }, + { + "epoch": 8.337595907928389, + "grad_norm": 40.40769958496094, + "learning_rate": 0.0001921826732461945, + "loss": 5.4947, + "step": 6520 + }, + { + "epoch": 8.350383631713555, + "grad_norm": 84.49889373779297, + "learning_rate": 0.0001918896965152973, + "loss": 5.4894, + "step": 6530 + }, + { + "epoch": 8.363171355498721, + "grad_norm": 10.641755104064941, + "learning_rate": 0.00019159654635919472, + "loss": 5.6015, + "step": 6540 + }, + { + "epoch": 8.375959079283888, + "grad_norm": 5.664783000946045, + "learning_rate": 0.00019130322399154144, + "loss": 5.4883, + "step": 6550 + }, + { + "epoch": 8.388746803069054, + "grad_norm": 2.0645973682403564, + "learning_rate": 0.00019100973062670504, + "loss": 5.4584, + "step": 6560 + }, + { + "epoch": 8.40153452685422, + "grad_norm": 3.2000749111175537, + "learning_rate": 0.00019071606747976113, + "loss": 5.4612, + "step": 6570 + }, + { + "epoch": 8.414322250639387, + "grad_norm": 5.516482830047607, + "learning_rate": 0.00019042223576648822, + "loss": 5.4622, + "step": 6580 + }, + { + "epoch": 8.427109974424553, + "grad_norm": 1.833092451095581, + "learning_rate": 0.00019012823670336258, + "loss": 5.4434, + "step": 6590 + }, + { + "epoch": 8.43989769820972, + "grad_norm": 1.5939054489135742, + "learning_rate": 0.00018983407150755349, + "loss": 5.4028, + "step": 6600 + }, + { + "epoch": 8.452685421994884, + "grad_norm": 2.889866352081299, + "learning_rate": 0.00018953974139691794, + "loss": 5.5098, + "step": 6610 + }, + { + "epoch": 8.46547314578005, + "grad_norm": 12.796149253845215, + "learning_rate": 0.0001892452475899956, + "loss": 5.4479, + "step": 6620 + }, + { + "epoch": 8.478260869565217, + "grad_norm": 11.244743347167969, + "learning_rate": 0.00018895059130600396, + "loss": 5.4367, + "step": 6630 + }, + { + "epoch": 8.491048593350383, + "grad_norm": 6.546978950500488, + "learning_rate": 0.0001886557737648331, + "loss": 5.3932, + "step": 6640 + }, + { + "epoch": 8.50383631713555, + "grad_norm": 50.4334831237793, + "learning_rate": 0.00018836079618704074, + "loss": 5.4069, + "step": 6650 + }, + { + "epoch": 8.516624040920716, + "grad_norm": 7.810596466064453, + "learning_rate": 0.00018806565979384725, + "loss": 5.4757, + "step": 6660 + }, + { + "epoch": 8.529411764705882, + "grad_norm": 1.919179081916809, + "learning_rate": 0.0001877703658071303, + "loss": 5.398, + "step": 6670 + }, + { + "epoch": 8.542199488491049, + "grad_norm": 2.1735100746154785, + "learning_rate": 0.0001874749154494202, + "loss": 5.3852, + "step": 6680 + }, + { + "epoch": 8.554987212276215, + "grad_norm": 1.4280660152435303, + "learning_rate": 0.00018717930994389452, + "loss": 5.368, + "step": 6690 + }, + { + "epoch": 8.567774936061381, + "grad_norm": 0.9073035717010498, + "learning_rate": 0.00018688355051437323, + "loss": 5.3647, + "step": 6700 + }, + { + "epoch": 8.580562659846548, + "grad_norm": 0.9486181139945984, + "learning_rate": 0.00018658763838531346, + "loss": 5.3562, + "step": 6710 + }, + { + "epoch": 8.593350383631714, + "grad_norm": 2.596919059753418, + "learning_rate": 0.0001862915747818046, + "loss": 5.329, + "step": 6720 + }, + { + "epoch": 8.60613810741688, + "grad_norm": 1.398964762687683, + "learning_rate": 0.0001859953609295632, + "loss": 5.3702, + "step": 6730 + }, + { + "epoch": 8.618925831202047, + "grad_norm": 1.2070012092590332, + "learning_rate": 0.00018569899805492764, + "loss": 5.3389, + "step": 6740 + }, + { + "epoch": 8.631713554987213, + "grad_norm": 3.92047381401062, + "learning_rate": 0.00018540248738485346, + "loss": 5.3392, + "step": 6750 + }, + { + "epoch": 8.644501278772378, + "grad_norm": 0.8959820866584778, + "learning_rate": 0.00018510583014690805, + "loss": 5.3569, + "step": 6760 + }, + { + "epoch": 8.657289002557544, + "grad_norm": 0.838767409324646, + "learning_rate": 0.0001848090275692655, + "loss": 5.3327, + "step": 6770 + }, + { + "epoch": 8.67007672634271, + "grad_norm": 1.3687005043029785, + "learning_rate": 0.00018451208088070172, + "loss": 5.3473, + "step": 6780 + }, + { + "epoch": 8.682864450127877, + "grad_norm": 1.2928316593170166, + "learning_rate": 0.00018421499131058918, + "loss": 5.3158, + "step": 6790 + }, + { + "epoch": 8.695652173913043, + "grad_norm": 1.4481250047683716, + "learning_rate": 0.0001839177600888919, + "loss": 5.3533, + "step": 6800 + }, + { + "epoch": 8.70843989769821, + "grad_norm": 1.228041172027588, + "learning_rate": 0.0001836203884461603, + "loss": 5.331, + "step": 6810 + }, + { + "epoch": 8.721227621483376, + "grad_norm": 1.7305997610092163, + "learning_rate": 0.00018332287761352632, + "loss": 5.3389, + "step": 6820 + }, + { + "epoch": 8.734015345268542, + "grad_norm": 1.4301363229751587, + "learning_rate": 0.0001830252288226979, + "loss": 5.3091, + "step": 6830 + }, + { + "epoch": 8.746803069053708, + "grad_norm": 1.5935139656066895, + "learning_rate": 0.00018272744330595432, + "loss": 5.3222, + "step": 6840 + }, + { + "epoch": 8.759590792838875, + "grad_norm": 4.215871810913086, + "learning_rate": 0.00018242952229614092, + "loss": 5.3164, + "step": 6850 + }, + { + "epoch": 8.772378516624041, + "grad_norm": 3.8628363609313965, + "learning_rate": 0.0001821314670266638, + "loss": 5.3282, + "step": 6860 + }, + { + "epoch": 8.785166240409207, + "grad_norm": 5.04424524307251, + "learning_rate": 0.00018183327873148508, + "loss": 5.3065, + "step": 6870 + }, + { + "epoch": 8.797953964194374, + "grad_norm": 2.324964761734009, + "learning_rate": 0.00018153495864511757, + "loss": 5.3322, + "step": 6880 + }, + { + "epoch": 8.81074168797954, + "grad_norm": 0.8637028932571411, + "learning_rate": 0.00018123650800261966, + "loss": 5.3113, + "step": 6890 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 2.1809170246124268, + "learning_rate": 0.0001809379280395903, + "loss": 5.2999, + "step": 6900 + }, + { + "epoch": 8.836317135549873, + "grad_norm": 0.9374740719795227, + "learning_rate": 0.00018063921999216374, + "loss": 5.3243, + "step": 6910 + }, + { + "epoch": 8.84910485933504, + "grad_norm": 1.3672325611114502, + "learning_rate": 0.00018034038509700468, + "loss": 5.3473, + "step": 6920 + }, + { + "epoch": 8.861892583120204, + "grad_norm": 1.0719919204711914, + "learning_rate": 0.00018004142459130282, + "loss": 5.3407, + "step": 6930 + }, + { + "epoch": 8.87468030690537, + "grad_norm": 0.8660845160484314, + "learning_rate": 0.00017974233971276791, + "loss": 5.3093, + "step": 6940 + }, + { + "epoch": 8.887468030690536, + "grad_norm": 1.8442758321762085, + "learning_rate": 0.0001794431316996247, + "loss": 5.3256, + "step": 6950 + }, + { + "epoch": 8.900255754475703, + "grad_norm": 1.3475013971328735, + "learning_rate": 0.00017914380179060756, + "loss": 5.3093, + "step": 6960 + }, + { + "epoch": 8.91304347826087, + "grad_norm": 1.1749337911605835, + "learning_rate": 0.00017884435122495575, + "loss": 5.3207, + "step": 6970 + }, + { + "epoch": 8.925831202046036, + "grad_norm": 1.2629647254943848, + "learning_rate": 0.00017854478124240783, + "loss": 5.2829, + "step": 6980 + }, + { + "epoch": 8.938618925831202, + "grad_norm": 1.7222191095352173, + "learning_rate": 0.00017824509308319688, + "loss": 5.3143, + "step": 6990 + }, + { + "epoch": 8.951406649616368, + "grad_norm": 14.291260719299316, + "learning_rate": 0.00017794528798804519, + "loss": 5.3009, + "step": 7000 + }, + { + "epoch": 8.964194373401535, + "grad_norm": 1.296583652496338, + "learning_rate": 0.00017764536719815918, + "loss": 5.3158, + "step": 7010 + }, + { + "epoch": 8.976982097186701, + "grad_norm": 1.0986963510513306, + "learning_rate": 0.00017734533195522424, + "loss": 5.3068, + "step": 7020 + }, + { + "epoch": 8.989769820971867, + "grad_norm": 1.0342364311218262, + "learning_rate": 0.00017704518350139965, + "loss": 5.2997, + "step": 7030 + }, + { + "epoch": 9.002557544757034, + "grad_norm": 1.445008635520935, + "learning_rate": 0.0001767449230793133, + "loss": 5.3086, + "step": 7040 + }, + { + "epoch": 9.0153452685422, + "grad_norm": 1.7098116874694824, + "learning_rate": 0.00017644455193205666, + "loss": 5.3005, + "step": 7050 + }, + { + "epoch": 9.028132992327366, + "grad_norm": 0.9708366990089417, + "learning_rate": 0.00017614407130317968, + "loss": 5.2737, + "step": 7060 + }, + { + "epoch": 9.040920716112533, + "grad_norm": 1.089858889579773, + "learning_rate": 0.00017584348243668556, + "loss": 5.262, + "step": 7070 + }, + { + "epoch": 9.053708439897699, + "grad_norm": 2.534782886505127, + "learning_rate": 0.00017554278657702549, + "loss": 5.2854, + "step": 7080 + }, + { + "epoch": 9.066496163682864, + "grad_norm": 1.3707194328308105, + "learning_rate": 0.00017524198496909373, + "loss": 5.2936, + "step": 7090 + }, + { + "epoch": 9.07928388746803, + "grad_norm": 1.4009782075881958, + "learning_rate": 0.0001749410788582223, + "loss": 5.2795, + "step": 7100 + }, + { + "epoch": 9.092071611253196, + "grad_norm": 1.1263126134872437, + "learning_rate": 0.00017464006949017584, + "loss": 5.2806, + "step": 7110 + }, + { + "epoch": 9.104859335038363, + "grad_norm": 0.9099717140197754, + "learning_rate": 0.00017433895811114658, + "loss": 5.3049, + "step": 7120 + }, + { + "epoch": 9.117647058823529, + "grad_norm": 1.3195571899414062, + "learning_rate": 0.00017403774596774893, + "loss": 5.2803, + "step": 7130 + }, + { + "epoch": 9.130434782608695, + "grad_norm": 1.1069055795669556, + "learning_rate": 0.00017373643430701463, + "loss": 5.2579, + "step": 7140 + }, + { + "epoch": 9.143222506393862, + "grad_norm": 1.1975563764572144, + "learning_rate": 0.00017343502437638727, + "loss": 5.2795, + "step": 7150 + }, + { + "epoch": 9.156010230179028, + "grad_norm": 1.6220418214797974, + "learning_rate": 0.00017313351742371746, + "loss": 5.2797, + "step": 7160 + }, + { + "epoch": 9.168797953964194, + "grad_norm": 1.490394115447998, + "learning_rate": 0.00017283191469725728, + "loss": 5.2768, + "step": 7170 + }, + { + "epoch": 9.18158567774936, + "grad_norm": 1.4838827848434448, + "learning_rate": 0.00017253021744565548, + "loss": 5.2871, + "step": 7180 + }, + { + "epoch": 9.194373401534527, + "grad_norm": 5.531613826751709, + "learning_rate": 0.0001722284269179521, + "loss": 5.2537, + "step": 7190 + }, + { + "epoch": 9.207161125319693, + "grad_norm": 1.5700544118881226, + "learning_rate": 0.0001719265443635733, + "loss": 5.2875, + "step": 7200 + }, + { + "epoch": 9.21994884910486, + "grad_norm": 1.2028346061706543, + "learning_rate": 0.00017162457103232632, + "loss": 5.2707, + "step": 7210 + }, + { + "epoch": 9.232736572890026, + "grad_norm": 1.9414821863174438, + "learning_rate": 0.00017132250817439412, + "loss": 5.2918, + "step": 7220 + }, + { + "epoch": 9.245524296675192, + "grad_norm": 3.8366823196411133, + "learning_rate": 0.00017102035704033038, + "loss": 5.277, + "step": 7230 + }, + { + "epoch": 9.258312020460359, + "grad_norm": 41.69551086425781, + "learning_rate": 0.0001707181188810542, + "loss": 5.2691, + "step": 7240 + }, + { + "epoch": 9.271099744245525, + "grad_norm": 2.0435402393341064, + "learning_rate": 0.00017041579494784506, + "loss": 5.3075, + "step": 7250 + }, + { + "epoch": 9.28388746803069, + "grad_norm": 2.1493489742279053, + "learning_rate": 0.00017011338649233743, + "loss": 5.3234, + "step": 7260 + }, + { + "epoch": 9.296675191815856, + "grad_norm": 3.627615213394165, + "learning_rate": 0.0001698108947665158, + "loss": 5.3018, + "step": 7270 + }, + { + "epoch": 9.309462915601022, + "grad_norm": 24.722545623779297, + "learning_rate": 0.00016950832102270927, + "loss": 5.3123, + "step": 7280 + }, + { + "epoch": 9.322250639386189, + "grad_norm": 19.32564353942871, + "learning_rate": 0.00016920566651358666, + "loss": 5.346, + "step": 7290 + }, + { + "epoch": 9.335038363171355, + "grad_norm": 173.1358184814453, + "learning_rate": 0.00016890293249215109, + "loss": 5.3385, + "step": 7300 + }, + { + "epoch": 9.347826086956522, + "grad_norm": 139.24111938476562, + "learning_rate": 0.0001686001202117348, + "loss": 5.3411, + "step": 7310 + }, + { + "epoch": 9.360613810741688, + "grad_norm": 12987.6923828125, + "learning_rate": 0.00016829723092599418, + "loss": 5.3288, + "step": 7320 + }, + { + "epoch": 9.373401534526854, + "grad_norm": 53.43489074707031, + "learning_rate": 0.00016799426588890427, + "loss": 5.3403, + "step": 7330 + }, + { + "epoch": 9.38618925831202, + "grad_norm": 29.375526428222656, + "learning_rate": 0.00016769122635475385, + "loss": 5.3186, + "step": 7340 + }, + { + "epoch": 9.398976982097187, + "grad_norm": 47.31606674194336, + "learning_rate": 0.00016738811357813998, + "loss": 5.3178, + "step": 7350 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 12.416561126708984, + "learning_rate": 0.00016708492881396307, + "loss": 5.3385, + "step": 7360 + }, + { + "epoch": 9.42455242966752, + "grad_norm": 9.562813758850098, + "learning_rate": 0.0001667816733174215, + "loss": 5.3481, + "step": 7370 + }, + { + "epoch": 9.437340153452686, + "grad_norm": 3.935084819793701, + "learning_rate": 0.00016647834834400654, + "loss": 5.3439, + "step": 7380 + }, + { + "epoch": 9.450127877237852, + "grad_norm": 2.569079875946045, + "learning_rate": 0.00016617495514949704, + "loss": 5.3225, + "step": 7390 + }, + { + "epoch": 9.462915601023019, + "grad_norm": 1.732500672340393, + "learning_rate": 0.0001658714949899543, + "loss": 5.3235, + "step": 7400 + }, + { + "epoch": 9.475703324808185, + "grad_norm": 1.2219961881637573, + "learning_rate": 0.00016556796912171689, + "loss": 5.3413, + "step": 7410 + }, + { + "epoch": 9.48849104859335, + "grad_norm": 0.9043082594871521, + "learning_rate": 0.00016526437880139537, + "loss": 5.288, + "step": 7420 + }, + { + "epoch": 9.501278772378516, + "grad_norm": 1.0241756439208984, + "learning_rate": 0.0001649607252858672, + "loss": 5.302, + "step": 7430 + }, + { + "epoch": 9.514066496163682, + "grad_norm": 1.5669431686401367, + "learning_rate": 0.00016465700983227138, + "loss": 5.2899, + "step": 7440 + }, + { + "epoch": 9.526854219948849, + "grad_norm": 1.5617988109588623, + "learning_rate": 0.00016435323369800344, + "loss": 5.2868, + "step": 7450 + }, + { + "epoch": 9.539641943734015, + "grad_norm": 1.1847914457321167, + "learning_rate": 0.00016404939814071003, + "loss": 5.2617, + "step": 7460 + }, + { + "epoch": 9.552429667519181, + "grad_norm": 0.8560781478881836, + "learning_rate": 0.0001637455044182839, + "loss": 5.2855, + "step": 7470 + }, + { + "epoch": 9.565217391304348, + "grad_norm": 0.9926068782806396, + "learning_rate": 0.0001634415537888585, + "loss": 5.265, + "step": 7480 + }, + { + "epoch": 9.578005115089514, + "grad_norm": 2.2098798751831055, + "learning_rate": 0.00016313754751080302, + "loss": 5.2773, + "step": 7490 + }, + { + "epoch": 9.59079283887468, + "grad_norm": 1.3162308931350708, + "learning_rate": 0.00016283348684271694, + "loss": 5.276, + "step": 7500 + }, + { + "epoch": 9.603580562659847, + "grad_norm": 1.2072679996490479, + "learning_rate": 0.00016252937304342494, + "loss": 5.2825, + "step": 7510 + }, + { + "epoch": 9.616368286445013, + "grad_norm": 1.212632656097412, + "learning_rate": 0.0001622252073719717, + "loss": 5.2609, + "step": 7520 + }, + { + "epoch": 9.62915601023018, + "grad_norm": 1.244361400604248, + "learning_rate": 0.0001619209910876165, + "loss": 5.247, + "step": 7530 + }, + { + "epoch": 9.641943734015346, + "grad_norm": 1.2935197353363037, + "learning_rate": 0.00016161672544982842, + "loss": 5.2666, + "step": 7540 + }, + { + "epoch": 9.654731457800512, + "grad_norm": 1.270849347114563, + "learning_rate": 0.00016131241171828063, + "loss": 5.2556, + "step": 7550 + }, + { + "epoch": 9.667519181585678, + "grad_norm": 1.5244646072387695, + "learning_rate": 0.00016100805115284555, + "loss": 5.2594, + "step": 7560 + }, + { + "epoch": 9.680306905370845, + "grad_norm": 1.245428204536438, + "learning_rate": 0.00016070364501358944, + "loss": 5.2452, + "step": 7570 + }, + { + "epoch": 9.693094629156011, + "grad_norm": 2.1988797187805176, + "learning_rate": 0.00016039919456076727, + "loss": 5.289, + "step": 7580 + }, + { + "epoch": 9.705882352941176, + "grad_norm": 4.7878031730651855, + "learning_rate": 0.00016009470105481736, + "loss": 5.2933, + "step": 7590 + }, + { + "epoch": 9.718670076726342, + "grad_norm": 1.0919615030288696, + "learning_rate": 0.00015979016575635644, + "loss": 5.2634, + "step": 7600 + }, + { + "epoch": 9.731457800511508, + "grad_norm": 1.3478410243988037, + "learning_rate": 0.00015948558992617416, + "loss": 5.2808, + "step": 7610 + }, + { + "epoch": 9.744245524296675, + "grad_norm": 1.088616132736206, + "learning_rate": 0.00015918097482522798, + "loss": 5.2656, + "step": 7620 + }, + { + "epoch": 9.757033248081841, + "grad_norm": 1.4594988822937012, + "learning_rate": 0.00015887632171463794, + "loss": 5.2422, + "step": 7630 + }, + { + "epoch": 9.769820971867007, + "grad_norm": 1.346463918685913, + "learning_rate": 0.00015857163185568153, + "loss": 5.2668, + "step": 7640 + }, + { + "epoch": 9.782608695652174, + "grad_norm": 2.3005871772766113, + "learning_rate": 0.00015826690650978825, + "loss": 5.2295, + "step": 7650 + }, + { + "epoch": 9.79539641943734, + "grad_norm": 1.372439980506897, + "learning_rate": 0.0001579621469385346, + "loss": 5.2419, + "step": 7660 + }, + { + "epoch": 9.808184143222507, + "grad_norm": 1.0069390535354614, + "learning_rate": 0.00015765735440363872, + "loss": 5.2692, + "step": 7670 + }, + { + "epoch": 9.820971867007673, + "grad_norm": 1.3591490983963013, + "learning_rate": 0.00015735253016695527, + "loss": 5.269, + "step": 7680 + }, + { + "epoch": 9.83375959079284, + "grad_norm": 1.47946298122406, + "learning_rate": 0.00015704767549047015, + "loss": 5.2615, + "step": 7690 + }, + { + "epoch": 9.846547314578006, + "grad_norm": 2.2882235050201416, + "learning_rate": 0.00015674279163629528, + "loss": 5.2452, + "step": 7700 + }, + { + "epoch": 9.859335038363172, + "grad_norm": 1.406764268875122, + "learning_rate": 0.00015643787986666333, + "loss": 5.2515, + "step": 7710 + }, + { + "epoch": 9.872122762148338, + "grad_norm": 0.9985896348953247, + "learning_rate": 0.00015613294144392256, + "loss": 5.2536, + "step": 7720 + }, + { + "epoch": 9.884910485933505, + "grad_norm": 1.1201996803283691, + "learning_rate": 0.00015582797763053166, + "loss": 5.2459, + "step": 7730 + }, + { + "epoch": 9.89769820971867, + "grad_norm": 1.2289462089538574, + "learning_rate": 0.00015552298968905432, + "loss": 5.2162, + "step": 7740 + }, + { + "epoch": 9.910485933503836, + "grad_norm": 1.3729525804519653, + "learning_rate": 0.00015521797888215424, + "loss": 5.2488, + "step": 7750 + }, + { + "epoch": 9.923273657289002, + "grad_norm": 1.3408769369125366, + "learning_rate": 0.00015491294647258967, + "loss": 5.2608, + "step": 7760 + }, + { + "epoch": 9.936061381074168, + "grad_norm": 1.3026928901672363, + "learning_rate": 0.0001546078937232083, + "loss": 5.227, + "step": 7770 + }, + { + "epoch": 9.948849104859335, + "grad_norm": 1.3715628385543823, + "learning_rate": 0.00015430282189694212, + "loss": 5.2677, + "step": 7780 + }, + { + "epoch": 9.961636828644501, + "grad_norm": 1.0734012126922607, + "learning_rate": 0.00015399773225680208, + "loss": 5.2575, + "step": 7790 + }, + { + "epoch": 9.974424552429667, + "grad_norm": 1.0179634094238281, + "learning_rate": 0.00015369262606587281, + "loss": 5.2117, + "step": 7800 + }, + { + "epoch": 9.987212276214834, + "grad_norm": 1.2135021686553955, + "learning_rate": 0.00015338750458730746, + "loss": 5.2387, + "step": 7810 + }, + { + "epoch": 10.0, + "grad_norm": 1.7299256324768066, + "learning_rate": 0.00015308236908432264, + "loss": 5.2517, + "step": 7820 + }, + { + "epoch": 10.012787723785166, + "grad_norm": 1.3696908950805664, + "learning_rate": 0.00015277722082019272, + "loss": 5.2373, + "step": 7830 + }, + { + "epoch": 10.025575447570333, + "grad_norm": 1.4967882633209229, + "learning_rate": 0.00015247206105824522, + "loss": 5.2045, + "step": 7840 + }, + { + "epoch": 10.038363171355499, + "grad_norm": 1.4495280981063843, + "learning_rate": 0.00015216689106185505, + "loss": 5.2286, + "step": 7850 + }, + { + "epoch": 10.051150895140665, + "grad_norm": 2.255189895629883, + "learning_rate": 0.00015186171209443958, + "loss": 5.1753, + "step": 7860 + }, + { + "epoch": 10.063938618925832, + "grad_norm": 1.22813081741333, + "learning_rate": 0.00015155652541945326, + "loss": 5.2259, + "step": 7870 + }, + { + "epoch": 10.076726342710998, + "grad_norm": 1.1559652090072632, + "learning_rate": 0.00015125133230038256, + "loss": 5.1997, + "step": 7880 + }, + { + "epoch": 10.089514066496164, + "grad_norm": 1.5692169666290283, + "learning_rate": 0.00015094613400074052, + "loss": 5.2184, + "step": 7890 + }, + { + "epoch": 10.10230179028133, + "grad_norm": 1.143264651298523, + "learning_rate": 0.00015064093178406165, + "loss": 5.2217, + "step": 7900 + }, + { + "epoch": 10.115089514066495, + "grad_norm": 1.3026010990142822, + "learning_rate": 0.00015033572691389673, + "loss": 5.2219, + "step": 7910 + }, + { + "epoch": 10.127877237851662, + "grad_norm": 1.255898356437683, + "learning_rate": 0.00015003052065380742, + "loss": 5.1964, + "step": 7920 + }, + { + "epoch": 10.140664961636828, + "grad_norm": 1.5015469789505005, + "learning_rate": 0.00014972531426736131, + "loss": 5.241, + "step": 7930 + }, + { + "epoch": 10.153452685421994, + "grad_norm": 1.5134257078170776, + "learning_rate": 0.0001494201090181263, + "loss": 5.2255, + "step": 7940 + }, + { + "epoch": 10.16624040920716, + "grad_norm": 1.295350432395935, + "learning_rate": 0.00014911490616966575, + "loss": 5.2201, + "step": 7950 + }, + { + "epoch": 10.179028132992327, + "grad_norm": 1.8612785339355469, + "learning_rate": 0.000148809706985533, + "loss": 5.2109, + "step": 7960 + }, + { + "epoch": 10.191815856777493, + "grad_norm": 1.5443718433380127, + "learning_rate": 0.0001485045127292662, + "loss": 5.2247, + "step": 7970 + }, + { + "epoch": 10.20460358056266, + "grad_norm": 1.7104902267456055, + "learning_rate": 0.00014819932466438317, + "loss": 5.2141, + "step": 7980 + }, + { + "epoch": 10.217391304347826, + "grad_norm": 1.7787065505981445, + "learning_rate": 0.00014789414405437607, + "loss": 5.1911, + "step": 7990 + }, + { + "epoch": 10.230179028132993, + "grad_norm": 1.0850389003753662, + "learning_rate": 0.0001475889721627062, + "loss": 5.2156, + "step": 8000 + }, + { + "epoch": 1.0252143862792782, + "grad_norm": 1.8151112794876099, + "learning_rate": 0.000298150415790792, + "loss": 5.5132, + "step": 8010 + }, + { + "epoch": 1.0264943043645207, + "grad_norm": 1.6067150831222534, + "learning_rate": 0.0002981456854903832, + "loss": 5.4928, + "step": 8020 + }, + { + "epoch": 1.0277742224497632, + "grad_norm": 1.838860273361206, + "learning_rate": 0.00029814094918646013, + "loss": 5.4783, + "step": 8030 + }, + { + "epoch": 1.0290541405350058, + "grad_norm": 1.2739697694778442, + "learning_rate": 0.0002981362068792148, + "loss": 5.481, + "step": 8040 + }, + { + "epoch": 1.0303340586202483, + "grad_norm": 1.7215646505355835, + "learning_rate": 0.0002981314585688393, + "loss": 5.4588, + "step": 8050 + }, + { + "epoch": 1.0316139767054908, + "grad_norm": 1.9005746841430664, + "learning_rate": 0.00029812670425552603, + "loss": 5.4475, + "step": 8060 + }, + { + "epoch": 1.0328938947907333, + "grad_norm": 1.0230218172073364, + "learning_rate": 0.00029812194393946776, + "loss": 5.4565, + "step": 8070 + }, + { + "epoch": 1.0341738128759759, + "grad_norm": 1.4073644876480103, + "learning_rate": 0.0002981171776208573, + "loss": 5.4663, + "step": 8080 + }, + { + "epoch": 1.0354537309612184, + "grad_norm": 2.152285575866699, + "learning_rate": 0.0002981124052998879, + "loss": 5.4846, + "step": 8090 + }, + { + "epoch": 1.036733649046461, + "grad_norm": 2.273869752883911, + "learning_rate": 0.0002981076269767529, + "loss": 5.4498, + "step": 8100 + }, + { + "epoch": 1.0380135671317037, + "grad_norm": 1.4030476808547974, + "learning_rate": 0.00029810284265164585, + "loss": 5.4423, + "step": 8110 + }, + { + "epoch": 1.0392934852169462, + "grad_norm": 1.3551042079925537, + "learning_rate": 0.00029809805232476074, + "loss": 5.4515, + "step": 8120 + }, + { + "epoch": 1.0405734033021887, + "grad_norm": 1.278965711593628, + "learning_rate": 0.00029809325599629174, + "loss": 5.4393, + "step": 8130 + }, + { + "epoch": 1.0418533213874313, + "grad_norm": 1.7702277898788452, + "learning_rate": 0.00029808845366643315, + "loss": 5.432, + "step": 8140 + }, + { + "epoch": 1.0431332394726738, + "grad_norm": 1.435707449913025, + "learning_rate": 0.0002980836453353795, + "loss": 5.4574, + "step": 8150 + }, + { + "epoch": 1.0444131575579163, + "grad_norm": 1.4043419361114502, + "learning_rate": 0.0002980788310033258, + "loss": 5.4436, + "step": 8160 + }, + { + "epoch": 1.0456930756431588, + "grad_norm": 1.6054362058639526, + "learning_rate": 0.00029807401067046707, + "loss": 5.4293, + "step": 8170 + }, + { + "epoch": 1.0469729937284014, + "grad_norm": 1.724361538887024, + "learning_rate": 0.00029806918433699866, + "loss": 5.4381, + "step": 8180 + }, + { + "epoch": 1.0482529118136439, + "grad_norm": 1.4272695779800415, + "learning_rate": 0.0002980643520031161, + "loss": 5.4341, + "step": 8190 + }, + { + "epoch": 1.0495328298988864, + "grad_norm": 1.6520978212356567, + "learning_rate": 0.0002980595136690153, + "loss": 5.433, + "step": 8200 + }, + { + "epoch": 1.050812747984129, + "grad_norm": 1.906540870666504, + "learning_rate": 0.0002980546693348923, + "loss": 5.4493, + "step": 8210 + }, + { + "epoch": 1.0520926660693715, + "grad_norm": 1.3540748357772827, + "learning_rate": 0.00029804981900094344, + "loss": 5.4291, + "step": 8220 + }, + { + "epoch": 1.0533725841546142, + "grad_norm": 1.2956520318984985, + "learning_rate": 0.0002980449626673652, + "loss": 5.4322, + "step": 8230 + }, + { + "epoch": 1.0546525022398567, + "grad_norm": 2.1103005409240723, + "learning_rate": 0.0002980401003343545, + "loss": 5.4154, + "step": 8240 + }, + { + "epoch": 1.0559324203250993, + "grad_norm": 1.9782565832138062, + "learning_rate": 0.00029803523200210827, + "loss": 5.4287, + "step": 8250 + }, + { + "epoch": 1.0572123384103418, + "grad_norm": 1.5578869581222534, + "learning_rate": 0.0002980303576708238, + "loss": 5.4315, + "step": 8260 + }, + { + "epoch": 1.0584922564955843, + "grad_norm": 1.8073164224624634, + "learning_rate": 0.0002980254773406987, + "loss": 5.4278, + "step": 8270 + }, + { + "epoch": 1.0597721745808268, + "grad_norm": 1.7924606800079346, + "learning_rate": 0.0002980205910119307, + "loss": 5.4216, + "step": 8280 + }, + { + "epoch": 1.0610520926660694, + "grad_norm": 1.3763577938079834, + "learning_rate": 0.00029801569868471783, + "loss": 5.4291, + "step": 8290 + }, + { + "epoch": 1.062332010751312, + "grad_norm": 1.5295467376708984, + "learning_rate": 0.00029801080035925833, + "loss": 5.4014, + "step": 8300 + }, + { + "epoch": 1.0636119288365544, + "grad_norm": 2.3050379753112793, + "learning_rate": 0.0002980058960357507, + "loss": 5.4266, + "step": 8310 + }, + { + "epoch": 1.064891846921797, + "grad_norm": 2.043785333633423, + "learning_rate": 0.0002980009857143937, + "loss": 5.4185, + "step": 8320 + }, + { + "epoch": 1.0661717650070395, + "grad_norm": 1.808798909187317, + "learning_rate": 0.0002979960693953863, + "loss": 5.4153, + "step": 8330 + }, + { + "epoch": 1.067451683092282, + "grad_norm": 1.8030039072036743, + "learning_rate": 0.00029799114707892776, + "loss": 5.4243, + "step": 8340 + }, + { + "epoch": 1.0687316011775247, + "grad_norm": 1.0454009771347046, + "learning_rate": 0.0002979862187652175, + "loss": 5.4105, + "step": 8350 + }, + { + "epoch": 1.0700115192627673, + "grad_norm": 1.3364286422729492, + "learning_rate": 0.0002979812844544553, + "loss": 5.4189, + "step": 8360 + }, + { + "epoch": 1.0712914373480098, + "grad_norm": 2.3642287254333496, + "learning_rate": 0.00029797634414684105, + "loss": 5.4119, + "step": 8370 + }, + { + "epoch": 1.0725713554332523, + "grad_norm": 1.865700125694275, + "learning_rate": 0.00029797139784257503, + "loss": 5.4069, + "step": 8380 + }, + { + "epoch": 1.0738512735184949, + "grad_norm": 1.0280027389526367, + "learning_rate": 0.0002979664455418576, + "loss": 5.4096, + "step": 8390 + }, + { + "epoch": 1.0751311916037374, + "grad_norm": 1.3924864530563354, + "learning_rate": 0.00029796148724488963, + "loss": 5.3929, + "step": 8400 + }, + { + "epoch": 1.07641110968898, + "grad_norm": 1.2913914918899536, + "learning_rate": 0.0002979565229518718, + "loss": 5.4008, + "step": 8410 + }, + { + "epoch": 1.0776910277742224, + "grad_norm": 1.4355765581130981, + "learning_rate": 0.00029795155266300544, + "loss": 5.388, + "step": 8420 + }, + { + "epoch": 1.078970945859465, + "grad_norm": 1.5805541276931763, + "learning_rate": 0.00029794657637849197, + "loss": 5.4101, + "step": 8430 + }, + { + "epoch": 1.0802508639447075, + "grad_norm": 1.2328497171401978, + "learning_rate": 0.000297941594098533, + "loss": 5.3967, + "step": 8440 + }, + { + "epoch": 1.08153078202995, + "grad_norm": 2.1215150356292725, + "learning_rate": 0.0002979366058233304, + "loss": 5.3945, + "step": 8450 + }, + { + "epoch": 1.0828107001151925, + "grad_norm": 1.345954418182373, + "learning_rate": 0.00029793161155308636, + "loss": 5.3928, + "step": 8460 + }, + { + "epoch": 1.084090618200435, + "grad_norm": 1.8684533834457397, + "learning_rate": 0.00029792661128800335, + "loss": 5.3869, + "step": 8470 + }, + { + "epoch": 1.0853705362856778, + "grad_norm": 1.3567562103271484, + "learning_rate": 0.00029792160502828386, + "loss": 5.3845, + "step": 8480 + }, + { + "epoch": 1.0866504543709203, + "grad_norm": 1.8894990682601929, + "learning_rate": 0.0002979165927741309, + "loss": 5.3699, + "step": 8490 + }, + { + "epoch": 1.0879303724561629, + "grad_norm": 1.4628643989562988, + "learning_rate": 0.00029791157452574744, + "loss": 5.3933, + "step": 8500 + }, + { + "epoch": 1.0892102905414054, + "grad_norm": 1.6238657236099243, + "learning_rate": 0.00029790655028333704, + "loss": 5.3868, + "step": 8510 + }, + { + "epoch": 1.090490208626648, + "grad_norm": 2.4890058040618896, + "learning_rate": 0.0002979015200471031, + "loss": 5.4187, + "step": 8520 + }, + { + "epoch": 1.0917701267118904, + "grad_norm": 1.314422845840454, + "learning_rate": 0.0002978964838172496, + "loss": 5.4416, + "step": 8530 + }, + { + "epoch": 1.093050044797133, + "grad_norm": 2.026683807373047, + "learning_rate": 0.0002978914415939805, + "loss": 5.4036, + "step": 8540 + }, + { + "epoch": 1.0943299628823755, + "grad_norm": 1.7179348468780518, + "learning_rate": 0.0002978863933775003, + "loss": 5.4044, + "step": 8550 + }, + { + "epoch": 1.095609880967618, + "grad_norm": 1.7547870874404907, + "learning_rate": 0.0002978813391680135, + "loss": 5.383, + "step": 8560 + }, + { + "epoch": 1.0968897990528605, + "grad_norm": 2.172962188720703, + "learning_rate": 0.00029787627896572485, + "loss": 5.3926, + "step": 8570 + }, + { + "epoch": 1.098169717138103, + "grad_norm": 1.2563143968582153, + "learning_rate": 0.00029787121277083953, + "loss": 5.386, + "step": 8580 + }, + { + "epoch": 1.0994496352233458, + "grad_norm": 1.3007797002792358, + "learning_rate": 0.0002978661405835628, + "loss": 5.3973, + "step": 8590 + }, + { + "epoch": 1.1007295533085883, + "grad_norm": 1.491527795791626, + "learning_rate": 0.0002978610624041002, + "loss": 5.3711, + "step": 8600 + }, + { + "epoch": 1.1020094713938309, + "grad_norm": 1.8254247903823853, + "learning_rate": 0.00029785597823265745, + "loss": 5.3771, + "step": 8610 + }, + { + "epoch": 1.1032893894790734, + "grad_norm": 1.2141834497451782, + "learning_rate": 0.0002978508880694407, + "loss": 5.3485, + "step": 8620 + }, + { + "epoch": 1.104569307564316, + "grad_norm": 2.4979705810546875, + "learning_rate": 0.00029784579191465617, + "loss": 5.3651, + "step": 8630 + }, + { + "epoch": 1.1058492256495585, + "grad_norm": 1.4393336772918701, + "learning_rate": 0.00029784068976851033, + "loss": 5.3681, + "step": 8640 + }, + { + "epoch": 1.107129143734801, + "grad_norm": 1.0520734786987305, + "learning_rate": 0.00029783558163121003, + "loss": 5.3689, + "step": 8650 + }, + { + "epoch": 1.1084090618200435, + "grad_norm": 1.4579353332519531, + "learning_rate": 0.0002978304675029623, + "loss": 5.3684, + "step": 8660 + }, + { + "epoch": 1.109688979905286, + "grad_norm": 1.8464175462722778, + "learning_rate": 0.0002978253473839743, + "loss": 5.3787, + "step": 8670 + }, + { + "epoch": 1.1109688979905286, + "grad_norm": 1.292655110359192, + "learning_rate": 0.00029782022127445353, + "loss": 5.3641, + "step": 8680 + }, + { + "epoch": 1.112248816075771, + "grad_norm": 1.6093817949295044, + "learning_rate": 0.0002978150891746077, + "loss": 5.3623, + "step": 8690 + }, + { + "epoch": 1.1135287341610136, + "grad_norm": 1.0206555128097534, + "learning_rate": 0.00029780995108464486, + "loss": 5.3739, + "step": 8700 + }, + { + "epoch": 1.1148086522462561, + "grad_norm": 1.2137470245361328, + "learning_rate": 0.0002978048070047732, + "loss": 5.3592, + "step": 8710 + }, + { + "epoch": 1.1160885703314989, + "grad_norm": 2.88940167427063, + "learning_rate": 0.0002977996569352012, + "loss": 5.3671, + "step": 8720 + }, + { + "epoch": 1.1173684884167414, + "grad_norm": 1.197797179222107, + "learning_rate": 0.0002977945008761375, + "loss": 5.3451, + "step": 8730 + }, + { + "epoch": 1.118648406501984, + "grad_norm": 1.198354959487915, + "learning_rate": 0.0002977893388277911, + "loss": 5.3827, + "step": 8740 + }, + { + "epoch": 1.1199283245872265, + "grad_norm": 1.0703370571136475, + "learning_rate": 0.0002977841707903711, + "loss": 5.3249, + "step": 8750 + }, + { + "epoch": 1.121208242672469, + "grad_norm": 1.3689323663711548, + "learning_rate": 0.0002977789967640871, + "loss": 5.3569, + "step": 8760 + }, + { + "epoch": 1.1224881607577115, + "grad_norm": 1.3093551397323608, + "learning_rate": 0.0002977738167491486, + "loss": 5.3379, + "step": 8770 + }, + { + "epoch": 1.123768078842954, + "grad_norm": 1.698310136795044, + "learning_rate": 0.0002977686307457657, + "loss": 5.3638, + "step": 8780 + }, + { + "epoch": 1.1250479969281966, + "grad_norm": 1.5317034721374512, + "learning_rate": 0.00029776343875414834, + "loss": 5.3916, + "step": 8790 + }, + { + "epoch": 1.126327915013439, + "grad_norm": 1.757363200187683, + "learning_rate": 0.0002977582407745071, + "loss": 5.3482, + "step": 8800 + }, + { + "epoch": 1.1276078330986816, + "grad_norm": 1.6926486492156982, + "learning_rate": 0.00029775303680705257, + "loss": 5.34, + "step": 8810 + }, + { + "epoch": 1.1288877511839241, + "grad_norm": 2.5482800006866455, + "learning_rate": 0.0002977478268519956, + "loss": 5.3665, + "step": 8820 + }, + { + "epoch": 1.130167669269167, + "grad_norm": 2.2844486236572266, + "learning_rate": 0.0002977426109095474, + "loss": 5.3552, + "step": 8830 + }, + { + "epoch": 1.1314475873544092, + "grad_norm": 2.151538372039795, + "learning_rate": 0.0002977373889799192, + "loss": 5.3469, + "step": 8840 + }, + { + "epoch": 1.132727505439652, + "grad_norm": 1.4821778535842896, + "learning_rate": 0.00029773216106332277, + "loss": 5.3422, + "step": 8850 + }, + { + "epoch": 1.1340074235248945, + "grad_norm": 1.5019316673278809, + "learning_rate": 0.00029772692715996984, + "loss": 5.3551, + "step": 8860 + }, + { + "epoch": 1.135287341610137, + "grad_norm": 1.7702430486679077, + "learning_rate": 0.0002977216872700727, + "loss": 5.3309, + "step": 8870 + }, + { + "epoch": 1.1365672596953795, + "grad_norm": 1.5363876819610596, + "learning_rate": 0.00029771644139384347, + "loss": 5.339, + "step": 8880 + }, + { + "epoch": 1.137847177780622, + "grad_norm": 1.330912470817566, + "learning_rate": 0.00029771118953149486, + "loss": 5.369, + "step": 8890 + }, + { + "epoch": 1.1391270958658646, + "grad_norm": 1.5051958560943604, + "learning_rate": 0.00029770593168323967, + "loss": 5.3514, + "step": 8900 + }, + { + "epoch": 1.140407013951107, + "grad_norm": 1.473021388053894, + "learning_rate": 0.000297700667849291, + "loss": 5.3574, + "step": 8910 + }, + { + "epoch": 1.1416869320363496, + "grad_norm": 1.1854453086853027, + "learning_rate": 0.0002976953980298621, + "loss": 5.3291, + "step": 8920 + }, + { + "epoch": 1.1429668501215922, + "grad_norm": 1.426828145980835, + "learning_rate": 0.0002976901222251666, + "loss": 5.3342, + "step": 8930 + }, + { + "epoch": 1.1442467682068347, + "grad_norm": 2.1792798042297363, + "learning_rate": 0.0002976848404354183, + "loss": 5.3406, + "step": 8940 + }, + { + "epoch": 1.1455266862920772, + "grad_norm": 1.5441704988479614, + "learning_rate": 0.00029767955266083113, + "loss": 5.3278, + "step": 8950 + }, + { + "epoch": 1.14680660437732, + "grad_norm": 1.8839783668518066, + "learning_rate": 0.0002976742589016195, + "loss": 5.3459, + "step": 8960 + }, + { + "epoch": 1.1480865224625625, + "grad_norm": 1.377284288406372, + "learning_rate": 0.00029766895915799783, + "loss": 5.2889, + "step": 8970 + }, + { + "epoch": 1.149366440547805, + "grad_norm": 1.5436594486236572, + "learning_rate": 0.000297663653430181, + "loss": 5.3387, + "step": 8980 + }, + { + "epoch": 1.1506463586330475, + "grad_norm": 1.8926775455474854, + "learning_rate": 0.00029765834171838397, + "loss": 5.3313, + "step": 8990 + }, + { + "epoch": 1.15192627671829, + "grad_norm": 2.016495704650879, + "learning_rate": 0.00029765302402282197, + "loss": 5.3102, + "step": 9000 + }, + { + "epoch": 1.1532061948035326, + "grad_norm": 1.5337920188903809, + "learning_rate": 0.0002976477003437105, + "loss": 5.3099, + "step": 9010 + }, + { + "epoch": 1.1544861128887751, + "grad_norm": 1.6861891746520996, + "learning_rate": 0.00029764237068126534, + "loss": 5.3177, + "step": 9020 + }, + { + "epoch": 1.1557660309740176, + "grad_norm": 11.703009605407715, + "learning_rate": 0.00029763703503570243, + "loss": 5.3458, + "step": 9030 + }, + { + "epoch": 1.1570459490592602, + "grad_norm": 1.3665626049041748, + "learning_rate": 0.000297631693407238, + "loss": 5.3726, + "step": 9040 + }, + { + "epoch": 1.1583258671445027, + "grad_norm": 1.9868730306625366, + "learning_rate": 0.0002976263457960886, + "loss": 5.3459, + "step": 9050 + }, + { + "epoch": 1.1596057852297452, + "grad_norm": 6.771224021911621, + "learning_rate": 0.0002976209922024708, + "loss": 5.4621, + "step": 9060 + }, + { + "epoch": 1.1608857033149877, + "grad_norm": 1.730333685874939, + "learning_rate": 0.00029761563262660165, + "loss": 5.3924, + "step": 9070 + }, + { + "epoch": 1.1621656214002303, + "grad_norm": 1.123976707458496, + "learning_rate": 0.0002976102670686983, + "loss": 5.3687, + "step": 9080 + }, + { + "epoch": 1.163445539485473, + "grad_norm": 2.981642723083496, + "learning_rate": 0.00029760489552897815, + "loss": 5.3319, + "step": 9090 + }, + { + "epoch": 1.1647254575707155, + "grad_norm": 1.4254289865493774, + "learning_rate": 0.00029759951800765896, + "loss": 5.3132, + "step": 9100 + }, + { + "epoch": 1.166005375655958, + "grad_norm": 1.141530156135559, + "learning_rate": 0.00029759413450495864, + "loss": 5.3433, + "step": 9110 + }, + { + "epoch": 1.1672852937412006, + "grad_norm": 1.7244399785995483, + "learning_rate": 0.0002975887450210953, + "loss": 5.3198, + "step": 9120 + }, + { + "epoch": 1.1685652118264431, + "grad_norm": 1.030436396598816, + "learning_rate": 0.00029758334955628737, + "loss": 5.3244, + "step": 9130 + }, + { + "epoch": 1.1698451299116857, + "grad_norm": 1.2787984609603882, + "learning_rate": 0.0002975779481107535, + "loss": 5.3185, + "step": 9140 + }, + { + "epoch": 1.1711250479969282, + "grad_norm": 1.4879578351974487, + "learning_rate": 0.00029757254068471257, + "loss": 5.3408, + "step": 9150 + }, + { + "epoch": 1.1724049660821707, + "grad_norm": 1.3642961978912354, + "learning_rate": 0.0002975671272783838, + "loss": 5.3289, + "step": 9160 + }, + { + "epoch": 1.1736848841674132, + "grad_norm": 1.4523587226867676, + "learning_rate": 0.0002975617078919864, + "loss": 5.3273, + "step": 9170 + }, + { + "epoch": 1.1749648022526558, + "grad_norm": 1.5739113092422485, + "learning_rate": 0.00029755628252574014, + "loss": 5.3224, + "step": 9180 + }, + { + "epoch": 1.1762447203378983, + "grad_norm": 1.1182880401611328, + "learning_rate": 0.0002975508511798648, + "loss": 5.3045, + "step": 9190 + }, + { + "epoch": 1.177524638423141, + "grad_norm": 1.6098095178604126, + "learning_rate": 0.00029754541385458046, + "loss": 5.3356, + "step": 9200 + }, + { + "epoch": 1.1788045565083833, + "grad_norm": 1.4355725049972534, + "learning_rate": 0.0002975399705501076, + "loss": 5.3156, + "step": 9210 + }, + { + "epoch": 1.180084474593626, + "grad_norm": 1.7407658100128174, + "learning_rate": 0.0002975345212666666, + "loss": 5.3086, + "step": 9220 + }, + { + "epoch": 1.1813643926788686, + "grad_norm": 1.4330238103866577, + "learning_rate": 0.00029752906600447846, + "loss": 5.3063, + "step": 9230 + }, + { + "epoch": 1.1826443107641111, + "grad_norm": 1.4022783041000366, + "learning_rate": 0.0002975236047637642, + "loss": 5.312, + "step": 9240 + }, + { + "epoch": 1.1839242288493537, + "grad_norm": 1.6006667613983154, + "learning_rate": 0.0002975181375447451, + "loss": 5.3361, + "step": 9250 + }, + { + "epoch": 1.1852041469345962, + "grad_norm": 1.2444195747375488, + "learning_rate": 0.00029751266434764276, + "loss": 5.2986, + "step": 9260 + }, + { + "epoch": 1.1864840650198387, + "grad_norm": 1.4725539684295654, + "learning_rate": 0.0002975071851726789, + "loss": 5.2853, + "step": 9270 + }, + { + "epoch": 1.1877639831050812, + "grad_norm": 1.2328054904937744, + "learning_rate": 0.0002975017000200757, + "loss": 5.3049, + "step": 9280 + }, + { + "epoch": 1.1890439011903238, + "grad_norm": 1.7641607522964478, + "learning_rate": 0.00029749620889005533, + "loss": 5.3093, + "step": 9290 + }, + { + "epoch": 1.1903238192755663, + "grad_norm": 1.6767491102218628, + "learning_rate": 0.0002974907117828403, + "loss": 5.2992, + "step": 9300 + }, + { + "epoch": 1.1916037373608088, + "grad_norm": 1.505649209022522, + "learning_rate": 0.00029748520869865347, + "loss": 5.3143, + "step": 9310 + }, + { + "epoch": 1.1928836554460513, + "grad_norm": 1.1579383611679077, + "learning_rate": 0.0002974796996377178, + "loss": 5.3144, + "step": 9320 + }, + { + "epoch": 1.194163573531294, + "grad_norm": 1.7368600368499756, + "learning_rate": 0.0002974741846002565, + "loss": 5.306, + "step": 9330 + }, + { + "epoch": 1.1954434916165366, + "grad_norm": 1.1850640773773193, + "learning_rate": 0.0002974686635864931, + "loss": 5.3167, + "step": 9340 + }, + { + "epoch": 1.1967234097017792, + "grad_norm": 1.958350658416748, + "learning_rate": 0.00029746313659665145, + "loss": 5.3047, + "step": 9350 + }, + { + "epoch": 1.1980033277870217, + "grad_norm": 1.1464362144470215, + "learning_rate": 0.00029745760363095533, + "loss": 5.2955, + "step": 9360 + }, + { + "epoch": 1.1992832458722642, + "grad_norm": 1.2057669162750244, + "learning_rate": 0.00029745206468962906, + "loss": 5.3028, + "step": 9370 + }, + { + "epoch": 1.2005631639575067, + "grad_norm": 1.2068003416061401, + "learning_rate": 0.00029744651977289707, + "loss": 5.3062, + "step": 9380 + }, + { + "epoch": 1.2018430820427493, + "grad_norm": 1.2438029050827026, + "learning_rate": 0.0002974409688809842, + "loss": 5.2763, + "step": 9390 + }, + { + "epoch": 1.2031230001279918, + "grad_norm": 1.4242688417434692, + "learning_rate": 0.00029743541201411513, + "loss": 5.3177, + "step": 9400 + }, + { + "epoch": 1.2044029182132343, + "grad_norm": 1.4365496635437012, + "learning_rate": 0.0002974298491725153, + "loss": 5.3017, + "step": 9410 + }, + { + "epoch": 1.2056828362984768, + "grad_norm": 1.5808701515197754, + "learning_rate": 0.00029742428035641, + "loss": 5.3031, + "step": 9420 + }, + { + "epoch": 1.2069627543837194, + "grad_norm": 6.06550931930542, + "learning_rate": 0.00029741870556602496, + "loss": 5.3044, + "step": 9430 + }, + { + "epoch": 1.208242672468962, + "grad_norm": 1.2524161338806152, + "learning_rate": 0.00029741312480158606, + "loss": 5.3139, + "step": 9440 + }, + { + "epoch": 1.2095225905542044, + "grad_norm": 1.8837299346923828, + "learning_rate": 0.00029740753806331956, + "loss": 5.2954, + "step": 9450 + }, + { + "epoch": 1.2108025086394472, + "grad_norm": 1.8030625581741333, + "learning_rate": 0.00029740194535145167, + "loss": 5.302, + "step": 9460 + }, + { + "epoch": 1.2120824267246897, + "grad_norm": 1.4304263591766357, + "learning_rate": 0.00029739634666620915, + "loss": 5.2882, + "step": 9470 + }, + { + "epoch": 1.2133623448099322, + "grad_norm": 1.8116546869277954, + "learning_rate": 0.00029739074200781893, + "loss": 5.3042, + "step": 9480 + }, + { + "epoch": 1.2146422628951747, + "grad_norm": 1.4493547677993774, + "learning_rate": 0.00029738513137650803, + "loss": 5.2958, + "step": 9490 + }, + { + "epoch": 1.2159221809804173, + "grad_norm": 3.873800754547119, + "learning_rate": 0.00029737951477250393, + "loss": 5.309, + "step": 9500 + }, + { + "epoch": 1.2172020990656598, + "grad_norm": 2.4254534244537354, + "learning_rate": 0.0002973738921960341, + "loss": 5.327, + "step": 9510 + }, + { + "epoch": 1.2184820171509023, + "grad_norm": 1.3041757345199585, + "learning_rate": 0.00029736826364732647, + "loss": 5.3251, + "step": 9520 + }, + { + "epoch": 1.2197619352361448, + "grad_norm": 2.2851760387420654, + "learning_rate": 0.00029736262912660917, + "loss": 5.3101, + "step": 9530 + }, + { + "epoch": 1.2210418533213874, + "grad_norm": 1.7676900625228882, + "learning_rate": 0.0002973569886341105, + "loss": 5.2925, + "step": 9540 + }, + { + "epoch": 1.22232177140663, + "grad_norm": 1.6757118701934814, + "learning_rate": 0.000297351342170059, + "loss": 5.278, + "step": 9550 + }, + { + "epoch": 1.2236016894918724, + "grad_norm": 258.9942321777344, + "learning_rate": 0.00029734568973468356, + "loss": 5.3329, + "step": 9560 + }, + { + "epoch": 1.2248816075771152, + "grad_norm": 67.85020446777344, + "learning_rate": 0.00029734003132821313, + "loss": 5.4291, + "step": 9570 + }, + { + "epoch": 1.2261615256623577, + "grad_norm": 172.93600463867188, + "learning_rate": 0.00029733436695087714, + "loss": 5.4633, + "step": 9580 + }, + { + "epoch": 1.2274414437476002, + "grad_norm": 59.954322814941406, + "learning_rate": 0.0002973286966029051, + "loss": 5.4602, + "step": 9590 + }, + { + "epoch": 1.2287213618328428, + "grad_norm": 418357.375, + "learning_rate": 0.0002973230202845267, + "loss": 5.4023, + "step": 9600 + }, + { + "epoch": 1.2300012799180853, + "grad_norm": 83.09380340576172, + "learning_rate": 0.00029731733799597215, + "loss": 5.4068, + "step": 9610 + }, + { + "epoch": 1.2312811980033278, + "grad_norm": 3.8474719524383545, + "learning_rate": 0.0002973116497374715, + "loss": 5.4837, + "step": 9620 + }, + { + "epoch": 1.2325611160885703, + "grad_norm": 6.988565444946289, + "learning_rate": 0.00029730595550925547, + "loss": 5.4765, + "step": 9630 + }, + { + "epoch": 1.2338410341738129, + "grad_norm": 8.367043495178223, + "learning_rate": 0.0002973002553115547, + "loss": 5.4381, + "step": 9640 + }, + { + "epoch": 1.2351209522590554, + "grad_norm": 4.276803970336914, + "learning_rate": 0.0002972945491446002, + "loss": 5.4551, + "step": 9650 + }, + { + "epoch": 1.236400870344298, + "grad_norm": 72.82675170898438, + "learning_rate": 0.00029728883700862326, + "loss": 5.4738, + "step": 9660 + }, + { + "epoch": 1.2376807884295404, + "grad_norm": 6.083178520202637, + "learning_rate": 0.0002972831189038553, + "loss": 5.4763, + "step": 9670 + }, + { + "epoch": 1.238960706514783, + "grad_norm": 1073.8052978515625, + "learning_rate": 0.0002972773948305281, + "loss": 5.4571, + "step": 9680 + }, + { + "epoch": 1.2402406246000255, + "grad_norm": 217.56842041015625, + "learning_rate": 0.0002972716647888736, + "loss": 5.5419, + "step": 9690 + }, + { + "epoch": 1.2415205426852682, + "grad_norm": 33.87669372558594, + "learning_rate": 0.000297265928779124, + "loss": 5.5087, + "step": 9700 + }, + { + "epoch": 1.2428004607705108, + "grad_norm": 12.367305755615234, + "learning_rate": 0.0002972601868015117, + "loss": 5.442, + "step": 9710 + }, + { + "epoch": 1.2440803788557533, + "grad_norm": 274.2508850097656, + "learning_rate": 0.0002972544388562694, + "loss": 5.4061, + "step": 9720 + }, + { + "epoch": 1.2453602969409958, + "grad_norm": 2.3056957721710205, + "learning_rate": 0.00029724868494363014, + "loss": 5.437, + "step": 9730 + }, + { + "epoch": 1.2466402150262383, + "grad_norm": 1.763442873954773, + "learning_rate": 0.00029724292506382705, + "loss": 5.396, + "step": 9740 + }, + { + "epoch": 1.2479201331114809, + "grad_norm": 1.2737253904342651, + "learning_rate": 0.00029723715921709345, + "loss": 5.3564, + "step": 9750 + }, + { + "epoch": 1.2492000511967234, + "grad_norm": 1.6267307996749878, + "learning_rate": 0.0002972313874036631, + "loss": 5.3609, + "step": 9760 + }, + { + "epoch": 1.250479969281966, + "grad_norm": 1.832618236541748, + "learning_rate": 0.00029722560962376984, + "loss": 5.344, + "step": 9770 + }, + { + "epoch": 1.2517598873672084, + "grad_norm": 1.918068289756775, + "learning_rate": 0.0002972198258776479, + "loss": 5.3326, + "step": 9780 + }, + { + "epoch": 1.253039805452451, + "grad_norm": 1.8401589393615723, + "learning_rate": 0.0002972140361655315, + "loss": 5.3075, + "step": 9790 + }, + { + "epoch": 1.2543197235376935, + "grad_norm": 1.50496244430542, + "learning_rate": 0.0002972082404876555, + "loss": 5.3139, + "step": 9800 + }, + { + "epoch": 1.2555996416229362, + "grad_norm": 1.468178629875183, + "learning_rate": 0.0002972024388442545, + "loss": 5.3077, + "step": 9810 + }, + { + "epoch": 1.2568795597081786, + "grad_norm": 1.319603443145752, + "learning_rate": 0.0002971966312355638, + "loss": 5.3103, + "step": 9820 + }, + { + "epoch": 1.2581594777934213, + "grad_norm": 1.3174471855163574, + "learning_rate": 0.00029719081766181864, + "loss": 5.2859, + "step": 9830 + }, + { + "epoch": 1.2594393958786638, + "grad_norm": 1.354108452796936, + "learning_rate": 0.00029718499812325476, + "loss": 5.315, + "step": 9840 + }, + { + "epoch": 1.2607193139639064, + "grad_norm": 1.3756756782531738, + "learning_rate": 0.00029717917262010783, + "loss": 5.2826, + "step": 9850 + }, + { + "epoch": 1.2619992320491489, + "grad_norm": 2.1197121143341064, + "learning_rate": 0.00029717334115261394, + "loss": 5.2865, + "step": 9860 + }, + { + "epoch": 1.2632791501343914, + "grad_norm": 1.1051539182662964, + "learning_rate": 0.00029716750372100955, + "loss": 5.3189, + "step": 9870 + }, + { + "epoch": 1.264559068219634, + "grad_norm": 7.60437536239624, + "learning_rate": 0.00029716166032553107, + "loss": 5.3171, + "step": 9880 + }, + { + "epoch": 1.2658389863048765, + "grad_norm": 392.87750244140625, + "learning_rate": 0.00029715581096641534, + "loss": 5.4252, + "step": 9890 + }, + { + "epoch": 1.267118904390119, + "grad_norm": 63.391754150390625, + "learning_rate": 0.00029714995564389943, + "loss": 5.5073, + "step": 9900 + }, + { + "epoch": 1.2683988224753615, + "grad_norm": 268.2895202636719, + "learning_rate": 0.0002971440943582206, + "loss": 5.4236, + "step": 9910 + }, + { + "epoch": 1.2696787405606043, + "grad_norm": 55.89106369018555, + "learning_rate": 0.00029713822710961645, + "loss": 5.4003, + "step": 9920 + }, + { + "epoch": 1.2709586586458466, + "grad_norm": 13.6283540725708, + "learning_rate": 0.0002971323538983246, + "loss": 5.4117, + "step": 9930 + }, + { + "epoch": 1.2722385767310893, + "grad_norm": 9.581747055053711, + "learning_rate": 0.0002971264747245832, + "loss": 5.4548, + "step": 9940 + }, + { + "epoch": 1.2735184948163318, + "grad_norm": 5.135045528411865, + "learning_rate": 0.00029712058958863036, + "loss": 5.4945, + "step": 9950 + }, + { + "epoch": 1.2747984129015744, + "grad_norm": 13.211899757385254, + "learning_rate": 0.00029711469849070474, + "loss": 5.4774, + "step": 9960 + }, + { + "epoch": 1.2760783309868169, + "grad_norm": 48.73270034790039, + "learning_rate": 0.00029710880143104493, + "loss": 5.4555, + "step": 9970 + }, + { + "epoch": 1.2773582490720594, + "grad_norm": 7.845920085906982, + "learning_rate": 0.00029710289840988993, + "loss": 5.5258, + "step": 9980 + }, + { + "epoch": 1.278638167157302, + "grad_norm": 6.870781421661377, + "learning_rate": 0.000297096989427479, + "loss": 5.4432, + "step": 9990 + }, + { + "epoch": 1.2799180852425445, + "grad_norm": 10.027936935424805, + "learning_rate": 0.0002970910744840516, + "loss": 5.4615, + "step": 10000 + }, + { + "epoch": 1.281198003327787, + "grad_norm": 40.80441665649414, + "learning_rate": 0.00029708515357984745, + "loss": 5.4906, + "step": 10010 + }, + { + "epoch": 1.2824779214130295, + "grad_norm": 11.978614807128906, + "learning_rate": 0.0002970792267151064, + "loss": 5.4575, + "step": 10020 + }, + { + "epoch": 1.283757839498272, + "grad_norm": 76.15137481689453, + "learning_rate": 0.0002970732938900687, + "loss": 5.4267, + "step": 10030 + }, + { + "epoch": 1.2850377575835146, + "grad_norm": 23.728790283203125, + "learning_rate": 0.00029706735510497476, + "loss": 5.4251, + "step": 10040 + }, + { + "epoch": 1.2863176756687573, + "grad_norm": 15.487527847290039, + "learning_rate": 0.00029706141036006526, + "loss": 5.4004, + "step": 10050 + }, + { + "epoch": 1.2875975937539996, + "grad_norm": 160.41036987304688, + "learning_rate": 0.0002970554596555811, + "loss": 5.3944, + "step": 10060 + }, + { + "epoch": 1.2888775118392424, + "grad_norm": 14.26937198638916, + "learning_rate": 0.0002970495029917634, + "loss": 5.4353, + "step": 10070 + }, + { + "epoch": 1.290157429924485, + "grad_norm": 21.807947158813477, + "learning_rate": 0.00029704354036885354, + "loss": 5.4581, + "step": 10080 + }, + { + "epoch": 1.2914373480097274, + "grad_norm": 20.17401885986328, + "learning_rate": 0.0002970375717870932, + "loss": 5.4164, + "step": 10090 + }, + { + "epoch": 1.29271726609497, + "grad_norm": 26.464344024658203, + "learning_rate": 0.00029703159724672425, + "loss": 5.4066, + "step": 10100 + }, + { + "epoch": 1.2939971841802125, + "grad_norm": 63.260562896728516, + "learning_rate": 0.0002970256167479888, + "loss": 5.4306, + "step": 10110 + }, + { + "epoch": 1.295277102265455, + "grad_norm": 270.7586669921875, + "learning_rate": 0.00029701963029112917, + "loss": 5.4221, + "step": 10120 + }, + { + "epoch": 1.2965570203506975, + "grad_norm": 104.90503692626953, + "learning_rate": 0.00029701363787638805, + "loss": 5.5096, + "step": 10130 + }, + { + "epoch": 1.29783693843594, + "grad_norm": 8.484354019165039, + "learning_rate": 0.0002970076395040082, + "loss": 5.5256, + "step": 10140 + }, + { + "epoch": 1.2991168565211826, + "grad_norm": 8.284499168395996, + "learning_rate": 0.0002970016351742327, + "loss": 5.4761, + "step": 10150 + }, + { + "epoch": 1.300396774606425, + "grad_norm": 3.632566213607788, + "learning_rate": 0.0002969956248873048, + "loss": 5.4266, + "step": 10160 + }, + { + "epoch": 1.3016766926916676, + "grad_norm": 9.005770683288574, + "learning_rate": 0.0002969896086434682, + "loss": 5.4402, + "step": 10170 + }, + { + "epoch": 1.3029566107769104, + "grad_norm": 9.097681045532227, + "learning_rate": 0.0002969835864429667, + "loss": 5.4615, + "step": 10180 + }, + { + "epoch": 1.3042365288621527, + "grad_norm": 12.6643705368042, + "learning_rate": 0.00029697755828604427, + "loss": 5.4443, + "step": 10190 + }, + { + "epoch": 1.3055164469473954, + "grad_norm": 236.66493225097656, + "learning_rate": 0.0002969715241729452, + "loss": 5.4242, + "step": 10200 + }, + { + "epoch": 1.306796365032638, + "grad_norm": 60.66410446166992, + "learning_rate": 0.0002969654841039141, + "loss": 5.46, + "step": 10210 + }, + { + "epoch": 1.3080762831178805, + "grad_norm": 9.005767822265625, + "learning_rate": 0.0002969594380791956, + "loss": 5.4581, + "step": 10220 + }, + { + "epoch": 1.309356201203123, + "grad_norm": 24.40799903869629, + "learning_rate": 0.0002969533860990349, + "loss": 5.4408, + "step": 10230 + }, + { + "epoch": 1.3106361192883655, + "grad_norm": 44.29862976074219, + "learning_rate": 0.0002969473281636771, + "loss": 5.4332, + "step": 10240 + }, + { + "epoch": 1.311916037373608, + "grad_norm": 9.148470878601074, + "learning_rate": 0.00029694126427336773, + "loss": 5.4249, + "step": 10250 + }, + { + "epoch": 1.3131959554588506, + "grad_norm": 3.238877296447754, + "learning_rate": 0.00029693519442835256, + "loss": 5.4235, + "step": 10260 + }, + { + "epoch": 1.3144758735440931, + "grad_norm": 140.43626403808594, + "learning_rate": 0.0002969291186288776, + "loss": 5.4636, + "step": 10270 + }, + { + "epoch": 1.3157557916293356, + "grad_norm": 19.890974044799805, + "learning_rate": 0.0002969230368751889, + "loss": 5.4817, + "step": 10280 + }, + { + "epoch": 1.3170357097145784, + "grad_norm": 60.46494674682617, + "learning_rate": 0.00029691694916753313, + "loss": 5.453, + "step": 10290 + }, + { + "epoch": 1.3183156277998207, + "grad_norm": 3.0658810138702393, + "learning_rate": 0.00029691085550615686, + "loss": 5.4608, + "step": 10300 + }, + { + "epoch": 1.3195955458850634, + "grad_norm": 2.8574109077453613, + "learning_rate": 0.0002969047558913071, + "loss": 5.4497, + "step": 10310 + }, + { + "epoch": 1.320875463970306, + "grad_norm": 5.469709873199463, + "learning_rate": 0.0002968986503232309, + "loss": 5.4928, + "step": 10320 + }, + { + "epoch": 1.3221553820555485, + "grad_norm": 9.013448715209961, + "learning_rate": 0.0002968925388021759, + "loss": 5.4647, + "step": 10330 + }, + { + "epoch": 1.323435300140791, + "grad_norm": 5.318248748779297, + "learning_rate": 0.00029688642132838955, + "loss": 5.4425, + "step": 10340 + }, + { + "epoch": 1.3247152182260336, + "grad_norm": 10.692891120910645, + "learning_rate": 0.00029688029790211993, + "loss": 5.4341, + "step": 10350 + }, + { + "epoch": 1.325995136311276, + "grad_norm": 11.036212921142578, + "learning_rate": 0.00029687416852361507, + "loss": 5.4143, + "step": 10360 + }, + { + "epoch": 1.3272750543965186, + "grad_norm": 8.127765655517578, + "learning_rate": 0.0002968680331931234, + "loss": 5.4221, + "step": 10370 + }, + { + "epoch": 1.3285549724817611, + "grad_norm": 8.590612411499023, + "learning_rate": 0.00029686189191089365, + "loss": 5.4127, + "step": 10380 + }, + { + "epoch": 1.3298348905670037, + "grad_norm": 23.215730667114258, + "learning_rate": 0.00029685574467717453, + "loss": 5.3911, + "step": 10390 + }, + { + "epoch": 1.3311148086522462, + "grad_norm": 16.342304229736328, + "learning_rate": 0.0002968495914922152, + "loss": 5.4043, + "step": 10400 + }, + { + "epoch": 1.3323947267374887, + "grad_norm": 119.0894546508789, + "learning_rate": 0.0002968434323562651, + "loss": 5.3897, + "step": 10410 + }, + { + "epoch": 1.3336746448227315, + "grad_norm": 3.1613729000091553, + "learning_rate": 0.00029683726726957365, + "loss": 5.4513, + "step": 10420 + }, + { + "epoch": 1.3349545629079738, + "grad_norm": 17.995119094848633, + "learning_rate": 0.0002968310962323909, + "loss": 5.4527, + "step": 10430 + }, + { + "epoch": 1.3362344809932165, + "grad_norm": 2.353546380996704, + "learning_rate": 0.00029682491924496677, + "loss": 5.4369, + "step": 10440 + }, + { + "epoch": 1.337514399078459, + "grad_norm": 3.29575514793396, + "learning_rate": 0.0002968187363075516, + "loss": 5.4181, + "step": 10450 + }, + { + "epoch": 1.3387943171637016, + "grad_norm": 42.947044372558594, + "learning_rate": 0.00029681254742039604, + "loss": 5.4268, + "step": 10460 + }, + { + "epoch": 1.340074235248944, + "grad_norm": 9.198704719543457, + "learning_rate": 0.0002968063525837508, + "loss": 5.4315, + "step": 10470 + }, + { + "epoch": 1.3413541533341866, + "grad_norm": 3.822139263153076, + "learning_rate": 0.000296800151797867, + "loss": 5.4177, + "step": 10480 + }, + { + "epoch": 1.3426340714194291, + "grad_norm": 197.9595947265625, + "learning_rate": 0.00029679394506299585, + "loss": 5.3682, + "step": 10490 + }, + { + "epoch": 1.3439139895046717, + "grad_norm": 25.585834503173828, + "learning_rate": 0.0002967877323793889, + "loss": 5.4032, + "step": 10500 + }, + { + "epoch": 1.3451939075899142, + "grad_norm": 8.87588882446289, + "learning_rate": 0.0002967815137472979, + "loss": 5.461, + "step": 10510 + }, + { + "epoch": 1.3464738256751567, + "grad_norm": 2.577976942062378, + "learning_rate": 0.0002967752891669749, + "loss": 5.4129, + "step": 10520 + }, + { + "epoch": 1.3477537437603995, + "grad_norm": 3.7082831859588623, + "learning_rate": 0.0002967690586386721, + "loss": 5.4013, + "step": 10530 + }, + { + "epoch": 1.3490336618456418, + "grad_norm": 2.4679198265075684, + "learning_rate": 0.00029676282216264206, + "loss": 5.4289, + "step": 10540 + }, + { + "epoch": 1.3503135799308845, + "grad_norm": 4.0808563232421875, + "learning_rate": 0.0002967565797391374, + "loss": 5.4205, + "step": 10550 + }, + { + "epoch": 1.351593498016127, + "grad_norm": 2.7137279510498047, + "learning_rate": 0.0002967503313684112, + "loss": 5.382, + "step": 10560 + }, + { + "epoch": 1.3528734161013696, + "grad_norm": 2.2462680339813232, + "learning_rate": 0.00029674407705071655, + "loss": 5.3692, + "step": 10570 + }, + { + "epoch": 1.354153334186612, + "grad_norm": 42.56100082397461, + "learning_rate": 0.00029673781678630703, + "loss": 5.3739, + "step": 10580 + }, + { + "epoch": 1.3554332522718546, + "grad_norm": 1.6741000413894653, + "learning_rate": 0.00029673155057543626, + "loss": 5.3682, + "step": 10590 + }, + { + "epoch": 1.3567131703570972, + "grad_norm": 3.09238338470459, + "learning_rate": 0.00029672527841835815, + "loss": 5.3518, + "step": 10600 + }, + { + "epoch": 1.3579930884423397, + "grad_norm": 4.391119003295898, + "learning_rate": 0.00029671900031532695, + "loss": 5.3582, + "step": 10610 + }, + { + "epoch": 1.3592730065275822, + "grad_norm": 9.450124740600586, + "learning_rate": 0.00029671271626659706, + "loss": 5.3564, + "step": 10620 + }, + { + "epoch": 1.3605529246128247, + "grad_norm": 7.369607925415039, + "learning_rate": 0.00029670642627242316, + "loss": 5.3938, + "step": 10630 + }, + { + "epoch": 1.3618328426980673, + "grad_norm": 30.87764549255371, + "learning_rate": 0.00029670013033306, + "loss": 5.9874, + "step": 10640 + }, + { + "epoch": 1.3631127607833098, + "grad_norm": 40.88727951049805, + "learning_rate": 0.0002966938284487629, + "loss": 5.6569, + "step": 10650 + }, + { + "epoch": 1.3643926788685525, + "grad_norm": 32.33999252319336, + "learning_rate": 0.00029668752061978715, + "loss": 5.5244, + "step": 10660 + }, + { + "epoch": 1.3656725969537948, + "grad_norm": 28.833730697631836, + "learning_rate": 0.00029668120684638835, + "loss": 5.5229, + "step": 10670 + }, + { + "epoch": 1.3669525150390376, + "grad_norm": 3.893641233444214, + "learning_rate": 0.00029667488712882247, + "loss": 5.4523, + "step": 10680 + }, + { + "epoch": 1.3682324331242801, + "grad_norm": 4.776241302490234, + "learning_rate": 0.0002966685614673455, + "loss": 5.4346, + "step": 10690 + }, + { + "epoch": 1.3695123512095226, + "grad_norm": 13.851577758789062, + "learning_rate": 0.00029666222986221384, + "loss": 5.3944, + "step": 10700 + }, + { + "epoch": 1.3707922692947652, + "grad_norm": 2.147908926010132, + "learning_rate": 0.00029665589231368404, + "loss": 5.3899, + "step": 10710 + }, + { + "epoch": 1.3720721873800077, + "grad_norm": 9.758132934570312, + "learning_rate": 0.00029664954882201297, + "loss": 5.3847, + "step": 10720 + }, + { + "epoch": 1.3733521054652502, + "grad_norm": 3.455718755722046, + "learning_rate": 0.00029664319938745763, + "loss": 5.3817, + "step": 10730 + }, + { + "epoch": 1.3746320235504927, + "grad_norm": 24.70989227294922, + "learning_rate": 0.00029663684401027543, + "loss": 5.4051, + "step": 10740 + }, + { + "epoch": 1.3759119416357353, + "grad_norm": 10.136948585510254, + "learning_rate": 0.00029663048269072385, + "loss": 5.392, + "step": 10750 + }, + { + "epoch": 1.3771918597209778, + "grad_norm": 3.9122562408447266, + "learning_rate": 0.0002966241154290607, + "loss": 5.3557, + "step": 10760 + }, + { + "epoch": 1.3784717778062203, + "grad_norm": 3.816096544265747, + "learning_rate": 0.0002966177422255439, + "loss": 5.3742, + "step": 10770 + }, + { + "epoch": 1.3797516958914628, + "grad_norm": 2.2476353645324707, + "learning_rate": 0.0002966113630804319, + "loss": 5.3807, + "step": 10780 + }, + { + "epoch": 1.3810316139767056, + "grad_norm": 3.042263984680176, + "learning_rate": 0.00029660497799398306, + "loss": 5.3864, + "step": 10790 + }, + { + "epoch": 1.382311532061948, + "grad_norm": 45.76136779785156, + "learning_rate": 0.00029659858696645626, + "loss": 5.3699, + "step": 10800 + }, + { + "epoch": 1.3835914501471906, + "grad_norm": 13.073348999023438, + "learning_rate": 0.00029659218999811043, + "loss": 5.387, + "step": 10810 + }, + { + "epoch": 1.3848713682324332, + "grad_norm": 8.278294563293457, + "learning_rate": 0.0002965857870892048, + "loss": 5.3712, + "step": 10820 + }, + { + "epoch": 1.3861512863176757, + "grad_norm": 2.5932791233062744, + "learning_rate": 0.0002965793782399988, + "loss": 5.3435, + "step": 10830 + }, + { + "epoch": 1.3874312044029182, + "grad_norm": 20.342578887939453, + "learning_rate": 0.0002965729634507522, + "loss": 5.3446, + "step": 10840 + }, + { + "epoch": 1.3887111224881608, + "grad_norm": 2.0133111476898193, + "learning_rate": 0.000296566542721725, + "loss": 5.3574, + "step": 10850 + }, + { + "epoch": 1.3899910405734033, + "grad_norm": 7.2772216796875, + "learning_rate": 0.0002965601160531773, + "loss": 5.3525, + "step": 10860 + }, + { + "epoch": 1.3912709586586458, + "grad_norm": 3.3040051460266113, + "learning_rate": 0.0002965536834453696, + "loss": 5.346, + "step": 10870 + }, + { + "epoch": 1.3925508767438883, + "grad_norm": 36.88441848754883, + "learning_rate": 0.00029654724489856256, + "loss": 5.3156, + "step": 10880 + }, + { + "epoch": 1.3938307948291309, + "grad_norm": 9.932327270507812, + "learning_rate": 0.0002965408004130171, + "loss": 5.3503, + "step": 10890 + }, + { + "epoch": 1.3951107129143736, + "grad_norm": 2.004136323928833, + "learning_rate": 0.0002965343499889943, + "loss": 5.3873, + "step": 10900 + }, + { + "epoch": 1.396390630999616, + "grad_norm": 7.710193634033203, + "learning_rate": 0.00029652789362675575, + "loss": 5.3892, + "step": 10910 + }, + { + "epoch": 1.3976705490848587, + "grad_norm": 4.5125532150268555, + "learning_rate": 0.0002965214313265629, + "loss": 5.3947, + "step": 10920 + }, + { + "epoch": 1.3989504671701012, + "grad_norm": 3.5431599617004395, + "learning_rate": 0.00029651496308867776, + "loss": 5.4125, + "step": 10930 + }, + { + "epoch": 1.4002303852553437, + "grad_norm": 3.65902042388916, + "learning_rate": 0.00029650848891336234, + "loss": 5.391, + "step": 10940 + }, + { + "epoch": 1.4015103033405862, + "grad_norm": 13.032547950744629, + "learning_rate": 0.0002965020088008791, + "loss": 5.3605, + "step": 10950 + }, + { + "epoch": 1.4027902214258288, + "grad_norm": 4.853715419769287, + "learning_rate": 0.0002964955227514906, + "loss": 5.3678, + "step": 10960 + }, + { + "epoch": 1.4040701395110713, + "grad_norm": 1.706592321395874, + "learning_rate": 0.00029648903076545967, + "loss": 5.3235, + "step": 10970 + }, + { + "epoch": 1.4053500575963138, + "grad_norm": 1.9886835813522339, + "learning_rate": 0.0002964825328430494, + "loss": 5.3405, + "step": 10980 + }, + { + "epoch": 1.4066299756815563, + "grad_norm": 2.2458877563476562, + "learning_rate": 0.00029647602898452317, + "loss": 5.3212, + "step": 10990 + }, + { + "epoch": 1.4079098937667989, + "grad_norm": 1.449478268623352, + "learning_rate": 0.0002964695191901445, + "loss": 5.309, + "step": 11000 + } + ], + "logging_steps": 10, + "max_steps": 156260, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4622869463232352e+19, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}