| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.1412268188302426, | |
| "eval_steps": 1000, | |
| "global_step": 4000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0028530670470756064, | |
| "grad_norm": 12.738757213188315, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 1.3983, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.005706134094151213, | |
| "grad_norm": 4.573132593653715, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.2207, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.008559201141226819, | |
| "grad_norm": 2.607415963265715, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.8804, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.011412268188302425, | |
| "grad_norm": 1.4566842335760672, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.695, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.014265335235378032, | |
| "grad_norm": 1.2148255794768892, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.5464, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.017118402282453638, | |
| "grad_norm": 1.6916303315740489, | |
| "learning_rate": 3e-06, | |
| "loss": 0.4627, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.019971469329529243, | |
| "grad_norm": 1.7004102978289282, | |
| "learning_rate": 3.5e-06, | |
| "loss": 0.4077, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02282453637660485, | |
| "grad_norm": 1.653253610336802, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.3841, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.025677603423680456, | |
| "grad_norm": 1.689309973059402, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.364, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.028530670470756064, | |
| "grad_norm": 1.6046281123266535, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3403, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03138373751783167, | |
| "grad_norm": 1.4967297752688393, | |
| "learning_rate": 4.9999886265851494e-06, | |
| "loss": 0.3297, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.034236804564907276, | |
| "grad_norm": 1.0241239996443163, | |
| "learning_rate": 4.999954506444081e-06, | |
| "loss": 0.32, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.037089871611982884, | |
| "grad_norm": 0.8077512735887262, | |
| "learning_rate": 4.999897639887245e-06, | |
| "loss": 0.313, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.039942938659058486, | |
| "grad_norm": 0.6435179766620793, | |
| "learning_rate": 4.999818027432054e-06, | |
| "loss": 0.3047, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.042796005706134094, | |
| "grad_norm": 0.6182488561257354, | |
| "learning_rate": 4.999715669802881e-06, | |
| "loss": 0.3026, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0456490727532097, | |
| "grad_norm": 0.5522422292679724, | |
| "learning_rate": 4.999590567931051e-06, | |
| "loss": 0.2946, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.04850213980028531, | |
| "grad_norm": 0.5700106345726289, | |
| "learning_rate": 4.999442722954833e-06, | |
| "loss": 0.3013, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.05135520684736091, | |
| "grad_norm": 0.5460665115135962, | |
| "learning_rate": 4.999272136219427e-06, | |
| "loss": 0.2916, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05420827389443652, | |
| "grad_norm": 0.5283057748722413, | |
| "learning_rate": 4.999078809276956e-06, | |
| "loss": 0.2906, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.05706134094151213, | |
| "grad_norm": 0.6617475985080037, | |
| "learning_rate": 4.998862743886452e-06, | |
| "loss": 0.2909, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05991440798858773, | |
| "grad_norm": 0.49064170191851164, | |
| "learning_rate": 4.998623942013835e-06, | |
| "loss": 0.2923, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.06276747503566334, | |
| "grad_norm": 0.5449673849225073, | |
| "learning_rate": 4.998362405831898e-06, | |
| "loss": 0.2904, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.06562054208273894, | |
| "grad_norm": 0.5178385304622709, | |
| "learning_rate": 4.998078137720292e-06, | |
| "loss": 0.2885, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.06847360912981455, | |
| "grad_norm": 0.6013270184068287, | |
| "learning_rate": 4.997771140265491e-06, | |
| "loss": 0.2896, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.07132667617689016, | |
| "grad_norm": 0.5036999676599386, | |
| "learning_rate": 4.997441416260788e-06, | |
| "loss": 0.2875, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07417974322396577, | |
| "grad_norm": 0.49633104688002133, | |
| "learning_rate": 4.997088968706251e-06, | |
| "loss": 0.2862, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07703281027104136, | |
| "grad_norm": 0.5030540697220535, | |
| "learning_rate": 4.996713800808705e-06, | |
| "loss": 0.2849, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07988587731811697, | |
| "grad_norm": 0.8538545541191804, | |
| "learning_rate": 4.996315915981702e-06, | |
| "loss": 0.2891, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.08273894436519258, | |
| "grad_norm": 0.5542761326465268, | |
| "learning_rate": 4.995895317845491e-06, | |
| "loss": 0.2813, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.08559201141226819, | |
| "grad_norm": 0.4831357039573924, | |
| "learning_rate": 4.995452010226981e-06, | |
| "loss": 0.2822, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0884450784593438, | |
| "grad_norm": 0.49413470727490605, | |
| "learning_rate": 4.9949859971597084e-06, | |
| "loss": 0.2839, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0912981455064194, | |
| "grad_norm": 0.49715798319643933, | |
| "learning_rate": 4.994497282883802e-06, | |
| "loss": 0.2838, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.09415121255349501, | |
| "grad_norm": 0.4937351105629805, | |
| "learning_rate": 4.993985871845942e-06, | |
| "loss": 0.2783, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.09700427960057062, | |
| "grad_norm": 0.5310943504340412, | |
| "learning_rate": 4.99345176869932e-06, | |
| "loss": 0.2826, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.09985734664764621, | |
| "grad_norm": 0.4962386074023998, | |
| "learning_rate": 4.992894978303597e-06, | |
| "loss": 0.2768, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.10271041369472182, | |
| "grad_norm": 0.4910162543594788, | |
| "learning_rate": 4.992315505724861e-06, | |
| "loss": 0.2801, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.10556348074179743, | |
| "grad_norm": 0.4843795289621177, | |
| "learning_rate": 4.991713356235576e-06, | |
| "loss": 0.2785, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.10841654778887304, | |
| "grad_norm": 0.47267521696329656, | |
| "learning_rate": 4.991088535314539e-06, | |
| "loss": 0.2776, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.11126961483594865, | |
| "grad_norm": 0.49940435343646394, | |
| "learning_rate": 4.990441048646828e-06, | |
| "loss": 0.2772, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.11412268188302425, | |
| "grad_norm": 0.5844532702472411, | |
| "learning_rate": 4.989770902123752e-06, | |
| "loss": 0.2785, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.11697574893009986, | |
| "grad_norm": 0.4631470746659208, | |
| "learning_rate": 4.989078101842792e-06, | |
| "loss": 0.2738, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.11982881597717546, | |
| "grad_norm": 0.5086487339844298, | |
| "learning_rate": 4.988362654107554e-06, | |
| "loss": 0.2808, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.12268188302425106, | |
| "grad_norm": 0.47741692592940005, | |
| "learning_rate": 4.9876245654277054e-06, | |
| "loss": 0.281, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.12553495007132667, | |
| "grad_norm": 0.5008330267900547, | |
| "learning_rate": 4.986863842518916e-06, | |
| "loss": 0.277, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.12838801711840228, | |
| "grad_norm": 0.48974085937848094, | |
| "learning_rate": 4.986080492302799e-06, | |
| "loss": 0.2709, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1312410841654779, | |
| "grad_norm": 0.5318382363555189, | |
| "learning_rate": 4.98527452190685e-06, | |
| "loss": 0.2775, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1340941512125535, | |
| "grad_norm": 0.4518864804676144, | |
| "learning_rate": 4.984445938664376e-06, | |
| "loss": 0.276, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1369472182596291, | |
| "grad_norm": 0.4814601248508118, | |
| "learning_rate": 4.983594750114434e-06, | |
| "loss": 0.2772, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1398002853067047, | |
| "grad_norm": 0.4489516327124342, | |
| "learning_rate": 4.982720964001762e-06, | |
| "loss": 0.2756, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.14265335235378032, | |
| "grad_norm": 0.5902810377681712, | |
| "learning_rate": 4.981824588276702e-06, | |
| "loss": 0.2739, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14550641940085593, | |
| "grad_norm": 0.4733751511637321, | |
| "learning_rate": 4.980905631095139e-06, | |
| "loss": 0.2758, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.14835948644793154, | |
| "grad_norm": 0.4876727563136549, | |
| "learning_rate": 4.97996410081842e-06, | |
| "loss": 0.2725, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.15121255349500715, | |
| "grad_norm": 0.47342204196031207, | |
| "learning_rate": 4.979000006013272e-06, | |
| "loss": 0.2743, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.15406562054208273, | |
| "grad_norm": 0.6175379692483637, | |
| "learning_rate": 4.978013355451737e-06, | |
| "loss": 0.2736, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.15691868758915833, | |
| "grad_norm": 0.46511042837002475, | |
| "learning_rate": 4.977004158111085e-06, | |
| "loss": 0.2713, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.15977175463623394, | |
| "grad_norm": 0.5460372685613308, | |
| "learning_rate": 4.9759724231737314e-06, | |
| "loss": 0.2722, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.16262482168330955, | |
| "grad_norm": 0.45534826155428965, | |
| "learning_rate": 4.974918160027154e-06, | |
| "loss": 0.2723, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.16547788873038516, | |
| "grad_norm": 0.45090632277751, | |
| "learning_rate": 4.973841378263814e-06, | |
| "loss": 0.2724, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.16833095577746077, | |
| "grad_norm": 0.4842622057551346, | |
| "learning_rate": 4.972742087681057e-06, | |
| "loss": 0.2712, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.17118402282453637, | |
| "grad_norm": 0.46110522709871865, | |
| "learning_rate": 4.971620298281035e-06, | |
| "loss": 0.27, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.17403708987161198, | |
| "grad_norm": 0.4818257920535842, | |
| "learning_rate": 4.970476020270608e-06, | |
| "loss": 0.269, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1768901569186876, | |
| "grad_norm": 0.48740402413847767, | |
| "learning_rate": 4.969309264061255e-06, | |
| "loss": 0.2743, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1797432239657632, | |
| "grad_norm": 0.48077635195269214, | |
| "learning_rate": 4.968120040268978e-06, | |
| "loss": 0.2725, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.1825962910128388, | |
| "grad_norm": 0.46487834338154116, | |
| "learning_rate": 4.966908359714206e-06, | |
| "loss": 0.2681, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.18544935805991442, | |
| "grad_norm": 0.865148162215464, | |
| "learning_rate": 4.965674233421693e-06, | |
| "loss": 0.2737, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.18830242510699002, | |
| "grad_norm": 0.4411958842345361, | |
| "learning_rate": 4.964417672620427e-06, | |
| "loss": 0.269, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.19115549215406563, | |
| "grad_norm": 0.4721096540391217, | |
| "learning_rate": 4.963138688743515e-06, | |
| "loss": 0.2727, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.19400855920114124, | |
| "grad_norm": 0.4749289898782321, | |
| "learning_rate": 4.96183729342809e-06, | |
| "loss": 0.2675, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.19686162624821682, | |
| "grad_norm": 0.4475533049685516, | |
| "learning_rate": 4.960513498515198e-06, | |
| "loss": 0.2697, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.19971469329529243, | |
| "grad_norm": 0.4628820330767458, | |
| "learning_rate": 4.959167316049695e-06, | |
| "loss": 0.2698, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.20256776034236804, | |
| "grad_norm": 0.46916030608414716, | |
| "learning_rate": 4.957798758280133e-06, | |
| "loss": 0.271, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.20542082738944364, | |
| "grad_norm": 0.4545114466912326, | |
| "learning_rate": 4.956407837658654e-06, | |
| "loss": 0.2657, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.20827389443651925, | |
| "grad_norm": 0.43736837830336234, | |
| "learning_rate": 4.954994566840869e-06, | |
| "loss": 0.2669, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.21112696148359486, | |
| "grad_norm": 0.43214532509621273, | |
| "learning_rate": 4.9535589586857535e-06, | |
| "loss": 0.271, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.21398002853067047, | |
| "grad_norm": 0.4583545225436044, | |
| "learning_rate": 4.952101026255519e-06, | |
| "loss": 0.2693, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.21683309557774608, | |
| "grad_norm": 0.43639880262186087, | |
| "learning_rate": 4.950620782815503e-06, | |
| "loss": 0.2621, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.21968616262482168, | |
| "grad_norm": 0.4476657629981296, | |
| "learning_rate": 4.949118241834043e-06, | |
| "loss": 0.2674, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2225392296718973, | |
| "grad_norm": 0.5350799910644292, | |
| "learning_rate": 4.9475934169823555e-06, | |
| "loss": 0.271, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2253922967189729, | |
| "grad_norm": 0.4543328929999822, | |
| "learning_rate": 4.9460463221344155e-06, | |
| "loss": 0.2696, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2282453637660485, | |
| "grad_norm": 0.4451416033507459, | |
| "learning_rate": 4.944476971366823e-06, | |
| "loss": 0.2665, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.23109843081312412, | |
| "grad_norm": 0.569109136888445, | |
| "learning_rate": 4.9428853789586785e-06, | |
| "loss": 0.2637, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.23395149786019973, | |
| "grad_norm": 0.4524997655780825, | |
| "learning_rate": 4.9412715593914566e-06, | |
| "loss": 0.2645, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.23680456490727533, | |
| "grad_norm": 0.44614915653478204, | |
| "learning_rate": 4.939635527348867e-06, | |
| "loss": 0.2664, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2396576319543509, | |
| "grad_norm": 0.4522775732411208, | |
| "learning_rate": 4.937977297716729e-06, | |
| "loss": 0.2639, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.24251069900142652, | |
| "grad_norm": 0.4851573497130338, | |
| "learning_rate": 4.936296885582827e-06, | |
| "loss": 0.2681, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.24536376604850213, | |
| "grad_norm": 0.4561207795491063, | |
| "learning_rate": 4.934594306236783e-06, | |
| "loss": 0.2667, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.24821683309557774, | |
| "grad_norm": 0.43239589534049766, | |
| "learning_rate": 4.932869575169907e-06, | |
| "loss": 0.2666, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.25106990014265335, | |
| "grad_norm": 0.4577957080766326, | |
| "learning_rate": 4.9311227080750665e-06, | |
| "loss": 0.2685, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.25392296718972895, | |
| "grad_norm": 0.5847319935303987, | |
| "learning_rate": 4.929353720846537e-06, | |
| "loss": 0.2657, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.25677603423680456, | |
| "grad_norm": 0.4277882450989014, | |
| "learning_rate": 4.9275626295798575e-06, | |
| "loss": 0.268, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.25962910128388017, | |
| "grad_norm": 0.45465429351905756, | |
| "learning_rate": 4.9257494505716885e-06, | |
| "loss": 0.2651, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2624821683309558, | |
| "grad_norm": 0.5463528873483586, | |
| "learning_rate": 4.923914200319659e-06, | |
| "loss": 0.2648, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2653352353780314, | |
| "grad_norm": 0.43237436892519443, | |
| "learning_rate": 4.922056895522219e-06, | |
| "loss": 0.2608, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.268188302425107, | |
| "grad_norm": 0.4336706485840371, | |
| "learning_rate": 4.920177553078488e-06, | |
| "loss": 0.2633, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2710413694721826, | |
| "grad_norm": 0.42928653379024545, | |
| "learning_rate": 4.918276190088097e-06, | |
| "loss": 0.2619, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2738944365192582, | |
| "grad_norm": 0.42378464697261636, | |
| "learning_rate": 4.916352823851039e-06, | |
| "loss": 0.2639, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.2767475035663338, | |
| "grad_norm": 0.479336654101038, | |
| "learning_rate": 4.9144074718675096e-06, | |
| "loss": 0.2633, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.2796005706134094, | |
| "grad_norm": 0.44958869313136685, | |
| "learning_rate": 4.912440151837741e-06, | |
| "loss": 0.2604, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.28245363766048504, | |
| "grad_norm": 0.5313438040766252, | |
| "learning_rate": 4.910450881661854e-06, | |
| "loss": 0.2614, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.28530670470756064, | |
| "grad_norm": 0.45263061822213785, | |
| "learning_rate": 4.908439679439683e-06, | |
| "loss": 0.265, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.28530670470756064, | |
| "eval_loss": 0.26471513509750366, | |
| "eval_runtime": 1742.5629, | |
| "eval_samples_per_second": 10.4, | |
| "eval_steps_per_second": 0.041, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.28815977175463625, | |
| "grad_norm": 0.44311080830159827, | |
| "learning_rate": 4.906406563470618e-06, | |
| "loss": 0.2671, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.29101283880171186, | |
| "grad_norm": 0.44554515219755003, | |
| "learning_rate": 4.904351552253437e-06, | |
| "loss": 0.2643, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.29386590584878747, | |
| "grad_norm": 0.44722094387626915, | |
| "learning_rate": 4.902274664486135e-06, | |
| "loss": 0.2599, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.2967189728958631, | |
| "grad_norm": 0.4328390108638598, | |
| "learning_rate": 4.900175919065757e-06, | |
| "loss": 0.263, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.2995720399429387, | |
| "grad_norm": 0.4273990973816548, | |
| "learning_rate": 4.898055335088225e-06, | |
| "loss": 0.2633, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3024251069900143, | |
| "grad_norm": 0.44093816145774345, | |
| "learning_rate": 4.895912931848165e-06, | |
| "loss": 0.2667, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.30527817403708984, | |
| "grad_norm": 0.41805518075725556, | |
| "learning_rate": 4.893748728838728e-06, | |
| "loss": 0.259, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.30813124108416545, | |
| "grad_norm": 0.4455465834182174, | |
| "learning_rate": 4.891562745751418e-06, | |
| "loss": 0.261, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.31098430813124106, | |
| "grad_norm": 0.4526375562352427, | |
| "learning_rate": 4.889355002475909e-06, | |
| "loss": 0.2657, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.31383737517831667, | |
| "grad_norm": 0.4332796375642561, | |
| "learning_rate": 4.8871255190998644e-06, | |
| "loss": 0.2627, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3166904422253923, | |
| "grad_norm": 0.4510562211684952, | |
| "learning_rate": 4.884874315908757e-06, | |
| "loss": 0.2618, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3195435092724679, | |
| "grad_norm": 0.45101552885502644, | |
| "learning_rate": 4.882601413385679e-06, | |
| "loss": 0.2565, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3223965763195435, | |
| "grad_norm": 1.0935110922508877, | |
| "learning_rate": 4.8803068322111635e-06, | |
| "loss": 0.2588, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3252496433666191, | |
| "grad_norm": 0.43637967786050386, | |
| "learning_rate": 4.8779905932629874e-06, | |
| "loss": 0.2603, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3281027104136947, | |
| "grad_norm": 0.42416148844137713, | |
| "learning_rate": 4.875652717615989e-06, | |
| "loss": 0.2598, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3309557774607703, | |
| "grad_norm": 0.4251093273387571, | |
| "learning_rate": 4.873293226541871e-06, | |
| "loss": 0.2595, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3338088445078459, | |
| "grad_norm": 0.48415176922179926, | |
| "learning_rate": 4.870912141509011e-06, | |
| "loss": 0.2651, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.33666191155492153, | |
| "grad_norm": 0.60133878845711, | |
| "learning_rate": 4.868509484182263e-06, | |
| "loss": 0.2589, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.33951497860199714, | |
| "grad_norm": 0.437484977876018, | |
| "learning_rate": 4.866085276422761e-06, | |
| "loss": 0.2615, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.34236804564907275, | |
| "grad_norm": 0.4577703915050453, | |
| "learning_rate": 4.863639540287724e-06, | |
| "loss": 0.2596, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.34522111269614836, | |
| "grad_norm": 0.4500725064644898, | |
| "learning_rate": 4.861172298030245e-06, | |
| "loss": 0.2602, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.34807417974322397, | |
| "grad_norm": 0.4135593167266766, | |
| "learning_rate": 4.858683572099104e-06, | |
| "loss": 0.259, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3509272467902996, | |
| "grad_norm": 0.4534651310157033, | |
| "learning_rate": 4.856173385138549e-06, | |
| "loss": 0.2596, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3537803138373752, | |
| "grad_norm": 0.42472565214509456, | |
| "learning_rate": 4.853641759988098e-06, | |
| "loss": 0.2601, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.3566333808844508, | |
| "grad_norm": 0.43624596360452567, | |
| "learning_rate": 4.85108871968233e-06, | |
| "loss": 0.2602, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3594864479315264, | |
| "grad_norm": 0.41607970147664086, | |
| "learning_rate": 4.848514287450675e-06, | |
| "loss": 0.2588, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.362339514978602, | |
| "grad_norm": 0.4412189776792949, | |
| "learning_rate": 4.8459184867172e-06, | |
| "loss": 0.2645, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.3651925820256776, | |
| "grad_norm": 0.44224256221516595, | |
| "learning_rate": 4.8433013411004e-06, | |
| "loss": 0.2586, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.3680456490727532, | |
| "grad_norm": 0.4348734461902946, | |
| "learning_rate": 4.840662874412983e-06, | |
| "loss": 0.2612, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.37089871611982883, | |
| "grad_norm": 0.42818319713706526, | |
| "learning_rate": 4.838003110661648e-06, | |
| "loss": 0.2587, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.37375178316690444, | |
| "grad_norm": 0.4451304872359712, | |
| "learning_rate": 4.8353220740468735e-06, | |
| "loss": 0.2585, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.37660485021398005, | |
| "grad_norm": 0.40548688155079643, | |
| "learning_rate": 4.832619788962692e-06, | |
| "loss": 0.2555, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.37945791726105566, | |
| "grad_norm": 0.45006278258495436, | |
| "learning_rate": 4.8298962799964714e-06, | |
| "loss": 0.2583, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.38231098430813126, | |
| "grad_norm": 0.4265482574282557, | |
| "learning_rate": 4.82715157192869e-06, | |
| "loss": 0.2589, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.38516405135520687, | |
| "grad_norm": 0.43176158718051943, | |
| "learning_rate": 4.824385689732709e-06, | |
| "loss": 0.2615, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.3880171184022825, | |
| "grad_norm": 0.42858147809592845, | |
| "learning_rate": 4.8215986585745515e-06, | |
| "loss": 0.2545, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.3908701854493581, | |
| "grad_norm": 0.5680213400128438, | |
| "learning_rate": 4.818790503812664e-06, | |
| "loss": 0.2599, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.39372325249643364, | |
| "grad_norm": 0.40377635878237433, | |
| "learning_rate": 4.8159612509976955e-06, | |
| "loss": 0.2585, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.39657631954350925, | |
| "grad_norm": 0.42022314777238234, | |
| "learning_rate": 4.813110925872258e-06, | |
| "loss": 0.2595, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.39942938659058486, | |
| "grad_norm": 0.40638381972803406, | |
| "learning_rate": 4.810239554370696e-06, | |
| "loss": 0.2563, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.40228245363766046, | |
| "grad_norm": 0.6782938626288993, | |
| "learning_rate": 4.807347162618849e-06, | |
| "loss": 0.26, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4051355206847361, | |
| "grad_norm": 0.4472896472871635, | |
| "learning_rate": 4.804433776933814e-06, | |
| "loss": 0.2577, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4079885877318117, | |
| "grad_norm": 0.4230599782725977, | |
| "learning_rate": 4.8014994238237055e-06, | |
| "loss": 0.2594, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4108416547788873, | |
| "grad_norm": 0.45922873897462896, | |
| "learning_rate": 4.798544129987417e-06, | |
| "loss": 0.2567, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4136947218259629, | |
| "grad_norm": 0.5826469722725868, | |
| "learning_rate": 4.7955679223143735e-06, | |
| "loss": 0.2612, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4165477888730385, | |
| "grad_norm": 0.4187718664079106, | |
| "learning_rate": 4.792570827884291e-06, | |
| "loss": 0.2575, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4194008559201141, | |
| "grad_norm": 0.429325671964738, | |
| "learning_rate": 4.789552873966929e-06, | |
| "loss": 0.2554, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4222539229671897, | |
| "grad_norm": 0.41858743123582154, | |
| "learning_rate": 4.78651408802184e-06, | |
| "loss": 0.2554, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.42510699001426533, | |
| "grad_norm": 0.44033025504158285, | |
| "learning_rate": 4.7834544976981225e-06, | |
| "loss": 0.2592, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.42796005706134094, | |
| "grad_norm": 0.4273424185754061, | |
| "learning_rate": 4.780374130834169e-06, | |
| "loss": 0.2553, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.43081312410841655, | |
| "grad_norm": 0.4323663068566294, | |
| "learning_rate": 4.777273015457412e-06, | |
| "loss": 0.2548, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.43366619115549215, | |
| "grad_norm": 0.4445698596020488, | |
| "learning_rate": 4.774151179784068e-06, | |
| "loss": 0.2566, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.43651925820256776, | |
| "grad_norm": 0.44015126214223665, | |
| "learning_rate": 4.771008652218883e-06, | |
| "loss": 0.2582, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.43937232524964337, | |
| "grad_norm": 0.4163665266324157, | |
| "learning_rate": 4.767845461354873e-06, | |
| "loss": 0.2556, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.442225392296719, | |
| "grad_norm": 0.41322404107853195, | |
| "learning_rate": 4.764661635973063e-06, | |
| "loss": 0.2562, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4450784593437946, | |
| "grad_norm": 0.4228007555177404, | |
| "learning_rate": 4.7614572050422286e-06, | |
| "loss": 0.2517, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.4479315263908702, | |
| "grad_norm": 0.532694784450123, | |
| "learning_rate": 4.7582321977186255e-06, | |
| "loss": 0.2543, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4507845934379458, | |
| "grad_norm": 0.4073902034975678, | |
| "learning_rate": 4.75498664334573e-06, | |
| "loss": 0.2589, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.4536376604850214, | |
| "grad_norm": 0.45006754890846795, | |
| "learning_rate": 4.751720571453973e-06, | |
| "loss": 0.254, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.456490727532097, | |
| "grad_norm": 0.4214108734497676, | |
| "learning_rate": 4.748434011760467e-06, | |
| "loss": 0.259, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4593437945791726, | |
| "grad_norm": 0.4859439216376165, | |
| "learning_rate": 4.745126994168736e-06, | |
| "loss": 0.2545, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.46219686162624823, | |
| "grad_norm": 0.4295897174181562, | |
| "learning_rate": 4.7417995487684475e-06, | |
| "loss": 0.2561, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.46504992867332384, | |
| "grad_norm": 0.4615721735091556, | |
| "learning_rate": 4.738451705835134e-06, | |
| "loss": 0.2529, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.46790299572039945, | |
| "grad_norm": 0.4349080676205723, | |
| "learning_rate": 4.735083495829922e-06, | |
| "loss": 0.2559, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.47075606276747506, | |
| "grad_norm": 0.4160235794943592, | |
| "learning_rate": 4.731694949399251e-06, | |
| "loss": 0.2547, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.47360912981455067, | |
| "grad_norm": 0.4165750292624292, | |
| "learning_rate": 4.728286097374596e-06, | |
| "loss": 0.2583, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.4764621968616263, | |
| "grad_norm": 0.42872504814420337, | |
| "learning_rate": 4.724856970772187e-06, | |
| "loss": 0.2588, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.4793152639087018, | |
| "grad_norm": 0.4490411001148922, | |
| "learning_rate": 4.721407600792729e-06, | |
| "loss": 0.2571, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.48216833095577744, | |
| "grad_norm": 0.4257707550200331, | |
| "learning_rate": 4.7179380188211136e-06, | |
| "loss": 0.2535, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.48502139800285304, | |
| "grad_norm": 0.45443727973461423, | |
| "learning_rate": 4.714448256426136e-06, | |
| "loss": 0.2542, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.48787446504992865, | |
| "grad_norm": 0.41909169551892117, | |
| "learning_rate": 4.7109383453602115e-06, | |
| "loss": 0.2564, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.49072753209700426, | |
| "grad_norm": 0.4238573989595952, | |
| "learning_rate": 4.707408317559077e-06, | |
| "loss": 0.2532, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.49358059914407987, | |
| "grad_norm": 0.4479506980321584, | |
| "learning_rate": 4.70385820514151e-06, | |
| "loss": 0.2542, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4964336661911555, | |
| "grad_norm": 0.49416455456248887, | |
| "learning_rate": 4.7002880404090326e-06, | |
| "loss": 0.2515, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.4992867332382311, | |
| "grad_norm": 0.41497557289537307, | |
| "learning_rate": 4.696697855845615e-06, | |
| "loss": 0.253, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5021398002853067, | |
| "grad_norm": 0.405311057519384, | |
| "learning_rate": 4.693087684117383e-06, | |
| "loss": 0.2523, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5049928673323824, | |
| "grad_norm": 0.4096518116084079, | |
| "learning_rate": 4.689457558072323e-06, | |
| "loss": 0.2555, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5078459343794579, | |
| "grad_norm": 0.42444645636411893, | |
| "learning_rate": 4.6858075107399794e-06, | |
| "loss": 0.2532, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5106990014265336, | |
| "grad_norm": 0.4469562453504139, | |
| "learning_rate": 4.682137575331153e-06, | |
| "loss": 0.2509, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.5135520684736091, | |
| "grad_norm": 0.42408704273613285, | |
| "learning_rate": 4.678447785237601e-06, | |
| "loss": 0.2526, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5164051355206848, | |
| "grad_norm": 0.4119098542142099, | |
| "learning_rate": 4.674738174031735e-06, | |
| "loss": 0.2521, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5192582025677603, | |
| "grad_norm": 0.41779003975229145, | |
| "learning_rate": 4.671008775466314e-06, | |
| "loss": 0.254, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5221112696148359, | |
| "grad_norm": 0.4317147979348233, | |
| "learning_rate": 4.667259623474133e-06, | |
| "loss": 0.2526, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5249643366619116, | |
| "grad_norm": 0.4745777937198977, | |
| "learning_rate": 4.663490752167724e-06, | |
| "loss": 0.2521, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5278174037089871, | |
| "grad_norm": 0.4552627148139544, | |
| "learning_rate": 4.6597021958390345e-06, | |
| "loss": 0.2522, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5306704707560628, | |
| "grad_norm": 0.42753668658640426, | |
| "learning_rate": 4.655893988959123e-06, | |
| "loss": 0.2522, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5335235378031383, | |
| "grad_norm": 0.42474448674162446, | |
| "learning_rate": 4.652066166177842e-06, | |
| "loss": 0.255, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.536376604850214, | |
| "grad_norm": 0.4381596996499485, | |
| "learning_rate": 4.648218762323527e-06, | |
| "loss": 0.2539, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.5392296718972895, | |
| "grad_norm": 0.41307815162965605, | |
| "learning_rate": 4.644351812402672e-06, | |
| "loss": 0.2513, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5420827389443652, | |
| "grad_norm": 0.4208888996238608, | |
| "learning_rate": 4.640465351599618e-06, | |
| "loss": 0.2504, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5449358059914408, | |
| "grad_norm": 0.39509408192070633, | |
| "learning_rate": 4.636559415276231e-06, | |
| "loss": 0.2515, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5477888730385164, | |
| "grad_norm": 0.4175388256033645, | |
| "learning_rate": 4.632634038971576e-06, | |
| "loss": 0.2547, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.550641940085592, | |
| "grad_norm": 0.43772323374782285, | |
| "learning_rate": 4.628689258401603e-06, | |
| "loss": 0.2526, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5534950071326676, | |
| "grad_norm": 0.42283349330961034, | |
| "learning_rate": 4.624725109458809e-06, | |
| "loss": 0.2537, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5563480741797432, | |
| "grad_norm": 0.4208740137670526, | |
| "learning_rate": 4.6207416282119246e-06, | |
| "loss": 0.2532, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5592011412268189, | |
| "grad_norm": 0.435314173012652, | |
| "learning_rate": 4.616738850905577e-06, | |
| "loss": 0.2524, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.5620542082738944, | |
| "grad_norm": 0.44426244460821573, | |
| "learning_rate": 4.612716813959963e-06, | |
| "loss": 0.2531, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5649072753209701, | |
| "grad_norm": 0.39252008042053277, | |
| "learning_rate": 4.608675553970521e-06, | |
| "loss": 0.2543, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.5677603423680456, | |
| "grad_norm": 0.42203820881314386, | |
| "learning_rate": 4.604615107707588e-06, | |
| "loss": 0.2503, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5706134094151213, | |
| "grad_norm": 0.4443532935670162, | |
| "learning_rate": 4.60053551211608e-06, | |
| "loss": 0.2555, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5706134094151213, | |
| "eval_loss": 0.2534787058830261, | |
| "eval_runtime": 1735.0009, | |
| "eval_samples_per_second": 10.445, | |
| "eval_steps_per_second": 0.041, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5734664764621968, | |
| "grad_norm": 0.402041602765129, | |
| "learning_rate": 4.596436804315141e-06, | |
| "loss": 0.2517, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.5763195435092725, | |
| "grad_norm": 0.4237553224823114, | |
| "learning_rate": 4.592319021597814e-06, | |
| "loss": 0.255, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.579172610556348, | |
| "grad_norm": 0.40865394365467317, | |
| "learning_rate": 4.588182201430702e-06, | |
| "loss": 0.2495, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.5820256776034237, | |
| "grad_norm": 0.3973130613900969, | |
| "learning_rate": 4.58402638145362e-06, | |
| "loss": 0.2477, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5848787446504993, | |
| "grad_norm": 0.3979382862870555, | |
| "learning_rate": 4.5798515994792625e-06, | |
| "loss": 0.2499, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5877318116975749, | |
| "grad_norm": 0.548323807818838, | |
| "learning_rate": 4.575657893492849e-06, | |
| "loss": 0.2492, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.5905848787446505, | |
| "grad_norm": 0.4018835955183295, | |
| "learning_rate": 4.571445301651787e-06, | |
| "loss": 0.2543, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5934379457917262, | |
| "grad_norm": 0.39322324148387133, | |
| "learning_rate": 4.56721386228532e-06, | |
| "loss": 0.2503, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.5962910128388017, | |
| "grad_norm": 0.47145556435252123, | |
| "learning_rate": 4.56296361389418e-06, | |
| "loss": 0.2558, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.5991440798858774, | |
| "grad_norm": 0.39253010780526015, | |
| "learning_rate": 4.558694595150238e-06, | |
| "loss": 0.2531, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6019971469329529, | |
| "grad_norm": 0.43535536580155454, | |
| "learning_rate": 4.5544068448961505e-06, | |
| "loss": 0.2527, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6048502139800286, | |
| "grad_norm": 0.4391757861022582, | |
| "learning_rate": 4.550100402145007e-06, | |
| "loss": 0.2516, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6077032810271041, | |
| "grad_norm": 0.4044352420827572, | |
| "learning_rate": 4.545775306079977e-06, | |
| "loss": 0.2519, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6105563480741797, | |
| "grad_norm": 0.4092326872105066, | |
| "learning_rate": 4.541431596053949e-06, | |
| "loss": 0.2489, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.6134094151212554, | |
| "grad_norm": 0.4142574136435565, | |
| "learning_rate": 4.537069311589175e-06, | |
| "loss": 0.2533, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6162624821683309, | |
| "grad_norm": 0.407854285271177, | |
| "learning_rate": 4.5326884923769136e-06, | |
| "loss": 0.2489, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6191155492154066, | |
| "grad_norm": 0.4006735653856468, | |
| "learning_rate": 4.528289178277062e-06, | |
| "loss": 0.253, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6219686162624821, | |
| "grad_norm": 0.44701909817555985, | |
| "learning_rate": 4.5238714093178025e-06, | |
| "loss": 0.2496, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6248216833095578, | |
| "grad_norm": 0.4111503299148831, | |
| "learning_rate": 4.519435225695228e-06, | |
| "loss": 0.2492, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.6276747503566333, | |
| "grad_norm": 0.4276239208483549, | |
| "learning_rate": 4.514980667772985e-06, | |
| "loss": 0.2532, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.630527817403709, | |
| "grad_norm": 0.4037238998713479, | |
| "learning_rate": 4.5105077760819e-06, | |
| "loss": 0.25, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.6333808844507846, | |
| "grad_norm": 0.4064925136476104, | |
| "learning_rate": 4.506016591319619e-06, | |
| "loss": 0.2511, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.6362339514978602, | |
| "grad_norm": 0.4198253144415072, | |
| "learning_rate": 4.501507154350224e-06, | |
| "loss": 0.2515, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.6390870185449358, | |
| "grad_norm": 0.424295747980334, | |
| "learning_rate": 4.496979506203874e-06, | |
| "loss": 0.2497, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.6419400855920114, | |
| "grad_norm": 0.41396692230669685, | |
| "learning_rate": 4.492433688076427e-06, | |
| "loss": 0.2494, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.644793152639087, | |
| "grad_norm": 0.4498271205977494, | |
| "learning_rate": 4.487869741329061e-06, | |
| "loss": 0.2513, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.6476462196861626, | |
| "grad_norm": 0.41173345505564285, | |
| "learning_rate": 4.4832877074879065e-06, | |
| "loss": 0.2473, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.6504992867332382, | |
| "grad_norm": 0.8132661067460093, | |
| "learning_rate": 4.478687628243659e-06, | |
| "loss": 0.2532, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.6533523537803139, | |
| "grad_norm": 0.40516334541464116, | |
| "learning_rate": 4.474069545451206e-06, | |
| "loss": 0.2493, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.6562054208273894, | |
| "grad_norm": 0.40778215843981175, | |
| "learning_rate": 4.4694335011292464e-06, | |
| "loss": 0.2483, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6590584878744651, | |
| "grad_norm": 0.4085656867005652, | |
| "learning_rate": 4.464779537459902e-06, | |
| "loss": 0.2529, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6619115549215406, | |
| "grad_norm": 0.41429649428228155, | |
| "learning_rate": 4.460107696788343e-06, | |
| "loss": 0.251, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6647646219686163, | |
| "grad_norm": 0.41004572765980873, | |
| "learning_rate": 4.455418021622393e-06, | |
| "loss": 0.2486, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6676176890156919, | |
| "grad_norm": 0.41590970679740247, | |
| "learning_rate": 4.45071055463215e-06, | |
| "loss": 0.2501, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6704707560627675, | |
| "grad_norm": 0.393909725360039, | |
| "learning_rate": 4.445985338649594e-06, | |
| "loss": 0.2454, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6733238231098431, | |
| "grad_norm": 0.4035630410246483, | |
| "learning_rate": 4.441242416668198e-06, | |
| "loss": 0.2468, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6761768901569187, | |
| "grad_norm": 0.4273195789696201, | |
| "learning_rate": 4.436481831842537e-06, | |
| "loss": 0.2503, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6790299572039943, | |
| "grad_norm": 0.41577354947495637, | |
| "learning_rate": 4.431703627487897e-06, | |
| "loss": 0.2457, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.68188302425107, | |
| "grad_norm": 0.3979193031998009, | |
| "learning_rate": 4.426907847079878e-06, | |
| "loss": 0.2486, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6847360912981455, | |
| "grad_norm": 0.40986225593473136, | |
| "learning_rate": 4.422094534253998e-06, | |
| "loss": 0.2507, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6875891583452212, | |
| "grad_norm": 0.42272145688894464, | |
| "learning_rate": 4.417263732805303e-06, | |
| "loss": 0.2502, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6904422253922967, | |
| "grad_norm": 0.4137381078371123, | |
| "learning_rate": 4.412415486687958e-06, | |
| "loss": 0.248, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6932952924393724, | |
| "grad_norm": 0.394509664411755, | |
| "learning_rate": 4.407549840014856e-06, | |
| "loss": 0.2493, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6961483594864479, | |
| "grad_norm": 0.4216114075029632, | |
| "learning_rate": 4.402666837057211e-06, | |
| "loss": 0.2472, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6990014265335235, | |
| "grad_norm": 0.38685584751539454, | |
| "learning_rate": 4.397766522244158e-06, | |
| "loss": 0.2448, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7018544935805991, | |
| "grad_norm": 0.4044295983955075, | |
| "learning_rate": 4.3928489401623455e-06, | |
| "loss": 0.2467, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7047075606276747, | |
| "grad_norm": 0.43019705842154876, | |
| "learning_rate": 4.387914135555537e-06, | |
| "loss": 0.2469, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7075606276747504, | |
| "grad_norm": 0.4467724769804675, | |
| "learning_rate": 4.3829621533241955e-06, | |
| "loss": 0.246, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7104136947218259, | |
| "grad_norm": 0.40006544179174647, | |
| "learning_rate": 4.377993038525079e-06, | |
| "loss": 0.249, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.7132667617689016, | |
| "grad_norm": 0.40016811869588237, | |
| "learning_rate": 4.373006836370832e-06, | |
| "loss": 0.2449, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7161198288159771, | |
| "grad_norm": 0.4112329700604831, | |
| "learning_rate": 4.36800359222957e-06, | |
| "loss": 0.2502, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.7189728958630528, | |
| "grad_norm": 0.402952155245274, | |
| "learning_rate": 4.36298335162447e-06, | |
| "loss": 0.2462, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.7218259629101283, | |
| "grad_norm": 0.39083763622254625, | |
| "learning_rate": 4.357946160233356e-06, | |
| "loss": 0.2488, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.724679029957204, | |
| "grad_norm": 0.4035312318583136, | |
| "learning_rate": 4.352892063888281e-06, | |
| "loss": 0.2465, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.7275320970042796, | |
| "grad_norm": 0.40610326566062943, | |
| "learning_rate": 4.347821108575113e-06, | |
| "loss": 0.2501, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7303851640513552, | |
| "grad_norm": 0.39942872714888833, | |
| "learning_rate": 4.342733340433115e-06, | |
| "loss": 0.2463, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.7332382310984308, | |
| "grad_norm": 0.40646133973030324, | |
| "learning_rate": 4.337628805754525e-06, | |
| "loss": 0.2456, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.7360912981455064, | |
| "grad_norm": 0.3994627809176088, | |
| "learning_rate": 4.3325075509841355e-06, | |
| "loss": 0.2474, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.738944365192582, | |
| "grad_norm": 0.3824733984455135, | |
| "learning_rate": 4.32736962271887e-06, | |
| "loss": 0.2491, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.7417974322396577, | |
| "grad_norm": 0.3881074574492312, | |
| "learning_rate": 4.32221506770736e-06, | |
| "loss": 0.2494, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7446504992867332, | |
| "grad_norm": 0.41224420244827775, | |
| "learning_rate": 4.3170439328495216e-06, | |
| "loss": 0.2411, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.7475035663338089, | |
| "grad_norm": 0.4242095789286537, | |
| "learning_rate": 4.311856265196122e-06, | |
| "loss": 0.2476, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.7503566333808844, | |
| "grad_norm": 0.4081539274332129, | |
| "learning_rate": 4.3066521119483595e-06, | |
| "loss": 0.2461, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.7532097004279601, | |
| "grad_norm": 0.4078553256962837, | |
| "learning_rate": 4.301431520457428e-06, | |
| "loss": 0.2477, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.7560627674750356, | |
| "grad_norm": 0.4021083988237341, | |
| "learning_rate": 4.296194538224092e-06, | |
| "loss": 0.2456, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7589158345221113, | |
| "grad_norm": 0.3941508594403912, | |
| "learning_rate": 4.290941212898248e-06, | |
| "loss": 0.2476, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.7617689015691869, | |
| "grad_norm": 0.40144956230368534, | |
| "learning_rate": 4.285671592278492e-06, | |
| "loss": 0.2488, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.7646219686162625, | |
| "grad_norm": 0.40699098183331833, | |
| "learning_rate": 4.280385724311691e-06, | |
| "loss": 0.2458, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7674750356633381, | |
| "grad_norm": 0.5188128225635168, | |
| "learning_rate": 4.275083657092541e-06, | |
| "loss": 0.2433, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7703281027104137, | |
| "grad_norm": 0.40512015609538055, | |
| "learning_rate": 4.2697654388631295e-06, | |
| "loss": 0.2461, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7731811697574893, | |
| "grad_norm": 0.39065120200311765, | |
| "learning_rate": 4.264431118012498e-06, | |
| "loss": 0.2461, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.776034236804565, | |
| "grad_norm": 0.39851975406480444, | |
| "learning_rate": 4.259080743076203e-06, | |
| "loss": 0.2504, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.7788873038516405, | |
| "grad_norm": 0.3948409453282174, | |
| "learning_rate": 4.253714362735869e-06, | |
| "loss": 0.2477, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7817403708987162, | |
| "grad_norm": 0.40090019139214444, | |
| "learning_rate": 4.248332025818754e-06, | |
| "loss": 0.2461, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.7845934379457917, | |
| "grad_norm": 0.4105471490833463, | |
| "learning_rate": 4.242933781297297e-06, | |
| "loss": 0.249, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7874465049928673, | |
| "grad_norm": 0.4133163509705647, | |
| "learning_rate": 4.237519678288679e-06, | |
| "loss": 0.2488, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7902995720399429, | |
| "grad_norm": 0.3973226798279995, | |
| "learning_rate": 4.232089766054371e-06, | |
| "loss": 0.2474, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7931526390870185, | |
| "grad_norm": 0.4092824228811784, | |
| "learning_rate": 4.226644093999689e-06, | |
| "loss": 0.2472, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7960057061340942, | |
| "grad_norm": 0.38668313596002085, | |
| "learning_rate": 4.221182711673342e-06, | |
| "loss": 0.2459, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7988587731811697, | |
| "grad_norm": 0.40203392598637905, | |
| "learning_rate": 4.215705668766983e-06, | |
| "loss": 0.2475, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8017118402282454, | |
| "grad_norm": 0.42197334057202673, | |
| "learning_rate": 4.210213015114759e-06, | |
| "loss": 0.2459, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.8045649072753209, | |
| "grad_norm": 0.38817473177898254, | |
| "learning_rate": 4.204704800692851e-06, | |
| "loss": 0.2456, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.8074179743223966, | |
| "grad_norm": 0.41140732304480315, | |
| "learning_rate": 4.1991810756190265e-06, | |
| "loss": 0.2482, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.8102710413694721, | |
| "grad_norm": 0.3920457964568419, | |
| "learning_rate": 4.193641890152178e-06, | |
| "loss": 0.2457, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.8131241084165478, | |
| "grad_norm": 0.4096453185117248, | |
| "learning_rate": 4.18808729469187e-06, | |
| "loss": 0.2445, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8159771754636234, | |
| "grad_norm": 0.3941808069339604, | |
| "learning_rate": 4.182517339777875e-06, | |
| "loss": 0.2461, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.818830242510699, | |
| "grad_norm": 0.40375858832021977, | |
| "learning_rate": 4.1769320760897225e-06, | |
| "loss": 0.2429, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.8216833095577746, | |
| "grad_norm": 0.3976500055021346, | |
| "learning_rate": 4.171331554446227e-06, | |
| "loss": 0.2441, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.8245363766048502, | |
| "grad_norm": 0.4075551768911702, | |
| "learning_rate": 4.1657158258050336e-06, | |
| "loss": 0.244, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.8273894436519258, | |
| "grad_norm": 0.40940826125744373, | |
| "learning_rate": 4.160084941262153e-06, | |
| "loss": 0.2473, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8302425106990015, | |
| "grad_norm": 0.42519664385457717, | |
| "learning_rate": 4.154438952051491e-06, | |
| "loss": 0.2449, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.833095577746077, | |
| "grad_norm": 0.4242347233613599, | |
| "learning_rate": 4.148777909544393e-06, | |
| "loss": 0.2434, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.8359486447931527, | |
| "grad_norm": 0.4145820864091558, | |
| "learning_rate": 4.143101865249165e-06, | |
| "loss": 0.245, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.8388017118402282, | |
| "grad_norm": 0.4093844970216059, | |
| "learning_rate": 4.137410870810613e-06, | |
| "loss": 0.2461, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.8416547788873039, | |
| "grad_norm": 0.40155728521291567, | |
| "learning_rate": 4.131704978009569e-06, | |
| "loss": 0.2453, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8445078459343794, | |
| "grad_norm": 0.38190955330022147, | |
| "learning_rate": 4.125984238762421e-06, | |
| "loss": 0.2417, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.8473609129814551, | |
| "grad_norm": 0.4246867248162885, | |
| "learning_rate": 4.120248705120643e-06, | |
| "loss": 0.2481, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.8502139800285307, | |
| "grad_norm": 0.3993513208504502, | |
| "learning_rate": 4.114498429270317e-06, | |
| "loss": 0.238, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.8530670470756063, | |
| "grad_norm": 0.39587105243959064, | |
| "learning_rate": 4.1087334635316615e-06, | |
| "loss": 0.2498, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.8559201141226819, | |
| "grad_norm": 0.4196151344194992, | |
| "learning_rate": 4.1029538603585536e-06, | |
| "loss": 0.2479, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8559201141226819, | |
| "eval_loss": 0.24617859721183777, | |
| "eval_runtime": 1737.1187, | |
| "eval_samples_per_second": 10.432, | |
| "eval_steps_per_second": 0.041, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8587731811697575, | |
| "grad_norm": 0.3980291915295099, | |
| "learning_rate": 4.097159672338054e-06, | |
| "loss": 0.2462, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.8616262482168331, | |
| "grad_norm": 0.41444324403023863, | |
| "learning_rate": 4.091350952189925e-06, | |
| "loss": 0.2439, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.8644793152639088, | |
| "grad_norm": 0.4066327533312674, | |
| "learning_rate": 4.085527752766154e-06, | |
| "loss": 0.2435, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.8673323823109843, | |
| "grad_norm": 0.3990297681799543, | |
| "learning_rate": 4.079690127050472e-06, | |
| "loss": 0.2461, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.8701854493580599, | |
| "grad_norm": 0.39340515362119455, | |
| "learning_rate": 4.073838128157868e-06, | |
| "loss": 0.246, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.8730385164051355, | |
| "grad_norm": 0.41636867957622425, | |
| "learning_rate": 4.067971809334113e-06, | |
| "loss": 0.2462, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.8758915834522111, | |
| "grad_norm": 0.40482030473771397, | |
| "learning_rate": 4.06209122395527e-06, | |
| "loss": 0.2413, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.8787446504992867, | |
| "grad_norm": 0.39896008462062826, | |
| "learning_rate": 4.0561964255272054e-06, | |
| "loss": 0.2491, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8815977175463623, | |
| "grad_norm": 0.39131731651682233, | |
| "learning_rate": 4.050287467685112e-06, | |
| "loss": 0.2419, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.884450784593438, | |
| "grad_norm": 0.3916143289826878, | |
| "learning_rate": 4.044364404193012e-06, | |
| "loss": 0.2458, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8873038516405135, | |
| "grad_norm": 0.4112838340106155, | |
| "learning_rate": 4.038427288943273e-06, | |
| "loss": 0.2452, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8901569186875892, | |
| "grad_norm": 0.41031933195346787, | |
| "learning_rate": 4.0324761759561134e-06, | |
| "loss": 0.2415, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.8930099857346647, | |
| "grad_norm": 0.377173019313895, | |
| "learning_rate": 4.026511119379116e-06, | |
| "loss": 0.2424, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8958630527817404, | |
| "grad_norm": 0.4024481912961536, | |
| "learning_rate": 4.02053217348673e-06, | |
| "loss": 0.2466, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8987161198288159, | |
| "grad_norm": 0.4004194441994946, | |
| "learning_rate": 4.014539392679781e-06, | |
| "loss": 0.2437, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9015691868758916, | |
| "grad_norm": 0.391668046386889, | |
| "learning_rate": 4.008532831484977e-06, | |
| "loss": 0.245, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.9044222539229672, | |
| "grad_norm": 0.4014481566004784, | |
| "learning_rate": 4.002512544554406e-06, | |
| "loss": 0.2415, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.9072753209700428, | |
| "grad_norm": 0.3918756710438348, | |
| "learning_rate": 3.996478586665044e-06, | |
| "loss": 0.2389, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.9101283880171184, | |
| "grad_norm": 0.4074106270656481, | |
| "learning_rate": 3.990431012718256e-06, | |
| "loss": 0.2442, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.912981455064194, | |
| "grad_norm": 0.39874892543579815, | |
| "learning_rate": 3.984369877739299e-06, | |
| "loss": 0.2451, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9158345221112696, | |
| "grad_norm": 0.4098947778727311, | |
| "learning_rate": 3.978295236876811e-06, | |
| "loss": 0.2474, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.9186875891583453, | |
| "grad_norm": 0.4034535569050579, | |
| "learning_rate": 3.9722071454023235e-06, | |
| "loss": 0.2411, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.9215406562054208, | |
| "grad_norm": 0.4145583465589786, | |
| "learning_rate": 3.966105658709747e-06, | |
| "loss": 0.2461, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.9243937232524965, | |
| "grad_norm": 0.403394452728964, | |
| "learning_rate": 3.959990832314873e-06, | |
| "loss": 0.2448, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.927246790299572, | |
| "grad_norm": 0.39328211366257104, | |
| "learning_rate": 3.953862721854867e-06, | |
| "loss": 0.2455, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9300998573466477, | |
| "grad_norm": 0.4160908230121395, | |
| "learning_rate": 3.947721383087765e-06, | |
| "loss": 0.2416, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.9329529243937232, | |
| "grad_norm": 0.775797683243158, | |
| "learning_rate": 3.941566871891959e-06, | |
| "loss": 0.2445, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.9358059914407989, | |
| "grad_norm": 0.3866443982477049, | |
| "learning_rate": 3.935399244265699e-06, | |
| "loss": 0.2439, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.9386590584878745, | |
| "grad_norm": 0.38620228514014043, | |
| "learning_rate": 3.929218556326573e-06, | |
| "loss": 0.2457, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.9415121255349501, | |
| "grad_norm": 0.39487916448963434, | |
| "learning_rate": 3.923024864311004e-06, | |
| "loss": 0.2467, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9443651925820257, | |
| "grad_norm": 0.41020554184914293, | |
| "learning_rate": 3.916818224573736e-06, | |
| "loss": 0.2408, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.9472182596291013, | |
| "grad_norm": 0.41485816769336903, | |
| "learning_rate": 3.910598693587319e-06, | |
| "loss": 0.2434, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.9500713266761769, | |
| "grad_norm": 0.39185402047768125, | |
| "learning_rate": 3.904366327941597e-06, | |
| "loss": 0.2436, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.9529243937232525, | |
| "grad_norm": 0.3881406348413725, | |
| "learning_rate": 3.8981211843431955e-06, | |
| "loss": 0.2455, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.9557774607703281, | |
| "grad_norm": 0.4110193660187997, | |
| "learning_rate": 3.891863319615001e-06, | |
| "loss": 0.243, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9586305278174037, | |
| "grad_norm": 0.39837419144438213, | |
| "learning_rate": 3.885592790695647e-06, | |
| "loss": 0.2441, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.9614835948644793, | |
| "grad_norm": 0.4006331615783165, | |
| "learning_rate": 3.879309654638994e-06, | |
| "loss": 0.2387, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.9643366619115549, | |
| "grad_norm": 0.400313006479787, | |
| "learning_rate": 3.873013968613613e-06, | |
| "loss": 0.2413, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.9671897289586305, | |
| "grad_norm": 0.40502171456250147, | |
| "learning_rate": 3.866705789902265e-06, | |
| "loss": 0.2379, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.9700427960057061, | |
| "grad_norm": 0.39598425697986533, | |
| "learning_rate": 3.860385175901374e-06, | |
| "loss": 0.2425, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.9728958630527818, | |
| "grad_norm": 0.4081437020149848, | |
| "learning_rate": 3.854052184120514e-06, | |
| "loss": 0.2387, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.9757489300998573, | |
| "grad_norm": 0.3911289400426434, | |
| "learning_rate": 3.8477068721818766e-06, | |
| "loss": 0.2415, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.978601997146933, | |
| "grad_norm": 0.4054384648891572, | |
| "learning_rate": 3.841349297819756e-06, | |
| "loss": 0.2438, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.9814550641940085, | |
| "grad_norm": 0.4148953146223238, | |
| "learning_rate": 3.834979518880017e-06, | |
| "loss": 0.2426, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.9843081312410842, | |
| "grad_norm": 0.4065755735091185, | |
| "learning_rate": 3.8285975933195696e-06, | |
| "loss": 0.2426, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9871611982881597, | |
| "grad_norm": 0.4224874480432456, | |
| "learning_rate": 3.822203579205843e-06, | |
| "loss": 0.2449, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.9900142653352354, | |
| "grad_norm": 0.393825499789435, | |
| "learning_rate": 3.8157975347162575e-06, | |
| "loss": 0.2421, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.992867332382311, | |
| "grad_norm": 0.41165238403559623, | |
| "learning_rate": 3.8093795181376953e-06, | |
| "loss": 0.242, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9957203994293866, | |
| "grad_norm": 0.40998581248250454, | |
| "learning_rate": 3.8029495878659667e-06, | |
| "loss": 0.2432, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.9985734664764622, | |
| "grad_norm": 0.397226432832215, | |
| "learning_rate": 3.7965078024052846e-06, | |
| "loss": 0.2431, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0014265335235377, | |
| "grad_norm": 0.3841435578940134, | |
| "learning_rate": 3.790054220367727e-06, | |
| "loss": 0.2215, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.0042796005706134, | |
| "grad_norm": 0.43654618071300577, | |
| "learning_rate": 3.7835889004727067e-06, | |
| "loss": 0.2016, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.007132667617689, | |
| "grad_norm": 0.4106276506752052, | |
| "learning_rate": 3.7771119015464363e-06, | |
| "loss": 0.199, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.0099857346647647, | |
| "grad_norm": 0.41125177481655895, | |
| "learning_rate": 3.7706232825213927e-06, | |
| "loss": 0.1994, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.0128388017118402, | |
| "grad_norm": 0.4362677765358407, | |
| "learning_rate": 3.7641231024357792e-06, | |
| "loss": 0.1978, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0156918687589158, | |
| "grad_norm": 0.4296108798191998, | |
| "learning_rate": 3.7576114204329937e-06, | |
| "loss": 0.1985, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.0185449358059915, | |
| "grad_norm": 0.432829463889251, | |
| "learning_rate": 3.7510882957610828e-06, | |
| "loss": 0.1988, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.0213980028530671, | |
| "grad_norm": 0.40827881618565204, | |
| "learning_rate": 3.7445537877722106e-06, | |
| "loss": 0.1965, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.0242510699001426, | |
| "grad_norm": 0.4118305763695054, | |
| "learning_rate": 3.7380079559221116e-06, | |
| "loss": 0.1967, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.0271041369472182, | |
| "grad_norm": 0.42480894852763423, | |
| "learning_rate": 3.731450859769556e-06, | |
| "loss": 0.2009, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.029957203994294, | |
| "grad_norm": 0.4166448347523735, | |
| "learning_rate": 3.7248825589758033e-06, | |
| "loss": 0.1981, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.0328102710413696, | |
| "grad_norm": 0.4272747031189851, | |
| "learning_rate": 3.718303113304061e-06, | |
| "loss": 0.1984, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.035663338088445, | |
| "grad_norm": 0.4297029432947676, | |
| "learning_rate": 3.7117125826189415e-06, | |
| "loss": 0.1979, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.0385164051355207, | |
| "grad_norm": 0.4364129589363993, | |
| "learning_rate": 3.7051110268859163e-06, | |
| "loss": 0.2007, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.0413694721825963, | |
| "grad_norm": 0.4198114406045477, | |
| "learning_rate": 3.6984985061707717e-06, | |
| "loss": 0.1985, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.044222539229672, | |
| "grad_norm": 0.4260158043377354, | |
| "learning_rate": 3.6918750806390592e-06, | |
| "loss": 0.1989, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.0470756062767475, | |
| "grad_norm": 0.43059347602945136, | |
| "learning_rate": 3.685240810555553e-06, | |
| "loss": 0.1993, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.0499286733238231, | |
| "grad_norm": 0.40669439626649023, | |
| "learning_rate": 3.678595756283698e-06, | |
| "loss": 0.2006, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.0527817403708988, | |
| "grad_norm": 0.4328051559037927, | |
| "learning_rate": 3.6719399782850612e-06, | |
| "loss": 0.1991, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.0556348074179742, | |
| "grad_norm": 0.4119609604958325, | |
| "learning_rate": 3.6652735371187815e-06, | |
| "loss": 0.199, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0584878744650499, | |
| "grad_norm": 0.4365386529538563, | |
| "learning_rate": 3.6585964934410203e-06, | |
| "loss": 0.1996, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.0613409415121255, | |
| "grad_norm": 0.42710468097508963, | |
| "learning_rate": 3.651908908004407e-06, | |
| "loss": 0.1992, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.0641940085592012, | |
| "grad_norm": 0.4250691774481476, | |
| "learning_rate": 3.6452108416574883e-06, | |
| "loss": 0.2, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.0670470756062767, | |
| "grad_norm": 0.4245775817557762, | |
| "learning_rate": 3.638502355344175e-06, | |
| "loss": 0.1964, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.0699001426533523, | |
| "grad_norm": 0.45293611685468005, | |
| "learning_rate": 3.6317835101031858e-06, | |
| "loss": 0.2003, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.072753209700428, | |
| "grad_norm": 0.42888134822976454, | |
| "learning_rate": 3.6250543670674907e-06, | |
| "loss": 0.1972, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.0756062767475036, | |
| "grad_norm": 0.4328050984506226, | |
| "learning_rate": 3.6183149874637586e-06, | |
| "loss": 0.1995, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.078459343794579, | |
| "grad_norm": 0.40554199743201336, | |
| "learning_rate": 3.6115654326117977e-06, | |
| "loss": 0.1991, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.0813124108416547, | |
| "grad_norm": 0.4333678007243748, | |
| "learning_rate": 3.6048057639239974e-06, | |
| "loss": 0.2001, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.0841654778887304, | |
| "grad_norm": 0.4273273663181915, | |
| "learning_rate": 3.598036042904771e-06, | |
| "loss": 0.2031, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.087018544935806, | |
| "grad_norm": 0.4339449885169188, | |
| "learning_rate": 3.591256331149995e-06, | |
| "loss": 0.1992, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.0898716119828815, | |
| "grad_norm": 0.410332918094684, | |
| "learning_rate": 3.5844666903464494e-06, | |
| "loss": 0.2007, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.0927246790299572, | |
| "grad_norm": 0.43686499033444526, | |
| "learning_rate": 3.577667182271254e-06, | |
| "loss": 0.1996, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.0955777460770328, | |
| "grad_norm": 0.4265385202662405, | |
| "learning_rate": 3.5708578687913113e-06, | |
| "loss": 0.1999, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.0984308131241085, | |
| "grad_norm": 0.4344366421551202, | |
| "learning_rate": 3.5640388118627377e-06, | |
| "loss": 0.1969, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.101283880171184, | |
| "grad_norm": 0.4288430785753112, | |
| "learning_rate": 3.557210073530305e-06, | |
| "loss": 0.2005, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.1041369472182596, | |
| "grad_norm": 0.4334435166129172, | |
| "learning_rate": 3.5503717159268712e-06, | |
| "loss": 0.1983, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.1069900142653353, | |
| "grad_norm": 0.4102918318516278, | |
| "learning_rate": 3.543523801272819e-06, | |
| "loss": 0.1996, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.109843081312411, | |
| "grad_norm": 0.425518883257834, | |
| "learning_rate": 3.536666391875489e-06, | |
| "loss": 0.1991, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.1126961483594864, | |
| "grad_norm": 0.4332698602222992, | |
| "learning_rate": 3.5297995501286087e-06, | |
| "loss": 0.1988, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.115549215406562, | |
| "grad_norm": 0.444166414023965, | |
| "learning_rate": 3.522923338511732e-06, | |
| "loss": 0.1986, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.1184022824536377, | |
| "grad_norm": 0.43264920486836433, | |
| "learning_rate": 3.5160378195896628e-06, | |
| "loss": 0.2013, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.1212553495007134, | |
| "grad_norm": 0.4456987444163269, | |
| "learning_rate": 3.5091430560118935e-06, | |
| "loss": 0.1979, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.1241084165477888, | |
| "grad_norm": 0.4263418800695392, | |
| "learning_rate": 3.5022391105120275e-06, | |
| "loss": 0.2015, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.1269614835948645, | |
| "grad_norm": 0.4361654558918281, | |
| "learning_rate": 3.495326045907215e-06, | |
| "loss": 0.2018, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1298145506419401, | |
| "grad_norm": 0.42644981988317404, | |
| "learning_rate": 3.4884039250975766e-06, | |
| "loss": 0.1983, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.1326676176890156, | |
| "grad_norm": 0.4342210808108061, | |
| "learning_rate": 3.4814728110656347e-06, | |
| "loss": 0.1978, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.1355206847360912, | |
| "grad_norm": 0.4559078272710917, | |
| "learning_rate": 3.4745327668757368e-06, | |
| "loss": 0.1978, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.138373751783167, | |
| "grad_norm": 0.42383170430544975, | |
| "learning_rate": 3.4675838556734844e-06, | |
| "loss": 0.1986, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.1412268188302426, | |
| "grad_norm": 0.40956908340236164, | |
| "learning_rate": 3.460626140685156e-06, | |
| "loss": 0.1994, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1412268188302426, | |
| "eval_loss": 0.24679900705814362, | |
| "eval_runtime": 1728.4796, | |
| "eval_samples_per_second": 10.484, | |
| "eval_steps_per_second": 0.041, | |
| "step": 4000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10515, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.087354105856e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |