{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010489052051920808, "grad_norm": 0.8763753771781921, "learning_rate": 2.6666666666666667e-05, "loss": 2.3391, "step": 5 }, { "epoch": 0.020978104103841617, "grad_norm": 0.7406191825866699, "learning_rate": 6e-05, "loss": 1.9486, "step": 10 }, { "epoch": 0.03146715615576242, "grad_norm": 0.2422162890434265, "learning_rate": 9.333333333333334e-05, "loss": 1.656, "step": 15 }, { "epoch": 0.041956208207683234, "grad_norm": 0.23392009735107422, "learning_rate": 9.998150522680437e-05, "loss": 1.617, "step": 20 }, { "epoch": 0.05244526025960404, "grad_norm": 0.20699062943458557, "learning_rate": 9.990639365966836e-05, "loss": 1.5619, "step": 25 }, { "epoch": 0.06293431231152484, "grad_norm": 0.1582898050546646, "learning_rate": 9.977359612865423e-05, "loss": 1.4935, "step": 30 }, { "epoch": 0.07342336436344565, "grad_norm": 0.15079393982887268, "learning_rate": 9.95832661322021e-05, "loss": 1.4438, "step": 35 }, { "epoch": 0.08391241641536647, "grad_norm": 0.15707609057426453, "learning_rate": 9.933562366956445e-05, "loss": 1.4175, "step": 40 }, { "epoch": 0.09440146846728727, "grad_norm": 0.16321730613708496, "learning_rate": 9.903095498651275e-05, "loss": 1.3422, "step": 45 }, { "epoch": 0.10489052051920808, "grad_norm": 0.17720702290534973, "learning_rate": 9.866961224447075e-05, "loss": 1.2402, "step": 50 }, { "epoch": 0.11537957257112888, "grad_norm": 0.20096494257450104, "learning_rate": 9.8252013113457e-05, "loss": 2.0712, "step": 55 }, { "epoch": 0.12586862462304968, "grad_norm": 0.2041371464729309, "learning_rate": 9.777864028930705e-05, "loss": 1.6589, "step": 60 }, { "epoch": 0.1363576766749705, "grad_norm": 0.1891430765390396, "learning_rate": 9.725004093573342e-05, "loss": 1.4179, "step": 65 }, { "epoch": 0.1468467287268913, "grad_norm": 0.23310059309005737, "learning_rate": 9.666682605186835e-05, "loss": 1.3934, "step": 70 }, { "epoch": 0.1573357807788121, "grad_norm": 0.19620561599731445, "learning_rate": 9.602966976601993e-05, "loss": 1.3833, "step": 75 }, { "epoch": 0.16782483283073293, "grad_norm": 0.313629150390625, "learning_rate": 9.533930855645872e-05, "loss": 1.3187, "step": 80 }, { "epoch": 0.17831388488265373, "grad_norm": 0.20362325012683868, "learning_rate": 9.45965404001347e-05, "loss": 1.2991, "step": 85 }, { "epoch": 0.18880293693457453, "grad_norm": 0.21132731437683105, "learning_rate": 9.380222385030915e-05, "loss": 1.2923, "step": 90 }, { "epoch": 0.19929198898649533, "grad_norm": 0.20679372549057007, "learning_rate": 9.295727704416731e-05, "loss": 1.2695, "step": 95 }, { "epoch": 0.20978104103841616, "grad_norm": 0.22825102508068085, "learning_rate": 9.206267664155907e-05, "loss": 1.1565, "step": 100 }, { "epoch": 0.22027009309033696, "grad_norm": 0.24348382651805878, "learning_rate": 9.111945669609408e-05, "loss": 1.8636, "step": 105 }, { "epoch": 0.23075914514225776, "grad_norm": 0.2147216647863388, "learning_rate": 9.012870745989663e-05, "loss": 1.553, "step": 110 }, { "epoch": 0.2412481971941786, "grad_norm": 0.22054672241210938, "learning_rate": 8.90915741234015e-05, "loss": 1.2994, "step": 115 }, { "epoch": 0.25173724924609936, "grad_norm": 0.23521165549755096, "learning_rate": 8.800925549164741e-05, "loss": 1.3099, "step": 120 }, { "epoch": 0.2622263012980202, "grad_norm": 0.21970166265964508, "learning_rate": 8.688300259859854e-05, "loss": 1.2951, "step": 125 }, { "epoch": 0.272715353349941, "grad_norm": 0.23371437191963196, "learning_rate": 8.571411726109519e-05, "loss": 1.2806, "step": 130 }, { "epoch": 0.2832044054018618, "grad_norm": 0.4371797442436218, "learning_rate": 8.450395057410561e-05, "loss": 1.2831, "step": 135 }, { "epoch": 0.2936934574537826, "grad_norm": 0.23421862721443176, "learning_rate": 8.325390134901794e-05, "loss": 1.2381, "step": 140 }, { "epoch": 0.3041825095057034, "grad_norm": 0.24431416392326355, "learning_rate": 8.196541449677758e-05, "loss": 1.2272, "step": 145 }, { "epoch": 0.3146715615576242, "grad_norm": 0.2365603744983673, "learning_rate": 8.063997935773885e-05, "loss": 1.113, "step": 150 }, { "epoch": 0.325160613609545, "grad_norm": 0.24464251101016998, "learning_rate": 7.927912798016143e-05, "loss": 1.8914, "step": 155 }, { "epoch": 0.33564966566146587, "grad_norm": 0.412227600812912, "learning_rate": 7.788443334934148e-05, "loss": 1.5314, "step": 160 }, { "epoch": 0.34613871771338667, "grad_norm": 0.223706915974617, "learning_rate": 7.645750756942425e-05, "loss": 1.2719, "step": 165 }, { "epoch": 0.35662776976530747, "grad_norm": 0.4911600351333618, "learning_rate": 7.500000000000001e-05, "loss": 1.2908, "step": 170 }, { "epoch": 0.36711682181722827, "grad_norm": 0.23396988213062286, "learning_rate": 7.351359534963684e-05, "loss": 1.2747, "step": 175 }, { "epoch": 0.37760587386914907, "grad_norm": 0.2470846325159073, "learning_rate": 7.200001172855435e-05, "loss": 1.2365, "step": 180 }, { "epoch": 0.38809492592106987, "grad_norm": 0.24769781529903412, "learning_rate": 7.046099866268879e-05, "loss": 1.2466, "step": 185 }, { "epoch": 0.39858397797299067, "grad_norm": 0.24430114030838013, "learning_rate": 6.889833507144532e-05, "loss": 1.225, "step": 190 }, { "epoch": 0.4090730300249115, "grad_norm": 0.6688951253890991, "learning_rate": 6.731382721147508e-05, "loss": 1.2107, "step": 195 }, { "epoch": 0.4195620820768323, "grad_norm": 0.5510894060134888, "learning_rate": 6.570930658885313e-05, "loss": 1.1007, "step": 200 }, { "epoch": 0.4300511341287531, "grad_norm": 0.3216978907585144, "learning_rate": 6.408662784207149e-05, "loss": 1.8714, "step": 205 }, { "epoch": 0.4405401861806739, "grad_norm": 0.2762185037136078, "learning_rate": 6.244766659829351e-05, "loss": 1.4307, "step": 210 }, { "epoch": 0.4510292382325947, "grad_norm": 0.33013805747032166, "learning_rate": 6.079431730534786e-05, "loss": 1.2413, "step": 215 }, { "epoch": 0.4615182902845155, "grad_norm": 0.2626986801624298, "learning_rate": 5.9128491041968094e-05, "loss": 1.2777, "step": 220 }, { "epoch": 0.4720073423364363, "grad_norm": 0.23779766261577606, "learning_rate": 5.745211330880872e-05, "loss": 1.2698, "step": 225 }, { "epoch": 0.4824963943883572, "grad_norm": 0.24917365610599518, "learning_rate": 5.576712180279133e-05, "loss": 1.2617, "step": 230 }, { "epoch": 0.492985446440278, "grad_norm": 0.25929364562034607, "learning_rate": 5.4075464177353164e-05, "loss": 1.2065, "step": 235 }, { "epoch": 0.5034744984921987, "grad_norm": 0.27120205760002136, "learning_rate": 5.2379095791187124e-05, "loss": 1.1933, "step": 240 }, { "epoch": 0.5139635505441196, "grad_norm": 0.3004542887210846, "learning_rate": 5.06799774480755e-05, "loss": 1.1832, "step": 245 }, { "epoch": 0.5244526025960404, "grad_norm": 0.44783318042755127, "learning_rate": 4.898007313042975e-05, "loss": 1.0712, "step": 250 }, { "epoch": 0.5349416546479612, "grad_norm": 0.30534207820892334, "learning_rate": 4.728134772915605e-05, "loss": 1.9374, "step": 255 }, { "epoch": 0.545430706699882, "grad_norm": 0.28996437788009644, "learning_rate": 4.558576477247097e-05, "loss": 1.5152, "step": 260 }, { "epoch": 0.5559197587518028, "grad_norm": 0.24489738047122955, "learning_rate": 4.389528415629201e-05, "loss": 1.2228, "step": 265 }, { "epoch": 0.5664088108037236, "grad_norm": 0.24929961562156677, "learning_rate": 4.221185987882684e-05, "loss": 1.2781, "step": 270 }, { "epoch": 0.5768978628556444, "grad_norm": 0.2558998167514801, "learning_rate": 4.0537437781979506e-05, "loss": 1.247, "step": 275 }, { "epoch": 0.5873869149075652, "grad_norm": 0.3272939920425415, "learning_rate": 3.887395330218429e-05, "loss": 1.2389, "step": 280 }, { "epoch": 0.5978759669594861, "grad_norm": 0.2735213041305542, "learning_rate": 3.722332923326735e-05, "loss": 1.1956, "step": 285 }, { "epoch": 0.6083650190114068, "grad_norm": 0.271259069442749, "learning_rate": 3.558747350392146e-05, "loss": 1.2094, "step": 290 }, { "epoch": 0.6188540710633277, "grad_norm": 0.26007166504859924, "learning_rate": 3.396827697236322e-05, "loss": 1.1767, "step": 295 }, { "epoch": 0.6293431231152484, "grad_norm": 0.2587098777294159, "learning_rate": 3.23676112407218e-05, "loss": 1.0923, "step": 300 }, { "epoch": 0.6398321751671693, "grad_norm": 0.29210400581359863, "learning_rate": 3.0787326491685286e-05, "loss": 1.8277, "step": 305 }, { "epoch": 0.65032122721909, "grad_norm": 0.28203901648521423, "learning_rate": 2.9229249349905684e-05, "loss": 1.4447, "step": 310 }, { "epoch": 0.6608102792710109, "grad_norm": 0.26349228620529175, "learning_rate": 2.7695180770633995e-05, "loss": 1.1435, "step": 315 }, { "epoch": 0.6712993313229317, "grad_norm": 0.25637128949165344, "learning_rate": 2.6186893958026243e-05, "loss": 1.2523, "step": 320 }, { "epoch": 0.6817883833748525, "grad_norm": 0.2572968304157257, "learning_rate": 2.4706132315526608e-05, "loss": 1.203, "step": 325 }, { "epoch": 0.6922774354267733, "grad_norm": 0.32262009382247925, "learning_rate": 2.325460743069639e-05, "loss": 1.223, "step": 330 }, { "epoch": 0.7027664874786941, "grad_norm": 0.25640591979026794, "learning_rate": 2.1833997096818898e-05, "loss": 1.1823, "step": 335 }, { "epoch": 0.7132555395306149, "grad_norm": 0.25940874218940735, "learning_rate": 2.044594337356618e-05, "loss": 1.193, "step": 340 }, { "epoch": 0.7237445915825357, "grad_norm": 0.2889132499694824, "learning_rate": 1.9092050688969738e-05, "loss": 1.1368, "step": 345 }, { "epoch": 0.7342336436344565, "grad_norm": 0.23785533010959625, "learning_rate": 1.777388398488918e-05, "loss": 1.0516, "step": 350 }, { "epoch": 0.7447226956863774, "grad_norm": 0.27121424674987793, "learning_rate": 1.649296690812203e-05, "loss": 1.852, "step": 355 }, { "epoch": 0.7552117477382981, "grad_norm": 0.29131484031677246, "learning_rate": 1.5250780049246028e-05, "loss": 1.4743, "step": 360 }, { "epoch": 0.765700799790219, "grad_norm": 0.273075670003891, "learning_rate": 1.4048759231229281e-05, "loss": 1.2332, "step": 365 }, { "epoch": 0.7761898518421397, "grad_norm": 0.4467771351337433, "learning_rate": 1.2888293849786503e-05, "loss": 1.2454, "step": 370 }, { "epoch": 0.7866789038940606, "grad_norm": 0.7620230913162231, "learning_rate": 1.177072526739989e-05, "loss": 1.2376, "step": 375 }, { "epoch": 0.7971679559459813, "grad_norm": 0.2570807933807373, "learning_rate": 1.0697345262860636e-05, "loss": 1.1993, "step": 380 }, { "epoch": 0.8076570079979022, "grad_norm": 0.24736545979976654, "learning_rate": 9.6693945381235e-06, "loss": 1.1942, "step": 385 }, { "epoch": 0.818146060049823, "grad_norm": 0.23647992312908173, "learning_rate": 8.688061284200266e-06, "loss": 1.1793, "step": 390 }, { "epoch": 0.8286351121017438, "grad_norm": 0.24470268189907074, "learning_rate": 7.754479807749571e-06, "loss": 1.1533, "step": 395 }, { "epoch": 0.8391241641536646, "grad_norm": 0.23314952850341797, "learning_rate": 6.86972921995096e-06, "loss": 1.075, "step": 400 }, { "epoch": 0.8496132162055854, "grad_norm": 0.31338390707969666, "learning_rate": 6.034832189178302e-06, "loss": 1.8415, "step": 405 }, { "epoch": 0.8601022682575062, "grad_norm": 0.28092852234840393, "learning_rate": 5.250753758914506e-06, "loss": 1.4482, "step": 410 }, { "epoch": 0.870591320309427, "grad_norm": 0.27677977085113525, "learning_rate": 4.5184002322740785e-06, "loss": 1.2341, "step": 415 }, { "epoch": 0.8810803723613478, "grad_norm": 0.2601166069507599, "learning_rate": 3.838618124422427e-06, "loss": 1.2449, "step": 420 }, { "epoch": 0.8915694244132687, "grad_norm": 0.2637106478214264, "learning_rate": 3.212193184103196e-06, "loss": 1.2557, "step": 425 }, { "epoch": 0.9020584764651894, "grad_norm": 0.2382846623659134, "learning_rate": 2.639849485404505e-06, "loss": 1.1883, "step": 430 }, { "epoch": 0.9125475285171103, "grad_norm": 0.24265426397323608, "learning_rate": 2.1222485908137747e-06, "loss": 1.159, "step": 435 }, { "epoch": 0.923036580569031, "grad_norm": 0.24033012986183167, "learning_rate": 1.659988786528821e-06, "loss": 1.1897, "step": 440 }, { "epoch": 0.9335256326209519, "grad_norm": 0.2423560470342636, "learning_rate": 1.2536043909088191e-06, "loss": 1.1653, "step": 445 }, { "epoch": 0.9440146846728726, "grad_norm": 0.22891700267791748, "learning_rate": 9.035651368646648e-07, "loss": 1.0391, "step": 450 }, { "epoch": 0.9545037367247935, "grad_norm": 0.28548210859298706, "learning_rate": 6.102756289025957e-07, "loss": 1.5921, "step": 455 }, { "epoch": 0.9649927887767143, "grad_norm": 0.2468496561050415, "learning_rate": 3.740748754486156e-07, "loss": 1.1896, "step": 460 }, { "epoch": 0.9754818408286351, "grad_norm": 0.24155913293361664, "learning_rate": 1.9523589699433352e-07, "loss": 1.2249, "step": 465 }, { "epoch": 0.985970892880556, "grad_norm": 0.25586798787117004, "learning_rate": 7.396541051717942e-08, "loss": 1.1765, "step": 470 }, { "epoch": 0.9964599449324767, "grad_norm": 0.25830599665641785, "learning_rate": 1.040359053967599e-08, "loss": 1.1704, "step": 475 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5465766812645978e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }