Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

adapter_config.json +6 -6
adapter_model.safetensors +1 -1
chat_template.jinja +50 -50
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +426 -461
training_args.bin +1 -1

adapter_config.json CHANGED Viewed

@@ -23,15 +23,15 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "q_proj",
-    "down_proj",
-    "o_proj",
     "v_proj",
     "k_proj",
-    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
-  "use_rslora": false
 }

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
     "k_proj",
+    "down_proj",
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
+  "use_rslora": true
 }

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:862b443d2450a20b1969cb13018bb8da9e546d83e7a72d7b2bd47a1c01915e78
 size 359270696

 version https://git-lfs.github.com/spec/v1
+oid sha256:2cfa09893426ca5c80e5e2808b3bf5162a00330603bce40998c1bcbaaf85fbd4
 size 359270696

chat_template.jinja CHANGED Viewed

@@ -1,54 +1,54 @@
 {%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- messages[0]['content'] }}
-    {%- else %}
-        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
-    {%- endif %}
-    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
-    {%- else %}
-        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- for message in messages %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {{- '<|im_start|>' + message.role }}
-        {%- if message.content %}
-            {{- '\n' + message.content }}
         {%- endif %}
-        {%- for tool_call in message.tool_calls %}
-            {%- if tool_call.function is defined %}
-                {%- set tool_call = tool_call.function %}
             {%- endif %}
-            {{- '\n<tool_call>\n{"name": "' }}
-            {{- tool_call.name }}
-            {{- '", "arguments": ' }}
-            {{- tool_call.arguments | tojson }}
-            {{- '}\n</tool_call>' }}
         {%- endfor %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- message.content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-{%- endif %}

 {%- if tools %}
+            {{- '<|im_start|>system\n' }}
+            {%- if messages[0]['role'] == 'system' %}
+                {{- messages[0]['content'] }}
+            {%- else %}
+                {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+            {%- endif %}
+            {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+            {%- for tool in tools %}
+                {{- "\n" }}
+                {{- tool | tojson }}
+            {%- endfor %}
+            {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+        {%- else %}
+            {%- if messages[0]['role'] == 'system' %}
+                {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+            {%- else %}
+                {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+            {%- endif %}
         {%- endif %}
+        {%- for message in messages %}
+            {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+                {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+            {%- elif message.role == "assistant" %}
+                {{- '<|im_start|>' + message.role }}
+                {%- if message.content %}
+                    {{- '\n' + message.content }}
+                {%- endif %}
+                {%- for tool_call in message.tool_calls %}
+                    {%- if tool_call.function is defined %}
+                        {%- set tool_call = tool_call.function %}
+                    {%- endif %}
+                    {{- '\n<tool_call>\n{"name": "' }}
+                    {{- tool_call.name }}
+                    {{- '", "arguments": ' }}
+                    {{- tool_call.arguments | tojson }}
+                    {{- '}\n</tool_call>' }}
+                {%- endfor %}
+                {{- '<|im_end|>\n' }}
+            {%- elif message.role == "tool" %}
+                {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+                    {{- '<|im_start|>user' }}
+                {%- endif %}
+                {{- '\n<tool_response>\n' }}
+                {{- message.content }}
+                {{- '\n</tool_response>' }}
+                {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+                    {{- '<|im_end|>\n' }}
+                {%- endif %}
             {%- endif %}
         {%- endfor %}
+        {%- if add_generation_prompt %}
+            {{- '<|im_start|>assistant\n' }}
+        {%- endif %}

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e98c64ffd468a5f8da406fd5dc414c5741d65a4905df84235391c6878ed2d08
 size 718831691

 version https://git-lfs.github.com/spec/v1
+oid sha256:049df29a3abfd9172bca3bfb88a156988cc0cd764359c0a8a1a4576bb303e165
 size 718831691

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd8886924146c94ca2c895faa5a5126787af796176eddb058f7b8a6d9788a808
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:1f4130be9f8a4a1e2287e97f0f05456d51bf10d246bfde8a5ae0ae5d4681327e
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d930a4283cac06b311f5bf6385eb10f11a487cd4fc36bfbeeadba03f011f80ef
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:740fa2b047673c71a73647826d1867382ae011b9a31714631a50f15909a18934
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,1069 +2,1034 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 37.61538461538461,
   "eval_steps": 500,
-  "global_step": 150,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.3076923076923077,
-      "grad_norm": 1.6579457521438599,
       "learning_rate": 0.0,
       "loss": 2.0013,
       "step": 1
     },
     {
       "epoch": 0.6153846153846154,
-      "grad_norm": 2.071979522705078,
-      "learning_rate": 3.3333333333333335e-05,
       "loss": 2.512,
       "step": 2
     },
     {
       "epoch": 0.9230769230769231,
-      "grad_norm": 1.5595824718475342,
-      "learning_rate": 6.666666666666667e-05,
-      "loss": 2.4665,
       "step": 3
     },
     {
       "epoch": 1.0,
-      "grad_norm": 1.4536011219024658,
-      "learning_rate": 0.0001,
-      "loss": 2.4421,
       "step": 4
     },
     {
       "epoch": 1.3076923076923077,
-      "grad_norm": 1.0175144672393799,
-      "learning_rate": 0.00013333333333333334,
-      "loss": 2.2619,
       "step": 5
     },
     {
       "epoch": 1.6153846153846154,
-      "grad_norm": 0.6594896912574768,
-      "learning_rate": 0.00016666666666666666,
-      "loss": 2.3238,
       "step": 6
     },
     {
       "epoch": 1.9230769230769231,
-      "grad_norm": 0.4992673993110657,
-      "learning_rate": 0.0002,
-      "loss": 1.7963,
       "step": 7
     },
     {
       "epoch": 2.0,
-      "grad_norm": 1.0101423263549805,
-      "learning_rate": 0.00023333333333333333,
-      "loss": 2.2351,
       "step": 8
     },
     {
       "epoch": 2.3076923076923075,
-      "grad_norm": 0.6238862872123718,
-      "learning_rate": 0.0002666666666666667,
-      "loss": 2.1242,
       "step": 9
     },
     {
       "epoch": 2.6153846153846154,
-      "grad_norm": 0.49207818508148193,
-      "learning_rate": 0.0003,
-      "loss": 1.3944,
       "step": 10
     },
     {
       "epoch": 2.9230769230769234,
-      "grad_norm": 0.5449758768081665,
-      "learning_rate": 0.0003333333333333333,
-      "loss": 1.9957,
       "step": 11
     },
     {
       "epoch": 3.0,
-      "grad_norm": 1.2987996339797974,
-      "learning_rate": 0.00036666666666666667,
-      "loss": 1.7485,
       "step": 12
     },
     {
       "epoch": 3.3076923076923075,
-      "grad_norm": 0.7771750688552856,
-      "learning_rate": 0.0004,
-      "loss": 1.7729,
       "step": 13
     },
     {
       "epoch": 3.6153846153846154,
-      "grad_norm": 0.5679973363876343,
-      "learning_rate": 0.00043333333333333337,
-      "loss": 1.7897,
       "step": 14
     },
     {
       "epoch": 3.9230769230769234,
-      "grad_norm": 0.5018688440322876,
-      "learning_rate": 0.00046666666666666666,
-      "loss": 1.0134,
       "step": 15
     },
     {
       "epoch": 4.0,
-      "grad_norm": 1.9658831357955933,
-      "learning_rate": 0.0005,
-      "loss": 1.6495,
       "step": 16
     },
     {
       "epoch": 4.3076923076923075,
-      "grad_norm": 0.5439109206199646,
-      "learning_rate": 0.0004999848114735858,
-      "loss": 0.7469,
       "step": 17
     },
     {
       "epoch": 4.615384615384615,
-      "grad_norm": 0.7974035143852234,
-      "learning_rate": 0.0004999392477398737,
-      "loss": 1.1807,
       "step": 18
     },
     {
       "epoch": 4.923076923076923,
-      "grad_norm": 0.6797612309455872,
-      "learning_rate": 0.0004998633143352315,
-      "loss": 1.3357,
       "step": 19
     },
     {
       "epoch": 5.0,
-      "grad_norm": 1.6918320655822754,
-      "learning_rate": 0.0004997570204861915,
-      "loss": 1.176,
       "step": 20
     },
     {
       "epoch": 5.3076923076923075,
-      "grad_norm": 0.8325671553611755,
-      "learning_rate": 0.000499620379108329,
-      "loss": 0.9282,
       "step": 21
     },
     {
       "epoch": 5.615384615384615,
-      "grad_norm": 0.6552557945251465,
-      "learning_rate": 0.0004994534068046936,
-      "loss": 0.649,
       "step": 22
     },
     {
       "epoch": 5.923076923076923,
-      "grad_norm": 1.1335406303405762,
-      "learning_rate": 0.0004992561238637912,
-      "loss": 0.7929,
       "step": 23
     },
     {
       "epoch": 6.0,
-      "grad_norm": 2.5653879642486572,
-      "learning_rate": 0.000499028554257119,
-      "loss": 0.6364,
       "step": 24
     },
     {
       "epoch": 6.3076923076923075,
-      "grad_norm": 1.0426138639450073,
-      "learning_rate": 0.0004987707256362529,
-      "loss": 0.6463,
       "step": 25
     },
     {
       "epoch": 6.615384615384615,
-      "grad_norm": 0.8789294958114624,
-      "learning_rate": 0.0004984826693294874,
-      "loss": 0.5646,
       "step": 26
     },
     {
       "epoch": 6.923076923076923,
-      "grad_norm": 0.8288877606391907,
-      "learning_rate": 0.0004981644203380291,
-      "loss": 0.5083,
       "step": 27
     },
     {
       "epoch": 7.0,
-      "grad_norm": 3.075854778289795,
-      "learning_rate": 0.0004978160173317438,
-      "loss": 0.463,
       "step": 28
     },
     {
       "epoch": 7.3076923076923075,
-      "grad_norm": 1.1487135887145996,
-      "learning_rate": 0.0004974375026444575,
-      "loss": 0.3834,
       "step": 29
     },
     {
       "epoch": 7.615384615384615,
-      "grad_norm": 1.2224117517471313,
-      "learning_rate": 0.0004970289222688129,
-      "loss": 0.5554,
       "step": 30
     },
     {
       "epoch": 7.923076923076923,
-      "grad_norm": 0.5958878993988037,
-      "learning_rate": 0.0004965903258506806,
-      "loss": 0.264,
       "step": 31
     },
     {
       "epoch": 8.0,
-      "grad_norm": 1.5285398960113525,
-      "learning_rate": 0.0004961217666831268,
-      "loss": 0.1209,
       "step": 32
     },
     {
       "epoch": 8.307692307692308,
-      "grad_norm": 0.8856319785118103,
-      "learning_rate": 0.0004956233016999379,
-      "loss": 0.3155,
       "step": 33
     },
     {
       "epoch": 8.615384615384615,
-      "grad_norm": 0.7143183350563049,
-      "learning_rate": 0.0004950949914687023,
-      "loss": 0.2577,
       "step": 34
     },
     {
       "epoch": 8.923076923076923,
-      "grad_norm": 0.6463532447814941,
-      "learning_rate": 0.0004945369001834514,
-      "loss": 0.2509,
       "step": 35
     },
     {
       "epoch": 9.0,
-      "grad_norm": 1.5944372415542603,
-      "learning_rate": 0.0004939490956568589,
-      "loss": 0.2712,
       "step": 36
     },
     {
       "epoch": 9.307692307692308,
-      "grad_norm": 0.6741620302200317,
-      "learning_rate": 0.0004933316493120015,
-      "loss": 0.1793,
       "step": 37
     },
     {
       "epoch": 9.615384615384615,
-      "grad_norm": 0.8169341683387756,
-      "learning_rate": 0.00049268463617368,
-      "loss": 0.2251,
       "step": 38
     },
     {
       "epoch": 9.923076923076923,
-      "grad_norm": 0.47695809602737427,
-      "learning_rate": 0.0004920081348593038,
-      "loss": 0.1291,
       "step": 39
     },
     {
       "epoch": 10.0,
-      "grad_norm": 0.9237103462219238,
-      "learning_rate": 0.0004913022275693372,
-      "loss": 0.1339,
       "step": 40
     },
     {
       "epoch": 10.307692307692308,
-      "grad_norm": 0.48380130529403687,
-      "learning_rate": 0.0004905670000773126,
-      "loss": 0.105,
       "step": 41
     },
     {
       "epoch": 10.615384615384615,
-      "grad_norm": 0.8409221768379211,
-      "learning_rate": 0.0004898025417194075,
-      "loss": 0.1608,
       "step": 42
     },
     {
       "epoch": 10.923076923076923,
-      "grad_norm": 0.36892759799957275,
-      "learning_rate": 0.0004890089453835894,
-      "loss": 0.0887,
       "step": 43
     },
     {
       "epoch": 11.0,
-      "grad_norm": 1.5985524654388428,
-      "learning_rate": 0.00048818630749832974,
-      "loss": 0.1622,
       "step": 44
     },
     {
       "epoch": 11.307692307692308,
-      "grad_norm": 0.5602894425392151,
-      "learning_rate": 0.00048733472802088654,
-      "loss": 0.0832,
       "step": 45
     },
     {
       "epoch": 11.615384615384615,
-      "grad_norm": 0.6063843369483948,
-      "learning_rate": 0.00048645431042515866,
-      "loss": 0.1001,
       "step": 46
     },
     {
       "epoch": 11.923076923076923,
-      "grad_norm": 0.6315516233444214,
-      "learning_rate": 0.00048554516168911364,
-      "loss": 0.118,
       "step": 47
     },
     {
       "epoch": 12.0,
-      "grad_norm": 0.10256657004356384,
-      "learning_rate": 0.000484607392281788,
-      "loss": 0.0136,
       "step": 48
     },
     {
       "epoch": 12.307692307692308,
-      "grad_norm": 0.38051220774650574,
-      "learning_rate": 0.0004836411161498652,
-      "loss": 0.0501,
       "step": 49
     },
     {
       "epoch": 12.615384615384615,
-      "grad_norm": 0.3163893222808838,
-      "learning_rate": 0.0004826464507038296,
-      "loss": 0.0453,
       "step": 50
     },
     {
       "epoch": 12.923076923076923,
-      "grad_norm": 0.41999366879463196,
-      "learning_rate": 0.0004816235168037004,
-      "loss": 0.0556,
       "step": 51
     },
     {
       "epoch": 13.0,
-      "grad_norm": 0.46970197558403015,
-      "learning_rate": 0.0004805724387443462,
-      "loss": 0.0511,
       "step": 52
     },
     {
       "epoch": 13.307692307692308,
-      "grad_norm": 0.19861207902431488,
-      "learning_rate": 0.00047949334424038175,
-      "loss": 0.024,
       "step": 53
     },
     {
       "epoch": 13.615384615384615,
-      "grad_norm": 0.26438194513320923,
-      "learning_rate": 0.0004783863644106502,
-      "loss": 0.0339,
       "step": 54
     },
     {
       "epoch": 13.923076923076923,
-      "grad_norm": 0.15319757163524628,
-      "learning_rate": 0.00047725163376229063,
-      "loss": 0.0217,
       "step": 55
     },
     {
       "epoch": 14.0,
-      "grad_norm": 3.167978286743164,
-      "learning_rate": 0.0004760892901743944,
-      "loss": 0.0537,
       "step": 56
     },
     {
       "epoch": 14.307692307692308,
-      "grad_norm": 0.19109131395816803,
-      "learning_rate": 0.00047489947488125176,
-      "loss": 0.0184,
       "step": 57
     },
     {
       "epoch": 14.615384615384615,
-      "grad_norm": 0.13836173713207245,
-      "learning_rate": 0.0004736823324551909,
-      "loss": 0.0131,
       "step": 58
     },
     {
       "epoch": 14.923076923076923,
-      "grad_norm": 0.19333595037460327,
-      "learning_rate": 0.00047243801078901084,
-      "loss": 0.0213,
       "step": 59
     },
     {
       "epoch": 15.0,
-      "grad_norm": 0.5175873041152954,
-      "learning_rate": 0.0004711666610780115,
-      "loss": 0.0192,
       "step": 60
     },
     {
       "epoch": 15.307692307692308,
-      "grad_norm": 0.102613165974617,
-      "learning_rate": 0.00046986843780162223,
-      "loss": 0.0125,
       "step": 61
     },
     {
       "epoch": 15.615384615384615,
-      "grad_norm": 0.19265614449977875,
-      "learning_rate": 0.00046854349870463144,
-      "loss": 0.0149,
       "step": 62
     },
     {
       "epoch": 15.923076923076923,
-      "grad_norm": 0.08247430622577667,
-      "learning_rate": 0.0004671920047780186,
-      "loss": 0.0088,
       "step": 63
     },
     {
       "epoch": 16.0,
-      "grad_norm": 0.1660240739583969,
-      "learning_rate": 0.0004658141202393935,
-      "loss": 0.0136,
       "step": 64
     },
     {
       "epoch": 16.307692307692307,
-      "grad_norm": 0.09002210944890976,
-      "learning_rate": 0.00046441001251304177,
-      "loss": 0.0085,
       "step": 65
     },
     {
       "epoch": 16.615384615384617,
-      "grad_norm": 0.14277450740337372,
-      "learning_rate": 0.0004629798522095818,
-      "loss": 0.0145,
       "step": 66
     },
     {
       "epoch": 16.923076923076923,
-      "grad_norm": 0.0343237966299057,
-      "learning_rate": 0.00046152381310523384,
-      "loss": 0.0045,
       "step": 67
     },
     {
       "epoch": 17.0,
-      "grad_norm": 0.11968665570020676,
-      "learning_rate": 0.00046004207212070527,
-      "loss": 0.0078,
       "step": 68
     },
     {
       "epoch": 17.307692307692307,
-      "grad_norm": 0.035490620881319046,
-      "learning_rate": 0.0004585348092996925,
-      "loss": 0.0047,
       "step": 69
     },
     {
       "epoch": 17.615384615384617,
-      "grad_norm": 0.06462971121072769,
-      "learning_rate": 0.000457002207787005,
-      "loss": 0.0081,
       "step": 70
     },
     {
       "epoch": 17.923076923076923,
-      "grad_norm": 0.08005011081695557,
-      "learning_rate": 0.00045544445380631127,
-      "loss": 0.0071,
       "step": 71
     },
     {
       "epoch": 18.0,
-      "grad_norm": 0.08014772832393646,
-      "learning_rate": 0.0004538617366375112,
-      "loss": 0.0043,
       "step": 72
     },
     {
       "epoch": 18.307692307692307,
-      "grad_norm": 0.045388974249362946,
-      "learning_rate": 0.0004522542485937369,
-      "loss": 0.006,
       "step": 73
     },
     {
       "epoch": 18.615384615384617,
-      "grad_norm": 0.027296222746372223,
-      "learning_rate": 0.0004506221849979852,
       "loss": 0.0042,
       "step": 74
     },
     {
       "epoch": 18.923076923076923,
-      "grad_norm": 0.04167836531996727,
-      "learning_rate": 0.0004489657441593846,
-      "loss": 0.0036,
       "step": 75
     },
     {
       "epoch": 19.0,
-      "grad_norm": 0.09567083418369293,
-      "learning_rate": 0.00044728512734909845,
-      "loss": 0.0051,
       "step": 76
     },
     {
       "epoch": 19.307692307692307,
-      "grad_norm": 0.017897693440318108,
-      "learning_rate": 0.00044558053877586913,
-      "loss": 0.0022,
       "step": 77
     },
     {
       "epoch": 19.615384615384617,
-      "grad_norm": 0.024916386231780052,
-      "learning_rate": 0.0004438521855612054,
-      "loss": 0.0035,
       "step": 78
     },
     {
       "epoch": 19.923076923076923,
-      "grad_norm": 0.016562633216381073,
-      "learning_rate": 0.0004421002777142148,
-      "loss": 0.0036,
       "step": 79
     },
     {
       "epoch": 20.0,
-      "grad_norm": 2.793907880783081,
-      "learning_rate": 0.0004403250281060862,
-      "loss": 0.0357,
       "step": 80
     },
     {
       "epoch": 20.307692307692307,
-      "grad_norm": 0.015240938402712345,
-      "learning_rate": 0.0004385266524442241,
-      "loss": 0.0019,
       "step": 81
     },
     {
       "epoch": 20.615384615384617,
-      "grad_norm": 0.01802099496126175,
-      "learning_rate": 0.0004367053692460385,
-      "loss": 0.0028,
       "step": 82
     },
     {
       "epoch": 20.923076923076923,
-      "grad_norm": 0.029564393684267998,
-      "learning_rate": 0.00043486139981239303,
-      "loss": 0.0036,
       "step": 83
     },
     {
       "epoch": 21.0,
-      "grad_norm": 0.10444993525743484,
-      "learning_rate": 0.0004329949682007154,
-      "loss": 0.0036,
       "step": 84
     },
     {
       "epoch": 21.307692307692307,
-      "grad_norm": 0.022473925724625587,
-      "learning_rate": 0.0004311063011977723,
-      "loss": 0.0028,
       "step": 85
     },
     {
       "epoch": 21.615384615384617,
-      "grad_norm": 0.00846798438578844,
-      "learning_rate": 0.00042919562829211283,
-      "loss": 0.0021,
       "step": 86
     },
     {
       "epoch": 21.923076923076923,
-      "grad_norm": 0.019519299268722534,
-      "learning_rate": 0.0004272631816461843,
-      "loss": 0.0025,
       "step": 87
     },
     {
       "epoch": 22.0,
-      "grad_norm": 0.02333137020468712,
-      "learning_rate": 0.00042530919606812215,
-      "loss": 0.0026,
       "step": 88
     },
     {
       "epoch": 22.307692307692307,
-      "grad_norm": 0.008181007578969002,
-      "learning_rate": 0.0004233339089832189,
-      "loss": 0.0022,
       "step": 89
     },
     {
       "epoch": 22.615384615384617,
-      "grad_norm": 0.024723347276449203,
-      "learning_rate": 0.000421337560405075,
-      "loss": 0.0019,
       "step": 90
     },
     {
       "epoch": 22.923076923076923,
-      "grad_norm": 0.01907787285745144,
-      "learning_rate": 0.0004193203929064353,
       "loss": 0.0023,
       "step": 91
     },
     {
       "epoch": 23.0,
-      "grad_norm": 0.023070262745022774,
-      "learning_rate": 0.0004172826515897146,
-      "loss": 0.002,
       "step": 92
     },
     {
       "epoch": 23.307692307692307,
-      "grad_norm": 0.008314975537359715,
-      "learning_rate": 0.0004152245840572153,
-      "loss": 0.0019,
       "step": 93
     },
     {
       "epoch": 23.615384615384617,
-      "grad_norm": 0.008056281134486198,
-      "learning_rate": 0.00041314644038104216,
-      "loss": 0.0022,
       "step": 94
     },
     {
       "epoch": 23.923076923076923,
-      "grad_norm": 0.01323688868433237,
-      "learning_rate": 0.0004110484730727161,
-      "loss": 0.0015,
       "step": 95
     },
     {
       "epoch": 24.0,
-      "grad_norm": 0.009244530461728573,
-      "learning_rate": 0.0004089309370524921,
-      "loss": 0.0017,
       "step": 96
     },
     {
       "epoch": 24.307692307692307,
-      "grad_norm": 0.006018282379955053,
-      "learning_rate": 0.00040679408961838426,
       "loss": 0.0017,
       "step": 97
     },
     {
       "epoch": 24.615384615384617,
-      "grad_norm": 0.005748262628912926,
-      "learning_rate": 0.00040463819041490235,
       "loss": 0.0019,
       "step": 98
     },
     {
       "epoch": 24.923076923076923,
-      "grad_norm": 0.005269130691885948,
-      "learning_rate": 0.0004024635014015023,
-      "loss": 0.0013,
       "step": 99
     },
     {
       "epoch": 25.0,
-      "grad_norm": 0.18421179056167603,
-      "learning_rate": 0.00040027028682075626,
-      "loss": 0.005,
       "step": 100
     },
     {
       "epoch": 25.307692307692307,
-      "grad_norm": 0.004292176570743322,
-      "learning_rate": 0.000398058813166245,
       "loss": 0.0012,
       "step": 101
     },
     {
       "epoch": 25.615384615384617,
-      "grad_norm": 0.13048163056373596,
-      "learning_rate": 0.00039582934915017665,
-      "loss": 0.0078,
       "step": 102
     },
     {
       "epoch": 25.923076923076923,
-      "grad_norm": 0.006738720927387476,
-      "learning_rate": 0.0003935821656707359,
-      "loss": 0.0017,
       "step": 103
     },
     {
       "epoch": 26.0,
-      "grad_norm": 0.006725949235260487,
-      "learning_rate": 0.00039131753577916796,
-      "loss": 0.0015,
       "step": 104
     },
     {
       "epoch": 26.307692307692307,
-      "grad_norm": 0.03537844121456146,
-      "learning_rate": 0.00038903573464660015,
-      "loss": 0.0031,
       "step": 105
     },
     {
       "epoch": 26.615384615384617,
-      "grad_norm": 0.0065828608348965645,
-      "learning_rate": 0.00038673703953060677,
-      "loss": 0.0015,
       "step": 106
     },
     {
       "epoch": 26.923076923076923,
-      "grad_norm": 0.005749302916228771,
-      "learning_rate": 0.00038442172974151957,
-      "loss": 0.0015,
       "step": 107
     },
     {
       "epoch": 27.0,
-      "grad_norm": 0.012811440974473953,
-      "learning_rate": 0.00038209008660848977,
       "loss": 0.0014,
       "step": 108
     },
     {
       "epoch": 27.307692307692307,
-      "grad_norm": 0.016203908249735832,
-      "learning_rate": 0.0003797423934453038,
-      "loss": 0.0013,
       "step": 109
     },
     {
       "epoch": 27.615384615384617,
-      "grad_norm": 0.00788620300590992,
-      "learning_rate": 0.0003773789355159587,
-      "loss": 0.0014,
       "step": 110
     },
     {
       "epoch": 27.923076923076923,
-      "grad_norm": 0.004134251270443201,
-      "learning_rate": 0.000375,
       "loss": 0.0014,
       "step": 111
     },
     {
       "epoch": 28.0,
-      "grad_norm": 0.0072965058498084545,
-      "learning_rate": 0.00037260587595762705,
-      "loss": 0.0012,
       "step": 112
     },
     {
       "epoch": 28.307692307692307,
-      "grad_norm": 0.004400895908474922,
-      "learning_rate": 0.00037019685429456986,
       "loss": 0.0014,
       "step": 113
     },
     {
       "epoch": 28.615384615384617,
-      "grad_norm": 0.0042018345557153225,
-      "learning_rate": 0.0003677732277267418,
-      "loss": 0.0011,
       "step": 114
     },
     {
       "epoch": 28.923076923076923,
-      "grad_norm": 0.02602436952292919,
-      "learning_rate": 0.000365335290744672,
-      "loss": 0.0016,
       "step": 115
     },
     {
       "epoch": 29.0,
-      "grad_norm": 0.008391394279897213,
-      "learning_rate": 0.0003628833395777224,
       "loss": 0.0011,
       "step": 116
     },
     {
       "epoch": 29.307692307692307,
-      "grad_norm": 0.009821321815252304,
-      "learning_rate": 0.00036041767215809354,
-      "loss": 0.0011,
       "step": 117
     },
     {
       "epoch": 29.615384615384617,
-      "grad_norm": 0.005064834374934435,
-      "learning_rate": 0.0003579385880846232,
-      "loss": 0.0014,
       "step": 118
     },
     {
       "epoch": 29.923076923076923,
-      "grad_norm": 0.003741420805454254,
-      "learning_rate": 0.00035544638858638305,
       "loss": 0.0013,
       "step": 119
     },
     {
       "epoch": 30.0,
-      "grad_norm": 0.005992499180138111,
-      "learning_rate": 0.00035294137648607626,
-      "loss": 0.0011,
       "step": 120
     },
     {
       "epoch": 30.307692307692307,
-      "grad_norm": 0.0025831812527030706,
-      "learning_rate": 0.0003504238561632424,
-      "loss": 0.0009,
       "step": 121
     },
     {
       "epoch": 30.615384615384617,
-      "grad_norm": 0.024449031800031662,
-      "learning_rate": 0.0003478941335172729,
       "loss": 0.0012,
       "step": 122
     },
     {
       "epoch": 30.923076923076923,
-      "grad_norm": 0.00358560006134212,
-      "learning_rate": 0.0003453525159302415,
       "loss": 0.0011,
       "step": 123
     },
     {
       "epoch": 31.0,
-      "grad_norm": 0.005536896176636219,
-      "learning_rate": 0.00034279931222955517,
-      "loss": 0.0014,
       "step": 124
     },
     {
       "epoch": 31.307692307692307,
-      "grad_norm": 0.0029202536679804325,
-      "learning_rate": 0.0003402348326504287,
-      "loss": 0.0011,
       "step": 125
     },
     {
       "epoch": 31.615384615384617,
-      "grad_norm": 0.0022518346086144447,
-      "learning_rate": 0.00033765938879818866,
-      "loss": 0.0009,
       "step": 126
     },
     {
       "epoch": 31.923076923076923,
-      "grad_norm": 0.01685263216495514,
-      "learning_rate": 0.0003350732936104108,
-      "loss": 0.0011,
       "step": 127
     },
     {
       "epoch": 32.0,
-      "grad_norm": 0.00378896901383996,
-      "learning_rate": 0.0003324768613188957,
-      "loss": 0.0011,
       "step": 128
     },
     {
       "epoch": 32.30769230769231,
-      "grad_norm": 0.004914000164717436,
-      "learning_rate": 0.00032987040741148704,
-      "loss": 0.0008,
       "step": 129
     },
     {
       "epoch": 32.61538461538461,
-      "grad_norm": 0.003984972834587097,
-      "learning_rate": 0.00032725424859373687,
-      "loss": 0.001,
       "step": 130
     },
     {
       "epoch": 32.92307692307692,
-      "grad_norm": 0.002909860573709011,
-      "learning_rate": 0.00032462870275042365,
-      "loss": 0.0012,
       "step": 131
     },
     {
       "epoch": 33.0,
-      "grad_norm": 0.008864074014127254,
-      "learning_rate": 0.00032199408890692656,
-      "loss": 0.0008,
       "step": 132
     },
     {
       "epoch": 33.30769230769231,
-      "grad_norm": 0.0025989420246332884,
-      "learning_rate": 0.00031935072719046115,
-      "loss": 0.001,
       "step": 133
     },
     {
       "epoch": 33.61538461538461,
-      "grad_norm": 0.0020333165302872658,
-      "learning_rate": 0.00031669893879118153,
-      "loss": 0.0008,
       "step": 134
     },
     {
       "epoch": 33.92307692307692,
-      "grad_norm": 0.005124307703226805,
-      "learning_rate": 0.0003140390459231529,
-      "loss": 0.001,
       "step": 135
     },
     {
       "epoch": 34.0,
-      "grad_norm": 0.005078117363154888,
-      "learning_rate": 0.0003113713717851998,
-      "loss": 0.0007,
       "step": 136
     },
     {
       "epoch": 34.30769230769231,
-      "grad_norm": 0.0023773626890033484,
-      "learning_rate": 0.00030869624052163523,
-      "loss": 0.001,
       "step": 137
     },
     {
       "epoch": 34.61538461538461,
-      "grad_norm": 0.0029469747096300125,
-      "learning_rate": 0.000306013977182874,
-      "loss": 0.0006,
       "step": 138
     },
     {
       "epoch": 34.92307692307692,
-      "grad_norm": 0.002441684016957879,
-      "learning_rate": 0.0003033249076859367,
-      "loss": 0.0009,
       "step": 139
     },
     {
       "epoch": 35.0,
-      "grad_norm": 0.005331180989742279,
-      "learning_rate": 0.00030062935877484806,
-      "loss": 0.0012,
       "step": 140
     },
     {
       "epoch": 35.30769230769231,
-      "grad_norm": 0.0024844056461006403,
-      "learning_rate": 0.0002979276579809346,
-      "loss": 0.0009,
       "step": 141
     },
     {
       "epoch": 35.61538461538461,
-      "grad_norm": 0.002123428974300623,
-      "learning_rate": 0.0002952201335830275,
-      "loss": 0.0008,
       "step": 142
     },
     {
       "epoch": 35.92307692307692,
-      "grad_norm": 0.0018413775833323598,
-      "learning_rate": 0.00029250711456757327,
-      "loss": 0.0008,
       "step": 143
     },
     {
       "epoch": 36.0,
-      "grad_norm": 0.007370408158749342,
-      "learning_rate": 0.00028978893058865987,
-      "loss": 0.0009,
       "step": 144
     },
     {
       "epoch": 36.30769230769231,
-      "grad_norm": 0.0015665763057768345,
-      "learning_rate": 0.0002870659119279605,
-      "loss": 0.0006,
-      "step": 145
-    },
-    {
-      "epoch": 36.61538461538461,
-      "grad_norm": 0.002923523774370551,
-      "learning_rate": 0.00028433838945460206,
-      "loss": 0.0009,
-      "step": 146
-    },
-    {
-      "epoch": 36.92307692307692,
-      "grad_norm": 0.002343183383345604,
-      "learning_rate": 0.0002816066945849616,
-      "loss": 0.001,
-      "step": 147
-    },
-    {
-      "epoch": 37.0,
-      "grad_norm": 0.0035140886902809143,
-      "learning_rate": 0.0002788711592423966,
       "loss": 0.0008,
-      "step": 148
-    },
-    {
-      "epoch": 37.30769230769231,
-      "grad_norm": 0.0017854305915534496,
-      "learning_rate": 0.0002761321158169134,
-      "loss": 0.0006,
-      "step": 149
-    },
-    {
-      "epoch": 37.61538461538461,
-      "grad_norm": 0.002159240422770381,
-      "learning_rate": 0.0002733898971247795,
-      "loss": 0.0009,
-      "step": 150
     }
   ],
   "logging_steps": 1,
-  "max_steps": 300,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 75,
-  "save_steps": 10,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -1077,7 +1042,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.437176574055219e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 36.30769230769231,
   "eval_steps": 500,
+  "global_step": 145,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.3076923076923077,
+      "grad_norm": 11.430416107177734,
       "learning_rate": 0.0,
       "loss": 2.0013,
       "step": 1
     },
     {
       "epoch": 0.6153846153846154,
+      "grad_norm": 14.13573169708252,
+      "learning_rate": 6.666666666666667e-06,
       "loss": 2.512,
       "step": 2
     },
     {
       "epoch": 0.9230769230769231,
+      "grad_norm": 9.26183032989502,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 2.4408,
       "step": 3
     },
     {
       "epoch": 1.0,
+      "grad_norm": 8.897106170654297,
+      "learning_rate": 2e-05,
+      "loss": 2.4201,
       "step": 4
     },
     {
       "epoch": 1.3076923076923077,
+      "grad_norm": 6.057820796966553,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 2.2292,
       "step": 5
     },
     {
       "epoch": 1.6153846153846154,
+      "grad_norm": 4.151742935180664,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 2.2984,
       "step": 6
     },
     {
       "epoch": 1.9230769230769231,
+      "grad_norm": 2.9261631965637207,
+      "learning_rate": 4e-05,
+      "loss": 1.7644,
       "step": 7
     },
     {
       "epoch": 2.0,
+      "grad_norm": 6.626205921173096,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 2.1861,
       "step": 8
     },
     {
       "epoch": 2.3076923076923075,
+      "grad_norm": 3.6231133937835693,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 2.0493,
       "step": 9
     },
     {
       "epoch": 2.6153846153846154,
+      "grad_norm": 2.6322238445281982,
+      "learning_rate": 6e-05,
+      "loss": 1.3266,
       "step": 10
     },
     {
       "epoch": 2.9230769230769234,
+      "grad_norm": 3.063889265060425,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.9172,
       "step": 11
     },
     {
       "epoch": 3.0,
+      "grad_norm": 6.935006141662598,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 1.6738,
       "step": 12
     },
     {
       "epoch": 3.3076923076923075,
+      "grad_norm": 3.876340866088867,
+      "learning_rate": 8e-05,
+      "loss": 1.659,
       "step": 13
     },
     {
       "epoch": 3.6153846153846154,
+      "grad_norm": 3.144028663635254,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 1.6267,
       "step": 14
     },
     {
       "epoch": 3.9230769230769234,
+      "grad_norm": 2.40108060836792,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.9194,
       "step": 15
     },
     {
       "epoch": 4.0,
+      "grad_norm": 13.895914077758789,
+      "learning_rate": 0.0001,
+      "loss": 1.5071,
       "step": 16
     },
     {
       "epoch": 4.3076923076923075,
+      "grad_norm": 3.016589879989624,
+      "learning_rate": 9.998646205897309e-05,
+      "loss": 0.6631,
       "step": 17
     },
     {
       "epoch": 4.615384615384615,
+      "grad_norm": 3.7720727920532227,
+      "learning_rate": 9.994585556692624e-05,
+      "loss": 1.0356,
       "step": 18
     },
     {
       "epoch": 4.923076923076923,
+      "grad_norm": 3.657879114151001,
+      "learning_rate": 9.987820251299122e-05,
+      "loss": 1.1876,
       "step": 19
     },
     {
       "epoch": 5.0,
+      "grad_norm": 6.603457450866699,
+      "learning_rate": 9.978353953249022e-05,
+      "loss": 1.0117,
       "step": 20
     },
     {
       "epoch": 5.3076923076923075,
+      "grad_norm": 3.7631995677948,
+      "learning_rate": 9.966191788709716e-05,
+      "loss": 0.8773,
       "step": 21
     },
     {
       "epoch": 5.615384615384615,
+      "grad_norm": 2.5895802974700928,
+      "learning_rate": 9.951340343707852e-05,
+      "loss": 0.524,
       "step": 22
     },
     {
       "epoch": 5.923076923076923,
+      "grad_norm": 3.9228901863098145,
+      "learning_rate": 9.933807660562898e-05,
+      "loss": 0.639,
       "step": 23
     },
     {
       "epoch": 6.0,
+      "grad_norm": 10.968963623046875,
+      "learning_rate": 9.913603233532067e-05,
+      "loss": 0.5014,
       "step": 24
     },
     {
       "epoch": 6.3076923076923075,
+      "grad_norm": 3.60386061668396,
+      "learning_rate": 9.890738003669029e-05,
+      "loss": 0.4728,
       "step": 25
     },
     {
       "epoch": 6.615384615384615,
+      "grad_norm": 3.794558525085449,
+      "learning_rate": 9.865224352899119e-05,
+      "loss": 0.4594,
       "step": 26
     },
     {
       "epoch": 6.923076923076923,
+      "grad_norm": 3.044400691986084,
+      "learning_rate": 9.837076097314319e-05,
+      "loss": 0.3733,
       "step": 27
     },
     {
       "epoch": 7.0,
+      "grad_norm": 10.606956481933594,
+      "learning_rate": 9.806308479691595e-05,
+      "loss": 0.2499,
       "step": 28
     },
     {
       "epoch": 7.3076923076923075,
+      "grad_norm": 2.988124132156372,
+      "learning_rate": 9.77293816123866e-05,
+      "loss": 0.2711,
       "step": 29
     },
     {
       "epoch": 7.615384615384615,
+      "grad_norm": 3.3052446842193604,
+      "learning_rate": 9.736983212571646e-05,
+      "loss": 0.3466,
       "step": 30
     },
     {
       "epoch": 7.923076923076923,
+      "grad_norm": 2.315810441970825,
+      "learning_rate": 9.698463103929542e-05,
+      "loss": 0.185,
       "step": 31
     },
     {
       "epoch": 8.0,
+      "grad_norm": 8.904762268066406,
+      "learning_rate": 9.657398694630712e-05,
+      "loss": 0.0841,
       "step": 32
     },
     {
       "epoch": 8.307692307692308,
+      "grad_norm": 2.4080655574798584,
+      "learning_rate": 9.613812221777212e-05,
+      "loss": 0.155,
       "step": 33
     },
     {
       "epoch": 8.615384615384615,
+      "grad_norm": 2.4143807888031006,
+      "learning_rate": 9.567727288213005e-05,
+      "loss": 0.1114,
       "step": 34
     },
     {
       "epoch": 8.923076923076923,
+      "grad_norm": 2.5390758514404297,
+      "learning_rate": 9.519168849742604e-05,
+      "loss": 0.1484,
       "step": 35
     },
     {
       "epoch": 9.0,
+      "grad_norm": 4.964494705200195,
+      "learning_rate": 9.468163201617062e-05,
+      "loss": 0.1483,
       "step": 36
     },
     {
       "epoch": 9.307692307692308,
+      "grad_norm": 2.1613352298736572,
+      "learning_rate": 9.414737964294636e-05,
+      "loss": 0.0918,
       "step": 37
     },
     {
       "epoch": 9.615384615384615,
+      "grad_norm": 2.3066835403442383,
+      "learning_rate": 9.358922068483812e-05,
+      "loss": 0.094,
       "step": 38
     },
     {
       "epoch": 9.923076923076923,
+      "grad_norm": 1.2779909372329712,
+      "learning_rate": 9.300745739476829e-05,
+      "loss": 0.0516,
       "step": 39
     },
     {
       "epoch": 10.0,
+      "grad_norm": 2.4075942039489746,
+      "learning_rate": 9.24024048078213e-05,
+      "loss": 0.0477,
       "step": 40
     },
     {
       "epoch": 10.307692307692308,
+      "grad_norm": 1.0170810222625732,
+      "learning_rate": 9.177439057064683e-05,
+      "loss": 0.0312,
       "step": 41
     },
     {
       "epoch": 10.615384615384615,
+      "grad_norm": 2.0403945446014404,
+      "learning_rate": 9.112375476403312e-05,
+      "loss": 0.0624,
       "step": 42
     },
     {
       "epoch": 10.923076923076923,
+      "grad_norm": 0.8645662665367126,
+      "learning_rate": 9.045084971874738e-05,
+      "loss": 0.0262,
       "step": 43
     },
     {
       "epoch": 11.0,
+      "grad_norm": 5.61713171005249,
+      "learning_rate": 8.97560398247424e-05,
+      "loss": 0.0711,
       "step": 44
     },
     {
       "epoch": 11.307692307692308,
+      "grad_norm": 0.8560450673103333,
+      "learning_rate": 8.903970133383297e-05,
+      "loss": 0.0232,
       "step": 45
     },
     {
       "epoch": 11.615384615384615,
+      "grad_norm": 1.1219335794448853,
+      "learning_rate": 8.83022221559489e-05,
+      "loss": 0.0251,
       "step": 46
     },
     {
       "epoch": 11.923076923076923,
+      "grad_norm": 1.4238826036453247,
+      "learning_rate": 8.754400164907497e-05,
+      "loss": 0.0402,
       "step": 47
     },
     {
       "epoch": 12.0,
+      "grad_norm": 0.6608924269676208,
+      "learning_rate": 8.676545040299145e-05,
+      "loss": 0.0029,
       "step": 48
     },
     {
       "epoch": 12.307692307692308,
+      "grad_norm": 2.1574368476867676,
+      "learning_rate": 8.596699001693255e-05,
+      "loss": 0.014,
       "step": 49
     },
     {
       "epoch": 12.615384615384615,
+      "grad_norm": 0.9417891502380371,
+      "learning_rate": 8.51490528712831e-05,
+      "loss": 0.0173,
       "step": 50
     },
     {
       "epoch": 12.923076923076923,
+      "grad_norm": 1.281510353088379,
+      "learning_rate": 8.43120818934367e-05,
+      "loss": 0.0275,
       "step": 51
     },
     {
       "epoch": 13.0,
+      "grad_norm": 2.0283124446868896,
+      "learning_rate": 8.345653031794292e-05,
+      "loss": 0.0311,
       "step": 52
     },
     {
       "epoch": 13.307692307692308,
+      "grad_norm": 0.5412298440933228,
+      "learning_rate": 8.258286144107276e-05,
+      "loss": 0.0094,
       "step": 53
     },
     {
       "epoch": 13.615384615384615,
+      "grad_norm": 0.9629742503166199,
+      "learning_rate": 8.169154836993551e-05,
+      "loss": 0.0144,
       "step": 54
     },
     {
       "epoch": 13.923076923076923,
+      "grad_norm": 0.6250266432762146,
+      "learning_rate": 8.07830737662829e-05,
+      "loss": 0.0121,
       "step": 55
     },
     {
       "epoch": 14.0,
+      "grad_norm": 2.4746816158294678,
+      "learning_rate": 7.985792958513931e-05,
+      "loss": 0.0212,
       "step": 56
     },
     {
       "epoch": 14.307692307692308,
+      "grad_norm": 0.6302680373191833,
+      "learning_rate": 7.891661680839932e-05,
+      "loss": 0.0092,
       "step": 57
     },
     {
       "epoch": 14.615384615384615,
+      "grad_norm": 0.462685227394104,
+      "learning_rate": 7.795964517353735e-05,
+      "loss": 0.0066,
       "step": 58
     },
     {
       "epoch": 14.923076923076923,
+      "grad_norm": 0.8047600388526917,
+      "learning_rate": 7.698753289757565e-05,
+      "loss": 0.0125,
       "step": 59
     },
     {
       "epoch": 15.0,
+      "grad_norm": 1.3532655239105225,
+      "learning_rate": 7.600080639646077e-05,
+      "loss": 0.0094,
       "step": 60
     },
     {
       "epoch": 15.307692307692308,
+      "grad_norm": 0.8325954079627991,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 0.0078,
       "step": 61
     },
     {
       "epoch": 15.615384615384615,
+      "grad_norm": 0.5373042225837708,
+      "learning_rate": 7.398565566251232e-05,
+      "loss": 0.0065,
       "step": 62
     },
     {
       "epoch": 15.923076923076923,
+      "grad_norm": 0.6049899458885193,
+      "learning_rate": 7.295832266935059e-05,
+      "loss": 0.0072,
       "step": 63
     },
     {
       "epoch": 16.0,
+      "grad_norm": 0.6376262903213501,
+      "learning_rate": 7.191855733945387e-05,
+      "loss": 0.0061,
       "step": 64
     },
     {
       "epoch": 16.307692307692307,
+      "grad_norm": 0.4146476089954376,
+      "learning_rate": 7.08669227240909e-05,
+      "loss": 0.0067,
       "step": 65
     },
     {
       "epoch": 16.615384615384617,
+      "grad_norm": 0.901756763458252,
+      "learning_rate": 6.980398830195785e-05,
+      "loss": 0.0111,
       "step": 66
     },
     {
       "epoch": 16.923076923076923,
+      "grad_norm": 0.1660463660955429,
+      "learning_rate": 6.873032967079561e-05,
+      "loss": 0.0031,
       "step": 67
     },
     {
       "epoch": 17.0,
+      "grad_norm": 1.6091344356536865,
+      "learning_rate": 6.764652823569344e-05,
+      "loss": 0.0114,
       "step": 68
     },
     {
       "epoch": 17.307692307692307,
+      "grad_norm": 0.17023883759975433,
+      "learning_rate": 6.65531708942479e-05,
+      "loss": 0.0028,
       "step": 69
     },
     {
       "epoch": 17.615384615384617,
+      "grad_norm": 1.0435467958450317,
+      "learning_rate": 6.545084971874738e-05,
+      "loss": 0.0096,
       "step": 70
     },
     {
       "epoch": 17.923076923076923,
+      "grad_norm": 0.6580948233604431,
+      "learning_rate": 6.434016163555452e-05,
+      "loss": 0.006,
       "step": 71
     },
     {
       "epoch": 18.0,
+      "grad_norm": 1.8953274488449097,
+      "learning_rate": 6.322170810186012e-05,
+      "loss": 0.0119,
       "step": 72
     },
     {
       "epoch": 18.307692307692307,
+      "grad_norm": 0.39621683955192566,
+      "learning_rate": 6.209609477998338e-05,
+      "loss": 0.0042,
       "step": 73
     },
     {
       "epoch": 18.615384615384617,
+      "grad_norm": 0.3594362437725067,
+      "learning_rate": 6.096393120939516e-05,
       "loss": 0.0042,
       "step": 74
     },
     {
       "epoch": 18.923076923076923,
+      "grad_norm": 0.8800605535507202,
+      "learning_rate": 5.982583047664151e-05,
+      "loss": 0.0096,
       "step": 75
     },
     {
       "epoch": 19.0,
+      "grad_norm": 0.41055458784103394,
+      "learning_rate": 5.868240888334653e-05,
+      "loss": 0.0036,
       "step": 76
     },
     {
       "epoch": 19.307692307692307,
+      "grad_norm": 0.3029349446296692,
+      "learning_rate": 5.753428561247416e-05,
+      "loss": 0.0026,
       "step": 77
     },
     {
       "epoch": 19.615384615384617,
+      "grad_norm": 0.5970585346221924,
+      "learning_rate": 5.6382082393029746e-05,
+      "loss": 0.0078,
       "step": 78
     },
     {
       "epoch": 19.923076923076923,
+      "grad_norm": 0.27087146043777466,
+      "learning_rate": 5.522642316338268e-05,
+      "loss": 0.0039,
       "step": 79
     },
     {
       "epoch": 20.0,
+      "grad_norm": 0.8190112113952637,
+      "learning_rate": 5.4067933733392915e-05,
+      "loss": 0.0055,
       "step": 80
     },
     {
       "epoch": 20.307692307692307,
+      "grad_norm": 0.19041061401367188,
+      "learning_rate": 5.290724144552379e-05,
+      "loss": 0.0016,
       "step": 81
     },
     {
       "epoch": 20.615384615384617,
+      "grad_norm": 0.7743979692459106,
+      "learning_rate": 5.174497483512506e-05,
+      "loss": 0.0045,
       "step": 82
     },
     {
       "epoch": 20.923076923076923,
+      "grad_norm": 0.21607236564159393,
+      "learning_rate": 5.0581763290069865e-05,
+      "loss": 0.0035,
       "step": 83
     },
     {
       "epoch": 21.0,
+      "grad_norm": 1.6706500053405762,
+      "learning_rate": 4.941823670993016e-05,
+      "loss": 0.0133,
       "step": 84
     },
     {
       "epoch": 21.307692307692307,
+      "grad_norm": 0.48242342472076416,
+      "learning_rate": 4.825502516487497e-05,
+      "loss": 0.0059,
       "step": 85
     },
     {
       "epoch": 21.615384615384617,
+      "grad_norm": 0.11222591251134872,
+      "learning_rate": 4.709275855447621e-05,
+      "loss": 0.0025,
       "step": 86
     },
     {
       "epoch": 21.923076923076923,
+      "grad_norm": 0.22460472583770752,
+      "learning_rate": 4.593206626660709e-05,
+      "loss": 0.0033,
       "step": 87
     },
     {
       "epoch": 22.0,
+      "grad_norm": 0.39556336402893066,
+      "learning_rate": 4.477357683661734e-05,
+      "loss": 0.0024,
       "step": 88
     },
     {
       "epoch": 22.307692307692307,
+      "grad_norm": 0.2808006703853607,
+      "learning_rate": 4.361791760697027e-05,
+      "loss": 0.0028,
       "step": 89
     },
     {
       "epoch": 22.615384615384617,
+      "grad_norm": 0.3281514048576355,
+      "learning_rate": 4.246571438752585e-05,
+      "loss": 0.0029,
       "step": 90
     },
     {
       "epoch": 22.923076923076923,
+      "grad_norm": 0.4327070116996765,
+      "learning_rate": 4.131759111665349e-05,
       "loss": 0.0023,
       "step": 91
     },
     {
       "epoch": 23.0,
+      "grad_norm": 0.6825558543205261,
+      "learning_rate": 4.017416952335849e-05,
+      "loss": 0.0038,
       "step": 92
     },
     {
       "epoch": 23.307692307692307,
+      "grad_norm": 0.5522281527519226,
+      "learning_rate": 3.903606879060483e-05,
+      "loss": 0.002,
       "step": 93
     },
     {
       "epoch": 23.615384615384617,
+      "grad_norm": 0.06093262881040573,
+      "learning_rate": 3.790390522001662e-05,
+      "loss": 0.002,
       "step": 94
     },
     {
       "epoch": 23.923076923076923,
+      "grad_norm": 0.1554577350616455,
+      "learning_rate": 3.67782918981399e-05,
+      "loss": 0.0021,
       "step": 95
     },
     {
       "epoch": 24.0,
+      "grad_norm": 0.5829846858978271,
+      "learning_rate": 3.5659838364445505e-05,
+      "loss": 0.0031,
       "step": 96
     },
     {
       "epoch": 24.307692307692307,
+      "grad_norm": 0.10798896104097366,
+      "learning_rate": 3.4549150281252636e-05,
       "loss": 0.0017,
       "step": 97
     },
     {
       "epoch": 24.615384615384617,
+      "grad_norm": 0.0642886683344841,
+      "learning_rate": 3.34468291057521e-05,
       "loss": 0.0019,
       "step": 98
     },
     {
       "epoch": 24.923076923076923,
+      "grad_norm": 0.08898573368787766,
+      "learning_rate": 3.235347176430656e-05,
+      "loss": 0.0016,
       "step": 99
     },
     {
       "epoch": 25.0,
+      "grad_norm": 2.9850921630859375,
+      "learning_rate": 3.12696703292044e-05,
+      "loss": 0.0109,
       "step": 100
     },
     {
       "epoch": 25.307692307692307,
+      "grad_norm": 0.15037357807159424,
+      "learning_rate": 3.019601169804216e-05,
       "loss": 0.0012,
       "step": 101
     },
     {
       "epoch": 25.615384615384617,
+      "grad_norm": 0.45115193724632263,
+      "learning_rate": 2.9133077275909108e-05,
+      "loss": 0.0029,
       "step": 102
     },
     {
       "epoch": 25.923076923076923,
+      "grad_norm": 0.12441351264715195,
+      "learning_rate": 2.8081442660546125e-05,
+      "loss": 0.0021,
       "step": 103
     },
     {
       "epoch": 26.0,
+      "grad_norm": 0.03845607116818428,
+      "learning_rate": 2.7041677330649407e-05,
+      "loss": 0.0014,
       "step": 104
     },
     {
       "epoch": 26.307692307692307,
+      "grad_norm": 0.3710884153842926,
+      "learning_rate": 2.6014344337487707e-05,
+      "loss": 0.0024,
       "step": 105
     },
     {
       "epoch": 26.615384615384617,
+      "grad_norm": 0.0669671967625618,
+      "learning_rate": 2.500000000000001e-05,
+      "loss": 0.0016,
       "step": 106
     },
     {
       "epoch": 26.923076923076923,
+      "grad_norm": 0.0413970872759819,
+      "learning_rate": 2.399919360353923e-05,
+      "loss": 0.0016,
       "step": 107
     },
     {
       "epoch": 27.0,
+      "grad_norm": 0.08645425736904144,
+      "learning_rate": 2.3012467102424373e-05,
       "loss": 0.0014,
       "step": 108
     },
     {
       "epoch": 27.307692307692307,
+      "grad_norm": 0.18081574141979218,
+      "learning_rate": 2.2040354826462668e-05,
+      "loss": 0.0011,
       "step": 109
     },
     {
       "epoch": 27.615384615384617,
+      "grad_norm": 0.06019139289855957,
+      "learning_rate": 2.1083383191600674e-05,
+      "loss": 0.0017,
       "step": 110
     },
     {
       "epoch": 27.923076923076923,
+      "grad_norm": 0.03454792872071266,
+      "learning_rate": 2.0142070414860704e-05,
       "loss": 0.0014,
       "step": 111
     },
     {
       "epoch": 28.0,
+      "grad_norm": 0.08351138234138489,
+      "learning_rate": 1.9216926233717085e-05,
+      "loss": 0.0013,
       "step": 112
     },
     {
       "epoch": 28.307692307692307,
+      "grad_norm": 0.02913900464773178,
+      "learning_rate": 1.8308451630064484e-05,
       "loss": 0.0014,
       "step": 113
     },
     {
       "epoch": 28.615384615384617,
+      "grad_norm": 0.03412294760346413,
+      "learning_rate": 1.7417138558927244e-05,
+      "loss": 0.0012,
       "step": 114
     },
     {
       "epoch": 28.923076923076923,
+      "grad_norm": 0.0328608974814415,
+      "learning_rate": 1.6543469682057106e-05,
+      "loss": 0.0013,
       "step": 115
     },
     {
       "epoch": 29.0,
+      "grad_norm": 0.050466686487197876,
+      "learning_rate": 1.5687918106563326e-05,
       "loss": 0.0011,
       "step": 116
     },
     {
       "epoch": 29.307692307692307,
+      "grad_norm": 0.02442150004208088,
+      "learning_rate": 1.4850947128716913e-05,
+      "loss": 0.001,
       "step": 117
     },
     {
       "epoch": 29.615384615384617,
+      "grad_norm": 0.027474530041217804,
+      "learning_rate": 1.4033009983067452e-05,
+      "loss": 0.0015,
       "step": 118
     },
     {
       "epoch": 29.923076923076923,
+      "grad_norm": 0.024797696620225906,
+      "learning_rate": 1.3234549597008571e-05,
       "loss": 0.0013,
       "step": 119
     },
     {
       "epoch": 30.0,
+      "grad_norm": 0.029813682660460472,
+      "learning_rate": 1.245599835092504e-05,
+      "loss": 0.0012,
       "step": 120
     },
     {
       "epoch": 30.307692307692307,
+      "grad_norm": 0.01790359988808632,
+      "learning_rate": 1.1697777844051105e-05,
+      "loss": 0.001,
       "step": 121
     },
     {
       "epoch": 30.615384615384617,
+      "grad_norm": 0.021541791036725044,
+      "learning_rate": 1.096029866616704e-05,
       "loss": 0.0012,
       "step": 122
     },
     {
       "epoch": 30.923076923076923,
+      "grad_norm": 0.023409120738506317,
+      "learning_rate": 1.0243960175257606e-05,
       "loss": 0.0011,
       "step": 123
     },
     {
       "epoch": 31.0,
+      "grad_norm": 0.04634522646665573,
+      "learning_rate": 9.549150281252633e-06,
+      "loss": 0.0016,
       "step": 124
     },
     {
       "epoch": 31.307692307692307,
+      "grad_norm": 0.0230031069368124,
+      "learning_rate": 8.876245235966885e-06,
+      "loss": 0.0012,
       "step": 125
     },
     {
       "epoch": 31.615384615384617,
+      "grad_norm": 0.050953831523656845,
+      "learning_rate": 8.225609429353187e-06,
+      "loss": 0.001,
       "step": 126
     },
     {
       "epoch": 31.923076923076923,
+      "grad_norm": 0.03163566812872887,
+      "learning_rate": 7.597595192178702e-06,
+      "loss": 0.0012,
       "step": 127
     },
     {
       "epoch": 32.0,
+      "grad_norm": 0.03247598558664322,
+      "learning_rate": 6.992542605231739e-06,
+      "loss": 0.0014,
       "step": 128
     },
     {
       "epoch": 32.30769230769231,
+      "grad_norm": 0.013677124865353107,
+      "learning_rate": 6.410779315161886e-06,
+      "loss": 0.0009,
       "step": 129
     },
     {
       "epoch": 32.61538461538461,
+      "grad_norm": 0.026261860504746437,
+      "learning_rate": 5.852620357053651e-06,
+      "loss": 0.0011,
       "step": 130
     },
     {
       "epoch": 32.92307692307692,
+      "grad_norm": 0.021554652601480484,
+      "learning_rate": 5.318367983829392e-06,
+      "loss": 0.0014,
       "step": 131
     },
     {
       "epoch": 33.0,
+      "grad_norm": 0.048853594809770584,
+      "learning_rate": 4.8083115025739756e-06,
+      "loss": 0.0009,
       "step": 132
     },
     {
       "epoch": 33.30769230769231,
+      "grad_norm": 0.1052127406001091,
+      "learning_rate": 4.322727117869951e-06,
+      "loss": 0.0013,
       "step": 133
     },
     {
       "epoch": 33.61538461538461,
+      "grad_norm": 0.014833934605121613,
+      "learning_rate": 3.861877782227885e-06,
+      "loss": 0.0009,
       "step": 134
     },
     {
       "epoch": 33.92307692307692,
+      "grad_norm": 0.022332781925797462,
+      "learning_rate": 3.426013053692878e-06,
+      "loss": 0.0012,
       "step": 135
     },
     {
       "epoch": 34.0,
+      "grad_norm": 0.034213513135910034,
+      "learning_rate": 3.0153689607045845e-06,
+      "loss": 0.001,
       "step": 136
     },
     {
       "epoch": 34.30769230769231,
+      "grad_norm": 0.020968670025467873,
+      "learning_rate": 2.63016787428354e-06,
+      "loss": 0.0013,
       "step": 137
     },
     {
       "epoch": 34.61538461538461,
+      "grad_norm": 0.013379962183535099,
+      "learning_rate": 2.2706183876134045e-06,
+      "loss": 0.0007,
       "step": 138
     },
     {
       "epoch": 34.92307692307692,
+      "grad_norm": 0.020913399755954742,
+      "learning_rate": 1.9369152030840556e-06,
+      "loss": 0.0011,
       "step": 139
     },
     {
       "epoch": 35.0,
+      "grad_norm": 0.032495055347681046,
+      "learning_rate": 1.6292390268568104e-06,
+      "loss": 0.0014,
       "step": 140
     },
     {
       "epoch": 35.30769230769231,
+      "grad_norm": 0.01754385605454445,
+      "learning_rate": 1.3477564710088098e-06,
+      "loss": 0.0011,
       "step": 141
     },
     {
       "epoch": 35.61538461538461,
+      "grad_norm": 0.021996742114424706,
+      "learning_rate": 1.0926199633097157e-06,
+      "loss": 0.0011,
       "step": 142
     },
     {
       "epoch": 35.92307692307692,
+      "grad_norm": 0.013828999362885952,
+      "learning_rate": 8.639676646793382e-07,
+      "loss": 0.0011,
       "step": 143
     },
     {
       "epoch": 36.0,
+      "grad_norm": 0.0670580044388771,
+      "learning_rate": 6.61923394371039e-07,
+      "loss": 0.0012,
       "step": 144
     },
     {
       "epoch": 36.30769230769231,
+      "grad_norm": 0.01171192154288292,
+      "learning_rate": 4.865965629214819e-07,
       "loss": 0.0008,
+      "step": 145
     }
   ],
   "logging_steps": 1,
+  "max_steps": 150,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 38,
+  "save_steps": 5,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 4.279650186790502e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d0602db10057a2c9b0fdf15117d239f18bc4bbff9fed96a819f068221c63f8a
 size 6033

 version https://git-lfs.github.com/spec/v1
+oid sha256:1f19b144ff6256052340a4caed3d3358b46d4b6c87b9238f7d8f792f8dda85ea
 size 6033