Jaehwisong committed on
Commit
04e249e
·
verified ·
1 Parent(s): f4585c7

Upload trained Qwen3-VL model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +10 -0
  2. README.md +58 -0
  3. accelerate_config.yaml +13 -0
  4. checkpoint-1000/added_tokens.json +28 -0
  5. checkpoint-1000/chat_template.jinja +120 -0
  6. checkpoint-1000/config.json +67 -0
  7. checkpoint-1000/generation_config.json +13 -0
  8. checkpoint-1000/merges.txt +0 -0
  9. checkpoint-1000/model.safetensors +3 -0
  10. checkpoint-1000/optimizer.pt +3 -0
  11. checkpoint-1000/preprocessor_config.json +39 -0
  12. checkpoint-1000/rng_state_0.pth +3 -0
  13. checkpoint-1000/rng_state_1.pth +3 -0
  14. checkpoint-1000/rng_state_2.pth +3 -0
  15. checkpoint-1000/rng_state_3.pth +3 -0
  16. checkpoint-1000/rng_state_4.pth +3 -0
  17. checkpoint-1000/rng_state_5.pth +3 -0
  18. checkpoint-1000/rng_state_6.pth +3 -0
  19. checkpoint-1000/rng_state_7.pth +3 -0
  20. checkpoint-1000/scheduler.pt +3 -0
  21. checkpoint-1000/special_tokens_map.json +31 -0
  22. checkpoint-1000/tokenizer.json +3 -0
  23. checkpoint-1000/tokenizer_config.json +240 -0
  24. checkpoint-1000/trainer_state.json +1078 -0
  25. checkpoint-1000/training_args.bin +3 -0
  26. checkpoint-1000/video_preprocessor_config.json +41 -0
  27. checkpoint-1000/vocab.json +0 -0
  28. checkpoint-1500/added_tokens.json +28 -0
  29. checkpoint-1500/chat_template.jinja +120 -0
  30. checkpoint-1500/config.json +67 -0
  31. checkpoint-1500/generation_config.json +13 -0
  32. checkpoint-1500/merges.txt +0 -0
  33. checkpoint-1500/model.safetensors +3 -0
  34. checkpoint-1500/optimizer.pt +3 -0
  35. checkpoint-1500/preprocessor_config.json +39 -0
  36. checkpoint-1500/rng_state_0.pth +3 -0
  37. checkpoint-1500/rng_state_1.pth +3 -0
  38. checkpoint-1500/rng_state_2.pth +3 -0
  39. checkpoint-1500/rng_state_3.pth +3 -0
  40. checkpoint-1500/rng_state_4.pth +3 -0
  41. checkpoint-1500/rng_state_5.pth +3 -0
  42. checkpoint-1500/rng_state_6.pth +3 -0
  43. checkpoint-1500/rng_state_7.pth +3 -0
  44. checkpoint-1500/scheduler.pt +3 -0
  45. checkpoint-1500/special_tokens_map.json +31 -0
  46. checkpoint-1500/tokenizer.json +3 -0
  47. checkpoint-1500/tokenizer_config.json +240 -0
  48. checkpoint-1500/trainer_state.json +1600 -0
  49. checkpoint-1500/training_args.bin +3 -0
  50. checkpoint-1500/video_preprocessor_config.json +41 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-4320/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ final_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-VL-2B-Instruct
3
+ library_name: transformers
4
+ model_name: Qwen3_checkpoint
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen3_checkpoint
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
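+ generator = pipeline("text-generation", model="None", device="cuda")  # NOTE: "None" is an unfilled placeholder; replace it with this model's Hub repo ID or a local checkpoint path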
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/configint/qwen3-vl-sft/runs/uuci5pet)
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.23.1
38
+ - Transformers: 4.57.0
39
+ - Pytorch: 2.8.0
40
+ - Datasets: 4.2.0
41
+ - Tokenizers: 0.22.1
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
accelerate_config.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ main_training_function: main
7
+ mixed_precision: bf16
8
+ num_machines: 1
9
+ num_processes: 8
10
+ tpu_env: []
11
+ tpu_use_cluster: false
12
+ tpu_use_sudo: false
13
+ use_cpu: false
checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
checkpoint-1000/config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "eos_token_id": 151645,
7
+ "image_token_id": 151655,
8
+ "model_type": "qwen3_vl",
9
+ "pad_token_id": 151643,
10
+ "text_config": {
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 151643,
14
+ "dtype": "bfloat16",
15
+ "eos_token_id": 151645,
16
+ "head_dim": 128,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 2048,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 6144,
21
+ "max_position_embeddings": 262144,
22
+ "model_type": "qwen3_vl_text",
23
+ "num_attention_heads": 16,
24
+ "num_hidden_layers": 28,
25
+ "num_key_value_heads": 8,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_scaling": {
28
+ "mrope_interleaved": true,
29
+ "mrope_section": [
30
+ 24,
31
+ 20,
32
+ 20
33
+ ],
34
+ "rope_type": "default"
35
+ },
36
+ "rope_theta": 5000000,
37
+ "tie_word_embeddings": true,
38
+ "use_cache": true,
39
+ "vocab_size": 151936
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "transformers_version": "4.57.0",
43
+ "video_token_id": 151656,
44
+ "vision_config": {
45
+ "deepstack_visual_indexes": [
46
+ 5,
47
+ 11,
48
+ 17
49
+ ],
50
+ "depth": 24,
51
+ "dtype": "bfloat16",
52
+ "hidden_act": "gelu_pytorch_tanh",
53
+ "hidden_size": 1024,
54
+ "in_channels": 3,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 4096,
57
+ "model_type": "qwen3_vl",
58
+ "num_heads": 16,
59
+ "num_position_embeddings": 2304,
60
+ "out_hidden_size": 2048,
61
+ "patch_size": 16,
62
+ "spatial_merge_size": 2,
63
+ "temporal_patch_size": 2
64
+ },
65
+ "vision_end_token_id": 151653,
66
+ "vision_start_token_id": 151652
67
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.0"
13
+ }
checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60bee21b510c724c2ccb9b218eba003ddf592ac7a8c0b4c38caa23f65a14cedc
3
+ size 4255140312
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e706a37013c99bdf0096b6a9fb1994b9400923833ecd156e8d9a4d4cf98adc50
3
+ size 8510679398
checkpoint-1000/preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": null,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "input_data_format": null,
25
+ "max_pixels": null,
26
+ "merge_size": 2,
27
+ "min_pixels": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "longest_edge": 16777216,
36
+ "shortest_edge": 65536
37
+ },
38
+ "temporal_patch_size": 2
39
+ }
checkpoint-1000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8e5f48b40f283c2be57ffeca20c84e74d5bad51d76da17d127991b78da5289d
3
+ size 16389
checkpoint-1000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c3696205cbf4679a7f2faf48351e924c79ada4cfd81a956e03e4319cf3d05b5
3
+ size 16389
checkpoint-1000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:403766b1961af9b11405f33af2b272a2d6287cefdee9eb7e26e1dc71e3ed3707
3
+ size 16389
checkpoint-1000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54f6e3e08fea79c105dbd92657538a3cb15e89872360a3c3072b758835ca5ebc
3
+ size 16389
checkpoint-1000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0203808a2df05d235c57b83a0a087bae98ef6e47f06d685a0bddaa460a16c910
3
+ size 16389
checkpoint-1000/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b492b10406611c53f05fd065d6968c7e5e3797fd0008fca46cad009936c67e
3
+ size 16389
checkpoint-1000/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d44646ad1e32ad260756d9c2e6927a7a101af3a5c9265d8db8a1d4def7d446cb
3
+ size 16389
checkpoint-1000/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b83b78288d5cea0b10985a2d48c3c0b12d2ff3f99104720c1b3281409b3b752
3
+ size 16389
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cc81b6515b3846e01beabc95700d2ae76e21c13cebaaa8b0b7741d224738863
3
+ size 1465
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "processor_class": "Qwen3VLProcessor",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1078 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.0312611423432827,
4
+ "best_model_checkpoint": "/workspace/Qwen3_checkpoint/checkpoint-1000",
5
+ "epoch": 0.6945047313134821,
6
+ "eval_steps": 250,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.1519597232341767,
14
+ "epoch": 0.00694504731313482,
15
+ "grad_norm": 540.0,
16
+ "learning_rate": 6.249999999999999e-07,
17
+ "loss": 22.6265,
18
+ "mean_token_accuracy": 0.07503694812767207,
19
+ "num_tokens": 1640499.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.1664627104997636,
24
+ "epoch": 0.01389009462626964,
25
+ "grad_norm": 466.0,
26
+ "learning_rate": 1.3194444444444444e-06,
27
+ "loss": 22.1144,
28
+ "mean_token_accuracy": 0.08692597970366478,
29
+ "num_tokens": 3288480.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.239394597709179,
34
+ "epoch": 0.020835141939404462,
35
+ "grad_norm": 462.0,
36
+ "learning_rate": 2.0138888888888893e-06,
37
+ "loss": 21.7485,
38
+ "mean_token_accuracy": 0.07649312568828463,
39
+ "num_tokens": 4958023.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.4183061853051186,
44
+ "epoch": 0.02778018925253928,
45
+ "grad_norm": 436.0,
46
+ "learning_rate": 2.7083333333333334e-06,
47
+ "loss": 20.1258,
48
+ "mean_token_accuracy": 0.07965942500159144,
49
+ "num_tokens": 6627704.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.9329519897699357,
54
+ "epoch": 0.034725236565674104,
55
+ "grad_norm": 378.0,
56
+ "learning_rate": 3.402777777777778e-06,
57
+ "loss": 16.576,
58
+ "mean_token_accuracy": 0.07870438289828599,
59
+ "num_tokens": 8340232.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 4.602115747332573,
64
+ "epoch": 0.041670283878808924,
65
+ "grad_norm": 184.0,
66
+ "learning_rate": 4.097222222222222e-06,
67
+ "loss": 12.1758,
68
+ "mean_token_accuracy": 0.07569821006618441,
69
+ "num_tokens": 10025658.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 7.328701210021973,
74
+ "epoch": 0.04861533119194374,
75
+ "grad_norm": 114.5,
76
+ "learning_rate": 4.791666666666667e-06,
77
+ "loss": 9.8369,
78
+ "mean_token_accuracy": 0.08327204268425703,
79
+ "num_tokens": 11741238.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 7.635936880111695,
84
+ "epoch": 0.05556037850507856,
85
+ "grad_norm": 113.5,
86
+ "learning_rate": 5.486111111111111e-06,
87
+ "loss": 8.3162,
88
+ "mean_token_accuracy": 0.0876343248412013,
89
+ "num_tokens": 13404022.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 7.46761246919632,
94
+ "epoch": 0.06250542581821339,
95
+ "grad_norm": 123.5,
96
+ "learning_rate": 6.180555555555556e-06,
97
+ "loss": 7.63,
98
+ "mean_token_accuracy": 0.08659355682320893,
99
+ "num_tokens": 15112991.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 7.1816710293293,
104
+ "epoch": 0.06945047313134821,
105
+ "grad_norm": 148.0,
106
+ "learning_rate": 6.875e-06,
107
+ "loss": 7.0576,
108
+ "mean_token_accuracy": 0.08753767567686736,
109
+ "num_tokens": 16756210.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 7.2137157917022705,
114
+ "epoch": 0.07639552044448303,
115
+ "grad_norm": 169.0,
116
+ "learning_rate": 7.569444444444445e-06,
117
+ "loss": 6.6052,
118
+ "mean_token_accuracy": 0.08361622425727547,
119
+ "num_tokens": 18470889.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 7.177587866783142,
124
+ "epoch": 0.08334056775761785,
125
+ "grad_norm": 156.0,
126
+ "learning_rate": 8.263888888888888e-06,
127
+ "loss": 6.0531,
128
+ "mean_token_accuracy": 0.08918410916812718,
129
+ "num_tokens": 20159140.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 7.2117412209510805,
134
+ "epoch": 0.09028561507075267,
135
+ "grad_norm": 160.0,
136
+ "learning_rate": 8.958333333333334e-06,
137
+ "loss": 5.4473,
138
+ "mean_token_accuracy": 0.08560893354006112,
139
+ "num_tokens": 21857491.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 7.211903780698776,
144
+ "epoch": 0.09723066238388749,
145
+ "grad_norm": 278.0,
146
+ "learning_rate": 9.652777777777777e-06,
147
+ "loss": 4.788,
148
+ "mean_token_accuracy": 0.127999250870198,
149
+ "num_tokens": 23545940.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 7.21578094959259,
154
+ "epoch": 0.1041757096970223,
155
+ "grad_norm": 163.0,
156
+ "learning_rate": 1.0347222222222223e-05,
157
+ "loss": 4.1808,
158
+ "mean_token_accuracy": 0.5043030813336372,
159
+ "num_tokens": 25200650.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 7.225958931446075,
164
+ "epoch": 0.11112075701015713,
165
+ "grad_norm": 146.0,
166
+ "learning_rate": 1.1041666666666668e-05,
167
+ "loss": 3.4839,
168
+ "mean_token_accuracy": 0.8470314003527164,
169
+ "num_tokens": 26879379.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 7.03481342792511,
174
+ "epoch": 0.11806580432329196,
175
+ "grad_norm": 160.0,
176
+ "learning_rate": 1.1736111111111112e-05,
177
+ "loss": 2.6644,
178
+ "mean_token_accuracy": 0.9146032109856606,
179
+ "num_tokens": 28559842.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 6.437038451433182,
184
+ "epoch": 0.12501085163642678,
185
+ "grad_norm": 135.0,
186
+ "learning_rate": 1.2430555555555557e-05,
187
+ "loss": 1.8016,
188
+ "mean_token_accuracy": 0.9310876823961735,
189
+ "num_tokens": 30223208.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 4.774793094396591,
194
+ "epoch": 0.13195589894956158,
195
+ "grad_norm": 72.0,
196
+ "learning_rate": 1.3125e-05,
197
+ "loss": 1.0024,
198
+ "mean_token_accuracy": 0.9337419532239437,
199
+ "num_tokens": 31915078.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 2.502578377723694,
204
+ "epoch": 0.13890094626269642,
205
+ "grad_norm": 18.5,
206
+ "learning_rate": 1.3819444444444444e-05,
207
+ "loss": 0.5206,
208
+ "mean_token_accuracy": 0.9370624087750912,
209
+ "num_tokens": 33627632.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1439476042985917,
214
+ "epoch": 0.14584599357583122,
215
+ "grad_norm": 6.5,
216
+ "learning_rate": 1.451388888888889e-05,
217
+ "loss": 0.314,
218
+ "mean_token_accuracy": 0.9395098507404327,
219
+ "num_tokens": 35335117.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.625144999101758,
224
+ "epoch": 0.15279104088896606,
225
+ "grad_norm": 2.296875,
226
+ "learning_rate": 1.5208333333333333e-05,
227
+ "loss": 0.2057,
228
+ "mean_token_accuracy": 0.9517911829054355,
229
+ "num_tokens": 37011186.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.34807259757071735,
234
+ "epoch": 0.1597360882021009,
235
+ "grad_norm": 0.98828125,
236
+ "learning_rate": 1.590277777777778e-05,
237
+ "loss": 0.1098,
238
+ "mean_token_accuracy": 0.9706214666366577,
239
+ "num_tokens": 38673894.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.22573072295635938,
244
+ "epoch": 0.1666811355152357,
245
+ "grad_norm": 0.71875,
246
+ "learning_rate": 1.6597222222222222e-05,
247
+ "loss": 0.0648,
248
+ "mean_token_accuracy": 0.9812032662332058,
249
+ "num_tokens": 40360862.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.19475232288241387,
254
+ "epoch": 0.17362618282837053,
255
+ "grad_norm": 0.3515625,
256
+ "learning_rate": 1.7291666666666666e-05,
257
+ "loss": 0.0472,
258
+ "mean_token_accuracy": 0.9857565425336361,
259
+ "num_tokens": 42047917.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "epoch": 0.17362618282837053,
264
+ "eval_entropy": 0.22951670004171507,
265
+ "eval_loss": 0.05014730989933014,
266
+ "eval_mean_token_accuracy": 0.985958794625192,
267
+ "eval_num_tokens": 42047917.0,
268
+ "eval_runtime": 81.9972,
269
+ "eval_samples_per_second": 121.468,
270
+ "eval_steps_per_second": 7.598,
271
+ "step": 250
272
+ },
273
+ {
274
+ "entropy": 0.1923621967434883,
275
+ "epoch": 0.18057123014150533,
276
+ "grad_norm": 0.337890625,
277
+ "learning_rate": 1.7986111111111113e-05,
278
+ "loss": 0.0397,
279
+ "mean_token_accuracy": 0.9877771027386189,
280
+ "num_tokens": 43714135.0,
281
+ "step": 260
282
+ },
283
+ {
284
+ "entropy": 0.18295048763975502,
285
+ "epoch": 0.18751627745464017,
286
+ "grad_norm": 0.3046875,
287
+ "learning_rate": 1.8680555555555556e-05,
288
+ "loss": 0.0395,
289
+ "mean_token_accuracy": 0.9877975657582283,
290
+ "num_tokens": 45420354.0,
291
+ "step": 270
292
+ },
293
+ {
294
+ "entropy": 0.18508334569633006,
295
+ "epoch": 0.19446132476777497,
296
+ "grad_norm": 0.29296875,
297
+ "learning_rate": 1.9375e-05,
298
+ "loss": 0.0365,
299
+ "mean_token_accuracy": 0.9889688648283481,
300
+ "num_tokens": 47094373.0,
301
+ "step": 280
302
+ },
303
+ {
304
+ "entropy": 0.17800553049892187,
305
+ "epoch": 0.2014063720809098,
306
+ "grad_norm": 0.2373046875,
307
+ "learning_rate": 2.0069444444444447e-05,
308
+ "loss": 0.0334,
309
+ "mean_token_accuracy": 0.9898182846605778,
310
+ "num_tokens": 48737014.0,
311
+ "step": 290
312
+ },
313
+ {
314
+ "entropy": 0.17542087566107512,
315
+ "epoch": 0.2083514193940446,
316
+ "grad_norm": 0.33984375,
317
+ "learning_rate": 2.076388888888889e-05,
318
+ "loss": 0.0332,
319
+ "mean_token_accuracy": 0.9898306861519813,
320
+ "num_tokens": 50411139.0,
321
+ "step": 300
322
+ },
323
+ {
324
+ "entropy": 0.17356558851897716,
325
+ "epoch": 0.21529646670717945,
326
+ "grad_norm": 0.2421875,
327
+ "learning_rate": 2.1458333333333334e-05,
328
+ "loss": 0.0329,
329
+ "mean_token_accuracy": 0.9897275142371654,
330
+ "num_tokens": 52101776.0,
331
+ "step": 310
332
+ },
333
+ {
334
+ "entropy": 0.18331474922597407,
335
+ "epoch": 0.22224151402031425,
336
+ "grad_norm": 0.27734375,
337
+ "learning_rate": 2.2152777777777778e-05,
338
+ "loss": 0.0323,
339
+ "mean_token_accuracy": 0.9898556746542454,
340
+ "num_tokens": 53761706.0,
341
+ "step": 320
342
+ },
343
+ {
344
+ "entropy": 0.16903471825644373,
345
+ "epoch": 0.22918656133344908,
346
+ "grad_norm": 0.2265625,
347
+ "learning_rate": 2.284722222222222e-05,
348
+ "loss": 0.0303,
349
+ "mean_token_accuracy": 0.9905840046703815,
350
+ "num_tokens": 55510137.0,
351
+ "step": 330
352
+ },
353
+ {
354
+ "entropy": 0.1726135764271021,
355
+ "epoch": 0.23613160864658392,
356
+ "grad_norm": 0.2353515625,
357
+ "learning_rate": 2.354166666666667e-05,
358
+ "loss": 0.0308,
359
+ "mean_token_accuracy": 0.9905642926692962,
360
+ "num_tokens": 57209080.0,
361
+ "step": 340
362
+ },
363
+ {
364
+ "entropy": 0.17743745557963847,
365
+ "epoch": 0.24307665595971872,
366
+ "grad_norm": 0.32421875,
367
+ "learning_rate": 2.4236111111111112e-05,
368
+ "loss": 0.0304,
369
+ "mean_token_accuracy": 0.9905851900577545,
370
+ "num_tokens": 58867303.0,
371
+ "step": 350
372
+ },
373
+ {
374
+ "entropy": 0.18280480420216919,
375
+ "epoch": 0.25002170327285356,
376
+ "grad_norm": 0.359375,
377
+ "learning_rate": 2.4930555555555556e-05,
378
+ "loss": 0.0315,
379
+ "mean_token_accuracy": 0.9901946425437927,
380
+ "num_tokens": 60510859.0,
381
+ "step": 360
382
+ },
383
+ {
384
+ "entropy": 0.1761154976673424,
385
+ "epoch": 0.2569667505859884,
386
+ "grad_norm": 0.5,
387
+ "learning_rate": 2.5625e-05,
388
+ "loss": 0.0298,
389
+ "mean_token_accuracy": 0.9909366421401501,
390
+ "num_tokens": 62171999.0,
391
+ "step": 370
392
+ },
393
+ {
394
+ "entropy": 0.1671930664218962,
395
+ "epoch": 0.26391179789912317,
396
+ "grad_norm": 0.31640625,
397
+ "learning_rate": 2.6319444444444443e-05,
398
+ "loss": 0.0284,
399
+ "mean_token_accuracy": 0.9912083707749844,
400
+ "num_tokens": 63906245.0,
401
+ "step": 380
402
+ },
403
+ {
404
+ "entropy": 0.1754794458858669,
405
+ "epoch": 0.270856845212258,
406
+ "grad_norm": 0.240234375,
407
+ "learning_rate": 2.7013888888888887e-05,
408
+ "loss": 0.0303,
409
+ "mean_token_accuracy": 0.9904192194342614,
410
+ "num_tokens": 65589663.0,
411
+ "step": 390
412
+ },
413
+ {
414
+ "entropy": 0.1771222472190857,
415
+ "epoch": 0.27780189252539284,
416
+ "grad_norm": 0.2470703125,
417
+ "learning_rate": 2.7708333333333337e-05,
418
+ "loss": 0.0299,
419
+ "mean_token_accuracy": 0.9907479107379913,
420
+ "num_tokens": 67241611.0,
421
+ "step": 400
422
+ },
423
+ {
424
+ "entropy": 0.17829248458147048,
425
+ "epoch": 0.28474693983852767,
426
+ "grad_norm": 0.357421875,
427
+ "learning_rate": 2.840277777777778e-05,
428
+ "loss": 0.0297,
429
+ "mean_token_accuracy": 0.9906024843454361,
430
+ "num_tokens": 68889437.0,
431
+ "step": 410
432
+ },
433
+ {
434
+ "entropy": 0.17638491541147233,
435
+ "epoch": 0.29169198715166245,
436
+ "grad_norm": 0.1943359375,
437
+ "learning_rate": 2.9097222222222224e-05,
438
+ "loss": 0.0284,
439
+ "mean_token_accuracy": 0.9913305290043354,
440
+ "num_tokens": 70618362.0,
441
+ "step": 420
442
+ },
443
+ {
444
+ "entropy": 0.18182099591940643,
445
+ "epoch": 0.2986370344647973,
446
+ "grad_norm": 0.40234375,
447
+ "learning_rate": 2.9791666666666668e-05,
448
+ "loss": 0.0285,
449
+ "mean_token_accuracy": 0.9912928201258182,
450
+ "num_tokens": 72313577.0,
451
+ "step": 430
452
+ },
453
+ {
454
+ "entropy": 0.18310831850394607,
455
+ "epoch": 0.3055820817779321,
456
+ "grad_norm": 0.2265625,
457
+ "learning_rate": 2.9999760059577934e-05,
458
+ "loss": 0.029,
459
+ "mean_token_accuracy": 0.9910079762339592,
460
+ "num_tokens": 73988194.0,
461
+ "step": 440
462
+ },
463
+ {
464
+ "entropy": 0.17772579193115234,
465
+ "epoch": 0.31252712909106695,
466
+ "grad_norm": 0.2470703125,
467
+ "learning_rate": 2.9998584859663212e-05,
468
+ "loss": 0.0284,
469
+ "mean_token_accuracy": 0.9910575695335865,
470
+ "num_tokens": 75663076.0,
471
+ "step": 450
472
+ },
473
+ {
474
+ "entropy": 0.17988401055335998,
475
+ "epoch": 0.3194721764042018,
476
+ "grad_norm": 0.259765625,
477
+ "learning_rate": 2.9996430406198636e-05,
478
+ "loss": 0.028,
479
+ "mean_token_accuracy": 0.9914611585438251,
480
+ "num_tokens": 77335287.0,
481
+ "step": 460
482
+ },
483
+ {
484
+ "entropy": 0.173457539267838,
485
+ "epoch": 0.32641722371733656,
486
+ "grad_norm": 0.33984375,
487
+ "learning_rate": 2.9993296839847897e-05,
488
+ "loss": 0.0287,
489
+ "mean_token_accuracy": 0.9910615682601929,
490
+ "num_tokens": 79052668.0,
491
+ "step": 470
492
+ },
493
+ {
494
+ "entropy": 0.16980679212138056,
495
+ "epoch": 0.3333622710304714,
496
+ "grad_norm": 0.234375,
497
+ "learning_rate": 2.9989184365200695e-05,
498
+ "loss": 0.0266,
499
+ "mean_token_accuracy": 0.9917186461389065,
500
+ "num_tokens": 80733082.0,
501
+ "step": 480
502
+ },
503
+ {
504
+ "entropy": 0.16687583839520811,
505
+ "epoch": 0.3403073183436062,
506
+ "grad_norm": 0.197265625,
507
+ "learning_rate": 2.998409325075938e-05,
508
+ "loss": 0.0274,
509
+ "mean_token_accuracy": 0.9914394572377205,
510
+ "num_tokens": 82456201.0,
511
+ "step": 490
512
+ },
513
+ {
514
+ "entropy": 0.16694002971053123,
515
+ "epoch": 0.34725236565674106,
516
+ "grad_norm": 0.193359375,
517
+ "learning_rate": 2.997802382892144e-05,
518
+ "loss": 0.0271,
519
+ "mean_token_accuracy": 0.9916702680289745,
520
+ "num_tokens": 84174465.0,
521
+ "step": 500
522
+ },
523
+ {
524
+ "epoch": 0.34725236565674106,
525
+ "eval_entropy": 0.2104673289826938,
526
+ "eval_loss": 0.03412040323019028,
527
+ "eval_mean_token_accuracy": 0.9906957189688522,
528
+ "eval_num_tokens": 84174465.0,
529
+ "eval_runtime": 81.2363,
530
+ "eval_samples_per_second": 122.605,
531
+ "eval_steps_per_second": 7.669,
532
+ "step": 500
533
+ },
534
+ {
535
+ "entropy": 0.16945981308817865,
536
+ "epoch": 0.35419741296987584,
537
+ "grad_norm": 0.369140625,
538
+ "learning_rate": 2.997097649595779e-05,
539
+ "loss": 0.0281,
540
+ "mean_token_accuracy": 0.9912286765873433,
541
+ "num_tokens": 85878591.0,
542
+ "step": 510
543
+ },
544
+ {
545
+ "entropy": 0.16488271821290254,
546
+ "epoch": 0.36114246028301067,
547
+ "grad_norm": 0.1962890625,
548
+ "learning_rate": 2.9962951711986897e-05,
549
+ "loss": 0.0277,
550
+ "mean_token_accuracy": 0.9914339393377304,
551
+ "num_tokens": 87573271.0,
552
+ "step": 520
553
+ },
554
+ {
555
+ "entropy": 0.17190398359671236,
556
+ "epoch": 0.3680875075961455,
557
+ "grad_norm": 0.32421875,
558
+ "learning_rate": 2.9953950000944724e-05,
559
+ "loss": 0.0291,
560
+ "mean_token_accuracy": 0.9910320229828358,
561
+ "num_tokens": 89226804.0,
562
+ "step": 530
563
+ },
564
+ {
565
+ "entropy": 0.16401167614385487,
566
+ "epoch": 0.37503255490928034,
567
+ "grad_norm": 0.2216796875,
568
+ "learning_rate": 2.994397195055056e-05,
569
+ "loss": 0.0268,
570
+ "mean_token_accuracy": 0.9915202751755714,
571
+ "num_tokens": 90931627.0,
572
+ "step": 540
573
+ },
574
+ {
575
+ "entropy": 0.1601521703414619,
576
+ "epoch": 0.3819776022224151,
577
+ "grad_norm": 0.1806640625,
578
+ "learning_rate": 2.9933018212268607e-05,
579
+ "loss": 0.0256,
580
+ "mean_token_accuracy": 0.9919740296900272,
581
+ "num_tokens": 92669023.0,
582
+ "step": 550
583
+ },
584
+ {
585
+ "entropy": 0.1690834665670991,
586
+ "epoch": 0.38892264953554995,
587
+ "grad_norm": 0.1806640625,
588
+ "learning_rate": 2.9921089501265493e-05,
589
+ "loss": 0.0272,
590
+ "mean_token_accuracy": 0.9918020509183407,
591
+ "num_tokens": 94351832.0,
592
+ "step": 560
593
+ },
594
+ {
595
+ "entropy": 0.16791587956249715,
596
+ "epoch": 0.3958676968486848,
597
+ "grad_norm": 0.177734375,
598
+ "learning_rate": 2.990818659636352e-05,
599
+ "loss": 0.0269,
600
+ "mean_token_accuracy": 0.9915785849094391,
601
+ "num_tokens": 96026408.0,
602
+ "step": 570
603
+ },
604
+ {
605
+ "entropy": 0.1673963068984449,
606
+ "epoch": 0.4028127441618196,
607
+ "grad_norm": 0.1591796875,
608
+ "learning_rate": 2.989431033998986e-05,
609
+ "loss": 0.0263,
610
+ "mean_token_accuracy": 0.9915929511189461,
611
+ "num_tokens": 97667765.0,
612
+ "step": 580
613
+ },
614
+ {
615
+ "entropy": 0.1658677328377962,
616
+ "epoch": 0.40975779147495445,
617
+ "grad_norm": 0.25390625,
618
+ "learning_rate": 2.987946163812155e-05,
619
+ "loss": 0.0263,
620
+ "mean_token_accuracy": 0.9918196611106396,
621
+ "num_tokens": 99392767.0,
622
+ "step": 590
623
+ },
624
+ {
625
+ "entropy": 0.16773838149383663,
626
+ "epoch": 0.4167028387880892,
627
+ "grad_norm": 1.078125,
628
+ "learning_rate": 2.9863641460226304e-05,
629
+ "loss": 0.0269,
630
+ "mean_token_accuracy": 0.9917811565101147,
631
+ "num_tokens": 101073637.0,
632
+ "step": 600
633
+ },
634
+ {
635
+ "entropy": 0.1649629818275571,
636
+ "epoch": 0.42364788610122406,
637
+ "grad_norm": 0.263671875,
638
+ "learning_rate": 2.984685083919928e-05,
639
+ "loss": 0.0274,
640
+ "mean_token_accuracy": 0.9912688471376896,
641
+ "num_tokens": 102740766.0,
642
+ "step": 610
643
+ },
644
+ {
645
+ "entropy": 0.15838869903236627,
646
+ "epoch": 0.4305929334143589,
647
+ "grad_norm": 0.27734375,
648
+ "learning_rate": 2.9829090871295564e-05,
649
+ "loss": 0.0251,
650
+ "mean_token_accuracy": 0.9922874167561531,
651
+ "num_tokens": 104488780.0,
652
+ "step": 620
653
+ },
654
+ {
655
+ "entropy": 0.17186203664168714,
656
+ "epoch": 0.4375379807274937,
657
+ "grad_norm": 0.236328125,
658
+ "learning_rate": 2.981036271605867e-05,
659
+ "loss": 0.0263,
660
+ "mean_token_accuracy": 0.9918930843472481,
661
+ "num_tokens": 106195449.0,
662
+ "step": 630
663
+ },
664
+ {
665
+ "entropy": 0.1648147610016167,
666
+ "epoch": 0.4444830280406285,
667
+ "grad_norm": 0.20703125,
668
+ "learning_rate": 2.979066759624479e-05,
669
+ "loss": 0.0252,
670
+ "mean_token_accuracy": 0.992139758169651,
671
+ "num_tokens": 107902411.0,
672
+ "step": 640
673
+ },
674
+ {
675
+ "entropy": 0.16121056992560626,
676
+ "epoch": 0.45142807535376334,
677
+ "grad_norm": 0.2392578125,
678
+ "learning_rate": 2.9770006797742963e-05,
679
+ "loss": 0.025,
680
+ "mean_token_accuracy": 0.9919275738298893,
681
+ "num_tokens": 109608103.0,
682
+ "step": 650
683
+ },
684
+ {
685
+ "entropy": 0.17468737829476594,
686
+ "epoch": 0.45837312266689817,
687
+ "grad_norm": 0.400390625,
688
+ "learning_rate": 2.974838166949114e-05,
689
+ "loss": 0.0272,
690
+ "mean_token_accuracy": 0.9917291469871998,
691
+ "num_tokens": 111293326.0,
692
+ "step": 660
693
+ },
694
+ {
695
+ "entropy": 0.1678467271849513,
696
+ "epoch": 0.465318169980033,
697
+ "grad_norm": 0.21484375,
698
+ "learning_rate": 2.9725793623388097e-05,
699
+ "loss": 0.0255,
700
+ "mean_token_accuracy": 0.9922867320477963,
701
+ "num_tokens": 113010182.0,
702
+ "step": 670
703
+ },
704
+ {
705
+ "entropy": 0.16656696163117884,
706
+ "epoch": 0.47226321729316784,
707
+ "grad_norm": 0.359375,
708
+ "learning_rate": 2.9702244134201263e-05,
709
+ "loss": 0.0258,
710
+ "mean_token_accuracy": 0.9920145235955715,
711
+ "num_tokens": 114702373.0,
712
+ "step": 680
713
+ },
714
+ {
715
+ "entropy": 0.1741402018815279,
716
+ "epoch": 0.4792082646063026,
717
+ "grad_norm": 0.181640625,
718
+ "learning_rate": 2.9677734739470407e-05,
719
+ "loss": 0.0282,
720
+ "mean_token_accuracy": 0.9915049582719803,
721
+ "num_tokens": 116371688.0,
722
+ "step": 690
723
+ },
724
+ {
725
+ "entropy": 0.17028953870758415,
726
+ "epoch": 0.48615331191943745,
727
+ "grad_norm": 0.1845703125,
728
+ "learning_rate": 2.9652267039407297e-05,
729
+ "loss": 0.0273,
730
+ "mean_token_accuracy": 0.9915660627186298,
731
+ "num_tokens": 118057500.0,
732
+ "step": 700
733
+ },
734
+ {
735
+ "entropy": 0.17608886091038584,
736
+ "epoch": 0.4930983592325723,
737
+ "grad_norm": 0.1962890625,
738
+ "learning_rate": 2.962584269679117e-05,
739
+ "loss": 0.0274,
740
+ "mean_token_accuracy": 0.9914011798799038,
741
+ "num_tokens": 119691073.0,
742
+ "step": 710
743
+ },
744
+ {
745
+ "entropy": 0.16367722414433955,
746
+ "epoch": 0.5000434065457071,
747
+ "grad_norm": 0.330078125,
748
+ "learning_rate": 2.959846343686022e-05,
749
+ "loss": 0.0251,
750
+ "mean_token_accuracy": 0.9922043934464455,
751
+ "num_tokens": 121405698.0,
752
+ "step": 720
753
+ },
754
+ {
755
+ "entropy": 0.16646005054935814,
756
+ "epoch": 0.506988453858842,
757
+ "grad_norm": 0.2734375,
758
+ "learning_rate": 2.957013104719892e-05,
759
+ "loss": 0.0257,
760
+ "mean_token_accuracy": 0.9920815974473953,
761
+ "num_tokens": 123140318.0,
762
+ "step": 730
763
+ },
764
+ {
765
+ "entropy": 0.16303647235035895,
766
+ "epoch": 0.5139335011719768,
767
+ "grad_norm": 0.2255859375,
768
+ "learning_rate": 2.9540847377621337e-05,
769
+ "loss": 0.0264,
770
+ "mean_token_accuracy": 0.991966935992241,
771
+ "num_tokens": 124857523.0,
772
+ "step": 740
773
+ },
774
+ {
775
+ "entropy": 0.16988610094413162,
776
+ "epoch": 0.5208785484851115,
777
+ "grad_norm": 0.2197265625,
778
+ "learning_rate": 2.9510614340050325e-05,
779
+ "loss": 0.0256,
780
+ "mean_token_accuracy": 0.9918926700949668,
781
+ "num_tokens": 126537184.0,
782
+ "step": 750
783
+ },
784
+ {
785
+ "epoch": 0.5208785484851115,
786
+ "eval_entropy": 0.2028083825546704,
787
+ "eval_loss": 0.032208461314439774,
788
+ "eval_mean_token_accuracy": 0.99125398544783,
789
+ "eval_num_tokens": 126537184.0,
790
+ "eval_runtime": 81.5469,
791
+ "eval_samples_per_second": 122.138,
792
+ "eval_steps_per_second": 7.64,
793
+ "step": 750
794
+ },
795
+ {
796
+ "entropy": 0.1591445118188858,
797
+ "epoch": 0.5278235957982463,
798
+ "grad_norm": 0.2109375,
799
+ "learning_rate": 2.9479433908392747e-05,
800
+ "loss": 0.025,
801
+ "mean_token_accuracy": 0.9921697616577149,
802
+ "num_tokens": 128251701.0,
803
+ "step": 760
804
+ },
805
+ {
806
+ "entropy": 0.16354208169505,
807
+ "epoch": 0.5347686431113812,
808
+ "grad_norm": 0.18359375,
809
+ "learning_rate": 2.9447308118410544e-05,
810
+ "loss": 0.0251,
811
+ "mean_token_accuracy": 0.9920223578810692,
812
+ "num_tokens": 129913493.0,
813
+ "step": 770
814
+ },
815
+ {
816
+ "entropy": 0.16334566865116357,
817
+ "epoch": 0.541713690424516,
818
+ "grad_norm": 0.2177734375,
819
+ "learning_rate": 2.9414239067587866e-05,
820
+ "loss": 0.0256,
821
+ "mean_token_accuracy": 0.992166705429554,
822
+ "num_tokens": 131640183.0,
823
+ "step": 780
824
+ },
825
+ {
826
+ "entropy": 0.16315491450950503,
827
+ "epoch": 0.5486587377376508,
828
+ "grad_norm": 0.16796875,
829
+ "learning_rate": 2.938022891499409e-05,
830
+ "loss": 0.0257,
831
+ "mean_token_accuracy": 0.9920231319963932,
832
+ "num_tokens": 133328365.0,
833
+ "step": 790
834
+ },
835
+ {
836
+ "entropy": 0.1643814197741449,
837
+ "epoch": 0.5556037850507857,
838
+ "grad_norm": 0.26171875,
839
+ "learning_rate": 2.9345279881142887e-05,
840
+ "loss": 0.0257,
841
+ "mean_token_accuracy": 0.991948215663433,
842
+ "num_tokens": 134988156.0,
843
+ "step": 800
844
+ },
845
+ {
846
+ "entropy": 0.160524242464453,
847
+ "epoch": 0.5625488323639205,
848
+ "grad_norm": 0.2490234375,
849
+ "learning_rate": 2.930939424784723e-05,
850
+ "loss": 0.0243,
851
+ "mean_token_accuracy": 0.9926339693367481,
852
+ "num_tokens": 136738001.0,
853
+ "step": 810
854
+ },
855
+ {
856
+ "entropy": 0.1635106342844665,
857
+ "epoch": 0.5694938796770553,
858
+ "grad_norm": 0.279296875,
859
+ "learning_rate": 2.9272574358070406e-05,
860
+ "loss": 0.0263,
861
+ "mean_token_accuracy": 0.991894131153822,
862
+ "num_tokens": 138411117.0,
863
+ "step": 820
864
+ },
865
+ {
866
+ "entropy": 0.16258806074038148,
867
+ "epoch": 0.5764389269901902,
868
+ "grad_norm": 0.197265625,
869
+ "learning_rate": 2.923482261577307e-05,
870
+ "loss": 0.0265,
871
+ "mean_token_accuracy": 0.9918580889701843,
872
+ "num_tokens": 140144400.0,
873
+ "step": 830
874
+ },
875
+ {
876
+ "entropy": 0.17904180409386755,
877
+ "epoch": 0.5833839743033249,
878
+ "grad_norm": 0.181640625,
879
+ "learning_rate": 2.9196141485756267e-05,
880
+ "loss": 0.0283,
881
+ "mean_token_accuracy": 0.9913281343877316,
882
+ "num_tokens": 141787622.0,
883
+ "step": 840
884
+ },
885
+ {
886
+ "entropy": 0.1736303588375449,
887
+ "epoch": 0.5903290216164597,
888
+ "grad_norm": 0.2431640625,
889
+ "learning_rate": 2.915653349350051e-05,
890
+ "loss": 0.0262,
891
+ "mean_token_accuracy": 0.9919952727854252,
892
+ "num_tokens": 143416525.0,
893
+ "step": 850
894
+ },
895
+ {
896
+ "entropy": 0.17081214878708123,
897
+ "epoch": 0.5972740689295946,
898
+ "grad_norm": 0.2392578125,
899
+ "learning_rate": 2.9116001225000912e-05,
900
+ "loss": 0.0265,
901
+ "mean_token_accuracy": 0.9918745383620262,
902
+ "num_tokens": 145100905.0,
903
+ "step": 860
904
+ },
905
+ {
906
+ "entropy": 0.17302428921684623,
907
+ "epoch": 0.6042191162427294,
908
+ "grad_norm": 0.1875,
909
+ "learning_rate": 2.9074547326598316e-05,
910
+ "loss": 0.0259,
911
+ "mean_token_accuracy": 0.991954755783081,
912
+ "num_tokens": 146789995.0,
913
+ "step": 870
914
+ },
915
+ {
916
+ "entropy": 0.16971431467682124,
917
+ "epoch": 0.6111641635558642,
918
+ "grad_norm": 0.1611328125,
919
+ "learning_rate": 2.9032174504806546e-05,
920
+ "loss": 0.0254,
921
+ "mean_token_accuracy": 0.991959635913372,
922
+ "num_tokens": 148472897.0,
923
+ "step": 880
924
+ },
925
+ {
926
+ "entropy": 0.17342975698411464,
927
+ "epoch": 0.6181092108689991,
928
+ "grad_norm": 0.328125,
929
+ "learning_rate": 2.8988885526135672e-05,
930
+ "loss": 0.0259,
931
+ "mean_token_accuracy": 0.9921123184263706,
932
+ "num_tokens": 150147383.0,
933
+ "step": 890
934
+ },
935
+ {
936
+ "entropy": 0.16509362338110806,
937
+ "epoch": 0.6250542581821339,
938
+ "grad_norm": 0.263671875,
939
+ "learning_rate": 2.894468321691141e-05,
940
+ "loss": 0.0247,
941
+ "mean_token_accuracy": 0.9921708039939403,
942
+ "num_tokens": 151859058.0,
943
+ "step": 900
944
+ },
945
+ {
946
+ "entropy": 0.17183032808825374,
947
+ "epoch": 0.6319993054952687,
948
+ "grad_norm": 0.185546875,
949
+ "learning_rate": 2.889957046309058e-05,
950
+ "loss": 0.0259,
951
+ "mean_token_accuracy": 0.9919849313795567,
952
+ "num_tokens": 153511113.0,
953
+ "step": 910
954
+ },
955
+ {
956
+ "entropy": 0.1668629450723529,
957
+ "epoch": 0.6389443528084036,
958
+ "grad_norm": 0.1806640625,
959
+ "learning_rate": 2.8853550210072676e-05,
960
+ "loss": 0.0242,
961
+ "mean_token_accuracy": 0.9923402860760688,
962
+ "num_tokens": 155209381.0,
963
+ "step": 920
964
+ },
965
+ {
966
+ "entropy": 0.165106045268476,
967
+ "epoch": 0.6458894001215383,
968
+ "grad_norm": 0.216796875,
969
+ "learning_rate": 2.8806625462507573e-05,
970
+ "loss": 0.0244,
971
+ "mean_token_accuracy": 0.9924936473369599,
972
+ "num_tokens": 156930591.0,
973
+ "step": 930
974
+ },
975
+ {
976
+ "entropy": 0.1726138608530164,
977
+ "epoch": 0.6528344474346731,
978
+ "grad_norm": 0.1689453125,
979
+ "learning_rate": 2.8758799284099357e-05,
980
+ "loss": 0.0255,
981
+ "mean_token_accuracy": 0.9920349515974521,
982
+ "num_tokens": 158611580.0,
983
+ "step": 940
984
+ },
985
+ {
986
+ "entropy": 0.1682711481116712,
987
+ "epoch": 0.659779494747808,
988
+ "grad_norm": 0.201171875,
989
+ "learning_rate": 2.8710074797406277e-05,
990
+ "loss": 0.0242,
991
+ "mean_token_accuracy": 0.9924891531467438,
992
+ "num_tokens": 160303915.0,
993
+ "step": 950
994
+ },
995
+ {
996
+ "entropy": 0.1658643066883087,
997
+ "epoch": 0.6667245420609428,
998
+ "grad_norm": 0.26171875,
999
+ "learning_rate": 2.8660455183636893e-05,
1000
+ "loss": 0.0255,
1001
+ "mean_token_accuracy": 0.9920015670359135,
1002
+ "num_tokens": 161990176.0,
1003
+ "step": 960
1004
+ },
1005
+ {
1006
+ "entropy": 0.16630794005468488,
1007
+ "epoch": 0.6736695893740776,
1008
+ "grad_norm": 0.1748046875,
1009
+ "learning_rate": 2.860994368244238e-05,
1010
+ "loss": 0.0246,
1011
+ "mean_token_accuracy": 0.9922889664769172,
1012
+ "num_tokens": 163672500.0,
1013
+ "step": 970
1014
+ },
1015
+ {
1016
+ "entropy": 0.17136559821665287,
1017
+ "epoch": 0.6806146366872124,
1018
+ "grad_norm": 0.162109375,
1019
+ "learning_rate": 2.8558543591704984e-05,
1020
+ "loss": 0.0263,
1021
+ "mean_token_accuracy": 0.9918027445673943,
1022
+ "num_tokens": 165338616.0,
1023
+ "step": 980
1024
+ },
1025
+ {
1026
+ "entropy": 0.16459389431402088,
1027
+ "epoch": 0.6875596840003473,
1028
+ "grad_norm": 0.2236328125,
1029
+ "learning_rate": 2.8506258267322738e-05,
1030
+ "loss": 0.0251,
1031
+ "mean_token_accuracy": 0.9921816885471344,
1032
+ "num_tokens": 167056620.0,
1033
+ "step": 990
1034
+ },
1035
+ {
1036
+ "entropy": 0.1651211366057396,
1037
+ "epoch": 0.6945047313134821,
1038
+ "grad_norm": 0.177734375,
1039
+ "learning_rate": 2.8453091122990325e-05,
1040
+ "loss": 0.0242,
1041
+ "mean_token_accuracy": 0.9924872562289238,
1042
+ "num_tokens": 168766550.0,
1043
+ "step": 1000
1044
+ },
1045
+ {
1046
+ "epoch": 0.6945047313134821,
1047
+ "eval_entropy": 0.20344850389857927,
1048
+ "eval_loss": 0.0312611423432827,
1049
+ "eval_mean_token_accuracy": 0.9914905528386945,
1050
+ "eval_num_tokens": 168766550.0,
1051
+ "eval_runtime": 82.313,
1052
+ "eval_samples_per_second": 121.002,
1053
+ "eval_steps_per_second": 7.569,
1054
+ "step": 1000
1055
+ }
1056
+ ],
1057
+ "logging_steps": 10,
1058
+ "max_steps": 4320,
1059
+ "num_input_tokens_seen": 0,
1060
+ "num_train_epochs": 3,
1061
+ "save_steps": 500,
1062
+ "stateful_callbacks": {
1063
+ "TrainerControl": {
1064
+ "args": {
1065
+ "should_epoch_stop": false,
1066
+ "should_evaluate": false,
1067
+ "should_log": false,
1068
+ "should_save": true,
1069
+ "should_training_stop": false
1070
+ },
1071
+ "attributes": {}
1072
+ }
1073
+ },
1074
+ "total_flos": 3.5961741992379023e+18,
1075
+ "train_batch_size": 4,
1076
+ "trial_name": null,
1077
+ "trial_params": null
1078
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8363ddab34be1be669fb9c3b0e477751c99c8dd6938033052e7430da744c5331
3
+ size 6289
checkpoint-1000/video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_sample_frames": true,
12
+ "fps": 2,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "input_data_format": null,
24
+ "max_frames": 768,
25
+ "merge_size": 2,
26
+ "min_frames": 4,
27
+ "num_frames": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_metadata": false,
34
+ "size": {
35
+ "longest_edge": 25165824,
36
+ "shortest_edge": 4096
37
+ },
38
+ "temporal_patch_size": 2,
39
+ "video_metadata": null,
40
+ "video_processor_type": "Qwen3VLVideoProcessor"
41
+ }
checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-1500/chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
checkpoint-1500/config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "eos_token_id": 151645,
7
+ "image_token_id": 151655,
8
+ "model_type": "qwen3_vl",
9
+ "pad_token_id": 151643,
10
+ "text_config": {
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 151643,
14
+ "dtype": "bfloat16",
15
+ "eos_token_id": 151645,
16
+ "head_dim": 128,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 2048,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 6144,
21
+ "max_position_embeddings": 262144,
22
+ "model_type": "qwen3_vl_text",
23
+ "num_attention_heads": 16,
24
+ "num_hidden_layers": 28,
25
+ "num_key_value_heads": 8,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_scaling": {
28
+ "mrope_interleaved": true,
29
+ "mrope_section": [
30
+ 24,
31
+ 20,
32
+ 20
33
+ ],
34
+ "rope_type": "default"
35
+ },
36
+ "rope_theta": 5000000,
37
+ "tie_word_embeddings": true,
38
+ "use_cache": true,
39
+ "vocab_size": 151936
40
+ },
41
+ "tie_word_embeddings": true,
42
+ "transformers_version": "4.57.0",
43
+ "video_token_id": 151656,
44
+ "vision_config": {
45
+ "deepstack_visual_indexes": [
46
+ 5,
47
+ 11,
48
+ 17
49
+ ],
50
+ "depth": 24,
51
+ "dtype": "bfloat16",
52
+ "hidden_act": "gelu_pytorch_tanh",
53
+ "hidden_size": 1024,
54
+ "in_channels": 3,
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 4096,
57
+ "model_type": "qwen3_vl",
58
+ "num_heads": 16,
59
+ "num_position_embeddings": 2304,
60
+ "out_hidden_size": 2048,
61
+ "patch_size": 16,
62
+ "spatial_merge_size": 2,
63
+ "temporal_patch_size": 2
64
+ },
65
+ "vision_end_token_id": 151653,
66
+ "vision_start_token_id": 151652
67
+ }
checkpoint-1500/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.0"
13
+ }
checkpoint-1500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5530396e0c21e6d9f6af1e8472f7cb21fe08ae240781e73d1775d8edc9cb5200
3
+ size 4255140312
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:951c6c3b2d54e59bc114cb3f21b654e3286993754c6ca960677f78ba4682edd2
3
+ size 8510679398
checkpoint-1500/preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": null,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "input_data_format": null,
25
+ "max_pixels": null,
26
+ "merge_size": 2,
27
+ "min_pixels": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "longest_edge": 16777216,
36
+ "shortest_edge": 65536
37
+ },
38
+ "temporal_patch_size": 2
39
+ }
checkpoint-1500/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:706722fc41436bdb24c4016ab83fbbb62526893d41d4cf49a009fdb7bb4abb5e
3
+ size 16389
checkpoint-1500/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cd3c5bfb3f44d6324570cdf01e259d25d57145e7c5bcf8a356f6f7b56afdf24
3
+ size 16389
checkpoint-1500/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ada3d67296c7e68b4d424298393788e9698d123c0f49e21ffd505f02a1def057
3
+ size 16389
checkpoint-1500/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6abe268928ee44107d52539186bb325f6915b9fe7dafd490b17c58d29bd50783
3
+ size 16389
checkpoint-1500/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdabaa1468bbe8afa177e64e4dd55a0b2c5ac41e68d229b3d790a6cdd8fbf6fb
3
+ size 16389
checkpoint-1500/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20c4cea43dfd62d3a3364a4603fb39a847fae2cd95a7b7d8f43fb21cef815cde
3
+ size 16389
checkpoint-1500/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c22599770b4a3567bdcbd40583c1c29ca81866b4f2fb6dbcf94bafecdef10d2
3
+ size 16389
checkpoint-1500/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f20736861ad603467227a1c08b4d09fd872a184e0c54c650a3ac14d130dfcd
3
+ size 16389
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2090e53456e02d38e003eadec700e1e47336ea9d8442b5e9e6d344c82e3d5dfb
3
+ size 1465
checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-1500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "processor_class": "Qwen3VLProcessor",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,1600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.030296573415398598,
4
+ "best_model_checkpoint": "/workspace/Qwen3_checkpoint/checkpoint-1500",
5
+ "epoch": 1.041670283878809,
6
+ "eval_steps": 250,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.1519597232341767,
14
+ "epoch": 0.00694504731313482,
15
+ "grad_norm": 540.0,
16
+ "learning_rate": 6.249999999999999e-07,
17
+ "loss": 22.6265,
18
+ "mean_token_accuracy": 0.07503694812767207,
19
+ "num_tokens": 1640499.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 1.1664627104997636,
24
+ "epoch": 0.01389009462626964,
25
+ "grad_norm": 466.0,
26
+ "learning_rate": 1.3194444444444444e-06,
27
+ "loss": 22.1144,
28
+ "mean_token_accuracy": 0.08692597970366478,
29
+ "num_tokens": 3288480.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.239394597709179,
34
+ "epoch": 0.020835141939404462,
35
+ "grad_norm": 462.0,
36
+ "learning_rate": 2.0138888888888893e-06,
37
+ "loss": 21.7485,
38
+ "mean_token_accuracy": 0.07649312568828463,
39
+ "num_tokens": 4958023.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.4183061853051186,
44
+ "epoch": 0.02778018925253928,
45
+ "grad_norm": 436.0,
46
+ "learning_rate": 2.7083333333333334e-06,
47
+ "loss": 20.1258,
48
+ "mean_token_accuracy": 0.07965942500159144,
49
+ "num_tokens": 6627704.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.9329519897699357,
54
+ "epoch": 0.034725236565674104,
55
+ "grad_norm": 378.0,
56
+ "learning_rate": 3.402777777777778e-06,
57
+ "loss": 16.576,
58
+ "mean_token_accuracy": 0.07870438289828599,
59
+ "num_tokens": 8340232.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 4.602115747332573,
64
+ "epoch": 0.041670283878808924,
65
+ "grad_norm": 184.0,
66
+ "learning_rate": 4.097222222222222e-06,
67
+ "loss": 12.1758,
68
+ "mean_token_accuracy": 0.07569821006618441,
69
+ "num_tokens": 10025658.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 7.328701210021973,
74
+ "epoch": 0.04861533119194374,
75
+ "grad_norm": 114.5,
76
+ "learning_rate": 4.791666666666667e-06,
77
+ "loss": 9.8369,
78
+ "mean_token_accuracy": 0.08327204268425703,
79
+ "num_tokens": 11741238.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 7.635936880111695,
84
+ "epoch": 0.05556037850507856,
85
+ "grad_norm": 113.5,
86
+ "learning_rate": 5.486111111111111e-06,
87
+ "loss": 8.3162,
88
+ "mean_token_accuracy": 0.0876343248412013,
89
+ "num_tokens": 13404022.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 7.46761246919632,
94
+ "epoch": 0.06250542581821339,
95
+ "grad_norm": 123.5,
96
+ "learning_rate": 6.180555555555556e-06,
97
+ "loss": 7.63,
98
+ "mean_token_accuracy": 0.08659355682320893,
99
+ "num_tokens": 15112991.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 7.1816710293293,
104
+ "epoch": 0.06945047313134821,
105
+ "grad_norm": 148.0,
106
+ "learning_rate": 6.875e-06,
107
+ "loss": 7.0576,
108
+ "mean_token_accuracy": 0.08753767567686736,
109
+ "num_tokens": 16756210.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 7.2137157917022705,
114
+ "epoch": 0.07639552044448303,
115
+ "grad_norm": 169.0,
116
+ "learning_rate": 7.569444444444445e-06,
117
+ "loss": 6.6052,
118
+ "mean_token_accuracy": 0.08361622425727547,
119
+ "num_tokens": 18470889.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 7.177587866783142,
124
+ "epoch": 0.08334056775761785,
125
+ "grad_norm": 156.0,
126
+ "learning_rate": 8.263888888888888e-06,
127
+ "loss": 6.0531,
128
+ "mean_token_accuracy": 0.08918410916812718,
129
+ "num_tokens": 20159140.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 7.2117412209510805,
134
+ "epoch": 0.09028561507075267,
135
+ "grad_norm": 160.0,
136
+ "learning_rate": 8.958333333333334e-06,
137
+ "loss": 5.4473,
138
+ "mean_token_accuracy": 0.08560893354006112,
139
+ "num_tokens": 21857491.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 7.211903780698776,
144
+ "epoch": 0.09723066238388749,
145
+ "grad_norm": 278.0,
146
+ "learning_rate": 9.652777777777777e-06,
147
+ "loss": 4.788,
148
+ "mean_token_accuracy": 0.127999250870198,
149
+ "num_tokens": 23545940.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 7.21578094959259,
154
+ "epoch": 0.1041757096970223,
155
+ "grad_norm": 163.0,
156
+ "learning_rate": 1.0347222222222223e-05,
157
+ "loss": 4.1808,
158
+ "mean_token_accuracy": 0.5043030813336372,
159
+ "num_tokens": 25200650.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 7.225958931446075,
164
+ "epoch": 0.11112075701015713,
165
+ "grad_norm": 146.0,
166
+ "learning_rate": 1.1041666666666668e-05,
167
+ "loss": 3.4839,
168
+ "mean_token_accuracy": 0.8470314003527164,
169
+ "num_tokens": 26879379.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 7.03481342792511,
174
+ "epoch": 0.11806580432329196,
175
+ "grad_norm": 160.0,
176
+ "learning_rate": 1.1736111111111112e-05,
177
+ "loss": 2.6644,
178
+ "mean_token_accuracy": 0.9146032109856606,
179
+ "num_tokens": 28559842.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 6.437038451433182,
184
+ "epoch": 0.12501085163642678,
185
+ "grad_norm": 135.0,
186
+ "learning_rate": 1.2430555555555557e-05,
187
+ "loss": 1.8016,
188
+ "mean_token_accuracy": 0.9310876823961735,
189
+ "num_tokens": 30223208.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 4.774793094396591,
194
+ "epoch": 0.13195589894956158,
195
+ "grad_norm": 72.0,
196
+ "learning_rate": 1.3125e-05,
197
+ "loss": 1.0024,
198
+ "mean_token_accuracy": 0.9337419532239437,
199
+ "num_tokens": 31915078.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 2.502578377723694,
204
+ "epoch": 0.13890094626269642,
205
+ "grad_norm": 18.5,
206
+ "learning_rate": 1.3819444444444444e-05,
207
+ "loss": 0.5206,
208
+ "mean_token_accuracy": 0.9370624087750912,
209
+ "num_tokens": 33627632.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 1.1439476042985917,
214
+ "epoch": 0.14584599357583122,
215
+ "grad_norm": 6.5,
216
+ "learning_rate": 1.451388888888889e-05,
217
+ "loss": 0.314,
218
+ "mean_token_accuracy": 0.9395098507404327,
219
+ "num_tokens": 35335117.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.625144999101758,
224
+ "epoch": 0.15279104088896606,
225
+ "grad_norm": 2.296875,
226
+ "learning_rate": 1.5208333333333333e-05,
227
+ "loss": 0.2057,
228
+ "mean_token_accuracy": 0.9517911829054355,
229
+ "num_tokens": 37011186.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.34807259757071735,
234
+ "epoch": 0.1597360882021009,
235
+ "grad_norm": 0.98828125,
236
+ "learning_rate": 1.590277777777778e-05,
237
+ "loss": 0.1098,
238
+ "mean_token_accuracy": 0.9706214666366577,
239
+ "num_tokens": 38673894.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.22573072295635938,
244
+ "epoch": 0.1666811355152357,
245
+ "grad_norm": 0.71875,
246
+ "learning_rate": 1.6597222222222222e-05,
247
+ "loss": 0.0648,
248
+ "mean_token_accuracy": 0.9812032662332058,
249
+ "num_tokens": 40360862.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.19475232288241387,
254
+ "epoch": 0.17362618282837053,
255
+ "grad_norm": 0.3515625,
256
+ "learning_rate": 1.7291666666666666e-05,
257
+ "loss": 0.0472,
258
+ "mean_token_accuracy": 0.9857565425336361,
259
+ "num_tokens": 42047917.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "epoch": 0.17362618282837053,
264
+ "eval_entropy": 0.22951670004171507,
265
+ "eval_loss": 0.05014730989933014,
266
+ "eval_mean_token_accuracy": 0.985958794625192,
267
+ "eval_num_tokens": 42047917.0,
268
+ "eval_runtime": 81.9972,
269
+ "eval_samples_per_second": 121.468,
270
+ "eval_steps_per_second": 7.598,
271
+ "step": 250
272
+ },
273
+ {
274
+ "entropy": 0.1923621967434883,
275
+ "epoch": 0.18057123014150533,
276
+ "grad_norm": 0.337890625,
277
+ "learning_rate": 1.7986111111111113e-05,
278
+ "loss": 0.0397,
279
+ "mean_token_accuracy": 0.9877771027386189,
280
+ "num_tokens": 43714135.0,
281
+ "step": 260
282
+ },
283
+ {
284
+ "entropy": 0.18295048763975502,
285
+ "epoch": 0.18751627745464017,
286
+ "grad_norm": 0.3046875,
287
+ "learning_rate": 1.8680555555555556e-05,
288
+ "loss": 0.0395,
289
+ "mean_token_accuracy": 0.9877975657582283,
290
+ "num_tokens": 45420354.0,
291
+ "step": 270
292
+ },
293
+ {
294
+ "entropy": 0.18508334569633006,
295
+ "epoch": 0.19446132476777497,
296
+ "grad_norm": 0.29296875,
297
+ "learning_rate": 1.9375e-05,
298
+ "loss": 0.0365,
299
+ "mean_token_accuracy": 0.9889688648283481,
300
+ "num_tokens": 47094373.0,
301
+ "step": 280
302
+ },
303
+ {
304
+ "entropy": 0.17800553049892187,
305
+ "epoch": 0.2014063720809098,
306
+ "grad_norm": 0.2373046875,
307
+ "learning_rate": 2.0069444444444447e-05,
308
+ "loss": 0.0334,
309
+ "mean_token_accuracy": 0.9898182846605778,
310
+ "num_tokens": 48737014.0,
311
+ "step": 290
312
+ },
313
+ {
314
+ "entropy": 0.17542087566107512,
315
+ "epoch": 0.2083514193940446,
316
+ "grad_norm": 0.33984375,
317
+ "learning_rate": 2.076388888888889e-05,
318
+ "loss": 0.0332,
319
+ "mean_token_accuracy": 0.9898306861519813,
320
+ "num_tokens": 50411139.0,
321
+ "step": 300
322
+ },
323
+ {
324
+ "entropy": 0.17356558851897716,
325
+ "epoch": 0.21529646670717945,
326
+ "grad_norm": 0.2421875,
327
+ "learning_rate": 2.1458333333333334e-05,
328
+ "loss": 0.0329,
329
+ "mean_token_accuracy": 0.9897275142371654,
330
+ "num_tokens": 52101776.0,
331
+ "step": 310
332
+ },
333
+ {
334
+ "entropy": 0.18331474922597407,
335
+ "epoch": 0.22224151402031425,
336
+ "grad_norm": 0.27734375,
337
+ "learning_rate": 2.2152777777777778e-05,
338
+ "loss": 0.0323,
339
+ "mean_token_accuracy": 0.9898556746542454,
340
+ "num_tokens": 53761706.0,
341
+ "step": 320
342
+ },
343
+ {
344
+ "entropy": 0.16903471825644373,
345
+ "epoch": 0.22918656133344908,
346
+ "grad_norm": 0.2265625,
347
+ "learning_rate": 2.284722222222222e-05,
348
+ "loss": 0.0303,
349
+ "mean_token_accuracy": 0.9905840046703815,
350
+ "num_tokens": 55510137.0,
351
+ "step": 330
352
+ },
353
+ {
354
+ "entropy": 0.1726135764271021,
355
+ "epoch": 0.23613160864658392,
356
+ "grad_norm": 0.2353515625,
357
+ "learning_rate": 2.354166666666667e-05,
358
+ "loss": 0.0308,
359
+ "mean_token_accuracy": 0.9905642926692962,
360
+ "num_tokens": 57209080.0,
361
+ "step": 340
362
+ },
363
+ {
364
+ "entropy": 0.17743745557963847,
365
+ "epoch": 0.24307665595971872,
366
+ "grad_norm": 0.32421875,
367
+ "learning_rate": 2.4236111111111112e-05,
368
+ "loss": 0.0304,
369
+ "mean_token_accuracy": 0.9905851900577545,
370
+ "num_tokens": 58867303.0,
371
+ "step": 350
372
+ },
373
+ {
374
+ "entropy": 0.18280480420216919,
375
+ "epoch": 0.25002170327285356,
376
+ "grad_norm": 0.359375,
377
+ "learning_rate": 2.4930555555555556e-05,
378
+ "loss": 0.0315,
379
+ "mean_token_accuracy": 0.9901946425437927,
380
+ "num_tokens": 60510859.0,
381
+ "step": 360
382
+ },
383
+ {
384
+ "entropy": 0.1761154976673424,
385
+ "epoch": 0.2569667505859884,
386
+ "grad_norm": 0.5,
387
+ "learning_rate": 2.5625e-05,
388
+ "loss": 0.0298,
389
+ "mean_token_accuracy": 0.9909366421401501,
390
+ "num_tokens": 62171999.0,
391
+ "step": 370
392
+ },
393
+ {
394
+ "entropy": 0.1671930664218962,
395
+ "epoch": 0.26391179789912317,
396
+ "grad_norm": 0.31640625,
397
+ "learning_rate": 2.6319444444444443e-05,
398
+ "loss": 0.0284,
399
+ "mean_token_accuracy": 0.9912083707749844,
400
+ "num_tokens": 63906245.0,
401
+ "step": 380
402
+ },
403
+ {
404
+ "entropy": 0.1754794458858669,
405
+ "epoch": 0.270856845212258,
406
+ "grad_norm": 0.240234375,
407
+ "learning_rate": 2.7013888888888887e-05,
408
+ "loss": 0.0303,
409
+ "mean_token_accuracy": 0.9904192194342614,
410
+ "num_tokens": 65589663.0,
411
+ "step": 390
412
+ },
413
+ {
414
+ "entropy": 0.1771222472190857,
415
+ "epoch": 0.27780189252539284,
416
+ "grad_norm": 0.2470703125,
417
+ "learning_rate": 2.7708333333333337e-05,
418
+ "loss": 0.0299,
419
+ "mean_token_accuracy": 0.9907479107379913,
420
+ "num_tokens": 67241611.0,
421
+ "step": 400
422
+ },
423
+ {
424
+ "entropy": 0.17829248458147048,
425
+ "epoch": 0.28474693983852767,
426
+ "grad_norm": 0.357421875,
427
+ "learning_rate": 2.840277777777778e-05,
428
+ "loss": 0.0297,
429
+ "mean_token_accuracy": 0.9906024843454361,
430
+ "num_tokens": 68889437.0,
431
+ "step": 410
432
+ },
433
+ {
434
+ "entropy": 0.17638491541147233,
435
+ "epoch": 0.29169198715166245,
436
+ "grad_norm": 0.1943359375,
437
+ "learning_rate": 2.9097222222222224e-05,
438
+ "loss": 0.0284,
439
+ "mean_token_accuracy": 0.9913305290043354,
440
+ "num_tokens": 70618362.0,
441
+ "step": 420
442
+ },
443
+ {
444
+ "entropy": 0.18182099591940643,
445
+ "epoch": 0.2986370344647973,
446
+ "grad_norm": 0.40234375,
447
+ "learning_rate": 2.9791666666666668e-05,
448
+ "loss": 0.0285,
449
+ "mean_token_accuracy": 0.9912928201258182,
450
+ "num_tokens": 72313577.0,
451
+ "step": 430
452
+ },
453
+ {
454
+ "entropy": 0.18310831850394607,
455
+ "epoch": 0.3055820817779321,
456
+ "grad_norm": 0.2265625,
457
+ "learning_rate": 2.9999760059577934e-05,
458
+ "loss": 0.029,
459
+ "mean_token_accuracy": 0.9910079762339592,
460
+ "num_tokens": 73988194.0,
461
+ "step": 440
462
+ },
463
+ {
464
+ "entropy": 0.17772579193115234,
465
+ "epoch": 0.31252712909106695,
466
+ "grad_norm": 0.2470703125,
467
+ "learning_rate": 2.9998584859663212e-05,
468
+ "loss": 0.0284,
469
+ "mean_token_accuracy": 0.9910575695335865,
470
+ "num_tokens": 75663076.0,
471
+ "step": 450
472
+ },
473
+ {
474
+ "entropy": 0.17988401055335998,
475
+ "epoch": 0.3194721764042018,
476
+ "grad_norm": 0.259765625,
477
+ "learning_rate": 2.9996430406198636e-05,
478
+ "loss": 0.028,
479
+ "mean_token_accuracy": 0.9914611585438251,
480
+ "num_tokens": 77335287.0,
481
+ "step": 460
482
+ },
483
+ {
484
+ "entropy": 0.173457539267838,
485
+ "epoch": 0.32641722371733656,
486
+ "grad_norm": 0.33984375,
487
+ "learning_rate": 2.9993296839847897e-05,
488
+ "loss": 0.0287,
489
+ "mean_token_accuracy": 0.9910615682601929,
490
+ "num_tokens": 79052668.0,
491
+ "step": 470
492
+ },
493
+ {
494
+ "entropy": 0.16980679212138056,
495
+ "epoch": 0.3333622710304714,
496
+ "grad_norm": 0.234375,
497
+ "learning_rate": 2.9989184365200695e-05,
498
+ "loss": 0.0266,
499
+ "mean_token_accuracy": 0.9917186461389065,
500
+ "num_tokens": 80733082.0,
501
+ "step": 480
502
+ },
503
+ {
504
+ "entropy": 0.16687583839520811,
505
+ "epoch": 0.3403073183436062,
506
+ "grad_norm": 0.197265625,
507
+ "learning_rate": 2.998409325075938e-05,
508
+ "loss": 0.0274,
509
+ "mean_token_accuracy": 0.9914394572377205,
510
+ "num_tokens": 82456201.0,
511
+ "step": 490
512
+ },
513
+ {
514
+ "entropy": 0.16694002971053123,
515
+ "epoch": 0.34725236565674106,
516
+ "grad_norm": 0.193359375,
517
+ "learning_rate": 2.997802382892144e-05,
518
+ "loss": 0.0271,
519
+ "mean_token_accuracy": 0.9916702680289745,
520
+ "num_tokens": 84174465.0,
521
+ "step": 500
522
+ },
523
+ {
524
+ "epoch": 0.34725236565674106,
525
+ "eval_entropy": 0.2104673289826938,
526
+ "eval_loss": 0.03412040323019028,
527
+ "eval_mean_token_accuracy": 0.9906957189688522,
528
+ "eval_num_tokens": 84174465.0,
529
+ "eval_runtime": 81.2363,
530
+ "eval_samples_per_second": 122.605,
531
+ "eval_steps_per_second": 7.669,
532
+ "step": 500
533
+ },
534
+ {
535
+ "entropy": 0.16945981308817865,
536
+ "epoch": 0.35419741296987584,
537
+ "grad_norm": 0.369140625,
538
+ "learning_rate": 2.997097649595779e-05,
539
+ "loss": 0.0281,
540
+ "mean_token_accuracy": 0.9912286765873433,
541
+ "num_tokens": 85878591.0,
542
+ "step": 510
543
+ },
544
+ {
545
+ "entropy": 0.16488271821290254,
546
+ "epoch": 0.36114246028301067,
547
+ "grad_norm": 0.1962890625,
548
+ "learning_rate": 2.9962951711986897e-05,
549
+ "loss": 0.0277,
550
+ "mean_token_accuracy": 0.9914339393377304,
551
+ "num_tokens": 87573271.0,
552
+ "step": 520
553
+ },
554
+ {
555
+ "entropy": 0.17190398359671236,
556
+ "epoch": 0.3680875075961455,
557
+ "grad_norm": 0.32421875,
558
+ "learning_rate": 2.9953950000944724e-05,
559
+ "loss": 0.0291,
560
+ "mean_token_accuracy": 0.9910320229828358,
561
+ "num_tokens": 89226804.0,
562
+ "step": 530
563
+ },
564
+ {
565
+ "entropy": 0.16401167614385487,
566
+ "epoch": 0.37503255490928034,
567
+ "grad_norm": 0.2216796875,
568
+ "learning_rate": 2.994397195055056e-05,
569
+ "loss": 0.0268,
570
+ "mean_token_accuracy": 0.9915202751755714,
571
+ "num_tokens": 90931627.0,
572
+ "step": 540
573
+ },
574
+ {
575
+ "entropy": 0.1601521703414619,
576
+ "epoch": 0.3819776022224151,
577
+ "grad_norm": 0.1806640625,
578
+ "learning_rate": 2.9933018212268607e-05,
579
+ "loss": 0.0256,
580
+ "mean_token_accuracy": 0.9919740296900272,
581
+ "num_tokens": 92669023.0,
582
+ "step": 550
583
+ },
584
+ {
585
+ "entropy": 0.1690834665670991,
586
+ "epoch": 0.38892264953554995,
587
+ "grad_norm": 0.1806640625,
588
+ "learning_rate": 2.9921089501265493e-05,
589
+ "loss": 0.0272,
590
+ "mean_token_accuracy": 0.9918020509183407,
591
+ "num_tokens": 94351832.0,
592
+ "step": 560
593
+ },
594
+ {
595
+ "entropy": 0.16791587956249715,
596
+ "epoch": 0.3958676968486848,
597
+ "grad_norm": 0.177734375,
598
+ "learning_rate": 2.990818659636352e-05,
599
+ "loss": 0.0269,
600
+ "mean_token_accuracy": 0.9915785849094391,
601
+ "num_tokens": 96026408.0,
602
+ "step": 570
603
+ },
604
+ {
605
+ "entropy": 0.1673963068984449,
606
+ "epoch": 0.4028127441618196,
607
+ "grad_norm": 0.1591796875,
608
+ "learning_rate": 2.989431033998986e-05,
609
+ "loss": 0.0263,
610
+ "mean_token_accuracy": 0.9915929511189461,
611
+ "num_tokens": 97667765.0,
612
+ "step": 580
613
+ },
614
+ {
615
+ "entropy": 0.1658677328377962,
616
+ "epoch": 0.40975779147495445,
617
+ "grad_norm": 0.25390625,
618
+ "learning_rate": 2.987946163812155e-05,
619
+ "loss": 0.0263,
620
+ "mean_token_accuracy": 0.9918196611106396,
621
+ "num_tokens": 99392767.0,
622
+ "step": 590
623
+ },
624
+ {
625
+ "entropy": 0.16773838149383663,
626
+ "epoch": 0.4167028387880892,
627
+ "grad_norm": 1.078125,
628
+ "learning_rate": 2.9863641460226304e-05,
629
+ "loss": 0.0269,
630
+ "mean_token_accuracy": 0.9917811565101147,
631
+ "num_tokens": 101073637.0,
632
+ "step": 600
633
+ },
634
+ {
635
+ "entropy": 0.1649629818275571,
636
+ "epoch": 0.42364788610122406,
637
+ "grad_norm": 0.263671875,
638
+ "learning_rate": 2.984685083919928e-05,
639
+ "loss": 0.0274,
640
+ "mean_token_accuracy": 0.9912688471376896,
641
+ "num_tokens": 102740766.0,
642
+ "step": 610
643
+ },
644
+ {
645
+ "entropy": 0.15838869903236627,
646
+ "epoch": 0.4305929334143589,
647
+ "grad_norm": 0.27734375,
648
+ "learning_rate": 2.9829090871295564e-05,
649
+ "loss": 0.0251,
650
+ "mean_token_accuracy": 0.9922874167561531,
651
+ "num_tokens": 104488780.0,
652
+ "step": 620
653
+ },
654
+ {
655
+ "entropy": 0.17186203664168714,
656
+ "epoch": 0.4375379807274937,
657
+ "grad_norm": 0.236328125,
658
+ "learning_rate": 2.981036271605867e-05,
659
+ "loss": 0.0263,
660
+ "mean_token_accuracy": 0.9918930843472481,
661
+ "num_tokens": 106195449.0,
662
+ "step": 630
663
+ },
664
+ {
665
+ "entropy": 0.1648147610016167,
666
+ "epoch": 0.4444830280406285,
667
+ "grad_norm": 0.20703125,
668
+ "learning_rate": 2.979066759624479e-05,
669
+ "loss": 0.0252,
670
+ "mean_token_accuracy": 0.992139758169651,
671
+ "num_tokens": 107902411.0,
672
+ "step": 640
673
+ },
674
+ {
675
+ "entropy": 0.16121056992560626,
676
+ "epoch": 0.45142807535376334,
677
+ "grad_norm": 0.2392578125,
678
+ "learning_rate": 2.9770006797742963e-05,
679
+ "loss": 0.025,
680
+ "mean_token_accuracy": 0.9919275738298893,
681
+ "num_tokens": 109608103.0,
682
+ "step": 650
683
+ },
684
+ {
685
+ "entropy": 0.17468737829476594,
686
+ "epoch": 0.45837312266689817,
687
+ "grad_norm": 0.400390625,
688
+ "learning_rate": 2.974838166949114e-05,
689
+ "loss": 0.0272,
690
+ "mean_token_accuracy": 0.9917291469871998,
691
+ "num_tokens": 111293326.0,
692
+ "step": 660
693
+ },
694
+ {
695
+ "entropy": 0.1678467271849513,
696
+ "epoch": 0.465318169980033,
697
+ "grad_norm": 0.21484375,
698
+ "learning_rate": 2.9725793623388097e-05,
699
+ "loss": 0.0255,
700
+ "mean_token_accuracy": 0.9922867320477963,
701
+ "num_tokens": 113010182.0,
702
+ "step": 670
703
+ },
704
+ {
705
+ "entropy": 0.16656696163117884,
706
+ "epoch": 0.47226321729316784,
707
+ "grad_norm": 0.359375,
708
+ "learning_rate": 2.9702244134201263e-05,
709
+ "loss": 0.0258,
710
+ "mean_token_accuracy": 0.9920145235955715,
711
+ "num_tokens": 114702373.0,
712
+ "step": 680
713
+ },
714
+ {
715
+ "entropy": 0.1741402018815279,
716
+ "epoch": 0.4792082646063026,
717
+ "grad_norm": 0.181640625,
718
+ "learning_rate": 2.9677734739470407e-05,
719
+ "loss": 0.0282,
720
+ "mean_token_accuracy": 0.9915049582719803,
721
+ "num_tokens": 116371688.0,
722
+ "step": 690
723
+ },
724
+ {
725
+ "entropy": 0.17028953870758415,
726
+ "epoch": 0.48615331191943745,
727
+ "grad_norm": 0.1845703125,
728
+ "learning_rate": 2.9652267039407297e-05,
729
+ "loss": 0.0273,
730
+ "mean_token_accuracy": 0.9915660627186298,
731
+ "num_tokens": 118057500.0,
732
+ "step": 700
733
+ },
734
+ {
735
+ "entropy": 0.17608886091038584,
736
+ "epoch": 0.4930983592325723,
737
+ "grad_norm": 0.1962890625,
738
+ "learning_rate": 2.962584269679117e-05,
739
+ "loss": 0.0274,
740
+ "mean_token_accuracy": 0.9914011798799038,
741
+ "num_tokens": 119691073.0,
742
+ "step": 710
743
+ },
744
+ {
745
+ "entropy": 0.16367722414433955,
746
+ "epoch": 0.5000434065457071,
747
+ "grad_norm": 0.330078125,
748
+ "learning_rate": 2.959846343686022e-05,
749
+ "loss": 0.0251,
750
+ "mean_token_accuracy": 0.9922043934464455,
751
+ "num_tokens": 121405698.0,
752
+ "step": 720
753
+ },
754
+ {
755
+ "entropy": 0.16646005054935814,
756
+ "epoch": 0.506988453858842,
757
+ "grad_norm": 0.2734375,
758
+ "learning_rate": 2.957013104719892e-05,
759
+ "loss": 0.0257,
760
+ "mean_token_accuracy": 0.9920815974473953,
761
+ "num_tokens": 123140318.0,
762
+ "step": 730
763
+ },
764
+ {
765
+ "entropy": 0.16303647235035895,
766
+ "epoch": 0.5139335011719768,
767
+ "grad_norm": 0.2255859375,
768
+ "learning_rate": 2.9540847377621337e-05,
769
+ "loss": 0.0264,
770
+ "mean_token_accuracy": 0.991966935992241,
771
+ "num_tokens": 124857523.0,
772
+ "step": 740
773
+ },
774
+ {
775
+ "entropy": 0.16988610094413162,
776
+ "epoch": 0.5208785484851115,
777
+ "grad_norm": 0.2197265625,
778
+ "learning_rate": 2.9510614340050325e-05,
779
+ "loss": 0.0256,
780
+ "mean_token_accuracy": 0.9918926700949668,
781
+ "num_tokens": 126537184.0,
782
+ "step": 750
783
+ },
784
+ {
785
+ "epoch": 0.5208785484851115,
786
+ "eval_entropy": 0.2028083825546704,
787
+ "eval_loss": 0.032208461314439774,
788
+ "eval_mean_token_accuracy": 0.99125398544783,
789
+ "eval_num_tokens": 126537184.0,
790
+ "eval_runtime": 81.5469,
791
+ "eval_samples_per_second": 122.138,
792
+ "eval_steps_per_second": 7.64,
793
+ "step": 750
794
+ },
795
+ {
796
+ "entropy": 0.1591445118188858,
797
+ "epoch": 0.5278235957982463,
798
+ "grad_norm": 0.2109375,
799
+ "learning_rate": 2.9479433908392747e-05,
800
+ "loss": 0.025,
801
+ "mean_token_accuracy": 0.9921697616577149,
802
+ "num_tokens": 128251701.0,
803
+ "step": 760
804
+ },
805
+ {
806
+ "entropy": 0.16354208169505,
807
+ "epoch": 0.5347686431113812,
808
+ "grad_norm": 0.18359375,
809
+ "learning_rate": 2.9447308118410544e-05,
810
+ "loss": 0.0251,
811
+ "mean_token_accuracy": 0.9920223578810692,
812
+ "num_tokens": 129913493.0,
813
+ "step": 770
814
+ },
815
+ {
816
+ "entropy": 0.16334566865116357,
817
+ "epoch": 0.541713690424516,
818
+ "grad_norm": 0.2177734375,
819
+ "learning_rate": 2.9414239067587866e-05,
820
+ "loss": 0.0256,
821
+ "mean_token_accuracy": 0.992166705429554,
822
+ "num_tokens": 131640183.0,
823
+ "step": 780
824
+ },
825
+ {
826
+ "entropy": 0.16315491450950503,
827
+ "epoch": 0.5486587377376508,
828
+ "grad_norm": 0.16796875,
829
+ "learning_rate": 2.938022891499409e-05,
830
+ "loss": 0.0257,
831
+ "mean_token_accuracy": 0.9920231319963932,
832
+ "num_tokens": 133328365.0,
833
+ "step": 790
834
+ },
835
+ {
836
+ "entropy": 0.1643814197741449,
837
+ "epoch": 0.5556037850507857,
838
+ "grad_norm": 0.26171875,
839
+ "learning_rate": 2.9345279881142887e-05,
840
+ "loss": 0.0257,
841
+ "mean_token_accuracy": 0.991948215663433,
842
+ "num_tokens": 134988156.0,
843
+ "step": 800
844
+ },
845
+ {
846
+ "entropy": 0.160524242464453,
847
+ "epoch": 0.5625488323639205,
848
+ "grad_norm": 0.2490234375,
849
+ "learning_rate": 2.930939424784723e-05,
850
+ "loss": 0.0243,
851
+ "mean_token_accuracy": 0.9926339693367481,
852
+ "num_tokens": 136738001.0,
853
+ "step": 810
854
+ },
855
+ {
856
+ "entropy": 0.1635106342844665,
857
+ "epoch": 0.5694938796770553,
858
+ "grad_norm": 0.279296875,
859
+ "learning_rate": 2.9272574358070406e-05,
860
+ "loss": 0.0263,
861
+ "mean_token_accuracy": 0.991894131153822,
862
+ "num_tokens": 138411117.0,
863
+ "step": 820
864
+ },
865
+ {
866
+ "entropy": 0.16258806074038148,
867
+ "epoch": 0.5764389269901902,
868
+ "grad_norm": 0.197265625,
869
+ "learning_rate": 2.923482261577307e-05,
870
+ "loss": 0.0265,
871
+ "mean_token_accuracy": 0.9918580889701843,
872
+ "num_tokens": 140144400.0,
873
+ "step": 830
874
+ },
875
+ {
876
+ "entropy": 0.17904180409386755,
877
+ "epoch": 0.5833839743033249,
878
+ "grad_norm": 0.181640625,
879
+ "learning_rate": 2.9196141485756267e-05,
880
+ "loss": 0.0283,
881
+ "mean_token_accuracy": 0.9913281343877316,
882
+ "num_tokens": 141787622.0,
883
+ "step": 840
884
+ },
885
+ {
886
+ "entropy": 0.1736303588375449,
887
+ "epoch": 0.5903290216164597,
888
+ "grad_norm": 0.2431640625,
889
+ "learning_rate": 2.915653349350051e-05,
890
+ "loss": 0.0262,
891
+ "mean_token_accuracy": 0.9919952727854252,
892
+ "num_tokens": 143416525.0,
893
+ "step": 850
894
+ },
895
+ {
896
+ "entropy": 0.17081214878708123,
897
+ "epoch": 0.5972740689295946,
898
+ "grad_norm": 0.2392578125,
899
+ "learning_rate": 2.9116001225000912e-05,
900
+ "loss": 0.0265,
901
+ "mean_token_accuracy": 0.9918745383620262,
902
+ "num_tokens": 145100905.0,
903
+ "step": 860
904
+ },
905
+ {
906
+ "entropy": 0.17302428921684623,
907
+ "epoch": 0.6042191162427294,
908
+ "grad_norm": 0.1875,
909
+ "learning_rate": 2.9074547326598316e-05,
910
+ "loss": 0.0259,
911
+ "mean_token_accuracy": 0.991954755783081,
912
+ "num_tokens": 146789995.0,
913
+ "step": 870
914
+ },
915
+ {
916
+ "entropy": 0.16971431467682124,
917
+ "epoch": 0.6111641635558642,
918
+ "grad_norm": 0.1611328125,
919
+ "learning_rate": 2.9032174504806546e-05,
920
+ "loss": 0.0254,
921
+ "mean_token_accuracy": 0.991959635913372,
922
+ "num_tokens": 148472897.0,
923
+ "step": 880
924
+ },
925
+ {
926
+ "entropy": 0.17342975698411464,
927
+ "epoch": 0.6181092108689991,
928
+ "grad_norm": 0.328125,
929
+ "learning_rate": 2.8988885526135672e-05,
930
+ "loss": 0.0259,
931
+ "mean_token_accuracy": 0.9921123184263706,
932
+ "num_tokens": 150147383.0,
933
+ "step": 890
934
+ },
935
+ {
936
+ "entropy": 0.16509362338110806,
937
+ "epoch": 0.6250542581821339,
938
+ "grad_norm": 0.263671875,
939
+ "learning_rate": 2.894468321691141e-05,
940
+ "loss": 0.0247,
941
+ "mean_token_accuracy": 0.9921708039939403,
942
+ "num_tokens": 151859058.0,
943
+ "step": 900
944
+ },
945
+ {
946
+ "entropy": 0.17183032808825374,
947
+ "epoch": 0.6319993054952687,
948
+ "grad_norm": 0.185546875,
949
+ "learning_rate": 2.889957046309058e-05,
950
+ "loss": 0.0259,
951
+ "mean_token_accuracy": 0.9919849313795567,
952
+ "num_tokens": 153511113.0,
953
+ "step": 910
954
+ },
955
+ {
956
+ "entropy": 0.1668629450723529,
957
+ "epoch": 0.6389443528084036,
958
+ "grad_norm": 0.1806640625,
959
+ "learning_rate": 2.8853550210072676e-05,
960
+ "loss": 0.0242,
961
+ "mean_token_accuracy": 0.9923402860760688,
962
+ "num_tokens": 155209381.0,
963
+ "step": 920
964
+ },
965
+ {
966
+ "entropy": 0.165106045268476,
967
+ "epoch": 0.6458894001215383,
968
+ "grad_norm": 0.216796875,
969
+ "learning_rate": 2.8806625462507573e-05,
970
+ "loss": 0.0244,
971
+ "mean_token_accuracy": 0.9924936473369599,
972
+ "num_tokens": 156930591.0,
973
+ "step": 930
974
+ },
975
+ {
976
+ "entropy": 0.1726138608530164,
977
+ "epoch": 0.6528344474346731,
978
+ "grad_norm": 0.1689453125,
979
+ "learning_rate": 2.8758799284099357e-05,
980
+ "loss": 0.0255,
981
+ "mean_token_accuracy": 0.9920349515974521,
982
+ "num_tokens": 158611580.0,
983
+ "step": 940
984
+ },
985
+ {
986
+ "entropy": 0.1682711481116712,
987
+ "epoch": 0.659779494747808,
988
+ "grad_norm": 0.201171875,
989
+ "learning_rate": 2.8710074797406277e-05,
990
+ "loss": 0.0242,
991
+ "mean_token_accuracy": 0.9924891531467438,
992
+ "num_tokens": 160303915.0,
993
+ "step": 950
994
+ },
995
+ {
996
+ "entropy": 0.1658643066883087,
997
+ "epoch": 0.6667245420609428,
998
+ "grad_norm": 0.26171875,
999
+ "learning_rate": 2.8660455183636893e-05,
1000
+ "loss": 0.0255,
1001
+ "mean_token_accuracy": 0.9920015670359135,
1002
+ "num_tokens": 161990176.0,
1003
+ "step": 960
1004
+ },
1005
+ {
1006
+ "entropy": 0.16630794005468488,
1007
+ "epoch": 0.6736695893740776,
1008
+ "grad_norm": 0.1748046875,
1009
+ "learning_rate": 2.860994368244238e-05,
1010
+ "loss": 0.0246,
1011
+ "mean_token_accuracy": 0.9922889664769172,
1012
+ "num_tokens": 163672500.0,
1013
+ "step": 970
1014
+ },
1015
+ {
1016
+ "entropy": 0.17136559821665287,
1017
+ "epoch": 0.6806146366872124,
1018
+ "grad_norm": 0.162109375,
1019
+ "learning_rate": 2.8558543591704984e-05,
1020
+ "loss": 0.0263,
1021
+ "mean_token_accuracy": 0.9918027445673943,
1022
+ "num_tokens": 165338616.0,
1023
+ "step": 980
1024
+ },
1025
+ {
1026
+ "entropy": 0.16459389431402088,
1027
+ "epoch": 0.6875596840003473,
1028
+ "grad_norm": 0.2236328125,
1029
+ "learning_rate": 2.8506258267322738e-05,
1030
+ "loss": 0.0251,
1031
+ "mean_token_accuracy": 0.9921816885471344,
1032
+ "num_tokens": 167056620.0,
1033
+ "step": 990
1034
+ },
1035
+ {
1036
+ "entropy": 0.1651211366057396,
1037
+ "epoch": 0.6945047313134821,
1038
+ "grad_norm": 0.177734375,
1039
+ "learning_rate": 2.8453091122990325e-05,
1040
+ "loss": 0.0242,
1041
+ "mean_token_accuracy": 0.9924872562289238,
1042
+ "num_tokens": 168766550.0,
1043
+ "step": 1000
1044
+ },
1045
+ {
1046
+ "epoch": 0.6945047313134821,
1047
+ "eval_entropy": 0.20344850389857927,
1048
+ "eval_loss": 0.0312611423432827,
1049
+ "eval_mean_token_accuracy": 0.9914905528386945,
1050
+ "eval_num_tokens": 168766550.0,
1051
+ "eval_runtime": 82.313,
1052
+ "eval_samples_per_second": 121.002,
1053
+ "eval_steps_per_second": 7.569,
1054
+ "step": 1000
1055
+ },
1056
+ {
1057
+ "entropy": 0.16957395104691386,
1058
+ "epoch": 0.701449778626617,
1059
+ "grad_norm": 0.248046875,
1060
+ "learning_rate": 2.8399045629976228e-05,
1061
+ "loss": 0.0251,
1062
+ "mean_token_accuracy": 0.9921023808419704,
1063
+ "num_tokens": 170430747.0,
1064
+ "step": 1010
1065
+ },
1066
+ {
1067
+ "entropy": 0.16484390227124096,
1068
+ "epoch": 0.7083948259397517,
1069
+ "grad_norm": 0.169921875,
1070
+ "learning_rate": 2.8344125316896066e-05,
1071
+ "loss": 0.0249,
1072
+ "mean_token_accuracy": 0.9923050113022327,
1073
+ "num_tokens": 172148573.0,
1074
+ "step": 1020
1075
+ },
1076
+ {
1077
+ "entropy": 0.16710211224853994,
1078
+ "epoch": 0.7153398732528865,
1079
+ "grad_norm": 0.2138671875,
1080
+ "learning_rate": 2.8288333769482234e-05,
1081
+ "loss": 0.0258,
1082
+ "mean_token_accuracy": 0.9921270661056042,
1083
+ "num_tokens": 173820070.0,
1084
+ "step": 1030
1085
+ },
1086
+ {
1087
+ "entropy": 0.16856574947014452,
1088
+ "epoch": 0.7222849205660213,
1089
+ "grad_norm": 0.26953125,
1090
+ "learning_rate": 2.823167463034978e-05,
1091
+ "loss": 0.0252,
1092
+ "mean_token_accuracy": 0.9920636892318726,
1093
+ "num_tokens": 175479758.0,
1094
+ "step": 1040
1095
+ },
1096
+ {
1097
+ "entropy": 0.16606747973710298,
1098
+ "epoch": 0.7292299678791562,
1099
+ "grad_norm": 0.3125,
1100
+ "learning_rate": 2.817415159875857e-05,
1101
+ "loss": 0.0243,
1102
+ "mean_token_accuracy": 0.992286852747202,
1103
+ "num_tokens": 177172453.0,
1104
+ "step": 1050
1105
+ },
1106
+ {
1107
+ "entropy": 0.1690367877483368,
1108
+ "epoch": 0.736175015192291,
1109
+ "grad_norm": 0.333984375,
1110
+ "learning_rate": 2.8115768430371784e-05,
1111
+ "loss": 0.0256,
1112
+ "mean_token_accuracy": 0.9920091181993484,
1113
+ "num_tokens": 178851088.0,
1114
+ "step": 1060
1115
+ },
1116
+ {
1117
+ "entropy": 0.16904147835448385,
1118
+ "epoch": 0.7431200625054258,
1119
+ "grad_norm": 0.1982421875,
1120
+ "learning_rate": 2.8056528937010707e-05,
1121
+ "loss": 0.0244,
1122
+ "mean_token_accuracy": 0.9924891479313374,
1123
+ "num_tokens": 180522526.0,
1124
+ "step": 1070
1125
+ },
1126
+ {
1127
+ "entropy": 0.16681075962260367,
1128
+ "epoch": 0.7500651098185607,
1129
+ "grad_norm": 0.197265625,
1130
+ "learning_rate": 2.7996436986405826e-05,
1131
+ "loss": 0.0244,
1132
+ "mean_token_accuracy": 0.9924980387091636,
1133
+ "num_tokens": 182232157.0,
1134
+ "step": 1080
1135
+ },
1136
+ {
1137
+ "entropy": 0.1678140834905207,
1138
+ "epoch": 0.7570101571316955,
1139
+ "grad_norm": 0.2734375,
1140
+ "learning_rate": 2.7935496501944358e-05,
1141
+ "loss": 0.0246,
1142
+ "mean_token_accuracy": 0.9924834348261357,
1143
+ "num_tokens": 183935604.0,
1144
+ "step": 1090
1145
+ },
1146
+ {
1147
+ "entropy": 0.16671771639958025,
1148
+ "epoch": 0.7639552044448302,
1149
+ "grad_norm": 0.22265625,
1150
+ "learning_rate": 2.7873711462414057e-05,
1151
+ "loss": 0.0248,
1152
+ "mean_token_accuracy": 0.9925374925136566,
1153
+ "num_tokens": 185597802.0,
1154
+ "step": 1100
1155
+ },
1156
+ {
1157
+ "entropy": 0.16695020655170084,
1158
+ "epoch": 0.7709002517579651,
1159
+ "grad_norm": 0.30078125,
1160
+ "learning_rate": 2.7811085901743443e-05,
1161
+ "loss": 0.025,
1162
+ "mean_token_accuracy": 0.9922063417732716,
1163
+ "num_tokens": 187315982.0,
1164
+ "step": 1110
1165
+ },
1166
+ {
1167
+ "entropy": 0.16623194608837366,
1168
+ "epoch": 0.7778452990710999,
1169
+ "grad_norm": 0.2392578125,
1170
+ "learning_rate": 2.7747623908738438e-05,
1171
+ "loss": 0.0244,
1172
+ "mean_token_accuracy": 0.992550103366375,
1173
+ "num_tokens": 189065194.0,
1174
+ "step": 1120
1175
+ },
1176
+ {
1177
+ "entropy": 0.17834464814513923,
1178
+ "epoch": 0.7847903463842347,
1179
+ "grad_norm": 0.16796875,
1180
+ "learning_rate": 2.7683329626815405e-05,
1181
+ "loss": 0.0255,
1182
+ "mean_token_accuracy": 0.9921450808644294,
1183
+ "num_tokens": 190723953.0,
1184
+ "step": 1130
1185
+ },
1186
+ {
1187
+ "entropy": 0.16976428665220739,
1188
+ "epoch": 0.7917353936973696,
1189
+ "grad_norm": 0.12890625,
1190
+ "learning_rate": 2.761820725373063e-05,
1191
+ "loss": 0.0244,
1192
+ "mean_token_accuracy": 0.9923642635345459,
1193
+ "num_tokens": 192392834.0,
1194
+ "step": 1140
1195
+ },
1196
+ {
1197
+ "entropy": 0.16556692561134695,
1198
+ "epoch": 0.7986804410105044,
1199
+ "grad_norm": 0.146484375,
1200
+ "learning_rate": 2.7552261041306236e-05,
1201
+ "loss": 0.0239,
1202
+ "mean_token_accuracy": 0.9925307177007199,
1203
+ "num_tokens": 194102328.0,
1204
+ "step": 1150
1205
+ },
1206
+ {
1207
+ "entropy": 0.1675541820935905,
1208
+ "epoch": 0.8056254883236392,
1209
+ "grad_norm": 0.2158203125,
1210
+ "learning_rate": 2.7485495295152604e-05,
1211
+ "loss": 0.0245,
1212
+ "mean_token_accuracy": 0.9923206239938736,
1213
+ "num_tokens": 195797798.0,
1214
+ "step": 1160
1215
+ },
1216
+ {
1217
+ "entropy": 0.16941533563658595,
1218
+ "epoch": 0.8125705356367741,
1219
+ "grad_norm": 0.1611328125,
1220
+ "learning_rate": 2.7417914374387244e-05,
1221
+ "loss": 0.0254,
1222
+ "mean_token_accuracy": 0.9921845324337483,
1223
+ "num_tokens": 197478529.0,
1224
+ "step": 1170
1225
+ },
1226
+ {
1227
+ "entropy": 0.1745386002585292,
1228
+ "epoch": 0.8195155829499089,
1229
+ "grad_norm": 0.19921875,
1230
+ "learning_rate": 2.734952269135021e-05,
1231
+ "loss": 0.0264,
1232
+ "mean_token_accuracy": 0.9921305790543556,
1233
+ "num_tokens": 199155644.0,
1234
+ "step": 1180
1235
+ },
1236
+ {
1237
+ "entropy": 0.16794457482174038,
1238
+ "epoch": 0.8264606302630436,
1239
+ "grad_norm": 0.15234375,
1240
+ "learning_rate": 2.7280324711315975e-05,
1241
+ "loss": 0.0252,
1242
+ "mean_token_accuracy": 0.9920954599976539,
1243
+ "num_tokens": 200849027.0,
1244
+ "step": 1190
1245
+ },
1246
+ {
1247
+ "entropy": 0.17055450221523644,
1248
+ "epoch": 0.8334056775761784,
1249
+ "grad_norm": 0.1533203125,
1250
+ "learning_rate": 2.7210324952201956e-05,
1251
+ "loss": 0.0248,
1252
+ "mean_token_accuracy": 0.9922527462244034,
1253
+ "num_tokens": 202543815.0,
1254
+ "step": 1200
1255
+ },
1256
+ {
1257
+ "entropy": 0.1680849887430668,
1258
+ "epoch": 0.8403507248893133,
1259
+ "grad_norm": 0.2578125,
1260
+ "learning_rate": 2.7139527984273498e-05,
1261
+ "loss": 0.0241,
1262
+ "mean_token_accuracy": 0.9922554560005665,
1263
+ "num_tokens": 204205558.0,
1264
+ "step": 1210
1265
+ },
1266
+ {
1267
+ "entropy": 0.1646501302719116,
1268
+ "epoch": 0.8472957722024481,
1269
+ "grad_norm": 0.2470703125,
1270
+ "learning_rate": 2.706793842984549e-05,
1271
+ "loss": 0.0235,
1272
+ "mean_token_accuracy": 0.992755102366209,
1273
+ "num_tokens": 205926396.0,
1274
+ "step": 1220
1275
+ },
1276
+ {
1277
+ "entropy": 0.173645833786577,
1278
+ "epoch": 0.854240819515583,
1279
+ "grad_norm": 0.2734375,
1280
+ "learning_rate": 2.6995560962980587e-05,
1281
+ "loss": 0.0248,
1282
+ "mean_token_accuracy": 0.9920175686478615,
1283
+ "num_tokens": 207587498.0,
1284
+ "step": 1230
1285
+ },
1286
+ {
1287
+ "entropy": 0.1690474884584546,
1288
+ "epoch": 0.8611858668287178,
1289
+ "grad_norm": 0.1494140625,
1290
+ "learning_rate": 2.6922400309184022e-05,
1291
+ "loss": 0.0228,
1292
+ "mean_token_accuracy": 0.9925218060612678,
1293
+ "num_tokens": 209266767.0,
1294
+ "step": 1240
1295
+ },
1296
+ {
1297
+ "entropy": 0.17194054620340465,
1298
+ "epoch": 0.8681309141418526,
1299
+ "grad_norm": 0.34765625,
1300
+ "learning_rate": 2.6848461245095104e-05,
1301
+ "loss": 0.0244,
1302
+ "mean_token_accuracy": 0.9921990059316158,
1303
+ "num_tokens": 210919281.0,
1304
+ "step": 1250
1305
+ },
1306
+ {
1307
+ "epoch": 0.8681309141418526,
1308
+ "eval_entropy": 0.21217725058811243,
1309
+ "eval_loss": 0.030440116301178932,
1310
+ "eval_mean_token_accuracy": 0.9917104203092535,
1311
+ "eval_num_tokens": 210919281.0,
1312
+ "eval_runtime": 81.2048,
1313
+ "eval_samples_per_second": 122.653,
1314
+ "eval_steps_per_second": 7.672,
1315
+ "step": 1250
1316
+ },
1317
+ {
1318
+ "entropy": 0.17561539998278022,
1319
+ "epoch": 0.8750759614549874,
1320
+ "grad_norm": 0.1484375,
1321
+ "learning_rate": 2.6773748598175334e-05,
1322
+ "loss": 0.0238,
1323
+ "mean_token_accuracy": 0.9925415866076946,
1324
+ "num_tokens": 212549619.0,
1325
+ "step": 1260
1326
+ },
1327
+ {
1328
+ "entropy": 0.1754954437725246,
1329
+ "epoch": 0.8820210087681223,
1330
+ "grad_norm": 0.1875,
1331
+ "learning_rate": 2.6698267246393226e-05,
1332
+ "loss": 0.0255,
1333
+ "mean_token_accuracy": 0.9922065414488316,
1334
+ "num_tokens": 214229955.0,
1335
+ "step": 1270
1336
+ },
1337
+ {
1338
+ "entropy": 0.16669268533587456,
1339
+ "epoch": 0.888966056081257,
1340
+ "grad_norm": 0.1875,
1341
+ "learning_rate": 2.6622022117905835e-05,
1342
+ "loss": 0.0243,
1343
+ "mean_token_accuracy": 0.9923124983906746,
1344
+ "num_tokens": 215930293.0,
1345
+ "step": 1280
1346
+ },
1347
+ {
1348
+ "entropy": 0.16962254513055086,
1349
+ "epoch": 0.8959111033943918,
1350
+ "grad_norm": 0.29296875,
1351
+ "learning_rate": 2.6545018190736966e-05,
1352
+ "loss": 0.0245,
1353
+ "mean_token_accuracy": 0.9922974735498429,
1354
+ "num_tokens": 217597783.0,
1355
+ "step": 1290
1356
+ },
1357
+ {
1358
+ "entropy": 0.16414162274450064,
1359
+ "epoch": 0.9028561507075267,
1360
+ "grad_norm": 0.1923828125,
1361
+ "learning_rate": 2.646726049245222e-05,
1362
+ "loss": 0.024,
1363
+ "mean_token_accuracy": 0.9924651831388474,
1364
+ "num_tokens": 219266157.0,
1365
+ "step": 1300
1366
+ },
1367
+ {
1368
+ "entropy": 0.16683494308963417,
1369
+ "epoch": 0.9098011980206615,
1370
+ "grad_norm": 0.197265625,
1371
+ "learning_rate": 2.6388754099830678e-05,
1372
+ "loss": 0.0246,
1373
+ "mean_token_accuracy": 0.992430393397808,
1374
+ "num_tokens": 220961611.0,
1375
+ "step": 1310
1376
+ },
1377
+ {
1378
+ "entropy": 0.16680709961801768,
1379
+ "epoch": 0.9167462453337963,
1380
+ "grad_norm": 0.283203125,
1381
+ "learning_rate": 2.6309504138533493e-05,
1382
+ "loss": 0.0237,
1383
+ "mean_token_accuracy": 0.9925533398985863,
1384
+ "num_tokens": 222675223.0,
1385
+ "step": 1320
1386
+ },
1387
+ {
1388
+ "entropy": 0.1718313039280474,
1389
+ "epoch": 0.9236912926469312,
1390
+ "grad_norm": 0.1787109375,
1391
+ "learning_rate": 2.6229515782769213e-05,
1392
+ "loss": 0.0252,
1393
+ "mean_token_accuracy": 0.9922806732356548,
1394
+ "num_tokens": 224320984.0,
1395
+ "step": 1330
1396
+ },
1397
+ {
1398
+ "entropy": 0.165607544593513,
1399
+ "epoch": 0.930636339960066,
1400
+ "grad_norm": 0.21484375,
1401
+ "learning_rate": 2.614879425495595e-05,
1402
+ "loss": 0.0242,
1403
+ "mean_token_accuracy": 0.9924774184823036,
1404
+ "num_tokens": 226008160.0,
1405
+ "step": 1340
1406
+ },
1407
+ {
1408
+ "entropy": 0.16860781861469148,
1409
+ "epoch": 0.9375813872732008,
1410
+ "grad_norm": 0.341796875,
1411
+ "learning_rate": 2.6067344825380428e-05,
1412
+ "loss": 0.0251,
1413
+ "mean_token_accuracy": 0.9922392442822456,
1414
+ "num_tokens": 227691055.0,
1415
+ "step": 1350
1416
+ },
1417
+ {
1418
+ "entropy": 0.15853081615641712,
1419
+ "epoch": 0.9445264345863357,
1420
+ "grad_norm": 0.19140625,
1421
+ "learning_rate": 2.5985172811853883e-05,
1422
+ "loss": 0.0229,
1423
+ "mean_token_accuracy": 0.9926266454160213,
1424
+ "num_tokens": 229447767.0,
1425
+ "step": 1360
1426
+ },
1427
+ {
1428
+ "entropy": 0.16538406033068895,
1429
+ "epoch": 0.9514714818994704,
1430
+ "grad_norm": 0.2001953125,
1431
+ "learning_rate": 2.5902283579364855e-05,
1432
+ "loss": 0.0243,
1433
+ "mean_token_accuracy": 0.992415937781334,
1434
+ "num_tokens": 231143110.0,
1435
+ "step": 1370
1436
+ },
1437
+ {
1438
+ "entropy": 0.1578194869682193,
1439
+ "epoch": 0.9584165292126052,
1440
+ "grad_norm": 0.166015625,
1441
+ "learning_rate": 2.5818682539728924e-05,
1442
+ "loss": 0.0228,
1443
+ "mean_token_accuracy": 0.9929021798074246,
1444
+ "num_tokens": 232890680.0,
1445
+ "step": 1380
1446
+ },
1447
+ {
1448
+ "entropy": 0.16025031013414265,
1449
+ "epoch": 0.9653615765257401,
1450
+ "grad_norm": 0.201171875,
1451
+ "learning_rate": 2.573437515123536e-05,
1452
+ "loss": 0.0232,
1453
+ "mean_token_accuracy": 0.9927003815770149,
1454
+ "num_tokens": 234606776.0,
1455
+ "step": 1390
1456
+ },
1457
+ {
1458
+ "entropy": 0.1669618772342801,
1459
+ "epoch": 0.9723066238388749,
1460
+ "grad_norm": 0.173828125,
1461
+ "learning_rate": 2.5649366918290765e-05,
1462
+ "loss": 0.0252,
1463
+ "mean_token_accuracy": 0.992205673456192,
1464
+ "num_tokens": 236312207.0,
1465
+ "step": 1400
1466
+ },
1467
+ {
1468
+ "entropy": 0.1684069731272757,
1469
+ "epoch": 0.9792516711520097,
1470
+ "grad_norm": 0.296875,
1471
+ "learning_rate": 2.5563663391059675e-05,
1472
+ "loss": 0.0241,
1473
+ "mean_token_accuracy": 0.9923799067735672,
1474
+ "num_tokens": 238000255.0,
1475
+ "step": 1410
1476
+ },
1477
+ {
1478
+ "entropy": 0.16883580991998315,
1479
+ "epoch": 0.9861967184651446,
1480
+ "grad_norm": 0.3359375,
1481
+ "learning_rate": 2.5477270165102207e-05,
1482
+ "loss": 0.0242,
1483
+ "mean_token_accuracy": 0.992599005997181,
1484
+ "num_tokens": 239682996.0,
1485
+ "step": 1420
1486
+ },
1487
+ {
1488
+ "entropy": 0.1691451609134674,
1489
+ "epoch": 0.9931417657782794,
1490
+ "grad_norm": 0.3046875,
1491
+ "learning_rate": 2.5390192881008734e-05,
1492
+ "loss": 0.0254,
1493
+ "mean_token_accuracy": 0.9921128831803798,
1494
+ "num_tokens": 241366834.0,
1495
+ "step": 1430
1496
+ },
1497
+ {
1498
+ "entropy": 0.17308599907386152,
1499
+ "epoch": 1.0,
1500
+ "grad_norm": 0.23046875,
1501
+ "learning_rate": 2.530243722403158e-05,
1502
+ "loss": 0.0251,
1503
+ "mean_token_accuracy": 0.9923322027242636,
1504
+ "num_tokens": 243025309.0,
1505
+ "step": 1440
1506
+ },
1507
+ {
1508
+ "entropy": 0.1778549592010677,
1509
+ "epoch": 1.0069450473131347,
1510
+ "grad_norm": 0.2158203125,
1511
+ "learning_rate": 2.5214008923713855e-05,
1512
+ "loss": 0.0255,
1513
+ "mean_token_accuracy": 0.9925065904855728,
1514
+ "num_tokens": 244698287.0,
1515
+ "step": 1450
1516
+ },
1517
+ {
1518
+ "entropy": 0.17283849623054265,
1519
+ "epoch": 1.0138900946262697,
1520
+ "grad_norm": 0.1298828125,
1521
+ "learning_rate": 2.512491375351539e-05,
1522
+ "loss": 0.0243,
1523
+ "mean_token_accuracy": 0.9923485726118088,
1524
+ "num_tokens": 246367654.0,
1525
+ "step": 1460
1526
+ },
1527
+ {
1528
+ "entropy": 0.1658339927904308,
1529
+ "epoch": 1.0208351419394044,
1530
+ "grad_norm": 0.173828125,
1531
+ "learning_rate": 2.5035157530435748e-05,
1532
+ "loss": 0.0246,
1533
+ "mean_token_accuracy": 0.9923050880432129,
1534
+ "num_tokens": 248073194.0,
1535
+ "step": 1470
1536
+ },
1537
+ {
1538
+ "entropy": 0.16023449478670954,
1539
+ "epoch": 1.0277801892525393,
1540
+ "grad_norm": 0.138671875,
1541
+ "learning_rate": 2.4944746114634462e-05,
1542
+ "loss": 0.0226,
1543
+ "mean_token_accuracy": 0.9928353972733021,
1544
+ "num_tokens": 249843884.0,
1545
+ "step": 1480
1546
+ },
1547
+ {
1548
+ "entropy": 0.17396175684407353,
1549
+ "epoch": 1.034725236565674,
1550
+ "grad_norm": 0.240234375,
1551
+ "learning_rate": 2.4853685409048427e-05,
1552
+ "loss": 0.0251,
1553
+ "mean_token_accuracy": 0.9921702355146408,
1554
+ "num_tokens": 251503510.0,
1555
+ "step": 1490
1556
+ },
1557
+ {
1558
+ "entropy": 0.16662784228101374,
1559
+ "epoch": 1.041670283878809,
1560
+ "grad_norm": 0.3203125,
1561
+ "learning_rate": 2.476198135900648e-05,
1562
+ "loss": 0.0238,
1563
+ "mean_token_accuracy": 0.9925790295004845,
1564
+ "num_tokens": 253182254.0,
1565
+ "step": 1500
1566
+ },
1567
+ {
1568
+ "epoch": 1.041670283878809,
1569
+ "eval_entropy": 0.2101928804123574,
1570
+ "eval_loss": 0.030296573415398598,
1571
+ "eval_mean_token_accuracy": 0.9917910993577581,
1572
+ "eval_num_tokens": 253182254.0,
1573
+ "eval_runtime": 81.0064,
1574
+ "eval_samples_per_second": 122.953,
1575
+ "eval_steps_per_second": 7.691,
1576
+ "step": 1500
1577
+ }
1578
+ ],
1579
+ "logging_steps": 10,
1580
+ "max_steps": 4320,
1581
+ "num_input_tokens_seen": 0,
1582
+ "num_train_epochs": 3,
1583
+ "save_steps": 500,
1584
+ "stateful_callbacks": {
1585
+ "TrainerControl": {
1586
+ "args": {
1587
+ "should_epoch_stop": false,
1588
+ "should_evaluate": false,
1589
+ "should_log": false,
1590
+ "should_save": true,
1591
+ "should_training_stop": false
1592
+ },
1593
+ "attributes": {}
1594
+ }
1595
+ },
1596
+ "total_flos": 5.394361222344737e+18,
1597
+ "train_batch_size": 4,
1598
+ "trial_name": null,
1599
+ "trial_params": null
1600
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8363ddab34be1be669fb9c3b0e477751c99c8dd6938033052e7430da744c5331
3
+ size 6289
checkpoint-1500/video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_sample_frames": true,
12
+ "fps": 2,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "input_data_format": null,
24
+ "max_frames": 768,
25
+ "merge_size": 2,
26
+ "min_frames": 4,
27
+ "num_frames": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_metadata": false,
34
+ "size": {
35
+ "longest_edge": 25165824,
36
+ "shortest_edge": 4096
37
+ },
38
+ "temporal_patch_size": 2,
39
+ "video_metadata": null,
40
+ "video_processor_type": "Qwen3VLVideoProcessor"
41
+ }