luotingdan commited on
Commit
fab406b
·
1 Parent(s): 3d4b323

add generation config and update Readme

Browse files
Files changed (3) hide show
  1. README.md +13 -22
  2. chat_template.jinja +0 -81
  3. generation_config.json +10 -0
README.md CHANGED
@@ -46,8 +46,8 @@ STEP3-VL-10B delivers best-in-class performance across major multimodal benchmar
46
 
47
  | Benchmark | STEP3-VL-10B (SeRe) | STEP3-VL-10B (PaCoRe) | GLM-4.6V (106B-A12B) | Qwen3-VL (235B-A22B) | Gemini-2.5-Pro | Seed-1.5-VL |
48
  | :---------------- | :-----------------: | :-------------------: | :------------------: | :------------------: | :------------: | :---------: |
49
- | **MMMU** | 78.11 | 80.11 | 75.20 | 78.70 | **83.89** | 79.11 |
50
- | **MathVista** | 83.97 | 85.50 | 83.51 | 85.10 | 83.88 | **85.60** |
51
  | **MathVision** | 70.81 | **75.95** | 63.50 | 72.10 | 73.30 | 68.70 |
52
  | **MMBench (EN)** | 92.05 | 92.38 | 92.75 | 92.70 | **93.19** | 92.11 |
53
  | **MMStar** | 77.48 | 77.64 | 75.30 | 76.80 | **79.18** | 77.91 |
@@ -123,7 +123,8 @@ We introduce how to use our model at inference stage using transformers library.
123
 
124
  ```python
125
  from transformers import AutoProcessor, AutoModelForCausalLM
126
-
 
127
 
128
  key_mapping = {
129
  "^vision_model": "model.vision_model",
@@ -134,16 +135,9 @@ key_mapping = {
134
  model_path = "stepfun-ai/Step3-VL-10B-Base"
135
 
136
  processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
137
-
138
- messages = [
139
- {
140
- "role": "user",
141
- "content": [
142
- {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
143
- {"type": "text", "text": "What's in this picture?"}
144
- ]
145
- },
146
- ]
147
 
148
  model = AutoModelForCausalLM.from_pretrained(
149
  model_path,
@@ -152,12 +146,7 @@ model = AutoModelForCausalLM.from_pretrained(
152
  torch_dtype="auto",
153
  key_mapping=key_mapping).eval()
154
 
155
-
156
- inputs = processor.apply_chat_template(
157
- messages, add_generation_prompt=True, tokenize=True,
158
- return_dict=True, return_tensors="pt"
159
- ).to(model.device)
160
-
161
 
162
  generate_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
163
  decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)
@@ -165,22 +154,24 @@ decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], ski
165
  print(decoded)
166
  ```
167
 
 
 
168
  ## 📜 Citation
169
 
170
  If you find this project useful in your research, please cite our technical report:
171
 
172
  ```tex
173
  @misc{huang2026step3vl10btechnicalreport,
174
- title={STEP3-VL-10B Technical Report},
175
  author={Ailin Huang and Chengyuan Yao and Chunrui Han and Fanqi Wan and Hangyu Guo and Haoran Lv and Hongyu Zhou and Jia Wang and Jian Zhou and Jianjian Sun and Jingcheng Hu and Kangheng Lin and Liang Zhao and Mitt Huang and Song Yuan and Wenwen Qu and Xiangfeng Wang and Yanlin Lai and Yingxiu Zhao and Yinmin Zhang and Yukang Shi and Yuyang Chen and Zejia Weng and Ziyang Meng and Ang Li and Aobo Kong and Bo Dong and Changyi Wan and David Wang and Di Qi and Dingming Li and En Yu and Guopeng Li and Haiquan Yin and Han Zhou and Hanshan Zhang and Haolong Yan and Hebin Zhou and Hongbo Peng and Jiaran Zhang and Jiashu Lv and Jiayi Fu and Jie Cheng and Jie Zhou and Jisheng Yin and Jingjing Xie and Jingwei Wu and Jun Zhang and Junfeng Liu and Kaijun Tan and Kaiwen Yan and Liangyu Chen and Lina Chen and Mingliang Li and Qian Zhao and Quan Sun and Shaoliang Pang and Shengjie Fan and Shijie Shang and Siyuan Zhang and Tianhao You and Wei Ji and Wuxun Xie and Xiaobo Yang and Xiaojie Hou and Xiaoran Jiao and Xiaoxiao Ren and Xiangwen Kong and Xin Huang and Xin Wu and Xing Chen and Xinran Wang and Xuelin Zhang and Yana Wei and Yang Li and Yanming Xu and Yeqing Shen and Yuang Peng and Yue Peng and Yu Zhou and Yusheng Li and Yuxiang Yang and Yuyang Zhang and Zhe Xie and Zhewei Huang and Zhenyi Lu and Zhimin Fan and Zihui Cheng and Daxin Jiang and Qi Han and Xiangyu Zhang and Yibo Zhu and Zheng Ge},
176
  year={2026},
177
  eprint={2601.09668},
178
  archivePrefix={arXiv},
179
  primaryClass={cs.CV},
180
- url={https://arxiv.org/abs/2601.09668},
181
  }
182
  ```
183
 
184
  ## 📄 License
185
 
186
- This project is open-sourced under the [Apache 2.0 License](LICENSE).
 
46
 
47
  | Benchmark | STEP3-VL-10B (SeRe) | STEP3-VL-10B (PaCoRe) | GLM-4.6V (106B-A12B) | Qwen3-VL (235B-A22B) | Gemini-2.5-Pro | Seed-1.5-VL |
48
  | :---------------- | :-----------------: | :-------------------: | :------------------: | :------------------: | :------------: | :---------: |
49
+ | **MMMU** | 78.11 | 80.11 | 75.20 | 78.70 | **83.89** | 79.11 |
50
+ | **MathVista** | 83.97 | 85.50 | 83.51 | 85.10 | 83.88 | **85.60** |
51
  | **MathVision** | 70.81 | **75.95** | 63.50 | 72.10 | 73.30 | 68.70 |
52
  | **MMBench (EN)** | 92.05 | 92.38 | 92.75 | 92.70 | **93.19** | 92.11 |
53
  | **MMStar** | 77.48 | 77.64 | 75.30 | 76.80 | **79.18** | 77.91 |
 
123
 
124
  ```python
125
  from transformers import AutoProcessor, AutoModelForCausalLM
126
+ from PIL import Image
127
+ import requests
128
 
129
  key_mapping = {
130
  "^vision_model": "model.vision_model",
 
135
  model_path = "stepfun-ai/Step3-VL-10B-Base"
136
 
137
  processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
138
+ prompt = "<im_patch> What's the content of the image?"
139
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
140
+ image = Image.open(requests.get(url, stream=True).raw)
 
 
 
 
 
 
 
141
 
142
  model = AutoModelForCausalLM.from_pretrained(
143
  model_path,
 
146
  torch_dtype="auto",
147
  key_mapping=key_mapping).eval()
148
 
149
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
 
 
 
 
 
150
 
151
  generate_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
152
  decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)
 
154
  print(decoded)
155
  ```
156
 
157
+ **Note:** It is not recommended to deploy the base model with vLLM; using the Instruct or Chat version is preferred.
158
+
159
  ## 📜 Citation
160
 
161
  If you find this project useful in your research, please cite our technical report:
162
 
163
  ```tex
164
  @misc{huang2026step3vl10btechnicalreport,
165
+ title={STEP3-VL-10B Technical Report},
166
  author={Ailin Huang and Chengyuan Yao and Chunrui Han and Fanqi Wan and Hangyu Guo and Haoran Lv and Hongyu Zhou and Jia Wang and Jian Zhou and Jianjian Sun and Jingcheng Hu and Kangheng Lin and Liang Zhao and Mitt Huang and Song Yuan and Wenwen Qu and Xiangfeng Wang and Yanlin Lai and Yingxiu Zhao and Yinmin Zhang and Yukang Shi and Yuyang Chen and Zejia Weng and Ziyang Meng and Ang Li and Aobo Kong and Bo Dong and Changyi Wan and David Wang and Di Qi and Dingming Li and En Yu and Guopeng Li and Haiquan Yin and Han Zhou and Hanshan Zhang and Haolong Yan and Hebin Zhou and Hongbo Peng and Jiaran Zhang and Jiashu Lv and Jiayi Fu and Jie Cheng and Jie Zhou and Jisheng Yin and Jingjing Xie and Jingwei Wu and Jun Zhang and Junfeng Liu and Kaijun Tan and Kaiwen Yan and Liangyu Chen and Lina Chen and Mingliang Li and Qian Zhao and Quan Sun and Shaoliang Pang and Shengjie Fan and Shijie Shang and Siyuan Zhang and Tianhao You and Wei Ji and Wuxun Xie and Xiaobo Yang and Xiaojie Hou and Xiaoran Jiao and Xiaoxiao Ren and Xiangwen Kong and Xin Huang and Xin Wu and Xing Chen and Xinran Wang and Xuelin Zhang and Yana Wei and Yang Li and Yanming Xu and Yeqing Shen and Yuang Peng and Yue Peng and Yu Zhou and Yusheng Li and Yuxiang Yang and Yuyang Zhang and Zhe Xie and Zhewei Huang and Zhenyi Lu and Zhimin Fan and Zihui Cheng and Daxin Jiang and Qi Han and Xiangyu Zhang and Yibo Zhu and Zheng Ge},
167
  year={2026},
168
  eprint={2601.09668},
169
  archivePrefix={arXiv},
170
  primaryClass={cs.CV},
171
+ url={https://arxiv.org/abs/2601.09668},
172
  }
173
  ```
174
 
175
  ## 📄 License
176
 
177
+ This project is open-sourced under the [Apache 2.0 License](LICENSE).
chat_template.jinja DELETED
@@ -1,81 +0,0 @@
1
- {% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
2
- {%- if tools %}
3
- {{- '<|im_start|>system\n' }}
4
- {%- if messages[0].role == 'system' %}
5
- {{- render_content(messages[0].content) + '\n\n' }}
6
- {%- endif %}
7
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
8
- {%- for tool in tools %}
9
- {{- "\n" }}
10
- {{- tool | tojson }}
11
- {%- endfor %}
12
- {{- "\n</tools>\n\nAlways adhere to this exact format for tool use:\n<tool_calls>\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n{additional_tool_calls}</tool_calls>\n\nNote:\n- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags.\n- `<function-name>` must be an exact match to one of the available tools.\n- `<args-json-object>` must be valid JSON that strictly follows the tool's parameters schema.<|im_end|>\n" }}
13
- {%- else %}
14
- {%- if messages[0].role == 'system' %}
15
- {{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
16
- {%- endif %}
17
- {%- endif %}
18
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
19
- {%- for message in messages[::-1] %}
20
- {%- set index = (messages|length - 1) - loop.index0 %}
21
- {%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
22
- {%- set ns.multi_step_tool = false %}
23
- {%- set ns.last_query_index = index %}
24
- {%- endif %}
25
- {%- endfor %}
26
- {%- for message in messages %}
27
- {%- set content = render_content(message.content) %}
28
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
29
- {%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
30
- {{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
31
- {%- elif message.role == "assistant" %}
32
- {%- if message.reasoning_content is string %}
33
- {%- set reasoning_content = render_content(message.reasoning_content) %}
34
- {%- else %}
35
- {%- if '</think>' in content %}
36
- {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
- {%- set content = content.split('</think>')[-1].lstrip('\n') %}
38
- {%- else %}
39
- {%- set reasoning_content = '' %}
40
- {%- endif %}
41
- {%- endif %}
42
- {%- if loop.index0 > ns.last_query_index %}
43
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
44
- {%- else %}
45
- {{- '<|im_start|>' + message.role + '\n' + content }}
46
- {%- endif %}
47
- {%- if message.tool_calls %}
48
- {{- '\n<tool_calls>' }}
49
- {%- for tool_call in message.tool_calls %}
50
- {{- '\n' }}
51
- {%- if tool_call.function %}
52
- {%- set tool_call = tool_call.function %}
53
- {%- endif %}
54
- {{- '<tool_call>\n{"name": "' }}
55
- {{- tool_call.name }}
56
- {{- '", "arguments": ' }}
57
- {%- if tool_call.arguments is string %}
58
- {{- tool_call.arguments }}
59
- {%- else %}
60
- {{- tool_call.arguments | tojson }}
61
- {%- endif %}
62
- {{- '}\n</tool_call>' }}
63
- {%- endfor %}
64
- {{- '\n</tool_calls>' }}
65
- {%- endif %}
66
- {{- '<|im_end|>\n' }}
67
- {%- elif message.role == "tool" %}
68
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
69
- {{- '<|im_start|>tool_response' }}
70
- {%- endif %}
71
- {{- '\n<tool_response>\n' }}
72
- {{- content }}
73
- {{- '\n</tool_response>' }}
74
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
75
- {{- '<|im_end|>\n' }}
76
- {%- endif %}
77
- {%- endif %}
78
- {%- endfor %}
79
- {%- if add_generation_prompt %}
80
- {{- '<|im_start|>assistant\n<think>\n' }}
81
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "temperature": 1.0,
3
+ "top_p": 1.0,
4
+ "top_k": 0,
5
+ "eos_token_id": [
6
+ 151643,
7
+ 151645,
8
+ 151679
9
+ ]
10
+ }