luotingdan commited on
Commit ·
fab406b
1
Parent(s): 3d4b323
add generation config and update Readme
Browse files- README.md +13 -22
- chat_template.jinja +0 -81
- generation_config.json +10 -0
README.md
CHANGED
|
@@ -46,8 +46,8 @@ STEP3-VL-10B delivers best-in-class performance across major multimodal benchmar
|
|
| 46 |
|
| 47 |
| Benchmark | STEP3-VL-10B (SeRe) | STEP3-VL-10B (PaCoRe) | GLM-4.6V (106B-A12B) | Qwen3-VL (235B-A22B) | Gemini-2.5-Pro | Seed-1.5-VL |
|
| 48 |
| :---------------- | :-----------------: | :-------------------: | :------------------: | :------------------: | :------------: | :---------: |
|
| 49 |
-
| **MMMU**
|
| 50 |
-
| **MathVista**
|
| 51 |
| **MathVision** | 70.81 | **75.95** | 63.50 | 72.10 | 73.30 | 68.70 |
|
| 52 |
| **MMBench (EN)** | 92.05 | 92.38 | 92.75 | 92.70 | **93.19** | 92.11 |
|
| 53 |
| **MMStar** | 77.48 | 77.64 | 75.30 | 76.80 | **79.18** | 77.91 |
|
|
@@ -123,7 +123,8 @@ We introduce how to use our model at inference stage using transformers library.
|
|
| 123 |
|
| 124 |
```python
|
| 125 |
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 126 |
-
|
|
|
|
| 127 |
|
| 128 |
key_mapping = {
|
| 129 |
"^vision_model": "model.vision_model",
|
|
@@ -134,16 +135,9 @@ key_mapping = {
|
|
| 134 |
model_path = "stepfun-ai/Step3-VL-10B-Base"
|
| 135 |
|
| 136 |
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
"role": "user",
|
| 141 |
-
"content": [
|
| 142 |
-
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
|
| 143 |
-
{"type": "text", "text": "What's in this picture?"}
|
| 144 |
-
]
|
| 145 |
-
},
|
| 146 |
-
]
|
| 147 |
|
| 148 |
model = AutoModelForCausalLM.from_pretrained(
|
| 149 |
model_path,
|
|
@@ -152,12 +146,7 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 152 |
torch_dtype="auto",
|
| 153 |
key_mapping=key_mapping).eval()
|
| 154 |
|
| 155 |
-
|
| 156 |
-
inputs = processor.apply_chat_template(
|
| 157 |
-
messages, add_generation_prompt=True, tokenize=True,
|
| 158 |
-
return_dict=True, return_tensors="pt"
|
| 159 |
-
).to(model.device)
|
| 160 |
-
|
| 161 |
|
| 162 |
generate_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
|
| 163 |
decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)
|
|
@@ -165,22 +154,24 @@ decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], ski
|
|
| 165 |
print(decoded)
|
| 166 |
```
|
| 167 |
|
|
|
|
|
|
|
| 168 |
## 📜 Citation
|
| 169 |
|
| 170 |
If you find this project useful in your research, please cite our technical report:
|
| 171 |
|
| 172 |
```tex
|
| 173 |
@misc{huang2026step3vl10btechnicalreport,
|
| 174 |
-
title={STEP3-VL-10B Technical Report},
|
| 175 |
author={Ailin Huang and Chengyuan Yao and Chunrui Han and Fanqi Wan and Hangyu Guo and Haoran Lv and Hongyu Zhou and Jia Wang and Jian Zhou and Jianjian Sun and Jingcheng Hu and Kangheng Lin and Liang Zhao and Mitt Huang and Song Yuan and Wenwen Qu and Xiangfeng Wang and Yanlin Lai and Yingxiu Zhao and Yinmin Zhang and Yukang Shi and Yuyang Chen and Zejia Weng and Ziyang Meng and Ang Li and Aobo Kong and Bo Dong and Changyi Wan and David Wang and Di Qi and Dingming Li and En Yu and Guopeng Li and Haiquan Yin and Han Zhou and Hanshan Zhang and Haolong Yan and Hebin Zhou and Hongbo Peng and Jiaran Zhang and Jiashu Lv and Jiayi Fu and Jie Cheng and Jie Zhou and Jisheng Yin and Jingjing Xie and Jingwei Wu and Jun Zhang and Junfeng Liu and Kaijun Tan and Kaiwen Yan and Liangyu Chen and Lina Chen and Mingliang Li and Qian Zhao and Quan Sun and Shaoliang Pang and Shengjie Fan and Shijie Shang and Siyuan Zhang and Tianhao You and Wei Ji and Wuxun Xie and Xiaobo Yang and Xiaojie Hou and Xiaoran Jiao and Xiaoxiao Ren and Xiangwen Kong and Xin Huang and Xin Wu and Xing Chen and Xinran Wang and Xuelin Zhang and Yana Wei and Yang Li and Yanming Xu and Yeqing Shen and Yuang Peng and Yue Peng and Yu Zhou and Yusheng Li and Yuxiang Yang and Yuyang Zhang and Zhe Xie and Zhewei Huang and Zhenyi Lu and Zhimin Fan and Zihui Cheng and Daxin Jiang and Qi Han and Xiangyu Zhang and Yibo Zhu and Zheng Ge},
|
| 176 |
year={2026},
|
| 177 |
eprint={2601.09668},
|
| 178 |
archivePrefix={arXiv},
|
| 179 |
primaryClass={cs.CV},
|
| 180 |
-
url={https://arxiv.org/abs/2601.09668},
|
| 181 |
}
|
| 182 |
```
|
| 183 |
|
| 184 |
## 📄 License
|
| 185 |
|
| 186 |
-
This project is open-sourced under the [Apache 2.0 License](https://www.google.com/search?q=LICENSE).
|
|
|
|
| 46 |
|
| 47 |
| Benchmark | STEP3-VL-10B (SeRe) | STEP3-VL-10B (PaCoRe) | GLM-4.6V (106B-A12B) | Qwen3-VL (235B-A22B) | Gemini-2.5-Pro | Seed-1.5-VL |
|
| 48 |
| :---------------- | :-----------------: | :-------------------: | :------------------: | :------------------: | :------------: | :---------: |
|
| 49 |
+
| **MMMU** | 78.11 | 80.11 | 75.20 | 78.70 | **83.89** | 79.11 |
|
| 50 |
+
| **MathVista** | 83.97 | 85.50 | 83.51 | 85.10 | 83.88 | **85.60** |
|
| 51 |
| **MathVision** | 70.81 | **75.95** | 63.50 | 72.10 | 73.30 | 68.70 |
|
| 52 |
| **MMBench (EN)** | 92.05 | 92.38 | 92.75 | 92.70 | **93.19** | 92.11 |
|
| 53 |
| **MMStar** | 77.48 | 77.64 | 75.30 | 76.80 | **79.18** | 77.91 |
|
|
|
|
| 123 |
|
| 124 |
```python
|
| 125 |
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 126 |
+
from PIL import Image
|
| 127 |
+
import requests
|
| 128 |
|
| 129 |
key_mapping = {
|
| 130 |
"^vision_model": "model.vision_model",
|
|
|
|
| 135 |
model_path = "stepfun-ai/Step3-VL-10B-Base"
|
| 136 |
|
| 137 |
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 138 |
+
prompt = "<im_patch> What's the content of the image?"
|
| 139 |
+
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
|
| 140 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
model = AutoModelForCausalLM.from_pretrained(
|
| 143 |
model_path,
|
|
|
|
| 146 |
torch_dtype="auto",
|
| 147 |
key_mapping=key_mapping).eval()
|
| 148 |
|
| 149 |
+
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
generate_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
|
| 152 |
decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)
|
|
|
|
| 154 |
print(decoded)
|
| 155 |
```
|
| 156 |
|
| 157 |
+
**Note:** It is not recommended to deploy the base model with vLLM; using the Instruct or Chat version is preferred.
|
| 158 |
+
|
| 159 |
## 📜 Citation
|
| 160 |
|
| 161 |
If you find this project useful in your research, please cite our technical report:
|
| 162 |
|
| 163 |
```tex
|
| 164 |
@misc{huang2026step3vl10btechnicalreport,
|
| 165 |
+
title={STEP3-VL-10B Technical Report},
|
| 166 |
author={Ailin Huang and Chengyuan Yao and Chunrui Han and Fanqi Wan and Hangyu Guo and Haoran Lv and Hongyu Zhou and Jia Wang and Jian Zhou and Jianjian Sun and Jingcheng Hu and Kangheng Lin and Liang Zhao and Mitt Huang and Song Yuan and Wenwen Qu and Xiangfeng Wang and Yanlin Lai and Yingxiu Zhao and Yinmin Zhang and Yukang Shi and Yuyang Chen and Zejia Weng and Ziyang Meng and Ang Li and Aobo Kong and Bo Dong and Changyi Wan and David Wang and Di Qi and Dingming Li and En Yu and Guopeng Li and Haiquan Yin and Han Zhou and Hanshan Zhang and Haolong Yan and Hebin Zhou and Hongbo Peng and Jiaran Zhang and Jiashu Lv and Jiayi Fu and Jie Cheng and Jie Zhou and Jisheng Yin and Jingjing Xie and Jingwei Wu and Jun Zhang and Junfeng Liu and Kaijun Tan and Kaiwen Yan and Liangyu Chen and Lina Chen and Mingliang Li and Qian Zhao and Quan Sun and Shaoliang Pang and Shengjie Fan and Shijie Shang and Siyuan Zhang and Tianhao You and Wei Ji and Wuxun Xie and Xiaobo Yang and Xiaojie Hou and Xiaoran Jiao and Xiaoxiao Ren and Xiangwen Kong and Xin Huang and Xin Wu and Xing Chen and Xinran Wang and Xuelin Zhang and Yana Wei and Yang Li and Yanming Xu and Yeqing Shen and Yuang Peng and Yue Peng and Yu Zhou and Yusheng Li and Yuxiang Yang and Yuyang Zhang and Zhe Xie and Zhewei Huang and Zhenyi Lu and Zhimin Fan and Zihui Cheng and Daxin Jiang and Qi Han and Xiangyu Zhang and Yibo Zhu and Zheng Ge},
|
| 167 |
year={2026},
|
| 168 |
eprint={2601.09668},
|
| 169 |
archivePrefix={arXiv},
|
| 170 |
primaryClass={cs.CV},
|
| 171 |
+
url={https://arxiv.org/abs/2601.09668},
|
| 172 |
}
|
| 173 |
```
|
| 174 |
|
| 175 |
## 📄 License
|
| 176 |
|
| 177 |
+
This project is open-sourced under the [Apache 2.0 License](LICENSE).
|
chat_template.jinja
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}
|
| 2 |
-
{%- if tools %}
|
| 3 |
-
{{- '<|im_start|>system\n' }}
|
| 4 |
-
{%- if messages[0].role == 'system' %}
|
| 5 |
-
{{- render_content(messages[0].content) + '\n\n' }}
|
| 6 |
-
{%- endif %}
|
| 7 |
-
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 8 |
-
{%- for tool in tools %}
|
| 9 |
-
{{- "\n" }}
|
| 10 |
-
{{- tool | tojson }}
|
| 11 |
-
{%- endfor %}
|
| 12 |
-
{{- "\n</tools>\n\nAlways adhere to this exact format for tool use:\n<tool_calls>\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n{additional_tool_calls}</tool_calls>\n\nNote:\n- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags.\n- `<function-name>` must be an exact match to one of the available tools.\n- `<args-json-object>` must be valid JSON that strictly follows the tool's parameters schema.<|im_end|>\n" }}
|
| 13 |
-
{%- else %}
|
| 14 |
-
{%- if messages[0].role == 'system' %}
|
| 15 |
-
{{- '<|im_start|>system\n' + render_content(messages[0].content) + '<|im_end|>\n' }}
|
| 16 |
-
{%- endif %}
|
| 17 |
-
{%- endif %}
|
| 18 |
-
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 19 |
-
{%- for message in messages[::-1] %}
|
| 20 |
-
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 21 |
-
{%- if ns.multi_step_tool and message.role == "user" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}
|
| 22 |
-
{%- set ns.multi_step_tool = false %}
|
| 23 |
-
{%- set ns.last_query_index = index %}
|
| 24 |
-
{%- endif %}
|
| 25 |
-
{%- endfor %}
|
| 26 |
-
{%- for message in messages %}
|
| 27 |
-
{%- set content = render_content(message.content) %}
|
| 28 |
-
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 29 |
-
{%- set role_name = 'observation' if (message.role == "system" and not loop.first and message.name == 'observation') else message.role %}
|
| 30 |
-
{{- '<|im_start|>' + role_name + '\n' + content + '<|im_end|>' + '\n' }}
|
| 31 |
-
{%- elif message.role == "assistant" %}
|
| 32 |
-
{%- if message.reasoning_content is string %}
|
| 33 |
-
{%- set reasoning_content = render_content(message.reasoning_content) %}
|
| 34 |
-
{%- else %}
|
| 35 |
-
{%- if '</think>' in content %}
|
| 36 |
-
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 37 |
-
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 38 |
-
{%- else %}
|
| 39 |
-
{%- set reasoning_content = '' %}
|
| 40 |
-
{%- endif %}
|
| 41 |
-
{%- endif %}
|
| 42 |
-
{%- if loop.index0 > ns.last_query_index %}
|
| 43 |
-
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n' + content }}
|
| 44 |
-
{%- else %}
|
| 45 |
-
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 46 |
-
{%- endif %}
|
| 47 |
-
{%- if message.tool_calls %}
|
| 48 |
-
{{- '\n<tool_calls>' }}
|
| 49 |
-
{%- for tool_call in message.tool_calls %}
|
| 50 |
-
{{- '\n' }}
|
| 51 |
-
{%- if tool_call.function %}
|
| 52 |
-
{%- set tool_call = tool_call.function %}
|
| 53 |
-
{%- endif %}
|
| 54 |
-
{{- '<tool_call>\n{"name": "' }}
|
| 55 |
-
{{- tool_call.name }}
|
| 56 |
-
{{- '", "arguments": ' }}
|
| 57 |
-
{%- if tool_call.arguments is string %}
|
| 58 |
-
{{- tool_call.arguments }}
|
| 59 |
-
{%- else %}
|
| 60 |
-
{{- tool_call.arguments | tojson }}
|
| 61 |
-
{%- endif %}
|
| 62 |
-
{{- '}\n</tool_call>' }}
|
| 63 |
-
{%- endfor %}
|
| 64 |
-
{{- '\n</tool_calls>' }}
|
| 65 |
-
{%- endif %}
|
| 66 |
-
{{- '<|im_end|>\n' }}
|
| 67 |
-
{%- elif message.role == "tool" %}
|
| 68 |
-
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 69 |
-
{{- '<|im_start|>tool_response' }}
|
| 70 |
-
{%- endif %}
|
| 71 |
-
{{- '\n<tool_response>\n' }}
|
| 72 |
-
{{- content }}
|
| 73 |
-
{{- '\n</tool_response>' }}
|
| 74 |
-
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 75 |
-
{{- '<|im_end|>\n' }}
|
| 76 |
-
{%- endif %}
|
| 77 |
-
{%- endif %}
|
| 78 |
-
{%- endfor %}
|
| 79 |
-
{%- if add_generation_prompt %}
|
| 80 |
-
{{- '<|im_start|>assistant\n<think>\n' }}
|
| 81 |
-
{%- endif %}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
generation_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"temperature": 1.0,
|
| 3 |
+
"top_p": 1.0,
|
| 4 |
+
"top_k": 0,
|
| 5 |
+
"eos_token_id": [
|
| 6 |
+
151643,
|
| 7 |
+
151645,
|
| 8 |
+
151679
|
| 9 |
+
]
|
| 10 |
+
}
|