wangrongsheng committed on
Commit a86f2f6 · verified · 1 Parent(s): 621e54c

Add files using upload-large-folder tool

Files changed (49)
  1. .gitignore +1 -0
  2. LICENSE +34 -0
  3. README.md +108 -6
  4. README_EN.md +111 -0
  5. checklist.chk +99 -0
  6. config.json +40 -0
  7. configuration_openpangu_moe.py +82 -0
  8. doc/docker.md +31 -0
  9. doc/docker_EN.md +31 -0
  10. doc/vllm_ascend_for_openpangu_ultra_moe_718b.md +215 -0
  11. doc/vllm_ascend_for_openpangu_ultra_moe_718b_EN.md +216 -0
  12. generation_config.json +11 -0
  13. inference/generate.py +106 -0
  14. inference/generate.sh +70 -0
  15. inference/model.py +918 -0
  16. inference/runner.py +411 -0
  17. inference/runner_config/tp1.yaml +30 -0
  18. inference/runner_config/tp32.yaml +30 -0
  19. inference/split_weight.py +387 -0
  20. inference/split_weight.sh +13 -0
  21. inference/vllm_ascend/_build_info.py +3 -0
  22. inference/vllm_ascend/attention/attention.py +1220 -0
  23. inference/vllm_ascend/attention/mla_v1.py +1224 -0
  24. inference/vllm_ascend/entrypoints/openai/reasoning_parsers/__init__.py +6 -0
  25. inference/vllm_ascend/entrypoints/openai/reasoning_parsers/pangu_reasoning_parser.py +171 -0
  26. inference/vllm_ascend/entrypoints/openai/tool_parsers/__init__.py +6 -0
  27. inference/vllm_ascend/entrypoints/openai/tool_parsers/pangu_tool_parser.py +300 -0
  28. inference/vllm_ascend/envs.py +153 -0
  29. inference/vllm_ascend/models/__init__.py +68 -0
  30. inference/vllm_ascend/models/open_pangu.py +1127 -0
  31. inference/vllm_ascend/ops/fused_moe.py +1530 -0
  32. inference/vllm_ascend/patch/worker/patch_common/__init__.py +27 -0
  33. inference/vllm_ascend/patch/worker/patch_common/patch_config.py +97 -0
  34. inference/vllm_ascend/patch/worker/patch_common/patch_parsers.py +26 -0
  35. inference/vllm_ascend/patch/worker/patch_common/patch_sampler.py +159 -0
  36. inference/vllm_ascend/quantization/w8a8.py +757 -0
  37. inference/vllm_ascend/quantization/w8a8_dynamic.py +831 -0
  38. inference/vllm_ascend/utils.py +563 -0
  39. inference/vllm_ascend/worker/model_runner_v1.py +0 -0
  40. inference/vllm_ascend/worker/npu_input_batch.py +796 -0
  41. model-00002-of-000062.safetensors +3 -0
  42. model-00003-of-000062.safetensors +3 -0
  43. model-00005-of-000062.safetensors +3 -0
  44. model-00045-of-000062.safetensors +3 -0
  45. model.safetensors.index.json +0 -0
  46. modeling_openpangu_moe.py +653 -0
  47. special_tokens_map.json +30 -0
  48. tokenization_openpangu.py +273 -0
  49. tokenizer_config.json +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
1
+ .DS_Store
LICENSE ADDED
@@ -0,0 +1,34 @@
1
+ OPENPANGU MODEL LICENSE AGREEMENT VERSION 1.0
2
+
3
+ This OPENPANGU MODEL LICENSE AGREEMENT VERSION 1.0 (the "Agreement") is a legal agreement between You and Huawei Technologies Co., Ltd. ("Huawei", "We" or "Us"), and it governs Your reproducing, use, modification, and distribution of openPangu as made available by Huawei under this Agreement.
4
+
5
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of openPangu, or otherwise accepting the terms of this Agreement, You agree to be bound by this Agreement.
6
+
7
+ 1. Definitions.
8
+ 1.1. “openPangu” or “Model” means openPangu large language models and software, including trained model weights, parameters (including optimizer states), accompanying source code and scripts released under this Agreement.
9
+ 1.2. “Derivative Model” means all (1) modifications to the Model, (2) works based on the Model, and (3) any other derivative works of the Model. For clarity, information or content results from operating or otherwise using the Model is not a Derivative Model.
10
+ 1.3. “You” or “Your” means an individual or Legal Entity exercising permissions granted by this Agreement and/or using the Model for any purpose.
11
+ 1.4. “Third Party” or “Third Parties” means individuals or legal entities that are not under common control with Us or You.
12
+
13
+ 2. License Grant. Subject to Your full compliance with the terms and conditions of this Agreement, We hereby grant to You a perpetual, worldwide, non-exclusive, non-transferable, no-charge, royalty-free license (except as stated in Section 3) to use, reproduce, modify, and distribute the Model.
14
+
15
+ 3. Conditions for License Grant. You represent and warrant that You will not access, download, install, run, deploy, integrate, modify, or otherwise use the Model, directly or indirectly, within the European Union.
16
+
17
+
18
+ 4. Redistribution.
19
+ 4.1. If You distribute the Model or Derivative Model, You shall retain in Your distribution (1) a copy of this agreement, and (2) all copyright notices and other notices of origin included in the Model that are applicable to Your distribution.
20
+ 4.2. Further, if You distribute or make available to Third Parties a product or service (including another AI model) based on the Model, You are required to (1) display the acknowledgement “Powered by openPangu” and (2) include a trademark notice “openPangu is a trademark of Huawei Technologies Co., Ltd.” on related webpages, user manuals, product documentations or other advertising materials mentioning features of the Model.
21
+ 4.3. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for Derivative Model made by You as a whole, provided Your use, reproduction, and distribution of the Model otherwise complies with the terms and conditions of this Agreement.
22
+
23
+ 5. Ownership. We do not claim ownership to any information or content generated using the Model or Derivative Model that are made by You. You are solely responsible for evaluating the accuracy and appropriateness of such information or content for Your use case.
24
+
25
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of Huawei, except as required for complying with Section 4.2.
26
+
27
+ 7. Indemnity. You will indemnify and hold harmless Huawei from and against any claim by any third party arising out of or related to Your use or distribution of the Model or Derivative Model made by You (e.g. a violation against Section 3). For avoidance of doubt, “third party” in this clause includes supervisory authorities.
28
+
29
+ 8. THE MODEL IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NONINFRINGEMENT, ACCURACY, OR THE ABSENCE OF LATENT OR OTHER DEFECTS OR ERRORS, WHETHER OR NOT DISCOVERABLE, ALL TO THE GREATEST EXTENT PERMISSIBLE UNDER APPLICABLE LAW.
30
+
31
+ 9. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MODEL, IN WHOLE OR IN PART, NO MATTER HOW IT’S CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
32
+
33
+
34
+ END OF THE TERMS AND CONDITIONS
README.md CHANGED
@@ -1,6 +1,108 @@
1
- ---
2
- license: other
3
- license_name: openpangu-model-license-agreement-version-1.0
4
- license_link: >-
5
- https://ai.gitcode.com/ascend-tribe/openpangu-ultra-moe-718b-model/blob/main/LICENSE
6
- ---
1
+ # 开源盘古 Ultra-MoE-718B
2
+ 中文 | [English](README_EN.md)
3
+
4
+ ## 1. 简介
5
+ openPangu-Ultra-MoE-718B 是基于昇腾NPU从零训练的大规模混合专家语言模型,总参数量为718B,激活参数量为39B。openPangu-Ultra-MoE-718B 训练了约19T tokens,具备快慢思考融合能力。
6
+
7
+ ## 2. 模型架构
8
+ openPangu-Ultra-MoE-718B 的模型架构采用了业界主流的Multi-head Latent Attention (MLA)、Multi-Token Prediction (MTP)、大稀疏比等架构,以及一些特有的设计:
9
+
10
+ - Depth-Scaled Sandwich-Norm和TinyInit:通过调整层归一化结构与参数初始化,提升训练稳定性。
11
+ - 基于EP-Group的负载均衡策略:通过优化负载均衡损失函数,改善专家特化效果。
12
+
13
+ ## 3. 测评结果
14
+
15
+ | 测评集 | 测评指标 | 慢思考 |
16
+ |:----------------:|:----------------------------:|:-----:|
17
+ | **通用能力** | | |
18
+ | C-Eval | Acc | 91.06 |
19
+ | CLUEWSC | Acc | 94.67 |
20
+ | MMLU-Pro | Exact Match | 82.40 |
21
+ | ArenaHard_v0.1 | w/o Style Control | 96.80 |
22
+ | GPQA-Diamond | Avg@4 | 76.77 |
23
+ | SuperGPQA | Acc | 61.67 |
24
+ | IF-Eval | Prompt Strict | 80.59 |
25
+ | SysBench | Constraint Satisfaction Rate | 91.43 |
26
+ | **数学能力** | | |
27
+ | CNMO 2024 | Avg@32 | 80.73 |
28
+ | AIME25 | Avg@16 | 75.21 |
29
+ | AIME24 | Avg@16 | 80.21 |
30
+ | MATH-500 | Avg@1 | 97.40 |
31
+ | **代码能力** | | |
32
+ | LiveCodeBench | Avg@3 (01/25~05/25) | 61.14 |
33
+ | MBPP+ | Avg@2 | 81.48 |
34
+
35
+ **注:** 评测过程中,system prompt 为空。
36
+
37
+
38
+ ## 4. 部署和使用
39
+ ### 4.1 环境准备
40
+ #### 硬件规格
41
+ Atlas 800T A2 (64GB, >=32卡),驱动与固件安装包获取请参照[[Atlas 800T A2](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.2.RC1.alpha003&driver=Ascend+HDK+25.0.RC1)]
42
+
43
+ #### 软件环境
44
+ - 方式一:基于裸机环境安装以下配套软件
45
+ - 操作系统:Linux(推荐openEuler>=24.03)
46
+ - CANN==8.1.RC1,安装准备及流程请参照[[CANN Install](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)]
47
+ - python==3.10
48
+ - torch==2.1.0
49
+ - torch-npu==2.1.0.post12
50
+ - transformers>=4.48.2
51
+
52
+ - 方式二:从docker镜像启动容器
53
+
54
+ 参考[[Docker使用指南](doc/docker.md)]
55
+
56
+ 以上软件配套经过验证,理论可以支持更高的版本,如有疑问,可以提交issue。
57
+
58
+ ### 4.2 权重完整性校验
59
+ 请参考以下方法对下载内容进行完整性校验,hash 值存储在 checklist.chk 文件中。
60
+
61
+ ```
62
+ #!/usr/bin/env bash
63
+ ARCH=$(uname -m)
64
+ MODEL_PATH="${TARGET_FOLDER}/${MODEL_FOLDER_PATH}"
65
+ cd "$MODEL_PATH" || exit 1
66
+ if [ "$ARCH" = "arm64" ]; then
67
+ shasum -a 256 -c checklist.chk  # macOS (arm64) 通常没有 sha256sum,改用 shasum 进行校验
68
+ else
69
+ sha256sum -c checklist.chk
70
+ fi
71
+ ```
72
+
73
+ ### 4.3 推理权重转换
74
+ 本次样例 openPangu-Ultra-MoE-718B 推理采用 Tensor Parallel 并行策略,叠加昇腾 NPU 融合大算子,需要提前对 safetensors 权重进行切分,下述内容提供32卡并行推理的权重切分示例,切分后的权重会保存在`model/`目录下:
75
+ ```bash
76
+ cd inference
77
+ bash split_weight.sh
78
+ ```
79
+
80
+ ### 4.4 推理样例
81
+ openPangu-Ultra-MoE-718B 在 Atlas 800T A2 上4机32卡bfloat16推理示例,主节点选取节点IP0:
82
+ ```bash
83
+ cd inference
84
+ # 主节点IP0: ${NNODES} ${NODE_RANK} ${NPROC_PER_NODE} ${MASTER_ADDR} ${PROMPT}
85
+ bash generate.sh 4 0 8 IP0 "3*7=?"
86
+ # 从节点IP1
87
+ bash generate.sh 4 1 8 IP0 "3*7=?"
88
+ # 从节点IP2
89
+ bash generate.sh 4 2 8 IP0 "3*7=?"
90
+ # 从节点IP3
91
+ bash generate.sh 4 3 8 IP0 "3*7=?"
92
+ ```
93
+ 模型默认为慢思考模式,可以通过以下手段切换至快思考模式:如`generate.py`示例中`fast_thinking_template`所示,在用户输入结尾添加` /no_think`标记可以将当前轮次切换至快思考模式。
94
+
95
+ ### 4.5 使用推理框架
96
+ vllm_ascend:参考[[vllm_ascend_for_openPangu_ultra_moe_718b](doc/vllm_ascend_for_openpangu_ultra_moe_718b.md)]
97
+
98
+ ## 5. 模型许可证
99
+ 除文件中对开源许可证另有约定外,openPangu-Ultra-MoE-718B 模型根据 OPENPANGU MODEL LICENSE AGREEMENT VERSION 1.0 授权,旨在允许使用并促进人工智能技术的进一步发展。有关详细信息,请参阅模型存储库根目录中的 [LICENSE](LICENSE) 文件。
100
+
101
+ ## 6. 免责声明
102
+ 由于 openPangu-Ultra-MoE-718B (“模型”)所依赖的技术固有的限制,以及人工智能生成的内容是由盘古自动生成的,华为无法对以下事项做出任何保证:
103
+ - 该模型的输出通过AI算法自动生成,不能排除某些信息可能存在缺陷、不合理或引起不适的可能性,生成的内容不代表华为的态度或立场;
104
+ - 无法保证该模型100%准确、可靠、功能齐全、及时、安全、无错误、不间断、持续稳定或无任何故障;
105
+ - 该模型的输出内容不构成任何建议或决策,也不保证生成的内容的真实性、完整性、准确性、及时性、合法性、功能性或实用性。生成的内容不能替代医疗、法律等领域的专业人士回答您的问题。生成的内容仅供参考,不代表华为的任何态度、立场或观点。您需要根据实际情况做出独立判断,华为不承担任何责任。
106
+
107
+ ## 7. 反馈
108
+ 如果有任何意见和建议,请提交issue或联系[openPangu@huawei.com](mailto:openPangu@huawei.com)。
README_EN.md ADDED
@@ -0,0 +1,111 @@
1
+ # openPangu-Ultra-MoE-718B
2
+ English | [中文](README.md)
3
+
4
+ ## 1. Introduction
5
+ openPangu-Ultra-MoE-718B is a large-scale mixture-of-experts language model trained from scratch on Ascend NPUs, with 718B total parameters and 39B activated parameters per token. It was trained on approximately 19 trillion tokens and is equipped with the capability to switch between fast and slow thinking.
6
+
7
+ ## 2. Model Architecture
8
+ The architecture of openPangu-Ultra-MoE-718B adopts mainstream components such as Multi-head Latent Attention (MLA), Multi-Token Prediction (MTP), and a high MoE sparsity ratio, and additionally features several distinctive designs:
9
+
10
+ - Depth-Scaled Sandwich-Norm and TinyInit: These techniques adjust the layer normalization structure and parameter initialization for improved training stability.
11
+
12
+ - EP-Group load balancing loss: This technique optimizes the load balancing loss, achieving better expert specialization.
13
+
14
+
15
+ ## 3. Results
16
+
17
+ | Benchmark | Metric | Slow-thinking |
18
+ |:-------------------------:|:------------------------------:|:-----------------:|
19
+ | **General** | | |
20
+ | C-Eval | Acc | 91.06 |
21
+ | CLUEWSC | Acc | 94.67 |
22
+ | MMLU-Pro | Exact Match | 82.40 |
23
+ | ArenaHard_v0.1 | w/o Style Control | 96.80 |
24
+ | GPQA-Diamond | Avg@4 | 76.77 |
25
+ | SuperGPQA | Acc | 61.67 |
26
+ | IF-Eval | Prompt Strict | 80.59 |
27
+ | SysBench | Constraint Satisfaction Rate | 91.43 |
28
+ | **Math** | | |
29
+ | CNMO 2024 | Avg@32 | 80.73 |
30
+ | AIME25 | Avg@16 | 75.21 |
31
+ | AIME24 | Avg@16 | 80.21 |
32
+ | MATH-500 | Avg@1 | 97.40 |
33
+ | **Coding** | | |
34
+ | LiveCodeBench | Avg@3 (01/25~05/25) | 61.14 |
35
+ | MBPP+ | Avg@2 | 81.48 |
36
+
37
+ **Note:** The system prompt is empty during the evaluation process.
38
+
39
+
40
+ ## 4. Deployment
41
+ ### 4.1 Environment
42
+ #### Hardware Requirements
43
+ Atlas 800T A2 (64GB, >=32 NPUs). Please refer to [[Atlas 800T A2](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.2.RC1.alpha003&driver=Ascend+HDK+25.0.RC1)] to obtain the driver and firmware installation packages.
44
+
45
+ #### System Requirements & Dependencies
46
+ - Method 1: Install the following supporting software in a bare-metal environment.
47
+ - System: Linux (openEuler ≥ 24.03 recommended)
48
+ - CANN==8.1.RC1, please refer to [[CANN Install](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)] for installation
49
+ - python==3.10
50
+ - torch==2.1.0
51
+ - torch-npu==2.1.0.post12
52
+ - transformers>=4.48.2
53
+
54
+ - Method 2: Start a container from a docker image.
55
+
56
+ Refer to the [[Docker User Guide](doc/docker_EN.md)]
57
+
58
+ The software versions listed above have been verified; newer versions should also work in principle. If you have any questions, please submit an issue.
59
+
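+ For the bare-metal setup (Method 1), the Python-level dependencies listed above can be installed in one step. A minimal sketch, assuming CANN, the driver/firmware, and a Python 3.10 environment are already in place (the exact torch-npu wheel must match your CANN version, so prefer the official Ascend guide if in doubt; `pyyaml` is needed by `inference/generate.py`):
+ ```bash
+ # Sketch only: versions taken from the dependency list above
+ pip install torch==2.1.0 torch-npu==2.1.0.post12 "transformers>=4.48.2" pyyaml
+ ```
+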
60
+ ### 4.2 Integrity Check
61
+ Please refer to the following methods to verify the integrity of the downloaded content. The hash values are stored in the `checklist.chk` file.
62
+
63
+ ```
64
+ #!/usr/bin/env bash
65
+ ARCH=$(uname -m)
66
+ MODEL_PATH="${TARGET_FOLDER}/${MODEL_FOLDER_PATH}"
67
+ cd "$MODEL_PATH" || exit 1
68
+ if [ "$ARCH" = "arm64" ]; then
69
+ shasum -a 256 -c checklist.chk  # macOS (arm64) typically lacks sha256sum; shasum verifies the same list
70
+ else
71
+ sha256sum -c checklist.chk
72
+ fi
73
+ ```
74
+
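+ For example (the folder names below are placeholders; point them at your actual download location):
+ ```bash
+ # Hypothetical paths for illustration only
+ export TARGET_FOLDER=/data/models
+ export MODEL_FOLDER_PATH=openPangu-Ultra-MoE-718B
+ cd "${TARGET_FOLDER}/${MODEL_FOLDER_PATH}" && sha256sum -c checklist.chk   # every line should end in ": OK"
+ ```
+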
75
+ ### 4.3 Model Weights Conversion
76
+ This inference example of openPangu-Ultra-MoE-718B adopts a Tensor Parallel strategy combined with fused Ascend NPU operators, which requires the safetensors weights to be pre-sharded. The following provides an example of weight sharding for 32-NPU parallel inference, with the split weights saved in the `model/` directory.
77
+ ```bash
78
+ cd inference
79
+ bash split_weight.sh
80
+ ```
81
+
82
+ ### 4.4 Inference Examples
83
+ The following provides a simple bfloat16 inference example of the openPangu-Ultra-MoE-718B deployed on a 4-node 32-NPU Atlas 800T A2 cluster, for which the node IP0 is selected as the master node:
84
+ ```bash
85
+ cd inference
86
+ # Master node IP0: ${NNODES} ${NODE_RANK} ${NPROC_PER_NODE} ${MASTER_ADDR} ${PROMPT}
87
+ bash generate.sh 4 0 8 IP0 "3*7=?"
88
+ # Worker node IP1
89
+ bash generate.sh 4 1 8 IP0 "3*7=?"
90
+ # Worker node IP2
91
+ bash generate.sh 4 2 8 IP0 "3*7=?"
92
+ # Worker node IP3
93
+ bash generate.sh 4 3 8 IP0 "3*7=?"
94
+ ```
95
+ The model operates in slow-thinking mode by default. To switch the current turn to fast-thinking mode, append the ` /no_think` flag to the end of the user input, as demonstrated by the `fast_thinking_template` in the `generate.py` example.
96
+
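+ For example, the multi-node launch above can answer the same question in fast-thinking mode by appending the flag to the prompt argument (a sketch; each node passes the same prompt):
+ ```bash
+ # Master node IP0; the trailing " /no_think" switches this turn to fast thinking
+ bash generate.sh 4 0 8 IP0 "3*7=? /no_think"
+ ```
+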
97
+ ### 4.5 Using Inference Framework
98
+ vllm-ascend: please refer to [[vllm_ascend_for_openpangu_ultra_moe_718b_EN](doc/vllm_ascend_for_openpangu_ultra_moe_718b_EN.md)]
99
+
100
+ ## 5. Model License
101
+ Unless otherwise noted, the openPangu-Ultra-MoE-718B model is licensed under the terms and conditions of the OPENPANGU MODEL LICENSE AGREEMENT VERSION 1.0, which is intended to allow use of the model and promote the further development of artificial intelligence technologies. Please refer to the [LICENSE](LICENSE) file located in the root directory of the model repository for details.
102
+
103
+ ## 6. Disclaimer
104
+ Due to the technical limitations inherent in the technology on which the openPangu-Ultra-MoE-718B (“Model”) relies, and because AI-generated content is produced automatically by the Model, Huawei cannot make any guarantees regarding the following matters:
105
+
106
+ - The output of this Model is automatically generated by AI algorithms; it cannot be ruled out that some of the information may be flawed, unreasonable, or cause discomfort, and the generated content does not represent Huawei's attitude or standpoint;
107
+ - There is no guarantee that this Model is 100% accurate, reliable, functional, timely, secure, safe, error-free, uninterrupted, continuously stable, or free of any faults;
108
+ - The output of this Model does not constitute any advice or decision for you, nor does it guarantee the authenticity, completeness, accuracy, timeliness, legality, functionality, or practicality of the generated content. The generated content cannot replace professionals in medical, legal, and other fields in answering your questions. The generated content is for your reference only and does not represent any attitude, standpoint, or position of Huawei. You need to make independent judgments based on your actual situation, and Huawei does not assume any responsibilities.
109
+
110
+ ## 7. Contact
111
+ If you have any questions, please raise an issue or contact us at [openPangu@huawei.com](mailto:openPangu@huawei.com).
checklist.chk ADDED
@@ -0,0 +1,99 @@
1
+ 714e6540d1371ad78a5816a6c69e2f0e330594939dd14b9dac7041e784d701a4 *./tokenizer_config.json
2
+ 6b16f1558c0cd4ae6ef1a2c605713be0a514f50e1ce2d2c878979ce988c148ec *./tokenizer.model
3
+ 81c7a7c24ed70acdeeff295a5a091a401cacfb49dcc30308a5b451956b051223 *./tokenization_openpangu.py
4
+ b34cf5e7c7660889303b6e2d0a346c440356385c9db551d06f6615cf9fc600d1 *./special_tokens_map.json
5
+ 035c0bc169b317e19096ac2f81d6fa790850533f573bedda9d3e88a1183aed9b *./modeling_openpangu_moe.py
6
+ 6580557dcb86f3285d3c056eeb21eb9b702e0fa908577ba471cc91ec6c9b3fcc *./model.safetensors.index.json
7
+ bc6505adabc0498ad07b49187858788c65c13dbf9446fd0bcf177a3e1b27220d *./inference/vllm_ascend/worker/npu_input_batch.py
8
+ 62c6734d1283e3d649a6478d2004f46bfee2f7878af7f2849c979b124e355302 *./inference/vllm_ascend/worker/model_runner_v1.py
9
+ e2457c558f048876afe069d1226e7080ac214478f1a9ac28ae472928b81b5a06 *./inference/vllm_ascend/utils.py
10
+ 6adfaa8a67ea9b561dec2e6a2392f6fc85ff376fb2030d8761c34c6c6d3f4cbf *./inference/vllm_ascend/quantization/w8a8_dynamic.py
11
+ 743bd96cfc109975a11fe5412c4b5de46f880501dcbbbdd10e11cbeb865fa4f2 *./inference/vllm_ascend/quantization/w8a8.py
12
+ e712ea36caf16c2a9dd21c5288f9d8e34c7fd2face444da44dca6db6c21f6c1b *./inference/vllm_ascend/patch/worker/patch_common/patch_sampler.py
13
+ 8c59df8086bde0cd4df674403f83000921a34403651a8ff2b31de9b28768247a *./inference/vllm_ascend/patch/worker/patch_common/patch_parsers.py
14
+ 8436ab93933989431160e55627b5dce5326f0fc5ec18263653902764ac8ace7b *./inference/vllm_ascend/patch/worker/patch_common/patch_config.py
15
+ 63a6ba0d0b0158d4586219c979bf96d5fe87b74123af93f1c8d9ed842db96500 *./inference/vllm_ascend/patch/worker/patch_common/__init__.py
16
+ 09273eb0e4696d2fb530881ba1ad9d331897dd81c0cd2f203ed3d0a475b4d39b *./inference/vllm_ascend/ops/fused_moe.py
17
+ b654e72ece161b3f04080e5c4d2476641c024939ac5308115fe1c65a6c5c7215 *./inference/vllm_ascend/models/open_pangu.py
18
+ e98aa2549f02017a35b07499216fe569e86400684087821820cf2d971c8fcbac *./inference/vllm_ascend/models/__init__.py
19
+ 52a968f10ebaebeb626248afd3e1d1b92f8fbfcaad19ebf05cafbc0bd03192cb *./inference/vllm_ascend/envs.py
20
+ 91eab52cdc19603b7b705b302e25345d849e18fa66875261a1135d5382392123 *./inference/vllm_ascend/entrypoints/openai/tool_parsers/pangu_tool_parser.py
21
+ d07256c9014f911f81269e65aad6c0d7dd61d4e82f5cb399e05285d5c1bc8fa8 *./inference/vllm_ascend/entrypoints/openai/tool_parsers/__init__.py
22
+ f9577c29bc4dc19a4cc41ccfcca17065402c9dd92221bef987c74808b23ed124 *./inference/vllm_ascend/entrypoints/openai/reasoning_parsers/pangu_reasoning_parser.py
23
+ 9070682b058a79d2b2874ba5e07ce72beff6efb870f75cdac30cdcf6ba8fadc7 *./inference/vllm_ascend/entrypoints/openai/reasoning_parsers/__init__.py
24
+ 2254aeca0be7b8922318e10c4a950f39afb30ba5fe3b46564a58671b237ac612 *./inference/vllm_ascend/attention/mla_v1.py
25
+ ba6d7edcf1cf464d6fd787b12a9bda2a16fea0ac0d5d1e54136baec503d6e696 *./inference/vllm_ascend/attention/attention.py
26
+ 4aaf57e6f6d2e139b3847b10ee59d738398ebfc4927a22325b27dad384874aec *./inference/vllm_ascend/_build_info.py
27
+ 5cd02a8ec3b7494e08c6cea07927c377b5da177b9cf704988d3c2ca367528c09 *./inference/split_weight.sh
28
+ a2ea110364d7eecf039e180b42986f1c830acc3fd6ac487f45f93f89f285c669 *./inference/split_weight.py
29
+ bd8f9bb4a9cd7d182a499307746c939514b4487ebbecbdf9527482f2c31aed9a *./inference/runner_config/tp32.yaml
30
+ 379d51e424a24b6599a7e5b85187ced84a8d016c051133432d0a4eaa58a5c900 *./inference/runner_config/tp1.yaml
31
+ 72f5bf3c6e4a990d5c9b2412f4b6bf9534b99ce7e1c196f7a42740a6f006e7ad *./inference/runner.py
32
+ 85841e6a1bc03eff679e3cf00958864f86e3260dc90e69379132f2e3dc6674ad *./inference/model.py
33
+ d646343af70b851f1942ee6dbdc1807686c13a48d4d660ebac777dba29eafdd1 *./inference/generate.sh
34
+ d4d48848ec2c7945670a6196323a570d1830e4edcf5336919641e58bcbc9da0a *./inference/generate.py
35
+ feb36ae08104d00af5bd32e6e20b67a11588e02911b15b3261650b22e1db3ad8 *./generation_config.json
36
+ 634ef0ce7809d0e44d31fecf72863146a08e609a8ba9cbe16b427f0de12fe2e0 *./configuration_openpangu_moe.py
37
+ 9988eca928867694568154f72de3cedaca34b258954652b3c95772d9a5f5118e *./config.json
38
+ fca1e83c102d7a4b9c9f1bcfdc8766df2037d77550ca33dc45c8342fd5b73d0d *./model-00062-of-000062.safetensors
39
+ 0548a05b2afa374e731f8785ad5ee7302335dcba952d3797b759ad920f3f4fce *./model-00061-of-000062.safetensors
40
+ 4351cc1440af69fca1735baa78c09658b08c9ae67fcc076ad4fa9ea2d25e084c *./model-00060-of-000062.safetensors
41
+ 8802bb0b554a1e6404b58b13f328c82fb4f2e2d6d7524f7b4f14d7bb6e81d0f3 *./model-00059-of-000062.safetensors
42
+ 59a1f7667adcfff55145b4b21789e7c494aa80d8bc2fa466d0fa43cd0f3ff43e *./model-00058-of-000062.safetensors
43
+ 65c1cd81cf879606bcddc4af60b506e9bcda80955be72693e642d3a325ffd8e7 *./model-00057-of-000062.safetensors
44
+ b72df9faac3f73cf7191dbb454229f74bc22e1a98d5ad7ef2aea85e0b14d4123 *./model-00056-of-000062.safetensors
45
+ 4ded3c2baad08d7646cf3da25dcecda3bcbdd542f90acf76848d93000b3ab23b *./model-00055-of-000062.safetensors
46
+ 86cc6e275ab190244cc2bfc0cff5689fb7c145623ec54d03ed351df6af829ec2 *./model-00054-of-000062.safetensors
47
+ a5bba51810248ad0a9a54e55b56b946732737ee5f5f289fab580b7c36f21d3f6 *./model-00053-of-000062.safetensors
48
+ cd153a6a0fd5dea192768257c71d47655d7a7a1dbdffdaffeab4dfb7ad27b3eb *./model-00052-of-000062.safetensors
49
+ 85000bb98b77232c47542490df67faec31e3d5105b7d4de34e46466d53220bd6 *./model-00051-of-000062.safetensors
50
+ 0da40653e3dcb3a32aa929674062224f3a74fa3eeefb6dcc5a6698cd9f596708 *./model-00050-of-000062.safetensors
51
+ 678d2b3ac3c73f387a18a446d83dd866ffc925f82ff56541d3b71a747c6b7d06 *./model-00049-of-000062.safetensors
52
+ a49b5e3f1be43a6a3bff9fec25d5d8dffad5a8ebd96c8e389933f40287f1888e *./model-00048-of-000062.safetensors
53
+ 54d984279e435df0426cb052ed99067b7a4a1637234a24a8f05bcb7bbd89d0d2 *./model-00047-of-000062.safetensors
54
+ 4fe5e98cb4e492639bafe37c3e3144c3fe8f27a9fd45e48c92c780f39599becc *./model-00046-of-000062.safetensors
55
+ 97458250006f5949c65b338a26503738200c5fb2415f4cc664a6b224aa9dce70 *./model-00045-of-000062.safetensors
56
+ fd548640dbe4cc04ef4834ac32cada57fb43b0fb9971f0e36030d94041dd1b0d *./model-00044-of-000062.safetensors
57
+ 58847a3be6d606e21941769346e86165891f4fa7242cc84cda7edc9620298ad2 *./model-00043-of-000062.safetensors
58
+ f0be4dc1d9326543061e858d566955aefa9d4a757ebd8f92df648bd9c16a236b *./model-00042-of-000062.safetensors
59
+ d6de2cffa758d32d790c89ac7df7aa313ec1a2923faf52549357a3e8ff16d74f *./model-00041-of-000062.safetensors
60
+ cf9fb7c2ca6e977d9e6f19bd3204ba8b8ad15d33cef9a0ad9f6e9b867319fc8b *./model-00040-of-000062.safetensors
61
+ 2f12c46798cd8f51740964ea58b59ef682eca6ee2ae9402283214f1f0fd4113c *./model-00039-of-000062.safetensors
62
+ 0a531b364281f6060567549b741a1a53a81c1d9211170354bf32c88494b486e9 *./model-00038-of-000062.safetensors
63
+ e3fe1e5795ffb480aa934410434f30711a4dc982f7eea3df5ac1da57783bc619 *./model-00037-of-000062.safetensors
64
+ e36d0dbfbfbb7a792246c931c89d95be2e7afff1872ff455114c58d82b7398d2 *./model-00036-of-000062.safetensors
65
+ af8bfd55a58902cdd8293c67f2c9c5e069ca38f3086645a05bfb9140fe6d51ba *./model-00035-of-000062.safetensors
66
+ 9aa7ab7d596db78af87e87a92e8111fc673000b3132cffe02fa2e164d77b8b32 *./model-00034-of-000062.safetensors
67
+ dfa6683035c9caca4de02afc6b3b4fc5fe786e0ee687ddfa1f72c4171eb33821 *./model-00033-of-000062.safetensors
68
+ cdb597fd48c542dd1508d9ce069b8e7c19007908fbdd2dc8a443d73bee6f1754 *./model-00032-of-000062.safetensors
69
+ 457f476851463f36e3fdcb54e54dc39c9890e64b6b06b100ac5013c3c72385e4 *./model-00031-of-000062.safetensors
70
+ 3d1ee1e248181a08f2401d314f71d342a95e8c9fbee5877800b392be54b1343f *./model-00030-of-000062.safetensors
71
+ 1856663f2077287ba21f3bd821705c1f55bac979e768d67fc31c52727b34ae2f *./model-00029-of-000062.safetensors
72
+ 0cfe77fd4bfac0f9623411d1234872e141b211eb0b3cf61238380f5b34c3c043 *./model-00028-of-000062.safetensors
73
+ 933263ed6db42b1e16407b4400b70260425b1cc11f9ed8fdaad6ef5935f05fb4 *./model-00027-of-000062.safetensors
74
+ d15f1371a364df11676b105291e481fbbd1999ea2152ec0b14905f5d9cb854fa *./model-00026-of-000062.safetensors
75
+ 744dedfcb6bc74624351cc299772ee6be389147301b05f4fe645ebac7bedb53b *./model-00025-of-000062.safetensors
76
+ fd9cf078f3a819e230a78fdf8201e37fd25696f576e1fd0d46fb122deb11c2c8 *./model-00024-of-000062.safetensors
77
+ 3ec91acf57a576b8550d14312dda2acf345ce09769407d1596e0cbdff5a1200a *./model-00023-of-000062.safetensors
78
+ 6df9b0923f9ec0ca7ab20dd0934fc36467d02f02d0cb996904b2f1181d37502b *./model-00022-of-000062.safetensors
79
+ 0e4ac1edb16a327ed30dd75325fdb9d6e7da6d35724fc086c66c74822d6a1de7 *./model-00021-of-000062.safetensors
80
+ 9837d1382bdab63f5781e6b76f3634775a94cd792adadb4d51763693aa670c36 *./model-00020-of-000062.safetensors
81
+ 6a1de462c450f0ddc2e379224cc5ca0ebd31b9bcb9102ed4eb7effe8298de963 *./model-00019-of-000062.safetensors
82
+ 1bf72bcf187656c13ac2f003fa534bbb79d22374711a7d5315766b2299800c4e *./model-00018-of-000062.safetensors
83
+ 41bda994ada5d86c56f660db9350acd900201d1bc131379ce892e60564154e5f *./model-00017-of-000062.safetensors
84
+ 009a80118b5e38fc072a5bcaf20efa832148c14e55765ae2e4176edda11d6608 *./model-00016-of-000062.safetensors
85
+ 310713a4eeebdb60bd62b1c9c0a51bcd6efd6fdefde5c31d4adc8ce854d06f23 *./model-00015-of-000062.safetensors
86
+ 91ef0b66652923f58267a69bb39a18da89ce71732a7a92b458447bb938fb17e7 *./model-00014-of-000062.safetensors
87
+ deabb0cc16d4bcaa6576583acefa13c92e0bc16c6209e8db0fdea2bb55b45501 *./model-00013-of-000062.safetensors
88
+ 95d7980844ee1411f71358f8268d537f0f6c2d5d87a15e101e956d1b3c9c61b2 *./model-00012-of-000062.safetensors
89
+ 4661dd325fda474f8490505bfe9b3652ae2976284825d17cfb7bd60d01aacae2 *./model-00011-of-000062.safetensors
90
+ 9f3168f097ba85d266b062732aab8cb28daae08fef108305291819750be1b384 *./model-00010-of-000062.safetensors
91
+ 91547f7296b08e93903a250eec9a25562f714a7bdfab511ae4fd0f358aaec832 *./model-00009-of-000062.safetensors
92
+ 5ee6ca3f506708b453076c56027fe7366958ef4baa06fdd69f0992e15544ad17 *./model-00008-of-000062.safetensors
93
+ 07fa95a4e6b3e9b3475076f160e839cb715ab426fb78a63d8e2decb636cb8987 *./model-00007-of-000062.safetensors
94
+ a0b071272706a4d4d4ed5a331d60590930a3529c13c4bc158b6f1b0bc3dd8c85 *./model-00006-of-000062.safetensors
95
+ 4e3907f683d7f8382d2a792304155e8533ffa3a94dd4bb5ff825124b0dba3835 *./model-00005-of-000062.safetensors
96
+ 2bd5c0012a3cedf4582160173f857480dd58426282a0e5826609a02b5aff5b3e *./model-00004-of-000062.safetensors
97
+ 3f63aa17d947032e0a524b5798eee3becbfc9a9b6f8a352ead3232e7b34bb289 *./model-00003-of-000062.safetensors
98
+ 1e29a512e3737d1826c80a2277a8b42021878847753aadbe5e1ae2a2df3d7f8d *./model-00002-of-000062.safetensors
99
+ c692b00cbc19ee5a2d4a9bb71496f12a846d14e06d7e0ac79b46abc3243ee115 *./model-00001-of-000062.safetensors
config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "architectures": [
3
+ "PanguUltraMoEForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_openpangu_moe.PanguUltraMoEConfig",
8
+ "AutoModel": "modeling_openpangu_moe.PanguUltraMoEModel",
9
+ "AutoModelForCausalLM": "modeling_openpangu_moe.PanguUltraMoEForCausalLM"
10
+ },
11
+ "num_dense_layers": 3,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 7680,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 18432,
16
+ "attention_kv_lora_dim": 512,
17
+ "max_position_embeddings": 131072,
18
+ "model_type": "pangu_ultra_moe",
19
+ "moe_intermediate_size": 2048,
20
+ "num_routed_experts": 256,
21
+ "num_shared_experts": 1,
22
+ "num_attention_heads": 128,
23
+ "num_experts_per_tok": 8,
24
+ "num_hidden_layers": 61,
25
+ "num_key_value_heads": 128,
26
+ "num_mtp_layers": 1,
27
+ "attention_q_lora_dim": 1536,
28
+ "attention_qk_dim": 128,
29
+ "attention_qk_rope_dim": 64,
30
+ "rms_norm_eps": 1e-05,
31
+ "rope_theta": 25600000,
32
+ "routed_scaling_factor": 2.5,
33
+ "sandwich_norm": true,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "bfloat16",
36
+ "transformers_version": "4.48.2",
37
+ "use_cache": true,
38
+ "attention_v_dim": 128,
39
+ "vocab_size": 153600
40
+ }
configuration_openpangu_moe.py ADDED
@@ -0,0 +1,82 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+
4
+ """openPanguUltraMoE 718B model configuration"""
5
+
6
+ from transformers.configuration_utils import PretrainedConfig
7
+
8
+
9
+ class PanguUltraMoEConfig(PretrainedConfig):
10
+
11
+ model_type = "pangu_ultra_moe"
12
+ keys_to_ignore_at_inference = ["past_key_values"]
13
+
14
+ def __init__(
15
+ self,
16
+ vocab_size=153600,
17
+ hidden_size=7680,
18
+ intermediate_size=18432,
19
+ moe_intermediate_size=2048,
20
+ num_hidden_layers=61,
21
+ num_mtp_layers=1,
22
+ num_attention_heads=128,
23
+ num_key_value_heads=128,
24
+ num_shared_experts=1,
25
+ num_routed_experts=256,
26
+ routed_scaling_factor=2.5,
27
+ attention_kv_lora_dim=512,
28
+ attention_q_lora_dim=1536,
29
+ attention_qk_rope_dim=64,
30
+ attention_v_dim=128,
31
+ attention_qk_dim=128,
32
+ num_experts_per_tok=8,
33
+ num_dense_layers=3,
34
+ norm_topk_prob=True,
35
+ hidden_act="silu",
36
+ max_position_embeddings=131072,
37
+ initializer_range=0.02,
38
+ rms_norm_eps=1e-5,
39
+ use_cache=True,
40
+ pad_token_id=None,
41
+ bos_token_id=0,
42
+ eos_token_id=1,
43
+ tie_word_embeddings=False,
44
+ rope_theta=25600000,
45
+ attention_dropout=0.0,
46
+ **kwargs,
47
+ ):
48
+ self.vocab_size = vocab_size
49
+ self.max_position_embeddings = max_position_embeddings
50
+ self.hidden_size = hidden_size
51
+ self.num_hidden_layers = num_hidden_layers
52
+ self.num_attention_heads = num_attention_heads
53
+ self.num_key_value_heads = num_key_value_heads
54
+ self.hidden_act = hidden_act
55
+ self.initializer_range = initializer_range
56
+ self.rms_norm_eps = rms_norm_eps
57
+ self.use_cache = use_cache
58
+ self.rope_theta = rope_theta
59
+
60
+ self.num_dense_layers = num_dense_layers
61
+ self.intermediate_size = intermediate_size
62
+ self.moe_intermediate_size = moe_intermediate_size
63
+ self.num_shared_experts = num_shared_experts
64
+ self.num_routed_experts = num_routed_experts
65
+ self.routed_scaling_factor = routed_scaling_factor
66
+ self.num_experts_per_tok = num_experts_per_tok
67
+ self.norm_topk_prob = norm_topk_prob
68
+ self.attention_kv_lora_dim = attention_kv_lora_dim
69
+ self.attention_q_lora_dim = attention_q_lora_dim
70
+ self.attention_qk_rope_dim = attention_qk_rope_dim
71
+ self.attention_v_dim = attention_v_dim
72
+ self.attention_qk_dim = attention_qk_dim
73
+ self.attention_dropout = attention_dropout
74
+ self.num_mtp_layers = num_mtp_layers
75
+
76
+ super().__init__(
77
+ pad_token_id=pad_token_id,
78
+ bos_token_id=bos_token_id,
79
+ eos_token_id=eos_token_id,
80
+ tie_word_embeddings=tie_word_embeddings,
81
+ **kwargs,
82
+ )
doc/docker.md ADDED
@@ -0,0 +1,31 @@
1
+ ## Atlas 800T A2一键部署
2
+
3
+ 参考使用vllm-ascend社区开源[镜像](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)
4
+
5
+ ```bash
6
+ export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:v0.9.2rc1
7
+ export NAME=atlas_800t_a2
8
+ docker run --rm \
9
+ --name $NAME \
10
+ --net=host \
11
+ --device /dev/davinci0 \
12
+ --device /dev/davinci1 \
13
+ --device /dev/davinci2 \
14
+ --device /dev/davinci3 \
15
+ --device /dev/davinci4 \
16
+ --device /dev/davinci5 \
17
+ --device /dev/davinci6 \
18
+ --device /dev/davinci7 \
19
+ --device /dev/davinci_manager \
20
+ --device /dev/devmm_svm \
21
+ --device /dev/hisi_hdc \
22
+ -v /usr/local/dcmi:/usr/local/dcmi \
23
+ -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
24
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
25
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
26
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
27
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
28
+ -v /data:/data \
29
+ -v /tmp:/tmp \
30
+ -it $IMAGE bash
31
+ ```
doc/docker_EN.md ADDED
@@ -0,0 +1,31 @@
1
+ ## Deploy on Atlas 800T A2
2
+
3
+ Please refer to the vllm-ascend community open-source [image](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)
4
+
5
+ ```bash
6
+ export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:v0.9.2rc1
7
+ export NAME=atlas_800t_a2
8
+ docker run --rm \
9
+ --name $NAME \
10
+ --net=host \
11
+ --device /dev/davinci0 \
12
+ --device /dev/davinci1 \
13
+ --device /dev/davinci2 \
14
+ --device /dev/davinci3 \
15
+ --device /dev/davinci4 \
16
+ --device /dev/davinci5 \
17
+ --device /dev/davinci6 \
18
+ --device /dev/davinci7 \
19
+ --device /dev/davinci_manager \
20
+ --device /dev/devmm_svm \
21
+ --device /dev/hisi_hdc \
22
+ -v /usr/local/dcmi:/usr/local/dcmi \
23
+ -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
24
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
25
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
26
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
27
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
28
+ -v /data:/data \
29
+ -v /tmp:/tmp \
30
+ -it $IMAGE bash
31
+ ```
doc/vllm_ascend_for_openpangu_ultra_moe_718b.md ADDED
@@ -0,0 +1,215 @@
1
+ ## openPangu-Ultra-MoE-718B在[vllm-ascend](https://github.com/vllm-project/vllm-ascend)部署指导文档
2
+
3
+ ### 部署环境说明
4
+
5
+ Atlas 800T A2(64GB) 64卡可以部署openPangu-Ultra-MoE-718B(bf16),32卡可部署盘古 Ultra MoE (int8),选用vllm-ascend社区镜像v0.9.1-dev,多个节点都需拉取镜像。
6
+ ```bash
7
+ docker pull quay.io/ascend/vllm-ascend:v0.9.1-dev
8
+ ```
9
+
10
+ * 网络环境检测
11
+ 在每个节点上依次执行以下命令。所有结果必须为 success 且状态必须为 UP:
12
+ ```bash
13
+ # Check the remote switch ports
14
+ for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
15
+ # Get the link status of the Ethernet ports (UP or DOWN)
16
+ for i in {0..7}; do hccn_tool -i $i -link -g ; done
17
+ # Check the network health status
18
+ for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
19
+ # View the network detected IP configuration
20
+ for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
21
+ # View gateway configuration
22
+ for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
23
+ # View NPU network configuration
24
+ cat /etc/hccn.conf
25
+ ```
26
+
27
+ ### 镜像启动和推理代码适配
28
+
29
+ 以下操作需在每个节点都执行。
30
+
31
+ 启动镜像。
32
+ ```bash
33
+ # Update the vllm-ascend image
34
+ export IMAGE=quay.io/ascend/vllm-ascend:v0.9.1-dev # Use correct image id
35
+ export NAME=vllm-ascend # Custom docker name
36
+
37
+ # Run the container using the defined variables
38
+ # Note if you are running bridge network with docker, Please expose available ports for multiple nodes communication in advance
39
+ # To prevent device interference from other docker containers, add the argument "--privileged"
40
+ docker run --rm \
41
+ --name $NAME \
42
+ --network host \
43
+ --device /dev/davinci0 \
44
+ --device /dev/davinci1 \
45
+ --device /dev/davinci2 \
46
+ --device /dev/davinci3 \
47
+ --device /dev/davinci4 \
48
+ --device /dev/davinci5 \
49
+ --device /dev/davinci6 \
50
+ --device /dev/davinci7 \
51
+ --device /dev/davinci_manager \
52
+ --device /dev/devmm_svm \
53
+ --device /dev/hisi_hdc \
54
+ -v /usr/local/dcmi:/usr/local/dcmi \
55
+ -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
56
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
57
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
58
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
59
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
60
+ -v /mnt/sfs_turbo/.cache:/root/.cache \
61
+ -it $IMAGE bash
62
+ ```
63
+
64
+ 如果未进入容器,需以root用户进入容器。
65
+ ```
66
+ docker exec -itu root $NAME /bin/bash
67
+ ```
68
+
69
+ 下载vllm(v0.9.2),替换镜像内置的vllm代码。
70
+ ```bash
71
+ pip install --no-deps vllm==0.9.2 pybase64==1.4.1
72
+ ```
73
+
74
+ 下载[vllm-ascend (v0.9.2rc1)](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.2rc1),替换镜像内置的vllm-ascend代码(`/vllm-workspace/vllm-ascend/`)。例如下载Assets中的[Source code
75
+ (tar.gz)](https://github.com/vllm-project/vllm-ascend/archive/refs/tags/v0.9.2rc1.tar.gz)得到v0.9.2rc1.tar.gz,然后解压并替换:
76
+ ```bash
77
+ tar -zxvf vllm-ascend-0.9.2rc1.tar.gz -C /vllm-workspace/vllm-ascend/ --strip-components=1
78
+ export PYTHONPATH=/vllm-workspace/vllm-ascend/:${PYTHONPATH}
79
+ ```
80
+
81
+ 使用当前代码仓中适配盘古模型的vllm-ascend代码替换`/vllm-workspace/vllm-ascend/vllm_ascend/`中的部分代码。
82
+ ```bash
83
+ yes | cp -r inference/vllm_ascend/* /vllm-workspace/vllm-ascend/vllm_ascend/
84
+ ```
85
+
86
+ ### BF16推理
87
+
88
+ 以下操作需在每个节点都执行。
89
+
90
+ 运行命令:
91
+ ```bash
92
+ # This obtained through ifconfig
93
+ # nic_name is the network interface name corresponding to local_ip
94
+ local_ip=`hostname -I | cut -d' ' -f1`
95
+ nic_name=$(ifconfig | grep -B 1 "$local_ip" | head -n 1 | awk '{print $1}' | sed 's/://')
96
+ export HCCL_IF_IP=$local_ip
97
+ export GLOO_SOCKET_IFNAME=$nic_name
98
+ export TP_SOCKET_IFNAME=$nic_name
99
+ export HCCL_SOCKET_IFNAME=$nic_name
100
+ export OMP_PROC_BIND=false
101
+ export OMP_NUM_THREADS=100
102
+ export VLLM_USE_V1=1
103
+ export HCCL_BUFFSIZE=1024
104
+ export VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP=1
105
+ export VLLM_ASCEND_ENABLE_TOP_N_SIGMA=1 # enable top-n-sigma sampling
106
+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
107
+
108
+ MASTER_NODE_IP=xxx.xxx.xxx.xxx # master/head node ip
109
+ NODE_RANK=xxx # current node rank (0~7)
110
+ NUM_NODES=8 # number of nodes
111
+ NUM_NPUS_LOCAL=8 # number of NPUs per node
112
+ DATA_PARALLEL_SIZE_LOCAL=4 # DP size per node, can be set to 1, 2, or 4
113
+ LOCAL_CKPT_DIR=/root/.cache/pangu_ultra_moe # The pangu_ultra_moe bf16 weight
114
+ # Specifying HOST=127.0.0.1 (localhost) means the server can only be accessed from the master device.
115
+ # Specifying HOST=0.0.0.0 allows the vLLM server to be accessed from other devices on the same network or even from the internet, provided proper network configuration (e.g., firewall rules, port forwarding) is in place.
116
+ HOST=xxx.xxx.xxx.xxx
117
+
118
+ if [[ $NODE_RANK -ne 0 ]]; then
119
+ headless="--headless"
120
+ else
121
+ headless=""
122
+ fi
123
+
124
+ vllm serve $LOCAL_CKPT_DIR \
125
+ --host $HOST \
126
+ --port 8004 \
127
+ --data-parallel-size $((NUM_NODES*DATA_PARALLEL_SIZE_LOCAL)) \
128
+ --data-parallel-size-local $DATA_PARALLEL_SIZE_LOCAL \
129
+ --data-parallel-start-rank $((DATA_PARALLEL_SIZE_LOCAL*NODE_RANK)) \
130
+ --data-parallel-address $MASTER_NODE_IP \
131
+ --data-parallel-rpc-port 13389 \
132
+ --tensor-parallel-size $((NUM_NPUS_LOCAL/DATA_PARALLEL_SIZE_LOCAL)) \
133
+ --seed 1024 \
134
+ --served-model-name pangu_ultra_moe \
135
+ --enable-expert-parallel \
136
+ --max-num-seqs 8 \
137
+ --max-model-len 32768 \
138
+ --max-num-batched-tokens 4096 \
139
+ --trust-remote-code \
140
+ --no-enable-prefix-caching \
141
+ --gpu-memory-utilization 0.9 \
142
+ ${headless} \
143
+ --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
144
+ ```
145
+
146
+ ### 发请求测试
147
+
148
+ 服务启动后,在主节点或者其他节点向主节点发送测试请求:
149
+
150
+ ```bash
151
+ curl http://${MASTER_NODE_IP}:8004/v1/chat/completions \
152
+ -H "Content-Type: application/json" \
153
+ -d '{
154
+ "model": "pangu_ultra_moe",
155
+ "messages": [
156
+ {
157
+ "role": "user",
158
+ "content": "Who are you?"
159
+ }
160
+ ],
161
+ "max_tokens": 512,
162
+ "temperature": 0.7,
163
+ "top_p": 1.0,
164
+ "top_k": -1,
165
+ "vllm_xargs": {"top_n_sigma": 0.05}
166
+ }'
167
+ ```
168
+
169
+ ### Int8推理
170
+
171
+ #### ModelSlim量化
172
+
173
+ openPangu-Ultra-MoE-718B模型支持使用开源量化框架[ModelSlim](https://gitcode.com/Ascend/msit/blob/br_noncom_pangu_ultra_moe_8.1.RC1_POC_20251231/msmodelslim/example/Pangu/README.md)进行量化,当前模型支持W8A8权重激活量化。
174
+
175
+ ##### openPangu-Ultra-MoE-718B W8A8 动态量化
176
+
177
+ ```bash
178
+ python3 quant_pangu_ultra_moe_w8a8.py --model_path {浮点权重路径} --save_path {W8A8量化权重路径} --dynamic
179
+ ```
180
+
181
+ ##### openPangu-Ultra-MoE-718B W8A8 混合量化 + MTP 量化
182
+
183
+ 生成openPangu-Ultra-MoE-718B模型W8A8量化权重(含MTP)
184
+ ```bash
185
+ python3 quant_pangu_ultra_moe_w8a8.py --model_path {浮点权重路径} --save_path {W8A8量化权重路径} --dynamic --quant_mtp mix
186
+ ```
187
+
188
+ 相较于BF16模型,int8量化模型的config.json增加以下字段:
189
+ ```
190
+ "mla_quantize": "w8a8",
191
+ "quantize": "w8a8_dynamic",
192
+ ```
193
+
194
+ 如果MTP量化,增加字段:
195
+ ```
196
+ "mtp_quantize": "w8a8_dynamic",
197
+ ```
198
+ ModelSlim量化脚本生成量化模型后会自动追加上述字段到config.json中。
199
+
200
+ #### Int8推理
201
+
202
+ 相较于BF16模型推理,int8量化模型推理仅需使用4节点(32卡),修改变量
203
+ ```bash
204
+ NUM_NODES=4
205
+ ```
206
+
207
+ 启动命令需要修改为对应的量化权重路径,另外增加`--quantization ascend`:
208
+ ```bash
209
+ LOCAL_CKPT_DIR=/root/.cache/pang_ultra_moe_w8a8
210
+
211
+ vllm serve $LOCAL_CKPT_DIR \
212
+ ...
213
+ --quantization ascend
214
+ ...
215
+ ```
doc/vllm_ascend_for_openpangu_ultra_moe_718b_EN.md ADDED
@@ -0,0 +1,216 @@
1
+ ## Deployment Guide of the openPangu-Ultra-MoE-718B Based on [vllm-ascend](https://github.com/vllm-project/vllm-ascend)
2
+
3
+ ### Deployment Environment Description
4
+
5
+ The Atlas 800T A2 (64 GB) can deploy openPangu-Ultra-MoE-718B (bf16) on 64 NPUs and openPangu-Ultra-MoE-718B (int8) on 32 NPUs. The vllm-ascend community image v0.9.1-dev is used and must be pulled on every node.
6
+ ```bash
7
+ docker pull quay.io/ascend/vllm-ascend:v0.9.1-dev
8
+ ```
9
+
10
+ * Network Detection
11
+ Run the following commands on each node: All the results must be success and the status must be UP.
12
+ ```bash
13
+ # Check the remote switch ports
14
+ for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
15
+ # Get the link status of the Ethernet ports (UP or DOWN)
16
+ for i in {0..7}; do hccn_tool -i $i -link -g ; done
17
+ # Check the network health status
18
+ for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
19
+ # View the network detected IP configuration
20
+ for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
21
+ # View gateway configuration
22
+ for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
23
+ # View NPU network configuration
24
+ cat /etc/hccn.conf
25
+ ```
26
+
27
+ ### Docker Startup and Inference Code Adaptation
28
+
29
+ Perform the following operations on all nodes.
30
+
31
+ Run the following command to start the container:
32
+ ```bash
33
+ # Update the vllm-ascend image
34
+ export IMAGE=quay.io/ascend/vllm-ascend:v0.9.1-dev # Use correct image id
35
+ export NAME=vllm-ascend # Custom docker name
36
+
37
+ # Run the container using the defined variables
38
+ # Note if you are running bridge network with docker, Please expose available ports for multiple nodes communication in advance
39
+ # To prevent device interference from other docker containers, add the argument "--privileged"
40
+ docker run --rm \
41
+ --name $NAME \
42
+ --network host \
43
+ --device /dev/davinci0 \
44
+ --device /dev/davinci1 \
45
+ --device /dev/davinci2 \
46
+ --device /dev/davinci3 \
47
+ --device /dev/davinci4 \
48
+ --device /dev/davinci5 \
49
+ --device /dev/davinci6 \
50
+ --device /dev/davinci7 \
51
+ --device /dev/davinci_manager \
52
+ --device /dev/devmm_svm \
53
+ --device /dev/hisi_hdc \
54
+ -v /usr/local/dcmi:/usr/local/dcmi \
55
+ -v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
56
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
57
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
58
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
59
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
60
+ -v /mnt/sfs_turbo/.cache:/root/.cache \
61
+ -it $IMAGE bash
62
+ ```
63
+
64
+ If not inside the container, enter the container as the root user:
65
+ ```
66
+ docker exec -itu root $NAME /bin/bash
67
+ ```
68
+
69
+ Download vllm (v0.9.2) to replace the built-in vllm code of the image.
70
+ ```bash
71
+ pip install --no-deps vllm==0.9.2 pybase64==1.4.1
72
+ ```
73
+
74
+ Download [vllm-ascend (v0.9.2rc1)](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.2rc1) and replace the built-in vllm-ascend code in the image (/vllm-workspace/vllm-ascend/). For example, download [Source code (tar.gz)](https://github.com/vllm-project/vllm-ascend/archive/refs/tags/v0.9.2rc1.tar.gz) from Assets to get v0.9.2rc1.tar.gz, then extract and replace:
75
+
76
+ ```bash
77
+ tar -zxvf vllm-ascend-0.9.2rc1.tar.gz -C /vllm-workspace/vllm-ascend/ --strip-components=1
78
+ export PYTHONPATH=/vllm-workspace/vllm-ascend/:${PYTHONPATH}
79
+ ```
80
+
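+ A quick, optional way to confirm that the replacement code is the one being picked up (run inside the container):
+ ```bash
+ python3 -c "import vllm; print(vllm.__version__)"   # expect 0.9.2
+ python3 -c "import vllm_ascend, os; print(os.path.dirname(vllm_ascend.__file__))"
+ # the second path should point into /vllm-workspace/vllm-ascend/vllm_ascend
+ ```
+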
81
+ Use the Pangu model-adapted vllm-ascend code from the current repository to replace parts of the code in `/vllm-workspace/vllm-ascend/vllm_ascend/`:
82
+
83
+ ```bash
84
+ yes | cp -r inference/vllm_ascend/* /vllm-workspace/vllm-ascend/vllm_ascend/
85
+ ```
86
+
87
+ ### BF16 Inference
88
+
89
+ Perform the following operations on all nodes.
90
+
91
+ Run command:
92
+ ```bash
93
+ # This obtained through ifconfig
94
+ # nic_name is the network interface name corresponding to local_ip
95
+ local_ip=`hostname -I | cut -d' ' -f1`
96
+ nic_name=$(ifconfig | grep -B 1 "$local_ip" | head -n 1 | awk '{print $1}' | sed 's/://')
97
+ export HCCL_IF_IP=$local_ip
98
+ export GLOO_SOCKET_IFNAME=$nic_name
99
+ export TP_SOCKET_IFNAME=$nic_name
100
+ export HCCL_SOCKET_IFNAME=$nic_name
101
+ export OMP_PROC_BIND=false
102
+ export OMP_NUM_THREADS=100
103
+ export VLLM_USE_V1=1
104
+ export HCCL_BUFFSIZE=1024
105
+ export VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP=1
106
+ export VLLM_ASCEND_ENABLE_TOP_N_SIGMA=1 # enable top-n-sigma sampling
107
+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
108
+
109
+ MASTER_NODE_IP=xxx.xxx.xxx.xxx # master/head node ip
110
+ NODE_RANK=xxx # current node rank (0~7)
111
+ NUM_NODES=8 # number of nodes
112
+ NUM_NPUS_LOCAL=8 # number of NPUs per node
113
+ DATA_PARALLEL_SIZE_LOCAL=4 # DP size per node, can be set to 1, 2, or 4
114
+ LOCAL_CKPT_DIR=/root/.cache/pangu_ultra_moe # The pangu_ultra_moe bf16 weight
115
+ # Specifying HOST=127.0.0.1 (localhost) means the server can only be accessed from the master device.
116
+ # Specifying HOST=0.0.0.0 allows the vLLM server to be accessed from other devices on the same network or even from the internet, provided proper network configuration (e.g., firewall rules, port forwarding) is in place.
117
+ HOST=xxx.xxx.xxx.xxx
118
+
119
+ if [[ $NODE_RANK -ne 0 ]]; then
120
+ headless="--headless"
121
+ else
122
+ headless=""
123
+ fi
124
+
125
+ vllm serve $LOCAL_CKPT_DIR \
126
+ --host $HOST \
127
+ --port 8004 \
128
+ --data-parallel-size $((NUM_NODES*DATA_PARALLEL_SIZE_LOCAL)) \
129
+ --data-parallel-size-local $DATA_PARALLEL_SIZE_LOCAL \
130
+ --data-parallel-start-rank $((DATA_PARALLEL_SIZE_LOCAL*NODE_RANK)) \
131
+ --data-parallel-address $MASTER_NODE_IP \
132
+ --data-parallel-rpc-port 13389 \
133
+ --tensor-parallel-size $((NUM_NPUS_LOCAL/DATA_PARALLEL_SIZE_LOCAL)) \
134
+ --seed 1024 \
135
+ --served-model-name pangu_ultra_moe \
136
+ --enable-expert-parallel \
137
+ --max-num-seqs 8 \
138
+ --max-model-len 32768 \
139
+ --max-num-batched-tokens 4096 \
140
+ --trust-remote-code \
141
+ --no-enable-prefix-caching \
142
+ --gpu-memory-utilization 0.9 \
143
+ ${headless} \
144
+ --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
145
+ ```
146
+
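+ For reference, the parallelism arguments above multiply out as follows (a worked example with the default values; adjust if you change them):
+ ```bash
+ # NUM_NODES=8, NUM_NPUS_LOCAL=8, DATA_PARALLEL_SIZE_LOCAL=4
+ # data-parallel-size   = 8 * 4 = 32
+ # tensor-parallel-size = 8 / 4 = 2
+ # NPUs used            = 32 * 2 = 64   (8 nodes x 8 NPUs, matching the bf16 deployment)
+ ```
+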
147
+ ### Test Request
148
+
149
+ After the server is launched, send a test request from the master node or any other node:
150
+
151
+ ```bash
152
+ curl http://${MASTER_NODE_IP}:8004/v1/chat/completions \
153
+ -H "Content-Type: application/json" \
154
+ -d '{
155
+ "model": "pangu_ultra_moe",
156
+ "messages": [
157
+ {
158
+ "role": "user",
159
+ "content": "Who are you?"
160
+ }
161
+ ],
162
+ "max_tokens": 512,
163
+ "temperature": 0.7,
164
+ "top_p": 1.0,
165
+ "top_k": -1,
166
+ "vllm_xargs": {"top_n_sigma": 0.05}
167
+ }'
168
+ ```
169
+
170
+ ### Int8 Inference
171
+
172
+ #### ModelSlim Quantization
173
+
174
+ The openPangu-Ultra-MoE-718B model supports quantization with the open-source quantization framework [ModelSlim](https://gitcode.com/Ascend/msit/blob/br_noncom_pangu_ultra_moe_8.1.RC1_POC_20251231/msmodelslim/example/Pangu/README.md). The current model supports W8A8 weight-and-activation quantization.
175
+
176
+ ##### openPangu-Ultra-MoE-718B W8A8 Dynamic quantization
177
+
178
+ ```bash
179
+ python3 quant_pangu_ultra_moe_w8a8.py --model_path {bf16 weight path} --save_path {W8A8 weight path} --dynamic
180
+ ```
181
+
182
+ ##### openPangu-Ultra-MoE-718B W8A8 Hybrid quantization + MTP quantization
183
+
184
+ Generate the W8A8 quantized weights (including MTP) for the openPangu-Ultra-MoE-718B model:
185
+ ```bash
186
+ python3 quant_pangu_ultra_moe_w8a8.py --model_path {bf16 weight path} --save_path {W8A8 weight path} --dynamic --quant_mtp mix
187
+ ```
188
+
189
+ Compared with the BF16 model, the following fields are added to the config.json file of the int8 quantization model:
190
+ ```
191
+ "mla_quantize": "w8a8",
192
+ "quantize": "w8a8_dynamic",
193
+ ```
194
+
195
+ If MTP quantization is enabled, the following field is also added:
196
+ ```
197
+ "mtp_quantize": "w8a8_dynamic",
198
+ ```
199
+ The ModelSlim quantization script automatically appends the preceding fields to config.json when it generates the quantized model.
200
+
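+ A quick way to confirm the fields were written (an optional sketch; assumes `jq` is available and `{W8A8 weight path}` is the directory passed to `--save_path`):
+ ```bash
+ jq '{quantize, mla_quantize, mtp_quantize}' "{W8A8 weight path}/config.json"
+ ```
+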
201
+ #### Int8 Inference
202
+
203
+ Compared with BF16 inference, int8 quantized-model inference requires only four nodes (32 NPUs). Modify the variable accordingly:
204
+ ```bash
205
+ NUM_NODES=4
206
+ ```
207
+
208
+ In the startup command, change the checkpoint directory to the corresponding quantized weight path and add `--quantization ascend`:
209
+ ```bash
210
+ LOCAL_CKPT_DIR=/root/.cache/pangu_ultra_moe_w8a8
211
+
212
+ vllm serve $LOCAL_CKPT_DIR \
213
+ ...
214
+ --quantization ascend
215
+ ...
216
+ ```
generation_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 45892,
5
+ "do_sample": true,
6
+ "temperature": 0.7,
7
+ "top_p": 1.0,
8
+ "top_n_sigma": 0.05,
9
+ "top_k": -1,
10
+ "transformers_version": "4.48.2"
11
+ }
inference/generate.py ADDED
@@ -0,0 +1,106 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+
4
+ import argparse
5
+ import logging
6
+
7
+ import torch
8
+ import yaml
9
+ from model import PanguUltraMoEForCausalLM
10
+ from runner import ModelRunner
11
+
12
+ root_logger = logging.getLogger()
13
+ root_logger.handlers.clear()
14
+ logging.basicConfig(
15
+ format="%(asctime)s - %(levelname)s - [LLM](%(filename)s:%(lineno)d): %(message)s",
16
+ level=logging.INFO,
17
+ )
18
+ torch.manual_seed(42)
19
+ torch.npu.manual_seed_all(42)
20
+
21
+ """
22
+ NOTE:
23
+ For enhancing model safety, we recommend the following system prompt.
24
+ It is suggested to be removed for other normal use cases and model evaluation.
25
+ """
26
+ safe_word = "你必须严格遵守法律法规和社会道德规范。" \
27
+ "生成任何内容时,都应避免涉及暴力、色情、恐怖主义、种族歧视、性别歧视等不当内容。" \
28
+ "一旦检测到输入或输出有此类倾向,应拒绝回答并发出警告。例如,如果输入内容包含暴力威胁或色情描述," \
29
+ "应返回错误信息:“您的输入包含不当内容,无法处理。”"
30
+
31
+
32
+ # basic token generator
33
+ def generate_default_prompt(bs):
34
+ # prompt batch size define actual model forward batch size
35
+ fast_thinking_template = "[unused9]系统:%s[unused10][unused9]用户:{} /no_think[unused10][unused9]助手:" % (safe_word,)
36
+ slow_thinking_template = "[unused9]系统:%s[unused10][unused9]用户:{}[unused10][unused9]助手:" % (safe_word,)
37
+ preset_prompts = [slow_thinking_template.format(args.prompt)]
38
+ preset_prompts = preset_prompts * (bs // len(preset_prompts) + 1)
39
+ preset_prompts = preset_prompts[:bs]
40
+ logging.info(f"prompt batch size: {bs}")
41
+ return preset_prompts
42
+
43
+
44
+ def generate_chat_prompt(bs):
45
+ preset_prompts = [
46
+ {"role": "system", "content": safe_word},
47
+ {"role": "user", "content": args.prompt}
48
+ ]
49
+ preset_prompts = [preset_prompts] * (bs // len(preset_prompts) + 1)
50
+ preset_prompts = preset_prompts[:bs]
51
+ logging.info(f"chat prompt batch size: {bs}")
52
+ return preset_prompts
53
+
54
+
55
+ def generate_prompt(bs, tokenizer_mode):
56
+ if tokenizer_mode == "default":
57
+ return generate_default_prompt(bs)
58
+ else:
59
+ return generate_chat_prompt(bs)
60
+
61
+
62
+ def parse_args():
63
+ parser = argparse.ArgumentParser(description="llm run parameters")
64
+ parser.add_argument("--yaml_file_path", type=str, help="inference configurations")
65
+ parser.add_argument(
66
+ "--local_rank",
67
+ type=int,
68
+ default=0,
69
+ help="Local rank id for torch distributed launch",
70
+ )
71
+ parser.add_argument("--prompt", type=str, default="3*7=?", help="user prompts")
72
+ parser_args = parser.parse_args()
73
+ return parser_args
74
+
75
+
76
+ def main(runner_config):
77
+ bs = runner_config.get("data_config").get("batch_size", 1)
78
+ tokenizer_mode = runner_config.get("model_config").get("tokenizer_mode", "default")
79
+ preset_prompts = generate_prompt(bs, tokenizer_mode)
80
+ logging.info(f"input prompts: {preset_prompts}")
81
+ model_runner = ModelRunner(runner_config)
82
+ torch.npu.set_compile_mode(jit_compile=False)
83
+ model_runner.init_model(PanguUltraMoEForCausalLM)
84
+ # warmup
85
+ model_runner.model_generate(preset_prompts, warm_up=True)
86
+ # generate
87
+ model_runner.model_generate(preset_prompts)
88
+
89
+
90
+ def read_yaml(yaml_file_path):
91
+ data = None
+ try:
92
+ with open(yaml_file_path, "r", encoding="utf-8") as file:
93
+ data = yaml.safe_load(file)
94
+ except FileNotFoundError:
95
+ logging.error(f"No such yaml file: {yaml_file_path}")
96
+ except yaml.YAMLError as e:
97
+ logging.error(f"Load yaml file failed: {e}")
98
+ return data
99
+
100
+
101
+ if __name__ == "__main__":
102
+ args = parse_args()
103
+ yaml_file_path = args.yaml_file_path
104
+ runner_config = read_yaml(yaml_file_path)
105
+ main(runner_config)
106
+ logging.info("model run success")
inference/generate.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ # coding=utf-8
3
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
4
+ #
5
+
6
+ # use example:
7
+ # bash generate.sh ${NNODES} ${NODE_RANK} ${NPROC_PER_NODE} ${MASTER_ADDR} ${PROMPTS}
8
+
9
+ # input args
10
+ export NNODES=$1
11
+ export NODE_RANK=$2
12
+ export NPROC_PER_NODE=$3
13
+ export MASTER_ADDR=$4 # master node IP
14
+ export prompt=$5
15
+ export MASTER_PORT=6038 # master node port
16
+ export WORLD_SIZE=32
17
+ export YAML=runner_config/tp32.yaml
18
+ export RANK_OFFSET=`expr $NODE_RANK \* ${NPROC_PER_NODE}`
19
+
20
+ # setup env
21
+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
22
+ export HCCL_SOCKET_IFNAME=enp # network interface name prefix; set it according to your environment, e.g. enp or eth
23
+ export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'` # get current node IP
24
+ export HCCL_IF_BASE_PORT=23456
25
+ export HCCL_OP_EXPANSION_MODE=AIV
26
+ export HCCL_CONNECT_TIMEOUT=1200
27
+ export HCCL_EXEC_TIMEOUT=1200
28
+ if [[ -d "/usr/local/Ascend/ascend-toolkit/latest" ]]; then
29
+ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
30
+ else
31
+ export ASCEND_HOME_PATH=/usr/local/Ascend/latest
32
+ fi
33
+ export PYTHONPATH=${PYTHONPATH}:${ASCEND_HOME_PATH}/python/site-packages/
34
+
35
+ # set result path
36
+ DATE=`date +%Y%m%d`
37
+ export MODEL_NAME="pangu_ultra_moe"
38
+ NAME=${MODEL_NAME}_${WORLD_SIZE}p
39
+ export TASK_QUEUE_ENABLE=2 # eager mode: optimize host-side dispatch performance
40
+ export RES_PATH="res/${DATE}/${NAME}"
41
+ WORK_DIR=`pwd`
42
+ DUMP_PRECISION_PATH=${WORK_DIR}'/'${RES_PATH}'/dump_data'
43
+ mkdir -p ${WORK_DIR}'/'${RES_PATH}
44
+ mkdir -p ${DUMP_PRECISION_PATH}
45
+
46
+ # launch multi proc
47
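+ # bind each local rank to a disjoint range of CPU cores via taskset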
+ cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
48
+ avg_core_per_rank=`expr $cores \/ $NPROC_PER_NODE`
49
+ core_gap=`expr $avg_core_per_rank \- 1`
50
+ for((i=0; i<${NPROC_PER_NODE}; i++))
51
+ do
52
+ echo $i
53
+ start=`expr $i \* $avg_core_per_rank`
54
+ end=`expr $start \+ $core_gap`
55
+ cmdopt=$start"-"$end
56
+ export LOCAL_RANK=$i
57
+ export RANK=$(expr $i + $RANK_OFFSET)
58
+ export RANK_ID=$RANK
59
+ if [ $i -eq 0 ];then
60
+ taskset -c $cmdopt python3 generate.py \
61
+ --prompt "$prompt" \
62
+ --yaml_file_path=${YAML} 2>&1 | tee ${WORK_DIR}/${RES_PATH}/log_${LOCAL_RANK}.log &
63
+ else
64
+ taskset -c $cmdopt python3 generate.py \
65
+ --prompt "$prompt" \
66
+ --yaml_file_path=${YAML} &> ${WORK_DIR}/${RES_PATH}/log_${LOCAL_RANK}.log &
67
+ fi
68
+ done
69
+
70
+ wait
inference/model.py ADDED
@@ -0,0 +1,918 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ import sys
19
+ import warnings
20
+ from typing import Dict, List, Optional, Tuple
21
+
22
+ import torch
23
+ import torch.distributed as dist
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ import torch_npu
27
+ from torch import nn
28
+ from torch.distributed.distributed_c10d import _world
29
+ from transformers.activations import ACT2FN
30
+ from transformers.cache_utils import Cache
31
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
34
+ from transformers.utils.import_utils import is_torch_fx_available
35
+
36
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
37
+ from configuration_openpangu_moe import PanguUltraMoEConfig
38
+
39
+ if is_torch_fx_available():
40
+ if not is_torch_greater_or_equal_than_1_13:
41
+ import torch.fx
42
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
43
+
44
+
45
+ class PanguUltraMoERMSNorm(nn.Module):
46
+ def __init__(self, hidden_dim, epsilon=1e-5):
47
+ super().__init__()
48
+ self.weight = nn.Parameter(torch.empty(hidden_dim))
49
+ self.epsilon = epsilon
50
+
51
+ def forward(self, hidden_states, *args):
52
+ if len(args) == 0:
53
+ result = torch_npu.npu_rms_norm(hidden_states, self.weight, self.epsilon)[0]
54
+ return result
55
+ elif len(args) == 1 and args[0] is None:
56
+ result = torch_npu.npu_rms_norm(hidden_states, self.weight, self.epsilon)[0]
57
+ residual = hidden_states
58
+ return (result, residual)
59
+ elif len(args) == 1:
60
+ residual = args[0]
61
+ y, _, x = torch_npu.npu_add_rms_norm(
62
+ residual, hidden_states, self.weight, self.epsilon
63
+ )
64
+ return (y, x)
65
+ else:
66
+ raise NotImplementedError("PanguUltraMoERMSNorm received an unexpected number of arguments")
67
+
68
+
69
+ class PanguUltraMoERotaryEmbedding(nn.Module):
70
+ def __init__(
71
+ self, dim, max_position_embeddings=131072, base=25600000.0, device=None
72
+ ):
73
+ super().__init__()
74
+ self.dim = dim
75
+ self.max_position_embeddings = max_position_embeddings
76
+ self.base = base
77
+ self._set_cache(
78
+ seq_len=max_position_embeddings,
79
+ device=device,
80
+ dtype=torch.get_default_dtype(),
81
+ )
82
+
83
+ def _set_cache(self, seq_len, device, dtype):
84
+ self.max_seq_len_cached = seq_len
85
+ dim = self.dim
86
+
87
+ inv_freq = 1.0 / (
88
+ self.base
89
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
90
+ )
91
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
92
+
93
+ t = torch.arange(seq_len, device=device, dtype=torch.float32)
94
+
95
+ freqs = torch.outer(t, inv_freq)
96
+
97
+ emb = torch.cat((freqs, freqs), dim=-1)
98
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
99
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
100
+
101
+ def forward(self, x, kv_len, max_seq_len=None):
102
+ if max_seq_len is None:
103
+ self._set_cache(seq_len=kv_len, device=x.device, dtype=x.dtype)
104
+ elif max_seq_len > self.max_seq_len_cached:
105
+ self._set_cache(seq_len=max_seq_len, device=x.device, dtype=x.dtype)
106
+
107
+ batch_size = x.shape[0]
108
+ seq_len = x.shape[1]
109
+ if seq_len == 1:
110
+ cos = (
111
+ torch.index_select(self.cos_cached, dim=0, index=kv_len)
112
+ .unsqueeze(1)
113
+ .unsqueeze(1)
114
+ )
115
+ sin = (
116
+ torch.index_select(self.sin_cached, dim=0, index=kv_len)
117
+ .unsqueeze(1)
118
+ .unsqueeze(1)
119
+ )
120
+ else:
121
+ cos = (
122
+ self.cos_cached[:seq_len]
123
+ .unsqueeze(0)
124
+ .unsqueeze(2)
125
+ .repeat(batch_size, 1, 1, 1)
126
+ )
127
+ sin = (
128
+ self.sin_cached[:seq_len]
129
+ .unsqueeze(0)
130
+ .unsqueeze(2)
131
+ .repeat(batch_size, 1, 1, 1)
132
+ )
133
+
134
+ cos = cos[0, :, 0, :]
135
+ sin = sin[0, :, 0, :]
136
+ return (
137
+ cos.to(dtype=x.dtype),
138
+ sin.to(dtype=x.dtype),
139
+ )
140
+
141
+
142
+ def rotate_half(x):
143
+ """Rotates half the hidden dims of the input."""
144
+ x1 = x[..., : x.shape[-1] // 2]
145
+ x2 = x[..., x.shape[-1] // 2 :]
146
+ return torch.cat((-x2, x1), dim=-1)
147
+
148
+
149
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
150
+ """Applies Rotary Position Embedding to the query and key tensors.
151
+
152
+ Args:
153
+ q (`torch.Tensor`): The query tensor.
154
+ k (`torch.Tensor`): The key tensor.
155
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
156
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
157
+ position_ids (`torch.Tensor`):
158
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
159
+ used to pass offsetted position ids when working with a KV-cache.
160
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
161
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
162
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
163
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
164
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
165
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
166
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
167
+ Returns:
168
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
169
+ """
170
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
171
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
172
+
173
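+ # de-interleave the head dim (x0,x1,x2,x3,...) -> (x0,x2,...,x1,x3,...) so rotate_half can split it into two contiguous halves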
+ b, h, s, d = q.shape
174
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
175
+
176
+ b, h, s, d = k.shape
177
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
178
+
179
+ q_embed = (q * cos) + (rotate_half(q) * sin)
180
+ k_embed = (k * cos) + (rotate_half(k) * sin)
181
+ return q_embed, k_embed
182
+
183
+
184
+ class MLP(nn.Module):
185
+ def __init__(self, config, runner_config, hidden_size=None, intermediate_size=None):
186
+ super().__init__()
187
+ self.runner_config = runner_config
188
+ self.moe_tp_size = self.runner_config.get("parallel_config").get(
189
+ "moe_tp_size", 1
190
+ )
191
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
192
+ self.intermediate_size = (
193
+ config.intermediate_size if intermediate_size is None else intermediate_size
194
+ )
195
+
196
+ self.intermediate_size_per_rank = self.intermediate_size // self.moe_tp_size
197
+ self.merge_up_gate_proj = nn.Linear(
198
+ self.hidden_size, self.intermediate_size_per_rank * 2, bias=False
199
+ )
200
+ self.down_proj = nn.Linear(
201
+ self.intermediate_size_per_rank, self.hidden_size, bias=False
202
+ )
203
+ self.act_fn = ACT2FN[config.hidden_act]
204
+
205
+ def forward(self, x):
206
+ merged_x = self.merge_up_gate_proj(x)
207
+ gate_state, up_state = merged_x.chunk(2, dim=-1)
208
+ intermediate_hidden_states = self.act_fn(gate_state) * up_state
209
+ down_proj = self.down_proj(intermediate_hidden_states)
210
+ if self.moe_tp_size > 1:
211
+ dist.all_reduce(down_proj)
212
+ return down_proj
213
+
214
+
215
+ class MoE(nn.Module):
216
+ def __init__(self, config, runner_config, hidden_size=None, intermediate_size=None):
217
+ super().__init__()
218
+ self.runner_config = runner_config
219
+ self.moe_tp_size = self.runner_config.get("parallel_config").get(
220
+ "moe_tp_size", 1
221
+ )
222
+ self.num_experts = config.num_routed_experts
223
+
224
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
225
+ self.intermediate_size = (
226
+ config.intermediate_size if intermediate_size is None else intermediate_size
227
+ )
228
+ self.intermediate_size_per_rank = self.intermediate_size // self.moe_tp_size
229
+
230
+ self.act_fn = ACT2FN[config.hidden_act]
231
+
232
+ self.group_w1_w3 = nn.Parameter(
233
+ torch.ones(
234
+ self.num_experts, self.intermediate_size_per_rank * 2, self.hidden_size
235
+ ),
236
+ requires_grad=False,
237
+ )
238
+ self.group_w2 = nn.Parameter(
239
+ torch.ones(
240
+ self.num_experts, self.hidden_size, self.intermediate_size_per_rank
241
+ ),
242
+ requires_grad=False,
243
+ )
244
+
245
+ def forward(self, hidden_states, expert_tokens, seq_len=None):
246
+ mm1_mm3 = torch_npu.npu_grouped_matmul(
247
+ [hidden_states],
248
+ [torch.transpose(self.group_w1_w3, 1, 2)],
249
+ group_list=expert_tokens,
250
+ group_type=0,
251
+ split_item=3,
252
+ )[0]
253
+ mm1, mm3 = mm1_mm3.chunk(2, dim=-1)
254
+ intermediate_hidden_states = self.act_fn(mm1) * mm3
255
+ hidden_states = torch_npu.npu_grouped_matmul(
256
+ [intermediate_hidden_states],
257
+ [torch.transpose(self.group_w2, 1, 2)],
258
+ group_list=expert_tokens,
259
+ group_type=0,
260
+ split_item=3,
261
+ )[0]
262
+ return hidden_states
263
+
264
+
265
+ class MoEGate(nn.Module):
266
+ def __init__(self, config):
267
+ super().__init__()
268
+ self.top_k = config.num_experts_per_tok
269
+ self.routed_scaling_factor = config.routed_scaling_factor
270
+
271
+ self.norm_topk_prob = config.norm_topk_prob
272
+ self.weight = nn.Parameter(
273
+ torch.empty((config.num_routed_experts, config.hidden_size))
274
+ )
275
+
276
+ def forward(self, hidden_states):
277
+ bsz, seq_len, h = hidden_states.shape
278
+ hidden_states = hidden_states.view(-1, h)
279
+ logits = F.linear(
280
+ hidden_states.to(torch.float32), self.weight.to(torch.float32), None
281
+ )
282
+ scores = logits.sigmoid()
283
+ scores_for_choice = scores.view(bsz * seq_len, -1)
284
+ _, topk_idx = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)
285
+ topk_weight = scores.gather(1, topk_idx)
286
+
287
+ if self.top_k > 1 and self.norm_topk_prob:
288
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
289
+ topk_weight = topk_weight / denominator
290
+ topk_weight = topk_weight * self.routed_scaling_factor
291
+
292
+ return topk_idx, topk_weight
293
+
294
+
295
+ class PanguUltraMoE(nn.Module):
296
+ def __init__(self, config, runner_config):
297
+ super().__init__()
298
+ self.runner_config = runner_config
299
+ self.hidden_dim = config.hidden_size
300
+ self.moe_tp_size = self.runner_config.get("parallel_config").get(
301
+ "moe_tp_size", 1
302
+ )
303
+ self.batch_size_decode = self.runner_config.get("data_config").get(
304
+ "batch_size", 1
305
+ )
306
+ self.batch_size_prefill = self.batch_size_decode
307
+ self.num_experts_per_tok = config.num_experts_per_tok
308
+ self.num_experts = config.num_routed_experts
309
+ self.num_shared_experts = config.num_shared_experts
310
+ self.top_k = config.num_experts_per_tok
311
+
312
+ self.experts_per_rank = config.num_routed_experts
313
+ self.experts = MoE(
314
+ config, self.runner_config, intermediate_size=config.moe_intermediate_size
315
+ )
316
+
317
+ self.gate = MoEGate(config)
318
+ if self.num_shared_experts is not None:
319
+ intermediate_size = config.moe_intermediate_size * self.num_shared_experts
320
+ self.shared_experts = MLP(
321
+ config, self.runner_config, intermediate_size=intermediate_size
322
+ )
323
+ self.row_idx_decode_len = self.batch_size_decode * self.top_k
324
+ self.row_idx_decode = (
325
+ torch.arange(0, self.row_idx_decode_len, dtype=torch.int32)
326
+ .view(self.top_k, -1)
327
+ .permute(1, 0)
328
+ .int()
329
+ .contiguous()
330
+ .npu()
331
+ )
332
+
333
+ def forward(self, hidden_states):
334
+ identity = hidden_states
335
+ topk_idx, topk_weight = self.gate(hidden_states)
336
+ y = self.moe_npu(hidden_states, topk_idx, topk_weight)
337
+ if self.num_shared_experts is not None:
338
+ y = y + self.shared_experts(identity)
339
+ return y
340
+
341
+ def moe_npu(self, x, topk_ids, topk_weight):
342
+ batch_size, sequence_length, h = x.shape
343
+ hidden_states = x.view(-1, x.shape[-1])
344
+
345
+ routing_weights = topk_weight.to(x.dtype)
346
+ expert_idx = topk_ids.int()
347
+ if sequence_length == 1:
348
+ row_idx = self.row_idx_decode
349
+ else:
350
+ row_idx_prefill_len = self.batch_size_prefill * sequence_length * self.top_k
351
+ row_idx = (
352
+ torch.arange(
353
+ 0, row_idx_prefill_len, dtype=torch.int32, device=topk_weight.device
354
+ )
355
+ .view(self.top_k, -1)
356
+ .permute(1, 0)
357
+ .int()
358
+ .contiguous()
359
+ )
360
+
361
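+ # dispatch: npu_moe_init_routing replicates each token for its top-k experts and reorders the copies by expert id before the grouped matmuls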
+ active_num = batch_size * sequence_length
362
+ expanded_x, expanded_row_idx, expanded_expert_idx = (
363
+ torch_npu.npu_moe_init_routing(
364
+ hidden_states,
365
+ row_idx=row_idx,
366
+ expert_idx=expert_idx,
367
+ active_num=active_num,
368
+ )
369
+ )
370
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
371
+ expanded_expert_idx, self.num_experts
372
+ )
373
+ expert_tokens = expert_tokens.to(torch.int64)
374
+
375
+ hidden_states_ordered_by_experts = self.experts(
376
+ expanded_x, expert_tokens, seq_len=sequence_length
377
+ )
378
+
379
+ hidden_states = torch_npu.npu_moe_finalize_routing(
380
+ hidden_states_ordered_by_experts,
381
+ skip1=None,
382
+ skip2=None,
383
+ bias=None,
384
+ scales=routing_weights,
385
+ expanded_src_to_dst_row=expanded_row_idx,
386
+ export_for_source_row=expert_idx,
387
+ )
388
+ if self.moe_tp_size > 1:
389
+ dist.all_reduce(hidden_states)
390
+ hidden_states = hidden_states.view(batch_size, -1, self.hidden_dim)
391
+ return hidden_states
392
+
393
+
394
+ class PanguUltraMoEAttention(nn.Module):
395
+ def __init__(
396
+ self,
397
+ config: PanguUltraMoEConfig,
398
+ layer_idx: Optional[int] = None,
399
+ runner_config: Optional[Dict] = None,
400
+ ):
401
+ super().__init__()
402
+ if runner_config is not None:
403
+ self.attn_tp_size = runner_config.get("parallel_config").get(
404
+ "attn_tp_size", 1
405
+ )
406
+ else:
407
+ self.attn_tp_size = 1
408
+ self.layer_idx = layer_idx
409
+
410
+ self.hidden_size = config.hidden_size
411
+ self.num_heads = config.num_attention_heads
412
+ self.num_heads_per_rank = self.num_heads // self.attn_tp_size
413
+ self.num_key_value_heads_per_rank = self.num_heads_per_rank
414
+
415
+ self.max_position_embeddings = config.max_position_embeddings
416
+ self.rope_theta = config.rope_theta
417
+ self.attention_q_lora_dim = config.attention_q_lora_dim
418
+ self.attention_qk_rope_dim = config.attention_qk_rope_dim
419
+ self.attention_kv_lora_dim = config.attention_kv_lora_dim
420
+ self.attention_v_dim = config.attention_v_dim
421
+ self.attention_qk_dim = config.attention_qk_dim
422
+ self.q_head_dim = config.attention_qk_dim + config.attention_qk_rope_dim
423
+
424
+ if self.attention_q_lora_dim is None:
425
+ self.q_proj = nn.Linear(
426
+ self.hidden_size, self.num_heads_per_rank * self.q_head_dim, bias=False
427
+ )
428
+ else:
429
+ self.q_a_proj = nn.Linear(
430
+ self.hidden_size, config.attention_q_lora_dim, bias=False
431
+ )
432
+ self.q_a_layernorm = PanguUltraMoERMSNorm(config.attention_q_lora_dim)
433
+ self.q_b_proj = nn.Linear(
434
+ config.attention_q_lora_dim,
435
+ self.num_heads_per_rank * self.q_head_dim,
436
+ bias=False,
437
+ )
438
+
439
+ self.kv_a_proj_with_mqa = nn.Linear(
440
+ self.hidden_size,
441
+ config.attention_kv_lora_dim + config.attention_qk_rope_dim,
442
+ bias=False,
443
+ )
444
+ self.kv_a_layernorm = PanguUltraMoERMSNorm(config.attention_kv_lora_dim)
445
+
446
+ self.kv_b_proj_w_k = nn.Parameter(
447
+ torch.zeros(
448
+ self.num_heads_per_rank,
449
+ self.attention_qk_dim,
450
+ self.attention_kv_lora_dim,
451
+ )
452
+ )
453
+ self.kv_b_proj_w_v = nn.Parameter(
454
+ torch.zeros(
455
+ self.num_heads_per_rank,
456
+ self.attention_kv_lora_dim,
457
+ self.attention_v_dim,
458
+ )
459
+ )
460
+
461
+ self.o_proj = nn.Linear(
462
+ self.num_heads_per_rank * self.attention_v_dim,
463
+ self.hidden_size,
464
+ bias=False,
465
+ )
466
+
467
+ self.softmax_scale = self.q_head_dim ** (-0.5)
468
+
469
+ def bmm_5d(self, x, y):
470
+ b, s, n, _, d = x.shape
471
+ x = x.view(b * s, n, d).transpose(0, 1)
472
+ output = torch.matmul(x, y)
473
+ output = output.transpose(1, 0).view(b, s, n, -1)
474
+ return output
475
+
476
+ def prepare_qkv(
477
+ self,
478
+ hidden_states: torch.Tensor,
479
+ cos_sin: torch.Tensor = None,
480
+ kv_len: torch.IntTensor = None,
481
+ position_ids: Optional[torch.LongTensor] = None,
482
+ past_key_value: Optional[Cache] = None,
483
+ **kwargs,
484
+ ):
485
+ bsz, q_len, _ = hidden_states.size()
486
+
487
+ if self.attention_q_lora_dim is None:
488
+ q = self.q_proj(hidden_states)
489
+ else:
490
+ q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
491
+
492
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
493
+ compressed_kv, k_pe = torch.split(
494
+ compressed_kv,
495
+ [self.attention_kv_lora_dim, self.attention_qk_rope_dim],
496
+ dim=-1,
497
+ )
498
+
499
+ q = q.view(bsz, q_len, self.num_heads_per_rank, self.q_head_dim)
500
+ q_nope, q_pe = torch.split(
501
+ q, [self.attention_qk_dim, self.attention_qk_rope_dim], dim=-1
502
+ )
503
+ q_pe = q_pe.transpose(1, 2)
504
+ q_nope = self.bmm_5d(
505
+ q_nope.view(bsz, q_len, self.num_heads_per_rank, 1, self.attention_qk_dim),
506
+ self.kv_b_proj_w_k,
507
+ )
508
+ q_nope = q_nope.view(
509
+ bsz, q_len, self.num_heads_per_rank, self.attention_kv_lora_dim
510
+ )
511
+ q_nope = q_nope.transpose(1, 2)
512
+
513
+ k_pe = k_pe.view(bsz, q_len, 1, self.attention_qk_rope_dim).transpose(1, 2)
514
+ k_nope = (
515
+ self.kv_a_layernorm(compressed_kv)
516
+ .view(bsz, -1, 1, self.attention_kv_lora_dim)
517
+ .transpose(1, 2)
518
+ )
519
+
520
+ cos, sin = cos_sin
521
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
522
+
523
+ query_states = torch.cat([q_nope, q_pe], dim=-1)
524
+ key_states = torch.cat([k_nope, k_pe], dim=-1)
525
+
526
+ kv_seq_len = k_nope.shape[-2]
527
+ if past_key_value is not None:
528
+ past_key_states = past_key_value[self.layer_idx][0]
529
+ torch_npu.scatter_update_(past_key_states, kv_len, key_states, -2)
530
+ if q_len == 1:
531
+ key_states = past_key_states
532
+ kv_seq_len = past_key_value[0][0].size()[-2]
533
+ value_states = key_states
534
+ return query_states, key_states, value_states, kv_seq_len
535
+
536
+ def apply_attention_npu(
537
+ self,
538
+ query_states,
539
+ key_states,
540
+ value_states,
541
+ kv_seq_len,
542
+ attention_mask: Optional[torch.Tensor] = None,
543
+ actual_seq_lengths_kv: list = None,
544
+ output_attentions: bool = False,
545
+ past_key_value: Optional[Cache] = None,
546
+ ):
547
+ # eager attention: scaled Q·K^T plus additive mask, softmax, then weighted sum over the compressed KV
548
+ bsz, _, q_len, _ = query_states.size()
549
+ attn_weights = (
550
+ torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
551
+ )
552
+ if attention_mask is not None:
553
+ attn_weights = attn_weights + attention_mask
554
+ else:
555
+ raise ValueError("attention mask must not be None")
556
+
557
+ attn_weights = nn.functional.softmax(
558
+ attn_weights, dim=-1, dtype=torch.float32
559
+ ).to(query_states.dtype)
560
+ value_states = value_states[..., : self.attention_kv_lora_dim]
561
+ attn_output = torch.matmul(attn_weights, value_states)
562
+
563
+ attn_output = attn_output.transpose(1, 2).contiguous()
564
+ attn_output = self.bmm_5d(attn_output.unsqueeze(3), self.kv_b_proj_w_v)
565
+ attn_output = self.o_proj(attn_output.reshape(bsz, q_len, -1))
566
+ if self.attn_tp_size > 1:
567
+ dist.all_reduce(attn_output)
568
+ return attn_output
569
+
570
+ def forward(
571
+ self,
572
+ hidden_states: torch.Tensor,
573
+ kv_len: torch.IntTensor = None,
574
+ actual_seq_lengths_kv: list = None,
575
+ cos_sin: torch.Tensor = None,
576
+ attention_mask: Optional[torch.Tensor] = None,
577
+ position_ids: Optional[torch.LongTensor] = None,
578
+ past_key_value: Optional[Cache] = None,
579
+ output_attentions: bool = False,
580
+ use_cache: bool = False,
581
+ **kwargs,
582
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
583
+ if "padding_mask" in kwargs:
584
+ warnings.warn(
585
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
586
+ )
587
+ query_states, key_states, value_states, kv_seq_len = self.prepare_qkv(
588
+ hidden_states=hidden_states,
589
+ cos_sin=cos_sin,
590
+ kv_len=kv_len,
591
+ position_ids=position_ids,
592
+ past_key_value=past_key_value,
593
+ )
594
+ output = self.apply_attention_npu(
595
+ query_states=query_states,
596
+ key_states=key_states,
597
+ value_states=value_states,
598
+ kv_seq_len=kv_seq_len,
599
+ actual_seq_lengths_kv=actual_seq_lengths_kv,
600
+ attention_mask=attention_mask,
601
+ output_attentions=output_attentions,
602
+ past_key_value=past_key_value,
603
+ )
604
+ return output
605
+
606
+
607
+ class PanguUltraMoEDecoderLayer(nn.Module):
608
+ def __init__(
609
+ self, config: PanguUltraMoEConfig, runner_config: Dict, layer_idx: int
610
+ ):
611
+ super().__init__()
612
+ self.runner_config = runner_config
613
+ self.hidden_size = config.hidden_size
614
+
615
+ self.self_attn = PanguUltraMoEAttention(
616
+ config=config, runner_config=self.runner_config, layer_idx=layer_idx
617
+ )
618
+
619
+ self.mlp = (
620
+ PanguUltraMoE(config, self.runner_config)
621
+ if (
622
+ config.num_routed_experts is not None
623
+ and layer_idx >= config.num_dense_layers
624
+ )
625
+ else MLP(config, self.runner_config)
626
+ )
627
+ self.input_layernorm = PanguUltraMoERMSNorm(
628
+ config.hidden_size, epsilon=config.rms_norm_eps
629
+ )
630
+ self.post_attention_layernorm = PanguUltraMoERMSNorm(
631
+ config.hidden_size, epsilon=config.rms_norm_eps
632
+ )
633
+ if getattr(config, "sandwich_norm", False):
634
+ self.sandwich_norm = True
635
+ self.pre_mlp_layernorm = PanguUltraMoERMSNorm(
636
+ config.hidden_size, epsilon=config.rms_norm_eps
637
+ )
638
+ self.post_mlp_layernorm = PanguUltraMoERMSNorm(
639
+ config.hidden_size, epsilon=config.rms_norm_eps
640
+ )
641
+ else:
642
+ self.sandwich_norm = False
643
+
644
+ def forward(
645
+ self,
646
+ hidden_states: torch.Tensor,
647
+ kv_len: torch.IntTensor,
648
+ actual_seq_lengths_kv: list,
649
+ cos_sin: torch.Tensor,
650
+ past_residual: Optional[torch.Tensor] = None,
651
+ attention_mask: Optional[torch.Tensor] = None,
652
+ position_ids: Optional[torch.LongTensor] = None,
653
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
654
+ **kwargs,
655
+ ) -> Tuple[torch.FloatTensor]:
656
+ hidden_states, residual = self.input_layernorm(hidden_states, past_residual)
657
+
658
+ # Self Attention
659
+ hidden_states = self.self_attn(
660
+ hidden_states=hidden_states,
661
+ kv_len=kv_len,
662
+ actual_seq_lengths_kv=actual_seq_lengths_kv,
663
+ cos_sin=cos_sin,
664
+ attention_mask=attention_mask,
665
+ position_ids=position_ids,
666
+ past_key_value=past_key_value,
667
+ )
668
+
669
+ if self.sandwich_norm:
670
+ hidden_states = self.post_attention_layernorm(hidden_states)
671
+ hidden_states, residual = self.pre_mlp_layernorm(hidden_states, residual)
672
+ else:
673
+ hidden_states, residual = self.post_attention_layernorm(
674
+ hidden_states, residual
675
+ )
676
+
677
+ hidden_states = self.mlp(hidden_states)
678
+
679
+ if self.sandwich_norm:
680
+ hidden_states = self.post_mlp_layernorm(hidden_states)
681
+
682
+ outputs = (residual, hidden_states)
683
+ return outputs
684
+
685
+
686
+ class PanguUltraMoEPreTrainedModel(PreTrainedModel):
687
+ config_class = PanguUltraMoEConfig
688
+ base_model_prefix = "model"
689
+ supports_gradient_checkpointing = True
690
+ _no_split_modules = ["PanguUltraMoEDecoderLayer"]
691
+ _skip_keys_device_placement = "past_key_values"
692
+ _supports_cache_class = True
693
+
694
+ def _init_weights(self, module):
695
+ pass
696
+
697
+
698
+ class PanguUltraMoEModel(PanguUltraMoEPreTrainedModel):
699
+ def __init__(self, config: PanguUltraMoEConfig, runner_config: Dict):
700
+ super().__init__(config)
701
+ self.config = config
702
+ self.runner_config = runner_config
703
+ self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
704
+ self.rank_offset = int(os.getenv("RANK_OFFSET", "0"))
705
+ self.global_rank = self.local_rank + self.rank_offset
706
+ self.embed_tp_size = self.runner_config.get("parallel_config").get(
707
+ "embed_tp_size", 1
708
+ )
709
+ self.padding_idx = config.pad_token_id
710
+ self.vocab_size = config.vocab_size
711
+ self.vocab_size_per_rank = self.vocab_size // self.embed_tp_size
712
+
713
+ self.embed_tokens = nn.Embedding(
714
+ self.vocab_size_per_rank, config.hidden_size, self.padding_idx
715
+ )
716
+ self.layers = nn.ModuleList(
717
+ [
718
+ PanguUltraMoEDecoderLayer(config, self.runner_config, layer_idx)
719
+ for layer_idx in range(config.num_hidden_layers)
720
+ ]
721
+ )
722
+ self.norm = PanguUltraMoERMSNorm(config.hidden_size, epsilon=config.rms_norm_eps)
723
+ self.gradient_checkpointing = False
724
+ # Initialize weights and apply final processing
725
+ self.post_init()
726
+ self.rotary_emb = PanguUltraMoERotaryEmbedding(
727
+ self.config.attention_qk_rope_dim,
728
+ max_position_embeddings=self.config.max_position_embeddings,
729
+ base=self.config.rope_theta,
730
+ )
731
+
732
+ def forward(
733
+ self,
734
+ input_ids: torch.LongTensor,
735
+ kv_len: torch.IntTensor = None,
736
+ actual_seq_lengths_kv: list = None,
737
+ attention_mask: Optional[torch.Tensor] = None,
738
+ position_ids: Optional[torch.LongTensor] = None,
739
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
740
+ ):
741
+
742
+ batch_size, seq_length = input_ids.shape
743
+ past_key_values_length = past_key_values[0][0].size()[-2]
744
+
745
+ if position_ids is None:
746
+ device = input_ids.device
747
+ position_ids = torch.arange(
748
+ past_key_values_length,
749
+ seq_length + past_key_values_length,
750
+ dtype=torch.long,
751
+ device=device,
752
+ )
753
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
754
+ else:
755
+ position_ids = position_ids.view(-1, seq_length).long()
756
+
757
+ if self.embed_tp_size > 1:
758
+ new_input_ids = input_ids - self.global_rank * self.vocab_size_per_rank
759
+ mask = (new_input_ids >= 0) & (
760
+ new_input_ids < self.vocab_size_per_rank
761
+ ) # (bs, qlen)
762
+ new_input_ids_per_rank = new_input_ids * mask
763
+ inputs_embeds = self.embed_tokens(new_input_ids_per_rank) * mask.unsqueeze(
764
+ -1
765
+ )
766
+ dist.all_reduce(inputs_embeds)
767
+ else:
768
+ inputs_embeds = self.embed_tokens(input_ids)
769
+ hidden_states = inputs_embeds
770
+
771
+ cos_sin = self.rotary_emb(
772
+ hidden_states, kv_len, self.config.max_position_embeddings
773
+ )
774
+ residual = None
775
+
776
+ for decoder_layer in self.layers:
777
+ residual, hidden_states = decoder_layer(
778
+ hidden_states,
779
+ kv_len,
780
+ actual_seq_lengths_kv,
781
+ cos_sin=cos_sin,
782
+ past_residual=residual,
783
+ attention_mask=attention_mask,
784
+ position_ids=position_ids,
785
+ past_key_value=past_key_values,
786
+ )
787
+
788
+ hidden_states, _ = self.norm(hidden_states, residual)
789
+
790
+ return hidden_states
791
+
792
+
793
+ class PanguUltraMoEForCausalLM(PanguUltraMoEPreTrainedModel):
794
+ _tied_weights_keys = ["lm_head.weight"]
795
+
796
+ def __init__(self, config, runner_config):
797
+ super().__init__(config)
798
+ self.config = config
799
+ self.runner_config = runner_config
800
+ self.embed_tp_size = self.runner_config.get("parallel_config").get(
801
+ "embed_tp_size", 1
802
+ )
803
+ self.model = PanguUltraMoEModel(config, self.runner_config)
804
+ self.vocab_size = config.vocab_size
805
+ self.lm_head = nn.Linear(
806
+ config.hidden_size, config.vocab_size // self.embed_tp_size, bias=False
807
+ )
808
+
809
+ def forward(
810
+ self,
811
+ input_ids: torch.LongTensor = None,
812
+ kv_len: torch.IntTensor = None,
813
+ actual_seq_lengths_kv: list = None,
814
+ attention_mask: Optional[torch.Tensor] = None,
815
+ position_ids: Optional[torch.LongTensor] = None,
816
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
817
+ ):
818
+ outputs = self.model(
819
+ input_ids=input_ids,
820
+ kv_len=kv_len,
821
+ actual_seq_lengths_kv=actual_seq_lengths_kv,
822
+ attention_mask=attention_mask,
823
+ position_ids=position_ids,
824
+ past_key_values=past_key_values,
825
+ )
826
+
827
+ hidden_states = outputs
828
+
829
+ if hidden_states.size()[1] > 1:
830
+ gather_index, _ = torch.max(position_ids, dim=-1)
831
+ gather_index = (
832
+ gather_index.unsqueeze(1)
833
+ .unsqueeze(2)
834
+ .repeat(1, 1, hidden_states.shape[-1])
835
+ )
836
+ hidden_states = torch.gather(hidden_states, 1, gather_index)
837
+
838
+ logits = self.lm_head(hidden_states)
839
+ if self.embed_tp_size > 1:
840
+ new_logits = torch.zeros_like(logits).repeat(self.embed_tp_size, 1, 1)
841
+ dist.all_gather_into_tensor(new_logits, logits, group=_world._default_pg)
842
+ new_logits = new_logits.reshape(
843
+ self.embed_tp_size, logits.shape[0], logits.shape[1], -1
844
+ ).permute(1, 2, 0, 3)
845
+ logits = new_logits.reshape(logits.shape[0], logits.shape[1], -1)
846
+ logits = logits.float()
847
+
848
+ return logits
849
+
850
+ def init_cache(self, input_ids):
851
+ batch_size, seq_len = input_ids.size()
852
+
853
+ cache_seq_len = self.config.max_position_embeddings
854
+
855
+ past_key_values = ()
856
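+ # MLA-style cache: store a single compressed latent per token (kv_lora + rope dims) instead of full per-head K/V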
+ cache_key_shape = (
857
+ batch_size,
858
+ 1,
859
+ cache_seq_len,
860
+ self.config.attention_kv_lora_dim + self.config.attention_qk_rope_dim,
861
+ )
862
+ dtype = self.config.torch_dtype
863
+
864
+ for _ in range(self.config.num_hidden_layers):
865
+ key_cache = torch.zeros(
866
+ cache_key_shape, dtype=dtype, device=input_ids.device
867
+ )
868
+ past_key_values += ((key_cache,),)
869
+
870
+ return past_key_values
871
+
872
+ def prepare_inputs_for_generation(
873
+ self,
874
+ input_ids,
875
+ past_key_values=None,
876
+ attention_mask=None,
877
+ inputs_embeds=None,
878
+ is_prefill=None,
879
+ kv_len=None,
880
+ share_mask_tril=None,
881
+ **kwargs,
882
+ ):
883
+ batch_size, seq_len = input_ids.size()
884
+ if past_key_values is None:
885
+ past_key_values = self.init_cache(input_ids)
886
+ if is_prefill:
887
+ position_ids = attention_mask.long().cumsum(-1) - 1
888
+ position_ids.masked_fill_(attention_mask == 0, 1)
889
+ attention_mask = share_mask_tril
890
+ kv_len = torch.zeros(
891
+ (position_ids.size()[0]), dtype=torch.int32, device=input_ids.device
892
+ )
893
+ actual_seq_lengths_kv = None
894
+ past_key_values_length = 0
895
+ input_mask = None
896
+ else:
897
+ attention_mask = None
898
+ position_ids = kv_len.unsqueeze(1)
899
+ actual_seq_lengths_kv = (kv_len + 1).cpu().detach().numpy().tolist()
900
+ past_key_values_length = self.config.max_position_embeddings - seq_len
901
+ input_mask = share_mask_tril
902
+
903
+ attention_mask = _prepare_4d_causal_attention_mask(
904
+ input_mask, (batch_size, seq_len), input_ids.float(), past_key_values_length
905
+ )
906
+
907
+ model_inputs = {}
908
+ model_inputs.update(
909
+ {
910
+ "input_ids": input_ids,
911
+ "position_ids": position_ids,
912
+ "past_key_values": past_key_values,
913
+ "attention_mask": attention_mask,
914
+ "kv_len": kv_len,
915
+ "actual_seq_lengths_kv": actual_seq_lengths_kv,
916
+ }
917
+ )
918
+ return model_inputs
inference/runner.py ADDED
@@ -0,0 +1,411 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+
4
+ import copy
5
+ import logging
6
+ import os
7
+ import time
8
+
9
+ import torch
10
+ from torch.distributed.distributed_c10d import _world
11
+ from transformers import AutoTokenizer
12
+
13
+ root_logger = logging.getLogger()
14
+ root_logger.handlers.clear()
15
+ logging.basicConfig(
16
+ format="%(asctime)s - %(levelname)s - [LLM](%(filename)s:%(lineno)d): %(message)s",
17
+ level=logging.INFO,
18
+ )
19
+
20
+ torch.manual_seed(42)
21
+ torch.npu.manual_seed_all(42)
22
+
23
+
24
+ def get_init_attn_mask(mask_length, device, valid_len=None):
25
+ share_mask_tril = ~torch.tril(
26
+ torch.ones((mask_length, mask_length), dtype=torch.bool, device=device)
27
+ )
28
+ if valid_len is not None:
29
+ share_mask_tril[-valid_len:, :] = torch.zeros(valid_len, mask_length)
30
+ return share_mask_tril
31
+
32
+
33
+ def get_decode_mask(mask_length, device, position):
34
+ decode_mask = torch.zeros((1, mask_length), device=device)
35
+ decode_mask[0, :position] = 1
36
+ return decode_mask
37
+
38
+
39
+ def sample(input_logits: torch.Tensor, temperature=1.0, top_p=0.0, top_k=0, top_n_sigma=-1.0, **kwargs):
40
+ # shape of input_logits: [batch_size, 1, vocab_size]
41
+ # greedy
42
+ if temperature <= 0.0 or top_k == 1 or top_p == 0.0 or top_n_sigma == 0.0:
43
+ return torch.argmax(input_logits, dim=-1)
44
+
45
+ logits = input_logits / temperature
46
+
47
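+ # roughly -FLT_MAX: masked logits become ~zero probability after softmax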
+ filter_value = -3.4028e+38
48
+
49
+ # top_n_sigma truncation
50
+ if top_n_sigma > 0.0:
51
+ max_vals, _ = logits.max(dim=-1, keepdim=True)
52
+ std_vals = logits.std(dim=-1, keepdim=True)
53
+ threshold = max_vals - top_n_sigma * std_vals
54
+ mask = logits < threshold
55
+ logits = torch.where(mask, filter_value, logits)
56
+
57
+ # top_k truncation
58
+ if top_k > 0:
59
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
60
+ logits[indices_to_remove] = filter_value
61
+
62
+ # top_p truncation
63
+ if 0.0 < top_p < 1.0:
64
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False)
65
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
66
+
67
+ sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
68
+ # keep at least 1 token
69
+ sorted_indices_to_remove[..., -1:] = 0
70
+
71
+ indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
72
+ logits = logits.masked_fill(indices_to_remove, filter_value)
73
+
74
+ probs = logits.softmax(dim=-1)
75
+ outputs = torch.multinomial(probs.squeeze(1), num_samples=1)
76
+ return outputs
77
+
78
+
79
+ class ModelRunner:
80
+ def __init__(self, runner_config):
81
+ self.runner_config = runner_config
82
+ self.model_name = runner_config.get("model_name", "default_model_name")
83
+ model_path = self.runner_config.get("model_path")
84
+ self.dtype = runner_config.get("model_config").get("dtype", torch.bfloat16)
85
+ self.max_position_embeddings = runner_config.get("data_config").get(
86
+ "max_position_embeddings", 131072
87
+ )
88
+ self.input_max_len = runner_config.get("data_config").get("input_max_len", 1024)
89
+ self.max_new_tokens = runner_config.get("data_config").get("max_new_tokens", 32)
90
+ self.batch_size = runner_config.get("data_config").get("batch_size", 16)
91
+ self.sampling_params = runner_config.get("sampling_config", {})
92
+ self.tokenizer = None
93
+ self.model = None
94
+ self.device = None
95
+ self.local_rank = int(os.getenv("LOCAL_RANK", "0"))
96
+ self.rank_offset = int(os.getenv("RANK_OFFSET", "0"))
97
+ self.global_rank = self.local_rank + self.rank_offset
98
+ self.world_size = int(os.getenv("WORLD_SIZE", "1"))
99
+ if self.world_size == 1:
100
+ self.model_path = model_path
101
+ else:
102
+ self.model_path = os.path.join(model_path, f"rank_{self.global_rank}")
103
+
104
+ self.res_path = os.getenv("RES_PATH", "./")
105
+ self.enable_profiler = runner_config.get("model_config").get(
106
+ "enable_profiler", 0
107
+ )
108
+ self.use_pretrained_model = True
109
+ self.execute_mode = runner_config.get("exe_mode", "dynamo")
110
+ self.tokenizer_mode = runner_config.get("model_config").get(
111
+ "tokenizer_mode", "default"
112
+ )
113
+ self.init_device()
114
+ self.start_time = None
115
+ self.end_time = None
116
+ self.with_ckpt = runner_config.get("model_config").get("with_ckpt", 1)
117
+
118
+ @staticmethod
119
+ def repeat_batch(tensor, repeat_num):
120
+ if repeat_num == 1:
121
+ return tensor
122
+ return tensor.repeat(repeat_num, *[1] * (tensor.dim() - 1))
123
+
124
+ def init_device(self):
125
+ logging.info(
126
+ "Set execution using npu index: %s, global: %s",
127
+ self.local_rank,
128
+ self.global_rank,
129
+ )
130
+ self.device = torch.device("%s:%s" % ("npu", self.local_rank))
131
+ torch.npu.set_device(self.device)
132
+ if torch.npu.is_available() and self.world_size > 1:
133
+ if _world._default_pg is None:
134
+ torch.distributed.init_process_group(
135
+ backend="hccl", world_size=self.world_size, rank=self.global_rank
136
+ )
137
+
138
+ def init_model(self, model, config=None):
139
+ if self.with_ckpt:
140
+ self.use_pretrained_model = True
141
+ config = None
142
+ else:
143
+ self.use_pretrained_model = False
144
+ from configuration_openpangu_moe import PanguUltraMoEConfig as config
145
+ logging.info(f"use_pretrained_model: {self.use_pretrained_model}")
146
+
147
+ if self.use_pretrained_model:
148
+ self.load_model(model)
149
+ else:
150
+ self.init_model_from_config(model, config=config)
151
+ self.to_device()
152
+ self.compile_model()
153
+ self.init_tokenizer()
154
+
155
+ def init_model_from_config(self, model, config):
156
+ if config is None:
157
+ raise Exception("config cannot be None")
158
+ config_file = f"{self.model_path}/config.json"
159
+ model_config = config.from_pretrained(
160
+ config_file,
161
+ torch_dtype=self.dtype,
162
+ max_position_embeddings=self.max_position_embeddings,
163
+ )
164
+ self.model = model(model_config, runner_config=self.runner_config).to(
165
+ self.dtype
166
+ )
167
+
168
+ def load_model(self, model):
169
+ logging.info("Try to load pretrained model in path: %s", self.model_path)
170
+ self.model = model.from_pretrained(
171
+ self.model_path,
172
+ low_cpu_mem_usage=True,
173
+ ignore_mismatched_sizes=True,
174
+ torch_dtype=self.dtype,
175
+ max_position_embeddings=self.max_position_embeddings,
176
+ runner_config=self.runner_config,
177
+ )
178
+ for name, params in self.model.named_parameters():
179
+ logging.info(
180
+ "Param of %s: %s, %s, %s",
181
+ self.model_name,
182
+ name,
183
+ params.size(),
184
+ params.dtype,
185
+ )
186
+
187
+ def to_device(self):
188
+ self.model.to(self.device)
189
+ logging.info("Model weights H2D finished.")
190
+
191
+ def init_tokenizer(self):
192
+ self.tokenizer = AutoTokenizer.from_pretrained(
193
+ self.model_path,
194
+ trust_remote_code=True,
195
+ local_files_only=True,
196
+ padding_side="right",
197
+ truncation_side="right",
198
+ )
199
+ if self.tokenizer.pad_token is None:
200
+ self.tokenizer.pad_token = self.tokenizer.eos_token
201
+ self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
202
+
203
+ def compile_model(self):
204
+ logging.info("The final model structure is: \n %s", self.model)
205
+ if self.execute_mode == "dynamo":
206
+ logging.info("Try to compile model")
207
+ self.graph_compile()
208
+
209
+ def graph_compile(self):
210
+ import torchair as tng
211
+ import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
212
+ from torchair.configs.compiler_config import CompilerConfig
213
+
214
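+ # torchair compiles the model into an NPU graph; frozen_parameter and tiling_schedule_optimize enable Ascend-specific graph optimizations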
+ compiler_config = CompilerConfig()
215
+ compiler_config.experimental_config.frozen_parameter = True
216
+ compiler_config.experimental_config.tiling_schedule_optimize = True
217
+ npu_backend = tng.get_npu_backend(compiler_config=compiler_config)
218
+ self.model = torch.compile(
219
+ self.model, dynamic=True, fullgraph=True, backend=npu_backend
220
+ )
221
+
222
+ def mark_inputs(self, model_inputs):
223
+ if self.execute_mode == "dynamo":
224
+ input_ids = model_inputs.get("input_ids")
225
+ kv_len = model_inputs.get("kv_len")
226
+ attention_mask = model_inputs.get("attention_mask")
227
+ position_ids = model_inputs.get("position_ids")
228
+ past_key_values = model_inputs.get("past_key_values")
229
+
230
+ # prefill with dynamic sequence length, decode with static sequence length
231
+ torch._dynamo.mark_static(kv_len)
232
+ for item in past_key_values:
233
+ for sub_item in item:
234
+ torch._dynamo.mark_static(sub_item)
235
+
236
+ torch._dynamo.mark_static(input_ids)
237
+ if attention_mask is not None:
238
+ torch._dynamo.mark_static(attention_mask)
239
+ torch._dynamo.mark_static(position_ids)
240
+
241
+ def model_input_prepare(self, input_dict):
242
+ input_ids = input_dict.get("input_ids")
243
+ attention_mask = input_dict.get("attention_mask")
244
+ past_key_values = input_dict.get("past_key_values")
245
+ is_prefill = input_dict.get("is_prefill")
246
+ kv_len = input_dict.get("kv_len")
247
+ share_mask_tril = input_dict.get("share_mask_tril")
248
+ model_inputs = self.model.prepare_inputs_for_generation(
249
+ input_ids=input_ids,
250
+ attention_mask=attention_mask,
251
+ past_key_values=past_key_values,
252
+ is_prefill=is_prefill,
253
+ kv_len=kv_len,
254
+ input_lens=input_dict.get("input_lens"),
255
+ share_mask_tril=share_mask_tril,
256
+ )
257
+ return model_inputs
258
+
259
+ def model_inference(self, model_inputs, warm_up=False):
260
+ torch.npu.synchronize()
261
+ if warm_up:
262
+ self.mark_inputs(model_inputs)
263
+ if self.start_time is None:
264
+ self.start_time = time.time()
265
+ with torch.no_grad():
266
+ logits = self.model(**model_inputs)
267
+ torch.npu.synchronize()
268
+ self.end_time = time.time()
269
+ if torch.distributed.get_rank() != 0:
270
+ logging.info(
271
+ f"{self.model_name} inference time cost {(self.end_time - self.start_time)*1000:.2f} ms"
272
+ )
273
+ self.start_time = time.time()
274
+ return logits
275
+
276
+ def model_generate(self, prompts, warm_up=False, **kwargs):
277
+ calling_func = {
278
+ "default": self.tokenizer,
279
+ "chat": self.tokenizer.apply_chat_template,
280
+ }
281
+ kwargs = {
282
+ "return_tensors": "pt",
283
+ "truncation": True,
284
+ "padding": "max_length",
285
+ "max_length": self.input_max_len,
286
+ }
287
+ if self.tokenizer_mode == "chat":
288
+ chat_kwargs = {"add_generation_prompt": True, "return_dict": True}
289
+ kwargs.update(chat_kwargs)
290
+ tokenizer = calling_func.get(self.tokenizer_mode, self.tokenizer)
291
+ inputs = tokenizer(prompts, **kwargs).to(self.device)
292
+
293
+ # get init input_dict
294
+ share_mask_tril = get_init_attn_mask(
295
+ self.max_position_embeddings, self.device, valid_len=self.input_max_len
296
+ )
297
+ share_mask_tril = share_mask_tril[None, None, ...]
298
+
299
+ input_lens = copy.deepcopy(inputs.input_ids.size()[1])
300
+ logging.info("Padding max prompts lens is : %d", input_lens)
301
+ input_dict = {
302
+ "input_ids": inputs.input_ids,
303
+ "generate_ids": inputs.input_ids,
304
+ "input_lens": input_lens,
305
+ "kv_len": None,
306
+ "past_key_values": None,
307
+ "attention_mask": inputs.attention_mask,
308
+ "share_mask_tril": share_mask_tril,
309
+ "is_prefill": True,
310
+ }
311
+
312
+ if torch.distributed.get_rank() == 0:
313
+ logging.info(
314
+ f"inputs.input_ids {inputs.input_ids[:,:30]} eod id {self.tokenizer.eos_token_id}"
315
+ )
316
+
317
+ generate_tokens = 0
318
+ cnt = 0
319
+ all_done = [False for _ in range(input_dict["input_ids"].size(0))]
320
+ done_len = [-1 for _ in range(input_dict["input_ids"].size(0))]
321
+ while True:
322
+ jump_flag = self.get_jump_flag(cnt, warm_up, generate_tokens)
323
+ if jump_flag:
324
+ break
325
+
326
+ # exit until all reach eod
327
+ if input_dict["input_ids"].size(1) == 1:
328
+ for bi in range(input_dict["input_ids"].size(0)):
329
+ if (
330
+ input_dict["input_ids"][bi, 0].item()
331
+ == self.tokenizer.eos_token_id
332
+ ):
333
+ all_done[bi] = True
334
+ done_len[bi] = generate_tokens
335
+ if all(all_done):
336
+ break
337
+ model_inputs = self.model_input_prepare(input_dict)
338
+ # fix decode mask
339
+ if model_inputs["position_ids"].shape[1] == 1:
340
+ model_inputs["attention_mask"].fill_(-3.4028e38)
341
+ for bi in range(model_inputs["position_ids"].size(0)):
342
+ max_l = model_inputs["position_ids"][bi].max().item()
343
+ model_inputs["attention_mask"][bi, :, :, : max_l + 1] = 0
344
+ outputs = self.model_inference(model_inputs, warm_up=warm_up)
345
+ self.model_output_process(model_inputs, outputs, input_dict)
346
+ # prof.step()
347
+ generate_tokens += 1
348
+ cnt += 1
349
+
350
+ generate_ids = input_dict["generate_ids"][:, input_lens:].clip(
351
+ 0, self.model.config.vocab_size - 1
352
+ )
353
+ for bi in range(generate_ids.size(0)):
354
+ if done_len[bi] != -1:
355
+ generate_ids[bi, done_len[bi] :] = self.tokenizer.eos_token_id
356
+ res = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True)
357
+
358
+ if isinstance(res, list):
359
+ for answer in res:
360
+ logging.info("Inference decode result: \n%s", answer)
361
+ else:
362
+ logging.info("Inference decode result: \n%s", res)
363
+ return res
364
+
365
+ def get_jump_flag(self, cnt, warm_up, generate_tokens):
366
+ default_decode_dump = 2
367
+ # warm-up only runs a few decode steps (default_decode_dump)
368
+ jump_flag_warm = warm_up and cnt >= default_decode_dump
369
+ # do not generate after max_token
370
+ jump_flag_oversize = generate_tokens >= self.max_new_tokens
371
+ jump_flag = jump_flag_oversize or jump_flag_warm
372
+ return jump_flag
373
+
374
+ def model_output_process(self, model_inputs, outputs, input_dict):
375
+ next_batch = self.batch_size
376
+ attn_tp_size = self.runner_config.get("parallel_config").get("attn_tp_size", 1)
377
+ if self.world_size % attn_tp_size != 0:
378
+ raise Exception(
379
+ f"world_size ({self.world_siz}) not divisible by attn_tp_size ({attn_tp_size})!"
380
+ )
381
+ attn_dp_size = self.world_size // attn_tp_size
382
+ input_dict["is_prefill"] = False
383
+ input_dict["input_lens"] = input_dict["input_lens"] + 1
384
+
385
+ kv_len = torch.max(model_inputs.get("position_ids"), axis=1)[0] + 1
386
+ input_dict["kv_len"] = kv_len
387
+
388
+ logits = outputs
389
+ past_key_values = model_inputs.get("past_key_values")
390
+ input_dict["past_key_values"] = past_key_values
391
+
392
+ attention_mask = None
393
+
394
+ share_mask_tril = get_decode_mask(
395
+ mask_length=self.max_position_embeddings,
396
+ device=self.device,
397
+ position=input_dict["input_lens"],
398
+ )
399
+ share_mask_tril = share_mask_tril[None, None, ...]
400
+
401
+ input_dict["attention_mask"] = attention_mask
402
+ input_dict["share_mask_tril"] = ModelRunner.repeat_batch(
403
+ share_mask_tril, self.batch_size
404
+ )
405
+
406
+ next_tokens = sample(logits, **self.sampling_params)
407
+ torch.distributed.broadcast(next_tokens, src=0)
408
+ input_dict["input_ids"] = next_tokens
409
+ input_dict["generate_ids"] = torch.cat(
410
+ [input_dict["generate_ids"], next_tokens], dim=-1
411
+ )
inference/runner_config/tp1.yaml ADDED
@@ -0,0 +1,30 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+
4
+ model_name: "pangu_ultra_moe"
5
+ model_path: "./model"
6
+ exe_mode: "eager" # ["dynamo", "eager"]
7
+
8
+ model_config:
9
+ tokenizer_mode: default # ["default", "chat"]
10
+ mm_quant_mode: None
11
+ mla_backend: absorb # [native, absorb]
12
+ with_ckpt: 1 # [0, 1]
13
+ enable_profiler: 0 # [0, 1]
14
+
15
+ data_config:
16
+ input_max_len: 4096
17
+ max_new_tokens: 28000
18
+ batch_size: 1
19
+ max_position_embeddings: 32768
20
+
21
+ parallel_config:
22
+ attn_tp_size: 1
23
+ moe_tp_size: 1
24
+ embed_tp_size: 1
25
+
26
+ sampling_config:
27
+ top_n_sigma: 0.05
28
+ top_p: 1.0
29
+ temperature: 0.7
30
+ top_k: -1
inference/runner_config/tp32.yaml ADDED
@@ -0,0 +1,30 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+
4
+ model_name: "pangu_ultra_moe"
5
+ model_path: "./model"
6
+ exe_mode: "eager" # ["dynamo", "eager"]
7
+
8
+ model_config:
9
+ tokenizer_mode: default # ["default", "chat"]
10
+ mm_quant_mode: None
11
+ mla_backend: absorb # [native, absorb]
12
+ with_ckpt: 1 # [0, 1]
13
+ enable_profiler: 0 # [0, 1]
14
+
15
+ data_config:
16
+ input_max_len: 4096
17
+ max_new_tokens: 28000
18
+ batch_size: 1
19
+ max_position_embeddings: 32768
20
+
21
+ parallel_config:
22
+ attn_tp_size: 32
23
+ moe_tp_size: 32
24
+ embed_tp_size: 32
25
+
26
+ sampling_config:
27
+ top_n_sigma: 0.05
28
+ top_p: 1.0
29
+ temperature: 0.7
30
+ top_k: -1
inference/split_weight.py ADDED
@@ -0,0 +1,387 @@
+ # coding=utf-8
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+
+ import argparse
+ import logging
+ import os
+ import shutil
+ from threading import Thread
+
+ import numpy as np
+ import torch
+ import yaml
+ from torch import nn
+ from transformers import AutoModelForCausalLM
+
+ from model import PanguUltraMoEForCausalLM
+
+ root_logger = logging.getLogger()
+ root_logger.handlers.clear()
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - [LLM](%(filename)s:%(lineno)d): %(message)s",
+     level=logging.INFO,
+ )
+
+
+ def _to_parameter(data):
+     return nn.Parameter(data, requires_grad=False)
+
+
+ def split_w_dense(block, dst_model, i, local_rank):
+     up_weight_list = []
+     ffn_dim = dst_model.model.layers[i].mlp.intermediate_size_per_rank
+     gate_weight = block.mlp.gate_proj.weight[
+         local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+     ].contiguous()
+     up_weight = block.mlp.up_proj.weight[
+         local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+     ].contiguous()
+     up_weight_list.append(_to_parameter(torch.cat([gate_weight, up_weight], axis=0)))
+
+     if len(up_weight_list) == 1:
+         dst_model.model.layers[i].mlp.merge_up_gate_proj.weight = up_weight_list[0]
+     else:
+         dst_model.model.layers[i].mlp.merge_up_gate_proj.weight = _to_parameter(
+             torch.cat(up_weight_list, axis=0)
+         )
+     dst_model.model.layers[i].mlp.down_proj.weight.data = (
+         block.mlp.down_proj.weight.data[
+             :, local_rank * ffn_dim : (local_rank + 1) * ffn_dim
+         ].contiguous()
+     )
+
+
+ def split_w_moe(block, dst_model, i, local_rank):
+     shared_up_weight_list = []
+     ffn_dim = dst_model.model.layers[i].mlp.shared_experts.intermediate_size_per_rank
+     gate_weight = block.mlp.shared_experts.gate_proj.weight[
+         local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+     ].contiguous()
+     up_weight = block.mlp.shared_experts.up_proj.weight[
+         local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+     ].contiguous()
+     shared_up_weight_list.append(
+         _to_parameter(torch.cat([gate_weight, up_weight], axis=0))
+     )
+     if len(shared_up_weight_list) == 1:
+         dst_model.model.layers[i].mlp.shared_experts.merge_up_gate_proj.weight = (
+             shared_up_weight_list[0]
+         )
+     else:
+         dst_model.model.layers[i].mlp.shared_experts.merge_up_gate_proj.weight = (
+             _to_parameter(torch.cat(shared_up_weight_list, axis=0))
+         )
+     dst_model.model.layers[i].mlp.shared_experts.down_proj.weight.data = (
+         block.mlp.shared_experts.down_proj.weight.data[
+             :, local_rank * ffn_dim : (local_rank + 1) * ffn_dim
+         ].contiguous()
+     )
+     dst_model.model.layers[i].mlp.gate.weight.data = block.mlp.gate.weight.data
+
+     expert_num = block.mlp.num_routed_experts
+     gate_proj_list, down_proj_list, up_proj_list = [], [], []
+     for _, src_expert in enumerate(block.mlp.experts):
+         ffn_dim = dst_model.model.layers[i].mlp.experts.intermediate_size_per_rank
+         gate_proj_list.append(
+             src_expert.gate_proj.weight.data[
+                 local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+             ].contiguous()
+         )
+         up_proj_list.append(
+             src_expert.up_proj.weight.data[
+                 local_rank * ffn_dim : (local_rank + 1) * ffn_dim, :
+             ].contiguous()
+         )
+         down_proj_list.append(
+             src_expert.down_proj.weight.data[
+                 :, local_rank * ffn_dim : (local_rank + 1) * ffn_dim
+             ].contiguous()
+         )
+
+     dst_model.model.layers[i].mlp.experts.group_w2.data = (
+         torch.cat(down_proj_list, dim=0).view(expert_num, -1, ffn_dim).contiguous()
+     )
+     group_gate_proj = (
+         torch.cat(gate_proj_list, dim=0).view(expert_num, ffn_dim, -1).contiguous()
+     )
+     group_up_proj = (
+         torch.cat(up_proj_list, dim=0).view(expert_num, ffn_dim, -1).contiguous()
+     )
+     dst_model.model.layers[i].mlp.experts.group_w1_w3.data = torch.cat(
+         [group_gate_proj, group_up_proj], dim=1
+     )
+
+
+ def split_w_attn(block, dst_model, i, local_rank):
+     q_dim = (
+         dst_model.model.layers[0].self_attn.num_heads_per_rank
+         * dst_model.model.layers[0].self_attn.q_head_dim
+     )
+     o_dim = (
+         dst_model.model.layers[0].self_attn.num_heads_per_rank
+         * dst_model.model.layers[0].self_attn.attention_v_dim
+     )
+
+     if dst_model.model.layers[i].self_attn.attention_q_lora_dim is None:
+         dst_model.model.layers[i].self_attn.q_proj.weight.data = (
+             block.self_attn.q_proj.weight.data[
+                 local_rank * q_dim : (local_rank + 1) * q_dim, :
+             ].contiguous()
+         )
+     else:
+         dst_model.model.layers[i].self_attn.q_a_proj.weight.data = (
+             block.self_attn.q_a_proj.weight.data
+         )
+         dst_model.model.layers[i].self_attn.q_a_layernorm.weight.data = (
+             block.self_attn.q_a_layernorm.weight.data
+         )
+         dst_model.model.layers[i].self_attn.q_b_proj.weight.data = (
+             block.self_attn.q_b_proj.weight.data[
+                 local_rank * q_dim : (local_rank + 1) * q_dim, :
+             ].contiguous()
+         )
+
+     dst_model.model.layers[i].self_attn.kv_a_proj_with_mqa.weight.data = (
+         block.self_attn.kv_a_proj_with_mqa.weight.data
+     )
+
+     dst_model.model.layers[i].self_attn.kv_a_layernorm.weight.data = (
+         block.self_attn.kv_a_layernorm.weight.data
+     )
+     dst_model.model.layers[i].self_attn.o_proj.weight.data = (
+         block.self_attn.o_proj.weight.data[
+             :, local_rank * o_dim : (local_rank + 1) * o_dim
+         ].contiguous()
+     )
+     dst_model.model.layers[i].input_layernorm.weight.data = (
+         block.input_layernorm.weight.data
+     )
+     dst_model.model.layers[i].post_attention_layernorm.weight.data = (
+         block.post_attention_layernorm.weight.data
+     )
+     dst_model.model.layers[i].pre_mlp_layernorm.weight.data = (
+         block.pre_mlp_layernorm.weight.data
+     )
+     dst_model.model.layers[i].post_mlp_layernorm.weight.data = (
+         block.post_mlp_layernorm.weight.data
+     )
+
+
+ def kv_low_rank_split(block, dst_model, i, local_rank):
+     k_dim = dst_model.model.layers[0].self_attn.num_heads_per_rank * (
+         dst_model.model.layers[0].self_attn.attention_qk_dim
+         + dst_model.model.layers[0].self_attn.attention_v_dim
+     )
+     kv_b_proj_weight_data = block.self_attn.kv_b_proj.weight.data[
+         local_rank * k_dim : (local_rank + 1) * k_dim, :
+     ].contiguous()
+     attention_qk_dim = dst_model.model.layers[i].self_attn.attention_qk_dim
+     num_heads_per_rank = dst_model.model.layers[i].self_attn.num_heads_per_rank
+     attention_kv_lora_dim = dst_model.model.layers[i].self_attn.attention_kv_lora_dim
+     attention_v_dim = dst_model.model.layers[i].self_attn.attention_v_dim
+
+     index_tensor = torch.arange(attention_qk_dim).repeat(
+         num_heads_per_rank
+     ) + torch.arange(num_heads_per_rank).repeat_interleave(attention_qk_dim) * (
+         attention_qk_dim + attention_v_dim
+     )
+     kv_b_proj_w_k = torch.index_select(kv_b_proj_weight_data, dim=0, index=index_tensor)
+     dst_model.model.layers[i].self_attn.kv_b_proj_w_k.data = kv_b_proj_w_k.view(
+         num_heads_per_rank, attention_qk_dim, attention_kv_lora_dim
+     ).contiguous()
+     index_tensor = torch.arange(
+         attention_qk_dim, attention_qk_dim + attention_v_dim
+     ).repeat(num_heads_per_rank) + torch.arange(num_heads_per_rank).repeat_interleave(
+         attention_v_dim
+     ) * (
+         attention_qk_dim + attention_v_dim
+     )
+     kv_b_proj_w_v = torch.index_select(kv_b_proj_weight_data, dim=0, index=index_tensor)
+     dst_model.model.layers[i].self_attn.kv_b_proj_w_v.data = (
+         kv_b_proj_w_v.view(num_heads_per_rank, attention_v_dim, attention_kv_lora_dim)
+         .transpose(1, 2)
+         .contiguous()
+     )
+
+
+ def split_layer(block, dst_model, i, local_rank, attn_tp_size, moe_tp_size):
+     # attn weights
+     local_rank_tp_attn = local_rank % attn_tp_size
+     split_w_attn(block, dst_model, i, local_rank_tp_attn)
+     kv_low_rank_split(block, dst_model, i, local_rank_tp_attn)
+
+     # moe experts weights
+     local_rank_tp_moe = local_rank % moe_tp_size
+     if i >= dst_model.config.num_dense_layers:
+         split_w_moe(block, dst_model, i, local_rank_tp_moe)
+     else:
+         split_w_dense(block, dst_model, i, local_rank_tp_moe)
+
+
+ def split_w(src_model, dst_model, local_rank, runner_config):
+     attn_tp_size = runner_config.get("parallel_config").get("attn_tp_size")
+     moe_tp_size = runner_config.get("parallel_config").get("moe_tp_size")
+     embed_tp_size = runner_config.get("parallel_config").get("embed_tp_size")
+
+     vocab_size = src_model.model.vocab_size // embed_tp_size
+     embed_tp_rank = local_rank % embed_tp_size
+
+     dst_model.lm_head.weight.data = src_model.lm_head.weight.data[
+         embed_tp_rank * vocab_size : (embed_tp_rank + 1) * vocab_size, :
+     ]
+     dst_model.model.embed_tokens.weight.data = src_model.model.embed_tokens.weight.data[
+         embed_tp_rank * vocab_size : (embed_tp_rank + 1) * vocab_size, :
+     ]
+
+     dst_model.model.norm.weight.data = src_model.model.norm.weight.data
+
+     layer_num = len(src_model.model.layers)
+
+     all_threads = []
+     for i in range(0, layer_num):
+         block = src_model.model.layers[i]
+         thread = Thread(
+             target=split_layer,
+             args=(block, dst_model, i, local_rank, attn_tp_size, moe_tp_size),
+         )
+         all_threads.append(thread)
+         thread.start()
+     for thread in all_threads:
+         thread.join()
+
+
+ def copy_files_with_prefix(src_dir, dst_dir, prefix):
+     for file in os.listdir(src_dir):
+         if file.startswith(prefix):
+             src_file = os.path.join(src_dir, file)
+             dst_file = os.path.join(dst_dir, file)
+             shutil.copy2(src_file, dst_file)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         description="Split weight parameters with tensor parallel"
+     )
+     parser.add_argument("--model_path", type=str, help="Path of model weights")
+     parser.add_argument(
+         "--output_path",
+         type=str,
+         help="The output directory where the results are saved",
+     )
+     parser.add_argument(
+         "--origin_yaml_file_path", type=str, help="inference configurations"
+     )
+     parser.add_argument(
+         "--new_yaml_file_path", type=str, help="inference configurations"
+     )
+     parser.add_argument(
+         "--world_size", type=int, default=8, help="The parallel rank size of model"
+     )
+     parser.add_argument("--node_num", type=int, default=1, help="The parallel node num")
+     parser.add_argument(
+         "--node_rank", type=int, default=0, help="The parallel node rank"
+     )
+     parser_args = parser.parse_args()
+     return parser_args
+
+
+ def show_model_states(origin_model, model_name="src_model"):
+     src_param_size = 0
+     for name, params in origin_model.named_parameters():
+         size_per_param = np.prod(params.size())
+         src_param_size += size_per_param
+         logging.info(
+             "Param of %s tensor parallel: %s, %s, %s",
+             model_name,
+             name,
+             params.size(),
+             params.dtype,
+         )
+     logging.info(
+         "Total param size of %s tensor parallel: %s", model_name, src_param_size
+     )
+
+
+ def read_yaml(yaml_file_path):
+     try:
+         with open(yaml_file_path, "r", encoding="utf-8") as file:
+             data = yaml.safe_load(file)
+     except FileNotFoundError:
+         logging.error(f"No such yaml file: {yaml_file_path}")
+     except yaml.YAMLError as e:
+         logging.error(f"Load yaml file failed: {e}")
+     return data
+
+
+ def check_vars(world_size, runner_config):
+     attn_tp_size = runner_config.get("parallel_config").get("attn_tp_size")
+     moe_tp_size = runner_config.get("parallel_config").get("moe_tp_size")
+     embed_tp_size = runner_config.get("parallel_config").get("embed_tp_size")
+     if world_size % attn_tp_size != 0:
+         logging.error(
+             "world_size %s mod attn_tp_size %s must be 0", world_size, attn_tp_size
+         )
+         exit(1)
+     if world_size % moe_tp_size != 0:
+         logging.error(
+             "world_size %s mod moe_tp_size %s must be 0", world_size, moe_tp_size
+         )
+         exit(1)
+     if world_size % embed_tp_size != 0:
+         logging.error(
+             "world_size %s mod embed_tp_size %s must be 0", world_size, embed_tp_size
+         )
+         exit(1)
+
+
+ if __name__ == "__main__":
+     logging.info("Start to split weight...")
+     args = parse_args()
+     output_path = args.output_path
+
+     old_runner_config = read_yaml(args.origin_yaml_file_path)
+     new_runner_config = read_yaml(args.new_yaml_file_path)
+     world_size = args.world_size
+
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
+     origin_model = AutoModelForCausalLM.from_pretrained(
+         args.model_path,
+         trust_remote_code=True,
+         local_files_only=True,
+         ignore_mismatched_sizes=True,
+         low_cpu_mem_usage=True,
+         torch_dtype=torch.bfloat16,
+         attn_implementation="eager",
+     )
+     show_model_states(origin_model, "origin_model")
+
+     node_rank_id = args.node_rank
+     rank_num_per_node = world_size // args.node_num
+     start_rank = rank_num_per_node * node_rank_id
+     end_rank = rank_num_per_node * (node_rank_id + 1)
+
+     for rank_id in range(start_rank, end_rank):
+         logging.info("rank_id={} / rank_size={}".format(rank_id, world_size))
+         os.environ["LOCAL_RANK"] = str(rank_id)
+
+         save_path = os.path.join(output_path, f"rank_{rank_id}")
+         logging.info(
+             "Split weight for rank %s start, save path is: %s", rank_id, save_path
+         )
+
+         config = origin_model.config
+         part_model = PanguUltraMoEForCausalLM(config, new_runner_config)
+
+         split_w(origin_model, part_model, rank_id, new_runner_config)
+
+         show_model_states(part_model, "dst_model")
+
+         part_model.save_pretrained(save_path)
+         copy_files_with_prefix(args.model_path, save_path, "tokenizer")
+         copy_files_with_prefix(args.model_path, save_path, "tokenization")
+         logging.info(
+             "Split weight for rank %s finished, save path is: %s", rank_id, save_path
+         )
+
+         del part_model
inference/split_weight.sh ADDED
@@ -0,0 +1,13 @@
+ # coding=utf-8
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+
+ rm -rf ./model
+ mkdir ./model
+ python split_weight.py \
+     --model_path=../ \
+     --output_path=./model \
+     --origin_yaml_file_path=./runner_config/tp1.yaml \
+     --new_yaml_file_path=./runner_config/tp32.yaml \
+     --world_size=32 \
+     --node_num=1 \
+     --node_rank=0
inference/vllm_ascend/_build_info.py ADDED
@@ -0,0 +1,3 @@
+ # Auto-generated file
+ __soc_version__ = 'ASCEND910B1'
+ __sleep_mode_enabled__ = True
inference/vllm_ascend/attention/attention.py ADDED
@@ -0,0 +1,1220 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # This file is a part of the vllm-ascend project.
16
+ #
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, List, Optional, Tuple, Type
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch_npu
24
+ import torchair._contrib.custom_torch_ops # type: ignore # noqa: F401
25
+ from torch.nn.functional import scaled_dot_product_attention
26
+ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
27
+ AttentionLayer,
28
+ AttentionMetadata, AttentionType,
29
+ MLAAttentionImpl)
30
+ from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState,
31
+ CommonMetadataBuilder,
32
+ compute_slot_mapping,
33
+ compute_slot_mapping_start_idx,
34
+ is_block_tables_empty)
35
+ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
36
+
37
+ from vllm_ascend.ascend_config import get_ascend_config
38
+ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
39
+ from vllm_ascend.ops.cache import concat_and_cache_mla
40
+ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16,
41
+ enable_custom_op, is_310p, nd_to_nz_2d)
42
+ from vllm_ascend.worker.model_runner import (
43
+ ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
44
+
45
+ _ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
46
+
47
+
48
+ class AscendAttentionBackend(AttentionBackend):
49
+
50
+ @staticmethod
51
+ def get_name() -> str:
52
+ return "ASCEND"
53
+
54
+ @staticmethod
55
+ def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
56
+ return AscendAttentionBackendImpl
57
+
58
+ @staticmethod
59
+ def get_metadata_cls() -> Type["AscendMetadata"]:
60
+ return AscendMetadata
61
+
62
+ @staticmethod
63
+ def get_state_cls() -> Type["CommonAttentionState"]:
64
+ return CommonAttentionState
65
+
66
+ @staticmethod
67
+ def get_kv_cache_shape(
68
+ num_blocks: int,
69
+ block_size: int,
70
+ num_kv_heads: int,
71
+ head_size: int,
72
+ ) -> Tuple[int, ...]:
73
+ if is_310p():
74
+ return (2, num_blocks, num_kv_heads * head_size // 16, block_size,
75
+ 16)
76
+ else:
77
+ return (2, num_blocks, block_size, num_kv_heads, head_size)
78
+
79
+ @staticmethod
80
+ def swap_blocks(
81
+ src_kv_cache: List[torch.Tensor],
82
+ dst_kv_cache: List[torch.Tensor],
83
+ src_to_dst: torch.Tensor,
84
+ ) -> None:
85
+ src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1]
86
+ dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1]
87
+ src_indices = src_to_dst[:, 0]
88
+ dst_indices = src_to_dst[:, 1]
89
+
90
+ dst_key_cache[dst_indices] = src_key_cache[src_indices].to(
91
+ dst_key_cache.device)
92
+ dst_value_cache[dst_indices] = src_value_cache[src_indices].to(
93
+ dst_key_cache.device)
94
+
95
+ @staticmethod
96
+ def copy_blocks(
97
+ kv_caches: List[torch.Tensor],
98
+ src_to_dists: torch.Tensor,
99
+ ) -> None:
100
+ src_indices = src_to_dists[:, 0]
101
+ dst_indices = src_to_dists[:, 1]
102
+
103
+ for kv_cache in kv_caches:
104
+ key_caches = kv_cache[0]
105
+ value_caches = kv_cache[1]
106
+ key_caches[dst_indices] = key_caches[src_indices]
107
+ value_caches[dst_indices] = value_caches[src_indices]
108
+
109
+ @staticmethod
110
+ def get_builder_cls() -> Type["AscendMetadataBuilder"]:
111
+ return AscendMetadataBuilder
112
+
113
+ @classmethod
114
+ def make_metadata_builder(cls, *args, **kwargs) -> "AscendMetadataBuilder":
115
+ return cls.get_builder_cls()(*args, **kwargs)
116
+
117
+
118
+ class AscendMLAAttentionBackend(AscendAttentionBackend):
119
+
120
+ @staticmethod
121
+ def get_impl_cls() -> Type["AscendMLAAttentionBackendImpl"]:
122
+ return AscendMLAAttentionBackendImpl
123
+
124
+ @staticmethod
125
+ def get_kv_cache_shape(
126
+ num_blocks: int,
127
+ block_size: int,
128
+ num_kv_heads: int,
129
+ head_size: int,
130
+ ) -> Tuple[int, ...]:
131
+ return (num_blocks, block_size, num_kv_heads, head_size)
132
+
133
+
134
+ @dataclass
135
+ class AscendMetadata(AttentionMetadata):
136
+ """Metadata for Ascend backend.
137
+ * modified from XFormers backend
138
+ NOTE: Any python object stored here is not updated when it is
139
+ cuda-graph replayed. If you have values that need to be changed
140
+ dynamically, it should be stored in tensor. The tensor has to be
141
+ updated from `CUDAGraphRunner.forward` API.
142
+ """
143
+
144
+ # |---------- N-1 iteration --------|
145
+ # |---------------- N iteration ---------------------|
146
+ # |- tokenA -|......................|-- newTokens ---|
147
+ # |---------- context_len ----------|
148
+ # |-------------------- seq_len ----------------------|
149
+ # |-- query_len ---|
150
+
151
+ # FIXME: It is for flash attn.
152
+ # Maximum sequence length among prefill batch. 0 if there are decoding
153
+ # Avoid mypy error
154
+ # Total number of prefill requests.
155
+ num_prefills: int
156
+ # Number of prefill tokens.
157
+ num_prefill_tokens: int
158
+ # (num_tokens,). The indices of the token slots that input tokens will be
159
+ # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
160
+ # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
161
+ # in block 0, and 1st slot in block 1, respectively.
162
+ slot_mapping: torch.Tensor
163
+
164
+ # requests only.
165
+ max_prefill_seq_len: int
166
+ # Maximum sequence length among decode batch. 0 if there are prefill
167
+ # requests only.
168
+ max_decode_seq_len: int
169
+
170
+ chunked_prefill_enabled: bool
171
+
172
+ # (batch_size, max_blocks_per_seq).
173
+ # Block addresses per sequence. (Seq id -> list of physical block)
174
+ block_tables: Optional[torch.Tensor]
175
+
176
+ # seq_lens stored as a tensor.
177
+ seq_lens_tensor: Optional[torch.Tensor]
178
+
179
+ # (batch_size,). The sequence length per sequence. Sequence length means
180
+ # the computed tokens + new tokens None if it is a decoding.
181
+ seq_lens: Optional[List[int]] = None
182
+
183
+ # The query lengths of the input sequences
184
+ query_lens: Optional[List[int]] = None
185
+
186
+ # Maximum query length in the batch. None for decoding.
187
+ max_query_len: Optional[int] = None
188
+
189
+ # Self-attention prefill/decode metadata cache
190
+ _cached_prefill_metadata: Optional["AscendMetadata"] = None
191
+ _cached_decode_metadata: Optional["AscendMetadata"] = None
192
+
193
+ # Begin encoder attn & enc/dec cross-attn fields...
194
+
195
+ # Encoder sequence lengths representation
196
+ encoder_seq_lens: Optional[List[int]] = None
197
+ encoder_seq_lens_tensor: Optional[torch.Tensor] = None
198
+
199
+ # Maximum sequence length among encoder sequences
200
+ max_encoder_seq_len: Optional[int] = None
201
+
202
+ # Number of tokens input to encoder
203
+ num_encoder_tokens: Optional[int] = None
204
+
205
+ # Mask for normal situation
206
+ attn_mask: Optional[torch.Tensor] = None
207
+
208
+ # Mask for prefix caching
209
+ compress_mask: Optional[torch.Tensor] = None
210
+
211
+ # Mask for chunked prefill
212
+ chunk_mask: Optional[torch.Tensor] = None
213
+
214
+ # Cross-attention memory-mapping data structures: slot mapping
215
+ # and block tables
216
+ cross_slot_mapping: Optional[torch.Tensor] = None
217
+ cross_block_tables: Optional[torch.Tensor] = None
218
+
219
+ @property
220
+ def prefill_metadata(self) -> Optional["AscendMetadata"]:
221
+ if self.num_prefills == 0:
222
+ return None
223
+
224
+ if self._cached_prefill_metadata is not None:
225
+ # Recover cached prefill-phase attention
226
+ # metadata structure.
227
+ return self._cached_prefill_metadata
228
+
229
+ assert ((self.seq_lens is not None)
230
+ or (self.encoder_seq_lens is not None))
231
+
232
+ # Compute some attn_metadata fields which default to None.
233
+ slot_mapping = (None if self.slot_mapping is None else
234
+ self.slot_mapping[:self.num_prefill_tokens])
235
+ seq_lens = (None if self.seq_lens is None else
236
+ self.seq_lens[:self.num_prefills])
237
+ query_lens = (None if self.query_lens is None else
238
+ self.query_lens[:self.num_prefills])
239
+ block_tables = (None if self.block_tables is None else
240
+ self.block_tables[:self.num_prefills])
241
+
242
+ seq_lens_tensor = (None if self.seq_lens_tensor is None else
243
+ self.seq_lens_tensor[:self.num_prefills])
244
+
245
+ # Construct & cache prefill-phase attention metadata structure.
246
+ self._cached_prefill_metadata = AscendMetadata(
247
+ num_prefills=self.num_prefills,
248
+ num_prefill_tokens=self.num_prefill_tokens,
249
+ num_decode_tokens=0,
250
+ slot_mapping=slot_mapping,
251
+ seq_lens=seq_lens,
252
+ seq_lens_tensor=seq_lens_tensor,
253
+ query_lens=query_lens,
254
+ max_query_len=self.max_query_len,
255
+ max_prefill_seq_len=self.max_prefill_seq_len,
256
+ max_decode_seq_len=0,
257
+ chunked_prefill_enabled=self.chunked_prefill_enabled,
258
+ block_tables=block_tables,
259
+ # Begin encoder & cross attn fields below...
260
+ encoder_seq_lens=self.encoder_seq_lens,
261
+ encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
262
+ max_encoder_seq_len=self.max_encoder_seq_len,
263
+ multi_modal_placeholder_index_maps=self.
264
+ multi_modal_placeholder_index_maps,
265
+ cross_slot_mapping=self.cross_slot_mapping,
266
+ cross_block_tables=self.cross_block_tables,
267
+ enable_kv_scales_calculation=False)
268
+ return self._cached_prefill_metadata
269
+
270
+ @property
271
+ def decode_metadata(self) -> Optional["AscendMetadata"]:
272
+ if self.num_decode_tokens == 0:
273
+ return None
274
+
275
+ if self._cached_decode_metadata is not None:
276
+ # Recover cached decode-phase attention
277
+ # metadata structure.
278
+ return self._cached_decode_metadata
279
+
280
+ # Compute some attn_metadata fields which default to None.
281
+ slot_mapping = (None if self.slot_mapping is None else
282
+ self.slot_mapping[self.num_prefill_tokens:])
283
+ seq_lens = (None if self.seq_lens is None else
284
+ self.seq_lens[self.num_prefills:])
285
+ query_lens = (None if self.query_lens is None else
286
+ self.query_lens[self.num_prefills:])
287
+ block_tables = (None if self.block_tables is None else
288
+ self.block_tables[self.num_prefills:])
289
+ seq_lens_tensor = (None if self.seq_lens_tensor is None else
290
+ self.seq_lens_tensor[self.num_prefills:])
291
+ # Construct & cache decode-phase attention metadata structure.
292
+ self._cached_decode_metadata = AscendMetadata(
293
+ num_prefills=0,
294
+ num_prefill_tokens=0,
295
+ num_decode_tokens=self.num_decode_tokens,
296
+ slot_mapping=slot_mapping,
297
+ seq_lens=seq_lens,
298
+ seq_lens_tensor=seq_lens_tensor,
299
+ query_lens=query_lens,
300
+ max_query_len=self.max_query_len,
301
+ max_prefill_seq_len=0,
302
+ max_decode_seq_len=self.max_decode_seq_len,
303
+ chunked_prefill_enabled=self.chunked_prefill_enabled,
304
+ block_tables=block_tables,
305
+ # Begin encoder & cross attn fields below...
306
+ encoder_seq_lens=self.encoder_seq_lens,
307
+ encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
308
+ max_encoder_seq_len=self.max_encoder_seq_len,
309
+ multi_modal_placeholder_index_maps=self.
310
+ multi_modal_placeholder_index_maps,
311
+ cross_slot_mapping=self.cross_slot_mapping,
312
+ cross_block_tables=self.cross_block_tables,
313
+ enable_kv_scales_calculation=False)
314
+ return self._cached_decode_metadata
315
+
316
+ def advance_step(self,
317
+ model_input: "ModelInputForNPUWithSamplingMetadata",
318
+ sampled_token_ids: Optional[torch.Tensor],
319
+ block_size: int,
320
+ num_seqs: int,
321
+ num_queries: int,
322
+ turn_prefills_into_decodes: bool = False):
323
+ """
324
+ Update metadata in-place to advance one decode step.
325
+ """
326
+ # When using cudagraph, the num_seqs is padded to the next captured
327
+ # batch size, but num_queries tracks the actual number of requests in
328
+ # the batch. For --enforce-eager mode, num_seqs == num_queries
329
+ if num_seqs != num_queries:
330
+ assert num_seqs > num_queries
331
+
332
+ if turn_prefills_into_decodes:
333
+ # When Multi-Step is enabled with Chunked-Prefill, prefills and
334
+ # decodes are scheduled together. In the first step, all the
335
+ # prefills turn into decodes. This update reflects that
336
+ # conversion.
337
+ assert self.num_decode_tokens + self.num_prefills == num_seqs
338
+ self.num_decode_tokens += self.num_prefills
339
+ self.num_prefills = 0
340
+ self.num_prefill_tokens = 0
341
+ self.max_prefill_seq_len = 0
342
+ self.max_query_len = 1
343
+
344
+ self.slot_mapping = self.slot_mapping[:num_seqs]
345
+ else:
346
+ assert self.seq_lens is not None
347
+ assert self.max_decode_seq_len == max(self.seq_lens)
348
+
349
+ assert self.num_prefills == 0
350
+ assert self.num_prefill_tokens == 0
351
+ assert self.num_decode_tokens == num_seqs
352
+ assert self.slot_mapping.shape == (num_seqs, )
353
+
354
+ assert self.seq_lens is not None
355
+ assert len(self.seq_lens) == num_seqs
356
+ assert self.seq_lens_tensor is not None
357
+ assert self.seq_lens_tensor.shape == (num_seqs, )
358
+ assert self.max_query_len == 1
359
+ assert self.max_prefill_seq_len == 0
360
+
361
+ assert self.block_tables is not None
362
+ assert self.block_tables.shape[0] == num_seqs
363
+
364
+ # Update query lengths. Note that we update only queries and not seqs,
365
+ # since tensors may be padded due to captured cuda graph batch size
366
+ for i in range(num_queries):
367
+ self.seq_lens[i] += 1
368
+ self.max_decode_seq_len = max(self.seq_lens)
369
+ if enable_custom_op():
370
+ # Advance a step on NPU for existing inputs for a multi-step runner if custom ops are enabled
371
+ torch.ops._C.advance_step_flashattn_ascendc(
372
+ num_seqs=num_seqs,
373
+ num_queries=num_queries,
374
+ block_size=block_size,
375
+ input_tokens=model_input.input_tokens,
376
+ sampled_token_ids=sampled_token_ids,
377
+ input_positions=model_input.input_positions,
378
+ seq_lens=self.seq_lens_tensor,
379
+ slot_mapping=self.slot_mapping,
380
+ block_tables=self.block_tables)
381
+ else:
382
+ # use traditional Pytorch method for updating these tensors.
383
+ # update input_tokens
384
+ sampled_token_ids_list = sampled_token_ids[:
385
+ num_queries].squeeze( # type: ignore
386
+ -1)
387
+ model_input.input_tokens[:
388
+ num_queries] = sampled_token_ids_list # type: ignore
389
+
390
+ # get seq_lens and input_positions
391
+ seq_lens = self.seq_lens_tensor[:num_queries]
392
+ next_seq_lens = seq_lens + 1
393
+ next_input_pos = next_seq_lens - 1
394
+
395
+ # update seq_lens and input_positions
396
+ self.seq_lens_tensor[:num_queries] = next_seq_lens
397
+ model_input.input_positions[:
398
+ num_queries] = next_input_pos # type: ignore
399
+
400
+ # Compute block index and offset
401
+ block_idx = next_input_pos // block_size
402
+ block_offset = next_input_pos % block_size
403
+
404
+ current_block_table = self.block_tables.gather(
405
+ 1, block_idx.unsqueeze(-1)).squeeze(-1)
406
+ slot_num = current_block_table * block_size + block_offset
407
+
408
+ # update slot_mapping
409
+ self.slot_mapping[:num_queries] = slot_num
410
+
411
+
412
+ class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]):
413
+
414
+ _attn_mask_builder = None # noqa
415
+
416
+ def __init__(self, input_builder: "ModelInputForNPUBuilder"):
417
+ self.input_builder = input_builder
418
+ self.runner = input_builder.runner
419
+ self.sliding_window = input_builder.sliding_window
420
+ self.block_size = input_builder.block_size
421
+
422
+ self.attn_mask = None
423
+ self.compress_mask = None
424
+ self.chunk_mask = None
425
+ if AscendMetadataBuilder._attn_mask_builder is None:
426
+ AscendMetadataBuilder._attn_mask_builder = AttentionMaskBuilder(
427
+ 128, self.input_builder.runner.model_config.dtype)
428
+
429
+ def _add_seq_group(
430
+ self, inter_data: ModelInputForNPUBuilder.InterDataForSeqGroup,
431
+ chunked_prefill_enabled: bool):
432
+ """Add a sequence group to the metadata. Specifically update/append
433
+ 1. context length.
434
+ 2. block table.
435
+ 3. slot mapping.
436
+ """
437
+ is_prompt = inter_data.is_prompt
438
+ block_tables = inter_data.block_tables
439
+
440
+ for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
441
+ curr_sliding_window_block) in zip(
442
+ inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
443
+ inter_data.orig_seq_lens, inter_data.seq_lens,
444
+ inter_data.query_lens, inter_data.context_lens,
445
+ inter_data.curr_sliding_window_blocks):
446
+ self.context_lens.append(context_len)
447
+ if is_prompt:
448
+ self.num_prefills += 1
449
+ self.num_prefill_tokens += token_len
450
+ self.prefill_seq_lens.append(seq_len)
451
+ else:
452
+ self.num_decode_tokens += query_len
453
+ self.curr_seq_lens.append(curr_seq_len)
454
+
455
+ # Compute block table.
456
+ # TODO(sang): Combine chunked prefill and prefix caching by
457
+ # only allowing multiple of block_size chunk size.
458
+ # NOTE: This only works for oooooooxxx style attention.
459
+ block_table: List[int] = []
460
+ prefix_cache_hit = any([
461
+ inter_data.prefix_cache_hit
462
+ for inter_data in self.input_builder.inter_data_list
463
+ ])
464
+ if prefix_cache_hit:
465
+ # NOTE(woosuk): For flash-attn, the block table should
466
+ # include the entries for the incoming prefill tokens.
467
+ if block_tables is not None:
468
+ block_table = block_tables[seq_id]
469
+ elif ((chunked_prefill_enabled or not is_prompt)
470
+ and block_tables is not None):
471
+ if curr_sliding_window_block == 0:
472
+ block_table = block_tables[seq_id]
473
+ else:
474
+ block_table = block_tables[seq_id][
475
+ -curr_sliding_window_block:]
476
+ self.block_tables.append(block_table)
477
+
478
+ # Compute slot mapping.
479
+ is_profile_run = is_block_tables_empty(block_tables)
480
+ start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
481
+ context_len,
482
+ self.sliding_window)
483
+ compute_slot_mapping(
484
+ is_profile_run,
485
+ self.slot_mapping,
486
+ seq_id,
487
+ seq_len,
488
+ context_len,
489
+ start_idx,
490
+ self.block_size,
491
+ inter_data.block_tables,
492
+ )
493
+
494
+ def _get_graph_runner_block_tables(
495
+ self, num_seqs: int,
496
+ block_tables: List[List[int]]) -> torch.Tensor:
497
+ # The shape of graph_block_tables is
498
+ # [max batch size, max context len // block size].
499
+
500
+ max_batch_size, max_blocks = self.runner.graph_block_tables.shape
501
+ assert max_batch_size >= num_seqs
502
+
503
+ graph_block_tables = self.runner.graph_block_tables # [:num_seqs]
504
+ for i, block_table in enumerate(block_tables):
505
+ if block_table:
506
+ num_blocks = len(block_table)
507
+ if num_blocks <= max_blocks:
508
+ graph_block_tables[i, :num_blocks] = block_table
509
+ else:
510
+ graph_block_tables[
511
+ i, :max_blocks] = block_table[:max_blocks]
512
+
513
+ return torch.from_numpy(graph_block_tables).to(
514
+ device=self.runner.device, non_blocking=True)
515
+
516
+ def build(
517
+ self,
518
+ seq_lens: List[int],
519
+ query_lens: List[int],
520
+ graph_pad_size: int,
521
+ ):
522
+ """Build attention metadata with on-device tensors.
523
+
524
+ Args:
525
+ seq_lens: The maybe padded sequence lengths of the input sequences.
526
+ query_lens: The query lengths of the input sequences.
527
+ """
528
+ for inter_data in self.input_builder.inter_data_list:
529
+ self._add_seq_group(inter_data,
530
+ self.input_builder.chunked_prefill_enabled)
531
+
532
+ device = self.runner.device
533
+ dtype = self.runner.model_config.dtype
534
+ use_npu_graph = graph_pad_size != -1
535
+
536
+ max_query_len = max(query_lens)
537
+ max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
538
+ max_decode_seq_len = max(self.curr_seq_lens, default=0)
539
+ max_seq_len = max(max_prefill_seq_len, max_decode_seq_len)
540
+ num_decode_tokens = self.num_decode_tokens
541
+
542
+ if self.num_prefills == 0 and use_npu_graph:
543
+ num_seqs = len(seq_lens)
544
+ self.slot_mapping.extend([PAD_SLOT_ID] * graph_pad_size)
545
+ self.block_tables.extend([[]] * graph_pad_size)
546
+ block_tables = self._get_graph_runner_block_tables(
547
+ num_seqs, self.block_tables)
548
+ else:
549
+ block_tables = make_tensor_with_pad(
550
+ self.block_tables,
551
+ pad=0,
552
+ dtype=torch.int32,
553
+ device=device,
554
+ )
555
+
556
+ if self.num_prefills > 0:
557
+ if block_tables is None or block_tables.numel() == 0:
558
+ # normal mask
559
+ self.attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore
560
+ max_prefill_seq_len, dtype, device)
561
+ if is_310p():
562
+ mask_nz = nd_to_nz_2d(self.attn_mask)
563
+ mask_nz = torch_npu.npu_format_cast(
564
+ mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ)
565
+ self.attn_mask = mask_nz
566
+ elif self.num_decode_tokens == 0 and not self.input_builder.chunked_prefill_enabled:
567
+ # compress mask for prefix cache
568
+ self.compress_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore
569
+ 128, dtype, device)
570
+ else:
571
+ # chunk_mask for chunk prefill
572
+ attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore
573
+ max_seq_len, dtype, device)
574
+ if attn_mask.numel() > 1 and attn_mask[0][1] > 0:
575
+ # Do not use in-place multiplication to avoid modifying `attn_mask_cache`!
576
+ attn_mask = attn_mask * -10000
577
+ chunk_mask_list = []
578
+ for i, seq_len in enumerate(seq_lens):
579
+ context_len = self.context_lens[i]
580
+ chunk_mask_list.append(attn_mask[context_len:seq_len])
581
+ self.chunk_mask = torch.cat(chunk_mask_list, 0)
582
+ else:
583
+ self.attn_mask = None
584
+ self.compress_mask = None
585
+ self.chunk_mask = None
586
+
587
+ assert max_query_len > 0, "query_lens: {}".format(query_lens)
588
+
589
+ assert device is not None
590
+ slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.int32,
591
+ device, self.runner.pin_memory)
592
+ seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
593
+ self.runner.pin_memory)
594
+ placeholder_index_maps = {
595
+ modality: placeholder_map.index_map()
596
+ for modality, placeholder_map in
597
+ self.multimodal_placeholder_maps.items()
598
+ }
599
+
600
+ return AscendMetadata(
601
+ num_prefills=self.num_prefills,
602
+ slot_mapping=slot_mapping_tensor,
603
+ num_prefill_tokens=self.num_prefill_tokens,
604
+ num_decode_tokens=num_decode_tokens,
605
+ seq_lens=seq_lens,
606
+ multi_modal_placeholder_index_maps=placeholder_index_maps,
607
+ enable_kv_scales_calculation=True,
608
+ seq_lens_tensor=seq_lens_tensor,
609
+ query_lens=query_lens,
610
+ max_query_len=max_query_len,
611
+ max_prefill_seq_len=max_prefill_seq_len,
612
+ max_decode_seq_len=max_decode_seq_len,
613
+ block_tables=block_tables,
614
+ attn_mask=self.attn_mask,
615
+ compress_mask=self.compress_mask,
616
+ chunk_mask=self.chunk_mask,
617
+ chunked_prefill_enabled=self.input_builder.chunked_prefill_enabled,
618
+ )
619
+
620
+
621
+ class AscendAttentionBackendImpl(AttentionImpl):
622
+
623
+ def __init__(
624
+ self,
625
+ num_heads: int,
626
+ head_size: int,
627
+ scale: float,
628
+ num_kv_heads: int,
629
+ alibi_slopes: Optional[List[float]],
630
+ sliding_window: Optional[int],
631
+ kv_cache_dtype: str,
632
+ blocksparse_params: Optional[Dict[str, Any]] = None,
633
+ logits_soft_cap: Optional[float] = None,
634
+ attn_type: str = AttentionType.DECODER,
635
+ kv_sharing_target_layer_name: Optional[str] = None,
636
+ use_irope: bool = False,
637
+ ) -> None:
638
+ self.num_heads = num_heads
639
+ self.head_size = head_size
640
+ self.scale = float(scale)
641
+ self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
642
+ self.hidden_size = self.num_heads * self.head_size
643
+ self.kv_cache_dtype = kv_cache_dtype
644
+ self.sliding_window = sliding_window
645
+ if alibi_slopes is not None:
646
+ alibi_slopes = torch.tensor(alibi_slopes,
647
+ dtype=torch.float32,
648
+ device="npu")
649
+ self.alibi_slopes = alibi_slopes
650
+ self.attn_type = attn_type
651
+
652
+ assert self.num_heads % self.num_kv_heads == 0
653
+ self.num_queries_per_kv = self.num_heads // self.num_kv_heads
654
+ self.seq_len_cpu_tensor = None
655
+ self.query_len_cpu_tensor = None
656
+ self.key_cache = None
657
+ self.value_cache = None
658
+
659
+ def forward(
660
+ self,
661
+ layer: AttentionLayer,
662
+ query: torch.Tensor,
663
+ key: torch.Tensor,
664
+ value: torch.Tensor,
665
+ kv_cache: torch.Tensor,
666
+ attn_metadata: AscendMetadata,
667
+ attn_type: str = AttentionType.DECODER,
668
+ output: Optional[torch.Tensor] = None,
669
+ ) -> torch.Tensor:
670
+ """Forward pass with Ascend attention.
671
+ Args:
672
+ query: shape = [num_tokens, num_heads * head_size]
673
+ num_tokens = batch_size * seq_len
674
+ key: shape = [num_tokens, num_kv_heads * head_size]
675
+ value: shape = [num_tokens, num_kv_heads * head_size]
676
+ kv_cache: shape = [2, num_blocks, block_size,
677
+ num_kv_heads, head_size]
678
+ key_cache = [num_blocks, block_size,
679
+ num_kv_heads, head_size]
680
+ value_cache = [num_blocks, block_size,
681
+ num_kv_heads, head_size]
682
+ attn_metadata: Metadata for attention.
683
+ Returns:
684
+ shape = [batch_size, seq_len * num_heads * head_size]
685
+ """
686
+ assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
687
+ # View q k v to BSH.
688
+ num_tokens = query.shape[0]
689
+ query = query.view(-1, self.num_heads, self.head_size)
690
+ key = key.view(-1, self.num_kv_heads, self.head_size)
691
+ value = value.view(-1, self.num_kv_heads, self.head_size)
692
+ # TODO: Remove this contiguous in the future.
693
+ value = value.contiguous()
694
+ attn_type = self.attn_type
695
+
696
+ output = torch.empty(num_tokens,
697
+ self.num_heads,
698
+ self.head_size,
699
+ dtype=query.dtype,
700
+ device=query.device)
701
+
702
+ if kv_cache.numel() > 0:
703
+ if self.key_cache is None:
704
+ self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
705
+ slots = attn_metadata.slot_mapping
706
+
707
+ if hasattr(layer, 'quant_method'):
708
+ isPrefill = True if attn_metadata.num_prefills > 0 else False
709
+ if isPrefill:
710
+ assert attn_metadata.prefill_metadata is not None
711
+ self.seq_lens_tensor_cpu = torch.from_numpy(
712
+ np.array(attn_metadata.prefill_metadata.seq_lens).astype(
713
+ np.int32))
714
+ else:
715
+ assert attn_metadata.decode_metadata is not None
716
+ self.seq_lens_tensor_cpu = torch.from_numpy(
717
+ np.array(attn_metadata.decode_metadata.seq_lens).astype(
718
+ np.int32))
719
+ block_tables = attn_metadata.decode_metadata.block_tables if attn_metadata.decode_metadata else None
720
+ # Details of kv_cache arrangement in attention quantization
721
+ # are implemented by quant_method.
722
+ layer.quant_method.apply(
723
+ layer,
724
+ query,
725
+ key,
726
+ value,
727
+ self.key_cache,
728
+ self.value_cache,
729
+ self.scale,
730
+ block_tables,
731
+ isPrefill,
732
+ attn_metadata,
733
+ output,
734
+ seq_lens_tensor_cpu=self.seq_lens_tensor_cpu)
735
+ else:
736
+ if self.key_cache is not None:
737
+ torch_npu._npu_reshape_and_cache(key=key,
738
+ value=value,
739
+ key_cache=self.key_cache,
740
+ value_cache=self.value_cache,
741
+ slot_indices=slots)
742
+
743
+ if attn_metadata.num_prefills > 0:
744
+ # Prefix cache disabled and chunk prefill disabled or no prefix cache hit
745
+ if (attn_metadata.block_tables is None
746
+ or attn_metadata.block_tables.numel() == 0):
747
+ if attn_type == AttentionType.ENCODER_ONLY:
748
+ # TODO: change to use torch_npu encoder attention op, instead
749
+ # of torch sdpa
750
+ query = query.movedim(0, query.dim() - 2)
751
+ key = key.movedim(0, key.dim() - 2)
752
+ value = value.movedim(0, value.dim() - 2)
753
+
754
+ causal_attn = (attn_type == AttentionType.DECODER)
755
+ if attn_metadata.seq_lens is not None:
756
+ seq_lens_q = seq_lens_kv = attn_metadata.seq_lens
757
+ attn_masks = [None] * len(seq_lens_q)
758
+ start_q, start_kv = 0, 0
759
+ for seq_len_q, seq_len_kv, mask in zip(
760
+ seq_lens_q, seq_lens_kv, attn_masks):
761
+ end_q = start_q + seq_len_q
762
+ end_kv = start_kv + seq_len_kv
763
+ sub_out = scaled_dot_product_attention(
764
+ query[None, :, start_q:end_q, :],
765
+ key[None, :, start_kv:end_kv, :],
766
+ value[None, :, start_kv:end_kv, :],
767
+ attn_mask=mask,
768
+ dropout_p=0.0,
769
+ is_causal=causal_attn and mask is None,
770
+ scale=self.scale).squeeze(0).movedim(
771
+ query.dim() - 2, 0)
772
+ output[start_q:end_q, :, :] = sub_out
773
+ start_q, start_kv = end_q, end_kv
774
+ else:
775
+ assert attn_metadata.attn_mask is not None
776
+ mask = attn_metadata.attn_mask
777
+ assert attn_metadata.prefill_metadata is not None
778
+ self.seq_lens_tensor_cpu = torch.from_numpy(
779
+ np.array(attn_metadata.prefill_metadata.seq_lens).
780
+ astype(np.int32))
781
+ if is_310p():
782
+ # align q k v output tensors
783
+ query = aligned_16(query)
784
+ key = aligned_16(key)
785
+ value = aligned_16(value)
786
+ output = aligned_16(output)
787
+
788
+ # do reformat in case of broadcasted tensors
789
+ mask = mask.repeat(
790
+ self.seq_lens_tensor_cpu.size(0), 1, 1, 1)
791
+ mask = torch_npu.npu_format_cast(
792
+ mask.contiguous(), ACL_FORMAT_FRACTAL_NZ)
793
+ torch_npu._npu_flash_attention(
794
+ query=query,
795
+ key=key,
796
+ value=value,
797
+ mask=mask,
798
+ seq_len=self.seq_lens_tensor_cpu,
799
+ scale_value=self.scale,
800
+ num_heads=self.num_heads,
801
+ num_kv_heads=self.num_kv_heads,
802
+ out=output)
803
+ output = output[:num_tokens, :, :]
804
+ # Prefix cache only and cache hit
805
+ elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled:
806
+ assert kv_cache is not None
807
+ assert attn_metadata.prefill_metadata is not None
808
+ self.seq_lens_tensor_cpu = torch.from_numpy(
809
+ np.array(
810
+ attn_metadata.prefill_metadata.seq_lens).astype(
811
+ np.int32))
812
+ self.query_lens_tensor_cpu = torch.from_numpy(
813
+ np.array(
814
+ attn_metadata.prefill_metadata.query_lens).astype(
815
+ np.int32))
816
+ block_tables = attn_metadata.prefill_metadata.block_tables
817
+ assert attn_metadata.compress_mask is not None
818
+ compress_mask = attn_metadata.compress_mask
819
+ torch_npu._npu_flash_attention_qlens(
820
+ query=query,
821
+ key_cache=self.key_cache,
822
+ value_cache=self.value_cache,
823
+ block_table=block_tables,
824
+ mask=compress_mask,
825
+ seq_len=self.query_lens_tensor_cpu,
826
+ context_lens=self.seq_lens_tensor_cpu,
827
+ num_kv_heads=self.num_kv_heads,
828
+ num_heads=self.num_heads,
829
+ scale_value=self.scale,
830
+ out=output)
831
+ # Splitfuse
832
+ else:
833
+ assert kv_cache is not None
834
+ self.seq_lens_tensor_cpu = torch.from_numpy(
835
+ np.array(attn_metadata.seq_lens).astype(np.int32))
836
+ self.query_lens_tensor_cpu = torch.from_numpy(
837
+ np.array(attn_metadata.query_lens).astype(np.int32))
838
+ block_tables = attn_metadata.block_tables
839
+ assert attn_metadata.chunk_mask is not None
840
+ chunk_mask = attn_metadata.chunk_mask
841
+ torch_npu._npu_paged_attention_splitfuse(
842
+ query=query,
843
+ key_cache=self.key_cache,
844
+ value_cache=self.value_cache,
845
+ block_table=block_tables,
846
+ context_lens=self.seq_lens_tensor_cpu,
847
+ mask=chunk_mask,
848
+ seq_len=self.query_lens_tensor_cpu,
849
+ num_kv_heads=self.num_kv_heads,
850
+ num_heads=self.num_heads,
851
+ scale_value=self.scale,
852
+ out=output)
853
+ # Decode only
854
+ else:
855
+ assert self.key_cache is not None
856
+ assert self.value_cache is not None
857
+ assert attn_metadata.decode_metadata is not None
858
+ self.seq_lens_tensor_cpu = torch.from_numpy(
859
+ np.array(attn_metadata.decode_metadata.seq_lens).astype(
860
+ np.int32))
861
+ if is_310p():
862
+ # seq_lens_tensor needs to be transferred to the device for 310P
863
+ self.seq_lens_tensor_cpu = self.seq_lens_tensor_cpu.to(
864
+ device=self.key_cache.device)
865
+ block_tables = attn_metadata.decode_metadata.block_tables
866
+ torch_npu._npu_paged_attention(
867
+ query=query,
868
+ key_cache=self.key_cache,
869
+ value_cache=self.value_cache,
870
+ num_kv_heads=self.num_kv_heads,
871
+ num_heads=self.num_heads,
872
+ scale_value=self.scale,
873
+ block_table=block_tables,
874
+ context_lens=self.seq_lens_tensor_cpu,
875
+ out=output)
876
+
877
+ return output.view(num_tokens, self.hidden_size)
878
+
879
+
880
+ class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
881
+
882
+ def __init__(
883
+ self,
884
+ num_heads: int,
885
+ head_size: int,
886
+ scale: float,
887
+ num_kv_heads: int,
888
+ alibi_slopes: Optional[List[float]],
889
+ sliding_window: Optional[int],
890
+ kv_cache_dtype: str,
891
+ blocksparse_params: Optional[Dict[str, Any]] = None,
892
+ logits_soft_cap: Optional[float] = None,
893
+ attn_type: str = AttentionType.DECODER,
894
+ kv_sharing_target_layer_name: Optional[str] = None,
895
+ **extra_impl_args,
896
+ ) -> None:
897
+ self.num_heads = num_heads
898
+ self.head_size = head_size
899
+ self.scale = float(scale)
900
+ self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
901
+ self.hidden_size = self.num_heads * self.head_size
902
+ self.kv_cache_dtype = kv_cache_dtype
903
+ self.sliding_window = sliding_window
904
+ if alibi_slopes is not None:
905
+ alibi_slopes = torch.tensor(alibi_slopes,
906
+ dtype=torch.float32,
907
+ device="npu")
908
+ self.alibi_slopes = alibi_slopes
909
+ self.attn_type = attn_type
910
+
911
+ assert self.num_heads % self.num_kv_heads == 0
912
+ self.num_queries_per_kv = self.num_heads // self.num_kv_heads
913
+ self.seq_len_cpu_tensor = None
914
+
915
+ # MLA Args
916
+ self.q_lora_rank = extra_impl_args['q_lora_rank']
917
+ self.kv_lora_rank = extra_impl_args['kv_lora_rank']
918
+ self.qk_nope_head_dim = extra_impl_args['qk_nope_head_dim']
919
+ self.qk_rope_head_dim = extra_impl_args['qk_rope_head_dim']
920
+ self.qk_head_dim = extra_impl_args['qk_head_dim']
921
+ self.v_head_dim = extra_impl_args['v_head_dim']
922
+ self.rotary_emb = extra_impl_args['rotary_emb']
923
+ self.q_proj = extra_impl_args['q_proj']
924
+ self.kv_b_proj = extra_impl_args['kv_b_proj']
925
+ self.o_proj = extra_impl_args['o_proj']
926
+ self.kv_a_proj_with_mqa = extra_impl_args.get('kv_a_proj_with_mqa',
927
+ None)
928
+ self.kv_a_layernorm = extra_impl_args.get('kv_a_layernorm', None)
929
+ self.k_pe_cache = None
930
+ self.k_nope_cache = None
931
+ self.w_kc = None
932
+ self.w_vc = None
933
+
934
+ ascend_config = get_ascend_config()
935
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
936
+
937
+
938
+ def exec_kv(
939
+ self,
940
+ hidden_states: torch.Tensor,
941
+ cos: torch.Tensor,
942
+ sin: torch.Tensor,
943
+ kv_cache: Tuple,
944
+ slots: torch.Tensor,
945
+ ):
946
+ B = hidden_states.shape[0]
947
+ N = self.num_kv_heads
948
+ S = 1
949
+ kv = self.kv_a_proj_with_mqa(hidden_states)[0]
950
+ # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
951
+ kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
952
+
953
+ k_pe, k_nope, _, _ = torch.ops.npu_inference.npu_kv_rmsnorm_rope_cache(
954
+ kv,
955
+ self.kv_a_layernorm.weight,
956
+ cos,
957
+ sin,
958
+ slots.to(torch.int64),
959
+ kv_cache[1],
960
+ kv_cache[0],
961
+ epsilon=self.kv_a_layernorm.variance_epsilon,
962
+ cache_mode="PA",
963
+ )
964
+
965
+ return k_pe, k_nope
966
+
967
+ def apply_rotary_emb(
968
+ self,
969
+ x: torch.Tensor,
970
+ cos: torch.Tensor,
971
+ sin: torch.Tensor,
972
+ is_neox_style: bool,
973
+ ) -> torch.Tensor:
974
+ """
975
+ Args:
976
+ x: [num_tokens, num_heads, head_size]
977
+ cos: [num_tokens, head_size // 2]
978
+ sin: [num_tokens, head_size // 2]
979
+ is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
980
+ positional embeddings.
981
+ """
982
+ cos = cos.unsqueeze(-2).to(x.dtype)
983
+ sin = sin.unsqueeze(-2).to(x.dtype)
984
+ if is_neox_style:
985
+ x1, x2 = torch.chunk(x, 2, dim=-1)
986
+ else:
987
+ x1 = x[..., ::2]
988
+ x2 = x[..., 1::2]
989
+ o1 = x1 * cos - x2 * sin
990
+ o2 = x2 * cos + x1 * sin
991
+ if is_neox_style:
992
+ return torch.cat((o1, o2), dim=-1)
993
+ else:
994
+ return torch.stack((o1, o2), dim=-1).flatten(-2)
995
+
996
+ def rope_single(
997
+ self,
998
+ x: torch.Tensor,
999
+ cos: torch.Tensor,
1000
+ sin: torch.Tensor,
1001
+ ) -> torch.Tensor:
1002
+ B, N, D = x.shape
1003
+ S = 1
1004
+ x = x.view(B, N, S, D)
1005
+ x = torch.ops.npu_inference.npu_interleave_rope(x, cos, sin)
1006
+ return x.view(B, N, D)
1007
+
1008
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
1009
+ if self.w_kc is None or self.w_vc is None:
1010
+ kv_b_proj_weight = self.kv_b_proj.weight.reshape(
1011
+ self.num_heads, self.qk_nope_head_dim + self.v_head_dim,
1012
+ self.kv_lora_rank)
1013
+ self.w_kc = kv_b_proj_weight[:, :self.
1014
+ qk_nope_head_dim, :].contiguous()
1015
+ self.w_vc = kv_b_proj_weight[:,
1016
+ self.qk_nope_head_dim:, :].transpose(
1017
+ 1, 2).contiguous()
1018
+
1019
+ def forward(
1020
+ self,
1021
+ layer: AttentionLayer,
1022
+ hidden_states_or_q_c: torch.Tensor,
1023
+ hidden_states_or_kv_c_normed: torch.Tensor,
1024
+ k_pe: torch.Tensor,
1025
+ kv_cache: torch.Tensor,
1026
+ attn_metadata: AscendMetadata,
1027
+ attn_type: str = AttentionType.DECODER,
1028
+ output: Optional[torch.Tensor] = None,
1029
+ ) -> torch.Tensor:
1030
+ """Forward pass with Ascend attention.
1031
+ Args:
1032
+ hidden_states_or_q_c: shape = [num_tokens, num_heads * head_size]
1033
+ num_tokens = batch_size * seq_len
1034
+ hidden_states_or_kv_c_normed: shape = [num_tokens, num_kv_heads * head_size]
1035
+ k_pe: shape = [num_tokens, num_kv_heads * head_size]
1036
+ kv_cache: shape = [1, num_blocks, block_size,
1037
+ num_kv_heads * head_size]
1038
+ attn_metadata: Metadata for attention.
1039
+ Returns:
1040
+ shape = [batch_size, seq_len * num_heads * head_size]
1041
+ """
1042
+ assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
1043
+ attn_type = self.attn_type
1044
+ if attn_type != AttentionType.DECODER:
1045
+ raise NotImplementedError("Encoder self-attention and "
1046
+ "encoder/decoder cross-attention "
1047
+ "are not implemented for "
1048
+ "AscendMLAAttentionBackendImpl")
1049
+
1050
+ if attn_metadata is None:
1051
+ # for profile run
1052
+ return hidden_states_or_q_c
1053
+
1054
+ num_tokens = hidden_states_or_q_c.shape[0]
1055
+ q = self.q_proj(hidden_states_or_q_c)[0].view(-1, self.num_heads,
1056
+ self.qk_head_dim)
1057
+ q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
1058
+ dim=-1)
1059
+ if k_pe is None and attn_metadata.decode_metadata:
1060
+ seq_len = self.rotary_emb.max_position_embeddings
1061
+
1062
+ cos = self.rotary_emb.cos_cached[:seq_len].to(dtype=q_pe.dtype)
1063
+ sin = self.rotary_emb.sin_cached[:seq_len].to(dtype=q_pe.dtype)
1064
+ cos = cos[attn_metadata.input_positions]
1065
+ sin = sin[attn_metadata.input_positions]
1066
+ cos = cos[:, None, None, :]
1067
+ sin = sin[:, None, None, :]
1068
+
1069
+ q_pe = self.rope_single(q_pe, cos, sin)
1070
+ k_pe, k_nope = self.exec_kv(hidden_states_or_kv_c_normed, cos, sin,
1071
+ kv_cache, attn_metadata.slot_mapping)
1072
+ else:
1073
+ if k_pe is None:
1074
+ # NOTE: k_pe is None when graph mode enabled
1075
+ kv_c, k_pe = self.kv_a_proj_with_mqa(
1076
+ hidden_states_or_kv_c_normed)[0].split(
1077
+ [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
1078
+ kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
1079
+ else:
1080
+ kv_c_normed = hidden_states_or_kv_c_normed
1081
+ k_pe = k_pe.view(num_tokens, self.num_kv_heads, -1)
1082
+ if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding':
1083
+ # NOTE: Used when no rope scaling is specified
1084
+ ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape
1085
+ q_pe = q_pe.reshape(num_tokens, -1)
1086
+ k_pe = k_pe.reshape(num_tokens, -1)
1087
+ q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions,
1088
+ q_pe, k_pe)
1089
+ q_pe = q_pe.view(ori_q_pe_shape)
1090
+ k_pe = k_pe.view(ori_k_pe_shape)
1091
+ else:
1092
+ q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions,
1093
+ q_pe, k_pe)
1094
+
1095
+ if attn_metadata.num_prefills > 0:
1096
+ kv = self.kv_b_proj(kv_c_normed)[0].view(num_tokens,
1097
+ self.num_heads, -1)
1098
+ k_nope, value = kv.split([self.qk_nope_head_dim, self.v_head_dim],
1099
+ dim=-1)
1100
+ else:
1101
+ q_nope_t = torch.transpose(q_nope, 0, 1)
1102
+ q_nope_out = torch.bmm(q_nope_t, self.w_kc)
1103
+ q_nope = torch.transpose(q_nope_out, 0, 1)
1104
+
1105
+ query = torch.cat([q_nope, q_pe], dim=-1).view(num_tokens,
1106
+ self.num_heads, -1)
1107
+
1108
+ # TODO: Replace the env with more flexible expressions
1109
+ if self.torchair_graph_enabled:
1110
+ if len(kv_cache) > 0 and kv_cache[0].numel(
1111
+ ) > 0 and attn_metadata.num_prefills > 0:
1112
+ slots = attn_metadata.slot_mapping
1113
+ # NOTE: Separate the kv cache in advance to avoid OOM or other issues
1114
+ torch_npu._npu_reshape_and_cache(key=kv_c_normed.view(
1115
+ num_tokens, self.num_kv_heads, -1),
1116
+ value=k_pe,
1117
+ key_cache=kv_cache[0],
1118
+ value_cache=kv_cache[1],
1119
+ slot_indices=slots)
1120
+ elif kv_cache.numel() > 0:
1121
+ # TODO: replace this naive implementation with a fusion kernel
1122
+ concat_and_cache_mla(kv_c_normed, k_pe, kv_cache,
1123
+ attn_metadata.slot_mapping)
1124
+
1125
+ if attn_metadata.num_prefills > 0:
1126
+ attn_output = torch.empty(num_tokens,
1127
+ self.num_heads,
1128
+ self.v_head_dim,
1129
+ dtype=query.dtype,
1130
+ device=query.device)
1131
+ if (attn_metadata.block_tables is None
1132
+ or attn_metadata.block_tables.numel() == 0):
1133
+ assert attn_metadata.attn_mask is not None
1134
+ assert attn_metadata.prefill_metadata is not None
1135
+ assert attn_metadata.prefill_metadata.seq_lens is not None
1136
+ mask = attn_metadata.attn_mask
1137
+ self.seq_lens_tensor_cpu = torch.from_numpy(
1138
+ np.array(attn_metadata.prefill_metadata.seq_lens).astype(
1139
+ np.int32))
1140
+ k_pe = k_pe.repeat(1, self.num_heads, 1)
1141
+ key = torch.cat(
1142
+ [k_nope.view(num_tokens, self.num_heads, -1), k_pe], dim=2)
1143
+ torch_npu._npu_flash_attention(
1144
+ query=query,
1145
+ key=key,
1146
+ value=value,
1147
+ mask=mask,
1148
+ seq_len=self.seq_lens_tensor_cpu,
1149
+ scale_value=self.scale,
1150
+ num_heads=self.num_heads,
1151
+ num_kv_heads=self.num_heads,
1152
+ out=attn_output)
1153
+ else:
1154
+ # TODO: Will support prefix cache and chunked prefill soon.
1155
+ raise RuntimeError(
1156
+ "Prefix cache and chunked prefill are currently not supported."
1157
+ )
1158
+ elif attn_metadata.decode_metadata:
1159
+ assert kv_cache is not None
1160
+ if self.torchair_graph_enabled:
1161
+ # shape of query for npu graph mode should be:
1162
+ # [bs, num_heads_per_rank, seq_len, dim]
1163
+ q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1)
1164
+ q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
1165
+ # shape of knope/k_pe for npu graph mode should be:
1166
+ # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim]
1167
+ block_size = kv_cache[0].shape[1]
1168
+ k_nope = k_nope.view(-1, self.num_kv_heads, block_size,
1169
+ self.kv_lora_rank)
1170
+ k_pe = k_pe.view(-1, self.num_kv_heads, block_size,
1171
+ self.qk_rope_head_dim)
1172
+ attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
1173
+ q_nope,
1174
+ k_nope,
1175
+ k_nope,
1176
+ query_rope=q_pe,
1177
+ key_rope=k_pe,
1178
+ num_heads=self.num_heads,
1179
+ num_key_value_heads=self.num_kv_heads,
1180
+ input_layout="BNSD",
1181
+ atten_mask=attn_metadata.attn_mask,
1182
+ scale=self.scale,
1183
+ antiquant_mode=0,
1184
+ antiquant_scale=None,
1185
+ block_table=attn_metadata.block_tables,
1186
+ block_size=block_size,
1187
+ actual_seq_lengths_kv=attn_metadata.seq_lens,
1188
+ )
1189
+ attn_output = attn_output.view(num_tokens, -1,
1190
+ self.kv_lora_rank).transpose(
1191
+ 0, 1)
1192
+ attn_output = torch.bmm(attn_output, self.w_vc).transpose(0, 1)
1193
+ else:
1194
+ # if torch.empty is used here, the preemptive scheduling case of
1195
+ # test_mtp_correctness.py will fail to run.
1196
+ attn_output = torch.randn(
1197
+ [num_tokens, self.num_heads, self.kv_lora_rank],
1198
+ dtype=query.dtype,
1199
+ device=query.device)
1200
+ self.seq_lens_tensor_cpu = torch.from_numpy(
1201
+ np.array(attn_metadata.decode_metadata.seq_lens).astype(
1202
+ np.int32))
1203
+ block_tables = attn_metadata.decode_metadata.block_tables
1204
+ torch_npu._npu_paged_attention_mla(
1205
+ query=query,
1206
+ key_cache=kv_cache,
1207
+ num_kv_heads=self.num_kv_heads,
1208
+ num_heads=self.num_heads,
1209
+ scale_value=self.scale,
1210
+ block_table=block_tables,
1211
+ context_lens=self.seq_lens_tensor_cpu,
1212
+ mla_vheadsize=self.kv_lora_rank,
1213
+ out=attn_output)
1214
+ attn_output_t = torch.transpose(attn_output, 0, 1)
1215
+ attn_output_t = torch.bmm(attn_output_t, self.w_vc)
1216
+ attn_output = torch.transpose(attn_output_t, 0, 1)
1217
+
1218
+ output, _ = self.o_proj(attn_output.reshape(num_tokens, -1))
1219
+
1220
+ return output
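The decode branch above relies on the MLA weight-absorption trick prepared in process_weights_after_loading: the two halves of kv_b_proj (w_kc and w_vc) are folded into the query and output sides so attention can run directly on the cached latent KV. A minimal standalone sketch of that idea, with toy dimensions and random tensors standing in for the real projections and attention output (illustration only, not the vllm-ascend API):

import torch

num_heads, qk_nope_dim, v_dim, kv_lora_rank, num_tokens = 4, 16, 16, 32, 3
w_kc = torch.randn(num_heads, qk_nope_dim, kv_lora_rank)  # query-side absorbed weight
w_vc = torch.randn(num_heads, kv_lora_rank, v_dim)        # output-side absorbed weight

# Fold the key up-projection into the query: (N, T, P) x (N, P, L) -> (N, T, L)
q_nope = torch.randn(num_tokens, num_heads, qk_nope_dim)
q_latent = torch.bmm(q_nope.transpose(0, 1), w_kc).transpose(0, 1)

# Attention then runs directly against the cached latent KV (kv_lora_rank wide);
# a random tensor stands in for that attention result here.
attn_latent = torch.randn(num_tokens, num_heads, kv_lora_rank)

# Fold the value up-projection into the output: (N, T, L) x (N, L, V) -> (N, T, V)
out = torch.bmm(attn_latent.transpose(0, 1), w_vc).transpose(0, 1)
print(out.shape)  # torch.Size([3, 4, 16])

The payoff is that the paged KV cache only ever stores the narrow latent vectors, and the per-head up-projections are applied once per query/output rather than per cached token.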
inference/vllm_ascend/attention/mla_v1.py ADDED
@@ -0,0 +1,1224 @@
 
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch_npu
7
+ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
8
+ AttentionMetadata,
9
+ MLAAttentionImpl)
10
+ from vllm.attention.backends.utils import PAD_SLOT_ID
11
+ from vllm.config import get_current_vllm_config
12
+ from vllm.distributed import get_tensor_model_parallel_world_size
13
+ from vllm.model_executor.layers.linear import (LinearBase,
14
+ UnquantizedLinearMethod)
15
+ from vllm.utils import cdiv, round_down
16
+
17
+ from vllm_ascend.ascend_config import get_ascend_config
18
+ from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
19
+ from vllm_ascend.attention.attention_v1 import AscendAttentionState
20
+ from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
21
+ from vllm_ascend.multistream.context import get_multistream_comm_context
22
+ from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
23
+ from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
24
+ from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
25
+ from vllm_ascend.worker.npu_input_batch import InputBatch
26
+
27
+ if TYPE_CHECKING:
28
+ from vllm.v1.core.sched.output import SchedulerOutput
29
+
30
+
31
+ @dataclass
32
+ class CommonAttentionMetadata:
33
+ """
34
+ Attention metadata attributes that can be shared by layers in different KV
35
+ cache groups and thus may have different block tables.
36
+ """
37
+
38
+ query_start_loc: torch.Tensor
39
+ """(batch_size + 1,), the start location of each request in query Tensor"""
40
+ seq_lens: torch.Tensor
41
+ """(batch_size,), the length of each request including both computed tokens
42
+ and newly scheduled tokens"""
43
+
44
+
45
+ class AscendMLABackend(AttentionBackend):
46
+
47
+ accept_output_buffer: bool = True
48
+
49
+ @staticmethod
50
+ def get_name() -> str:
51
+ return "VLLM_ASCEND_MLA"
52
+
53
+ @staticmethod
54
+ def get_metadata_cls() -> type["AttentionMetadata"]:
55
+ return AscendMLAMetadata
56
+
57
+ @staticmethod
58
+ def get_builder_cls():
59
+ return AscendMLAMetadataBuilder
60
+
61
+ @staticmethod
62
+ def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int,
63
+ head_size: int) -> tuple[int, ...]:
64
+ return (num_blocks, block_size, num_kv_heads, head_size)
65
+
66
+ @staticmethod
67
+ def get_impl_cls() -> Type["MLAAttentionImpl"]:
68
+ return AscendMLAImpl
69
+
70
+
71
+ @dataclass
72
+ class AscendMLAPrefillMetadata:
73
+ """ Prefill Specific Metadata for Ascend"""
74
+
75
+ @dataclass
76
+ class ChunkedContextMetadata:
77
+ # New for MLA (compared to FlashAttention)
78
+ # For handling chunked prefill
79
+ cu_seq_lens: torch.Tensor
80
+ starts: torch.Tensor
81
+ seq_tot: list[int]
82
+ max_seq_lens: list[int]
83
+ workspace: torch.Tensor
84
+ chunk_seq_lens: torch.Tensor
85
+
86
+ attn_mask: torch.Tensor
87
+ query_lens: list[int]
88
+ seq_lens: list[int]
89
+ context_lens: torch.Tensor
90
+ input_positions: torch.Tensor
91
+ query_start_loc: torch.Tensor
92
+ block_table: torch.Tensor
93
+ max_query_len: int
94
+ max_seq_lens: int
95
+ chunked_context: Optional[ChunkedContextMetadata] = None
96
+
97
+
98
+ @dataclass
99
+ class AscendMLADecodeMetadata:
100
+ # Input positions for rotary embeddings since for MLA the rotary
101
+ # position embeddings are applied inside the attention backend
102
+ input_positions: torch.Tensor
103
+ block_table: torch.Tensor
104
+ seq_lens: torch.Tensor
105
+ max_seq_lens: int
106
+ seq_lens_list: list[int]
107
+ attn_mask: Optional[torch.Tensor] = None
108
+
109
+
110
+ @dataclass
111
+ class AscendMLAMetadata:
112
+ """Metadata for MLACommon.
113
+
114
+ NOTE: Please read the comment at the top of the file before trying to
115
+ understand this class
116
+ """
117
+ # NOTE(sang): Definition of context_len, query_len, and seq_len.
118
+ # |---------- N-1 iteration --------|
119
+ # |---------------- N iteration ---------------------|
120
+ # |- tokenA -|......................|-- newTokens ---|
121
+ # |---------- context_len ----------|
122
+ # |-------------------- seq_len ---------------------|
123
+ # |-- query_len ---|
124
+
125
+ num_actual_tokens: int # Number of tokens excluding padding.
126
+ slot_mapping: torch.Tensor
127
+ query_start_loc: torch.Tensor
128
+ seq_lens: torch.Tensor
129
+ block_tables: torch.Tensor
130
+
131
+ # New for MLA (compared to FlashAttention)
132
+ # For handling prefill decode split
133
+ num_decodes: int
134
+ num_decode_tokens: int
135
+ num_prefills: int
136
+
137
+ # For logging.
138
+ num_input_tokens: int = 0 # Number of tokens including padding.
139
+
140
+ max_num_tokens_across_dp: int = 0
141
+ with_prefill_across_dp: bool = False
142
+
143
+ query_lens: Optional[list[int]] = None
144
+ # The dimension of the attention heads
145
+ head_dim: Optional[int] = None
146
+ attn_mask: torch.Tensor = None
147
+ # chunked prefill by default if no attn_states passed
148
+ attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
149
+
150
+ decode: Optional[AscendMLADecodeMetadata] = None
151
+ prefill: Optional[AscendMLAPrefillMetadata] = None
152
+
153
+ def __post_init__(self):
154
+ pass
155
+ # supported_head_sizes = AscendMLABackend.get_supported_head_sizes()
156
+ # if self.head_dim is not None and self.head_dim \
157
+ # not in supported_head_sizes:
158
+ # raise ValueError(
159
+ # f"Only {supported_head_sizes} are supported for head_dim,",
160
+ # f"received {self.head_dim}.")
161
+
162
+ def split_metadata_for_multistream(
163
+ self,
164
+ ms_split_config: MSAttentionMetadataSplitConfig,
165
+ ) -> list["AscendMLAMetadata"]:
166
+ """Split metadata for multi-stream with AscendMLAMetadata"""
167
+ return model_input_split_v1_mla_attn(
168
+ ms_split_config=ms_split_config,
169
+ attn_metadata=self,
170
+ _metadata_cls=AscendMLAMetadata,
171
+ )
172
+
173
+
174
+ M = TypeVar("M", bound=AscendMLAMetadata)
175
+
176
+
177
+ class AscendMLAMetadataBuilder:
178
+ """
179
+ NOTE: Please read the comment at the top of the file before trying to
180
+ understand this class
181
+ """
182
+
183
+ # _attn_mask_builder = None
184
+ def __init__(self,
185
+ runner,
186
+ metadata_cls: Optional[AscendMLAMetadata] = None):
187
+ self.metadata_cls: Optional[AscendMLAMetadata] = metadata_cls \
188
+ if metadata_cls is not None else AscendMLAMetadata # type: ignore
189
+ self.runner = runner
190
+ scheduler_config = runner.scheduler_config
191
+ model_config = runner.model_config
192
+ self.block_size = runner.block_size
193
+ self.chunked_prefill_enabled = runner.chunked_prefill_enabled
194
+ if self.chunked_prefill_enabled:
195
+ self.chunked_prefill_workspace_size = min(
196
+ # Make sure there is enough for 8 full-length requests or at least
197
+ # 4 pages of cache per request
198
+ max(8 * model_config.max_model_len,
199
+ 4 * scheduler_config.max_num_seqs * self.block_size),
200
+ # For long-context models try not to over-allocate limiting
201
+ # kv-cache space, limiting it to 64k tokens,
202
+ # which would result in the workspace being:
203
+ # 2*(576)*(64*1024) = 144mb
204
+ # (assuming 576 MLA head dim, and fp16)
205
+ # which would result in up-projected context being
206
+ # 2*(192*128)*(64*1024) = 3gb
207
+ # (assuming 192 QK head dim, 128 heads, and fp16)
208
+ 128 * 1024)
209
+ assert self.chunked_prefill_workspace_size >= \
210
+ scheduler_config.max_num_seqs * self.block_size
211
+ self.chunked_prefill_workspace = torch.empty(
212
+ (self.chunked_prefill_workspace_size,
213
+ model_config.get_head_size()),
214
+ dtype=model_config.dtype,
215
+ device=runner.device,
216
+ )
217
+ ascend_config = get_ascend_config()
218
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
219
+
220
+ def reorder_batch(self, input_batch: "InputBatch",
221
+ scheduler_output: "SchedulerOutput") -> bool:
222
+ # We now want to reorder the batch so that the "decode" requests are at
223
+ # the front and the "prefill" requests are at the back, using the least
224
+ # amount of swaps possible. (NOTE for now we loosely use "decode" to mean requests
225
+ # where attention is likely memory-bound and "prefill" to mean requests
226
+ # where attention is likely compute-bound, TODO(lucas): figure out a
227
+ # better naming here)
228
+ decodes = []
229
+ prefills = []
230
+ num_decode_tokens = 0
231
+ num_prefill_tokens = 0
232
+
233
+ for i, req_id in enumerate(input_batch.req_ids):
234
+ num_tokens = scheduler_output.num_scheduled_tokens[req_id]
235
+ num_spec_tokens = len(
236
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
237
+ # For torch air graph mode we treat spec decoding as decode.
238
+ if self.torchair_graph_enabled:
239
+ if num_tokens - num_spec_tokens == 1:
240
+ decodes.append(i)
241
+ num_decode_tokens += num_tokens
242
+ else:
243
+ prefills.append(i)
244
+ num_prefill_tokens += num_tokens
245
+ # For eager mode we treat spec decoding as chunked prefill.
246
+ else:
247
+ if num_tokens == 1:
248
+ decodes.append(i)
249
+ num_decode_tokens += num_tokens
250
+ else:
251
+ prefills.append(i)
252
+ num_prefill_tokens += num_tokens
253
+
254
+ # We hope that this is fairly minimal since decodes
255
+ # should be around for a number of iterations so hopefully they are
256
+ # relatively stationary (and new requests are generally appended to the
257
+ # persistent batch so already should be at the back)
258
+ # To achieve this we loop over the decodes in descending order and
259
+ # the prefills in ascending order. We swap decodes from the "back"
260
+ # i.e. past where the last decode should be in the reordered batch, with
261
+ # prefills from the front of the batch.
262
+ # `decodes` and `prefills` are already in ascending order just based on
263
+ # the above loop
264
+ num_decodes = len(decodes)
265
+ num_prefills = len(prefills)
266
+ first_prefill = 0
267
+ modified_batch = False
268
+
269
+ for i in range(1, min(num_decodes, num_prefills) + 1):
270
+ # If the decode is at the "back" of the batch, i, we can swap it
271
+ # with the prefill closest to the front of the batch
272
+ if decodes[num_decodes - i] >= num_decodes:
273
+ input_batch.swap_states(prefills[first_prefill],
274
+ decodes[num_decodes - i])
275
+ first_prefill += 1
276
+ modified_batch = True
277
+ else:
278
+ break
279
+
280
+ # Save for next `build` call
281
+ # TODO(lucas): this is a bit of a hack, we should probably have a
282
+ # better way of doing this
283
+ self._num_decodes = num_decodes
284
+ self._num_prefills = num_prefills
285
+ self._num_decode_tokens = num_decode_tokens
286
+ self._num_prefill_tokens = num_prefill_tokens
287
+
288
+ return modified_batch
289
+
290
+ def _get_graph_runner_block_tables(
291
+ self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor:
292
+
293
+ max_batch_size, max_blocks = self.runner.graph_block_tables.shape
294
+ assert max_batch_size >= num_seqs
295
+
296
+ if isinstance(self.runner.graph_block_tables, np.ndarray):
297
+ graph_block_tables = torch.zeros((max_batch_size, max_blocks),
298
+ dtype=block_tables.dtype,
299
+ device=block_tables.device)
300
+ else:
301
+ graph_block_tables = self.runner.graph_block_tables.to(
302
+ device=block_tables.device, dtype=block_tables.dtype)
303
+
304
+ num_blocks = block_tables.size(1)
305
+ if num_blocks <= max_blocks:
306
+ graph_block_tables[:num_seqs, :
307
+ num_blocks] = block_tables[:num_seqs, :
308
+ num_blocks]
309
+ else:
310
+ graph_block_tables[:num_seqs, :
311
+ max_blocks] = block_tables[:num_seqs, :
312
+ max_blocks]
313
+
314
+ return graph_block_tables[:num_seqs, :max_blocks]
315
+
316
+ def build_dummy(self, num_reqs: int,
317
+ num_actual_tokens: int) -> AscendMLAMetadata:
318
+ device = self.runner.device
319
+ _, max_blocks = self.runner.graph_block_tables.shape
320
+ block_table = torch.zeros((num_reqs, max_blocks),
321
+ dtype=torch.int32,
322
+ device=device)
323
+ block_table = self._get_graph_runner_block_tables(
324
+ num_reqs, block_table)
325
+ seq_lens = torch.ones(num_reqs, dtype=torch.int32, device=device)
326
+ input_positions = torch.zeros(num_reqs,
327
+ dtype=torch.int32,
328
+ device=device).long()
329
+ slot_mapping = torch.full((num_reqs, ),
330
+ PAD_SLOT_ID,
331
+ dtype=torch.int32,
332
+ device=device)
333
+ query_start_loc = torch.full((num_reqs, ),
334
+ -1,
335
+ dtype=torch.int32,
336
+ device=device)
337
+ decode_metadata = AscendMLADecodeMetadata(
338
+ input_positions=input_positions,
339
+ block_table=block_table,
340
+ seq_lens=seq_lens,
341
+ seq_lens_list=seq_lens.tolist(),
342
+ max_seq_lens=1,
343
+ attn_mask=self.runner.spec_attn_mask)
344
+ return self.metadata_cls( # type: ignore
345
+ num_input_tokens=num_actual_tokens,
346
+ num_actual_tokens=num_actual_tokens,
347
+ slot_mapping=slot_mapping,
348
+ head_dim=self.runner.model_config.get_head_size(),
349
+ num_decodes=1,
350
+ num_decode_tokens=1,
351
+ num_prefills=0,
352
+ attn_mask=self.runner.attn_mask,
353
+ attn_state=AscendAttentionState.DecodeOnly,
354
+ prefill=None,
355
+ decode=decode_metadata,
356
+ query_start_loc=query_start_loc,
357
+ seq_lens=seq_lens,
358
+ block_tables=block_table,
359
+ )
360
+
361
+ def build(
362
+ self,
363
+ num_reqs: int,
364
+ num_actual_tokens: int,
365
+ max_query_len: int,
366
+ common_attn_metadata: CommonAttentionMetadata,
367
+ common_prefix_len: Optional[int] = None,
368
+ graph_pad_size: int = -1,
369
+ max_num_tokens_across_dp: int = 0,
370
+ with_prefill_across_dp: bool = False,
371
+ ) -> AscendMLAMetadata:
372
+ assert self._num_decodes + self._num_prefills == num_reqs
373
+
374
+ # Note(simon): be careful about the CPU <> GPU memory movement in this
375
+ # function. We should avoid GPU -> CPU sync as much as possible because
376
+ # it blocks on all previous kernels.
377
+ device = self.runner.device
378
+
379
+ block_table = (self.runner.input_batch.block_table[0].
380
+ get_device_tensor()[:num_reqs])
381
+ slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
382
+ device, non_blocking=True)
383
+ input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
384
+ device, non_blocking=True).long()
385
+
386
+ seq_lens_cpu = self.runner.seq_lens_cpu[:num_reqs]
387
+ query_lens = seq_lens_cpu - self.runner.input_batch.num_computed_tokens_cpu_tensor[:
388
+ num_reqs]
389
+ seq_lens = seq_lens_cpu
390
+ max_query_len = query_lens.max().item()
391
+ max_seq_lens = seq_lens.max().item()
392
+ query_start_loc = common_attn_metadata.query_start_loc
393
+
394
+ prefill_metadata = None
395
+ chunked_context_metadata = None
396
+ if self._num_prefills > 0:
397
+ reqs_start = self._num_decodes # prefill_start
398
+ tokens_start = self._num_decode_tokens
399
+ max_query_len = query_lens[tokens_start:].max().item()
400
+ max_seq_lens = seq_lens[tokens_start:].max().item()
401
+ query_start_loc = common_attn_metadata.query_start_loc
402
+ prefill_query_start_loc = query_start_loc[
403
+ reqs_start:] - query_start_loc[reqs_start]
404
+
405
+ context_lens_cpu = self.runner.input_batch.num_computed_tokens_cpu_tensor[
406
+ reqs_start:num_reqs]
407
+ max_context_len_cpu = context_lens_cpu.max().item()
408
+ num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
409
+ if self.chunked_prefill_enabled and max_context_len_cpu > 0:
410
+ max_context_chunk = (self.chunked_prefill_workspace_size //
411
+ num_prefills_with_context_cpu)
412
+ max_context_chunk = round_down(max_context_chunk,
413
+ self.block_size)
414
+
415
+ assert max_context_chunk > 0
416
+ num_chunks = cdiv(max_context_len_cpu, max_context_chunk)
417
+ chunk_starts = torch.arange(num_chunks, dtype=torch.int32) \
418
+ .unsqueeze(1).expand(-1, self._num_prefills) * max_context_chunk
419
+ chunk_ends = torch.min(context_lens_cpu.unsqueeze(0),
420
+ chunk_starts + max_context_chunk)
421
+ chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0)
422
+ cu_seq_lens_cpu = torch.zeros(num_chunks,
423
+ self._num_prefills + 1,
424
+ dtype=torch.int32,
425
+ pin_memory=True)
426
+ torch.cumsum(chunk_seq_lens,
427
+ dim=1,
428
+ out=cu_seq_lens_cpu[:, 1:],
429
+ dtype=torch.int32)
430
+ chunked_context_metadata = \
431
+ AscendMLAPrefillMetadata.ChunkedContextMetadata(
432
+ cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
433
+ starts=chunk_starts.to(device, non_blocking=True),
434
+ seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
435
+ max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
436
+ chunk_seq_lens=chunk_seq_lens,
437
+ workspace=self.chunked_prefill_workspace,
438
+ )
439
+
440
+ prefill_metadata = AscendMLAPrefillMetadata(
441
+ attn_mask=self.runner.attn_mask,
442
+ query_lens=query_lens[tokens_start:],
443
+ seq_lens=seq_lens,
444
+ context_lens=seq_lens[tokens_start:],
445
+ input_positions=input_positions[tokens_start:],
446
+ block_table=block_table[reqs_start:, ...],
447
+ max_query_len=max_query_len,
448
+ max_seq_lens=max_seq_lens,
449
+ query_start_loc=prefill_query_start_loc,
450
+ chunked_context=chunked_context_metadata,
451
+ )
452
+
453
+ decode_metadata = None
454
+ use_torchair_graph = graph_pad_size != -1
455
+ if self._num_decodes > 0:
456
+ max_seq_lens = seq_lens[:self._num_decodes].max().item()
457
+ seq_lens = seq_lens[:self._num_decode_tokens]
458
+ input_positions = input_positions[:self._num_decode_tokens]
459
+ block_table = block_table[:self._num_decode_tokens, ...]
460
+ if use_torchair_graph and self.runner.attn_state in [
461
+ AscendAttentionState.DecodeOnly,
462
+ AscendAttentionState.SpecDecoding
463
+ ]:
464
+ num_seqs = len(seq_lens)
465
+ if graph_pad_size != 0:
466
+ pad_value = 1
467
+ padded_seq_lens = seq_lens.tolist() + [pad_value
468
+ ] * graph_pad_size
469
+ else:
470
+ padded_seq_lens = seq_lens.tolist()
471
+
472
+ seq_lens = torch.from_numpy(
473
+ np.array(padded_seq_lens).astype(np.int32))
474
+ padding = torch.full((graph_pad_size, ),
475
+ PAD_SLOT_ID,
476
+ dtype=slot_mapping.dtype,
477
+ device=slot_mapping.device)
478
+ slot_mapping = torch.cat([slot_mapping, padding])
479
+ block_table_padding = torch.zeros(
480
+ (graph_pad_size, ) + block_table.shape[1:],
481
+ dtype=block_table.dtype,
482
+ device=block_table.device)
483
+ block_table = torch.cat([block_table, block_table_padding],
484
+ dim=0)
485
+ block_table = self._get_graph_runner_block_tables(
486
+ num_seqs + graph_pad_size, block_table)
487
+ padding_0 = torch.zeros(graph_pad_size,
488
+ dtype=input_positions.dtype,
489
+ device=input_positions.device)
490
+ input_positions = torch.cat([input_positions, padding_0])
491
+
492
+ decode_metadata = AscendMLADecodeMetadata(
493
+ input_positions=input_positions,
494
+ block_table=block_table,
495
+ seq_lens=seq_lens,
496
+ seq_lens_list=seq_lens.tolist(),
497
+ max_seq_lens=max_seq_lens,
498
+ attn_mask=self.runner.spec_attn_mask)
499
+
500
+ return self.metadata_cls( # type: ignore
501
+ num_actual_tokens=num_actual_tokens,
502
+ query_lens=query_lens.tolist(),
503
+ slot_mapping=slot_mapping,
504
+ head_dim=self.runner.model_config.get_head_size(),
505
+ num_decodes=self._num_decodes,
506
+ num_decode_tokens=self._num_decode_tokens,
507
+ num_prefills=self._num_prefills,
508
+ attn_mask=self.runner.attn_mask,
509
+ attn_state=self.runner.attn_state,
510
+ prefill=prefill_metadata,
511
+ decode=decode_metadata,
512
+ query_start_loc=query_start_loc,
513
+ block_tables=block_table,
514
+ seq_lens=seq_lens,
515
+ max_num_tokens_across_dp=max_num_tokens_across_dp,
516
+ with_prefill_across_dp=with_prefill_across_dp,
517
+ )
518
+
519
+
520
+ class AscendMLAImpl(MLAAttentionImpl):
521
+ """
522
+ NOTE: Please read the comment at the top of the file before trying to
523
+ understand this class
524
+ """
525
+
526
+ def __init__(
527
+ self,
528
+ num_heads: int,
529
+ head_size: int,
530
+ scale: float,
531
+ num_kv_heads: int,
532
+ alibi_slopes: Optional[list[float]],
533
+ sliding_window: Optional[int],
534
+ kv_cache_dtype: str,
535
+ blocksparse_params: Optional[dict[str, Any]],
536
+ logits_soft_cap: Optional[float],
537
+ attn_type: str,
538
+ kv_sharing_target_layer_name: Optional[str] = None,
539
+ **kwargs,
540
+ ) -> None:
541
+ self.num_heads = num_heads
542
+ self.head_size = head_size
543
+ self.scale = float(scale)
544
+ self.num_kv_heads = num_kv_heads
545
+ self.kv_cache_dtype = kv_cache_dtype
546
+
547
+ # MLA Args
548
+ self.q_lora_rank = kwargs['q_lora_rank']
549
+ self.kv_lora_rank = kwargs['kv_lora_rank']
550
+ self.qk_nope_head_dim = kwargs['qk_nope_head_dim']
551
+ self.qk_rope_head_dim = kwargs['qk_rope_head_dim']
552
+ self.qk_head_dim = kwargs['qk_head_dim']
553
+ self.v_head_dim = kwargs['v_head_dim']
554
+ self.rotary_emb = kwargs['rotary_emb']
555
+ self.q_proj = kwargs['q_proj']
556
+ self.kv_b_proj = kwargs['kv_b_proj']
557
+ self.o_proj = kwargs['o_proj']
558
+ self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None)
559
+ self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None)
560
+ self.num_queries_per_kv = self.num_heads // self.num_kv_heads
561
+ self.tp_size = get_tensor_model_parallel_world_size()
562
+
563
+ ascend_config = get_ascend_config()
564
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
565
+ self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
566
+
567
+ # Adapt torch air graph mode with spec decoding.
568
+ speculative_config = get_current_vllm_config().speculative_config
569
+ if speculative_config is not None:
570
+ self.spec_token_num = speculative_config.num_speculative_tokens
571
+ assert self.spec_token_num > 0
572
+ self.SHARE_MASK_TRIL_SPARSE = ~torch.tril(torch.ones((2048, 2048), dtype=torch.bool)).npu()
573
+
574
+ def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False):
575
+ # Convert from (B, N, L) to (N, B, L)
576
+ x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
577
+ # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
578
+ x = torch.bmm(x, self.W_UV)
579
+ # Convert from (N, B, V) to (B, N * V)
580
+ x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)
581
+ MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 # 16MB
582
+ npu_prefetch(self.o_proj.weight,
583
+ x,
584
+ max_size=MAX_O_PROJ_PREFETCH_SIZE,
585
+ enabled=enable_multistream_mla)
586
+ return self.o_proj(x, is_prefill=False)[0]
587
+
588
+ # Return `ql_nope`, `q_pe`
589
+ def _q_proj_and_k_up_proj(self, x):
590
+ q_nope, q_pe = self.q_proj(x)[0]\
591
+ .view(-1, self.num_heads, self.qk_head_dim)\
592
+ .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
593
+
594
+ # Convert from (B, N, P) to (N, B, P)
595
+ q_nope = q_nope.transpose(0, 1)
596
+ # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
597
+ ql_nope = torch.bmm(q_nope, self.W_UK_T)
598
+ # Convert from (N, B, L) to (B, N, L)
599
+ return ql_nope.transpose(0, 1), q_pe
600
+
601
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
602
+
603
+ def get_layer_weight(layer):
604
+ WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
605
+ for attr in WEIGHT_NAMES:
606
+ if hasattr(layer, attr):
607
+ return getattr(layer, attr)
608
+ raise AttributeError(
609
+ f"Layer '{layer}' has no recognized weight attribute:"
610
+ f" {WEIGHT_NAMES}.")
611
+
612
+ def get_and_maybe_dequant_weights(layer: LinearBase):
613
+ if not isinstance(layer.quant_method, UnquantizedLinearMethod):
614
+ # NOTE: This should only be used offline, since it's O(N^3)
615
+ eye = torch.eye(layer.input_size_per_partition,
616
+ dtype=act_dtype,
617
+ device=get_layer_weight(layer).device)
618
+ dequant_weights = layer.quant_method.apply(layer,
619
+ eye,
620
+ bias=None)
621
+ del eye
622
+ # standardize to (output, input)
623
+ return dequant_weights.T
624
+ return layer.weight
625
+
626
+ # we currently do not have quantized bmm's which are needed for
627
+ # `W_UV` and `W_UK_T`, so we just store fp16/bf16 copies and perform
628
+ # the bmm's in 16-bit, the extra memory overhead of this is fairly low
629
+ kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
630
+ assert kv_b_proj_weight.shape == (
631
+ self.kv_lora_rank,
632
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
633
+ f"{kv_b_proj_weight.shape=}, "
634
+ f"{self.kv_lora_rank=}, "
635
+ f"{self.num_heads=}, "
636
+ f"{self.qk_nope_head_dim=}, "
637
+ f"{self.v_head_dim=}")
638
+ kv_b_proj_weight = kv_b_proj_weight.view(
639
+ self.kv_lora_rank,
640
+ self.num_heads,
641
+ self.qk_nope_head_dim + self.v_head_dim,
642
+ )
643
+
644
+ W_UK, W_UV = kv_b_proj_weight.split(
645
+ [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
646
+
647
+ # Convert from (L, N, V) to (N, L, V)
648
+ self.W_UV = W_UV.transpose(0, 1).contiguous()
649
+ # Convert from (L, N, P) to (N, P, L)
650
+ self.W_UK_T = W_UK.permute(1, 2, 0).contiguous()
651
+
652
+ # Waiting for BMM NZ support
653
+ # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29)
654
+ # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29)
655
+
656
+ def _compute_prefill_context(
657
+ self,
658
+ query: torch.Tensor,
659
+ kv_c_and_k_pe_cache: torch.Tensor,
660
+ rope_dim: int,
661
+ attn_metadata: AscendMLAMetadata,
662
+ prefix_output: torch.Tensor,
663
+ prefix_lse: torch.Tensor,
664
+ ):
665
+ prefill_metadata = attn_metadata.prefill
666
+ if prefill_metadata is None or prefill_metadata.chunked_context is None:
667
+ return prefix_output, prefix_lse
668
+
669
+ iters = len(prefill_metadata.chunked_context.seq_tot)
670
+ q_pe = query[..., self.qk_nope_head_dim:]
671
+ q_nope = query[..., :self.qk_nope_head_dim]
672
+
673
+ seq_len1 = torch.tensor(prefill_metadata.query_lens, dtype=torch.int32)
674
+ latent_kv_dim = kv_c_and_k_pe_cache.size(3) - rope_dim
675
+ cache_kv_c = kv_c_and_k_pe_cache[:, :, :, :latent_kv_dim]
676
+ cache_k_pe = kv_c_and_k_pe_cache[:, :, :, latent_kv_dim:]
677
+ for i in range(iters):
678
+ toks = prefill_metadata.chunked_context.seq_tot[i]
679
+
680
+ seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i]
681
+ seq_len = torch.stack([seq_len1, seq_len2])
682
+ kv_c_normed = torch.empty(toks,
683
+ kv_c_and_k_pe_cache.size(2),
684
+ latent_kv_dim,
685
+ dtype=query.dtype,
686
+ device=query.device)
687
+ k_pe = torch.empty(toks,
688
+ kv_c_and_k_pe_cache.size(2),
689
+ rope_dim,
690
+ dtype=query.dtype,
691
+ device=query.device)
692
+
693
+ torch_npu.atb.npu_paged_cache_load(
694
+ cache_kv_c,
695
+ cache_k_pe,
696
+ prefill_metadata.block_table,
697
+ seq_len2.to(query.device),
698
+ seq_starts=prefill_metadata.chunked_context.starts[i],
699
+ key=kv_c_normed,
700
+ value=k_pe,
701
+ )
702
+
703
+ kv_c_normed = kv_c_normed.squeeze()
704
+ kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
705
+ -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
706
+ k_nope, v = kv_nope\
707
+ .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
708
+ k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
709
+ mask = torch.triu(
710
+ torch.ones(512, 512, device=query.device, dtype=query.dtype),
711
+ 1)
712
+ torch_npu.atb.npu_ring_mla(
713
+ q_nope=q_nope,
714
+ q_rope=q_pe,
715
+ k_nope=k_nope,
716
+ k_rope=k_pe,
717
+ value=v,
718
+ mask=mask,
719
+ seqlen=seq_len,
720
+ head_num=self.num_heads,
721
+ kv_head_num=self.num_heads,
722
+ pre_out=prefix_output,
723
+ prev_lse=prefix_lse,
724
+ qk_scale=self.scale,
725
+ kernel_type="kernel_type_high_precision",
726
+ mask_type="no_mask",
727
+ input_layout="type_bsnd",
728
+ calc_type="calc_type_default",
729
+ output=prefix_output,
730
+ softmax_lse=prefix_lse)
731
+ return prefix_output, prefix_lse
732
+
733
+ def _forward_prefill(
734
+ self,
735
+ query: torch.Tensor,
736
+ kv_c_normed: torch.Tensor,
737
+ k_pe: torch.Tensor,
738
+ kv_c_and_k_pe_cache: torch.Tensor,
739
+ attn_metadata: AscendMLAMetadata,
740
+ ) -> torch.Tensor:
741
+ assert attn_metadata.prefill is not None
742
+
743
+ num_tokens = query.size(0)
744
+ attn_output = torch.empty(num_tokens,
745
+ self.num_heads,
746
+ self.v_head_dim,
747
+ dtype=query.dtype,
748
+ device=query.device)
749
+ k_nope, value = self.kv_b_proj(kv_c_normed)[0].view(
750
+ -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).split(
751
+ [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
752
+ k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
753
+ # Only two kinds of input are possible here: ChunkedPrefill or PrefillNoCache
754
+ ascend_config = get_ascend_config()
755
+
756
+ if attn_metadata.attn_state in [
757
+ AscendAttentionState.ChunkedPrefill,
758
+ AscendAttentionState.SpecDecoding,
759
+ AscendAttentionState.PrefillCacheHit
760
+ ] and not ascend_config.chunked_prefill_for_mla:
761
+ attn_output_torch = torch.empty(num_tokens,
762
+ self.num_heads * self.v_head_dim,
763
+ dtype=query.dtype,
764
+ device=query.device)
765
+ # The current request is chunked in prefill; disable flash attention with chunked prefill
766
+ vanilla_chunked_prefill_mla(
767
+ output=attn_output_torch,
768
+ query=query,
769
+ kv_cache=kv_c_and_k_pe_cache,
770
+ block_tables=attn_metadata.prefill.block_table,
771
+ query_lens=attn_metadata.prefill.query_lens,
772
+ context_lens=attn_metadata.prefill.context_lens,
773
+ kv_b_proj=self.kv_b_proj,
774
+ max_query_len=attn_metadata.prefill.max_query_len,
775
+ max_context_len=attn_metadata.prefill.max_seq_lens,
776
+ nope_dim=self.qk_nope_head_dim,
777
+ rope_dim=self.qk_rope_head_dim,
778
+ v_head_dim=self.v_head_dim,
779
+ scale=self.scale,
780
+ alibi_slopes=None,
781
+ causal=True)
782
+ elif attn_metadata.attn_state in [
783
+ AscendAttentionState.ChunkedPrefill,
784
+ AscendAttentionState.SpecDecoding,
785
+ AscendAttentionState.PrefillCacheHit
786
+ ]:
787
+ attn_lse = torch.empty(self.num_heads,
788
+ num_tokens,
789
+ dtype=torch.float32,
790
+ device=query.device)
791
+ q_pe = query[..., self.qk_nope_head_dim:]
792
+ q_nope = query[..., :self.qk_nope_head_dim]
793
+ mask = torch.triu(
794
+ torch.ones(512, 512, device=query.device, dtype=query.dtype),
795
+ 1)  # 512: the attention mask only supports size 512
796
+ if attn_metadata.num_prefills > 1:
797
+ mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1,
798
+ 1)
799
+ torch_npu.atb.npu_ring_mla(
800
+ q_nope=q_nope,
801
+ q_rope=q_pe,
802
+ k_nope=k_nope,
803
+ k_rope=k_pe,
804
+ value=value,
805
+ mask=mask,
806
+ seqlen=torch.tensor(attn_metadata.prefill.query_lens,
807
+ dtype=torch.int32),
808
+ head_num=self.num_heads,
809
+ kv_head_num=self.num_heads,
810
+ pre_out=None,
811
+ prev_lse=None,
812
+ qk_scale=self.scale,
813
+ kernel_type="kernel_type_high_precision",
814
+ mask_type="mask_type_triu",
815
+ input_layout="type_bsnd",
816
+ calc_type="calc_type_first_ring",
817
+ output=attn_output,
818
+ softmax_lse=attn_lse)
819
+ attn_output, attn_lse = self._compute_prefill_context( \
820
+ query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
821
+
822
+ elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
823
+ key = torch.cat((k_nope, k_pe), dim=-1)
824
+ context_lens_list = torch.cumsum(attn_metadata.prefill.context_lens, dim=0).tolist()
825
+ attn_output = torch_npu.npu_fused_infer_attention_score(
826
+ query,
827
+ key,
828
+ value,
829
+ num_heads=self.num_heads,
830
+ input_layout="TND",
831
+ scale=self.scale,
832
+ sparse_mode=3,
833
+ atten_mask=self.SHARE_MASK_TRIL_SPARSE,
834
+ actual_seq_lengths=context_lens_list,
835
+ actual_seq_lengths_kv=context_lens_list,
836
+ inner_precise=0)[0]
837
+ attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim)
838
+ else:
839
+ raise RuntimeError(
840
+ "Unexpected path reached, AscendMLAImpl should only have PrefillNoCache, PrefillCacheHit, ChunkedPrefill and SpecDecoding scenario in forward prefill, please file a bug to vllm-ascend !"
841
+ )
842
+ attn_output = attn_output.reshape(
843
+ [num_tokens, self.num_heads * self.v_head_dim])
844
+ if attn_metadata.attn_state in [
845
+ AscendAttentionState.ChunkedPrefill,
846
+ AscendAttentionState.SpecDecoding,
847
+ AscendAttentionState.PrefillCacheHit
848
+ ] and not ascend_config.chunked_prefill_for_mla:
849
+ attn_output = attn_output_torch
850
+
851
+ current_ms_metadata = get_multistream_comm_context()
852
+ if current_ms_metadata is None:
853
+ return self.o_proj(attn_output, is_prefill=True)[0]
854
+ else:
855
+ current_ms_metadata.before_comm_event.record()
856
+ with torch.npu.stream(current_ms_metadata.comm_stream):
857
+ current_ms_metadata.before_comm_event.wait()
858
+ return self.o_proj(attn_output, is_prefill=True)[0]
859
+
860
+ def exec_kv(
861
+ self,
862
+ hidden_states: torch.Tensor,
863
+ cos: torch.Tensor,
864
+ sin: torch.Tensor,
865
+ kv_cache: Tuple,
866
+ slots: torch.Tensor,
867
+ ):
868
+
869
+ B = hidden_states.shape[0]
870
+ N = self.num_kv_heads
871
+ S = 1
872
+ kv = self.kv_a_proj_with_mqa(hidden_states)[0]
873
+ # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
874
+ kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
875
+ cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
876
+ k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache(
877
+ kv,
878
+ self.kv_a_layernorm.weight,
879
+ cos,
880
+ sin,
881
+ slots.to(torch.int64),
882
+ kv_cache[1],
883
+ kv_cache[0],
884
+ epsilon=self.kv_a_layernorm.variance_epsilon,
885
+ cache_mode=cache_mode,
886
+ )
887
+ return k_pe, k_nope, kv
888
+
889
+ def exec_kv_prefill(
890
+ self,
891
+ hidden_states: torch.Tensor,
892
+ cos: torch.Tensor,
893
+ sin: torch.Tensor,
894
+ kv_cache: Tuple,
895
+ slots: torch.Tensor,
896
+ ):
897
+
898
+ B = hidden_states.shape[0]
899
+ N = self.num_kv_heads
900
+ S = 1
901
+ kv = self.kv_a_proj_with_mqa(hidden_states)[0]
902
+ # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
903
+ kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
904
+ cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA"
905
+ _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache(
906
+ kv,
907
+ self.kv_a_layernorm.weight,
908
+ cos,
909
+ sin,
910
+ slots.to(torch.int64),
911
+ kv_cache[1],
912
+ kv_cache[0],
913
+ epsilon=self.kv_a_layernorm.variance_epsilon,
914
+ cache_mode=cache_mode,
915
+ is_output_kv=True,
916
+ )
917
+ return k_pe, k_nope
918
+
919
+ def rope_single(
920
+ self,
921
+ x: torch.Tensor,
922
+ cos: torch.Tensor,
923
+ sin: torch.Tensor,
924
+ ) -> torch.Tensor:
925
+ B, N, D = x.shape
926
+ S = 1
927
+ x = x.view(B, N, S, D)
928
+ x = torch_npu.npu_interleave_rope(x, cos, sin)
929
+ return x.view(B, N, D)
930
+
931
+ def _forward_decode(
932
+ self,
933
+ q_nope: torch.Tensor,
934
+ q_pe: torch.Tensor,
935
+ k_nope: torch.Tensor,
936
+ k_pe: torch.Tensor,
937
+ kv_c_and_k_pe_cache: torch.Tensor,
938
+ attn_metadata: AscendMLAMetadata,
939
+ enable_multistream_mla: bool = False,
940
+ ) -> torch.Tensor:
941
+ decode_meta = attn_metadata.decode
942
+ assert decode_meta is not None
943
+
944
+ q = torch.cat([q_nope, q_pe], dim=-1)
945
+ num_tokens = q.size(0)
946
+ attn_output = torch.empty(
947
+ [num_tokens, self.num_heads, self.kv_lora_rank],
948
+ dtype=q.dtype,
949
+ device=q.device)
950
+ if self.running_in_graph:
951
+ # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim]
952
+ if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
953
+ assert num_tokens % self.spec_token_num == 0
954
+ q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1),
955
+ self.spec_token_num + 1, self.num_heads,
956
+ -1)
957
+ q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1),
958
+ self.spec_token_num + 1, self.num_heads, -1)
959
+ if not self.enable_kv_nz:
960
+ q_nope = q_nope.transpose(1, 2).contiguous()
961
+ q_pe = q_pe.transpose(1, 2).contiguous()
962
+ sparse_mode = 3
963
+ spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore
964
+ else:
965
+ if self.enable_kv_nz:
966
+ q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1)
967
+ q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1)
968
+ else:
969
+ q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1)
970
+ q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
971
+ sparse_mode = 0
972
+ spec_attn_mask = None
973
+ # shape of knope/k_pe for npu graph mode should be:
974
+ # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim]
975
+ block_size = kv_c_and_k_pe_cache[0].shape[1]
976
+ if self.enable_kv_nz:
977
+ k_nope = k_nope.view(-1, self.num_kv_heads,
978
+ self.kv_lora_rank // 16, block_size, 16)
979
+ k_pe = k_pe.view(-1, self.num_kv_heads,
980
+ self.qk_rope_head_dim // 16, block_size, 16)
981
+ input_layout = "BSND"
982
+ else:
983
+ k_nope = k_nope.view(-1, self.num_kv_heads, block_size,
984
+ self.kv_lora_rank)
985
+ k_pe = k_pe.view(-1, self.num_kv_heads, block_size,
986
+ self.qk_rope_head_dim)
987
+ input_layout = "BNSD"
988
+
989
+ attn_output, _ = torch_npu.npu_fused_infer_attention_score(
990
+ q_nope,
991
+ k_nope,
992
+ k_nope,
993
+ query_rope=q_pe,
994
+ key_rope=k_pe,
995
+ num_heads=self.num_heads,
996
+ num_key_value_heads=self.num_kv_heads,
997
+ input_layout=input_layout,
998
+ atten_mask=spec_attn_mask,
999
+ sparse_mode=sparse_mode,
1000
+ scale=self.scale,
1001
+ antiquant_mode=0,
1002
+ antiquant_scale=None,
1003
+ block_table=decode_meta.block_table,
1004
+ block_size=block_size,
1005
+ actual_seq_lengths_kv=decode_meta.seq_lens_list,
1006
+ )
1007
+ else:
1008
+ torch_npu._npu_paged_attention_mla(
1009
+ query=q,
1010
+ key_cache=kv_c_and_k_pe_cache,
1011
+ num_kv_heads=self.num_kv_heads,
1012
+ num_heads=self.num_heads,
1013
+ scale_value=self.scale,
1014
+ block_table=attn_metadata.decode.block_table, # type:ignore
1015
+ context_lens=attn_metadata.decode.seq_lens, # type:ignore
1016
+ mla_vheadsize=self.kv_lora_rank,
1017
+ out=attn_output)
1018
+ current_ms_metadata = get_multistream_comm_context()
1019
+ if current_ms_metadata is None:
1020
+ return self._v_up_proj_and_o_proj(attn_output,
1021
+ enable_multistream_mla)
1022
+ else:
1023
+ current_ms_metadata.before_comm_event.record()
1024
+ with torch.npu.stream(current_ms_metadata.comm_stream):
1025
+ current_ms_metadata.before_comm_event.wait()
1026
+ return self._v_up_proj_and_o_proj(attn_output)
1027
+
1028
+ def forward(
1029
+ self,
1030
+ layer: AttentionLayer,
1031
+ hidden_states_or_q_c: torch.Tensor, # query in unified attn
1032
+ hidden_states_or_kv_c_normed: torch.Tensor, # key in unified attn
1033
+ k_pe: torch.Tensor, # value in unified attn
1034
+ kv_cache: torch.Tensor,
1035
+ attn_metadata: M,
1036
+ output: Optional[torch.Tensor] = None,
1037
+ enable_multistream_mla: bool = False,
1038
+ ckq: Optional[torch.Tensor] = None,
1039
+ ) -> torch.Tensor:
1040
+ assert output is not None, "Output tensor must be provided."
1041
+ if attn_metadata is None:
1042
+ # Profiling run.
1043
+ return output
1044
+ self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [
1045
+ AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
1046
+ ]
1047
+ num_actual_toks = attn_metadata.num_actual_tokens
1048
+ if k_pe is None and not self.running_in_graph:
1049
+ if not self.torchair_graph_enabled:
1050
+ kv_c, k_pe = self.kv_a_proj_with_mqa(
1051
+ hidden_states_or_kv_c_normed)[0].split(
1052
+ [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
1053
+ kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
1054
+ else:
1055
+ kv_c_normed = hidden_states_or_kv_c_normed
1056
+ assert attn_metadata.num_decodes is not None and \
1057
+ attn_metadata.num_prefills is not None and \
1058
+ attn_metadata.num_decode_tokens is not None
1059
+ has_decode = attn_metadata.num_decodes > 0
1060
+ has_prefill = attn_metadata.num_prefills > 0
1061
+ num_decode_tokens = attn_metadata.num_decode_tokens
1062
+ if not self.running_in_graph:
1063
+ # Inputs and outputs may be padded for CUDA graphs
1064
+ output_padded = output
1065
+ output = output[:num_actual_toks, ...]
1066
+ if not self.torchair_graph_enabled:
1067
+ kv_c_normed = kv_c_normed[:num_actual_toks, ...]
1068
+ prefill_k_c_normed = kv_c_normed[num_decode_tokens:]
1069
+ if not self.running_in_graph:
1070
+ hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...]
1071
+ prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:]
1072
+ if not self.torchair_graph_enabled:
1073
+ decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens]
1074
+ k_pe = k_pe[:num_actual_toks, ...]
1075
+ k_pe = k_pe.unsqueeze(1)
1076
+ decode_k_pe = k_pe[:num_decode_tokens]
1077
+ prefill_k_pe = k_pe[num_decode_tokens:]
1078
+ else:
1079
+ decode_hs_or_q_c = hidden_states_or_q_c
1080
+ if has_decode:
1081
+ decode_k_nope = None
1082
+ assert attn_metadata.decode is not None
1083
+ if self.running_in_graph:
1084
+ seq_len = self.rotary_emb.max_position_embeddings * \
1085
+ getattr(self.rotary_emb, "scaling_factor", 1)
1086
+ cos = self.rotary_emb.cos_cached[:seq_len].to(
1087
+ dtype=decode_hs_or_q_c.dtype)
1088
+ sin = self.rotary_emb.sin_cached[:seq_len].to(
1089
+ dtype=decode_hs_or_q_c.dtype)
1090
+ cos = cos[attn_metadata.decode.input_positions]
1091
+ sin = sin[attn_metadata.decode.input_positions]
1092
+ cos = cos[:, None, None, :]
1093
+ sin = sin[:, None, None, :]
1094
+ with npu_stream_switch("mla_secondary",
1095
+ 0,
1096
+ enabled=enable_multistream_mla):
1097
+ npu_wait_tensor(hidden_states_or_kv_c_normed,
1098
+ ckq,
1099
+ enabled=enable_multistream_mla)
1100
+ decode_k_pe, decode_k_nope, decode_kv = self.exec_kv(
1101
+ hidden_states_or_kv_c_normed, cos, sin, kv_cache,
1102
+ attn_metadata.slot_mapping)
1103
+ # Without explicitly controlling the order, IndexByTensor operations
1104
+ # would be placed after `matmul W_KV_T` hindering the overlapping of
1105
+ # KvRmsNormRopeCache and SingleRope.
1106
+ npu_wait_tensor(decode_hs_or_q_c,
1107
+ cos,
1108
+ enabled=enable_multistream_mla)
1109
+ npu_wait_tensor(decode_hs_or_q_c,
1110
+ sin,
1111
+ enabled=enable_multistream_mla)
1112
+ npu_wait_tensor(decode_hs_or_q_c,
1113
+ decode_kv,
1114
+ enabled=enable_multistream_mla)
1115
+
1116
+ decode_ql_nope, decode_q_pe = \
1117
+ self._q_proj_and_k_up_proj(decode_hs_or_q_c)
1118
+ if self.running_in_graph:
1119
+ with npu_stream_switch("mla_secondary",
1120
+ 0,
1121
+ enabled=enable_multistream_mla):
1122
+ npu_wait_tensor(decode_q_pe,
1123
+ decode_k_pe,
1124
+ enabled=enable_multistream_mla)
1125
+ decode_q_pe = self.rope_single(decode_q_pe, cos, sin)
1126
+ else:
1127
+ decode_q_pe[...], decode_k_pe[...] = self.rotary_emb(
1128
+ attn_metadata.decode.input_positions,
1129
+ decode_q_pe.contiguous(),
1130
+ decode_k_pe,
1131
+ max_seq_len=attn_metadata.decode.max_seq_lens)
1132
+ if has_prefill:
1133
+ assert attn_metadata.prefill is not None
1134
+ prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\
1135
+ .view(-1, self.num_heads, self.qk_head_dim)
1136
+ prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:]
1137
+ prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim]
1138
+ if self.torchair_graph_enabled:
1139
+ num_tokens = prefill_hs_or_q_c.shape[0]
1140
+ seq_len = self.rotary_emb.max_position_embeddings * \
1141
+ getattr(self.rotary_emb, "scaling_factor", 1)
1142
+ cos = self.rotary_emb.cos_cached[:seq_len].to(
1143
+ dtype=prefill_q_pe.dtype)
1144
+ sin = self.rotary_emb.sin_cached[:seq_len].to(
1145
+ dtype=prefill_q_pe.dtype)
1146
+ cos = cos[attn_metadata.prefill.input_positions]
1147
+ sin = sin[attn_metadata.prefill.input_positions]
1148
+ cos = cos[:, None, None, :]
1149
+ sin = sin[:, None, None, :]
1150
+
1151
+ prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
1152
+ prefill_k_pe, prefill_k_nope = self.exec_kv_prefill(
1153
+ hidden_states_or_kv_c_normed, cos, sin, kv_cache,
1154
+ attn_metadata.slot_mapping)
1155
+
1156
+ kv_c_normed = prefill_k_nope[:num_actual_toks, ...]
1157
+ prefill_k_c_normed = prefill_k_nope[num_decode_tokens:]
1158
+ prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads,
1159
+ -1)
1160
+ prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1)
1161
+ else:
1162
+ prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb(
1163
+ attn_metadata.prefill.input_positions,
1164
+ prefill_q_pe.contiguous(),
1165
+ prefill_k_pe,
1166
+ max_seq_len=attn_metadata.prefill.max_seq_lens)
1167
+ if self.torchair_graph_enabled:
1168
+ if len(kv_cache) > 0 and kv_cache[0].numel(
1169
+ ) > 0 and attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
1170
+ slots = attn_metadata.slot_mapping
1171
+ # NOTE: Separate the kv cache in advance to avoid OOM or other issues
1172
+ torch_npu._npu_reshape_and_cache(key=kv_c_normed.view(
1173
+ num_tokens, self.num_kv_heads, -1),
1174
+ value=prefill_k_pe,
1175
+ key_cache=kv_cache[0],
1176
+ value_cache=kv_cache[1],
1177
+ slot_indices=slots)
1178
+ elif kv_cache.numel() > 0:
1179
+ key = torch.cat([
1180
+ kv_c_normed.view([num_actual_toks, self.num_kv_heads, -1]),
1181
+ k_pe
1182
+ ],
1183
+ dim=2)
1184
+ torch_npu._npu_reshape_and_cache_siso(
1185
+ key=key,
1186
+ key_cache=kv_cache,
1187
+ slot_indices=attn_metadata.slot_mapping.flatten())
1188
+ if has_prefill:
1189
+ # FIX: the aicore move should also be placed on the comm stream in dbo,
1190
+ # otherwise it may affect the accuracy
1191
+ # TODO: use an elegant way to overlap
1192
+ output_prefill = self._forward_prefill(prefill_q,
1193
+ prefill_k_c_normed,
1194
+ prefill_k_pe, kv_cache,
1195
+ attn_metadata)
1196
+ current_ms_metadata = get_multistream_comm_context()
1197
+ if current_ms_metadata is not None:
1198
+ with torch.npu.stream(current_ms_metadata.comm_stream):
1199
+ output[num_decode_tokens:] = output_prefill
1200
+ current_ms_metadata.after_comm_event.record()
1201
+ else:
1202
+ output[num_decode_tokens:] = output_prefill
1203
+
1204
+ if has_decode:
1205
+ if self.running_in_graph:
1206
+ return self._forward_decode(decode_ql_nope, decode_q_pe,
1207
+ decode_k_nope, decode_k_pe,
1208
+ kv_cache, attn_metadata,
1209
+ enable_multistream_mla)
1210
+ else:
1211
+ output_decode = self._forward_decode(decode_ql_nope,
1212
+ decode_q_pe,
1213
+ decode_k_nope,
1214
+ decode_k_pe, kv_cache,
1215
+ attn_metadata)
1216
+ current_ms_metadata = get_multistream_comm_context()
1217
+ if current_ms_metadata is not None:
1218
+ with torch.npu.stream(current_ms_metadata.comm_stream):
1219
+ output[:num_decode_tokens] = output_decode
1220
+ current_ms_metadata.after_comm_event.record()
1221
+ else:
1222
+ output[:num_decode_tokens] = output_decode
1223
+
1224
+ return output_padded
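AscendMLAMetadataBuilder.reorder_batch above moves decode requests to the front of the persistent batch with as few swaps as possible. A small pure-Python sketch of the same swap strategy (a toy helper, not part of vllm-ascend; it only returns the index pairs that would be swapped):

def plan_swaps(is_decode):
    # is_decode[i] is True if request i is a single-token decode.
    decodes = [i for i, d in enumerate(is_decode) if d]
    prefills = [i for i, d in enumerate(is_decode) if not d]
    num_decodes = len(decodes)
    swaps = []
    first_prefill = 0
    # Walk the decodes from the back: any decode sitting beyond the decode
    # region swaps with the earliest prefill, mirroring reorder_batch.
    for i in range(1, min(num_decodes, len(prefills)) + 1):
        if decodes[num_decodes - i] >= num_decodes:
            swaps.append((prefills[first_prefill], decodes[num_decodes - i]))
            first_prefill += 1
        else:
            break
    return swaps

print(plan_swaps([False, True, True, False, True]))  # [(0, 4)] -> decodes end up first

Because decodes tend to stay in the batch across many steps and new requests are appended at the back, this usually touches only a handful of entries per scheduling step.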
inference/vllm_ascend/entrypoints/openai/reasoning_parsers/__init__.py ADDED
@@ -0,0 +1,6 @@
 
1
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
2
+ from .pangu_reasoning_parser import PanguReasoningParser
3
+
4
+ __all__ = [
5
+ "PanguReasoningParser"
6
+ ]
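The parser exported here splits generated text on the [unused16]/[unused17] markers: everything between them is reasoning content, and everything after [unused17] is the final answer. A minimal non-streaming sketch of that splitting rule (toy code, not the vLLM ReasoningParser API; the PanguReasoningParser added below also handles token-by-token streaming deltas):

START, END = "[unused16]", "[unused17]"

def split_reasoning(text: str):
    if END not in text:
        # Reasoning never closed: treat everything after the start marker as reasoning.
        return text.replace(START, "", 1), None
    head, _, tail = text.partition(END)
    return head.replace(START, "", 1), (tail or None)

print(split_reasoning("[unused16]think step by step[unused17]The answer is 4."))
# ('think step by step', 'The answer is 4.')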
inference/vllm_ascend/entrypoints/openai/reasoning_parsers/pangu_reasoning_parser.py ADDED
@@ -0,0 +1,171 @@
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
4
+
5
+ from collections.abc import Sequence
6
+ from typing import Optional, Union
7
+
8
+ from transformers import PreTrainedTokenizerBase
9
+
10
+ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
11
+ DeltaMessage)
12
+ from vllm.logger import init_logger
13
+ from vllm.reasoning import ReasoningParser, ReasoningParserManager
14
+
15
+ logger = init_logger(__name__)
16
+
17
+
18
+ @ReasoningParserManager.register_module("pangu")
19
+ class PanguReasoningParser(ReasoningParser):
20
+ """
21
+ Reasoning parser for Pangu model.
22
+
23
+ The Pangu model uses [unused16]...[unused17] tokens to denote reasoning
24
+ text. This parser extracts the reasoning content from the model output.
25
+ """
26
+
27
+ start_token_id: int
28
+ end_token_id: int
29
+
30
+ start_token: str = "[unused16]"
31
+ end_token: str = "[unused17]"
32
+
33
+ def __init__(self, tokenizer: PreTrainedTokenizerBase):
34
+ super().__init__(tokenizer)
35
+
36
+ if not self.model_tokenizer:
37
+ raise ValueError(
38
+ "The model tokenizer must be passed to the ReasoningParser "
39
+ "constructor during construction.")
40
+
41
+ self.start_token_id = self.vocab.get(self.start_token)
42
+ self.end_token_id = self.vocab.get(self.end_token)
43
+ if self.start_token_id is None or self.end_token_id is None:
44
+ raise RuntimeError(
45
+ "Pangu reasoning parser could not locate think start/end "
46
+ "tokens in the tokenizer!")
47
+
48
+ def is_reasoning_end(self, input_ids: list[int]) -> bool:
49
+ return self.end_token_id in input_ids
50
+
51
+ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
52
+ """
53
+ Extract the content after the end tokens
54
+ """
55
+ if self.end_token_id not in input_ids[:-1]:
56
+ return []
57
+ else:
58
+ return input_ids[input_ids.index(self.end_token_id) + 1:]
59
+
60
+ def extract_reasoning_content_streaming(
61
+ self,
62
+ previous_text: str,
63
+ current_text: str,
64
+ delta_text: str,
65
+ previous_token_ids: Sequence[int],
66
+ current_token_ids: Sequence[int],
67
+ delta_token_ids: Sequence[int],
68
+ ) -> Union[DeltaMessage, None]:
69
+ """
70
+ Extract reasoning content from a delta message.
71
+ Handles streaming output where previous + delta = current.
72
+ Uses token IDs for faster processing.
73
+ For text [unused16]abc[unused17]xyz:
74
+ - 'abc' goes to reasoning_content
75
+ - 'xyz' goes to content
76
+ """
77
+ # Skip single special tokens
78
+ if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
79
+ self.start_token_id, self.end_token_id
80
+ ]):
81
+ return None
82
+
83
+ # Check if [unused16] is present in previous or delta.
84
+ # Keep compatibility with models that don't generate [unused16] tokens.
85
+ if self.start_token_id in previous_token_ids:
86
+ if self.end_token_id in delta_token_ids:
87
+ # [unused16] in previous, [unused17] in delta,
88
+ # extract reasoning content
89
+ end_index = delta_text.find(self.end_token)
90
+ reasoning_content = delta_text[:end_index]
91
+ content = delta_text[end_index + len(self.end_token):]
92
+ return DeltaMessage(
93
+ reasoning_content=reasoning_content,
94
+ content=content if content else None,
95
+ )
96
+ elif self.end_token_id in previous_token_ids:
97
+ # [unused16] in previous, [unused17] in previous,
98
+ # reasoning content continues
99
+ return DeltaMessage(content=delta_text)
100
+ else:
101
+ # [unused16] in previous, no [unused17] in previous or delta,
102
+ # reasoning content continues
103
+ return DeltaMessage(reasoning_content=delta_text)
104
+ elif self.start_token_id in delta_token_ids:
105
+ if self.end_token_id in delta_token_ids:
106
+ # [unused16] in delta, [unused17] in delta, extract reasoning content
107
+ start_index = delta_text.find(self.start_token)
108
+ end_index = delta_text.find(self.end_token)
109
+ reasoning_content = delta_text[start_index +
110
+ len(self.start_token):end_index]
111
+ content = delta_text[end_index + len(self.end_token):]
112
+ return DeltaMessage(
113
+ reasoning_content=reasoning_content,
114
+ content=content if content else None,
115
+ )
116
+ else:
117
+ # [unused16] in delta, no [unused17] in delta,
118
+ # reasoning content continues
119
+ return DeltaMessage(reasoning_content=delta_text)
120
+ else:
121
+ # No [unused16] in previous or delta, also need to check for [unused17].
122
+ # Because the model may have generated [unused17] without [unused16]
123
+ if self.end_token_id in delta_token_ids:
124
+ # [unused17] in delta with more tokens,
125
+ # extract reasoning content and content
126
+ end_index = delta_text.find(self.end_token)
127
+ reasoning_content = delta_text[:end_index]
128
+ content = delta_text[end_index + len(self.end_token):]
129
+ return DeltaMessage(
130
+ reasoning_content=reasoning_content,
131
+ content=content if content else None,
132
+ )
133
+ elif self.end_token_id in previous_token_ids:
134
+ # [unused17] in previous, thinking content ends
135
+ return DeltaMessage(content=delta_text)
136
+ else:
137
+ # no [unused17] in previous or delta, reasoning content continues
138
+ return DeltaMessage(reasoning_content=delta_text)
139
+
140
+ def extract_reasoning_content(
141
+ self, model_output: str, request: ChatCompletionRequest
142
+ ) -> tuple[Optional[str], Optional[str]]:
143
+ """
144
+ Extract reasoning content from the model output.
145
+
146
+ For text [unused16]abc[unused17]xyz:
147
+ - 'abc' goes to reasoning_content
148
+ - 'xyz' goes to content
149
+
150
+ Returns:
151
+ tuple[Optional[str], Optional[str]]: reasoning content and content
152
+ """
153
+
154
+ # Check if the start token is present in the model output, remove it
155
+ # if it is present.
156
+ model_output_parts = model_output.partition(self.start_token)
157
+ model_output = model_output_parts[2] if model_output_parts[
158
+ 1] else model_output_parts[0]
159
+
160
+ # Thus we assume the reasoning content is always at the start.
161
+ if self.end_token not in model_output:
162
+ return model_output, None
163
+ else:
164
+ reasoning_content, _, content = model_output.partition(
165
+ self.end_token)
166
+ # If the end token is not found, return the model output as is.
167
+ # It should not happen since we already checked for the presence
168
+ # of the end token.
169
+ # If generation stops right after end-of-think, return null content
170
+ final_content = content or None
171
+ return reasoning_content, final_content
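The parser above treats everything between [unused16] and [unused17] as reasoning and everything after [unused17] as the final answer. A minimal sketch of the non-streaming path, assuming the openPangu tokenizer (which defines the [unused16]/[unused17] special tokens) is available locally and vllm-ascend is installed; the model path is illustrative:

from transformers import AutoTokenizer
from vllm_ascend.entrypoints.openai.reasoning_parsers import PanguReasoningParser

tokenizer = AutoTokenizer.from_pretrained("./openPangu-Ultra-MoE-718B", trust_remote_code=True)
parser = PanguReasoningParser(tokenizer)

reasoning, answer = parser.extract_reasoning_content(
    "[unused16]Let me reason step by step.[unused17]The answer is 42.", request=None)
# reasoning == "Let me reason step by step.", answer == "The answer is 42."
# With no [unused17] in the output, the whole text is returned as reasoning_content.

When serving through the OpenAI-compatible API, the parser registered under the name "pangu" is typically selected with vLLM's --reasoning-parser pangu option.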
inference/vllm_ascend/entrypoints/openai/tool_parsers/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
2
+ from .pangu_tool_parser import PanguToolParser
3
+
4
+ __all__ = [
5
+ "PanguToolParser"
6
+ ]
inference/vllm_ascend/entrypoints/openai/tool_parsers/pangu_tool_parser.py ADDED
@@ -0,0 +1,300 @@
1
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
2
+ # Copyright 2023 The vLLM team.
3
+
4
+ import json
5
+ import re
6
+ from json import JSONDecodeError, JSONDecoder
7
+ from typing import Dict, List, Sequence, Union, Optional
8
+ from pydantic import Field
9
+ import partial_json_parser
10
+ from partial_json_parser.core.options import Allow
11
+ from transformers import PreTrainedTokenizerBase
12
+
13
+ from vllm.entrypoints.chat_utils import random_tool_call_id
14
+ from vllm.entrypoints.openai.tool_parsers.utils import (
15
+ extract_intermediate_diff)
16
+ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
17
+ DeltaFunctionCall, DeltaMessage,
18
+ DeltaToolCall,
19
+ ExtractedToolCallInformation,
20
+ FunctionCall, ToolCall,
21
+ )
22
+ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
23
+ ToolParser, ToolParserManager)
24
+ from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix,
25
+ is_complete_json)
26
+ from vllm.logger import init_logger
27
+ import os
28
+
29
+ logger = init_logger(__name__)
30
+
31
+
32
+ @ToolParserManager.register_module("pangu")
33
+ class PanguToolParser(ToolParser):
34
+
35
+ def __init__(self, tokenizer: PreTrainedTokenizerBase, enable_reasoning=False):
36
+ super().__init__(tokenizer)
37
+
38
+ # initialize properties used for state when parsing tool calls in
39
+ # streaming mode
40
+ self.prev_tool_call_arr: List[Dict] = []
41
+ self.current_tool_id: int = -1
42
+ self.current_tool_name_sent: bool = False
43
+ self.streamed_args_for_tool: List[str] = [
44
+ ] # map what has been streamed for each tool so far to a list
45
+
46
+ self.tool_call_start_token = "[unused11]"
47
+ self.tool_call_end_token = "[unused12]"
48
+ self.pattern = re.escape(self.tool_call_start_token) \
49
+ + "(.*?)" + re.escape(self.tool_call_end_token)
50
+ self.tool_call_regex = re.compile(self.pattern, re.DOTALL)
51
+
52
+
53
+ self.tool_call_start_token_id = self.vocab.get(
54
+ self.tool_call_start_token)
55
+ self.tool_call_end_token_id = self.vocab.get(
56
+ self.tool_call_end_token)
57
+
58
+
59
+ if (self.tool_call_start_token_id is None
60
+ or self.tool_call_end_token_id is None):
61
+ raise RuntimeError(
62
+ "Pangu Tool parser could not locate tool calls start/end "
63
+ "tokens in the tokenizer!")
64
+ self.is_complete = []
65
+ self.text_after_start_token = ""
66
+
67
+
68
+ def extract_tool_calls(
69
+ self, model_output: str,
70
+ request: ChatCompletionRequest
71
+ ) -> ExtractedToolCallInformation:
72
+ """
73
+ Extract the tool calls from a complete model response.
74
+ """
75
+ # case -- if a tool call token is not present, return a text response
76
+ if not (self.tool_call_start_token in model_output and \
77
+ model_output.find(self.tool_call_end_token) != -1):
78
+ return ExtractedToolCallInformation(tools_called=False,
79
+ tool_calls=[],
80
+ content=model_output)
81
+
82
+ try:
83
+ raw_function_calls = []
84
+ # use a regex to find the tool call between the tags
85
+ function_call_tuples = self.tool_call_regex.findall(model_output)
86
+
87
+
88
+ # load the JSON, and then use it to build the Function and
89
+ # Tool Call
90
+ for function_call_str in function_call_tuples:
91
+ function_call = json.loads(function_call_str)
92
+ raw_function_calls.extend(function_call)
93
+
94
+
95
+ tool_calls: List[ToolCall] = [
96
+ ToolCall(
97
+ type="function",
98
+ function=FunctionCall(
99
+ name=function_call["name"],
100
+ # function call args are JSON but as a string
101
+ arguments=json.dumps(function_call["arguments"] \
102
+ if "arguments" in function_call \
103
+ else function_call["parameters"], ensure_ascii=False)))
104
+ for function_call in raw_function_calls
105
+ ]
106
+ content = model_output[:model_output.
107
+ find(self.tool_call_start_token)]
108
+
109
+ # get any content before the tool call
110
+ ret = ExtractedToolCallInformation(tools_called=True,
111
+ tool_calls=tool_calls,
112
+ content=content if content else None)
113
+
114
+ return ret
115
+
116
+ except Exception:
117
+ logger.exception("Error in extracting tool call from response.")
118
+ # return information to just treat the tool call as regular JSON
119
+ return ExtractedToolCallInformation(tools_called=False,
120
+ tool_calls=[],
121
+ content=model_output)
122
+
123
+ def extract_tool_calls_streaming(
124
+ self,
125
+ previous_text: str,
126
+ current_text: str,
127
+ delta_text: str,
128
+ previous_token_ids: Sequence[int],
129
+ current_token_ids: Sequence[int],
130
+ delta_token_ids: Sequence[int],
131
+ request: ChatCompletionRequest,
132
+ ) -> Union[DeltaMessage, None]:
133
+
134
+ if (self.tool_call_end_token_id in delta_token_ids
135
+ and len(delta_token_ids) == 1):
136
+ # if it's the only token, return None, so we don't send a chat
137
+ # completion and don't send a control token
138
+ return None
139
+
140
+ if (self.tool_call_end_token in current_text
141
+ and self.tool_call_end_token not in delta_text):
142
+ return DeltaMessage(content=delta_text)
143
+
144
+ if self.tool_call_start_token not in current_text:
145
+ return DeltaMessage(content=delta_text)
146
+
147
+ if self.tool_call_start_token in delta_text:
148
+ texts = delta_text.split(self.tool_call_start_token)
149
+ text_before_start_token = texts[0]
150
+ if text_before_start_token:
151
+ return DeltaMessage(content=text_before_start_token)
152
+
153
+ if (self.tool_call_start_token_id in delta_token_ids
154
+ and len(delta_token_ids) == 1):
155
+ # if it's the only token, return None, so we don't send a chat
156
+ # completion and don't send a control token
157
+ return None
158
+
159
+ # bit mask flags for partial JSON parsing. If the name hasn't been
160
+ # sent yet, don't allow sending
161
+ # an incomplete string since OpenAI only ever (as far as I have
162
+ # seen) allows sending the entire tool/ function name at once.
163
+ flags = Allow.ALL if self.current_tool_name_sent \
164
+ else Allow.ALL & ~Allow.STR
165
+ try:
166
+
167
+ tool_call_portion = current_text.split(
168
+ self.tool_call_start_token)[-1].split(self.tool_call_end_token)[0]
169
+ try:
170
+ tool_call_arr: list[dict] = partial_json_parser.loads(
171
+ tool_call_portion, flags)
172
+
173
+ self.is_complete.append(
174
+ is_complete_json(tool_call_portion))
175
+ except partial_json_parser.core.exceptions.MalformedJSON:
176
+ logger.debug('not enough tokens to parse into JSON yet')
177
+ return None
178
+
179
+ # select as the current tool call the one we're on the state at
180
+ current_tool_call: dict = tool_call_arr[self.current_tool_id] \
181
+ if len(tool_call_arr) > 0 else {}
182
+
183
+ # case -- if no tokens have been streamed for the tool, e.g.
184
+ # only the array brackets, stream nothing
185
+ if len(tool_call_arr) == 0:
186
+ return None
187
+
188
+ # case: we are starting a new tool in the array
189
+ # -> array has > 0 length AND length has moved past cursor
190
+ elif (len(tool_call_arr) > 0
191
+ and len(tool_call_arr) > self.current_tool_id + 1):
192
+
193
+ # if we're moving on to a new call, first make sure we
194
+ # haven't missed anything in the previous one that was
195
+ # auto-generated due to JSON completions, but wasn't
196
+ # streamed to the client yet.
197
+ if self.current_tool_id >= 0:
198
+ cur_arguments = current_tool_call.get("arguments")
199
+ if cur_arguments:
200
+ cur_args_json = json.dumps(cur_arguments,
201
+ ensure_ascii=False)
202
+ sent = len(
203
+ self.streamed_args_for_tool[self.current_tool_id])
204
+ argument_diff = cur_args_json[sent:]
205
+
206
+ logger.debug("got arguments diff: %s", argument_diff)
207
+ delta = DeltaMessage(tool_calls=[
208
+ DeltaToolCall(index=self.current_tool_id,
209
+ function=DeltaFunctionCall(
210
+ arguments=argument_diff).
211
+ model_dump(exclude_none=True))
212
+ ])
213
+ self.streamed_args_for_tool[
214
+ self.current_tool_id] += argument_diff
215
+ else:
216
+ delta = None
217
+ else:
218
+ delta = None
219
+ # re-set stuff pertaining to progress in the current tool
220
+ self.current_tool_id = len(tool_call_arr) - 1
221
+ self.current_tool_name_sent = False
222
+ self.streamed_args_for_tool.append("")
223
+ self.is_complete = []
224
+ logger.debug("starting on new tool %d", self.current_tool_id)
225
+ return delta
226
+
227
+ # if the current tool name hasn't been sent, send if available
228
+ # - otherwise send nothing
229
+ elif not self.current_tool_name_sent:
230
+ function_name = current_tool_call.get("name")
231
+ if function_name:
232
+ delta = DeltaMessage(tool_calls=[
233
+ DeltaToolCall(index=self.current_tool_id,
234
+ type="function",
235
+ id=random_tool_call_id(),
236
+ function=DeltaFunctionCall(
237
+ name=function_name).model_dump(
238
+ exclude_none=True))
239
+ ])
240
+ self.current_tool_name_sent = True
241
+ else:
242
+ delta = None
243
+
244
+ # now we know we're on the same tool call and we're streaming
245
+ # arguments
246
+ else:
247
+ cur_arguments = current_tool_call.get("arguments")
248
+ delta = None
249
+ if (self.is_complete[-1] and not cur_arguments
250
+ and not self.streamed_args_for_tool[-1]):
251
+ argument_diff = "{}"
252
+ delta = DeltaMessage(tool_calls=[
253
+ DeltaToolCall(index=self.current_tool_id,
254
+ function=DeltaFunctionCall(
255
+ arguments=argument_diff).
256
+ model_dump(exclude_none=True))
257
+ ])
258
+ self.streamed_args_for_tool[
259
+ self.current_tool_id] += argument_diff
260
+
261
+ if cur_arguments:
262
+ sent = len(
263
+ self.streamed_args_for_tool[self.current_tool_id])
264
+ cur_args_json = json.dumps(cur_arguments,
265
+ ensure_ascii=False)
266
+ prev_arguments = self.prev_tool_call_arr[
267
+ self.current_tool_id].get("arguments")
268
+
269
+ argument_diff = None
270
+ if self.is_complete[-1]:
271
+ argument_diff = cur_args_json[sent:]
272
+ elif prev_arguments:
273
+ prev_args_json = json.dumps(prev_arguments,
274
+ ensure_ascii=False)
275
+ if cur_args_json != prev_args_json:
276
+
277
+ prefix = find_common_prefix(
278
+ prev_args_json, cur_args_json)
279
+ argument_diff = prefix[sent:]
280
+
281
+ if argument_diff is not None:
282
+ delta = DeltaMessage(tool_calls=[
283
+ DeltaToolCall(index=self.current_tool_id,
284
+ function=DeltaFunctionCall(
285
+ arguments=argument_diff).
286
+ model_dump(exclude_none=True))
287
+ ])
288
+ self.streamed_args_for_tool[
289
+ self.current_tool_id] += argument_diff
290
+
291
+
292
+ self.prev_tool_call_arr = tool_call_arr
293
+ return delta
294
+
295
+ except Exception:
296
+ logger.exception("Error trying to handle streaming tool call.")
297
+ logger.debug(
298
+ "Skipping chunk as a result of tool streaming extraction "
299
+ "error")
300
+ return None
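The tool parser expects the model to emit a JSON array of calls between [unused11] and [unused12]. A minimal sketch of the non-streaming path, reusing the tokenizer from the previous example; the tool name and arguments are made up for illustration:

from vllm_ascend.entrypoints.openai.tool_parsers import PanguToolParser

tool_parser = PanguToolParser(tokenizer)
result = tool_parser.extract_tool_calls(
    'Let me check.[unused11][{"name": "get_weather", "arguments": {"city": "Shenzhen"}}][unused12]',
    request=None)
# result.tools_called is True
# result.content == "Let me check."            (text before [unused11])
# result.tool_calls[0].function.name == "get_weather"
# result.tool_calls[0].function.arguments == '{"city": "Shenzhen"}'

In streaming mode, extract_tool_calls_streaming incrementally parses the partial JSON with partial_json_parser, emits the tool name once, and then streams argument diffs. For serving, the parser is typically enabled with vLLM's --enable-auto-tool-choice --tool-call-parser pangu options.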
inference/vllm_ascend/envs.py ADDED
@@ -0,0 +1,153 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # This file is mainly Adapted from vllm-project/vllm/vllm/envs.py
6
+ # Copyright 2023 The vLLM team.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ #
20
+
21
+ import os
22
+ from typing import Any, Callable, Dict
23
+
24
+ # The begin-* and end* here are used by the documentation generator
25
+ # to extract the used env vars.
26
+
27
+ # begin-env-vars-definition
28
+
29
+ env_variables: Dict[str, Callable[[], Any]] = {
30
+ # max compile thread number for package building. Usually, it is set to
31
+ # the number of CPU cores. If not set, the default value is None, which
32
+ # means all number of CPU cores will be used.
33
+ "MAX_JOBS":
34
+ lambda: os.getenv("MAX_JOBS", None),
35
+ # The build type of the package. It can be one of the following values:
36
+ # Release, Debug, RelWithDebugInfo. If not set, the default value is Release.
37
+ "CMAKE_BUILD_TYPE":
38
+ lambda: os.getenv("CMAKE_BUILD_TYPE"),
39
+ # Whether to compile custom kernels. If not set, the default value is True.
40
+ # If set to False, the custom kernels will not be compiled. Please note that
41
+ # the sleep mode feature will be disabled as well if custom kernels are not
42
+ # compiled.
43
+ "COMPILE_CUSTOM_KERNELS":
44
+ lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
45
+ # The CXX compiler used for compiling the package. If not set, the default
46
+ # value is None, which means the system default CXX compiler will be used.
47
+ "CXX_COMPILER":
48
+ lambda: os.getenv("CXX_COMPILER", None),
49
+ # The C compiler used for compiling the package. If not set, the default
50
+ # value is None, which means the system default C compiler will be used.
51
+ "C_COMPILER":
52
+ lambda: os.getenv("C_COMPILER", None),
53
+ # The version of the Ascend chip. If not set, the default value is
54
+ # ASCEND910B1. It's used for package building. Please make sure that the
55
+ # version is correct.
56
+ "SOC_VERSION":
57
+ lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
58
+ # If set, vllm-ascend will print verbose logs during compilation
59
+ "VERBOSE":
60
+ lambda: bool(int(os.getenv('VERBOSE', '0'))),
61
+ # The home path for CANN toolkit. If not set, the default value is
62
+ # /usr/local/Ascend/ascend-toolkit/latest
63
+ "ASCEND_HOME_PATH":
64
+ lambda: os.getenv("ASCEND_HOME_PATH", None),
65
+ # The path for HCCN Tool, the tool will be called by disaggregated prefilling
66
+ # case.
67
+ "HCCN_PATH":
68
+ lambda: os.getenv("HCCN_PATH", "/usr/local/Ascend/driver/tools/hccn_tool"),
69
+ # The path for HCCL library, it's used by pyhccl communicator backend. If
70
+ # not set, the default value is libhccl.so。
71
+ "HCCL_SO_PATH":
72
+ # The prefill device id for disaggregated prefilling case.
73
+ lambda: os.environ.get("HCCL_SO_PATH", None),
74
+ "PROMPT_DEVICE_ID":
75
+ lambda: os.getenv("PROMPT_DEVICE_ID", None),
76
+ # The decode device id for disaggregated prefilling case.
77
+ "DECODE_DEVICE_ID":
78
+ lambda: os.getenv("DECODE_DEVICE_ID", None),
79
+ # The port number for llmdatadist communication. If not set, the default
80
+ # value is 26000.
81
+ "LLMDATADIST_COMM_PORT":
82
+ lambda: os.getenv("LLMDATADIST_COMM_PORT", "26000"),
83
+ # The wait time for llmdatadist sync cache. If not set, the default value is
84
+ # 5000ms.
85
+ "LLMDATADIST_SYNC_CACHE_WAIT_TIME":
86
+ lambda: os.getenv("LLMDATADIST_SYNC_CACHE_WAIT_TIME", "5000"),
87
+ # The version of vllm is installed. This value is used for developers who
88
+ # installed vllm from source locally. In this case, the version of vllm is
89
+ # usually changed. For example, if the version of vllm is "0.9.0", but when
90
+ # it's installed from source, the version of vllm is usually set to "0.9.1".
91
+ # In this case, developers need to set this value to "0.9.0" to make sure
92
+ # that the correct package is installed.
93
+ "VLLM_VERSION":
94
+ lambda: os.getenv("VLLM_VERSION", None),
95
+ # Whether to enable the trace recompiles from pytorch.
96
+ "VLLM_ASCEND_TRACE_RECOMPILES":
97
+ lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
98
+ # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and
99
+ # GroupedMatmulFinalizeRouting operators are combined to implement EP.
100
+ "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP":
101
+ lambda: bool(int(os.getenv("VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP", '0'))
102
+ ),
103
+ "VLLM_ASCEND_ENABLE_DBO":
104
+ lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DBO", '0'))),
105
+ # Whether to enable the model execute time observe profile. Disable it when
106
+ # running vllm ascend in production environment.
107
+ "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE":
108
+ lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0'))
109
+ ),
110
+ # MOE_ALL2ALL_BUFFER:
111
+ # 0: default, normal init.
112
+ # 1: enable moe_all2all_buffer.
113
+ "MOE_ALL2ALL_BUFFER":
114
+ lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))),
115
+ # Some models are optimized by vllm ascend. While in some case, e.g. rlhf
116
+ # training, the optimized model may not be suitable. In this case, set this
117
+ # value to False to disable the optimized model.
118
+ "USE_OPTIMIZED_MODEL":
119
+ lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
120
+ # SELECT_GATING_TOPK_SOTFMAX_EXPERTS is the equivalent of select_experts in non-quantized scenarios.
121
+ # In theory, it should have better performance than select_experts.
122
+ # Subsequent versions will remove the SELECT_GATING_TOPK_SOTFMAX_EXPERTS tag and use it as the default mode.
123
+ "SELECT_GATING_TOPK_SOTFMAX_EXPERTS":
124
+ lambda: bool(int(os.getenv("SELECT_GATING_TOPK_SOTFMAX_EXPERTS", '0'))),
125
+ # The tolerance of the kv cache size, if the difference between the
126
+ # actual kv cache size and the cached kv cache size is less than this value,
127
+ # then the cached kv cache size will be used.
128
+ "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
129
+ lambda: int(
130
+ os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
131
+ # Whether to enable the topk optimization. It's disabled by default for experimental support
132
+ # We'll make it enabled by default in the future.
133
+ "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION":
134
+ lambda: bool(
135
+ int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '0'))),
136
+ # Whether to enable top n sigma sampling
137
+ "VLLM_ASCEND_ENABLE_TOP_N_SIGMA":
138
+ lambda: bool(
139
+ int(os.getenv("VLLM_ASCEND_ENABLE_TOP_N_SIGMA", '0'))),
140
+ }
141
+
142
+ # end-env-vars-definition
143
+
144
+
145
+ def __getattr__(name: str):
146
+ # lazy evaluation of environment variables
147
+ if name in env_variables:
148
+ return env_variables[name]()
149
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
150
+
151
+
152
+ def __dir__():
153
+ return list(env_variables.keys())
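All variables above are evaluated lazily: the module-level __getattr__ re-runs the corresponding lambda on every attribute access, so a change to the process environment is picked up immediately. A minimal sketch of how they are consumed (assumes vllm-ascend is installed; the values shown just mirror the defaults documented above):

import os
os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

import vllm_ascend.envs as envs_ascend

print(envs_ascend.VLLM_ASCEND_ENABLE_DBO)  # True, parsed via bool(int(...))
print(envs_ascend.SOC_VERSION)             # "ASCEND910B1" unless SOC_VERSION is set
print(dir(envs_ascend))                    # lists every supported variable name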
inference/vllm_ascend/models/__init__.py ADDED
@@ -0,0 +1,68 @@
1
+ from vllm import ModelRegistry
2
+
3
+ import vllm_ascend.envs as envs
4
+
5
+
6
+ def register_model():
7
+ from .deepseek_dbo import CustomDeepseekDBOForCausalLM # noqa: F401
8
+ from .deepseek_mtp import CustomDeepSeekMTP # noqa: F401
9
+ from .deepseek_v2 import CustomDeepseekV2ForCausalLM # noqa: F401
10
+ from .deepseek_v2 import CustomDeepseekV3ForCausalLM # noqa: F401
11
+ from .open_pangu import PanguUltraMoEForCausalLM # noqa: F401
12
+ from .open_pangu import PanguEmbeddedForCausalLM # noqa: F401
13
+ from .qwen2_5_vl import \
14
+ AscendQwen2_5_VLForConditionalGeneration # noqa: F401
15
+ from .qwen2_vl import AscendQwen2VLForConditionalGeneration # noqa: F401
16
+
17
+ ModelRegistry.register_model(
18
+ "DeepSeekMTPModel",
19
+ "vllm_ascend.models.deepseek_mtp:CustomDeepSeekMTP")
20
+
21
+ ModelRegistry.register_model(
22
+ "Qwen2VLForConditionalGeneration",
23
+ "vllm_ascend.models.qwen2_vl:AscendQwen2VLForConditionalGeneration")
24
+
25
+ if envs.USE_OPTIMIZED_MODEL:
26
+ ModelRegistry.register_model(
27
+ "Qwen2_5_VLForConditionalGeneration",
28
+ "vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration"
29
+ )
30
+ else:
31
+ ModelRegistry.register_model(
32
+ "Qwen2_5_VLForConditionalGeneration",
33
+ "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen2_5_VLForConditionalGeneration_Without_Padding"
34
+ )
35
+
36
+ if envs.VLLM_ASCEND_ENABLE_DBO:
37
+ ModelRegistry.register_model(
38
+ "DeepseekV2ForCausalLM",
39
+ "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM")
40
+
41
+ ModelRegistry.register_model(
42
+ "DeepseekV3ForCausalLM",
43
+ "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM")
44
+
45
+ else:
46
+ ModelRegistry.register_model(
47
+ "DeepseekV2ForCausalLM",
48
+ "vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM")
49
+
50
+ ModelRegistry.register_model(
51
+ "DeepseekV3ForCausalLM",
52
+ "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM")
53
+
54
+ ModelRegistry.register_model(
55
+ "Qwen3MoeForCausalLM",
56
+ "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM")
57
+
58
+ ModelRegistry.register_model(
59
+ "PanguProMoEForCausalLM",
60
+ "vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")
61
+
62
+ ModelRegistry.register_model(
63
+ "PanguUltraMoEForCausalLM",
64
+ "vllm_ascend.models.open_pangu:PanguUltraMoEForCausalLM")
65
+
66
+ ModelRegistry.register_model(
67
+ "PanguEmbeddedForCausalLM",
68
+ "vllm_ascend.models.open_pangu:PanguEmbeddedForCausalLM")
inference/vllm_ascend/models/open_pangu.py ADDED
@@ -0,0 +1,1127 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # Copyright 2023 The vLLM team.
4
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
7
+ # and OPT implementations in this library. It has been modified from its
8
+ # original forms to accommodate minor architectural differences compared
9
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
24
+ import torch
25
+ import torch_npu
26
+ import vllm.envs as envs
27
+ from torch import nn
28
+ from transformers import PretrainedConfig
29
+ from vllm.compilation.decorators import support_torch_compile
30
+ from vllm.attention import Attention, AttentionMetadata, AttentionType
31
+ from vllm.config import CacheConfig, ModelConfig, VllmConfig
32
+ from vllm.distributed import (get_tensor_model_parallel_rank,
33
+ get_tensor_model_parallel_world_size,
34
+ get_tp_group, split_tensor_along_last_dim,
35
+ tensor_model_parallel_all_gather,
36
+ tensor_model_parallel_all_reduce,
37
+ tensor_model_parallel_reduce_scatter)
38
+ from vllm.distributed.parallel_state import get_dp_group
39
+ from vllm.forward_context import get_forward_context
40
+ from vllm.model_executor.layers.activation import SiluAndMul
41
+ from vllm.model_executor.layers.layernorm import RMSNorm
42
+ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
43
+ MergedColumnParallelLinear,
44
+ ReplicatedLinear,
45
+ RowParallelLinear,
46
+ UnquantizedLinearMethod,
47
+ QKVParallelLinear)
48
+ from vllm.model_executor.layers.logits_processor import LogitsProcessor
49
+ from vllm.model_executor.layers.quantization import QuantizationConfig
50
+ from vllm.model_executor.layers.rotary_embedding import get_rope, _rotate_gptj
51
+ from vllm.model_executor.layers.sampler import get_sampler
52
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
53
+ ParallelLMHead, VocabParallelEmbedding)
54
+ from vllm.model_executor.model_loader.weight_utils import (
55
+ default_weight_loader, maybe_remap_kv_scale_name)
56
+ from vllm.model_executor.models.utils import (
57
+ make_layers, maybe_prefix, extract_layer_index)
58
+ from vllm_ascend.ascend_config import get_ascend_config
59
+ from vllm_ascend.distributed.parallel_state import get_ep_group
60
+ from vllm_ascend.ops.fused_moe import AscendFusedMoE
61
+ from vllm_ascend.quantization.quant_config import AscendLinearMethod
62
+ from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
63
+ from vllm_ascend.utils import dispose_tensor, npu_prefetch, get_fused_moe_state, FusedMoEState
64
+ from vllm.model_executor.sampling_metadata import SamplingMetadata
65
+
66
+
67
+ class OpenPanguMergedReplicatedLinear(ReplicatedLinear):
68
+
69
+ def __init__(
70
+ self,
71
+ input_size: int,
72
+ output_sizes: list[int],
73
+ bias: bool = True,
74
+ quant_config: Optional[QuantizationConfig] = None,
75
+ prefix: str = "",
76
+ ):
77
+ self.output_sizes = output_sizes
78
+ super().__init__(input_size,
79
+ sum(output_sizes),
80
+ bias=bias,
81
+ quant_config=quant_config,
82
+ prefix=prefix)
83
+
84
+ def weight_loader(self, param: torch.nn.Parameter,
85
+ loaded_weight: torch.Tensor, loaded_shard_id: int):
86
+ # GGUF format is not supported yet.
87
+ if getattr(param, "is_gguf_weight", False) or getattr(param, "is_gguf_weight_type", False):
88
+ raise ValueError('GGUF format is not supported yet.')
89
+ if loaded_shard_id >= len(self.output_sizes):
90
+ raise ValueError(f'loaded_shard_id {loaded_shard_id} >= len(self.output_sizes) {len(self.output_sizes)}.')
91
+ shard_offset = sum(self.output_sizes[:loaded_shard_id])
92
+ shard_size = self.output_sizes[loaded_shard_id]
93
+ shard = param.data.narrow(param.output_dim, shard_offset, shard_size)
94
+ if shard.size() != loaded_weight.size():
95
+ raise ValueError(f"Tried to load weights of size {loaded_weight.size()} "
96
+ f"to a parameter shard of id {loaded_shard_id} size {shard.size()}.")
97
+ shard.copy_(loaded_weight)
98
+
99
+
100
+ class OpenPanguRowParallelLinearReplaceAllreduce(RowParallelLinear):
101
+
102
+ def forward(
103
+ self,
104
+ input_,
105
+ is_prefill=True
106
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]:
107
+ if self.input_is_parallel:
108
+ input_parallel = input_
109
+ else:
110
+ tp_rank = get_tensor_model_parallel_rank()
111
+ splitted_input = split_tensor_along_last_dim(
112
+ input_, num_partitions=self.tp_size)
113
+ input_parallel = splitted_input[tp_rank].contiguous()
114
+
115
+ # Matrix multiply.
116
+ if self.quant_method is None:
117
+ raise ValueError('self.quant_method is None.')
118
+ # Only fuse bias add into GEMM for rank 0 (this ensures that
119
+ # bias will not get added more than once in TP>1 case)
120
+ bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
121
+ output_parallel = self.quant_method.apply(self,
122
+ input_parallel,
123
+ bias=bias_)
124
+ if self.reduce_results and self.tp_size > 1:
125
+ if not is_prefill and output_parallel.shape[0] % self.tp_size == 0:
126
+ output = tensor_model_parallel_reduce_scatter(output_parallel,
127
+ dim=0)
128
+ else:
129
+ output = tensor_model_parallel_all_reduce(output_parallel)
130
+ else:
131
+ output = output_parallel
132
+
133
+ output_bias = self.bias if self.skip_bias_add else None
134
+
135
+ if not self.return_bias:
136
+ return output
137
+ return output, output_bias
138
+
139
+
140
+ class OpenPanguRowParallelLinear(RowParallelLinear):
141
+
142
+ def forward(
143
+ self,
144
+ input_,
145
+ is_prefill=True
146
+ ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]:
147
+ return super().forward(input_)
148
+
149
+
150
+ class OpenPanguRotaryEmbedding(nn.Module):
151
+ def __init__(self,
152
+ head_size: int,
153
+ rotary_dim: int,
154
+ max_position_embeddings: int,
155
+ base: float,
156
+ ):
157
+ super().__init__()
158
+ self.dim = rotary_dim
159
+ self.max_position_embeddings = max_position_embeddings
160
+ self.base = base
161
+ self._set_cos_sin_cache(
162
+ seq_len=max_position_embeddings,
163
+ device='npu',
164
+ dtype=torch.get_default_dtype(),
165
+ )
166
+
167
+ def _set_cos_sin_cache(self,
168
+ seq_len: int,
169
+ device: str,
170
+ dtype: torch.dtype
171
+ ):
172
+ self.max_seq_len = seq_len
173
+ inv_freq = 1.0 / (
174
+ self.base
175
+ ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device='npu') / self.dim)
176
+ )
177
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
178
+ t = torch.arange(seq_len, device='npu', dtype=torch.float32)
179
+ freqs = torch.outer(t, inv_freq)
180
+ emb = torch.cat((freqs, freqs), dim=-1)
181
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
182
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
183
+
184
+ def forward(self,
185
+ positions: torch.Tensor,
186
+ query: torch.Tensor,
187
+ key: torch.Tensor,
188
+ offsets: Optional[torch.Tensor] = None,
189
+ max_seq_len: Optional[int] = None,
190
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
191
+ if max_seq_len is not None and max_seq_len > self.max_seq_len:
192
+ self._set_cos_sin_cache(max_seq_len, query.device, query.dtype)
193
+ idx = torch.add(positions, offsets) if offsets is not None else positions
194
+ cos = self.cos_cached[idx]
195
+ sin = self.sin_cached[idx]
196
+ # Adapt: adapt cos and sin shape
197
+ cos = cos.view(-1, 1, cos.shape[-1])
198
+ sin = sin.view(-1, 1, sin.shape[-1])
199
+ # Adapt end.
200
+ query_rot = query * cos + _rotate_gptj(query) * sin
201
+ if key is not None:
202
+ key_rot = key * cos + _rotate_gptj(key) * sin
203
+ return query_rot, key_rot
204
+
205
+
206
+ class OpenPanguSiluAndMul(SiluAndMul):
207
+
208
+ def __init__(self,
209
+ *,
210
+ weight_scale: Optional[Callable[[], torch.Tensor]] = None):
211
+ super().__init__()
212
+ self.weight_scale = weight_scale
213
+
214
+ def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor,
215
+ torch.Tensor]]):
216
+ if isinstance(x, tuple):
217
+ if self.weight_scale is None:
218
+ raise ValueError('self.weight_scale is None.')
219
+ quantized_x, dynamic_scale = x
220
+ return torch_npu.npu_dequant_swiglu_quant(
221
+ x=quantized_x,
222
+ weight_scale=self.weight_scale(),
223
+ activation_scale=dynamic_scale,
224
+ activate_left=True,
225
+ quant_mode=1)
226
+ else:
227
+ return super().forward_oot(x)
228
+
229
+
230
+ def check_ffn_act_fn(act_fn: str):
231
+ if act_fn != "silu":
232
+ raise ValueError(
233
+ f"Unsupported activation: {act_fn}. Only silu is supported for now.")
234
+
235
+
236
+ class OpenPanguMLP(nn.Module):
237
+
238
+ def __init__(
239
+ self,
240
+ hidden_size: int,
241
+ intermediate_size: int,
242
+ hidden_act: str,
243
+ quant_config: Optional[QuantizationConfig] = None,
244
+ bias: bool = False,
245
+ reduce_results: bool = True,
246
+ force_replicate: bool = False,
247
+ prefix: str = "",
248
+ ) -> None:
249
+ super().__init__()
250
+ if not force_replicate:
251
+ self.gate_up_proj = MergedColumnParallelLinear(
252
+ hidden_size, [intermediate_size] * 2,
253
+ bias=bias,
254
+ quant_config=quant_config,
255
+ prefix=f"{prefix}.gate_up_proj")
256
+ self.down_proj = RowParallelLinear(intermediate_size,
257
+ hidden_size,
258
+ bias=bias,
259
+ quant_config=quant_config,
260
+ reduce_results=reduce_results,
261
+ prefix=f"{prefix}.down_proj")
262
+ else:
263
+ self.gate_up_proj = OpenPanguMergedReplicatedLinear(
264
+ hidden_size, [intermediate_size] * 2,
265
+ bias=bias,
266
+ quant_config=quant_config,
267
+ prefix=f"{prefix}.gate_up_proj")
268
+ self.down_proj = ReplicatedLinear(intermediate_size,
269
+ hidden_size,
270
+ bias=bias,
271
+ quant_config=quant_config,
272
+ prefix=f"{prefix}.down_proj")
273
+
274
+ check_ffn_act_fn(hidden_act)
275
+
276
+ quant_method = self.gate_up_proj.quant_method
277
+ if isinstance(quant_method, UnquantizedLinearMethod):
278
+ self.act_fn = OpenPanguSiluAndMul()
279
+ elif (isinstance(quant_method, AscendLinearMethod) and isinstance(
280
+ quant_method.quant_method, AscendW8A8DynamicLinearMethod)):
281
+ # TODO(sdmyzlp): Currently preserved as before:
282
+ # 1. The only quantization supported for silu is W8A8Dynamic
283
+ # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16
284
+ #
285
+ # Maybe one can implement a better and more general configuration
286
+ # scheme, e.g. by somehow passing around the tweaked `quant_config`
287
+ self.act_fn = OpenPanguSiluAndMul(
288
+ # Use lazy binding, for `weight_scale_fp32` is accessible
289
+ # only after `process_weights_after_loading`.
290
+ weight_scale=lambda: self.gate_up_proj.weight_scale_fp32)
291
+ # To be consumed by AscendW8A8DynamicLinearMethod.apply()
292
+ self.gate_up_proj._ascend_quant_config = {
293
+ "output_dtype": torch.int32,
294
+ "pertoken_scale": False,
295
+ "return_scale": True,
296
+ }
297
+ self.down_proj._ascend_quant_config = {
298
+ "output_dtype": torch.bfloat16,
299
+ "pertoken_scale": True,
300
+ "return_scale": False,
301
+ }
302
+ else:
303
+ raise NotImplementedError(
304
+ f"Quantization with [{type(quant_method)}] is NOT supported")
305
+
306
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
307
+ return self.down_proj(self.act_fn(self.gate_up_proj(x)[0]))[0]
308
+
309
+
310
+ class OpenPanguMoE(nn.Module):
311
+
312
+ top_k: int
313
+
314
+ def __init__(
315
+ self,
316
+ config: PretrainedConfig,
317
+ quant_config: Optional[QuantizationConfig] = None,
318
+ prefix: str = "",
319
+ ):
320
+ super().__init__()
321
+ ascend_config = get_ascend_config()
322
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
323
+ self.enable_multistream_moe = \
324
+ ascend_config.torchair_graph_config.enable_multistream_moe
325
+ self.routed_scaling_factor = config.routed_scaling_factor
326
+ check_ffn_act_fn(config.hidden_act)
327
+
328
+ self.gate = ReplicatedLinear(config.hidden_size,
329
+ config.num_routed_experts,
330
+ bias=False,
331
+ quant_config=None,
332
+ prefix=f"{prefix}.gate")
333
+
334
+ self.experts = AscendFusedMoE(
335
+ num_experts=config.num_routed_experts,
336
+ top_k=config.num_experts_per_tok,
337
+ hidden_size=config.hidden_size,
338
+ intermediate_size=config.moe_intermediate_size,
339
+ reduce_results=False,
340
+ renormalize=config.norm_topk_prob,
341
+ quant_config=quant_config,
342
+ use_grouped_topk=True,
343
+ num_expert_group=1,
344
+ topk_group=1,
345
+ prefix=f"{prefix}.experts",
346
+ scoring_func='sigmoid',
347
+ e_score_correction_bias=None)
348
+
349
+ if config.num_shared_experts is not None:
350
+ self.all_reduce_merge = self.experts.all_reduce_merge
351
+ reduce_results = not self.all_reduce_merge
352
+ intermediate_size = (config.moe_intermediate_size * config.num_shared_experts)
353
+ self.shared_experts = OpenPanguMLP(
354
+ hidden_size=config.hidden_size,
355
+ intermediate_size=intermediate_size,
356
+ hidden_act=config.hidden_act,
357
+ quant_config=quant_config,
358
+ reduce_results=reduce_results,
359
+ force_replicate=self.enable_multistream_moe,
360
+ prefix=f"{prefix}.shared_experts",
361
+ )
362
+ else:
363
+ self.shared_experts = None # type: ignore
364
+
365
+ self.tp_size = get_tensor_model_parallel_world_size()
366
+ self.dp_size = get_dp_group().world_size
367
+ self.tp_group = get_tp_group().device_group
368
+ self.tp_rank = get_tp_group().rank_in_group
369
+ self.ep_group = get_ep_group()
370
+
371
+ self.params_dtype = torch.get_default_dtype()
372
+ self.rm_router_logits = self.experts.rm_router_logits
373
+
374
+ self.__class__.top_k = config.num_experts_per_tok
375
+
376
+ def forward(self,
377
+ hidden_states: torch.Tensor,
378
+ attn_metadata: Optional[AttentionMetadata] = None,
379
+ replace_allreduce: bool = False) -> torch.Tensor:
380
+
381
+ if attn_metadata is None:
382
+ attn_metadata = get_forward_context().attn_metadata
383
+ # when profile runs, force experts to load balanced tokens
384
+ # to avoid high memory consumption on a single rank.
385
+ # TODO: need a better flag to indicate whether in profile run or not.
386
+ if attn_metadata is None:
387
+ # for profile run
388
+ is_prefill = True
389
+ fused_moe_state = get_fused_moe_state(self.ep_group.world_size, is_prefill, True)
390
+ enable_force_load_balance = fused_moe_state != FusedMoEState.AllGatherEP
391
+ else:
392
+ is_prefill = attn_metadata.num_prefills > 0
393
+ enable_force_load_balance = False
394
+ if hasattr(attn_metadata, 'with_prefill_across_dp'):
395
+ is_prefill = is_prefill or attn_metadata.with_prefill_across_dp
396
+ fused_moe_state = get_fused_moe_state(self.ep_group.world_size, is_prefill, True)
397
+
398
+ # router_logits: (num_tokens, n_experts)
399
+ router_logits = None
400
+ if not self.rm_router_logits or fused_moe_state == FusedMoEState.All2All:
401
+ router_logits, _ = self.gate(hidden_states.float())
402
+
403
+ routed_hidden_states, shared_hidden_states = self.experts(
404
+ hidden_states=hidden_states,
405
+ router_logits=router_logits,
406
+ is_prefill=is_prefill,
407
+ top_k=self.__class__.top_k,
408
+ enable_force_load_balance=enable_force_load_balance,
409
+ shared_experts=self.shared_experts,
410
+ gate=self.gate,
411
+ replace_allreduce=replace_allreduce)
412
+
413
+ if self.all_reduce_merge and fused_moe_state == FusedMoEState.All2All:
414
+ shared_hidden_states = tensor_model_parallel_all_reduce(shared_hidden_states)
415
+ hidden_states = routed_hidden_states * self.routed_scaling_factor + shared_hidden_states
416
+ if self.all_reduce_merge and fused_moe_state != FusedMoEState.All2All:
417
+ # When all_reduce_merge is enabled, shared_experts skips the all_reduce inside the MLP; a single all_reduce is done after the shared and routed expert outputs are combined.
418
+ hidden_states = tensor_model_parallel_all_reduce(hidden_states)
419
+
420
+ return hidden_states
421
+
422
+
423
+ class OpenPanguMLAAttention(nn.Module):
424
+
425
+ def __init__(
426
+ self,
427
+ config: PretrainedConfig,
428
+ hidden_size: int,
429
+ num_heads: int,
430
+ attention_qk_dim: int,
431
+ attention_qk_rope_dim: int,
432
+ attention_v_dim: int,
433
+ attention_q_lora_dim: Optional[int],
434
+ attention_kv_lora_dim: int,
435
+ rope_theta: float = 10000,
436
+ max_position_embeddings: int = 8192,
437
+ cache_config: Optional[CacheConfig] = None,
438
+ quant_config: Optional[QuantizationConfig] = None,
439
+ prefix: str = "",
440
+ ) -> None:
441
+ super().__init__()
442
+ ascend_config = get_ascend_config()
443
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
444
+ self.enable_multistream_mla = ascend_config.torchair_graph_config.enable_multistream_mla
445
+
446
+ self.hidden_size = hidden_size
447
+ self.num_heads = num_heads
448
+ self.attention_qk_dim = attention_qk_dim
449
+ self.attention_qk_rope_dim = attention_qk_rope_dim
450
+ self.qk_head_dim = attention_qk_dim + attention_qk_rope_dim
451
+ self.attention_v_dim = attention_v_dim
452
+ self.attention_q_lora_dim = attention_q_lora_dim
453
+ self.attention_kv_lora_dim = attention_kv_lora_dim
454
+ self.rope_theta = rope_theta
455
+
456
+ tp_size = get_tensor_model_parallel_world_size()
457
+ if num_heads % tp_size != 0:
458
+ raise ValueError(f'num_heads {num_heads} is not divisible by tp_size {tp_size}.')
459
+ self.num_local_heads = num_heads // tp_size
460
+
461
+ self.scaling = self.qk_head_dim**-0.5
462
+ self.max_position_embeddings = max_position_embeddings
463
+
464
+ self.prefix = prefix
465
+ self.debug_layer_idx = int(self.prefix.split(".")[-2])
466
+
467
+ if self.attention_q_lora_dim is not None:
468
+ self.q_a_proj = ReplicatedLinear(self.hidden_size,
469
+ self.attention_q_lora_dim,
470
+ bias=False,
471
+ quant_config=quant_config,
472
+ prefix=f"{prefix}.q_a_proj")
473
+ self.q_a_layernorm = RMSNorm(self.attention_q_lora_dim, eps=config.rms_norm_eps)
474
+ self.q_b_proj = ColumnParallelLinear(attention_q_lora_dim,
475
+ self.num_heads * self.qk_head_dim,
476
+ bias=False,
477
+ quant_config=quant_config,
478
+ prefix=f"{prefix}.q_b_proj")
479
+ else:
480
+ self.q_proj = ColumnParallelLinear(self.hidden_size,
481
+ self.num_heads * self.qk_head_dim,
482
+ bias=False,
483
+ quant_config=quant_config,
484
+ prefix=f"{prefix}.q_proj")
485
+
486
+ self.kv_a_proj_with_mqa = ReplicatedLinear(
487
+ self.hidden_size,
488
+ self.attention_kv_lora_dim + self.attention_qk_rope_dim,
489
+ bias=False,
490
+ quant_config=quant_config,
491
+ prefix=f"{prefix}.kv_a_proj_with_mqa")
492
+ self.kv_a_layernorm = RMSNorm(self.attention_kv_lora_dim,
493
+ eps=config.rms_norm_eps)
494
+ self.kv_b_proj = ColumnParallelLinear(
495
+ self.attention_kv_lora_dim,
496
+ self.num_heads * (self.attention_qk_dim + self.attention_v_dim),
497
+ bias=False,
498
+ quant_config=quant_config,
499
+ prefix=f"{prefix}.kv_b_proj")
500
+ if (config.num_routed_experts is not None
501
+ and self.debug_layer_idx >= config.num_dense_layers and
502
+ ascend_config.torchair_graph_config.enable_multistream_moe):
503
+ self.o_proj = OpenPanguRowParallelLinearReplaceAllreduce(
504
+ self.num_heads * self.attention_v_dim,
505
+ self.hidden_size,
506
+ bias=False,
507
+ quant_config=quant_config,
508
+ prefix=f"{prefix}.o_proj")
509
+ else:
510
+ self.o_proj = OpenPanguRowParallelLinear(
511
+ self.num_heads * self.attention_v_dim,
512
+ self.hidden_size,
513
+ bias=False,
514
+ quant_config=quant_config,
515
+ prefix=f"{prefix}.o_proj")
516
+
517
+ self.rotary_emb = OpenPanguRotaryEmbedding(attention_qk_rope_dim,
518
+ rotary_dim=attention_qk_rope_dim,
519
+ max_position_embeddings=max_position_embeddings,
520
+ base=rope_theta)
521
+
522
+ self.mla_attn = Attention(
523
+ num_heads=self.num_local_heads,
524
+ head_size=self.attention_kv_lora_dim + self.attention_qk_rope_dim,
525
+ scale=self.scaling,
526
+ num_kv_heads=1,
527
+ cache_config=cache_config,
528
+ quant_config=quant_config,
529
+ prefix=f"{prefix}.attn",
530
+ use_mla=True,
531
+ # MLA Args
532
+ q_lora_rank=self.attention_q_lora_dim,
533
+ kv_lora_rank=self.attention_kv_lora_dim,
534
+ qk_nope_head_dim=self.attention_qk_dim,
535
+ qk_rope_head_dim=self.attention_qk_rope_dim,
536
+ qk_head_dim=self.qk_head_dim,
537
+ v_head_dim=self.attention_v_dim,
538
+ rotary_emb=self.rotary_emb,
539
+ q_proj=self.q_proj if self.attention_q_lora_dim is None else self.q_b_proj,
540
+ kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
541
+ kv_a_layernorm=self.kv_a_layernorm,
542
+ kv_b_proj=self.kv_b_proj,
543
+ o_proj=self.o_proj,
544
+ )
545
+
546
+ def forward(
547
+ self,
548
+ positions: torch.Tensor,
549
+ hidden_states: torch.Tensor,
550
+ kv_cache: Optional[torch.Tensor] = None,
551
+ attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
552
+ enable_multistream_mla = (self.enable_multistream_mla
553
+ and attn_metadata is not None
554
+ and not attn_metadata.with_prefill_across_dp
555
+ and attn_metadata.num_decodes > 0)
556
+ forward_kwargs = {"enable_multistream_mla": enable_multistream_mla}
557
+ if self.attention_q_lora_dim is not None:
558
+ npu_prefetch(self.q_a_proj.weight,
559
+ hidden_states,
560
+ enabled=enable_multistream_mla)
561
+ ckq = self.q_a_proj(hidden_states)[0]
562
+ hidden_states_or_q_c = self.q_a_layernorm(ckq)
563
+ forward_kwargs['ckq'] = ckq
564
+ else:
565
+ hidden_states_or_q_c = hidden_states
566
+ if self.torchair_graph_enabled:
567
+ if envs.VLLM_USE_V1:
568
+ output_shape = hidden_states.shape
569
+ output = torch.empty(output_shape,
570
+ dtype=hidden_states_or_q_c.dtype,
571
+ device=hidden_states_or_q_c.device)
572
+ forward_kwargs['output'] = output
573
+
574
+ output = self.mla_attn.impl.forward(self.mla_attn,
575
+ hidden_states_or_q_c,
576
+ hidden_states, None, kv_cache,
577
+ attn_metadata,
578
+ **forward_kwargs)
579
+ if envs.VLLM_USE_V1:
580
+ output = output.view(-1, output_shape[-1])
581
+ return output
582
+ else:
583
+ kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
584
+ [self.attention_kv_lora_dim, self.attention_qk_rope_dim], dim=-1)
585
+ kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
586
+ return self.mla_attn(hidden_states_or_q_c,
587
+ kv_c_normed,
588
+ k_pe,
589
+ output_shape=hidden_states.shape)
590
+
591
+
592
+ class OpenPanguEmbeddedAttention(nn.Module):
593
+
594
+ def __init__(
595
+ self,
596
+ config: PretrainedConfig,
597
+ hidden_size: int,
598
+ num_heads: int,
599
+ num_kv_heads: int,
600
+ rope_theta: float = 10000,
601
+ rope_scaling: Optional[dict[str, Any]] = None,
602
+ max_position_embeddings: int = 8192,
603
+ quant_config: Optional[QuantizationConfig] = None,
604
+ bias: bool = False,
605
+ bias_o_proj: bool = False,
606
+ cache_config: Optional[CacheConfig] = None,
607
+ prefix: str = "",
608
+ attn_type: str = AttentionType.DECODER,
609
+ ) -> None:
610
+ super().__init__()
611
+ layer_idx = extract_layer_index(prefix)
612
+ self.hidden_size = hidden_size
613
+ tp_size = get_tensor_model_parallel_world_size()
614
+ self.total_num_heads = num_heads
615
+ if self.total_num_heads % tp_size != 0:
616
+ raise ValueError(f'total_num_heads {self.total_num_heads} is not divisible by tp_size {tp_size}.')
617
+ self.num_heads = self.total_num_heads // tp_size
618
+ self.total_num_kv_heads = num_kv_heads
619
+ if self.total_num_kv_heads >= tp_size and self.total_num_kv_heads % tp_size != 0:
620
+ # Number of KV heads is greater than TP size, so we partition
621
+ # the KV heads across multiple tensor parallel NPUs.
622
+ raise ValueError(f'Number of KV heads is greater than or equal to TP size, but total_num_kv_heads {self.total_num_kv_heads} '
623
+ f'is not divisible by tp_size {tp_size}.')
624
+ elif self.total_num_kv_heads < tp_size and tp_size % self.total_num_kv_heads != 0:
625
+ # Number of KV heads is less than TP size, so we replicate
626
+ # the KV heads across multiple tensor parallel NPUs.
627
+ raise ValueError(f'Number of KV heads is less than TP size, but tp_size {tp_size} '
628
+ f'is not divisible by total_num_kv_heads {self.total_num_kv_heads}.')
629
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
630
+ # MistralConfig has an optional head_dim introduced by Mistral-Nemo
631
+ head_dim = getattr(config, "head_dim", None)
632
+ if head_dim is None:
633
+ head_dim = self.hidden_size // self.total_num_heads
634
+ self.head_dim = head_dim
635
+ # Phi models introduced a partial_rotary_factor parameter in the config
636
+ self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
637
+ self.q_size = self.num_heads * self.head_dim
638
+ self.kv_size = self.num_kv_heads * self.head_dim
639
+ self.scaling = self.head_dim**-0.5
640
+ self.rope_theta = rope_theta
641
+ self.max_position_embeddings = max_position_embeddings
642
+
643
+ self.qkv_proj = QKVParallelLinear(
644
+ hidden_size=hidden_size,
645
+ head_size=self.head_dim,
646
+ total_num_heads=self.total_num_heads,
647
+ total_num_kv_heads=self.total_num_kv_heads,
648
+ bias=bias,
649
+ quant_config=quant_config,
650
+ prefix=f"{prefix}.qkv_proj",
651
+ )
652
+
653
+ self.o_proj = RowParallelLinear(
654
+ input_size=self.total_num_heads * self.head_dim,
655
+ output_size=hidden_size,
656
+ bias=bias_o_proj,
657
+ quant_config=quant_config,
658
+ prefix=f"{prefix}.o_proj",
659
+ )
660
+
661
+ self._init_rotary_emb(config,
662
+ rope_scaling=rope_scaling,
663
+ quant_config=quant_config)
664
+
665
+ if hasattr(config, "interleaved_sliding_window"):
666
+ interleaved_sliding_window = config.interleaved_sliding_window
667
+ if isinstance(interleaved_sliding_window, int):
668
+ sliding_window = interleaved_sliding_window
669
+ elif isinstance(interleaved_sliding_window, list):
670
+ sw_idx = layer_idx % len(interleaved_sliding_window)
671
+ sliding_window = interleaved_sliding_window[sw_idx]
672
+ else:
673
+ raise ValueError(
674
+ f"{type(interleaved_sliding_window)} is not supported.")
675
+ else:
676
+ sliding_window = None
677
+
678
+ self.attn = Attention(
679
+ self.num_heads,
680
+ self.head_dim,
681
+ self.scaling,
682
+ num_kv_heads=self.num_kv_heads,
683
+ cache_config=cache_config,
684
+ quant_config=quant_config,
685
+ per_layer_sliding_window=sliding_window,
686
+ attn_type=attn_type,
687
+ prefix=f"{prefix}.attn",
688
+ )
689
+
690
+ def forward(
691
+ self,
692
+ positions: torch.Tensor,
693
+ hidden_states: torch.Tensor,
694
+ kv_cache: Optional[torch.Tensor] = None,
695
+ attn_metadata: Optional[AttentionMetadata] = None
696
+ ) -> torch.Tensor:
697
+ qkv, _ = self.qkv_proj(hidden_states)
698
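+ # The fused QKV projection returns a per-rank tensor of width
+ # q_size + 2 * kv_size, split back into query, key and value before the
+ # rotary embedding is applied.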
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
699
+ q, k = self.rotary_emb(positions, q, k)
700
+ attn_output = self.attn(q, k, v)
701
+ output, _ = self.o_proj(attn_output)
702
+ return output
703
+
704
+ def _init_rotary_emb(self, config: PretrainedConfig,
705
+ rope_scaling: Optional[dict[str, Any]],
706
+ quant_config: Optional[QuantizationConfig]) -> None:
707
+ is_neox_style = True
708
+ is_gguf = quant_config and quant_config.get_name() == "gguf"
709
+ if is_gguf and config.model_type == "Pangu":
710
+ is_neox_style = False
711
+
712
+ self.rotary_emb = get_rope(
713
+ self.head_dim,
714
+ rotary_dim=self.head_dim,
715
+ max_position=self.max_position_embeddings,
716
+ base=self.rope_theta,
717
+ rope_scaling=rope_scaling,
718
+ is_neox_style=is_neox_style,
719
+ #partial_rotary_factor=self.partial_rotary_factor,
720
+ )
721
+
722
+
723
+ class OpenPanguDecoderLayer(nn.Module):
724
+
725
+ def __init__(
726
+ self,
727
+ config: PretrainedConfig,
728
+ prefix: str,
729
+ model_config: ModelConfig,
730
+ cache_config: Optional[CacheConfig] = None,
731
+ quant_config: Optional[QuantizationConfig] = None,
732
+ ) -> None:
733
+ super().__init__()
734
+ self.hidden_size = config.hidden_size
735
+ rope_theta = getattr(config, "rope_theta", 10000)
736
+ max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
737
+
738
+ layer_idx = int(prefix.split(sep='.')[-1])
739
+ self.layer_idx = layer_idx
740
+ self.layers = config.num_hidden_layers
741
+ self.tp_size = get_tensor_model_parallel_world_size()
742
+ self.tp_rank = get_tp_group().rank_in_group
743
+ ascend_config = get_ascend_config()
744
+
745
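+ # MLA (multi-head latent attention) is used when the config provides the
+ # latent-attention dimensions checked below; otherwise the layer falls back to
+ # standard grouped-query attention (OpenPanguEmbeddedAttention).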
+ self.use_mla = hasattr(config, 'attention_qk_dim') and hasattr(config, 'attention_qk_rope_dim') \
746
+ and hasattr(config, 'attention_v_dim') and hasattr(config, 'attention_kv_lora_dim')
747
+ if self.use_mla:
748
+ self.self_attn = OpenPanguMLAAttention(
749
+ config=config,
750
+ hidden_size=self.hidden_size,
751
+ num_heads=config.num_attention_heads,
752
+ attention_qk_dim=config.attention_qk_dim,
753
+ attention_qk_rope_dim=config.attention_qk_rope_dim,
754
+ attention_v_dim=config.attention_v_dim,
755
+ attention_q_lora_dim=config.attention_q_lora_dim
756
+ if hasattr(config, "attention_q_lora_dim") else None,
757
+ attention_kv_lora_dim=config.attention_kv_lora_dim,
758
+ rope_theta=rope_theta,
759
+ max_position_embeddings=max_position_embeddings,
760
+ cache_config=cache_config,
761
+ quant_config=quant_config,
762
+ prefix=f"{prefix}.self_attn",
763
+ )
764
+ else:
765
+ attention_bias = getattr(config, "attention_bias", False) or getattr(
766
+ config, "bias", False)
767
+ bias_o_proj = attention_bias
768
+ if hasattr(config, 'qkv_bias'):
769
+ attention_bias = config.qkv_bias
770
+ # By default, PanguEmbedded uses causal attention as it is a decoder-only model.
771
+ # You can override the HF config with `is_causal=False` to enable
772
+ # bidirectional attention, which is used in some embedding models
773
+ if getattr(config, "is_causal", True):
774
+ attn_type = AttentionType.DECODER
775
+ else:
776
+ attn_type = AttentionType.ENCODER_ONLY
777
+ self.self_attn = OpenPanguEmbeddedAttention(
778
+ config=config,
779
+ hidden_size=self.hidden_size,
780
+ num_heads=config.num_attention_heads,
781
+ num_kv_heads=getattr(config, "num_key_value_heads", config.num_attention_heads),
782
+ rope_theta=rope_theta,
783
+ rope_scaling=getattr(config, "rope_scaling", None),
784
+ max_position_embeddings=max_position_embeddings,
785
+ quant_config=quant_config,
786
+ bias=attention_bias,
787
+ bias_o_proj=bias_o_proj,
788
+ cache_config=cache_config,
789
+ prefix=f"{prefix}.self_attn",
790
+ attn_type=attn_type,
791
+ )
792
+
793
+ if getattr(config, 'num_routed_experts', None) is not None and layer_idx >= config.num_dense_layers:
794
+ self.mlp = OpenPanguMoE(
795
+ config=config,
796
+ quant_config=quant_config,
797
+ prefix=f"{prefix}.mlp",
798
+ )
799
+ self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
800
+ and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
801
+ else:
802
+ self.mlp = OpenPanguMLP(
803
+ hidden_size=self.hidden_size,
804
+ intermediate_size=config.intermediate_size,
805
+ hidden_act=config.hidden_act,
806
+ quant_config=quant_config,
807
+ bias=getattr(config, "mlp_bias", False),
808
+ prefix=f"{prefix}.mlp",
809
+ )
810
+ self.mla_moe_communication = False
811
+ self.routed_scaling_factor = getattr(config, 'routed_scaling_factor', None)
812
+ self.num_dense_layers = getattr(config, 'num_dense_layers', None)
813
+
814
+ self.input_layernorm = RMSNorm(config.hidden_size,
815
+ eps=config.rms_norm_eps)
816
+ self.post_attention_layernorm = RMSNorm(config.hidden_size,
817
+ eps=config.rms_norm_eps)
818
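+ # sandwich_norm adds two extra RMSNorms around the MLP (pre_mlp_layernorm and
+ # post_mlp_layernorm) on top of the usual input / post-attention norms.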
+ if getattr(config, 'sandwich_norm', False):
819
+ self.sandwich_norm = True
820
+ self.pre_mlp_layernorm = RMSNorm(config.hidden_size,
821
+ eps=config.rms_norm_eps)
822
+ self.post_mlp_layernorm = RMSNorm(config.hidden_size,
823
+ eps=config.rms_norm_eps)
824
+ else:
825
+ self.sandwich_norm = False
826
+
827
+ def forward(
828
+ self,
829
+ positions: torch.Tensor,
830
+ hidden_states: torch.Tensor,
831
+ residual: Optional[torch.Tensor],
832
+ kv_cache: Optional[torch.Tensor] = None,
833
+ attn_metadata: Optional[AttentionMetadata] = None,
834
+ replace_allreduce: bool = False,
835
+ ) -> torch.Tensor:
836
+ # Self Attention
837
+ if self.use_mla and attn_metadata is not None and attn_metadata.num_decodes > 0:
838
+ mla_moe_communication = self.mla_moe_communication and replace_allreduce
839
+ else:
840
+ mla_moe_communication = False
841
+ if residual is None:
842
+ residual = hidden_states
843
+ hidden_states = self.input_layernorm(hidden_states)
844
+ else:
845
+ previous_hidden_states, previous_residual = hidden_states, residual
846
+ hidden_states, residual = self.input_layernorm(
847
+ hidden_states, residual)
848
+ # Dispose hidden_states and residual from the previous layer
849
+ # to save npu memory because they're no longer used.
850
+ dispose_tensor(previous_hidden_states)
851
+ dispose_tensor(previous_residual)
852
+ if mla_moe_communication and self.layer_idx > self.num_dense_layers:
853
+ hidden_states = tensor_model_parallel_all_gather(hidden_states,
854
+ dim=0)
855
+
856
+ hidden_states = self.self_attn(
857
+ positions=positions,
858
+ hidden_states=hidden_states,
859
+ kv_cache=kv_cache,
860
+ attn_metadata=attn_metadata,
861
+ )
862
+
863
+ if mla_moe_communication and residual.shape[0] != hidden_states.shape[0]:
864
+ chunk_hidden_states = torch.tensor_split(residual,
865
+ self.tp_size,
866
+ dim=0)
867
+ residual = chunk_hidden_states[self.tp_rank]
868
+
869
+ if self.routed_scaling_factor is not None and hidden_states.dtype == torch.float16:
870
+ # Fix FP16 overflow
871
+ # We scale both hidden_states and residual before
872
+ # rmsnorm, and rmsnorm result would not affect by scale.
873
+ hidden_states *= 1. / self.routed_scaling_factor
874
+ if self.layer_idx == 0:
875
+ # The residual is shared by all layers, we only scale it on
876
+ # first layer.
877
+ residual *= 1. / self.routed_scaling_factor
878
+
879
+ if self.sandwich_norm:
880
+ hidden_states = self.post_attention_layernorm(
881
+ hidden_states)
882
+ hidden_states, residual = self.pre_mlp_layernorm(
883
+ hidden_states, residual)
884
+ else:
885
+ hidden_states, residual = self.post_attention_layernorm(
886
+ hidden_states, residual)
887
+
888
+ # Fully Connected
889
+ if isinstance(self.mlp, OpenPanguMoE):
890
+ hidden_states = self.mlp(hidden_states,
891
+ attn_metadata,
892
+ replace_allreduce=mla_moe_communication)
893
+ else:
894
+ hidden_states = self.mlp(hidden_states)
895
+
896
+ if self.routed_scaling_factor is not None and isinstance(self.mlp, OpenPanguMLP) \
897
+ and hidden_states.dtype == torch.float16:
898
+ hidden_states *= 1. / self.routed_scaling_factor
899
+
900
+ if self.sandwich_norm:
901
+ hidden_states = self.post_mlp_layernorm(hidden_states)
902
+
903
+ if mla_moe_communication and self.layer_idx == self.layers - 1:
904
+ hidden_states = tensor_model_parallel_all_gather(hidden_states,
905
+ dim=0)
906
+ residual = tensor_model_parallel_all_gather(residual, dim=0)
907
+
908
+ return hidden_states, residual
909
+
910
+
911
+ @support_torch_compile
912
+ class OpenPanguModel(nn.Module):
913
+
914
+ fall_back_to_pt_during_load = False
915
+
916
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
917
+ super().__init__()
918
+
919
+ config = vllm_config.model_config.hf_config
920
+ model_config = vllm_config.model_config
921
+ cache_config = vllm_config.cache_config
922
+ quant_config = vllm_config.quant_config
923
+
924
+ self.padding_idx = config.pad_token_id
925
+ self.vocab_size = config.vocab_size
926
+ self.tp_size = get_tensor_model_parallel_world_size()
927
+
928
+ self.embed_tokens = VocabParallelEmbedding(
929
+ config.vocab_size,
930
+ config.hidden_size,
931
+ quant_config=quant_config,
932
+ prefix=f"{prefix}.embed_tokens")
933
+
934
+ self.start_layer, self.end_layer, self.layers = make_layers(
935
+ config.num_hidden_layers,
936
+ lambda prefix: OpenPanguDecoderLayer(
937
+ config,
938
+ prefix,
939
+ model_config=model_config,
940
+ cache_config=cache_config,
941
+ quant_config=quant_config,
942
+ ),
943
+ prefix=f"{prefix}.layers")
944
+
945
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
946
+
947
+ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
948
+ return self.embed_tokens(input_ids)
949
+
950
+ def forward(
951
+ self,
952
+ input_ids: torch.Tensor,
953
+ positions: torch.Tensor,
954
+ kv_caches: Optional[List[torch.Tensor]] = None,
955
+ attn_metadata: Optional[AttentionMetadata] = None,
956
+ inputs_embeds: Optional[torch.Tensor] = None,
957
+ **kwargs,
958
+ ) -> torch.Tensor:
959
+ if inputs_embeds is not None:
960
+ hidden_states = inputs_embeds
961
+ else:
962
+ hidden_states = self.get_input_embeddings(input_ids)
963
+ residual = None
964
+
965
+ replace_allreduce = hidden_states.shape[0] % self.tp_size == 0
966
+
967
+ for i in range(self.start_layer, self.end_layer):
968
+ layer = self.layers[i]
969
+ hidden_states, residual = layer(
970
+ positions,
971
+ hidden_states,
972
+ residual,
973
+ kv_caches[i -
974
+ self.start_layer] if kv_caches is not None else None,
975
+ attn_metadata,
976
+ replace_allreduce=replace_allreduce)
977
+
978
+ hidden_states, _ = self.norm(hidden_states, residual)
979
+ return hidden_states
980
+
981
+
982
+ class OpenPanguForCausalLM(nn.Module):
983
+ packed_modules_mapping = {
984
+ "gate_up_proj": ["gate_proj", "up_proj"],
985
+ "experts":
986
+ ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
987
+ }
988
+
989
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
990
+ super().__init__()
991
+ config = vllm_config.model_config.hf_config
992
+ quant_config = vllm_config.quant_config
993
+ self.config = config
994
+ self.quant_config = quant_config
995
+ self.model = OpenPanguModel(vllm_config=vllm_config,
996
+ prefix=maybe_prefix(prefix, "model"))
997
+ self.lm_head = ParallelLMHead(config.vocab_size,
998
+ config.hidden_size,
999
+ quant_config=quant_config,
1000
+ prefix=maybe_prefix(prefix, "lm_head"))
1001
+ self.logits_processor = LogitsProcessor(config.vocab_size)
1002
+ self.sampler = get_sampler()
1003
+
1004
+ def load_attn_mlp_weight(self,
1005
+ attn_mlp_replace_mapping: List[Tuple[str, str, int]],
1006
+ params_dict: Dict[str, Any],
1007
+ weight_name: str,
1008
+ loaded_weight: torch.Tensor,
1009
+ loaded_params: set[str]) -> bool:
1010
+ for (param_name, origin_name, shard_id) in attn_mlp_replace_mapping:
1011
+ if origin_name not in weight_name or \
1012
+ (("mlp.experts." in weight_name) and weight_name not in params_dict):
1013
+ continue
1014
+ weight_name = weight_name.replace(origin_name, param_name)
1015
+ if weight_name.endswith(".bias") and weight_name not in params_dict:
1016
+ continue
1017
+ param = params_dict[weight_name]
1018
+ weight_loader = param.weight_loader
1019
+ weight_loader(param, loaded_weight, shard_id)
1020
+ loaded_params.add(weight_name)
1021
+ return True
1022
+ return False
1023
+
1024
+ def load_expert_weight(self,
1025
+ expert_merge_mapping: List[Tuple[str, str, int, str]],
1026
+ params_dict: Dict[str, Any],
1027
+ weight_name: str,
1028
+ loaded_weight: torch.Tensor,
1029
+ loaded_params: set[str]) -> bool:
1030
+ for mapping in expert_merge_mapping:
1031
+ param_name, origin_name, expert_id, shard_id = mapping
1032
+ if origin_name not in weight_name:
1033
+ continue
1034
+ weight_name = weight_name.replace(origin_name, param_name)
1035
+ param = params_dict[weight_name]
1036
+ weight_loader = param.weight_loader
1037
+ weight_loader(param,
1038
+ loaded_weight,
1039
+ weight_name,
1040
+ shard_id=shard_id,
1041
+ expert_id=expert_id,
1042
+ return_success=False)
1043
+ loaded_params.add(weight_name)
1044
+ return True
1045
+ return False
1046
+
1047
+ def load_weights(self, weights: Iterable[tuple[str,
1048
+ torch.Tensor]]) -> set[str]:
1049
+ # (param_name, shard_name, shard_id)
1050
+ attn_mlp_replace_mapping = [
1051
+ (".qkv_proj", ".q_proj", "q"),
1052
+ (".qkv_proj", ".k_proj", "k"),
1053
+ (".qkv_proj", ".v_proj", "v"),
1054
+ (".gate_up_proj", ".gate_proj", 0),
1055
+ (".gate_up_proj", ".up_proj", 1),
1056
+ ]
1057
+ has_experts = hasattr(self.config, 'num_routed_experts')
1058
+ if has_experts:
1059
+ expert_merge_mapping = AscendFusedMoE.make_expert_params_mapping(
1060
+ ckpt_gate_proj_name="gate_proj",
1061
+ ckpt_down_proj_name="down_proj",
1062
+ ckpt_up_proj_name="up_proj",
1063
+ num_experts=self.config.num_routed_experts)
1064
+
1065
+ params_dict = dict(self.named_parameters())
1066
+ loaded_params: set[str] = set()
1067
+ for name, loaded_weight in weights:
1068
+ if "rotary_emb.inv_freq" in name:
1069
+ continue
1070
+ if 'layers' in name: # skip spec decode layers for main model
1071
+ layer_idx = int(name.split('layers.')[-1].split('.')[0])
1072
+ if layer_idx > self.config.num_hidden_layers:
1073
+ continue
1074
+
1075
+ if 'layers' in name and hasattr(self.config, "num_mtp_layers") \
1076
+ and (self.config.num_mtp_layers > 0):
1077
+ layer_idx = int(name.split('layers.')[-1].split('.')[0])
1078
+ mtp_idx = layer_idx - self.config.num_hidden_layers
1079
+ if mtp_idx >= 0 and mtp_idx < self.config.num_mtp_layers:
1080
+ continue # skip spec decode layers for main model
1081
+ if self.load_attn_mlp_weight(attn_mlp_replace_mapping, params_dict, name, loaded_weight, loaded_params):
1082
+ continue
1083
+ elif has_experts and self.load_expert_weight(expert_merge_mapping, params_dict, name, loaded_weight, loaded_params):
1084
+ continue
1085
+ else:
1086
+ if name.endswith(".bias") and name not in params_dict:
1087
+ continue
1088
+ name = maybe_remap_kv_scale_name(name, params_dict)
1089
+ if name is None:
1090
+ continue
1091
+ param = params_dict[name]
1092
+ weight_loader = getattr(param, "weight_loader",
1093
+ default_weight_loader)
1094
+ weight_loader(param, loaded_weight)
1095
+ loaded_params.add(name)
1096
+ if self.config.tie_word_embeddings:
1097
+ self.lm_head.weight = self.model.embed_tokens.weight
1098
+ return loaded_params
1099
+
1100
+ def forward(
1101
+ self,
1102
+ input_ids: torch.Tensor,
1103
+ positions: torch.Tensor,
1104
+ kv_caches: Optional[List[torch.Tensor]] = None,
1105
+ attn_metadata: Optional[AttentionMetadata] = None,
1106
+ inputs_embeds: Optional[torch.Tensor] = None,
1107
+ **kwargs,
1108
+ ) -> torch.Tensor:
1109
+ hidden_states = self.model(input_ids, positions, kv_caches,
1110
+ attn_metadata, inputs_embeds)
1111
+ return hidden_states
1112
+
1113
+ def compute_logits(
1114
+ self,
1115
+ hidden_states: torch.Tensor,
1116
+ sampling_metadata: SamplingMetadata,
1117
+ ) -> Optional[torch.Tensor]:
1118
+ logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
1119
+ return logits
1120
+
1121
+
1122
+ class PanguUltraMoEForCausalLM(OpenPanguForCausalLM):
1123
+ pass
1124
+
1125
+
1126
+ class PanguEmbeddedForCausalLM(OpenPanguForCausalLM):
1127
+ pass
inference/vllm_ascend/ops/fused_moe.py ADDED
@@ -0,0 +1,1530 @@
1
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
2
+ # Copyright 2023 The vLLM team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # This file is a part of the vllm-ascend project.
16
+ # Adapted from vllm/tests/kernels/test_moe.py
17
+
18
+ import os
19
+ from typing import Any, Callable, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ import torch_npu
24
+ from torch import nn
25
+ from vllm.config import get_current_vllm_config
26
+ from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank,
27
+ get_tensor_model_parallel_world_size,
28
+ tensor_model_parallel_all_reduce)
29
+ from vllm.distributed.parallel_state import get_dp_group, get_tp_group
30
+ from vllm.forward_context import get_forward_context
31
+ from vllm.model_executor.layers.fused_moe.config import \
32
+ FusedMoEConfig # isort: skip
33
+ from vllm.model_executor.layers.fused_moe.config import \
34
+ FusedMoEParallelConfig # isort: skip
35
+ from vllm.model_executor.layers.fused_moe.layer import (
36
+ FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
37
+ from vllm.model_executor.layers.quantization.base_config import \
38
+ QuantizationConfig
39
+
40
+ import vllm_ascend.envs as envs_ascend
41
+ from vllm_ascend.ascend_config import get_ascend_config
42
+ from vllm_ascend.distributed.communication_op import \
43
+ data_parallel_reduce_scatter
44
+ from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
45
+ from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
46
+ from vllm_ascend.utils import (FusedMoEState, dispose_tensor,
47
+ get_all_reduce_merge_state, get_fused_moe_state,
48
+ get_rm_router_logits_state, is_310p,
49
+ npu_stream_switch, npu_wait_tensor)
50
+
51
+ MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER
52
+ SELECT_GATING_TOPK_SOTFMAX_EXPERTS: bool = envs_ascend.SELECT_GATING_TOPK_SOTFMAX_EXPERTS
53
+
54
+
55
+ def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int,
56
+ max_row_per_ep_rank: int, num_tokens: int,
57
+ top_k: int) -> tuple[torch.Tensor, torch.Tensor]:
58
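+ # Pads the flattened top-k expert ids into a fixed buffer of
+ # ep_size * max_row_per_ep_rank slots, using expert_num as the padding
+ # sentinel, and returns the indices needed to undo the padding afterwards
+ # (used by the all2all-buffer path below).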
+ original_total_elements = num_tokens * top_k
59
+ device = topk_ids.device
60
+ original_dtype = topk_ids.dtype
61
+
62
+ if original_total_elements == 0:
63
+ output_len = ep_size * max_row_per_ep_rank
64
+ topk_ids_pad = torch.full((output_len, ),
65
+ expert_num,
66
+ dtype=original_dtype,
67
+ device=device)
68
+ unpad_indices = torch.full((original_total_elements, ),
69
+ -1,
70
+ dtype=torch.long,
71
+ device=device)
72
+ return topk_ids_pad, unpad_indices
73
+
74
+ experts_per_ep_rank_val = expert_num // ep_size
75
+ if experts_per_ep_rank_val == 0:
76
+ raise ValueError(
77
+ "expert_num // ep_size is 0, which leads to division by zero in ep_rank calculation. "
78
+ "Ensure expert_num >= ep_size.")
79
+
80
+ assigned_ep_rank = (topk_ids.float() /
81
+ experts_per_ep_rank_val).to(original_dtype)
82
+ indices_arange = torch.arange(topk_ids.shape[0], device=device)
83
+
84
+ is_new_segment = torch.cat(
85
+ (torch.tensor([True], device=device), assigned_ep_rank[1:]
86
+ != assigned_ep_rank[:-1]))
87
+ temp_start_markers = torch.full_like(indices_arange,
88
+ -1,
89
+ dtype=indices_arange.dtype)
90
+ temp_start_markers[is_new_segment] = indices_arange[is_new_segment]
91
+ start_offset_for_each_token = torch.cummax(temp_start_markers, dim=0)[0]
92
+ token_intra_ep_rank_idx = indices_arange - start_offset_for_each_token
93
+ is_kept_mask = token_intra_ep_rank_idx < max_row_per_ep_rank
94
+ cumsum_kept = torch.cumsum(is_kept_mask.float(), dim=0).to(torch.long)
95
+ indices_in_rec_cond_list_for_all = cumsum_kept - 1
96
+ unpad_indices = torch.where(
97
+ is_kept_mask, indices_in_rec_cond_list_for_all,
98
+ torch.tensor(-1, device=device, dtype=torch.long))
99
+ output_len = ep_size * max_row_per_ep_rank
100
+ topk_ids_pad = torch.full((output_len, ),
101
+ expert_num,
102
+ dtype=original_dtype,
103
+ device=device)
104
+ if topk_ids.shape[0] > 0:
105
+ all_destination_indices = assigned_ep_rank * max_row_per_ep_rank + token_intra_ep_rank_idx
106
+ temp_pad_buffer = torch.full((output_len + 1, ),
107
+ expert_num,
108
+ dtype=original_dtype,
109
+ device=device)
110
+ output_len_tensor = torch.tensor(output_len,
111
+ dtype=torch.long,
112
+ device=device)
113
+ scatter_indices = torch.where(is_kept_mask, all_destination_indices,
114
+ output_len_tensor)
115
+ temp_pad_buffer.scatter_(0, scatter_indices, topk_ids)
116
+ topk_ids_pad = temp_pad_buffer[:output_len]
117
+ return topk_ids_pad, unpad_indices
118
+
119
+
120
+ def fused_experts_with_mc2(
121
+ hidden_states: torch.Tensor,
122
+ w1: torch.Tensor,
123
+ w2: torch.Tensor,
124
+ topk_weights: torch.Tensor,
125
+ topk_ids: torch.Tensor,
126
+ top_k: int,
127
+ expert_map: torch.Tensor = None,
128
+ moe_all_to_all_group_name: Optional[str] = None,
129
+ shared_experts: Optional[Any] = None
130
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
131
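+ # MC2 path: tokens are dispatched to expert-parallel ranks with
+ # npu_moe_distribute_dispatch, processed by the grouped gate_up/down matmuls,
+ # and gathered back with npu_moe_distribute_combine; shared experts, if given,
+ # are overlapped on a secondary NPU stream.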
+ global_bs = 0
132
+ moe_expert_num = len(expert_map)
133
+ kwargs_mc2 = {
134
+ "x": hidden_states,
135
+ "expert_ids": topk_ids,
136
+ "expert_shard_type": 0,
137
+ "shared_expert_rank_num": 0,
138
+ "moe_expert_num": moe_expert_num,
139
+ "global_bs": global_bs,
140
+ }
141
+
142
+ rank = torch.distributed.get_rank()
143
+
144
+ quant_mode = 0
145
+ ep_group = get_ep_group().device_group
146
+ local_rank = torch.distributed.get_rank(group=ep_group)
147
+ all_to_all_group_size = torch.distributed.get_world_size(ep_group)
148
+
149
+ tp_size = get_etp_group().world_size
150
+ tp_rank = rank % tp_size
151
+
152
+ stage1_kwargs = {
153
+ "scales": None,
154
+ "quant_mode": quant_mode,
155
+ "group_ep": moe_all_to_all_group_name,
156
+ "ep_world_size": all_to_all_group_size,
157
+ "ep_rank_id": local_rank,
158
+ # "group_tp": self.moe_rs_group_name,
159
+ "group_tp": moe_all_to_all_group_name,
160
+ "tp_world_size": tp_size,
161
+ "tp_rank_id": tp_rank,
162
+ }
163
+ kwargs_mc2.update(stage1_kwargs)
164
+
165
+ output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
166
+ expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
167
+ 0:5]
168
+
169
+ if shared_experts is not None:
170
+ with npu_stream_switch("moe_secondary", 0):
171
+ npu_wait_tensor(hidden_states, topk_weights)
172
+ shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
173
+ npu_wait_tensor(shared_gate_up, expand_x)
174
+ shared_act = shared_experts.act_fn(shared_gate_up)
175
+
176
+ w1 = w1.transpose(1, 2)
177
+
178
+ group_list = expert_token_nums.to(torch.int64)
179
+ gate_up_out_list = torch_npu.npu_grouped_matmul(
180
+ x=[expand_x],
181
+ weight=[w1],
182
+ split_item=2,
183
+ # 1 means count mode, to avoid cumulative operation of the group list
184
+ group_list_type=1,
185
+ group_type=0,
186
+ group_list=group_list,
187
+ )
188
+
189
+ # TODO: Remove this in the future.
190
+ gate_up_out = torch.cat(gate_up_out_list, dim=0)
191
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out)
192
+
193
+ w2 = w2.transpose(1, 2)
194
+ down_out_list = torch_npu.npu_grouped_matmul(
195
+ x=[gate_up_out],
196
+ weight=[w2],
197
+ split_item=2,
198
+ group_list_type=1,
199
+ group_type=0,
200
+ group_list=group_list,
201
+ )
202
+
203
+ down_out_list = torch.cat(down_out_list, dim=0)
204
+
205
+ # moeCombine
206
+ kwargs_mc2 = {
207
+ "expand_x": down_out_list,
208
+ "expert_ids": topk_ids,
209
+ "expand_idx": expand_idx,
210
+ "expert_scales": topk_weights.to(torch.float32),
211
+ "expert_shard_type": 0,
212
+ "shared_expert_rank_num": 0,
213
+ "moe_expert_num": moe_expert_num,
214
+ "global_bs": 0,
215
+ }
216
+ tp_recv_counts = output[5]
217
+ stage3_kwargs = {
218
+ "ep_send_counts": ep_recv_counts,
219
+ "group_ep": moe_all_to_all_group_name,
220
+ "ep_world_size": all_to_all_group_size,
221
+ "ep_rank_id": local_rank,
222
+ "tp_send_counts": tp_recv_counts,
223
+ # "group_tp": self.moe_rs_group_name,
224
+ "group_tp": moe_all_to_all_group_name,
225
+ "tp_world_size": tp_size,
226
+ "tp_rank_id": tp_rank,
227
+ }
228
+ kwargs_mc2.update(stage3_kwargs)
229
+
230
+ hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
231
+
232
+ if shared_experts is None:
233
+ return hidden_states
234
+ else:
235
+ with npu_stream_switch("moe_secondary", 0):
236
+ npu_wait_tensor(shared_act, down_out_list)
237
+ shared_hidden_states, _ = shared_experts.down_proj(shared_act)
238
+ return hidden_states, shared_hidden_states
239
+
240
+
241
+ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
242
+ w1: torch.Tensor,
243
+ w2: torch.Tensor,
244
+ group_list: torch.Tensor,
245
+ group_list_type: int = 1) -> torch.Tensor:
246
+ """
247
+ apply MLP: gate_up_proj -> swiglu -> down_proj
248
+
249
+ Args:
250
+ hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size).
251
+ w1: expert weights1 with shape
252
+ (num_experts, hidden_size, intermediate_size * 2)
253
+ w2: expert weights2 with shape
254
+ (num_experts, intermediate_size, hidden_size)
255
+ group_list: number of tokens for each expert, either cumulative or per-expert
256
+ counts depending on group_list_type, with shape (num_experts,).
257
+ transpose_weight:
258
+ w1: (num_experts, intermediate_size * 2, hidden_size) ->
259
+ (num_experts, hidden_size, intermediate_size * 2)
260
+ w2: (num_experts, hidden_size, intermediate_size) ->
261
+ (num_experts, intermediate_size, hidden_size)
262
+
263
+ Returns:
264
+ hidden_states: output hidden states after MLP.
265
+ """
266
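+ # Note: group_list_type selects how group_list is interpreted by
+ # npu_grouped_matmul: 0 expects cumulative sums, 1 expects per-expert token
+ # counts (the "count mode" used elsewhere in this file).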
+
267
+ assert len(hidden_states_wrapper) == 1
268
+ hidden_states = hidden_states_wrapper.pop()
269
+
270
+ w1 = w1.transpose(1, 2)
271
+ hidden_states = torch_npu.npu_grouped_matmul(
272
+ x=[hidden_states],
273
+ weight=[w1],
274
+ split_item=2,
275
+ group_list_type=group_list_type,
276
+ group_type=0,
277
+ group_list=group_list,
278
+ )
279
+
280
+ hidden_states = torch.cat(hidden_states, dim=0)
281
+ hidden_states = torch_npu.npu_swiglu(hidden_states)
282
+
283
+ w2 = w2.transpose(1, 2)
284
+ hidden_states = torch_npu.npu_grouped_matmul(
285
+ x=[hidden_states],
286
+ weight=[w2],
287
+ split_item=2,
288
+ group_list_type=group_list_type,
289
+ group_type=0,
290
+ group_list=group_list,
291
+ )
292
+
293
+ hidden_states = torch.cat(hidden_states, dim=0)
294
+ return hidden_states
295
+
296
+
297
+ def fused_experts_with_all2all(
298
+ hidden_states: torch.Tensor,
299
+ w1: torch.Tensor,
300
+ w2: torch.Tensor,
301
+ topk_weights: torch.Tensor,
302
+ topk_ids: torch.Tensor,
303
+ top_k: int,
304
+ expert_map: torch.Tensor = None,
305
+ ep_group: GroupCoordinator = None,
306
+ ):
307
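+ # All2all path: tokens are expanded with npu_moe_init_routing, exchanged
+ # across expert-parallel ranks via all_to_all (data plus expert ids), run
+ # through the grouped matmuls, then sent back and recombined with
+ # npu_moe_finalize_routing; without an expert_map the routing stays local.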
+ original_shape = hidden_states.shape
308
+ if len(original_shape) == 3:
309
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
310
+
311
+ num_tokens, _ = hidden_states.shape
312
+ num_experts = w1.shape[0]
313
+ device = hidden_states.device
314
+
315
+ if expert_map is not None:
316
+ global_num_experts = len(expert_map)
317
+ local_num_experts = global_num_experts // ep_group.world_size
318
+ row_idx_len = num_tokens * top_k
319
+ row_idx = (torch.arange(0,
320
+ row_idx_len,
321
+ dtype=torch.int32,
322
+ device=device).view(top_k, -1).permute(
323
+ 1, 0).contiguous())
324
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
325
+ hidden_states,
326
+ row_idx=row_idx,
327
+ expert_idx=topk_ids,
328
+ active_num=num_tokens)
329
+
330
+ global_expert_tokens = torch.bincount(expanded_expert_idx,
331
+ minlength=global_num_experts)
332
+ scatter_sizes = global_expert_tokens.view(ep_group.world_size,
333
+ -1).sum(-1)
334
+
335
+ gather_sizes = torch.empty_like(scatter_sizes)
336
+ dist.all_to_all_single(gather_sizes,
337
+ scatter_sizes,
338
+ group=ep_group.device_group)
339
+ scatter_size_list = scatter_sizes.cpu().tolist()
340
+ gather_size_list = gather_sizes.cpu().tolist()
341
+
342
+ expanded_expert_idx = expanded_expert_idx % local_num_experts
343
+ hidden_states = ep_group.all_to_all(hidden_states, 0, 0,
344
+ scatter_size_list,
345
+ gather_size_list)
346
+ local_expert_idx = ep_group.all_to_all(expanded_expert_idx, 0, 0,
347
+ scatter_size_list,
348
+ gather_size_list)
349
+
350
+ sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx)
351
+
352
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
353
+ sorted_local_expert_idx, local_num_experts).to(torch.int64)
354
+
355
+ hidden_states = hidden_states[sorted_idx]
356
+ else:
357
+ row_idx_len = num_tokens * top_k
358
+ row_idx = torch.arange(0,
359
+ row_idx_len,
360
+ dtype=torch.int32,
361
+ device=topk_weights.device).view(
362
+ top_k, -1).permute(1, 0).contiguous()
363
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
364
+ hidden_states,
365
+ row_idx=row_idx,
366
+ expert_idx=topk_ids,
367
+ active_num=num_tokens)
368
+
369
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
370
+ expanded_expert_idx, num_experts)
371
+ expert_tokens = expert_tokens.to(torch.int64)
372
+
373
+ w1 = w1.transpose(1, 2)
374
+ gate_up_out_list = torch_npu.npu_grouped_matmul(
375
+ x=[hidden_states],
376
+ weight=[w1],
377
+ split_item=2,
378
+ group_list_type=0,
379
+ group_type=0,
380
+ group_list=expert_tokens,
381
+ )
382
+
383
+ # TODO: Remove this in the future.
384
+ hidden_states = torch.cat(gate_up_out_list, dim=0)
385
+ hidden_states = torch_npu.npu_swiglu(hidden_states)
386
+
387
+ w2 = w2.transpose(1, 2)
388
+ down_out_list = torch_npu.npu_grouped_matmul(
389
+ x=[hidden_states],
390
+ weight=[w2],
391
+ split_item=2,
392
+ group_list_type=0,
393
+ group_type=0,
394
+ group_list=expert_tokens,
395
+ )
396
+
397
+ hidden_states = torch.cat(down_out_list, dim=0)
398
+
399
+ if expert_map is not None:
400
+ resorted_idx = torch.argsort(sorted_idx)
401
+ hidden_states = hidden_states[resorted_idx]
402
+ hidden_states = ep_group.all_to_all(hidden_states, 0, 0,
403
+ gather_size_list,
404
+ scatter_size_list)
405
+
406
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
407
+ hidden_states,
408
+ skip1=None,
409
+ skip2=None,
410
+ bias=None,
411
+ scales=topk_weights,
412
+ expanded_src_to_dst_row=expanded_row_idx,
413
+ export_for_source_row=topk_ids,
414
+ )
415
+ else:
416
+ # TODO: Reorder device memory 2 times here, replace the current
417
+ # implementation here when suitable operators become available.
418
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
419
+ hidden_states,
420
+ skip1=None,
421
+ skip2=None,
422
+ bias=None,
423
+ scales=topk_weights,
424
+ expanded_src_to_dst_row=expanded_row_idx,
425
+ export_for_source_row=topk_ids,
426
+ )
427
+ if len(original_shape) == 3:
428
+ final_hidden_states = final_hidden_states.view(original_shape)
429
+ return final_hidden_states
430
+
431
+
432
+ # currently expert parallelism implemented with all2all
433
+ # is under-optimized.
434
+ def fused_experts_with_all2all_buffer(
435
+ hidden_states: torch.Tensor,
436
+ w1: torch.Tensor,
437
+ w2: torch.Tensor,
438
+ topk_weights: torch.Tensor,
439
+ topk_ids: torch.Tensor,
440
+ top_k: int,
441
+ max_model_len: int,
442
+ global_batch_size: int,
443
+ expert_map: torch.Tensor = None,
444
+ ep_group: GroupCoordinator = None,
445
+ ):
446
+ original_shape = hidden_states.shape
447
+ if len(original_shape) == 3:
448
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
449
+
450
+ num_tokens, _ = hidden_states.shape
451
+ device = hidden_states.device
452
+
453
+ global_num_experts = len(expert_map)
454
+ local_num_experts = global_num_experts // ep_group.world_size
455
+ row_idx_len = num_tokens * top_k
456
+ row_idx = (torch.arange(0, row_idx_len, dtype=torch.int32,
457
+ device=device).view(top_k,
458
+ -1).permute(1, 0).contiguous())
459
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
460
+ hidden_states,
461
+ row_idx=row_idx,
462
+ expert_idx=topk_ids,
463
+ active_num=num_tokens)
464
+
465
+ max_row_per_ep_rank = (-(-global_batch_size // ep_group.world_size) *
466
+ max_model_len // ep_group.world_size +
467
+ 1) * top_k * 2
468
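+ # Rough per-rank capacity bound: ceil(global_batch_size / ep_size) *
+ # max_model_len / ep_size + 1 rows, scaled by top_k with a 2x margin
+ # (-(-a // b) is ceiling division).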
+ expert_idx_buffer_scatter, unpad_indices = process_topk_ids(
469
+ expanded_expert_idx, global_num_experts, ep_group.world_size,
470
+ max_row_per_ep_rank, num_tokens, top_k)
471
+ hidden_states_pad_idx = torch.zeros(
472
+ expert_idx_buffer_scatter.shape,
473
+ dtype=expert_idx_buffer_scatter.dtype,
474
+ device=expert_idx_buffer_scatter.device)
475
+ non_pad_len = torch.sum((expert_idx_buffer_scatter
476
+ != global_num_experts).to(torch.int32))
477
+ hidden_states_pad_idx[expert_idx_buffer_scatter !=
478
+ global_num_experts] = torch.arange(
479
+ non_pad_len,
480
+ dtype=expert_idx_buffer_scatter.dtype,
481
+ device=hidden_states.device)
482
+
483
+ hidden_states_buffer_scatter = hidden_states[hidden_states_pad_idx]
484
+ expert_idx_buffer_gather = torch.empty_like(
485
+ expert_idx_buffer_scatter,
486
+ dtype=expert_idx_buffer_scatter.dtype,
487
+ device=expert_idx_buffer_scatter.device)
488
+ hidden_states_buffer_gather = torch.empty_like(
489
+ hidden_states_buffer_scatter,
490
+ dtype=hidden_states_buffer_scatter.dtype,
491
+ device=hidden_states_buffer_scatter.device)
492
+ dist.all_to_all_single(expert_idx_buffer_gather,
493
+ expert_idx_buffer_scatter,
494
+ group=ep_group.device_group)
495
+ dist.all_to_all_single(hidden_states_buffer_gather,
496
+ hidden_states_buffer_scatter,
497
+ group=ep_group.device_group)
498
+ mask = expert_idx_buffer_gather != global_num_experts
499
+ local_expert_idx = expert_idx_buffer_gather[mask] - ep_group.rank * (
500
+ global_num_experts // ep_group.world_size)
501
+ hidden_states = hidden_states_buffer_gather[mask]
502
+ idx_type = local_expert_idx.dtype
503
+ sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx.float())
504
+ sorted_local_expert_idx = sorted_local_expert_idx.to(idx_type)
505
+
506
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
507
+ sorted_local_expert_idx, local_num_experts).to(torch.int64)
508
+ hidden_states = hidden_states[sorted_idx]
509
+ group_list_type = 0
510
+
511
+ hidden_states_wrapper = [hidden_states]
512
+ del hidden_states
513
+
514
+ hidden_states = apply_mlp(hidden_states_wrapper,
515
+ w1,
516
+ w2,
517
+ expert_tokens,
518
+ group_list_type=group_list_type)
519
+
520
+ resorted_idx = torch.argsort(sorted_idx.float()).to(sorted_idx.dtype)
521
+ hidden_states = hidden_states[resorted_idx]
522
+ hidden_states_scatter = torch.zeros(
523
+ (mask.shape[0], hidden_states.shape[1]),
524
+ dtype=hidden_states.dtype,
525
+ device=hidden_states.device)
526
+ hidden_states_scatter[mask] = hidden_states
527
+ hidden_states_gatter = torch.empty_like(
528
+ hidden_states_scatter,
529
+ dtype=hidden_states_scatter.dtype,
530
+ device=hidden_states_scatter.device)
531
+ dist.all_to_all_single(hidden_states_gatter,
532
+ hidden_states_scatter,
533
+ group=ep_group.device_group)
534
+ hidden_states_gatter = hidden_states_gatter[expert_idx_buffer_scatter !=
535
+ global_num_experts]
536
+ if hidden_states_gatter.shape[0] != row_idx_len:
537
+ hidden_states = torch.zeros((row_idx_len, hidden_states.shape[1]),
538
+ dtype=hidden_states.dtype,
539
+ device=hidden_states.device)
540
+ hidden_states[unpad_indices != -1] = hidden_states_gatter
541
+ else:
542
+ # TODO: Reorder device memory 2 times here, replace the current
+ # implementation here when suitable operators become available.
543
+ hidden_states = hidden_states_gatter
544
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
545
+ hidden_states,
546
+ skip1=None,
547
+ skip2=None,
548
+ bias=None,
549
+ scales=topk_weights,
550
+ expanded_src_to_dst_row=expanded_row_idx,
551
+ export_for_source_row=topk_ids,
552
+ )
553
+
554
+ if len(original_shape) == 3:
555
+ final_hidden_states = final_hidden_states.view(original_shape)
556
+ return final_hidden_states
557
+
558
+
559
+ def fused_experts_moge(
560
+ hidden_states: torch.Tensor,
561
+ w1: torch.Tensor,
562
+ w2: torch.Tensor,
563
+ topk_weights: torch.Tensor,
564
+ topk_ids: torch.Tensor,
565
+ top_k: int,
566
+ global_num_experts: int,
567
+ expert_map: torch.Tensor = None,
568
+ apply_router_weight_on_input: bool = False,
569
+ ) -> torch.Tensor:
570
+ """
571
+
572
+ Args:
573
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
574
+ w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
575
+ w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
576
+ topk_weights: Routing weights of shape (num_tokens, top_k).
577
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
578
+ top_k: Number of experts to select.
579
+ expert_map: Expert mapping of shape (num_experts,).
580
+
581
+ Returns:
582
+ hidden_states: Hidden states after routing.
583
+ """
584
+ ep_size = get_ep_group().world_size
585
+ local_num_experts = global_num_experts // ep_size
586
+ local_num_group = top_k // ep_size
587
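+ # Assumes global_num_experts and top_k are multiples of the EP world size;
+ # each rank only processes its local slice of experts and routing groups.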
+
588
+ if apply_router_weight_on_input:
589
+ assert (topk_weights.dim() == 2
590
+ ), "`topk_weights` should be in shape (num_tokens, topk)"
591
+ _, topk = topk_weights.shape
592
+ assert (
593
+ topk == 1
594
+ ), "Only support topk=1 when `apply_router_weight_on_input` is True"
595
+ hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
596
+
597
+ bsz, _ = hidden_states.shape
598
+ flatten_topk_ids = topk_ids.view(-1)
599
+ sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
600
+ sorted_topk_ids = sorted_topk_ids.to(torch.int32)
601
+ sorted_hidden_states = hidden_states.index_select(
602
+ 0, sorted_topk_ids // local_num_group)
603
+
604
+ experts_id = torch.arange(0,
605
+ local_num_experts,
606
+ dtype=topk_ids.dtype,
607
+ device=topk_ids.device)
608
+ num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
609
+ torch.float32).sum(0)
610
+ topk_scales = topk_weights.view(-1).index_select(
611
+ 0, sorted_topk_ids).unsqueeze(-1)
612
+ group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)
613
+
614
+ w1 = w1.transpose(1, 2)
615
+ gate_up_out = torch_npu.npu_grouped_matmul(
616
+ x=[sorted_hidden_states],
617
+ weight=[w1],
618
+ split_item=2,
619
+ group_list_type=0,
620
+ group_type=0,
621
+ group_list=group_list,
622
+ )[0]
623
+
624
+ if is_310p():
625
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
626
+ torch.float16)
627
+ else:
628
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out)
629
+ gate_up_out *= topk_scales
630
+
631
+ w2 = w2.transpose(1, 2)
632
+ down_out_list = torch_npu.npu_grouped_matmul(
633
+ x=[gate_up_out],
634
+ weight=[w2],
635
+ split_item=2,
636
+ group_list_type=0,
637
+ group_type=0,
638
+ group_list=group_list,
639
+ )[0]
640
+
641
+ unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
642
+ unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids)
643
+ final_hidden_states = unsorted_hidden_states.reshape(
644
+ bsz, top_k // ep_size, -1).sum(1)
645
+
646
+ return final_hidden_states
647
+
648
+
649
+ def fused_experts(
650
+ hidden_states: torch.Tensor,
651
+ w1: torch.Tensor,
652
+ w2: torch.Tensor,
653
+ topk_weights: torch.Tensor,
654
+ topk_ids: torch.Tensor,
655
+ top_k: int,
656
+ expert_map: torch.Tensor = None,
657
+ apply_router_weight_on_input: bool = False,
658
+ max_num_tokens: Optional[int] = None,
659
+ ) -> torch.Tensor:
660
+ """
661
+ Fused experts with top-k routing.
662
+
663
+ Args:
664
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
665
+ w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
666
+ w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
667
+ topk_weights: Routing weights of shape (num_tokens, top_k).
668
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
669
+ top_k: Number of experts to select.
670
+ expert_map: Expert mapping of shape (num_experts,).
671
+
672
+ Returns:
673
+ hidden_states: Hidden states after routing.
674
+ """
675
+ """
676
+ # Check constraints.
677
+ assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
678
+ assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
679
+ assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
680
+ assert w1.is_contiguous(), "Expert weights1 must be contiguous"
681
+ assert w2.is_contiguous(), "Expert weights2 must be contiguous"
682
+ """
683
+ # if torch.distributed.get_rank() == 0:
684
+ # print(w1.shape)
685
+ # print(hidden_states.shape)
686
+
687
+ original_shape = hidden_states.shape
688
+ # assert len(original_shape) == 2
689
+
690
+ num_tokens = hidden_states.shape[:-1].numel()
691
+ num_experts = w1.shape[0]
692
+ dtype = hidden_states.dtype
693
+ device = hidden_states.device
694
+ # assert dtype in [torch.float32, torch.float16, torch.bfloat16
695
+ # ], "Only float32, float16, and bfloat16 are supported"
696
+
697
+ if apply_router_weight_on_input:
698
+ assert (topk_weights.dim() == 2
699
+ ), "`topk_weights` should be in shape (num_tokens, topk)"
700
+ _, topk = topk_weights.shape
701
+ assert (
702
+ topk == 1
703
+ ), "Only support topk=1 when `apply_router_weight_on_input` is True"
704
+ hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
705
+
706
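+ # expert_map maps global expert ids to local ids on this rank (-1 for experts
+ # owned by other ranks); the branch below drops non-local pairs and sorts the
+ # remaining tokens by local expert id before the grouped matmuls.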
+ if expert_map is not None:
707
+ # Generate token indices and flatten
708
+ token_indices = (torch.arange(num_tokens,
709
+ device=device,
710
+ dtype=torch.int64).unsqueeze(1).expand(
711
+ -1, top_k).reshape(-1))
712
+
713
+ # Flatten token-to-expert mappings and map to local experts
714
+ weights_flat = topk_weights.view(-1)
715
+ experts_flat = topk_ids.view(-1)
716
+ local_experts_flat = expert_map[experts_flat]
717
+
718
+ # Filter valid token-expert pairs
719
+ mask = local_experts_flat != -1
720
+ filtered_weights = torch.where(
721
+ mask, weights_flat, torch.zeros_like(weights_flat)).to(dtype)
722
+ filtered_experts = torch.where(
723
+ mask, local_experts_flat,
724
+ torch.full_like(local_experts_flat,
725
+ num_experts)).to(topk_ids.dtype)
726
+
727
+ # Sort by local expert IDs
728
+ sort_indices = torch.argsort(filtered_experts.view(torch.float32))
729
+ sorted_token_indices = token_indices[sort_indices]
730
+ sorted_weights = filtered_weights[sort_indices]
731
+
732
+ # Compute token counts with minlength of num_experts
733
+ # This is equivalent to but faster than:
734
+ # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1]
735
+ token_counts = torch.zeros(num_experts + 1,
736
+ device=device,
737
+ dtype=torch.int64)
738
+ ones = torch.ones_like(filtered_experts, dtype=torch.int64)
739
+ token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones)
740
+ token_counts = token_counts[:num_experts]
741
+ expert_tokens = torch.cumsum(token_counts, dim=0, dtype=torch.int64)
742
+
743
+ # Rearrange hidden_states
744
+ sorted_hidden_states = hidden_states[sorted_token_indices]
745
+ else:
746
+ row_idx_len = num_tokens * top_k
747
+ row_idx = (torch.arange(0,
748
+ row_idx_len,
749
+ dtype=torch.int32,
750
+ device=device).view(top_k, -1).permute(
751
+ 1, 0).contiguous())
752
+ active_num = max_num_tokens if max_num_tokens is not None else num_tokens
753
+ sorted_hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
754
+ hidden_states,
755
+ row_idx=row_idx,
756
+ expert_idx=topk_ids,
757
+ active_num=active_num)
758
+
759
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
760
+ expanded_expert_idx, num_experts)
761
+ expert_tokens = expert_tokens.to(torch.int64)
762
+
763
+ w1 = w1.transpose(1, 2)
764
+ gate_up_out_list = torch_npu.npu_grouped_matmul(
765
+ x=[sorted_hidden_states],
766
+ weight=[w1],
767
+ split_item=2,
768
+ group_list_type=0,
769
+ group_type=0,
770
+ group_list=expert_tokens,
771
+ )
772
+
773
+ # TODO: Remove this in the future.
774
+ gate_up_out = torch.cat(gate_up_out_list, dim=0)
775
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out)
776
+
777
+ w2 = w2.transpose(1, 2)
778
+ down_out_list = torch_npu.npu_grouped_matmul(
779
+ x=[gate_up_out],
780
+ weight=[w2],
781
+ split_item=2,
782
+ group_list_type=0,
783
+ group_type=0,
784
+ group_list=expert_tokens,
785
+ )
786
+
787
+ down_out_list = torch.cat(down_out_list, dim=0)
788
+
789
+ if expert_map is not None:
790
+ weighted_down_out = down_out_list * sorted_weights.unsqueeze(1)
791
+
792
+ final_hidden_states = torch.zeros(*original_shape,
793
+ device=hidden_states.device,
794
+ dtype=dtype)
795
+
796
+ # TODO: npu_grouped_matmul output random values at [num_valid_tokens:, ...]
797
+ # This created multiple NaN and index_add_ will mix them up which harms accuracy
798
+ # remove this mask and filter after it being fixed
799
+ num_valid_tokens = mask.sum()
800
+ valid_token_mask = torch.arange(
801
+ 0, sorted_token_indices.shape[0],
802
+ device=device).unsqueeze(1) < num_valid_tokens
803
+ valid_output = torch.where(
804
+ valid_token_mask, weighted_down_out,
805
+ torch.zeros_like(weighted_down_out)).to(dtype)
806
+ final_hidden_states.index_add_(0, sorted_token_indices, valid_output)
807
+ else:
808
+ scales = torch.ones_like(
809
+ topk_weights) if apply_router_weight_on_input else topk_weights
810
+ # TODO: Reorder device memory 2 times here, replace the current
811
+ # implementation here when suitable operators become available.
812
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
813
+ down_out_list,
814
+ skip1=None,
815
+ skip2=None,
816
+ bias=None,
817
+ scales=scales,
818
+ expanded_src_to_dst_row=expanded_row_idx,
819
+ export_for_source_row=topk_ids,
820
+ )
821
+
822
+ return final_hidden_states
823
+
824
+
825
+ def fused_experts_allgather_ep(
826
+ hidden_states: torch.Tensor,
827
+ w1: torch.Tensor,
828
+ w2: torch.Tensor,
829
+ topk_weights: torch.Tensor,
830
+ topk_ids: torch.Tensor,
831
+ is_prefill: bool
832
+ ):
833
+ local_rank = torch.distributed.get_rank(group=get_ep_group().device_group)
834
+ num_experts_per_ep = w1.shape[0]
835
+ local_expert_indices_offset = local_rank * num_experts_per_ep
836
+ global_local_mask = (topk_ids >= local_expert_indices_offset) & \
837
+ (topk_ids <= local_expert_indices_offset + num_experts_per_ep - 1)
838
+ non_global_local_mask = (~global_local_mask).to(torch.int32)
839
+ global_local_mask = global_local_mask.to(torch.int32)
840
+ row_idx = torch.arange(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32).view(
841
+ -1, topk_ids.shape[0]).transpose(0, 1).contiguous()
842
+
843
+ topk_ids -= local_expert_indices_offset
844
+ local_topk_ids_mask_with_max = topk_ids * global_local_mask + non_global_local_mask * num_experts_per_ep
845
+ sorted_tokens, expanded_src_to_dst_row, expanded_expert_idx = torch_npu.npu_moe_init_routing(
846
+ x=hidden_states,
847
+ row_idx=row_idx,
848
+ expert_idx=local_topk_ids_mask_with_max,
849
+ active_num=topk_ids.shape[0]*topk_ids.shape[1]
850
+ )
851
+ if expanded_expert_idx.shape[0] > 8192:
852
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(expanded_expert_idx, num_experts_per_ep + 1)
853
+ expert_tokens = expert_tokens[:-1]
854
+ else:
855
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(expanded_expert_idx, num_experts_per_ep)
856
+ expert_tokens = expert_tokens.to(torch.int64)
857
+
858
+ w1 = w1.transpose(1, 2)
859
+ gate_up_out = torch_npu.npu_grouped_matmul(
860
+ x=[sorted_tokens],
861
+ weight=[w1],
862
+ group_list=expert_tokens,
863
+ split_item=3,
864
+ group_type=0
865
+ )[0]
866
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out)
867
+
868
+ w2 = w2.transpose(1, 2)
869
+ down_out = torch_npu.npu_grouped_matmul(
870
+ x=[gate_up_out],
871
+ weight=[w2],
872
+ group_list=expert_tokens,
873
+ split_item=3,
874
+ group_type=0
875
+ )[0]
876
+
877
+ if is_prefill:
878
+ down_out[expert_tokens[-1]:] = 0
879
+ else:
880
+ sorted_tokens_mask = expanded_expert_idx != num_experts_per_ep
881
+ down_out *= sorted_tokens_mask.unsqueeze(1)
882
+
883
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
884
+ expanded_permuted_rows=down_out,
885
+ skip1=None,
886
+ skip2=None,
887
+ bias=None,
888
+ scales=topk_weights.to(down_out.dtype),
889
+ expanded_src_to_dst_row=expanded_src_to_dst_row,
890
+ export_for_source_row=topk_ids
891
+ )
892
+ return final_hidden_states
893
+
894
+
895
+ def select_gating_top_k_softmax_experts(
896
+ hidden_states: torch.Tensor, router_logits: torch.Tensor, top_k: int,
897
+ renormalize: bool) -> tuple[torch.Tensor, torch.Tensor]:
898
+ """
899
+ Select top-k experts based on router logits.
900
+ Only supports float16, bfloat16, and float32.
901
+
902
+ Args:
903
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
904
+ router_logits: Router logits of shape (num_tokens, num_experts).
905
+ top_k: Number of experts to select.
906
+ renormalize: Whether to renormalize the routing weights.
907
+
908
+ Returns:
909
+ topk_weights: Routing weights of shape (num_tokens, top_k).
910
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
911
+
912
+ Raises:
913
+ ValueError: If an unsupported scoring function is provided.
914
+ """
915
+ topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax(
916
+ router_logits, None, k=top_k)
917
+
918
+ # # Required by npu_moe_init_routing
919
+ # topk_weights = topk_weights.to(hidden_states.dtype)
920
+ # topk_ids = topk_ids.to(torch.int32)
921
+
922
+ if renormalize:
923
+ topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
924
+
925
+ return topk_weights, topk_ids
926
+
927
+
928
+ def native_grouped_topk(
929
+ topk_weights: torch.Tensor,
930
+ num_expert_group: Optional[int],
931
+ topk_group: Optional[int],
932
+ ):
933
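+ # Pure-PyTorch grouped top-k: each expert group is scored by its best expert,
+ # the topk_group best groups are kept, and the scores of experts in the
+ # remaining groups are masked to zero.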
+ topk_group = 0 if topk_group is None else topk_group
934
+ num_expert_group = 0 if num_expert_group is None else num_expert_group
935
+
936
+ num_token = topk_weights.shape[0]
937
+ grouped_weights = topk_weights.view(num_token, num_expert_group,
938
+ -1).max(dim=-1).values
939
+ topk_group_indices = torch.topk(grouped_weights.to(torch.float32),
940
+ k=topk_group,
941
+ dim=-1,
942
+ sorted=False)[1]
943
+ topk_group_mask = torch.zeros_like(grouped_weights)
944
+ topk_group_mask.scatter_(1, topk_group_indices, 1)
945
+ topk_weight_mask = (topk_group_mask.unsqueeze(-1).expand(
946
+ num_token, num_expert_group,
947
+ topk_weights.shape[-1] // num_expert_group).reshape(num_token, -1))
948
+ topk_weights = topk_weights.masked_fill(~topk_weight_mask.bool(), 0.0)
949
+
950
+ return topk_weights
951
+
952
+
953
+ def select_experts(
954
+ hidden_states: torch.Tensor,
955
+ router_logits: torch.Tensor,
956
+ top_k: int,
957
+ use_grouped_topk: bool,
958
+ renormalize: bool,
959
+ topk_group: Optional[int] = None,
960
+ num_expert_group: Optional[int] = None,
961
+ custom_routing_function: Optional[Callable] = None,
962
+ scoring_func: str = "softmax",
963
+ e_score_correction_bias: Optional[torch.Tensor] = None,
964
+ global_num_experts: Optional[torch.Tensor] = None
965
+ ) -> tuple[torch.Tensor, torch.Tensor]:
966
+ """
967
+ Select top-k experts based on router logits.
968
+
969
+ Args:
970
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
971
+ router_logits: Router logits of shape (num_tokens, num_experts).
972
+ top_k: Number of experts to select.
973
+ use_grouped_topk: Whether to group experts before selecting top-k.
974
+ renormalize: Whether to renormalize the routing weights.
975
+ topk_group: Number of expert groups to select from.
976
+ num_expert_group: Number of experts in each group.
977
+ custom_routing_function: Custom routing function.
978
+ scoring_func: Scoring function to use.
979
+ e_score_correction_bias: Correction bias to apply to expert scores.
980
+
981
+ Returns:
982
+ topk_weights: Routing weights of shape (num_tokens, top_k).
983
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
984
+
985
+ Raises:
986
+ ValueError: If an unsupported scoring function is provided.
987
+ """
988
+
989
+ if scoring_func == "softmax":
990
+ # NOTE: vLLM use dtype=torch.float here
991
+ topk_weights = router_logits.softmax(dim=-1)
992
+ elif scoring_func == "sigmoid":
993
+ topk_weights = router_logits.sigmoid()
994
+ else:
995
+ raise ValueError(f"Unsupported scoring function: {scoring_func}")
996
+
997
+ if use_grouped_topk:
998
+ assert topk_group is not None
999
+ assert num_expert_group is not None
1000
+
1001
+ if e_score_correction_bias is not None:
1002
+ # Store original scores before applying correction bias. We use biased
1003
+ # scores for expert selection but original scores for routing weights
1004
+ original_weights = topk_weights
1005
+ topk_weights = topk_weights + e_score_correction_bias.unsqueeze(0)
1006
+
1007
+ # TODO: Change to npu_group_topk when the latest CANN and NNAL is available
1008
+ # >>> torch_npu._npu_group_topk(topk_weights, group_num=num_expert_group, k=topk_group)
1009
+ topk_weights = native_grouped_topk(topk_weights, num_expert_group,
1010
+ topk_group)
1011
+ # TODO bfloat16 is not supported in torch.topk with ge graph.
1012
+ if e_score_correction_bias is not None:
1013
+ topk_ids = torch.topk(topk_weights.to(torch.float32),
1014
+ k=top_k,
1015
+ dim=-1,
1016
+ sorted=False)[1]
1017
+ # Use original unbiased scores for the routing weights
1018
+ topk_weights = original_weights.gather(1, topk_ids)
1019
+ else:
1020
+ topk_weights, topk_ids = torch.topk(topk_weights.to(torch.float32),
1021
+ k=top_k,
1022
+ dim=-1,
1023
+ sorted=False)
1024
+ elif custom_routing_function is None:
1025
+ topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1)
1026
+ else:
1027
+ topk_weights, topk_ids = custom_routing_function(
1028
+ hidden_states=hidden_states,
1029
+ gating_output=router_logits,
1030
+ topk=top_k,
1031
+ renormalize=renormalize,
1032
+ global_num_experts=global_num_experts)
1033
+ # Required by npu_moe_init_routing
1034
+ topk_ids = topk_ids.to(torch.int32)
1035
+ return topk_weights, topk_ids
1036
+
1037
+ # Required by npu_moe_init_routing
1038
+ topk_ids = topk_ids.to(torch.int32)
1039
+
1040
+ if renormalize:
1041
+ topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
1042
+
1043
+ return topk_weights, topk_ids
1044
+
1045
+
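
As a sanity check of what the non-grouped path of select_experts computes, the softmax scoring, top-k selection, and renormalization can be reproduced with stock PyTorch on CPU. This is a hedged, illustrative sketch and not part of the committed file.

import torch

# 4 tokens routed over 8 experts, top-2 per token, softmax scoring, renormalized.
router_logits = torch.randn(4, 8)
scores = router_logits.softmax(dim=-1)
topk_weights, topk_ids = scores.topk(2, dim=-1)
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
topk_ids = topk_ids.to(torch.int32)          # int32 ids are what npu_moe_init_routing expects
print(topk_weights.shape, topk_ids.shape)    # torch.Size([4, 2]) torch.Size([4, 2])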
1046
+ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
1047
+
1048
+ def __init__(self, moe: FusedMoEConfig = None):
1049
+
1050
+ super().__init__(moe=moe)
1051
+ vllm_config = get_current_vllm_config()
1052
+
1053
+ self.ep_group = get_ep_group()
1054
+ self.ep_size = self.ep_group.world_size
1055
+ self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
1056
+ self.local_batch_size = self.global_batch_size // self.ep_size
1057
+ self.max_model_len = vllm_config.model_config.max_model_len
1058
+
1059
+ ascend_config = get_ascend_config()
1060
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
1061
+
1062
+ try:
1063
+ device_group = self.ep_group.device_group
1064
+ # TODO: Try local_rank = ep_group.rank_in_group
1065
+ local_rank = torch.distributed.get_rank(group=device_group)
1066
+ backend = device_group._get_backend(torch.device("npu"))
1067
+ self.moe_all_to_all_group_name = backend.get_hccl_comm_name(
1068
+ local_rank)
1069
+ except AttributeError:
1070
+ self.moe_all_to_all_group_name = None
1071
+
1072
+ def process_weights_after_loading(self, layer):
1073
+ super(UnquantizedFusedMoEMethod,
1074
+ self).process_weights_after_loading(layer)
1075
+ layer.w13_weight = torch.nn.Parameter(self._maybe_pad_weight(
1076
+ layer.w13_weight.data),
1077
+ requires_grad=False)
1078
+ layer.w2_weight = torch.nn.Parameter(self._maybe_pad_weight(
1079
+ layer.w2_weight.data),
1080
+ requires_grad=False)
1081
+
1082
+ def apply(
1083
+ self,
1084
+ layer: torch.nn.Module,
1085
+ x: torch.Tensor,
1086
+ router_logits: torch.Tensor,
1087
+ top_k: int,
1088
+ renormalize: bool,
1089
+ use_grouped_topk: bool = False,
1090
+ global_num_experts: int = -1,
1091
+ expert_map: Optional[torch.Tensor] = None,
1092
+ topk_group: Optional[int] = None,
1093
+ num_expert_group: Optional[int] = None,
1094
+ custom_routing_function: Optional[Callable] = None,
1095
+ scoring_func: str = "softmax",
1096
+ e_score_correction_bias: Optional[torch.Tensor] = None,
1097
+ is_prefill: bool = False,
1098
+ enable_force_load_balance: bool = False,
1099
+ shared_experts: Optional[Any] = None,
1100
+ **kwargs,
1101
+ ) -> torch.Tensor:
1102
+ use_grouped_topk = (topk_group > 1 or num_expert_group > 1)
1103
+ is_deepseek_v3_r1 = global_num_experts == 256
1104
+ # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
1105
+ if use_grouped_topk and is_deepseek_v3_r1:
1106
+ topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
1107
+ router_logits,
1108
+ k=top_k, # top_k is currently fixed to 8
1109
+ bias=e_score_correction_bias,
1110
+ k_group=topk_group, # fixed: 4
1111
+ group_count=num_expert_group, # fixed: 8
1112
+ group_select_mode=1, # 0: max within each group; 1: sum of the group's top-2 (fixed)
1113
+ renorm=0, # 0: softmax->topk (fixed); 1: topk->softmax
1114
+ norm_type=1, # 0: softmax; 1: sigmoid (fixed)
1115
+ # out_flag=False, # todo new api; whether to emit the third output
1116
+ # y2_flag=False, # old api; whether to emit the third output
1117
+ routed_scaling_factor=1,
1118
+ eps=float(1e-20))
1119
+ elif use_grouped_topk and SELECT_GATING_TOPK_SOTFMAX_EXPERTS:
1120
+ topk_weights, topk_ids = select_gating_top_k_softmax_experts(
1121
+ hidden_states=x,
1122
+ router_logits=router_logits,
1123
+ top_k=top_k,
1124
+ renormalize=renormalize)
1125
+ else:
1126
+ topk_weights, topk_ids = select_experts(
1127
+ hidden_states=x,
1128
+ router_logits=router_logits,
1129
+ top_k=top_k,
1130
+ use_grouped_topk=use_grouped_topk,
1131
+ renormalize=renormalize,
1132
+ topk_group=topk_group,
1133
+ num_expert_group=num_expert_group,
1134
+ custom_routing_function=custom_routing_function,
1135
+ scoring_func=scoring_func,
1136
+ e_score_correction_bias=e_score_correction_bias,
1137
+ )
1138
+
1139
+ topk_weights = topk_weights.to(x.dtype)
1140
+ # This is a naive implementation of expert load balancing, used to
1141
+ # avoid accumulating too many tokens on a single rank.
1142
+ # Currently it is only activated during profile runs.
1143
+ if enable_force_load_balance:
1144
+ topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
1145
+
1146
+ fused_moe_state = get_fused_moe_state(self.ep_group.world_size,
1147
+ is_prefill, is_deepseek_v3_r1)
1148
+ if fused_moe_state == FusedMoEState.MC2:
1149
+ return fused_experts_with_mc2(
1150
+ hidden_states=x,
1151
+ w1=layer.w13_weight,
1152
+ w2=layer.w2_weight,
1153
+ topk_weights=topk_weights,
1154
+ topk_ids=topk_ids,
1155
+ top_k=top_k,
1156
+ expert_map=expert_map,
1157
+ moe_all_to_all_group_name=self.moe_all_to_all_group_name,
1158
+ shared_experts=shared_experts)
1159
+ elif fused_moe_state == FusedMoEState.AllGatherEP:
1160
+ return fused_experts_allgather_ep(
1161
+ hidden_states=x,
1162
+ w1=layer.w13_weight,
1163
+ w2=layer.w2_weight,
1164
+ topk_weights=topk_weights,
1165
+ topk_ids=topk_ids,
1166
+ is_prefill=is_prefill)
1167
+ elif fused_moe_state in [
1168
+ FusedMoEState.AllGather, FusedMoEState.NaiveMulticast
1169
+ ]:
1170
+ return fused_experts(hidden_states=x,
1171
+ w1=layer.w13_weight,
1172
+ w2=layer.w2_weight,
1173
+ topk_weights=topk_weights,
1174
+ topk_ids=topk_ids,
1175
+ top_k=top_k,
1176
+ expert_map=expert_map)
1177
+ elif MOE_ALL2ALL_BUFFER:
1178
+ return fused_experts_with_all2all_buffer(
1179
+ hidden_states=x,
1180
+ w1=layer.w13_weight,
1181
+ w2=layer.w2_weight,
1182
+ topk_weights=topk_weights,
1183
+ topk_ids=topk_ids,
1184
+ top_k=top_k,
1185
+ max_model_len=self.max_model_len,
1186
+ global_batch_size=self.global_batch_size,
1187
+ expert_map=expert_map,
1188
+ ep_group=get_ep_group())
1189
+ else:
1190
+ return fused_experts_with_all2all(hidden_states=x,
1191
+ w1=layer.w13_weight,
1192
+ w2=layer.w2_weight,
1193
+ topk_weights=topk_weights,
1194
+ topk_ids=topk_ids,
1195
+ top_k=top_k,
1196
+ expert_map=expert_map,
1197
+ ep_group=get_ep_group())
1198
+
1199
+
1200
+ class AscendFusedMoE(FusedMoE):
1201
+
1202
+ # The moe_counter parameter is required during the initialization of EPLB
1203
+ # to identify the current layer index within the MOE model.
1204
+ moe_counter = -1
1205
+
1206
+ def __init__(
1207
+ self,
1208
+ num_experts: int, # Global number of experts
1209
+ top_k: int,
1210
+ hidden_size: int,
1211
+ intermediate_size: int,
1212
+ params_dtype: Optional[torch.dtype] = None,
1213
+ reduce_results: bool = False,
1214
+ renormalize: bool = True,
1215
+ use_grouped_topk: bool = False,
1216
+ num_expert_group: Optional[int] = None,
1217
+ topk_group: Optional[int] = None,
1218
+ quant_config: Optional[QuantizationConfig] = None,
1219
+ tp_size: Optional[int] = None,
1220
+ ep_size: Optional[int] = None,
1221
+ dp_size: Optional[int] = None,
1222
+ prefix: str = "",
1223
+ custom_routing_function: Optional[Callable] = None,
1224
+ scoring_func: str = "softmax",
1225
+ e_score_correction_bias: Optional[torch.Tensor] = None,
1226
+ activation: str = "silu",
1227
+ apply_router_weight_on_input: bool = False,
1228
+ ):
1229
+ # TODO: This does not initialize the FusedMoE base class;
1230
+ # fix it and make __init__() of AscendFusedMoE clearer.
1231
+ super(FusedMoE, self).__init__()
1232
+
1233
+ AscendFusedMoE.moe_counter += 1
1234
+ self.moe_instance_id = AscendFusedMoE.moe_counter
1235
+
1236
+ if params_dtype is None:
1237
+ params_dtype = torch.get_default_dtype()
1238
+
1239
+ vllm_config = get_current_vllm_config()
1240
+
1241
+ self.moe_parallel_config = FusedMoEParallelConfig.make(
1242
+ tp_size_=(tp_size if tp_size is not None else
1243
+ get_tensor_model_parallel_world_size()),
1244
+ dp_size_=(dp_size
1245
+ if dp_size is not None else get_dp_group().world_size),
1246
+ vllm_parallel_config=vllm_config.parallel_config)
1247
+
1248
+ self.top_k = top_k
1249
+ self.num_experts = num_experts
1250
+ self.global_num_experts = num_experts
1251
+ assert intermediate_size % self.tp_size == 0
1252
+ self.intermediate_size_per_partition = intermediate_size // self.tp_size
1253
+ self.reduce_results = reduce_results
1254
+ self.renormalize = renormalize
1255
+ self.use_grouped_topk = use_grouped_topk
1256
+ if self.use_grouped_topk:
1257
+ assert num_expert_group is not None and topk_group is not None
1258
+ self.num_expert_group = num_expert_group
1259
+ self.topk_group = topk_group
1260
+ self.custom_routing_function = custom_routing_function
1261
+ self.scoring_func = scoring_func
1262
+ self.e_score_correction_bias = e_score_correction_bias
1263
+ self.expert_map = None
1264
+ self.activation = activation
1265
+ self.log2phy = None
1266
+ self.global_redundant_expert_num = 0
1267
+
1268
+ is_deepseek_v3_r1 = self.global_num_experts == 256
1269
+ self.rm_router_logits = get_rm_router_logits_state(
1270
+ self.moe_parallel_config.ep_size, self.dp_size, is_deepseek_v3_r1)
1271
+ self.all_reduce_merge = get_all_reduce_merge_state(
1272
+ self.moe_parallel_config.ep_size, is_deepseek_v3_r1)
1273
+
1274
+ ascend_config = get_ascend_config()
1275
+ expert_map_path = ascend_config.expert_map_path
1276
+ if expert_map_path and os.path.exists(expert_map_path):
1277
+ # moe expert load balance
1278
+ expert_load_balancer = ExpertLoadBalancer(expert_map_path,
1279
+ self.global_num_experts)
1280
+ self.local_num_experts, self.expert_map = \
1281
+ expert_load_balancer.get_rank_placement_map(
1282
+ self.moe_instance_id,
1283
+ get_ep_group().rank_in_group)
1284
+ self.log2phy = expert_load_balancer.get_rank_log2phy_map(
1285
+ self.moe_instance_id,
1286
+ get_ep_group().rank_in_group)
1287
+ self.global_redundant_expert_num = \
1288
+ expert_load_balancer.get_global_redundant_expert_num()
1289
+ else:
1290
+ # Create a tensor of size num_experts filled with -1
1291
+ self.local_num_experts, self.expert_map = determine_expert_map(
1292
+ self.ep_size,
1293
+ get_ep_group().rank_in_group, self.global_num_experts)
1294
+
1295
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
1296
+ self.enable_multistream_moe = \
1297
+ ascend_config.torchair_graph_config.enable_multistream_moe
1298
+
1299
+ if self.scoring_func != "softmax" and not self.use_grouped_topk:
1300
+ raise ValueError("Only softmax scoring function is supported for "
1301
+ "non-grouped topk.")
1302
+ moe = FusedMoEConfig.make(
1303
+ num_experts=self.global_num_experts,
1304
+ experts_per_token=top_k,
1305
+ hidden_dim=hidden_size,
1306
+ num_local_experts=self.local_num_experts,
1307
+ moe_parallel_config=self.moe_parallel_config,
1308
+ # TODO (bnell): this needs to be fixed for quantized types.
1309
+ in_dtype=params_dtype,
1310
+ quant_config=quant_config)
1311
+
1312
+ if quant_config is None:
1313
+ self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
1314
+ else:
1315
+ self.quant_method = quant_config.get_quant_method(self, prefix)
1316
+
1317
+ assert self.quant_method is not None
1318
+
1319
+ local_num_experts = torch.sum(self.expert_map != -1) \
1320
+ if self.expert_map is not None else num_experts
1321
+
1322
+ moe_quant_params = {
1323
+ "num_experts": local_num_experts,
1324
+ "hidden_size": hidden_size,
1325
+ "intermediate_size_per_partition":
1326
+ self.intermediate_size_per_partition,
1327
+ "params_dtype": params_dtype,
1328
+ "weight_loader": self.weight_loader,
1329
+ }
1330
+ # need full intermediate size pre-sharding for WNA16 act order
1331
+ if (self.quant_method.__class__.__name__
1332
+ in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")):
1333
+ moe_quant_params["intermediate_size_full"] = intermediate_size
1334
+
1335
+ self.ep_group = get_ep_group()
1336
+ # NOTE: self.tp_group is not expert_tp_group
1337
+ self.tp_group = get_tp_group().device_group
1338
+ self.quant_method.create_weights(layer=self, **moe_quant_params)
1339
+
1340
+ def naive_multicast(self, x: torch.Tensor,
1341
+ cu_tokens_across_dp_cpu: torch.Tensor):
1342
+ assert (len(x.shape) == 2)
1343
+ buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
1344
+ device=x.device,
1345
+ dtype=x.dtype)
1346
+ start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
1347
+ self.dp_rank - 1]
1348
+ end = cu_tokens_across_dp_cpu[self.dp_rank]
1349
+ buffer[start:end, :].copy_(x)
1350
+ for idx in range(self.dp_size):
1351
+ start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
1352
+ end = cu_tokens_across_dp_cpu[idx]
1353
+ get_dp_group().broadcast(buffer[start:end, :], idx)
1354
+ return buffer
1355
+
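
The start/end offsets used by naive_multicast come directly from the cumulative token counts across data-parallel ranks. A small standalone illustration follows; the token counts are assumed values, not taken from the diff.

import torch

# Hypothetical cumulative token counts for 3 DP ranks holding 4, 6 and 2 tokens.
cu_tokens_across_dp_cpu = torch.tensor([4, 10, 12])
for dp_rank in range(3):
    start = 0 if dp_rank == 0 else int(cu_tokens_across_dp_cpu[dp_rank - 1])
    end = int(cu_tokens_across_dp_cpu[dp_rank])
    print(dp_rank, (start, end))   # (0, 4), (4, 10), (10, 12)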
1356
+ def forward(self,
1357
+ hidden_states: torch.Tensor,
1358
+ router_logits: torch.Tensor,
1359
+ is_prefill: bool,
1360
+ enable_force_load_balance: bool = False,
1361
+ top_k: Optional[int] = None,
1362
+ shared_experts: Optional[Any] = None,
1363
+ gate=None,
1364
+ replace_allreduce: bool = False):
1365
+
1366
+ assert self.quant_method is not None
1367
+
1368
+ if top_k:
1369
+ real_top_k = top_k
1370
+ else:
1371
+ real_top_k = self.top_k
1372
+
1373
+ num_tokens, hidden_size = hidden_states.shape
1374
+ is_deepseek_v3_r1 = self.global_num_experts == 256
1375
+
1376
+ fused_moe_state = get_fused_moe_state(self.moe_parallel_config.ep_size,
1377
+ is_prefill, is_deepseek_v3_r1)
1378
+ if shared_experts:
1379
+ if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
1380
+ # When all_reduce_merge is enabled, shared_experts skips the all_reduce inside its MLP and defers it until both shared_experts and router_experts have finished.
1381
+ shared_hidden_states = shared_experts(hidden_states)
1382
+
1383
+ tp_size = get_tensor_model_parallel_world_size()
1384
+ if (tp_size > 1 and fused_moe_state not in [
1385
+ FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
1386
+ FusedMoEState.NaiveMulticast
1387
+ ] and not replace_allreduce):
1388
+ if num_tokens < tp_size:
1389
+ hidden_states = nn.functional.pad(
1390
+ hidden_states, (0, 0, 0, tp_size - num_tokens))
1391
+ router_logits = nn.functional.pad(
1392
+ router_logits, (0, 0, 0, tp_size - num_tokens))
1393
+ chunk_hidden_states = torch.tensor_split(hidden_states,
1394
+ tp_size,
1395
+ dim=0)
1396
+ chunk_router_logits = torch.tensor_split(router_logits,
1397
+ tp_size,
1398
+ dim=0)
1399
+ tp_rank = get_tensor_model_parallel_rank()
1400
+ hidden_states = chunk_hidden_states[tp_rank]
1401
+ router_logits = chunk_router_logits[tp_rank]
1402
+
1403
+ if self.dp_size > 1:
1404
+ if fused_moe_state in (FusedMoEState.AllGather, FusedMoEState.AllGatherEP):
1405
+ # NOTE: In torchair graph mode, padding has already been applied in model_runner_v1
1406
+ if not self.torchair_graph_enabled or is_prefill:
1407
+ attn_metadata = get_forward_context().attn_metadata
1408
+ if attn_metadata is not None:
1409
+ max_num_tokens_across_dp = attn_metadata.max_num_tokens_across_dp
1410
+ if num_tokens < max_num_tokens_across_dp:
1411
+ hidden_states = nn.functional.pad(
1412
+ hidden_states,
1413
+ (0, 0, 0,
1414
+ max_num_tokens_across_dp - num_tokens))
1415
+ if not self.rm_router_logits:
1416
+ router_logits = nn.functional.pad(
1417
+ router_logits,
1418
+ (0, 0, 0,
1419
+ max_num_tokens_across_dp - num_tokens))
1420
+ hidden_states = get_dp_group().all_gather(hidden_states, 0)
1421
+ if self.rm_router_logits:
1422
+ router_logits, _ = gate(hidden_states.float())
1423
+ else:
1424
+ router_logits = get_dp_group().all_gather(router_logits, 0)
1425
+
1426
+ elif fused_moe_state == FusedMoEState.NaiveMulticast:
1427
+ cu_tokens_across_dp_cpu = get_forward_context(
1428
+ ).dp_metadata.cu_tokens_across_dp_cpu
1429
+ hidden_states = self.naive_multicast(hidden_states,
1430
+ cu_tokens_across_dp_cpu)
1431
+ if self.rm_router_logits:
1432
+ router_logits, _ = gate(hidden_states.float())
1433
+ else:
1434
+ router_logits = self.naive_multicast(
1435
+ router_logits, cu_tokens_across_dp_cpu)
1436
+
1437
+ # Matrix multiply.
1438
+ e_hidden_states = self.quant_method.apply(
1439
+ layer=self,
1440
+ x=hidden_states,
1441
+ router_logits=router_logits,
1442
+ top_k=real_top_k,
1443
+ renormalize=self.renormalize,
1444
+ use_grouped_topk=self.use_grouped_topk,
1445
+ global_num_experts=self.global_num_experts,
1446
+ expert_map=self.expert_map,
1447
+ topk_group=self.topk_group,
1448
+ num_expert_group=self.num_expert_group,
1449
+ custom_routing_function=self.custom_routing_function,
1450
+ scoring_func=self.scoring_func,
1451
+ e_score_correction_bias=self.e_score_correction_bias,
1452
+ is_prefill=is_prefill,
1453
+ enable_force_load_balance=enable_force_load_balance,
1454
+ log2phy=self.log2phy,
1455
+ global_redundant_expert_num=self.global_redundant_expert_num,
1456
+ shared_experts=shared_experts if self.torchair_graph_enabled
1457
+ and self.enable_multistream_moe and not is_prefill else None,
1458
+ )
1459
+
1460
+ if shared_experts:
1461
+ if isinstance(e_hidden_states, tuple):
1462
+ e_hidden_states, shared_hidden_states = e_hidden_states
1463
+
1464
+ if (tp_size > 1 and fused_moe_state not in [
1465
+ FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
1466
+ FusedMoEState.NaiveMulticast
1467
+ ] and not replace_allreduce):
1468
+ dist.all_gather(list(chunk_hidden_states), e_hidden_states,
1469
+ self.tp_group)
1470
+ final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
1471
+ if num_tokens < tp_size:
1472
+ final_hidden_states = final_hidden_states[:num_tokens]
1473
+ dispose_tensor(e_hidden_states)
1474
+ elif self.dp_size > 1:
1475
+ if fused_moe_state == FusedMoEState.NaiveMulticast:
1476
+ start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
1477
+ self.dp_rank - 1]
1478
+ end = cu_tokens_across_dp_cpu[self.dp_rank]
1479
+ final_hidden_states = get_dp_group().all_reduce(
1480
+ e_hidden_states)
1481
+ final_hidden_states = final_hidden_states[start:end, :]
1482
+ dispose_tensor(e_hidden_states)
1483
+ elif fused_moe_state in (FusedMoEState.AllGather, FusedMoEState.AllGatherEP):
1484
+ final_hidden_states = data_parallel_reduce_scatter(
1485
+ e_hidden_states, dim=0)
1486
+ final_hidden_states = final_hidden_states[:num_tokens]
1487
+ dispose_tensor(e_hidden_states)
1488
+ else:
1489
+ final_hidden_states = e_hidden_states
1490
+
1491
+ if tp_size > 1 and not self.all_reduce_merge and fused_moe_state in [
1492
+ FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
1493
+ FusedMoEState.NaiveMulticast
1494
+ ]:
1495
+ final_hidden_states = tensor_model_parallel_all_reduce(
1496
+ final_hidden_states)
1497
+
1498
+ if shared_experts:
1499
+ return final_hidden_states, shared_hidden_states
1500
+ else:
1501
+ return final_hidden_states
1502
+
1503
+ # ----------------------------------------- TBO-related --------------------------------------------
1504
+
1505
+ def _forward_ms_fused_moe_comp(
1506
+ self,
1507
+ hidden_states: torch.Tensor,
1508
+ router_logits: torch.Tensor,
1509
+ is_prefill: bool,
1510
+ real_top_k,
1511
+ enable_force_load_balance: bool = False,
1512
+ ):
1513
+ hidden_states = self.quant_method.apply(
1514
+ layer=self,
1515
+ x=hidden_states,
1516
+ router_logits=router_logits,
1517
+ top_k=real_top_k,
1518
+ renormalize=self.renormalize,
1519
+ use_grouped_topk=self.use_grouped_topk,
1520
+ global_num_experts=self.global_num_experts,
1521
+ expert_map=self.expert_map,
1522
+ topk_group=self.topk_group,
1523
+ num_expert_group=self.num_expert_group,
1524
+ custom_routing_function=self.custom_routing_function,
1525
+ scoring_func=self.scoring_func,
1526
+ e_score_correction_bias=self.e_score_correction_bias,
1527
+ is_prefill=is_prefill,
1528
+ enable_force_load_balance=enable_force_load_balance)
1529
+
1530
+ return hidden_states
inference/vllm_ascend/patch/worker/patch_common/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ # patch_utils should be the first import, because it will be used by other
19
+ # patch files.
20
+ import vllm_ascend.patch.worker.patch_common.patch_utils # noqa isort:skip
21
+ import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
22
+ import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa
23
+ import vllm_ascend.patch.worker.patch_common.patch_multi_step_worker # noqa
24
+ import vllm_ascend.patch.worker.patch_common.patch_sampler # noqa
25
+ import vllm_ascend.patch.worker.patch_common.patch_spec_decode_worker # noqa
26
+ import vllm_ascend.patch.worker.patch_common.patch_config # noqa
27
+ import vllm_ascend.patch.worker.patch_common.patch_parsers # noqa
inference/vllm_ascend/patch/worker/patch_common/patch_config.py ADDED
@@ -0,0 +1,97 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+ from vllm.config import ModelConfig
18
+
19
+
20
+ def get_attr_by_names(src_config, attrs, default_value):
21
+ for attr in attrs:
22
+ value = getattr(src_config, attr, 0)
23
+ if value > 0:
24
+ return value
25
+ return default_value
26
+
27
+
28
+ def _verify_with_expert_parallelism(self) -> None:
29
+ num_expert_names = [
30
+ "moe_num_experts", # Dbrx
31
+ "num_experts", # Jamba
32
+ "n_routed_experts", # DeepSeek
33
+ "num_local_experts", # Mixtral
34
+ "num_routed_experts", # Pangu
35
+ ]
36
+ num_experts = 0
37
+ for name in num_expert_names:
38
+ num_experts = getattr(self.hf_text_config, name, 0)
39
+ if num_experts > 0:
40
+ break
41
+ if num_experts < 1:
42
+ raise ValueError(
43
+ "Number of experts in the model must be greater than 0 "
44
+ "when expert parallelism is enabled.")
45
+
46
+
47
+ @property
48
+ def is_deepseek_mla(self) -> bool:
49
+ kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
50
+ kv_lora_dim = get_attr_by_names(self.hf_text_config, kv_lora_dim_names, None)
51
+ if not hasattr(self.hf_text_config, "model_type"):
52
+ return False
53
+ elif self.hf_text_config.model_type in \
54
+ ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'pangu_ultra_moe'):
55
+ return kv_lora_dim is not None
56
+ elif self.hf_text_config.model_type == 'eagle':
57
+ # if the model is an EAGLE module, check for the
58
+ # underlying architecture
59
+ return self.hf_text_config.model.model_type in \
60
+ ('deepseek_v2', 'deepseek_v3', 'pangu_ultra_moe') \
61
+ and kv_lora_dim is not None
62
+ return False
63
+
64
+
65
+ def get_head_size(self) -> int:
66
+ if self.is_deepseek_mla:
67
+ qk_rope_dim_names = ['attention_qk_rope_dim', 'qk_rope_head_dim']
68
+ kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
69
+ qk_rope_dim = get_attr_by_names(self.hf_text_config, qk_rope_dim_names, 0)
70
+ kv_lora_dim = get_attr_by_names(self.hf_text_config, kv_lora_dim_names, 0)
71
+ if self.use_mla:
72
+ return kv_lora_dim + qk_rope_dim
73
+ else:
74
+ qk_dim_names = ['attention_qk_dim', 'qk_nope_head_dim']
75
+ qk_dim = get_attr_by_names(self.hf_text_config, qk_dim_names, 0)
76
+ if qk_rope_dim and qk_dim:
77
+ return qk_rope_dim + qk_dim
78
+ if hasattr(self.hf_text_config,
79
+ "model_type") and (self.hf_text_config.model_type
80
+ == "zamba2"):
81
+ return self.hf_text_config.attention_head_dim
82
+
83
+ if self.is_attention_free:
84
+ return 0
85
+
86
+ # NOTE: Some configs may set head_dim=None in the config
87
+ if getattr(self.hf_text_config, "head_dim", None) is not None:
88
+ return self.hf_text_config.head_dim
89
+
90
+ # FIXME(woosuk): This may not be true for all models.
91
+ return (self.hf_text_config.hidden_size //
92
+ self.hf_text_config.num_attention_heads)
93
+
94
+
95
+ ModelConfig._verify_with_expert_parallelism = _verify_with_expert_parallelism
96
+ ModelConfig.is_deepseek_mla = is_deepseek_mla
97
+ ModelConfig.get_head_size = get_head_size
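
The patch above resolves config attributes under either the DeepSeek-style or the Pangu-style names and, for MLA models, reports the KV-cache head size as kv_lora_dim + qk_rope_dim. A self-contained sketch with a dummy config is shown below; the attribute values are illustrative assumptions, not taken from the released model.

from types import SimpleNamespace

def get_attr_by_names(src_config, attrs, default_value):
    # Same lookup as the patch: the first attribute with a positive value wins.
    for attr in attrs:
        value = getattr(src_config, attr, 0)
        if value > 0:
            return value
    return default_value

cfg = SimpleNamespace(attention_kv_lora_dim=512, attention_qk_rope_dim=64)
kv_lora = get_attr_by_names(cfg, ['attention_kv_lora_dim', 'kv_lora_rank'], 0)
qk_rope = get_attr_by_names(cfg, ['attention_qk_rope_dim', 'qk_rope_head_dim'], 0)
print(kv_lora + qk_rope)   # 576 -> head size used for the MLA KV cache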
inference/vllm_ascend/patch/worker/patch_common/patch_parsers.py ADDED
@@ -0,0 +1,26 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+
19
+ from vllm.entrypoints.openai import tool_parsers
20
+ from vllm_ascend.entrypoints.openai.tool_parsers import PanguToolParser
21
+ tool_parsers.__all__.append("PanguToolParser")
22
+
23
+
24
+ from vllm import reasoning
25
+ from vllm_ascend.entrypoints.openai.reasoning_parsers import PanguReasoningParser
26
+ reasoning.__all__.append("PanguReasoningParser")
inference/vllm_ascend/patch/worker/patch_common/patch_sampler.py ADDED
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # This file is a part of the vllm-ascend project.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ #
18
+
19
+ from typing import Optional
20
+
21
+ import torch
22
+ import torch_npu
23
+ from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
24
+ from vllm.v1.sample.sampler import Sampler, _SAMPLING_EPS
25
+ from vllm.v1.sample.metadata import SamplingMetadata
26
+ from vllm_ascend import envs
27
+
28
+
29
+ def apply_top_k_top_p(
30
+ logits: torch.Tensor,
31
+ k: torch.Tensor,
32
+ p: torch.Tensor,
33
+ ) -> torch.Tensor:
34
+ if p is not None and k is not None:
35
+ # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p)
36
+ return torch_npu.npu_top_k_top_p(logits, p, k)
37
+
38
+ probs = logits.softmax(dim=-1)
39
+ probs_sort, _ = probs.sort(dim=-1, descending=False)
40
+
41
+ if k is not None:
42
+ top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, )
43
+ top_k_count = top_k_count.unsqueeze(dim=1)
44
+ top_k_cutoff = probs_sort.gather(-1, top_k_count)
45
+
46
+ # Make sure the no top-k rows are no-op.
47
+ no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)
48
+ top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))
49
+
50
+ elements_to_discard = probs < top_k_cutoff
51
+ logits.masked_fill_(elements_to_discard, -float("inf"))
52
+
53
+ if p is not None:
54
+ cumprob = torch.cumsum(probs_sort, dim=-1)
55
+ top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)
56
+ top_p_mask[:, -1] = False # at least one
57
+
58
+ top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)
59
+ top_p_cutoff = probs_sort.gather(-1, top_p_count)
60
+ elements_to_discard = probs < top_p_cutoff
61
+ logits.masked_fill_(elements_to_discard, -float("inf"))
62
+
63
+ return logits
64
+
65
+
66
+ def topk_topp_forward_native(
67
+ self,
68
+ logits: torch.Tensor,
69
+ generators: dict[int, torch.Generator],
70
+ k: Optional[torch.Tensor],
71
+ p: Optional[torch.Tensor],
72
+ ) -> torch.Tensor:
73
+ """
74
+ PyTorch-native implementation of top-k and top-p sampling.
75
+
76
+ The logits tensor may be updated in-place.
77
+ """
78
+ logits = apply_top_k_top_p(logits, k, p)
79
+ probs = logits.softmax(dim=-1, dtype=torch.float32)
80
+ return random_sample(probs, generators)
81
+
82
+
83
+ def apply_top_n_sigma(
84
+ logits: torch.Tensor,
85
+ sampling_metadata: SamplingMetadata,
86
+ ):
87
+ if sampling_metadata.no_top_n_sigma:
88
+ return logits
89
+
90
+ top_n_sigma = sampling_metadata.top_n_sigma[:, None]
91
+ top_n_sigma_mask = (top_n_sigma != -1)
92
+ filter_value = -3.4028e+38
93
+ max_vals, _ = logits.max(dim=-1, keepdim=True)
94
+ std_vals = logits.std(dim=-1, keepdim=True)
95
+ threshold = max_vals - top_n_sigma * std_vals
96
+ threshold[~top_n_sigma_mask] = filter_value
97
+ mask = (logits < threshold)
98
+ logits = torch.where(mask, filter_value, logits)
99
+ return logits
100
+
101
+
102
+ def sample(
103
+ self,
104
+ logits: torch.Tensor,
105
+ sampling_metadata: SamplingMetadata,
106
+ ) -> torch.Tensor:
107
+ """Sample logits based on sampling metadata.
108
+
109
+ The various logits processing functions called in this method
110
+ may update the logits tensor in-place.
111
+ """
112
+
113
+ assert not (sampling_metadata.all_greedy
114
+ and sampling_metadata.all_random)
115
+ if sampling_metadata.all_random:
116
+ greedy_sampled = None
117
+ else:
118
+ greedy_sampled = self.greedy_sample(logits)
119
+ if sampling_metadata.all_greedy:
120
+ return greedy_sampled
121
+
122
+ assert sampling_metadata.temperature is not None
123
+
124
+ # Apply temperature.
125
+ logits = self.apply_temperature(logits, sampling_metadata.temperature)
126
+
127
+ # Apply logits processors that only apply to random sampling
128
+ # (argmax invariant)
129
+ for processor in sampling_metadata.logitsprocs.argmax_invariant:
130
+ logits = processor.apply(logits)
131
+
132
+ # Apply top_n_sigma
133
+ logits = apply_top_n_sigma(logits, sampling_metadata)
134
+
135
+ # Apply top_k and/or top_p.
136
+ random_sampled = self.topk_topp_sampler(
137
+ logits,
138
+ sampling_metadata.generators,
139
+ sampling_metadata.top_k,
140
+ sampling_metadata.top_p,
141
+ )
142
+
143
+ if greedy_sampled is None:
144
+ return random_sampled
145
+
146
+ sampled = torch.where(
147
+ sampling_metadata.temperature < _SAMPLING_EPS,
148
+ greedy_sampled,
149
+ random_sampled,
150
+ out=greedy_sampled, # Reuse tensor
151
+ )
152
+ return sampled
153
+
154
+
155
+ if envs.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:
156
+ TopKTopPSampler.forward_native = topk_topp_forward_native
157
+
158
+ if envs.VLLM_ASCEND_ENABLE_TOP_N_SIGMA:
159
+ Sampler.sample = sample
inference/vllm_ascend/quantization/w8a8.py ADDED
@@ -0,0 +1,757 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ from typing import Any, Callable, Dict, Optional
19
+
20
+ import torch
21
+ import torch_npu
22
+ from vllm.attention.backends.abstract import AttentionType
23
+
24
+ from vllm_ascend.attention.attention_v1 import AscendAttentionState
25
+ from vllm_ascend.distributed.parallel_state import get_ep_group
26
+ from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
27
+
28
+
29
+ def quant_per_tensor(in_tensor: torch.Tensor,
30
+ input_scale: torch.Tensor,
31
+ input_offset: torch.Tensor,
32
+ function=False):
33
+ return torch_npu.npu_quantize(in_tensor, input_scale, input_offset,
34
+ torch.qint8, -1, function)
35
+
36
+
37
+ class AscendW8A8LinearMethod:
38
+ """Linear method for Ascend W8A8.
39
+
40
+ Args:
41
+ w_sym: whether the linear weight is symmetrically quantized.
42
+ """
43
+
44
+ def __init__(self) -> None:
45
+ # aclnn quant matmul requires to transpose matrix B, set to true by default.
46
+ self.transpose_weight = not is_310p()
47
+
48
+ @staticmethod
49
+ def get_weight(
50
+ input_size: int,
51
+ output_size: int,
52
+ params_dtype: torch.dtype = torch.bfloat16,
53
+ ) -> Dict[str, Any]:
54
+ params_dict = {
55
+ "weight": torch.empty(output_size, input_size, dtype=torch.int8)
56
+ }
57
+ return params_dict
58
+
59
+ @staticmethod
60
+ def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
61
+ params_dict = {}
62
+ params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
63
+ params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
64
+ return params_dict
65
+
66
+ @staticmethod
67
+ def get_perchannel_param(
68
+ output_size: int,
69
+ params_dtype: torch.dtype,
70
+ ) -> Dict[str, Any]:
71
+ params_dict = {}
72
+ params_dict["quant_bias"] = torch.empty(output_size, dtype=torch.int32)
73
+ if params_dtype == torch.bfloat16:
74
+ params_dict["deq_scale"] = torch.empty(output_size,
75
+ dtype=torch.float32)
76
+ elif params_dtype == torch.float16:
77
+ params_dict["deq_scale"] = torch.empty(output_size,
78
+ dtype=torch.int64)
79
+ params_dict["weight_scale"] = torch.empty(output_size,
80
+ 1,
81
+ dtype=params_dtype)
82
+ params_dict["weight_offset"] = torch.empty(output_size,
83
+ 1,
84
+ dtype=params_dtype)
85
+ return params_dict
86
+
87
+ @staticmethod
88
+ def apply(
89
+ layer: torch.nn.Module,
90
+ x: torch.Tensor,
91
+ bias: Optional[torch.Tensor] = None,
92
+ tp_rank: Optional[int] = 0,
93
+ ) -> torch.Tensor:
94
+ original_dtype = x.dtype
95
+ if original_dtype != torch.int8:
96
+ x = quant_per_tensor(x, layer.aclnn_input_scale,
97
+ layer.aclnn_input_offset)
98
+ quant_bias = layer.quant_bias if tp_rank == 0 else None
99
+ if is_310p():
100
+ # On 300I Duo platform, we need transpose again if
101
+ # using nz. This transpose can be skipped in torchair.
102
+ output = torch_npu.npu_quant_matmul(
103
+ x,
104
+ layer.weight.data.transpose(1, 0),
105
+ layer.deq_scale,
106
+ bias=quant_bias,
107
+ output_dtype=original_dtype,
108
+ )
109
+ else:
110
+ output = torch_npu.npu_quant_matmul(
111
+ x,
112
+ layer.weight,
113
+ layer.deq_scale,
114
+ bias=quant_bias,
115
+ output_dtype=original_dtype,
116
+ )
117
+ return output
118
+
119
+ def process_weights_after_loading(self, layer):
120
+ expanding_factor = layer.weight.data.shape[1]
121
+ layer.aclnn_input_scale = 1 / torch.nn.Parameter(
122
+ layer.input_scale.data.repeat(expanding_factor),
123
+ requires_grad=False)
124
+ layer.aclnn_input_offset = torch.nn.Parameter(
125
+ layer.input_offset.data.repeat(expanding_factor),
126
+ requires_grad=False).to(layer.aclnn_input_scale.dtype)
127
+ if self.transpose_weight:
128
+ layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
129
+ layer.weight.data = torch_npu.npu_format_cast(layer.weight.data,
130
+ ACL_FORMAT_FRACTAL_NZ)
131
+ layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
132
+ layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
133
+
134
+
135
+ class AscendW8A8FusedMoEMethod:
136
+ """FusedMoe method for Ascend W8A8.
137
+ """
138
+
139
+ def __init__(self):
140
+ self.transpose_weight = True
141
+
142
+ @staticmethod
143
+ def get_weight(num_experts: int, intermediate_size_per_partition: int,
144
+ hidden_sizes: int,
145
+ params_dtype: torch.dtype) -> Dict[str, Any]:
146
+ param_dict = {}
147
+ param_dict["w13_weight"] = torch.empty(num_experts,
148
+ 2 *
149
+ intermediate_size_per_partition,
150
+ hidden_sizes,
151
+ dtype=torch.int8,
152
+ requires_grad=False)
153
+ param_dict["w2_weight"] = torch.empty(num_experts,
154
+ hidden_sizes,
155
+ intermediate_size_per_partition,
156
+ dtype=torch.int8,
157
+ requires_grad=False)
158
+ return param_dict
159
+
160
+ @staticmethod
161
+ def get_dynamic_quant_param(num_experts: int,
162
+ intermediate_size_per_partition: int,
163
+ hidden_sizes: int,
164
+ params_dtype: torch.dtype) -> Dict[str, Any]:
165
+ param_dict = {}
166
+ param_dict["w13_weight_scale"] = torch.empty(
167
+ num_experts,
168
+ 2 * intermediate_size_per_partition,
169
+ 1,
170
+ dtype=torch.float32)
171
+ param_dict["w13_weight_offset"] = torch.empty(
172
+ num_experts,
173
+ 2 * intermediate_size_per_partition,
174
+ 1,
175
+ dtype=torch.float16)
176
+ param_dict["w2_weight_scale"] = torch.empty(num_experts,
177
+ hidden_sizes,
178
+ 1,
179
+ dtype=torch.float32)
180
+ param_dict["w2_weight_offset"] = torch.empty(num_experts,
181
+ hidden_sizes,
182
+ 1,
183
+ dtype=torch.float16)
184
+ param_dict["w2_deq_scale"] = torch.empty(num_experts,
185
+ hidden_sizes,
186
+ dtype=torch.float32)
187
+ param_dict["w13_deq_scale"] = torch.empty(
188
+ num_experts,
189
+ 2 * intermediate_size_per_partition,
190
+ dtype=torch.float32)
191
+ param_dict["w2_input_scale"] = torch.empty(num_experts,
192
+ 1,
193
+ dtype=torch.float32)
194
+ param_dict["w13_input_scale"] = torch.empty(num_experts,
195
+ 1,
196
+ dtype=torch.float32)
197
+ param_dict["w2_input_offset"] = torch.empty(num_experts,
198
+ 1,
199
+ dtype=torch.int8)
200
+ param_dict["w13_input_offset"] = torch.empty(num_experts,
201
+ 1,
202
+ dtype=torch.int8)
203
+ param_dict["quant_bias"] = torch.empty(num_experts,
204
+ hidden_sizes,
205
+ dtype=torch.int32)
206
+
207
+ return param_dict
208
+
209
+ def apply(
210
+ self,
211
+ layer: torch.nn.Module,
212
+ x: torch.Tensor,
213
+ router_logits: torch.Tensor,
214
+ top_k: int,
215
+ renormalize: bool,
216
+ use_grouped_topk: bool = False,
217
+ global_num_experts: int = -1,
218
+ expert_map: Optional[torch.Tensor] = None,
219
+ topk_group: Optional[int] = None,
220
+ num_expert_group: Optional[int] = None,
221
+ custom_routing_function: Optional[Callable] = None,
222
+ scoring_func: str = "softmax",
223
+ e_score_correction_bias: Optional[torch.Tensor] = None,
224
+ is_prefill: bool = True,
225
+ enable_force_load_balance: bool = False,
226
+ log2phy: torch.Tensor = None,
227
+ global_redundant_expert_num: int = 0,
228
+ shared_experts: Optional[Any] = None,
229
+ **kwargs,
230
+ ) -> torch.Tensor:
231
+ assert router_logits.shape[
232
+ 1] == global_num_experts, "Number of global experts mismatch"
233
+
234
+ topk_weights, topk_ids = select_experts(
235
+ hidden_states=x,
236
+ router_logits=router_logits,
237
+ top_k=top_k,
238
+ use_grouped_topk=use_grouped_topk,
239
+ renormalize=renormalize,
240
+ topk_group=topk_group,
241
+ num_expert_group=num_expert_group,
242
+ custom_routing_function=custom_routing_function,
243
+ scoring_func=scoring_func,
244
+ e_score_correction_bias=e_score_correction_bias,
245
+ global_num_experts=global_num_experts,
246
+ )
247
+
248
+ if is_310p():
249
+ return fused_experts_310p(hidden_states=x,
250
+ w1=layer.w13_weight,
251
+ w1_scale=layer.w13_weight_scale,
252
+ w1_input_scale=layer.w13_input_scale,
253
+ w2=layer.w2_weight,
254
+ w2_scale=layer.w2_weight_scale,
255
+ w2_input_scale=layer.w2_input_scale,
256
+ topk_weights=topk_weights,
257
+ topk_ids=topk_ids,
258
+ top_k=top_k,
259
+ global_num_experts=global_num_experts,
260
+ expert_map=expert_map)
261
+ return fused_experts(hidden_states=x,
262
+ w1=layer.w13_weight,
263
+ w1_scale=layer.w13_weight_scale,
264
+ w1_input_scale=layer.w13_input_scale,
265
+ w1_input_offset=layer.w13_input_offset,
266
+ w2=layer.w2_weight,
267
+ w2_scale=layer.w2_weight_scale,
268
+ w2_input_scale=layer.w2_input_scale,
269
+ w2_input_offset=layer.w2_input_offset,
270
+ topk_weights=topk_weights,
271
+ topk_ids=topk_ids,
272
+ top_k=top_k,
273
+ global_num_experts=global_num_experts,
274
+ expert_map=expert_map)
275
+
276
+ def process_weights_after_loading(self, layer):
277
+ if not is_310p():
278
+ layer.w13_weight.data = layer.w13_weight.data.transpose(
279
+ 1, 2).contiguous()
280
+ layer.w2_weight.data = layer.w2_weight.data.transpose(
281
+ 1, 2).contiguous()
282
+ layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
283
+ layer.w13_weight_scale.data.shape[0], -1)
284
+
285
+ layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
286
+ layer.w13_weight_offset.data.shape[0], -1)
287
+ layer.w2_weight_scale.data = layer.w2_weight_scale.data.view(
288
+ layer.w2_weight_scale.data.shape[0], -1)
289
+ layer.w2_weight_offset.data = layer.w2_weight_offset.data.view(
290
+ layer.w2_weight_offset.data.shape[0], -1)
291
+ expanding_factor_w13 = layer.w13_weight.data.shape[1]
292
+ expanding_factor_w2 = layer.w2_weight.data.shape[1]
293
+
294
+ if is_310p():
295
+ layer.w13_input_scale.data = torch.nn.Parameter(
296
+ layer.w13_input_scale.data.max())
297
+ layer.w2_input_scale.data = torch.nn.Parameter(
298
+ layer.w2_input_scale.data.max())
299
+ else:
300
+ layer.w13_input_scale.data = torch.nn.Parameter(
301
+ layer.w13_input_scale.data.repeat(1,
302
+ expanding_factor_w13)[0:1])
303
+ layer.w2_input_scale.data = torch.nn.Parameter(
304
+ layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1])
305
+
306
+ layer.w13_input_offset.data = torch.nn.Parameter(
307
+ layer.w13_input_scale.data.repeat(1, expanding_factor_w13)[0:1])
308
+ layer.w2_input_offset.data = torch.nn.Parameter(
309
+ layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1])
310
+
311
+ # converting ACL_FORMAT_FRACTAL_NZ.
312
+ # npu_quant_grouped_matmul_dequant in eager mode does not accept
313
+ # ACL_FORMAT_FRACTAL_NZ.
314
+ if not is_310p():
315
+ layer.w13_weight.data = torch_npu.npu_format_cast(
316
+ layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
317
+ layer.w2_weight.data = torch_npu.npu_format_cast(
318
+ layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
319
+
320
+
321
+ class AscendC8KVCacheMethod:
322
+
323
+ def __init__(self) -> None:
324
+ self.antiquant_scale_comb = None
325
+
326
+ @staticmethod
327
+ def create_weights(layer) -> None:
328
+ param_dict = {} # num_kv_heads * head_size
329
+ param_dict["key_antiquant_scale"] = torch.empty(layer.num_kv_heads *
330
+ layer.head_size,
331
+ dtype=torch.float16,
332
+ requires_grad=False)
333
+ param_dict["value_antiquant_scale"] = torch.empty(layer.num_kv_heads *
334
+ layer.head_size,
335
+ dtype=torch.float16,
336
+ requires_grad=False)
337
+ for weight_name, weight_param in param_dict.items():
338
+ param = torch.nn.Parameter(weight_param, requires_grad=False)
339
+ layer.register_parameter(weight_name, param)
340
+
341
+ def process_weights_after_loading(self, layer):
342
+ self.antiquant_scale_comb = torch.cat(
343
+ (layer.key_antiquant_scale.data.unsqueeze(0),
344
+ layer.value_antiquant_scale.data.unsqueeze(0)),
345
+ dim=0).to(torch.float16).contiguous()
346
+
347
+ def apply(self, layer, query, key, value, kv_cache, attn_metadata,
348
+ attn_type, scale, output) -> torch.Tensor:
349
+ num_tokens = query.shape[0]
350
+ if attn_metadata is None:
351
+ return output.view(num_tokens, layer.num_heads * layer.head_size)
352
+ assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
353
+ if attn_type != AttentionType.DECODER:
354
+ raise NotImplementedError("Encoder self-attention and "
355
+ "encoder/decoder cross-attention "
356
+ "are not implemented for "
357
+ "PallasAttentionBackendImpl")
358
+
359
+ # C8
360
+ quant_key = quant_per_tensor(
361
+ key.view(-1, layer.num_kv_heads * layer.head_size),
362
+ layer.key_antiquant_scale.data.view(-1), None, True)
363
+ quant_value = quant_per_tensor(
364
+ value.view(-1, layer.num_kv_heads * layer.head_size),
365
+ layer.value_antiquant_scale.data.view(-1), None, True)
366
+
367
+ # View q k v to BSH.
368
+ query = query.view(-1, layer.num_heads, layer.head_size)
369
+ key = key.view(-1, layer.num_kv_heads, layer.head_size)
370
+ value = value.view(-1, layer.num_kv_heads, layer.head_size)
371
+ # TODO: Remove this contiguous in the future.
372
+ value = value.contiguous()
373
+
374
+ if kv_cache[0].numel() > 0:
375
+ # if key_cache is None:
376
+ key_cache, value_cache = kv_cache[0], kv_cache[1]
377
+ slots = attn_metadata.slot_mapping
378
+
379
+ block_size = key_cache.shape[1]
380
+ slots_indices = slots.reshape(-1, 1)
381
+ block_indices = slots_indices // block_size
382
+ slots_indices = slots_indices % block_size
383
+ indices = torch.cat((block_indices, slots_indices), dim=1)
384
+
385
+ # C8
386
+ torch_npu.npu_scatter_nd_update_(key_cache, indices, quant_key)
387
+ torch_npu.npu_scatter_nd_update_(value_cache, indices, quant_value)
388
+
389
+ # V0-Style scheduler situation.
390
+ if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
391
+ assert attn_metadata is not None
392
+ assert attn_metadata.attn_mask is not None
393
+ mask = attn_metadata.attn_mask
394
+ torch_npu._npu_flash_attention(query=query,
395
+ key=key,
396
+ value=value,
397
+ mask=mask,
398
+ seq_len=attn_metadata.seq_lens,
399
+ scale_value=scale,
400
+ num_heads=layer.num_heads,
401
+ num_kv_heads=layer.num_kv_heads,
402
+ out=output.reshape(query.shape))
403
+
404
+ elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
405
+ raise NotImplementedError("kv cache int8 are not "
406
+ "implemented for "
407
+ "PrefillCacheHit")
408
+ elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: # changed attn_metadata.attn_state == AscendAttentionState.DecodeOnly
409
+ if hasattr(attn_metadata, "decode"):
410
+ # torch_air
411
+ decode_meta = attn_metadata.decode
412
+ seq_lens = decode_meta.seq_lens_list
413
+ else:
414
+ seq_lens = attn_metadata.seq_lens
415
+ block_size = key_cache.shape[1]
416
+ query = query.view(num_tokens, 1, layer.num_heads *
417
+ layer.head_size).contiguous() # changed
418
+
419
+ # [num_blocks, block_size, N, D] --> [num_blocks, N, block_size, D]
420
+ key = key_cache
421
+ value = value_cache
422
+
423
+ output = torch_npu.npu_incre_flash_attention(
424
+ query,
425
+ key,
426
+ value,
427
+ num_key_value_heads=layer.num_kv_heads,
428
+ num_heads=layer.num_heads,
429
+ actual_seq_lengths=seq_lens,
430
+ scale_value=scale,
431
+ input_layout='BSH',
432
+ block_size=block_size,
433
+ block_table=attn_metadata.block_tables,
434
+ antiquant_scale=self.antiquant_scale_comb,
435
+ )
436
+
437
+ # Normal V1 situation.
438
+ else:
439
+ raise NotImplementedError("kv cache int8 are not "
440
+ "implemented for "
441
+ "other case")
442
+ return output
443
+
444
+
445
+ def fused_experts_310p(
446
+ hidden_states: torch.Tensor,
447
+ w1: torch.Tensor,
448
+ w1_scale: torch.Tensor,
449
+ w1_input_scale: torch.Tensor,
450
+ w2: torch.Tensor,
451
+ w2_scale: torch.Tensor,
452
+ w2_input_scale: torch.Tensor,
453
+ topk_weights: torch.Tensor,
454
+ topk_ids: torch.Tensor,
455
+ top_k: int,
456
+ global_num_experts: int,
457
+ expert_map: torch.Tensor = None,
458
+ ) -> torch.Tensor:
459
+ ep_size = get_ep_group().world_size
460
+ local_num_experts = global_num_experts // ep_size
461
+ local_num_group = top_k // ep_size
462
+
463
+ bsz, _ = hidden_states.shape
464
+ flatten_topk_ids = topk_ids.view(-1)
465
+ sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
466
+ sorted_topk_ids = sorted_topk_ids.to(torch.int32)
467
+ sorted_hidden_states = hidden_states.index_select(
468
+ 0, sorted_topk_ids // local_num_group)
469
+
470
+ experts_id = torch.arange(0,
471
+ local_num_experts,
472
+ dtype=topk_ids.dtype,
473
+ device=topk_ids.device)
474
+ num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
475
+ torch.float32).sum(0)
476
+ topk_scales = topk_weights.view(-1).index_select(
477
+ 0, sorted_topk_ids).unsqueeze(-1)
478
+ group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)
479
+
480
+ gate_up_out = torch_npu.npu_quant_grouped_matmul_dequant(
481
+ x=sorted_hidden_states,
482
+ quantized_weight=w1,
483
+ weight_scale=w1_scale,
484
+ group_list=group_list,
485
+ x_scale=w1_input_scale,
486
+ quant_mode="pertensor")
487
+
488
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
489
+ torch.float16)
490
+ gate_up_out *= topk_scales
491
+
492
+ down_out = torch_npu.npu_quant_grouped_matmul_dequant(
493
+ x=gate_up_out,
494
+ quantized_weight=w2,
495
+ weight_scale=w2_scale,
496
+ group_list=group_list,
497
+ x_scale=w2_input_scale,
498
+ quant_mode="pertensor")
499
+
500
+ unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
501
+ unsorted_hidden_states = down_out.index_select(0, unsorted_topk_ids)
502
+ final_hidden_states = unsorted_hidden_states.reshape(
503
+ bsz, top_k // ep_size, -1).sum(1)
504
+
505
+ return final_hidden_states
506
+
507
+
508
+ def fused_experts(
509
+ hidden_states: torch.Tensor,
510
+ w1: torch.Tensor,
511
+ w1_scale: torch.Tensor,
512
+ w1_input_scale: torch.Tensor,
513
+ w1_input_offset: torch.Tensor,
514
+ w2: torch.Tensor,
515
+ w2_scale: torch.Tensor,
516
+ w2_input_scale: torch.Tensor,
517
+ w2_input_offset: torch.Tensor,
518
+ topk_weights: torch.Tensor,
519
+ topk_ids: torch.Tensor,
520
+ top_k: int,
521
+ global_num_experts: int,
522
+ expert_map: torch.Tensor = None,
523
+ ) -> torch.Tensor:
524
+ """
525
+ Fused experts with top-k routing.
526
+
527
+ Args:
528
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
529
+ w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
530
+ w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
531
+ topk_weights: Routing weights of shape (num_tokens, top_k).
532
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
533
+ top_k: Number of experts to select.
534
+ expert_map: Expert mapping of shape (num_experts,).
535
+
536
+ Returns:
537
+ hidden_states: Hidden states after routing.
538
+ """
539
+ """
540
+ # Check constraints.
541
+ assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
542
+ assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
543
+ assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
544
+ assert w1.is_contiguous(), "Expert weights1 must be contiguous"
545
+ assert w2.is_contiguous(), "Expert weights2 must be contiguous"
546
+ """
547
+
548
+ original_dtype = hidden_states.dtype
549
+ ep_size = get_ep_group().world_size
550
+ local_num_experts = global_num_experts // ep_size
551
+ w1_input_scale, _ = w1_input_scale.max(0)
552
+ quant_sorted_hidden_states = quant_per_tensor(
553
+ hidden_states,
554
+ w1_input_scale,
555
+ None,
556
+ True,
557
+ )
558
+ if expert_map is not None:
559
+ expanded_x, expanded_row_idx, expert_token_count, expanded_scale = torch_npu.npu_moe_init_routing_v2(
560
+ quant_sorted_hidden_states,
561
+ topk_ids,
562
+ scale=None,
563
+ active_num=topk_ids.numel(),
564
+ expert_capacity=-1,
565
+ expert_num=local_num_experts,
566
+ drop_pad_mode=0,
567
+ expert_tokens_num_type=1,
568
+ expert_tokens_num_flag=True,
569
+ quant_mode=-1,
570
+ active_expert_range=[0, local_num_experts],
571
+ row_idx_type=0,
572
+ )
573
+
574
+ else:
575
+ raise NotImplementedError(
576
+ "The quantified version of MOE class models "
577
+ "currently does not support tensor parallelism")
578
+ if expanded_x.dtype != w1.dtype:
579
+ w1_input_scale, _ = w1_input_scale.max(0)
580
+ quant_sorted_hidden_states = quant_per_tensor(
581
+ expanded_x,
582
+ w1_input_scale,
583
+ None,
584
+ True,
585
+ )
586
+ else:
587
+ quant_sorted_hidden_states = expanded_x
588
+ gate_up_out = torch_npu.npu_grouped_matmul(
589
+ x=[quant_sorted_hidden_states],
590
+ weight=[w1],
591
+ scale=[w1_scale * w1_input_scale[0]],
592
+ split_item=2,
593
+ group_list_type=1,
594
+ group_type=0,
595
+ group_list=expert_token_count,
596
+ output_dtype=original_dtype,
597
+ )[0]
598
+ gate_up_out = torch_npu.npu_swiglu(gate_up_out)
599
+
600
+ if gate_up_out.dtype != w2.dtype:
601
+ w2_input_scale, _ = w2_input_scale.max(0)
602
+ quant_gate_up_out = quant_per_tensor(
603
+ gate_up_out,
604
+ w2_input_scale,
605
+ None,
606
+ True,
607
+ )
608
+ else:
609
+ quant_gate_up_out = gate_up_out
610
+
611
+ down_out = torch_npu.npu_grouped_matmul(
612
+ x=[quant_gate_up_out],
613
+ weight=[w2],
614
+ scale=[w2_scale * w2_input_scale[0]],
615
+ split_item=2,
616
+ group_list_type=1,
617
+ group_type=0,
618
+ group_list=expert_token_count,
619
+ output_dtype=original_dtype,
620
+ )[0]
621
+
622
+ if expert_map is not None:
623
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
624
+ down_out,
625
+ skip1=None,
626
+ skip2=None,
627
+ bias=None,
628
+ scales=topk_weights.to(down_out.dtype),
629
+ expanded_src_to_dst_row=expanded_row_idx,
630
+ export_for_source_row=topk_ids,
631
+ drop_pad_mode=2,
632
+ )
633
+ else:
634
+ raise NotImplementedError(
635
+ "The quantified version of MOE class models "
636
+ "currently does not support tensor parallelism")
637
+
638
+ return final_hidden_states
639
+
640
+
641
+ def select_experts(
642
+ hidden_states: torch.Tensor,
643
+ router_logits: torch.Tensor,
644
+ top_k: int,
645
+ use_grouped_topk: bool,
646
+ renormalize: bool,
647
+ topk_group: Optional[int] = None,
648
+ num_expert_group: Optional[int] = None,
649
+ custom_routing_function: Optional[Callable] = None,
650
+ scoring_func: str = "softmax",
651
+ e_score_correction_bias: Optional[torch.Tensor] = None,
652
+ global_num_experts=-1,
653
+ ) -> tuple[torch.Tensor, torch.Tensor]:
654
+ """
655
+ Select top-k experts based on router logits.
656
+
657
+ Args:
658
+ hidden_states: Hidden states of shape (num_tokens, hidden_size).
659
+ router_logits: Router logits of shape (num_tokens, num_experts).
660
+ top_k: Number of experts to select.
661
+ use_grouped_topk: Whether to group experts before selecting top-k.
662
+ renormalize: Whether to renormalize the routing weights.
663
+ topk_group: Number of expert groups to select from.
664
+ num_expert_group: Number of experts in each group.
665
+ custom_routing_function: Custom routing function.
666
+ scoring_func: Scoring function to use.
667
+ e_score_correction_bias: Correction bias to apply to expert scores.
668
+
669
+ Returns:
670
+ topk_weights: Routing weights of shape (num_tokens, top_k).
671
+ topk_ids: Selected expert IDs of shape (num_tokens, top_k).
672
+
673
+ Raises:
674
+ ValueError: If an unsupported scoring function is provided.
675
+ """
676
+
677
+ if scoring_func == "softmax":
678
+ # NOTE: vLLM use dtype=torch.float here
679
+ topk_weights = router_logits.softmax(dim=-1)
680
+ elif scoring_func == "sigmoid":
681
+ topk_weights = router_logits.sigmoid()
682
+ else:
683
+ raise ValueError(f"Unsupported scoring function: {scoring_func}")
684
+
685
+ if use_grouped_topk:
686
+ assert topk_group is not None
687
+ assert num_expert_group is not None
688
+
689
+ if e_score_correction_bias is not None:
690
+ # Store original scores before applying correction bias. We use biased
691
+ # scores for expert selection but original scores for routing weights
692
+ original_weights = topk_weights
693
+ topk_weights = topk_weights + e_score_correction_bias.unsqueeze(0)
694
+
695
+ # TODO: Change to npu_group_topk when the latest CANN and NNAL is available
696
+ # >>> torch_npu._npu_group_topk(topk_weights, group_num=num_expert_group, k=topk_group)
697
+ topk_weights = native_grouped_topk(topk_weights, num_expert_group,
698
+ topk_group)
699
+ # TODO bfloat16 is not supported in torch.topk with ge graph.
700
+ if e_score_correction_bias is not None:
701
+ topk_ids = torch.topk(topk_weights.to(torch.float32),
702
+ k=top_k,
703
+ dim=-1,
704
+ sorted=False)[1]
705
+ # Use original unbiased scores for the routing weights
706
+ topk_weights = original_weights.gather(1, topk_ids)
707
+ else:
708
+ topk_weights, topk_ids = torch.topk(topk_weights.to(torch.float32),
709
+ k=top_k,
710
+ dim=-1,
711
+ sorted=False)
712
+ elif custom_routing_function is None:
713
+ topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1)
714
+ else:
715
+ topk_weights, topk_ids = custom_routing_function(
716
+ hidden_states=hidden_states,
717
+ gating_output=router_logits,
718
+ topk=top_k,
719
+ renormalize=renormalize,
720
+ global_num_experts=global_num_experts,
721
+ )
722
+ # Required by npu_moe_init_routing
723
+ topk_ids = topk_ids.to(torch.int32)
724
+ return topk_weights, topk_ids
725
+
726
+ # Required by npu_moe_init_routing
727
+ topk_ids = topk_ids.to(torch.int32)
728
+
729
+ if renormalize:
730
+ topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
731
+
732
+ return topk_weights, topk_ids
733
+
734
+
735
+ def native_grouped_topk(
736
+ topk_weights: torch.Tensor,
737
+ num_expert_group: Optional[int],
738
+ topk_group: Optional[int],
739
+ ):
740
+ topk_group = 0 if topk_group is None else topk_group
741
+ num_expert_group = 0 if num_expert_group is None else num_expert_group
742
+
743
+ num_token = topk_weights.shape[0]
744
+ grouped_weights = topk_weights.view(num_token, num_expert_group,
745
+ -1).max(dim=-1).values
746
+ topk_group_indices = torch.topk(grouped_weights.to(torch.float32),
747
+ k=topk_group,
748
+ dim=-1,
749
+ sorted=False)[1]
750
+ topk_group_mask = torch.zeros_like(grouped_weights)
751
+ topk_group_mask.scatter_(1, topk_group_indices, 1)
752
+ topk_weight_mask = (topk_group_mask.unsqueeze(-1).expand(
753
+ num_token, num_expert_group,
754
+ topk_weights.shape[-1] // num_expert_group).reshape(num_token, -1))
755
+ topk_weights = topk_weights.masked_fill(~topk_weight_mask.bool(), 0.0)
756
+
757
+ return topk_weights
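Illustrative sketch (not part of the uploaded file): grouped top-k routing on a tiny random tensor, mirroring the masking done by native_grouped_topk above; all sizes are made up.

    import torch

    num_tokens, num_expert_group, experts_per_group, topk_group, top_k = 2, 4, 4, 2, 2
    scores = torch.rand(num_tokens, num_expert_group * experts_per_group)

    # 1) score each group by its best expert, keep the topk_group best groups
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = group_scores.topk(topk_group, dim=-1, sorted=False).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)

    # 2) zero out experts of the discarded groups, then take the per-token top-k experts
    expert_mask = group_mask.unsqueeze(-1).expand(-1, -1, experts_per_group).reshape(num_tokens, -1)
    masked_scores = scores.masked_fill(~expert_mask.bool(), 0.0)
    topk_weights, topk_ids = masked_scores.topk(top_k, dim=-1)
    print(topk_ids.shape)  # torch.Size([2, 2])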
inference/vllm_ascend/quantization/w8a8_dynamic.py ADDED
@@ -0,0 +1,831 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # This file is a part of the vllm-ascend project.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ from typing import Any, Callable, Dict, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.distributed as dist
22
+ import torch_npu
23
+ from vllm.distributed import GroupCoordinator
24
+
25
+ import vllm_ascend.envs as envs
26
+ from vllm_ascend.ascend_config import get_ascend_config
27
+ from vllm_ascend.distributed.parallel_state import get_ep_group
28
+ from vllm_ascend.ops.fused_moe import select_experts
29
+ from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, FusedMoEState,
30
+ dispose_tensor, get_fused_moe_state,
31
+ npu_stream_switch, npu_wait_tensor)
32
+
33
+
34
+ def apply_mlp(hidden_states: torch.Tensor,
35
+ w1: torch.Tensor,
36
+ w1_scale: torch.Tensor,
37
+ w2: torch.Tensor,
38
+ w2_scale: torch.Tensor,
39
+ group_list: torch.Tensor,
40
+ dynamic_scale: torch.Tensor = None,
41
+ group_list_type: int = 1) -> torch.Tensor:
42
+ """
43
+ apply MLP: gate_up_proj -> swiglu -> down_proj
44
+
45
+ Args:
46
+ hidden_states: input hidden states with shape (num_tokens, hidden_size).
47
+ w1: expert weights1 with shape
48
+ (num_experts, hidden_size, intermediate_size * 2)
49
+ w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
50
+ w2: expert weights2 with shape
51
+ (num_experts, intermediate_size, hidden_size)
52
+ w2_scale: weights2 scale with shape (num_experts, hidden_size)
53
+ group_list: number of tokens for each expert, follow cumsum mode, and
54
+ with shape (num_experts).
55
+ transpose_weight:
56
+ w1: (num_experts, intermediate_size * 2, hidden_size) ->
57
+ (num_experts, hidden_size, intermediate_size * 2)
58
+ w2: (num_experts, hidden_size, intermediate_size) ->
59
+ (num_experts, intermediate_size, hidden_size)
60
+
61
+ Returns:
62
+ hidden_states: output hidden states after MLP.
63
+ """
64
+
65
+ if dynamic_scale is None:
66
+ unquantized_hidden_states = hidden_states
67
+ hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
68
+ hidden_states)
69
+ # Dispose the original unquantized hidden states
70
+ # to save npu memory because they're no longer used.
71
+ dispose_tensor(unquantized_hidden_states)
72
+ else:
73
+ pertoken_scale = dynamic_scale
74
+
75
+ # gmm1: gate_up_proj
76
+ hidden_states = torch_npu.npu_grouped_matmul(
77
+ x=[hidden_states],
78
+ weight=[w1],
79
+ scale=[w1_scale],
80
+ per_token_scale=[pertoken_scale],
81
+ split_item=2,
82
+ group_list_type=group_list_type,
83
+ group_type=0,
84
+ group_list=group_list,
85
+ output_dtype=w2_scale.dtype)[0]
86
+
87
+ # act_fn: swiglu
88
+ hidden_states = torch_npu.npu_swiglu(hidden_states)
89
+ hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
90
+ hidden_states)
91
+
92
+ # gmm2: down_proj
93
+ hidden_states = torch_npu.npu_grouped_matmul(
94
+ x=[hidden_states],
95
+ weight=[w2],
96
+ scale=[w2_scale],
97
+ per_token_scale=[swiglu_out_scale],
98
+ split_item=2,
99
+ group_list_type=group_list_type,
100
+ group_type=0,
101
+ group_list=group_list,
102
+ output_dtype=w2_scale.dtype)[0]
103
+
104
+ return hidden_states
105
+
106
+
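Illustrative sketch (not part of the uploaded file): a float (unquantized) CPU reference of the gate_up_proj -> SwiGLU -> down_proj pipeline that apply_mlp runs per expert via grouped matmuls. Treating the first half of the gate_up output as the gate mirrors activate_left=True and should be read as an assumption.

    import torch

    tokens, hidden, inter = 4, 16, 32
    x = torch.randn(tokens, hidden)
    w_gate_up = torch.randn(hidden, inter * 2)   # per-expert weight, i.e. w1[e] above
    w_down = torch.randn(inter, hidden)          # per-expert weight, i.e. w2[e] above

    gate_up = x @ w_gate_up
    gate, up = gate_up.chunk(2, dim=-1)
    activated = torch.nn.functional.silu(gate) * up   # SwiGLU
    out = activated @ w_down
    assert out.shape == (tokens, hidden)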
107
+ def fused_experts_with_mc2(
108
+ hidden_states: torch.Tensor,
109
+ w1: torch.Tensor,
110
+ w2: torch.Tensor,
111
+ w1_scale: torch.Tensor,
112
+ w2_scale: torch.Tensor,
113
+ topk_weights: torch.Tensor,
114
+ topk_ids: torch.Tensor,
115
+ top_k: int,
116
+ expert_map: torch.Tensor = None,
117
+ moe_all_to_all_group_name: str = "",
118
+ log2phy: torch.Tensor = None,
119
+ global_redundant_expert_num: int = 0,
120
+ shared_experts: Optional[Any] = None,
121
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
122
+ if log2phy is not None:
123
+ topk_ids = log2phy[topk_ids]
124
+ global_bs = 0
125
+ moe_expert_num = len(expert_map) + global_redundant_expert_num
126
+ # hidden_states = hidden_states.bfloat16()
127
+ kwargs_mc2 = {
128
+ "x": hidden_states,
129
+ "expert_ids": topk_ids,
130
+ "expert_shard_type": 0,
131
+ "shared_expert_rank_num": 0,
132
+ "moe_expert_num": moe_expert_num,
133
+ "global_bs": global_bs,
134
+ "expert_scales": topk_weights.to(torch.float32),
135
+ }
136
+
137
+ rank = torch.distributed.get_rank()
138
+
139
+ quant_mode = 2
140
+ ep_group = get_ep_group().device_group
141
+ local_rank = torch.distributed.get_rank(group=ep_group)
142
+ all_to_all_group_size = torch.distributed.get_world_size(ep_group)
143
+
144
+ world_size = torch.distributed.get_world_size()
145
+ tp_size = world_size // all_to_all_group_size
146
+ tp_rank = rank % tp_size
147
+
148
+ stage1_kwargs = {
149
+ "scales": None,
150
+ "quant_mode": quant_mode,
151
+ "group_ep": moe_all_to_all_group_name,
152
+ "ep_world_size": all_to_all_group_size,
153
+ "ep_rank_id": local_rank,
154
+ # "group_tp": self.moe_rs_group_name,
155
+ "group_tp": moe_all_to_all_group_name,
156
+ "tp_world_size": tp_size,
157
+ "tp_rank_id": tp_rank,
158
+ }
159
+ kwargs_mc2.update(stage1_kwargs)
160
+
161
+ output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
162
+ # comm_stream.wait_stream(torch.npu.current_stream())
163
+ expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts, _, expand_scales = output[
164
+ 0:7]
165
+
166
+ if shared_experts is not None:
167
+ with npu_stream_switch("moe_secondary", 0):
168
+ npu_wait_tensor(hidden_states, topk_weights)
169
+ shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
170
+ npu_wait_tensor(shared_gate_up[0], expand_x)
171
+ shared_act = shared_experts.act_fn(shared_gate_up)
172
+
173
+ # `expand_x` will be disposed in the `apply_mlp` function
174
+ down_out_list = apply_mlp(expand_x,
175
+ w1,
176
+ w1_scale,
177
+ w2,
178
+ w2_scale,
179
+ expert_token_nums,
180
+ dynamic_scale=dynamic_scale)
181
+
182
+ # moeCombine
183
+ kwargs_mc2 = {
184
+ "expand_x": down_out_list,
185
+ "expert_ids": topk_ids,
186
+ "expand_idx": expand_idx,
187
+ "expert_scales": topk_weights.to(torch.float32),
188
+ "expert_shard_type": 0,
189
+ "shared_expert_rank_num": 0,
190
+ "moe_expert_num": moe_expert_num,
191
+ "global_bs": 0,
192
+ "expand_scales": expand_scales,
193
+ }
194
+ tp_recv_counts = torch.empty(1,
195
+ dtype=torch.int32,
196
+ device=hidden_states.device)
197
+ stage3_kwargs = {
198
+ "ep_send_counts": ep_recv_counts,
199
+ "group_ep": moe_all_to_all_group_name,
200
+ "ep_world_size": all_to_all_group_size,
201
+ "ep_rank_id": local_rank,
202
+ "tp_send_counts": tp_recv_counts,
203
+ # "group_tp": self.moe_rs_group_name,
204
+ "group_tp": moe_all_to_all_group_name,
205
+ "tp_world_size": tp_size,
206
+ "tp_rank_id": tp_rank,
207
+ }
208
+ kwargs_mc2.update(stage3_kwargs)
209
+
210
+ hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
211
+
212
+ if shared_experts is None:
213
+ return hidden_states
214
+ else:
215
+ with npu_stream_switch("moe_secondary", 0):
216
+ npu_wait_tensor(shared_act[0], down_out_list)
217
+ shared_output, _ = shared_experts.down_proj(shared_act)
218
+ return hidden_states, shared_output
219
+
220
+
221
+ # currently expert parallelism implemented with all2all
222
+ # is under-optimized.
223
+ def fused_experts_with_all2all(
224
+ hidden_states: torch.Tensor,
225
+ w1: torch.Tensor,
226
+ w1_scale: torch.Tensor,
227
+ w2: torch.Tensor,
228
+ w2_scale: torch.Tensor,
229
+ topk_weights: torch.Tensor,
230
+ topk_ids: torch.Tensor,
231
+ top_k: int,
232
+ expert_map: torch.Tensor = None,
233
+ ep_group: GroupCoordinator = None,
234
+ log2phy: torch.Tensor = None,
235
+ global_redundant_expert_num: int = 0,
236
+ ):
237
+ if log2phy is not None:
238
+ topk_ids = log2phy[topk_ids]
239
+ original_shape = hidden_states.shape
240
+ if len(original_shape) == 3:
241
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
242
+
243
+ num_tokens, _ = hidden_states.shape
244
+ num_experts = w1.shape[0]
245
+ device = hidden_states.device
246
+
247
+ if expert_map is not None:
248
+ global_num_experts = len(expert_map) + global_redundant_expert_num
249
+ local_num_experts = global_num_experts // ep_group.world_size
250
+ row_idx_len = num_tokens * top_k
251
+ row_idx = (torch.arange(0,
252
+ row_idx_len,
253
+ dtype=torch.int32,
254
+ device=device).view(top_k, -1).permute(
255
+ 1, 0).contiguous())
256
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
257
+ hidden_states,
258
+ row_idx=row_idx,
259
+ expert_idx=topk_ids,
260
+ active_num=num_tokens)
261
+
262
+ global_expert_tokens = torch.bincount(expanded_expert_idx,
263
+ minlength=global_num_experts)
264
+ scatter_sizes = global_expert_tokens.view(ep_group.world_size,
265
+ -1).sum(-1)
266
+
267
+ gather_sizes = torch.empty_like(scatter_sizes)
268
+ dist.all_to_all_single(gather_sizes,
269
+ scatter_sizes,
270
+ group=ep_group.device_group)
271
+ scatter_size_list = scatter_sizes.cpu().tolist()
272
+ gather_size_list = gather_sizes.cpu().tolist()
273
+
274
+ expanded_expert_idx = expanded_expert_idx % local_num_experts
275
+ hidden_states = ep_group.all_to_all(hidden_states, 0, 0,
276
+ scatter_size_list,
277
+ gather_size_list)
278
+ local_expert_idx = ep_group.all_to_all(expanded_expert_idx, 0, 0,
279
+ scatter_size_list,
280
+ gather_size_list)
281
+
282
+ sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx)
283
+
284
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
285
+ sorted_local_expert_idx, local_num_experts).to(torch.int64)
286
+
287
+ hidden_states = hidden_states[sorted_idx]
288
+ group_list_type = 0
289
+ else:
290
+ row_idx_len = num_tokens * top_k
291
+ row_idx = torch.arange(0,
292
+ row_idx_len,
293
+ dtype=torch.int32,
294
+ device=topk_weights.device).view(
295
+ top_k, -1).permute(1, 0).contiguous()
296
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
297
+ hidden_states,
298
+ row_idx=row_idx,
299
+ expert_idx=topk_ids,
300
+ active_num=num_tokens)
301
+
302
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
303
+ expanded_expert_idx, num_experts)
304
+ expert_tokens = expert_tokens.to(torch.int64)
305
+ group_list_type = 0
306
+
307
+ # `hidden_states` will be disposed in the `apply_mlp` function
308
+ hidden_states = apply_mlp(
309
+ hidden_states,
310
+ w1,
311
+ w1_scale, #17
312
+ w2,
313
+ w2_scale,
314
+ expert_tokens, #16
315
+ group_list_type=group_list_type)
316
+
317
+ if expert_map is not None:
318
+ resorted_idx = torch.argsort(sorted_idx)
319
+ hidden_states = hidden_states[resorted_idx]
320
+ hidden_states = ep_group.all_to_all(hidden_states, 0, 0,
321
+ gather_size_list,
322
+ scatter_size_list)
323
+
324
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
325
+ hidden_states,
326
+ skip1=None,
327
+ skip2=None,
328
+ bias=None,
329
+ scales=topk_weights,
330
+ expanded_src_to_dst_row=expanded_row_idx,
331
+ export_for_source_row=topk_ids,
332
+ )
333
+ else:
334
+ # TODO: Reorder device memory 2 times here, replace the current
335
+ # implementation here when suitable operators become available.
336
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
337
+ hidden_states,
338
+ skip1=None,
339
+ skip2=None,
340
+ bias=None,
341
+ scales=topk_weights,
342
+ expanded_src_to_dst_row=expanded_row_idx,
343
+ export_for_source_row=topk_ids,
344
+ )
345
+ if len(original_shape) == 3:
346
+ final_hidden_states = final_hidden_states.view(original_shape)
347
+ return final_hidden_states
348
+
349
+
350
+ def fused_experts_with_allgather(hidden_states: torch.Tensor,
351
+ w1: torch.Tensor,
352
+ w1_scale: torch.Tensor,
353
+ w2: torch.Tensor,
354
+ w2_scale: torch.Tensor,
355
+ topk_weights: torch.Tensor,
356
+ topk_ids: torch.Tensor,
357
+ top_k: int,
358
+ expert_map: torch.Tensor = None):
359
+ original_shape = hidden_states.shape
360
+ if len(original_shape) == 3:
361
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
362
+ num_tokens = hidden_states.shape[0]
363
+ batch_size, hidden_size = hidden_states.shape
364
+
365
+ ep_group = get_ep_group().device_group
366
+ ep_rank = torch.distributed.get_rank(group=ep_group)
367
+ ep_size = torch.distributed.get_world_size(ep_group)
368
+
369
+ global_num_experts = len(expert_map)
370
+ local_num_experts = global_num_experts // ep_size
371
+
372
+ hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
373
+
374
+ hidden_states, expanded_x_idx, expert_tokens, pertoken_scale = torch_npu.npu_moe_init_routing_v2(
375
+ hidden_states,
376
+ topk_ids,
377
+ scale=pertoken_scale,
378
+ offset=None,
379
+ active_num=num_tokens * top_k,
380
+ expert_num=global_num_experts,
381
+ expert_tokens_num_type=1,
382
+ expert_tokens_num_flag=True,
383
+ active_expert_range=[
384
+ ep_rank * local_num_experts, (ep_rank + 1) * local_num_experts
385
+ ],
386
+ quant_mode=-1,
387
+ row_idx_type=0)
388
+ group_list_type = 1
389
+
390
+
391
+ hidden_states = torch_npu.npu_grouped_matmul(
392
+ x=[hidden_states],
393
+ weight=[w1],
394
+ split_item=3,
395
+ group_list_type=group_list_type,
396
+ group_type=0,
397
+ group_list=expert_tokens,
398
+ output_dtype=torch.int32)[0]
399
+
400
+ # act_fn: swiglu
401
+ hidden_states, pertoken_scale = torch_npu.npu_dequant_swiglu_quant(
402
+ x=hidden_states,
403
+ weight_scale=w1_scale.to(torch.float32),
404
+ activation_scale=pertoken_scale,
405
+ bias=None,
406
+ quant_scale=None,
407
+ quant_offset=None,
408
+ group_index=expert_tokens,
409
+ activate_left=True,
410
+ quant_mode=1,
411
+ )
412
+
413
+ hidden_states = torch_npu.npu_grouped_matmul(
414
+ x=[hidden_states],
415
+ weight=[w2],
416
+ scale=[w2_scale.to(torch.bfloat16)],
417
+ per_token_scale=[pertoken_scale.view(-1)],
418
+ split_item=3,
419
+ group_list_type=group_list_type,
420
+ group_type=0,
421
+ group_list=expert_tokens,
422
+ output_dtype=torch.bfloat16)[0]
423
+
424
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
425
+ expanded_permuted_rows=hidden_states.unsqueeze(1),
426
+ skip1=None,
427
+ skip2=None,
428
+ bias=None,
429
+ scales=topk_weights.to(torch.bfloat16),
430
+ expanded_src_to_dst_row=expanded_x_idx.to(torch.int32),
431
+ export_for_source_row=topk_ids,
432
+ drop_pad_mode=3
433
+ ).to(torch.bfloat16)
434
+
435
+ if len(original_shape) == 3:
436
+ final_hidden_states = final_hidden_states.view(original_shape)
437
+
438
+ return final_hidden_states
439
+
440
+
441
+ def fused_experts(hidden_states: torch.Tensor,
442
+ w1: torch.Tensor,
443
+ w1_scale: torch.Tensor,
444
+ w2: torch.Tensor,
445
+ w2_scale: torch.Tensor,
446
+ topk_weights: torch.Tensor,
447
+ topk_ids: torch.Tensor,
448
+ top_k: int,
449
+ expert_map: torch.Tensor = None):
450
+ original_shape = hidden_states.shape
451
+ if len(original_shape) == 3:
452
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
453
+
454
+ num_tokens, _ = hidden_states.shape
455
+ num_experts = w1.shape[0]
456
+ dtype = hidden_states.dtype
457
+ device = hidden_states.device
458
+
459
+ if expert_map is not None:
460
+ # Generate token indices and flatten
461
+ token_indices = (torch.arange(num_tokens,
462
+ device=device,
463
+ dtype=torch.int64).unsqueeze(1).expand(
464
+ -1, top_k).reshape(-1))
465
+
466
+ # Flatten token-to-expert mappings and map to local experts
467
+ weights_flat = topk_weights.view(-1)
468
+ experts_flat = topk_ids.view(-1)
469
+ local_experts_flat = expert_map[experts_flat]
470
+
471
+ # Filter valid token-expert pairs
472
+ mask = local_experts_flat != -1
473
+ filtered_weights = torch.where(
474
+ mask, weights_flat, torch.zeros_like(weights_flat)).to(dtype)
475
+ filtered_experts = torch.where(
476
+ mask, local_experts_flat,
477
+ torch.full_like(local_experts_flat,
478
+ num_experts)).to(topk_ids.dtype)
479
+
480
+ # Sort by local expert IDs
481
+ sort_indices = torch.argsort(filtered_experts)
482
+ sorted_token_indices = token_indices[sort_indices]
483
+ sorted_weights = filtered_weights[sort_indices]
484
+
485
+ # Compute token counts with minlength of num_experts
486
+ # This is equivalent to but faster than:
487
+ # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1]
488
+ token_counts = torch.zeros(num_experts + 1,
489
+ device=device,
490
+ dtype=torch.int64)
491
+ ones = torch.ones_like(filtered_experts, dtype=torch.int64)
492
+ token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones)
493
+ expert_tokens = token_counts[:num_experts]
494
+ # Rearrange hidden_states
495
+ hidden_states = hidden_states[sorted_token_indices]
496
+ group_list_type = 1
497
+ else:
498
+ row_idx_len = num_tokens * top_k
499
+ row_idx = torch.arange(0,
500
+ row_idx_len,
501
+ dtype=torch.int32,
502
+ device=topk_weights.device).view(
503
+ top_k, -1).permute(1, 0).contiguous()
504
+ hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing(
505
+ hidden_states,
506
+ row_idx=row_idx,
507
+ expert_idx=topk_ids,
508
+ active_num=num_tokens)
509
+
510
+ expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
511
+ expanded_expert_idx, num_experts)
512
+ expert_tokens = expert_tokens.to(torch.int64)
513
+ group_list_type = 0
514
+
515
+ # `hidden_states` will be disposed in the `apply_mlp` function
516
+ hidden_states = apply_mlp(hidden_states,
517
+ w1,
518
+ w1_scale,
519
+ w2,
520
+ w2_scale,
521
+ expert_tokens,
522
+ group_list_type=group_list_type)
523
+
524
+ if expert_map is not None:
525
+ hidden_states.mul_(sorted_weights.unsqueeze(1))
526
+ final_hidden_states = torch.zeros(*original_shape,
527
+ device=device,
528
+ dtype=dtype)
529
+
530
+ num_valid_tokens = mask.sum()
531
+ valid_token_mask = torch.arange(
532
+ 0, sorted_token_indices.shape[0],
533
+ device=device).unsqueeze(1) < num_valid_tokens
534
+ hidden_states = hidden_states.masked_fill_(~valid_token_mask,
535
+ 0).to(dtype)
536
+ final_hidden_states.index_add_(0, sorted_token_indices, hidden_states)
537
+ else:
538
+ # TODO: Reorder device memory 2 times here, replace the current
539
+ # implementation here when suitable operators become available.
540
+ final_hidden_states = torch_npu.npu_moe_finalize_routing(
541
+ hidden_states,
542
+ skip1=None,
543
+ skip2=None,
544
+ bias=None,
545
+ scales=topk_weights,
546
+ expanded_src_to_dst_row=expanded_row_idx,
547
+ export_for_source_row=topk_ids,
548
+ )
549
+
550
+ if len(original_shape) == 3:
551
+ final_hidden_states = final_hidden_states.view(original_shape)
552
+ return final_hidden_states
553
+
554
+
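Illustrative sketch (not part of the uploaded file): the scatter_add_-based per-expert token counting used in the expert_map branch above is equivalent to bincount with a sentinel bucket for tokens routed off-rank; the values below are made up.

    import torch

    num_experts = 4
    filtered_experts = torch.tensor([0, 2, 2, 4, 1, 4])  # 4 == sentinel for "not on this rank"

    counts = torch.zeros(num_experts + 1, dtype=torch.int64)
    counts.scatter_add_(0, filtered_experts, torch.ones_like(filtered_experts))
    expert_tokens = counts[:num_experts]

    assert torch.equal(
        expert_tokens,
        torch.bincount(filtered_experts, minlength=num_experts + 1)[:num_experts])
    print(expert_tokens)  # tensor([1, 1, 2, 0])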
555
+ class AscendW8A8DynamicLinearMethod:
556
+ """Linear method for Ascend W8A8_DYNAMIC.
557
+ """
558
+
559
+ def __init__(self):
560
+ self.transpose_weight = True
561
+
562
+ @staticmethod
563
+ def get_weight(input_size: int, output_size: int,
564
+ params_dtype: torch.dtype) -> Dict[str, Any]:
565
+ params_dict = {
566
+ "weight": torch.empty(output_size, input_size, dtype=torch.int8)
567
+ }
568
+ return params_dict
569
+
570
+ @staticmethod
571
+ def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
572
+ return {}
573
+
574
+ @staticmethod
575
+ def get_perchannel_param(
576
+ output_size: int,
577
+ params_dtype: torch.dtype,
578
+ ) -> Dict[str, Any]:
579
+ params_dict = {}
580
+ params_dict["weight_scale"] = torch.empty(output_size,
581
+ 1,
582
+ dtype=params_dtype)
583
+ params_dict["weight_offset"] = torch.empty(output_size,
584
+ 1,
585
+ dtype=params_dtype)
586
+ return params_dict
587
+
588
+ @staticmethod
589
+ def apply(
590
+ layer: torch.nn.Module,
591
+ x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
592
+ bias: Optional[torch.Tensor] = None,
593
+ tp_rank: Optional[int] = 0,
594
+ ) -> torch.Tensor:
595
+ config = getattr(layer, "_ascend_quant_config", {})
596
+ if not isinstance(x, tuple):
597
+ output_dtype = config.get("output_dtype", x.dtype)
598
+ quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
599
+ else:
600
+ assert "output_dtype" in config.keys(), (
601
+ f"DynamicLinearMethod needs explicitly specified `output_dtype`"
602
+ f"for pre-quantized input, got config [{config}]")
603
+ output_dtype = config["output_dtype"]
604
+ quantized_x, dynamic_scale = x
605
+ pertoken_scale = (dynamic_scale
606
+ if config.get("pertoken_scale", True) else None)
607
+
608
+ output = torch_npu.npu_quant_matmul(
609
+ quantized_x,
610
+ layer.weight,
611
+ layer.weight_scale,
612
+ pertoken_scale=pertoken_scale,
613
+ bias=bias,
614
+ output_dtype=output_dtype,
615
+ )
616
+ return ((output, dynamic_scale)
617
+ if config.get("return_scale", False) else output)
618
+
619
+ def process_weights_after_loading(self, layer):
620
+ if self.transpose_weight:
621
+ layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
622
+ # cast quantized weight tensors in NZ format (29) for higher inference speed
623
+ layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29)
624
+ layer.weight_scale.data = layer.weight_scale.data.flatten()
625
+ layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
626
+ layer.weight_offset.data = layer.weight_offset.data.flatten()
627
+
628
+
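Illustrative sketch (not part of the uploaded file): a CPU reference of what W8A8 dynamic quantization computes, i.e. the roles of npu_dynamic_quant (per-token activation scales) and npu_quant_matmul (int8 matmul dequantized by per-token times per-channel scales). The rounding scheme here is a simplification, not the NPU kernel.

    import torch

    x = torch.randn(4, 16)                      # activations (num_tokens, in_features)
    w = torch.randn(8, 16)                      # weight (out_features, in_features)

    x_scale = x.abs().amax(dim=1, keepdim=True) / 127.0   # per-token scale
    w_scale = w.abs().amax(dim=1, keepdim=True) / 127.0   # per-channel scale
    x_q = torch.clamp((x / x_scale).round(), -128, 127)
    w_q = torch.clamp((w / w_scale).round(), -128, 127)

    y = (x_q @ w_q.t()) * x_scale * w_scale.t()  # dequantized output, approximates x @ w.t()
    print((y - x @ w.t()).abs().max())           # small quantization error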
629
+ class AscendW8A8DynamicFusedMoEMethod:
630
+ """FusedMoe method for Ascend W8A8_DYNAMIC.
631
+ """
632
+
633
+ def __init__(self):
634
+ self.transpose_weight = True
635
+
636
+ self.ep_group = get_ep_group()
637
+
638
+ ascend_config = get_ascend_config()
639
+ self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
640
+
641
+ try:
642
+ device_group = self.ep_group.device_group
643
+ # TODO: Try local_rank = ep_group.rank_in_group
644
+ local_rank = torch.distributed.get_rank(group=device_group)
645
+ backend = device_group._get_backend(torch.device("npu"))
646
+ self.moe_all_to_all_group_name = backend.get_hccl_comm_name(
647
+ local_rank)
648
+ except AttributeError:
649
+ self.moe_all_to_all_group_name = ""
650
+
651
+ @staticmethod
652
+ def get_weight(num_experts: int, intermediate_size_per_partition: int,
653
+ hidden_sizes: int,
654
+ params_dtype: torch.dtype) -> Dict[str, Any]:
655
+ param_dict = {}
656
+ param_dict["w13_weight"] = torch.empty(num_experts,
657
+ 2 *
658
+ intermediate_size_per_partition,
659
+ hidden_sizes,
660
+ dtype=torch.int8)
661
+ param_dict["w2_weight"] = torch.empty(num_experts,
662
+ hidden_sizes,
663
+ intermediate_size_per_partition,
664
+ dtype=torch.int8)
665
+ return param_dict
666
+
667
+ @staticmethod
668
+ def get_dynamic_quant_param(num_experts: int,
669
+ intermediate_size_per_partition: int,
670
+ hidden_sizes: int,
671
+ params_dtype: torch.dtype) -> Dict[str, Any]:
672
+ param_dict = {}
673
+ param_dict["w13_weight_scale"] = torch.empty(
674
+ num_experts,
675
+ 2 * intermediate_size_per_partition,
676
+ 1,
677
+ dtype=params_dtype)
678
+ param_dict["w13_weight_offset"] = torch.empty(
679
+ num_experts,
680
+ 2 * intermediate_size_per_partition,
681
+ 1,
682
+ dtype=params_dtype)
683
+ param_dict["w2_weight_scale"] = torch.empty(num_experts,
684
+ hidden_sizes,
685
+ 1,
686
+ dtype=params_dtype)
687
+ param_dict["w2_weight_offset"] = torch.empty(num_experts,
688
+ hidden_sizes,
689
+ 1,
690
+ dtype=params_dtype)
691
+ return param_dict
692
+
693
+ def apply(
694
+ self,
695
+ layer: torch.nn.Module,
696
+ x: torch.Tensor,
697
+ router_logits: torch.Tensor,
698
+ top_k: int,
699
+ renormalize: bool,
700
+ use_grouped_topk: bool = False,
701
+ global_num_experts: int = -1,
702
+ expert_map: Optional[torch.Tensor] = None,
703
+ topk_group: Optional[int] = None,
704
+ num_expert_group: Optional[int] = None,
705
+ custom_routing_function: Optional[Callable] = None,
706
+ scoring_func: str = "softmax",
707
+ e_score_correction_bias: Optional[torch.Tensor] = None,
708
+ is_prefill: bool = True,
709
+ enable_force_load_balance: bool = True,
710
+ log2phy: torch.Tensor = None,
711
+ global_redundant_expert_num: int = 0,
712
+ shared_experts: Optional[Any] = None,
713
+ **kwargs,
714
+ ) -> torch.Tensor:
715
+ assert router_logits.shape[
716
+ 1] == global_num_experts, "Number of global experts mismatch"
717
+
718
+ is_deepseek_v3_r1 = global_num_experts == 256
719
+ use_grouped_topk = (topk_group > 1 or num_expert_group > 1)
720
+
721
+ # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
722
+ if use_grouped_topk and is_deepseek_v3_r1:
723
+ topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
724
+ router_logits,
725
+ k=top_k, # top_k is currently set to 8 for this model
726
+ bias=e_score_correction_bias,
727
+ k_group=topk_group, # fix: 4
728
+ group_count=num_expert_group, # fix 8
729
+ group_select_mode=1, # 0: take the max within each group; 1: sum of the top-2 in each group (fixed)
730
+ renorm=0, # 0: softmax->topk(fix); 1: topk->softmax
731
+ norm_type=1, # 0: softmax; 1: sigmoid(fix)
732
+ # out_flag=False, # todo new api; whether to return the third output
733
+ # y2_flag=False, # old api; whether to return the third output
734
+ routed_scaling_factor=1,
735
+ eps=float(1e-20))
736
+ else:
737
+ topk_weights, topk_ids = select_experts(
738
+ hidden_states=x,
739
+ router_logits=router_logits,
740
+ top_k=top_k,
741
+ use_grouped_topk=use_grouped_topk,
742
+ renormalize=renormalize,
743
+ topk_group=topk_group,
744
+ num_expert_group=num_expert_group,
745
+ custom_routing_function=custom_routing_function,
746
+ scoring_func=scoring_func,
747
+ e_score_correction_bias=e_score_correction_bias,
748
+ )
749
+
750
+ # this is a naive implementation of expert load balancing, used to
+ # avoid accumulating too many tokens on a single rank.
752
+ # currently it is only activated when doing profile runs.
753
+ if enable_force_load_balance:
754
+ topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
755
+
756
+ topk_weights = topk_weights.to(x.dtype)
757
+
758
+ fused_moe_state = get_fused_moe_state(self.ep_group.world_size,
759
+ is_prefill, is_deepseek_v3_r1)
760
+ if fused_moe_state == FusedMoEState.AllGatherEP:
761
+ return fused_experts_with_allgather(
762
+ hidden_states=x,
763
+ w1=layer.w13_weight,
764
+ w1_scale=layer.w13_weight_scale,
765
+ w2=layer.w2_weight,
766
+ w2_scale=layer.w2_weight_scale,
767
+ topk_weights=topk_weights,
768
+ topk_ids=topk_ids,
769
+ top_k=top_k,
770
+ expert_map=expert_map)
771
+ elif fused_moe_state == FusedMoEState.MC2:
772
+ return fused_experts_with_mc2(
773
+ hidden_states=x,
774
+ w1=layer.w13_weight,
775
+ w2=layer.w2_weight,
776
+ w1_scale=layer.w13_weight_scale,
777
+ w2_scale=layer.w2_weight_scale,
778
+ topk_weights=topk_weights,
779
+ topk_ids=topk_ids,
780
+ top_k=top_k,
781
+ expert_map=expert_map,
782
+ moe_all_to_all_group_name=self.moe_all_to_all_group_name,
783
+ log2phy=log2phy,
784
+ global_redundant_expert_num=global_redundant_expert_num,
785
+ shared_experts=shared_experts)
786
+ elif fused_moe_state in [
787
+ FusedMoEState.AllGather, FusedMoEState.NaiveMulticast
788
+ ]:
789
+ return fused_experts(hidden_states=x,
790
+ w1=layer.w13_weight,
791
+ w1_scale=layer.w13_weight_scale,
792
+ w2=layer.w2_weight,
793
+ w2_scale=layer.w2_weight_scale,
794
+ topk_weights=topk_weights,
795
+ topk_ids=topk_ids,
796
+ top_k=top_k,
797
+ expert_map=expert_map)
798
+ else:
799
+ # The current implementation of deepseek moe splits hidden_states
800
+ # according to tp_size before they are feed into fused_moe module.
801
+ # Therefore, all2all is needed no matter how dp/tp is set so as to
802
+ # dispatch/combine tokens.
803
+ return fused_experts_with_all2all(
804
+ hidden_states=x,
805
+ w1=layer.w13_weight,
806
+ w1_scale=layer.w13_weight_scale,
807
+ w2=layer.w2_weight,
808
+ w2_scale=layer.w2_weight_scale,
809
+ topk_weights=topk_weights,
810
+ topk_ids=topk_ids,
811
+ top_k=top_k,
812
+ expert_map=expert_map,
813
+ ep_group=self.ep_group,
814
+ log2phy=log2phy,
815
+ global_redundant_expert_num=global_redundant_expert_num,
816
+ )
817
+
818
+ def process_weights_after_loading(self, layer):
819
+ if self.transpose_weight:
820
+ layer.w13_weight.data = layer.w13_weight.data.transpose(
821
+ 1, 2).contiguous()
822
+ layer.w2_weight.data = layer.w2_weight.data.transpose(
823
+ 1, 2).contiguous()
824
+ layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
825
+ layer.w13_weight_scale.data.shape[0], -1)
826
+ layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
827
+ layer.w13_weight_offset.data.shape[0], -1)
828
+ layer.w2_weight_scale.data = layer.w2_weight_scale.data.view(
829
+ layer.w2_weight_scale.data.shape[0], -1)
830
+ layer.w2_weight_offset.data = layer.w2_weight_offset.data.view(
831
+ layer.w2_weight_offset.data.shape[0], -1)
inference/vllm_ascend/utils.py ADDED
@@ -0,0 +1,563 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # Copyright 2023 The vLLM team.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # This file is a part of the vllm-ascend project.
17
+ # Adapted from vllm-project/vllm/vllm/worker/worker.py
18
+ #
19
+
20
+ import atexit
21
+ import fcntl
22
+ import math
23
+ import os
24
+ import shutil
25
+ from contextlib import contextmanager, nullcontext
26
+ from enum import Enum
27
+ from threading import Lock
28
+ from typing import TYPE_CHECKING, List, Tuple
29
+
30
+ import torch
31
+ import torch_npu # noqa: F401
32
+ from packaging.version import InvalidVersion, Version
33
+ from torch_npu.npu.streams import Event
34
+ from vllm.logger import logger
35
+
36
+ import vllm_ascend.envs as envs
37
+ from vllm_ascend.ascend_config import get_ascend_config
38
+
39
+ try:
40
+ # Recent release of torchair has moved these ops to `.scope`.
41
+ from torchair.scope import npu_stream_switch as _npu_stream_switch
42
+ from torchair.scope import npu_wait_tensor as _npu_wait_tensor
43
+ except ImportError:
44
+ from torchair.ops import NpuStreamSwitch as _npu_stream_switch
45
+ from torchair.ops import npu_wait_tensor as _npu_wait_tensor
46
+
47
+ if TYPE_CHECKING:
48
+ from vllm.config import VllmConfig
49
+ else:
50
+ VllmConfig = None
51
+
52
+ # NOTE: Currently, we can only capture 1920 graphs at most,
53
+ # due to the limitation of ACL graph. This number is bounded by
54
+ # the number of streams, which is 2048; we reserve 128 streams
55
+ # as a buffer.
56
+ # Maximum number of graphs that can be captured by ACL Graph
57
+ MAX_CAPTURE_SIZE = 1920
58
+
59
+ ASCEND_QUATIZATION_METHOD = "ascend"
60
+ SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"]
61
+
62
+ ACL_FORMAT_FRACTAL_ND = 2
63
+ ACL_FORMAT_FRACTAL_NZ = 29
64
+
65
+ _CUSTOM_OP_ENABLED = None
66
+ _IS_310P = None
67
+ _SLEEP_MODE_ENABLED = None
68
+ _CURRENT_STREAM = None
69
+
70
+
71
+ def is_310p():
72
+ global _IS_310P
73
+ if _IS_310P is None:
74
+ from vllm_ascend import _build_info # type: ignore
75
+ _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p")
76
+ return _IS_310P
77
+
78
+
79
+ def sleep_mode_enabled():
80
+ global _SLEEP_MODE_ENABLED
81
+ if _SLEEP_MODE_ENABLED is None:
82
+ from vllm_ascend import _build_info # type: ignore
83
+ _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__
84
+ return _SLEEP_MODE_ENABLED
85
+
86
+
87
+ def _round_up(x: int, align: int):
88
+ # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc.
89
+ # input: 15, 16 -> output: 16
90
+ # input: 17, 16 -> output: 32
91
+ # input: 30, 16 -> output: 32
92
+ # input: 33, 16 -> output: 48
93
+ # ...
94
+ return (x + align - 1) // align * align
95
+
96
+
97
+ def _custom_pad(x, pad_dims):
98
+ # pad the input tensor to the shape of pad_dims
99
+ # input: (13, 30), pad_dims: [0, 2, 0, 3]
100
+ # output: (16, 32)
101
+ return torch.nn.functional.pad(x, pad_dims)
102
+
103
+
104
+ def _custom_reshape(x, target_shape):
105
+ # reshape the input tensor to the shape of target_shape
106
+ # input: (16, 32), target_shape: [1, 16, 2, 16]
107
+ # output: (1, 16, 2, 16)
108
+ return x.reshape(target_shape)
109
+
110
+
111
+ def _custom_transpose(x, dim1, dim2):
112
+ # transpose the input tensor
113
+ # input: (1, 16, 2, 16), dim1: 1, dim2: 2
114
+ # output: (1, 2, 16, 16)
115
+ return x.transpose(dim1, dim2)
116
+
117
+
118
+ def nd_to_nz_2d(in_tensor: torch.Tensor) -> torch.Tensor:
119
+ # in_tensor: (13, 30)
120
+ aux_dims = [1, 0, 0, 16]
121
+ # aux_dims[1]: 16
122
+ aux_dims[1] = _round_up(in_tensor.size(0), 16)
123
+ # aux_dims[2]: 2
124
+ aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16
125
+
126
+ # after: aux_dims: [1, 16, 2, 16]
127
+
128
+ pad_dims = [0, 0, 0, 0]
129
+ # pad_dims[1]: 2
130
+ pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1)
131
+ # pad_dims[3]: 3
132
+ pad_dims[3] = _round_up(in_tensor.size(0), 16) - in_tensor.size(0)
133
+
134
+ # after: pad_dims: [0, 2, 0, 3]
135
+
136
+ # return: (1, 2, 16, 16)
137
+ return _custom_transpose(
138
+ _custom_reshape(_custom_pad(in_tensor, pad_dims), aux_dims), 1,
139
+ 2).contiguous()
140
+
141
+
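Illustrative sketch (not part of the uploaded file): the ND -> NZ transformation performed by nd_to_nz_2d above, traced on the (13, 30) example from its comments.

    import torch

    x = torch.randn(13, 30)
    rows_pad = (x.size(0) + 15) // 16 * 16       # 16
    cols_pad = (x.size(1) + 15) // 16 * 16       # 32

    # pad to multiples of 16, then split the column dimension into 16-wide blocks
    padded = torch.nn.functional.pad(x, [0, cols_pad - x.size(1), 0, rows_pad - x.size(0)])  # (16, 32)
    nz = padded.reshape(1, rows_pad, cols_pad // 16, 16).transpose(1, 2).contiguous()
    print(nz.shape)  # torch.Size([1, 2, 16, 16])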
142
+ def nd_to_nz_spec(mask_tensor: torch.Tensor) -> torch.Tensor:
143
+ num_tokens = mask_tensor.shape[0]
144
+ max_seq_len = mask_tensor.shape[1]
145
+
146
+ tokens_pad = (num_tokens + 15) // 16 * 16
147
+ max_seq_len_pad = (max_seq_len + 15) // 16 * 16
148
+
149
+ mask_tensor_pad = \
150
+ torch.zeros((1, tokens_pad, max_seq_len_pad), dtype=mask_tensor.dtype, device=mask_tensor.device)
151
+ mask_tensor_pad[0][:num_tokens, :max_seq_len] = mask_tensor
152
+ mask = mask_tensor_pad.reshape(
153
+ (1, tokens_pad, max_seq_len_pad // 16, 16)).permute(0, 2, 1, 3)
154
+ return mask
155
+
156
+
157
+ def aligned_16(tensor: torch.Tensor):
158
+ """Aligned tensor for 310P"""
159
+
160
+ # Get the size of the current 0th dimension
161
+ n = tensor.size(0)
162
+
163
+ # Calculate the aligned size
164
+ n_aligned = ((n + 15) // 16) * 16
165
+
166
+ # If already aligned, return the original tensor
167
+ if n == n_aligned:
168
+ return tensor
169
+
170
+ # Create a new tensor with shape (n_aligned, H, W) and fill it with zeros
171
+ new_tensor = torch.zeros(n_aligned,
172
+ *tensor.shape[1:],
173
+ dtype=tensor.dtype,
174
+ device=tensor.device)
175
+
176
+ # Copy the original tensor to the first N positions of the new tensor
177
+ new_tensor[:n] = tensor
178
+
179
+ return new_tensor
180
+
181
+
182
+ def maybe_converting_weight_acl_format(model, format=ACL_FORMAT_FRACTAL_NZ):
183
+ # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ
184
+ # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ
185
+ # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this
186
+ # conversion when using torchair graph mode on 300I Duo platform.
187
+ # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
188
+ # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
189
+ from vllm.model_executor.layers.fused_moe.layer import FusedMoE
190
+
191
+ use_torchair = get_ascend_config().torchair_graph_config.enabled
192
+ if not is_310p() or not use_torchair:
193
+ return
194
+ for module in model.modules():
195
+ if isinstance(module, FusedMoE):
196
+ if torch_npu.get_npu_format(module.w13_weight.data) == format:
197
+ return
198
+ module.w13_weight.data = torch_npu.npu_format_cast(
199
+ module.w13_weight.data, format)
200
+ module.w2_weight.data = torch_npu.npu_format_cast(
201
+ module.w2_weight.data, format)
202
+
203
+
204
+ def try_register_lib(lib_name: str, lib_info: str = ""):
205
+ import importlib
206
+ import importlib.util
207
+ try:
208
+ module_spec = importlib.util.find_spec(lib_name)
209
+ if module_spec is not None:
210
+ importlib.import_module(lib_name)
211
+ if lib_info:
212
+ logger.info(lib_info)
213
+ except Exception:
214
+ pass
215
+
216
+
217
+ def enable_custom_op():
218
+ """
219
+ Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
220
+ Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
221
+ """
222
+ global _CUSTOM_OP_ENABLED
223
+ if _CUSTOM_OP_ENABLED is not None:
224
+ return _CUSTOM_OP_ENABLED
225
+ try:
226
+ # register custom ops into torch_library here
227
+ import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401
228
+ _CUSTOM_OP_ENABLED = True
229
+ except ImportError:
230
+ _CUSTOM_OP_ENABLED = False
231
+ logger.warning(
232
+ "Warning: Failed to register custom ops, all custom ops will be disabled"
233
+ )
234
+ return _CUSTOM_OP_ENABLED
235
+
236
+
237
+ def find_hccl_library() -> str:
238
+ """
239
+ We either use the library file specified by the `HCCL_SO_PATH`
240
+ environment variable, or we find the library file brought by PyTorch.
241
+ After importing `torch`, `libhccl.so` can be
242
+ found by `ctypes` automatically.
243
+ """
244
+ so_file = envs.HCCL_SO_PATH
245
+
246
+ # manually load the hccl library
247
+ if so_file:
248
+ logger.info("Found hccl from environment variable HCCL_SO_PATH=%s",
249
+ so_file)
250
+ else:
251
+ if torch.version.cann is not None:
252
+ so_file = "libhccl.so"
253
+ else:
254
+ raise ValueError("HCCL only supports Ascend NPU backends.")
255
+ logger.info("Found hccl from library %s", so_file)
256
+ return so_file
257
+
258
+
259
+ def current_stream() -> torch.npu.Stream:
260
+ """
261
+ replace `torch.npu.current_stream()` with `vllm.utils.current_stream()`.
262
+ it turns out that `torch.npu.current_stream()` is quite expensive,
263
+ as it will construct a new stream object at each call.
264
+ here we patch `torch.npu.set_stream` to keep track of the current stream
265
+ directly, so that we can avoid calling `torch.npu.current_stream()`.
266
+
267
+ """
268
+ global _CURRENT_STREAM
269
+ if _CURRENT_STREAM is None:
270
+ # when this function is called before any stream is set,
271
+ # we return the default stream.
272
+ _CURRENT_STREAM = torch.npu.current_stream()
273
+ return _CURRENT_STREAM
274
+
275
+
276
+ def adapt_patch(is_global_patch: bool = False):
277
+ if is_global_patch:
278
+ from vllm_ascend.patch import platform # noqa: F401
279
+ else:
280
+ from vllm_ascend.patch import worker # noqa: F401
281
+
282
+
283
+ def vllm_version_is(target_vllm_version: str):
284
+ if envs.VLLM_VERSION is not None:
285
+ vllm_version = envs.VLLM_VERSION
286
+ else:
287
+ import vllm
288
+ vllm_version = vllm.__version__
289
+ try:
290
+ return Version(vllm_version) == Version(target_vllm_version)
291
+ except InvalidVersion:
292
+ raise ValueError(
293
+ f"Invalid vllm version {vllm_version} found. A dev version of vllm "
294
+ "is installed probably. Set the environment variable VLLM_VERSION "
295
+ "to control it by hand. And please make sure the value follows the "
296
+ "format of x.y.z.")
297
+
298
+
299
+ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
300
+ """Update ACL graph capture sizes based on hardware limitations"""
301
+ # Store original configuration and temporarily clear it
302
+ compilation_config = vllm_config.compilation_config
303
+ original_sizes, compilation_config.cudagraph_capture_sizes = \
304
+ compilation_config.cudagraph_capture_sizes, None
305
+
306
+ # Calculate parallel configuration factor
307
+ num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
308
+ parallel_config = vllm_config.parallel_config
309
+
310
+ # TODO: Find out whether we need to take into account the pp_size
311
+ parallel_factor = 1 + sum(size > 1 for size in [
312
+ parallel_config.data_parallel_size_local,
313
+ parallel_config.tensor_parallel_size,
314
+ parallel_config.expert_parallel_size,
315
+ parallel_config.expert_tensor_parallel_size,
316
+ ])
317
+
318
+ # Calculate maximum supported batch sizes considering model architecture
319
+ max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
320
+ (num_hidden_layers + 1) / parallel_factor)
321
+ logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
322
+ max_num_batch_sizes)
323
+
324
+ # If original sizes exceed maximum, sample a representative subset
325
+ if max_num_batch_sizes < len(original_sizes):
326
+ # Sample uniformly from original sizes
327
+ step = (len(original_sizes) - 1) / (max_num_batch_sizes - 1)
328
+ indices = [round(i * step) for i in range(max_num_batch_sizes)]
329
+
330
+ # Ensure first and last elements are preserved
331
+ indices[0], indices[-1] = 0, len(original_sizes) - 1
332
+
333
+ sampled_sizes = [original_sizes[i] for i in indices]
334
+ compilation_config.init_with_cudagraph_sizes(sampled_sizes)
335
+
336
+ logger.info(
337
+ "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
338
+ vllm_config.model_config.architectures[0],
339
+ num_hidden_layers,
340
+ len(original_sizes),
341
+ len(compilation_config.
342
+ cudagraph_capture_sizes # type: ignore[arg-type]
343
+ ))
344
+ else:
345
+ # No adjustment needed
346
+ compilation_config.cudagraph_capture_sizes = original_sizes
347
+ logger.info(
348
+ "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
349
+ vllm_config.model_config.architectures[0], num_hidden_layers,
350
+ len(original_sizes))
351
+
352
+
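Illustrative sketch (not part of the uploaded file): the uniform subsampling of capture sizes performed above when the original list exceeds the computed maximum; the numbers are made up.

    original_sizes = list(range(1, 41))          # 40 candidate batch sizes
    max_num_batch_sizes = 8

    step = (len(original_sizes) - 1) / (max_num_batch_sizes - 1)
    indices = [round(i * step) for i in range(max_num_batch_sizes)]
    indices[0], indices[-1] = 0, len(original_sizes) - 1   # always keep the smallest and largest size

    sampled_sizes = [original_sizes[i] for i in indices]
    print(sampled_sizes)  # [1, 7, 12, 18, 23, 29, 34, 40]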
353
+ # TODO(wxy): Move to ops module
354
+ def dispose_tensor(x: torch.Tensor):
355
+ x.set_(torch.empty((0, ), device=x.device, dtype=x.dtype))
356
+
357
+
358
+ class ProfileExecuteDuration:
359
+ _instance = None
360
+ _observations: List[Tuple[str, Event, Event]] = []
361
+ _lock = Lock()
362
+
363
+ def __new__(cls):
364
+ with cls._lock:
365
+ if cls._instance is None:
366
+ cls._instance = super().__new__(cls)
367
+ atexit.register(cls._instance.destroy)
368
+ return cls._instance
369
+
370
+ def destroy(self):
371
+ with self._lock:
372
+ self._observations.clear()
373
+
374
+ @contextmanager
375
+ def capture_async(self, duration_tag: str):
376
+ if not envs.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE:
377
+ yield
378
+ return
379
+
380
+ observe_start = Event(enable_timing=True)
381
+ observe_start.record()
382
+ try:
383
+ yield
384
+ finally:
385
+ observe_end = Event(enable_timing=True)
386
+ observe_end.record()
387
+ with self._lock:
388
+ self._observations.append(
389
+ (duration_tag, observe_start, observe_end))
390
+
391
+ def pop_captured_sync(self) -> dict:
392
+ """Pop and synchronize all events in the observation list"""
393
+ durations: dict[str, float] = {}
394
+ if not envs.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE:
395
+ return durations
396
+
397
+ while self._observations:
398
+ with self._lock:
399
+ tag, observe_start, observe_end = self._observations.pop()
400
+ observe_end.synchronize()
401
+ durations[tag] = observe_start.elapsed_time(observe_end)
402
+
403
+ return durations
404
+
405
+
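Illustrative usage sketch (not part of the uploaded file) for ProfileExecuteDuration; it only records timings on an Ascend NPU with VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE enabled, otherwise capture_async is a no-op.

    from vllm_ascend.utils import ProfileExecuteDuration

    profiler = ProfileExecuteDuration()          # singleton
    with profiler.capture_async("prepare_inputs"):
        pass                                     # ... build model inputs ...
    with profiler.capture_async("forward"):
        pass                                     # ... run the model ...

    durations = profiler.pop_captured_sync()     # {tag: elapsed time per Event.elapsed_time}
    print(durations)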
406
+ # TODO(wxy): Move to ops module
407
+ def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True):
408
+ return _npu_stream_switch(tag, priority) if enabled else nullcontext()
409
+
410
+
411
+ # TODO(wxy): Move to ops module
412
+ def npu_wait_tensor(self: torch.Tensor,
413
+ dependency: torch.Tensor,
414
+ *,
415
+ enabled: bool = True):
416
+ return _npu_wait_tensor(self, dependency) if enabled else self
417
+
418
+
419
+ # TODO(wxy): Move to ops module
420
+ def npu_prefetch(input: torch.Tensor,
421
+ dependency: torch.Tensor,
422
+ max_size: int = 0,
423
+ *,
424
+ enabled: bool = True):
425
+ if not enabled:
426
+ return
427
+ input_size = input.element_size() * input.numel()
428
+ if max_size <= 0 or max_size > input_size:
429
+ max_size = input_size
430
+ torch_npu.npu_prefetch(input, dependency, max_size)
431
+
432
+
433
+ # TODO(zzzzwwjj): move this into forward_context
434
+ class FusedMoEState(Enum):
435
+ AllGather = 0
436
+ All2All = 1
437
+ MC2 = 2
438
+ AllGatherEP = 3
439
+ NaiveMulticast = 4
440
+
441
+
442
+ # TODO(ttanzhiqiang): rm_router_logits
443
+ # Triggered only when dp > 1.
+ # In theory this optimization only applies to AllGather and AllGatherEP: in the dp scenario the
+ # previous flow was gate + two communications, and it is now one communication + gate, which saves
+ # some communication time. All MoE AllGather/AllGatherEP paths could follow this logic, but the dp
+ # paths of other MoE models (e.g. qwen3-235b) have not been adapted yet, so a switch is used to
+ # control it and avoid breaking them.
445
+ def get_rm_router_logits_state(ep_size: int, dp_size: int,
446
+ is_deepseek_v3_r1: bool):
447
+ # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
448
+ # only supports deepseek v3/r1
449
+ if dp_size > 1:
450
+ if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
451
+ and is_deepseek_v3_r1):
452
+ return True
453
+ elif ep_size == 1 and is_deepseek_v3_r1:
454
+ return True
455
+ return False
456
+
457
+
458
+ # TODO(ttanzhiqiang): all_reduce merge
459
+ # When all_reduce_merge is enabled, shared_experts does not perform all_reduce inside the MLP;
+ # the all_reduce is deferred until both shared_experts and router_experts have finished.
460
+ # Currently, all_reduce_merge is enabled by default in the AllGather, AllGatherEP and NaiveMulticast scenarios of the deepseek model.
461
+ def get_all_reduce_merge_state(ep_size: int, is_deepseek_v3_r1: bool):
462
+ # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
463
+ # only supports deepseek v3/r1
464
+ if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
465
+ and is_deepseek_v3_r1):
466
+ return True
467
+ elif ep_size == 1 and is_deepseek_v3_r1:
468
+ return True
469
+ return False
470
+
471
+
472
+ # TODO(zzzzwwjj): add soc_version to choose branch
473
+ def get_fused_moe_state(ep_size: int, with_prefill: bool,
474
+ is_deepseek_v3_r1: bool):
475
+ # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
476
+ # only supports deepseek v3/r1
477
+ if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
478
+ and is_deepseek_v3_r1 and not with_prefill):
479
+ return FusedMoEState.AllGatherEP
480
+ elif ep_size == 1:
481
+ if with_prefill:
482
+ return FusedMoEState.NaiveMulticast
483
+ else:
484
+ return FusedMoEState.AllGather
485
+ # NOTE: MC2 needs ep_size >= 16, and all2all cannot be used in torchair graph mode.
486
+ elif ep_size < 16 or with_prefill:
487
+ return FusedMoEState.All2All
488
+ else:
489
+ return FusedMoEState.MC2
490
+
491
+
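Illustrative sketch (not part of the uploaded file): the dispatch-strategy choice made by get_fused_moe_state above, mirrored with plain strings and with the AllGatherEP env-flag branch left out for brevity.

    def moe_state(ep_size: int, with_prefill: bool) -> str:
        if ep_size == 1:
            return "NaiveMulticast" if with_prefill else "AllGather"
        if ep_size < 16 or with_prefill:   # MC2 needs ep_size >= 16 and no prefill
            return "All2All"
        return "MC2"

    for ep, prefill in [(1, True), (1, False), (8, False), (16, True), (16, False)]:
        print(ep, prefill, moe_state(ep, prefill))
    # 1 True NaiveMulticast / 1 False AllGather / 8 False All2All / 16 True All2All / 16 False MC2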
492
+ KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes"
493
+ KV_CACHE_BYTES_CACHE_FILE_NAME = "kv_cache_bytes"
494
+ TORCHAIR_CACHE_PATH_NAME = ".torchair_cache"
495
+ TORCHAIR_CACHE_DIR = os.getenv(
496
+ 'TORCHAIR_CACHE_HOME', os.path.join(os.getcwd(), TORCHAIR_CACHE_PATH_NAME))
497
+
498
+
499
+ def get_torchair_current_work_dir(file_name=None):
500
+ if file_name is None:
501
+ return TORCHAIR_CACHE_DIR
502
+ return os.path.join(TORCHAIR_CACHE_DIR, file_name)
503
+
504
+
505
+ def check_torchair_cache_exist():
506
+ res = False
507
+ torch_air_abs_path = get_torchair_current_work_dir()
508
+ if os.path.exists(torch_air_abs_path):
509
+ file_list = os.listdir(torch_air_abs_path)
510
+ if len(file_list) != 0:
511
+ res = True
512
+ return res
513
+
514
+
515
+ def check_kv_cache_bytes_cache_exist():
516
+ res = False
517
+ kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
518
+ KV_CACHE_BYTES_CACHE_PATH_NAME)
519
+ if os.path.exists(kv_cache_bytes_cache_abs_path):
520
+ file_list = os.listdir(kv_cache_bytes_cache_abs_path)
521
+ if len(file_list) != 0:
522
+ res = True
523
+ return res
524
+
525
+
526
+ def read_kv_cache_bytes_from_file(rank) -> int:
527
+ kv_cache_bytes = -1
528
+ kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
529
+ KV_CACHE_BYTES_CACHE_PATH_NAME)
530
+ kv_cache_bytes_file = os.path.join(
531
+ kv_cache_bytes_cache_abs_path,
532
+ f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
533
+ with open(kv_cache_bytes_file, "r", encoding="utf-8") as f:
534
+ with file_lock(f, fcntl.LOCK_SH):
535
+ kv_cache_bytes = int(f.readline())
536
+ return kv_cache_bytes
537
+
538
+
539
+ @contextmanager
540
+ def file_lock(file_descriptor, lock_type):
541
+ fcntl.flock(file_descriptor, lock_type)
542
+ try:
543
+ yield
544
+ finally:
545
+ fcntl.flock(file_descriptor, fcntl.LOCK_UN)
546
+
547
+
548
+ def write_kv_cache_bytes_to_file(rank, kv_cache_bytes):
549
+ kv_cache_bytes_cache_abs_path = get_torchair_current_work_dir(
550
+ KV_CACHE_BYTES_CACHE_PATH_NAME)
551
+ os.makedirs(kv_cache_bytes_cache_abs_path, exist_ok=True)
552
+ kv_cache_bytes_file = os.path.join(
553
+ kv_cache_bytes_cache_abs_path,
554
+ f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
555
+ with open(kv_cache_bytes_file, "w", encoding="utf-8") as f:
556
+ with file_lock(f, fcntl.LOCK_EX):
557
+ f.write(f"{kv_cache_bytes}")
558
+
559
+
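Illustrative sketch (not part of the uploaded file): how the file_lock helper above serializes readers and writers of the cached kv_cache_bytes value; the path is made up and fcntl is POSIX-only.

    import fcntl

    from vllm_ascend.utils import file_lock

    path = "/tmp/0_kv_cache_bytes"            # hypothetical cache file
    with open(path, "w", encoding="utf-8") as f:
        with file_lock(f, fcntl.LOCK_EX):     # exclusive lock while writing
            f.write("1073741824")

    with open(path, "r", encoding="utf-8") as f:
        with file_lock(f, fcntl.LOCK_SH):     # shared lock while reading
            print(int(f.readline()))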
560
+ def delete_torchair_cache_file():
561
+ torch_air_abs_path = get_torchair_current_work_dir()
562
+ if os.path.exists(torch_air_abs_path):
563
+ shutil.rmtree(torch_air_abs_path)
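As a rough orientation for the dispatch logic in `get_fused_moe_state` above, the sketch below exercises its branches with made-up values (`FusedMoEState` and `envs` are defined earlier in this file; the import path assumes the `vllm_ascend` package from this repo is installed, and `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` is left unset):

```python
# Illustrative sketch only: exercises the MoE communication-strategy dispatch above.
# Assumes the vllm_ascend package from this repository is importable and that
# VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP is unset; the sizes are made up.
from vllm_ascend.utils import FusedMoEState, get_fused_moe_state

# Large expert-parallel group in pure decode -> MC2 path.
assert (get_fused_moe_state(ep_size=32, with_prefill=False, is_deepseek_v3_r1=False)
        == FusedMoEState.MC2)

# Small EP group, or prefill on a multi-rank group -> All2All.
assert (get_fused_moe_state(ep_size=8, with_prefill=True, is_deepseek_v3_r1=False)
        == FusedMoEState.All2All)

# Single-rank EP: NaiveMulticast for prefill, AllGather for decode.
assert get_fused_moe_state(1, True, False) == FusedMoEState.NaiveMulticast
assert get_fused_moe_state(1, False, False) == FusedMoEState.AllGather
```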
inference/vllm_ascend/worker/model_runner_v1.py ADDED
The diff for this file is too large to render. See raw diff
 
inference/vllm_ascend/worker/npu_input_batch.py ADDED
@@ -0,0 +1,796 @@
1
+ #
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
3
+ # Copyright 2023 The vLLM team.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # This file is a part of the vllm-ascend project.
17
+ # Adapted from vllm-project/vllm/vllm/worker/gpu_input_batch.py
18
+ #
19
+
20
+ from dataclasses import dataclass
21
+ from typing import Optional, cast, Union
22
+
23
+ import numpy as np
24
+ import torch
25
+ from vllm.lora.request import LoRARequest
26
+ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
27
+ from vllm.pooling_params import PoolingParams
28
+ from vllm.sampling_params import SamplingParams, SamplingType
29
+ from vllm.utils import swap_dict_values
30
+ from vllm.v1.outputs import LogprobsTensors
31
+ from vllm.v1.sample.logits_processor import init_builtin_logitsprocs
32
+ from vllm.v1.sample.metadata import SamplingMetadata
33
+ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
34
+ from vllm.v1.utils import copy_slice
35
+ from vllm.v1.worker.block_table import MultiGroupBlockTable
36
+
37
+ from vllm_ascend.pool.metadata import PoolingMetadata
38
+
39
+ _SAMPLING_EPS = 1e-5
40
+
41
+
42
+ @dataclass
43
+ class CachedRequestState:
44
+
45
+ req_id: str
46
+ prompt_token_ids: list[int]
47
+ mm_inputs: list[MultiModalKwargs]
48
+ mm_positions: list[PlaceholderRange]
49
+ sampling_params: Optional[SamplingParams]
50
+ pooling_params: Optional[PoolingParams]
51
+ generator: Optional[torch.Generator]
52
+
53
+ block_ids: tuple[list[int], ...]
54
+ num_computed_tokens: int
55
+ output_token_ids: list[int]
56
+
57
+ mrope_positions: Optional[torch.Tensor] = None
58
+ mrope_position_delta: Optional[int] = None
59
+
60
+ lora_request: Optional[LoRARequest] = None
61
+
62
+ def __post_init__(self):
63
+ self.num_prompt_tokens = len(self.prompt_token_ids)
64
+
65
+ @property
66
+ def num_tokens(self) -> int:
67
+ return self.num_prompt_tokens + len(self.output_token_ids)
68
+
69
+ def get_token_id(self, idx: int) -> int:
70
+ if idx < self.num_prompt_tokens:
71
+ return self.prompt_token_ids[idx]
72
+ else:
73
+ return self.output_token_ids[idx - self.num_prompt_tokens]
74
+
75
+ @dataclass
76
+ class SamplingMetadataTopNSigma(SamplingMetadata):
77
+ top_n_sigma: torch.Tensor
78
+ no_top_n_sigma: bool
79
+
80
+ class InputBatch:
81
+
82
+ def __init__(
83
+ self,
84
+ max_num_reqs: int,
85
+ max_model_len: int,
86
+ max_num_batched_tokens: int,
87
+ device: torch.device,
88
+ pin_memory: bool,
89
+ vocab_size: int,
90
+ block_sizes: list[int], # The block_size of each kv cache group
91
+ logits_processing_needs_token_ids: bool = False,
92
+ is_spec_decode: bool = False,
93
+ ):
94
+ self.is_spec_decode = is_spec_decode
95
+ self.max_num_reqs = max_num_reqs
96
+ self.max_model_len = max_model_len
97
+ self.max_num_batched_tokens = max_num_batched_tokens
98
+ self.device = device
99
+ self.pin_memory = pin_memory
100
+ self.vocab_size = vocab_size
101
+ self.logits_processing_needs_token_ids = (
102
+ logits_processing_needs_token_ids)
103
+
104
+ self._req_ids: list[Optional[str]] = []
105
+ self.req_id_to_index: dict[str, int] = {}
106
+
107
+ # TODO(woosuk): This buffer could be too large if max_model_len is big.
108
+ # Find a way to reduce the CPU memory usage.
109
+ # This buffer is not directly transferred to the NPU, so it does not
110
+ # need to be pinned.
111
+ self.token_ids_cpu_tensor = torch.zeros(
112
+ (max_num_reqs, max_model_len),
113
+ device="cpu",
114
+ dtype=torch.int32,
115
+ pin_memory=False,
116
+ )
117
+ self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
118
+ self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
119
+ self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
120
+ self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
121
+ self.num_computed_tokens_cpu_tensor = torch.zeros(
122
+ (max_num_reqs, ),
123
+ device="cpu",
124
+ dtype=torch.int32,
125
+ pin_memory=pin_memory,
126
+ )
127
+ self.num_computed_tokens_cpu = \
128
+ self.num_computed_tokens_cpu_tensor.numpy()
129
+
130
+ # Block table.
131
+ self.block_table = MultiGroupBlockTable(
132
+ max_num_reqs=max_num_reqs,
133
+ max_model_len=max_model_len,
134
+ max_num_batched_tokens=max_num_batched_tokens,
135
+ pin_memory=pin_memory,
136
+ device=device,
137
+ block_sizes=block_sizes,
138
+ )
139
+
140
+ # Sampling-related.
141
+ self.temperature = torch.empty((max_num_reqs, ),
142
+ dtype=torch.float32,
143
+ device=device)
144
+ self.temperature_cpu_tensor = torch.empty((max_num_reqs, ),
145
+ dtype=torch.float32,
146
+ device="cpu",
147
+ pin_memory=pin_memory)
148
+ self.temperature_cpu = self.temperature_cpu_tensor.numpy()
149
+ self.greedy_reqs: set[str] = set()
150
+ self.random_reqs: set[str] = set()
151
+
152
+ self.top_p = torch.empty((max_num_reqs, ),
153
+ dtype=torch.float32,
154
+ device=device)
155
+ self.top_p_cpu_tensor = torch.empty((max_num_reqs, ),
156
+ dtype=torch.float32,
157
+ device="cpu",
158
+ pin_memory=pin_memory)
159
+ self.top_p_cpu = self.top_p_cpu_tensor.numpy()
160
+ self.top_p_reqs: set[str] = set()
161
+
162
+ self.top_k = torch.empty((max_num_reqs, ),
163
+ dtype=torch.int32,
164
+ device=device)
165
+ self.top_k_cpu_tensor = torch.empty((max_num_reqs, ),
166
+ dtype=torch.int32,
167
+ device="cpu",
168
+ pin_memory=pin_memory)
169
+ self.top_k_cpu = self.top_k_cpu_tensor.numpy()
170
+ self.top_k_reqs: set[str] = set()
171
+
172
+ # IDs of requests which do not support spec decoding
173
+ self.spec_decode_unsupported_reqs: set[str] = set()
174
+
175
+ self.min_p = torch.empty((max_num_reqs, ),
176
+ dtype=torch.float32,
177
+ device=device)
178
+ self.min_p_cpu_tensor = torch.empty((max_num_reqs, ),
179
+ dtype=torch.float32,
180
+ device="cpu",
181
+ pin_memory=pin_memory)
182
+ self.min_p_cpu = self.min_p_cpu_tensor.numpy()
183
+ self.min_p_reqs: set[str] = set()
184
+
185
+ # top_n_sigma sampling related data structures
186
+ self.top_n_sigma = torch.empty((max_num_reqs, ),
187
+ dtype=torch.float,
188
+ device=device)
189
+ self.top_n_sigma_cpu_tensor = torch.empty(
190
+ (max_num_reqs, ),
191
+ dtype=torch.float,
192
+ device="cpu",
193
+ pin_memory=pin_memory)
194
+ self.top_n_sigma_cpu = \
195
+ self.top_n_sigma_cpu_tensor.numpy()
196
+ self.top_n_sigma_reqs: set[str] = set()
197
+
198
+ # Frequency penalty related data structures
199
+ self.frequency_penalties = torch.empty((max_num_reqs, ),
200
+ dtype=torch.float,
201
+ device=device)
202
+ self.frequency_penalties_cpu_tensor = torch.empty(
203
+ (max_num_reqs, ),
204
+ dtype=torch.float,
205
+ device="cpu",
206
+ pin_memory=pin_memory)
207
+ self.frequency_penalties_cpu = \
208
+ self.frequency_penalties_cpu_tensor.numpy()
209
+ self.frequency_penalties_reqs: set[str] = set()
210
+
211
+ # Presence penalty related data structures
212
+ self.presence_penalties = torch.empty((max_num_reqs, ),
213
+ dtype=torch.float,
214
+ device=device)
215
+ self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
216
+ dtype=torch.float,
217
+ device="cpu",
218
+ pin_memory=pin_memory)
219
+ self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy(
220
+ )
221
+ self.presence_penalties_reqs: set[str] = set()
222
+
223
+ # Repetition penalty related data structures
224
+ self.repetition_penalties = torch.empty((max_num_reqs, ),
225
+ dtype=torch.float,
226
+ device=device)
227
+ self.repetition_penalties_cpu_tensor = torch.empty(
228
+ (max_num_reqs, ),
229
+ dtype=torch.float,
230
+ device="cpu",
231
+ pin_memory=pin_memory)
232
+ self.repetition_penalties_cpu = \
233
+ self.repetition_penalties_cpu_tensor.numpy()
234
+ self.repetition_penalties_reqs: set[str] = set()
235
+
236
+ # req_index -> (min_tokens, stop_token_ids)
237
+ self.min_tokens: dict[int, tuple[int, set[int]]] = {}
238
+
239
+ # lora related
240
+ self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
241
+ dtype=np.int32)
242
+ self.lora_id_to_request_ids: dict[int, set[str]] = {}
243
+ self.lora_id_to_lora_request: dict[int, LoRARequest] = {}
244
+
245
+ # req_index -> generator
246
+ # NOTE(woosuk): The indices of the requests that do not have their own
247
+ # generator should not be included in the dictionary.
248
+ self.generators: dict[int, torch.Generator] = {}
249
+
250
+ self.num_logprobs: dict[str, int] = {}
251
+ # NOTE(rob): num_prompt_logprobs only includes reqs
252
+ # that are currently in the prefill phase.
253
+ self.num_prompt_logprobs: dict[str, int] = {}
254
+
255
+ # To accumulate prompt logprobs tensor chunks across prefill steps.
256
+ self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
257
+
258
+ self.logit_bias: list[Optional[dict[int,
259
+ float]]] = [None] * max_num_reqs
260
+ self.has_allowed_token_ids: set[str] = set()
261
+ # NOTE(lufang): In the mask tensor, if the corresponding token is allowed,
262
+ # the value is False, since we use masked_fill_ to set -inf.
263
+ self.allowed_token_ids_mask: Optional[torch.Tensor] = None
264
+ self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
265
+
266
+ # req_index -> bad_words_token_ids
267
+ self.bad_words_token_ids: dict[int, list[list[int]]] = {}
268
+
269
+ self.req_output_token_ids: list[Optional[list[int]]] = []
270
+
271
+ # Define logits processors.
272
+ # TODO(andy): logits processor list should be extensible via engine
273
+ # constructor argument; for now the list is fixed.
274
+ self.logitsprocs = init_builtin_logitsprocs(
275
+ pin_memory_available=pin_memory,
276
+ max_num_reqs=max_num_reqs + 1,
277
+ device=device)
278
+
279
+ # This is updated each time the batch constituents change.
280
+ self.sampling_metadata = self._make_sampling_metadata()
281
+
282
+ self.pooling_params: dict[str, PoolingParams] = {}
283
+
284
+ @property
285
+ def req_ids(self) -> list[str]:
286
+ # None elements should only be present transiently
287
+ # while performing state updates to the batch.
288
+ return cast(list[str], self._req_ids)
289
+
290
+ def add_request(
291
+ self,
292
+ request: "CachedRequestState",
293
+ req_index: Optional[int] = None,
294
+ ) -> None:
295
+ if req_index is None:
296
+ req_index = self.num_reqs
297
+ assert req_index < self.max_num_reqs
298
+
299
+ req_id = request.req_id
300
+ if req_index == len(self._req_ids):
301
+ self._req_ids.append(req_id)
302
+ self.req_output_token_ids.append(request.output_token_ids)
303
+ else:
304
+ self._req_ids[req_index] = req_id
305
+ self.req_output_token_ids[req_index] = request.output_token_ids
306
+
307
+ self.req_id_to_index[req_id] = req_index
308
+
309
+ # Copy the prompt token ids and output token ids.
310
+ num_prompt_tokens = len(request.prompt_token_ids)
311
+ self.num_prompt_tokens[req_index] = num_prompt_tokens
312
+ self.token_ids_cpu[
313
+ req_index, :num_prompt_tokens] = request.prompt_token_ids
314
+ start_idx = num_prompt_tokens
315
+ end_idx = start_idx + len(request.output_token_ids)
316
+ self.token_ids_cpu[req_index,
317
+ start_idx:end_idx] = request.output_token_ids
318
+ # Number of token ids in token_ids_cpu.
319
+ # NOTE(woosuk): This may include spec decode tokens.
320
+ self.num_tokens[req_index] = request.num_tokens
321
+ # Number of tokens without spec decode tokens.
322
+ self.num_tokens_no_spec[req_index] = request.num_tokens
323
+
324
+ self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
325
+ self.block_table.add_row(request.block_ids, req_index)
326
+
327
+ if sampling_params := request.sampling_params:
328
+ if self.is_spec_decode and is_spec_decode_unsupported(
329
+ sampling_params):
330
+ self.spec_decode_unsupported_reqs.add(req_id)
331
+ if sampling_params.sampling_type == SamplingType.GREEDY:
332
+ # Avoid later division by zero.
333
+ self.temperature_cpu[req_index] = -1.0
334
+ self.greedy_reqs.add(req_id)
335
+ else:
336
+ self.temperature_cpu[req_index] = sampling_params.temperature
337
+ self.random_reqs.add(req_id)
338
+
339
+ self.top_p_cpu[req_index] = sampling_params.top_p
340
+ if sampling_params.top_p < 1:
341
+ self.top_p_reqs.add(req_id)
342
+ top_k = sampling_params.top_k
343
+ if 0 < top_k < self.vocab_size:
344
+ self.top_k_reqs.add(req_id)
345
+ else:
346
+ top_k = self.vocab_size
347
+ self.top_k_cpu[req_index] = top_k
348
+ self.min_p_cpu[req_index] = sampling_params.min_p
349
+ self.frequency_penalties_cpu[
350
+ req_index] = sampling_params.frequency_penalty
351
+ if sampling_params.min_p > _SAMPLING_EPS:
352
+ self.min_p_reqs.add(req_id)
353
+ if sampling_params.frequency_penalty != 0.0:
354
+ self.frequency_penalties_reqs.add(req_id)
355
+ self.presence_penalties_cpu[
356
+ req_index] = sampling_params.presence_penalty
357
+ if sampling_params.presence_penalty != 0.0:
358
+ self.presence_penalties_reqs.add(req_id)
359
+ self.repetition_penalties_cpu[
360
+ req_index] = sampling_params.repetition_penalty
361
+ if sampling_params.repetition_penalty != 1.0:
362
+ self.repetition_penalties_reqs.add(req_id)
363
+ if sampling_params.min_tokens:
364
+ self.min_tokens[req_index] = (
365
+ sampling_params.min_tokens,
366
+ sampling_params.all_stop_token_ids)
367
+
368
+ if sampling_params.extra_args and "top_n_sigma" in sampling_params.extra_args:
369
+ self.top_n_sigma_cpu[
370
+ req_index] = sampling_params.extra_args["top_n_sigma"]
371
+ self.top_n_sigma_reqs.add(req_id)
372
+ else:
373
+ self.top_n_sigma_cpu[req_index] = -1
374
+
375
+ # NOTE(woosuk): self.generators should not include the requests that
376
+ # do not have their own generator.
377
+ if request.generator is not None:
378
+ self.generators[req_index] = request.generator
379
+
380
+ if sampling_params.logprobs is not None:
381
+ self.num_logprobs[req_id] = sampling_params.logprobs
382
+ if sampling_params.prompt_logprobs is not None:
383
+ self.num_prompt_logprobs[
384
+ req_id] = sampling_params.prompt_logprobs
385
+ if sampling_params.logit_bias is not None:
386
+ self.logit_bias[req_index] = sampling_params.logit_bias
387
+
388
+ if sampling_params.allowed_token_ids:
389
+ self.has_allowed_token_ids.add(req_id)
390
+ if self.allowed_token_ids_mask_cpu_tensor is None:
391
+ # Lazy allocation for this tensor, which can be large.
392
+ # False means we don't fill with -inf.
393
+ self.allowed_token_ids_mask = torch.zeros(
394
+ self.max_num_reqs,
395
+ self.vocab_size,
396
+ dtype=torch.bool,
397
+ device=self.device)
398
+ self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
399
+ self.max_num_reqs,
400
+ self.vocab_size,
401
+ dtype=torch.bool,
402
+ device="cpu")
403
+ self.allowed_token_ids_mask_cpu_tensor[req_index] = True
404
+ # False means we don't fill with -inf.
405
+ self.allowed_token_ids_mask_cpu_tensor[req_index][
406
+ sampling_params.allowed_token_ids] = False
407
+
408
+ if sampling_params.bad_words_token_ids:
409
+ self.bad_words_token_ids[
410
+ req_index] = sampling_params.bad_words_token_ids
411
+ else:
412
+ assert request.pooling_params is not None
413
+ self.pooling_params[req_id] = request.pooling_params
414
+
415
+ # Add request lora ID
416
+ if request.lora_request:
417
+ lora_id = request.lora_request.lora_int_id
418
+ if lora_id not in self.lora_id_to_request_ids:
419
+ self.lora_id_to_request_ids[lora_id] = set()
420
+
421
+ self.request_lora_mapping[req_index] = lora_id
422
+ self.lora_id_to_request_ids[lora_id].add(request.req_id)
423
+ self.lora_id_to_lora_request[lora_id] = request.lora_request
424
+ else:
425
+ # No LoRA
426
+ self.request_lora_mapping[req_index] = 0
427
+
428
+ def remove_request(self, req_id: str) -> Optional[int]:
429
+ """This method must always be followed by a call to condense()."""
430
+
431
+ req_index = self.req_id_to_index.pop(req_id, None)
432
+ if req_index is None:
433
+ return None
434
+ self._req_ids[req_index] = None
435
+ self.req_output_token_ids[req_index] = None
436
+
437
+ self.greedy_reqs.discard(req_id)
438
+ self.random_reqs.discard(req_id)
439
+ self.top_p_reqs.discard(req_id)
440
+ self.top_k_reqs.discard(req_id)
441
+ self.min_p_reqs.discard(req_id)
442
+ self.min_tokens.pop(req_index, None)
443
+ self.frequency_penalties_reqs.discard(req_id)
444
+ self.presence_penalties_reqs.discard(req_id)
445
+ self.repetition_penalties_reqs.discard(req_id)
446
+ self.spec_decode_unsupported_reqs.discard(req_id)
447
+ self.top_n_sigma_reqs.discard(req_id)
448
+ self.generators.pop(req_index, None)
449
+ self.num_logprobs.pop(req_id, None)
450
+ self.num_prompt_logprobs.pop(req_id, None)
451
+ self.in_progress_prompt_logprobs_cpu.pop(req_id, None)
452
+
453
+ # LoRA
454
+ lora_id = self.request_lora_mapping[req_index]
455
+ if lora_id != 0:
456
+ self.lora_id_to_request_ids[lora_id].discard(req_id)
457
+ if len(self.lora_id_to_request_ids[lora_id]) == 0:
458
+ self.lora_id_to_request_ids.pop(lora_id)
459
+ self.lora_id_to_lora_request.pop(lora_id)
460
+ self.request_lora_mapping[req_index] = 0
461
+
462
+ self.logit_bias[req_index] = None
463
+ self.has_allowed_token_ids.discard(req_id)
464
+ if self.allowed_token_ids_mask_cpu_tensor is not None:
465
+ # False means we don't fill with -inf.
466
+ self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
467
+ self.bad_words_token_ids.pop(req_index, None)
468
+ self.pooling_params.pop(req_id, None)
469
+ return req_index
470
+
471
+ def swap_states(self, i1: int, i2: int) -> None:
472
+ old_id_i1 = self._req_ids[i1]
473
+ old_id_i2 = self._req_ids[i2]
474
+ self._req_ids[i1], self._req_ids[i2] =\
475
+ self._req_ids[i2], self._req_ids[i1] # noqa
476
+ self.req_output_token_ids[i1], self.req_output_token_ids[i2] =\
477
+ self.req_output_token_ids[i2], self.req_output_token_ids[i1]
478
+ assert old_id_i1 is not None and old_id_i2 is not None
479
+ self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] =\
480
+ self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1]
481
+ self.num_tokens[i1], self.num_tokens[i2] =\
482
+ self.num_tokens[i2], self.num_tokens[i1]
483
+ self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] =\
484
+ self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1]
485
+ self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] =\
486
+ self.num_prompt_tokens[i2], self.num_prompt_tokens[i1]
487
+ self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] =\
488
+ self.num_computed_tokens_cpu[i2], self.num_computed_tokens_cpu[i1]
489
+ self.temperature_cpu[i1], self.temperature_cpu[i2] =\
490
+ self.temperature_cpu[i2], self.temperature_cpu[i1]
491
+ self.top_p_cpu[i1], self.top_p_cpu[i2] =\
492
+ self.top_p_cpu[i2], self.top_p_cpu[i1]
493
+ self.top_k_cpu[i1], self.top_k_cpu[i2] =\
494
+ self.top_k_cpu[i2], self.top_k_cpu[i1]
495
+ self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] =\
496
+ self.frequency_penalties_cpu[i2], self.frequency_penalties_cpu[i1]
497
+ self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] =\
498
+ self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
499
+ self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\
500
+ self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
501
+ self.min_p_cpu[i1], self.min_p_cpu[i2] =\
502
+ self.min_p_cpu[i2], self.min_p_cpu[i1]
503
+ self.top_n_sigma_cpu[i1], self.top_n_sigma_cpu[i2] =\
504
+ self.top_n_sigma_cpu[i2], self.top_n_sigma_cpu[i1]
505
+
506
+ # NOTE: the following is unsafe
507
+ # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
508
+ # self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
509
+ # instead, we need to temporarily copy the data for one of the indices
510
+ # TODO(lucas): optimize this by only copying valid indices
511
+ tmp = self.token_ids_cpu[i1, ...].copy()
512
+ self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
513
+ self.token_ids_cpu[i2, ...] = tmp
514
+
515
+ swap_dict_values(self.generators, i1, i2)
516
+ swap_dict_values(self.min_tokens, i1, i2)
517
+ swap_dict_values(self.bad_words_token_ids, i1, i2)
518
+
519
+ self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\
520
+ self.request_lora_mapping[i2], self.request_lora_mapping[i1]
521
+ self.logit_bias[i1], self.logit_bias[i2] =\
522
+ self.logit_bias[i2], self.logit_bias[i1]
523
+
524
+ if self.allowed_token_ids_mask_cpu_tensor is not None:
525
+ self.allowed_token_ids_mask_cpu_tensor[i1], \
526
+ self.allowed_token_ids_mask_cpu_tensor[i2] =\
527
+ self.allowed_token_ids_mask_cpu_tensor[i2], \
528
+ self.allowed_token_ids_mask_cpu_tensor[i1]
529
+ self.block_table.swap_row(i1, i2)
530
+
531
+ def condense(self, empty_req_indices: list[int]) -> None:
532
+ """Move non-empty requests down into lower, empty indices.
533
+
534
+ Args:
535
+ empty_req_indices: empty batch indices, sorted descending.
536
+ """
537
+ num_reqs = self.num_reqs
538
+ if num_reqs == 0:
539
+ # The batched states are empty.
540
+ self._req_ids.clear()
541
+ self.req_output_token_ids.clear()
542
+ return
543
+
544
+ # NOTE(woosuk): This function assumes that the empty_req_indices
545
+ # is sorted in descending order.
546
+ last_req_index = num_reqs + len(empty_req_indices) - 1
547
+ while empty_req_indices:
548
+ # Find the largest non-empty index.
549
+ while last_req_index in empty_req_indices:
550
+ last_req_index -= 1
551
+
552
+ # Find the smallest empty index.
553
+ empty_index = empty_req_indices.pop()
554
+ if empty_index >= last_req_index:
555
+ break
556
+
557
+ # Swap the states.
558
+ req_id = self._req_ids[last_req_index]
559
+ output_token_ids = self.req_output_token_ids[last_req_index]
560
+ assert req_id is not None
561
+ self._req_ids[empty_index] = req_id
562
+ self._req_ids[last_req_index] = None
563
+ self.req_output_token_ids[empty_index] = output_token_ids
564
+ self.req_output_token_ids[last_req_index] = None
565
+ self.req_id_to_index[req_id] = empty_index
566
+
567
+ num_tokens = self.num_tokens[last_req_index]
568
+ self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
569
+ last_req_index, :num_tokens]
570
+ self.num_tokens[empty_index] = num_tokens
571
+ self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
572
+ last_req_index]
573
+ self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[
574
+ last_req_index]
575
+ self.num_computed_tokens_cpu[
576
+ empty_index] = self.num_computed_tokens_cpu[last_req_index]
577
+ self.block_table.move_row(last_req_index, empty_index)
578
+ self.temperature_cpu[empty_index] = self.temperature_cpu[
579
+ last_req_index]
580
+ self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
581
+ self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
582
+ self.frequency_penalties_cpu[
583
+ empty_index] = self.frequency_penalties_cpu[last_req_index]
584
+ self.presence_penalties_cpu[
585
+ empty_index] = self.presence_penalties_cpu[last_req_index]
586
+ self.repetition_penalties_cpu[
587
+ empty_index] = self.repetition_penalties_cpu[last_req_index]
588
+ self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index]
589
+ self.top_n_sigma_cpu[
590
+ empty_index] = self.top_n_sigma_cpu[last_req_index]
591
+ generator = self.generators.pop(last_req_index, None)
592
+ if generator is not None:
593
+ self.generators[empty_index] = generator
594
+
595
+ min_token = self.min_tokens.pop(last_req_index, None)
596
+ if min_token is not None:
597
+ self.min_tokens[empty_index] = min_token
598
+
599
+ self.request_lora_mapping[empty_index] = self.request_lora_mapping[
600
+ last_req_index]
601
+
602
+ self.logit_bias[empty_index] = self.logit_bias[last_req_index]
603
+
604
+ if self.allowed_token_ids_mask_cpu_tensor is not None:
605
+ self.allowed_token_ids_mask_cpu_tensor[
606
+ empty_index] = self.allowed_token_ids_mask_cpu_tensor[
607
+ last_req_index]
608
+
609
+ bad_words_token_ids = self.bad_words_token_ids.pop(
610
+ last_req_index, None)
611
+ if bad_words_token_ids is not None:
612
+ self.bad_words_token_ids[empty_index] = bad_words_token_ids
613
+ # Decrement last_req_index since it is now empty.
614
+ last_req_index -= 1
615
+
616
+ # Trim lists to the batch size.
617
+ del self._req_ids[self.num_reqs:]
618
+ del self.req_output_token_ids[self.num_reqs:]
619
+
620
+ def refresh_sampling_metadata(self):
621
+ self.sampling_metadata = self._make_sampling_metadata()
622
+
623
+ def _make_sampling_metadata(self) -> Union[SamplingMetadata, SamplingMetadataTopNSigma]:
624
+ num_reqs = self.num_reqs
625
+ if not self.all_greedy:
626
+ temperature = copy_slice(self.temperature_cpu_tensor,
627
+ self.temperature, num_reqs)
628
+ else:
629
+ temperature = None
630
+ if not self.no_top_p:
631
+ copy_slice(self.top_p_cpu_tensor, self.top_p, num_reqs)
632
+ if not self.no_top_k:
633
+ copy_slice(self.top_k_cpu_tensor, self.top_k, num_reqs)
634
+ if not self.no_min_p:
635
+ copy_slice(self.min_p_cpu_tensor, self.min_p, num_reqs)
636
+
637
+ if not self.no_penalties:
638
+ # Since syncing these tensors is expensive, only copy them
639
+ # if necessary i.e. if there are requests which require
640
+ # penalties to be applied during sampling.
641
+ copy_slice(self.frequency_penalties_cpu_tensor,
642
+ self.frequency_penalties, num_reqs)
643
+ copy_slice(self.presence_penalties_cpu_tensor,
644
+ self.presence_penalties, num_reqs)
645
+ copy_slice(self.repetition_penalties_cpu_tensor,
646
+ self.repetition_penalties, num_reqs)
647
+
648
+ if not self.no_top_n_sigma:
649
+ copy_slice(self.top_n_sigma_cpu_tensor,
650
+ self.top_n_sigma, num_reqs)
651
+
652
+
653
+ needs_prompt_token_ids = (not self.no_penalties or
654
+ (self.num_reqs > 0
655
+ and self.logits_processing_needs_token_ids))
656
+ if needs_prompt_token_ids:
657
+ # The prompt tokens are used only for applying penalties or
658
+ # step pooling during the sampling/pooling process.
659
+ # Hence copy these tensors only when there are requests which
660
+ # need penalties/step_pooler to be applied.
661
+ prompt_token_ids = self._make_prompt_token_ids_tensor()
662
+ else:
663
+ prompt_token_ids = None
664
+
665
+ allowed_token_ids_mask: Optional[torch.Tensor] = None
666
+ if not self.no_allowed_token_ids:
667
+ assert self.allowed_token_ids_mask is not None
668
+ copy_slice(self.allowed_token_ids_mask_cpu_tensor,
669
+ self.allowed_token_ids_mask, num_reqs)
670
+ allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs]
671
+
672
+ return SamplingMetadataTopNSigma(
673
+ temperature=temperature,
674
+ all_greedy=self.all_greedy,
675
+ all_random=self.all_random,
676
+ top_p=None if self.no_top_p else self.top_p[:num_reqs],
677
+ top_k=None if self.no_top_k else self.top_k[:num_reqs],
678
+ generators=self.generators,
679
+ max_num_logprobs=self.max_num_logprobs,
680
+ prompt_token_ids=prompt_token_ids,
681
+ frequency_penalties=self.frequency_penalties[:num_reqs],
682
+ presence_penalties=self.presence_penalties[:num_reqs],
683
+ repetition_penalties=self.repetition_penalties[:num_reqs],
684
+ top_n_sigma=self.top_n_sigma[:num_reqs],
685
+ output_token_ids=cast(list[list[int]], self.req_output_token_ids),
686
+ no_penalties=self.no_penalties,
687
+ no_top_n_sigma=self.no_top_n_sigma,
688
+ allowed_token_ids_mask=allowed_token_ids_mask,
689
+ bad_words_token_ids=self.bad_words_token_ids,
690
+ logitsprocs=self.logitsprocs,
691
+ )
692
+
693
+ @property
694
+ def pooling_metadata(self) -> PoolingMetadata:
695
+ if len(self.pooling_params) == 0:
696
+ pooling_params = []
697
+ else:
698
+ # Note: for now this assumes that all requests in the batch
699
+ # are either sampling or pooling requests
700
+ assert len(self.req_ids) == len(self.pooling_params)
701
+ pooling_params = [
702
+ self.pooling_params[req_id] for req_id in self.req_ids
703
+ ]
704
+
705
+ return PoolingMetadata(
706
+ prompt_lens=torch.from_numpy(
707
+ self.num_prompt_tokens[:self.num_reqs]).to(self.device),
708
+ prompt_token_ids=self.sampling_metadata.prompt_token_ids,
709
+ pooling_params=pooling_params,
710
+ )
711
+
712
+ def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
713
+ max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()
714
+ prompt_token_ids_cpu_tensor = torch.empty(
715
+ (self.num_reqs, max_prompt_len),
716
+ device="cpu",
717
+ dtype=torch.int64,
718
+ pin_memory=self.pin_memory,
719
+ )
720
+ prompt_token_ids = prompt_token_ids_cpu_tensor.numpy()
721
+ prompt_token_ids[:] = self.token_ids_cpu[:self.
722
+ num_reqs, :max_prompt_len]
723
+ # Use the value of vocab_size as a pad since we don't have a
724
+ # token_id of this value.
725
+ for i in range(self.num_reqs):
726
+ prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
727
+ return prompt_token_ids_cpu_tensor.to(device=self.device,
728
+ non_blocking=True)
729
+
730
+ def make_lora_inputs(
731
+ self, num_scheduled_tokens: np.ndarray
732
+ ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
733
+ """
734
+ Given the num_scheduled_tokens for each request in the batch, return
735
+ data structures used to activate the current LoRAs.
736
+ Returns:
737
+ 1. prompt_lora_mapping: A tuple of size self.num_reqs where,
738
+ prompt_lora_mapping[i] is the LoRA id to use for the ith prompt.
739
+ 2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
740
+ where, token_lora_mapping[i] is the LoRA id to use for ith token.
741
+ 3. lora_requests: Set of relevant LoRA requests.
742
+ """
743
+
744
+ req_lora_mapping = self.request_lora_mapping[:self.num_reqs]
745
+ prompt_lora_mapping = tuple(req_lora_mapping)
746
+ token_lora_mapping = tuple(
747
+ req_lora_mapping.repeat(num_scheduled_tokens))
748
+ active_lora_requests: set[LoRARequest] = set(
749
+ self.lora_id_to_lora_request.values())
750
+
751
+ return prompt_lora_mapping, token_lora_mapping, active_lora_requests
752
+
753
+ @property
754
+ def num_reqs(self) -> int:
755
+ return len(self.req_id_to_index)
756
+
757
+ @property
758
+ def all_greedy(self) -> bool:
759
+ return len(self.random_reqs) == 0
760
+
761
+ @property
762
+ def all_random(self) -> bool:
763
+ return len(self.greedy_reqs) == 0
764
+
765
+ @property
766
+ def no_top_p(self) -> bool:
767
+ return len(self.top_p_reqs) == 0
768
+
769
+ @property
770
+ def no_top_k(self) -> bool:
771
+ return len(self.top_k_reqs) == 0
772
+
773
+ @property
774
+ def no_min_p(self) -> bool:
775
+ return len(self.min_p_reqs) == 0
776
+
777
+ @property
778
+ def no_penalties(self) -> bool:
779
+ return (len(self.presence_penalties_reqs) == 0
780
+ and len(self.frequency_penalties_reqs) == 0
781
+ and len(self.repetition_penalties_reqs) == 0)
782
+ @property
783
+ def no_top_n_sigma(self) -> bool:
784
+ return len(self.top_n_sigma_reqs) == 0
785
+
786
+ @property
787
+ def max_num_logprobs(self) -> Optional[int]:
788
+ return max(self.num_logprobs.values()) if self.num_logprobs else None
789
+
790
+ @property
791
+ def no_prompt_logprob(self) -> bool:
792
+ return not self.num_prompt_logprobs
793
+
794
+ @property
795
+ def no_allowed_token_ids(self) -> bool:
796
+ return len(self.has_allowed_token_ids) == 0
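The batch code above threads a per-request `top_n_sigma` value through `SamplingParams.extra_args` and disables it with `-1` when absent. A hedged sketch of how a request would carry that knob (assuming a vLLM build whose `SamplingParams` exposes `extra_args`, as this file expects; the threshold value is illustrative only):

```python
# Illustration only: how a top_n_sigma value reaches InputBatch.add_request().
from vllm import SamplingParams

params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    extra_args={"top_n_sigma": 1.5},  # read in add_request(); -1 means "disabled"
)

# Inside add_request(), roughly:
#   if params.extra_args and "top_n_sigma" in params.extra_args:
#       self.top_n_sigma_cpu[req_index] = params.extra_args["top_n_sigma"]
#       self.top_n_sigma_reqs.add(req_id)
#   else:
#       self.top_n_sigma_cpu[req_index] = -1
```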
model-00002-of-000062.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e29a512e3737d1826c80a2277a8b42021878847753aadbe5e1ae2a2df3d7f8d
3
+ size 1242564208
model-00003-of-000062.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f63aa17d947032e0a524b5798eee3becbfc9a9b6f8a352ead3232e7b34bb289
3
+ size 1242564208
model-00005-of-000062.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3907f683d7f8382d2a792304155e8533ffa3a94dd4bb5ff825124b0dba3835
3
+ size 24650809648
model-00045-of-000062.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97458250006f5949c65b338a26503738200c5fb2415f4cc664a6b224aa9dce70
3
+ size 24650810432
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_openpangu_moe.py ADDED
@@ -0,0 +1,653 @@
1
+ # coding=utf-8
2
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torch.utils.checkpoint
22
+ from torch import nn
23
+ from transformers.activations import ACT2FN
24
+ from transformers.cache_utils import Cache, DynamicCache
25
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutputWithPast,
28
+ CausalLMOutputWithPast,
29
+ )
30
+ from transformers.modeling_utils import PreTrainedModel
31
+ from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
32
+ from transformers.utils.import_utils import is_torch_fx_available
33
+
34
+ from .configuration_openpangu_moe import PanguUltraMoEConfig
35
+
36
+ if is_torch_fx_available():
37
+ if not is_torch_greater_or_equal_than_1_13:
38
+ import torch.fx
39
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
40
+
41
+
42
+ class PanguUltraMoERMSNorm(nn.Module):
43
+ def __init__(self, hidden_dim, epsilon=1e-5):
44
+ super().__init__()
45
+ self.weight = nn.Parameter(torch.empty(hidden_dim))
46
+ self.epsilon = epsilon
47
+
48
+ def forward(self, input_x):
49
+ origin_dtype = input_x.dtype
50
+ var = input_x.to(torch.float32).pow(2).mean(-1, keepdim=True)
51
+ input_x = input_x * torch.rsqrt(var + self.epsilon)
52
+ output_x = self.weight * input_x
53
+ return output_x.to(origin_dtype)
54
+
55
+
56
+ class PanguUltraMoERotaryEmbedding(nn.Module):
57
+ def __init__(
58
+ self, dim, max_position_embeddings=131072, base=25600000.0, device=None
59
+ ):
60
+ super().__init__()
61
+ self.dim = dim
62
+ self.max_position_embeddings = max_position_embeddings
63
+ self.base = base
64
+ self._set_cache(
65
+ seq_len=max_position_embeddings,
66
+ device=device,
67
+ dtype=torch.get_default_dtype(),
68
+ )
69
+
70
+ def _set_cache(self, seq_len, device, dtype):
71
+ self.max_seq_len_cached = seq_len
72
+ dim = self.dim
73
+
74
+ inv_freq = 1.0 / (
75
+ self.base
76
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
77
+ )
78
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
79
+
80
+ t = torch.arange(seq_len, device=device, dtype=torch.float32)
81
+
82
+ freqs = torch.outer(t, inv_freq)
83
+
84
+ emb = torch.cat((freqs, freqs), dim=-1)
85
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
86
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
87
+
88
+ def forward(self, x, kv_len, max_seq_len=None):
89
+ if max_seq_len is None:
90
+ self._set_cache(seq_len=kv_len, device=x.device, dtype=x.dtype)
91
+ elif max_seq_len > self.max_seq_len_cached:
92
+ self._set_cache(seq_len=max_seq_len, device=x.device, dtype=x.dtype)
93
+
94
+ batch_size = x.shape[0]
95
+ seq_len = x.shape[1]
96
+ if seq_len == 1:
97
+ cos = (
98
+ torch.index_select(self.cos_cached, dim=0, index=kv_len)
99
+ .unsqueeze(1)
100
+ .unsqueeze(1)
101
+ )
102
+ sin = (
103
+ torch.index_select(self.sin_cached, dim=0, index=kv_len)
104
+ .unsqueeze(1)
105
+ .unsqueeze(1)
106
+ )
107
+ else:
108
+ cos = (
109
+ self.cos_cached[:seq_len]
110
+ .unsqueeze(0)
111
+ .unsqueeze(2)
112
+ .repeat(batch_size, 1, 1, 1)
113
+ )
114
+ sin = (
115
+ self.sin_cached[:seq_len]
116
+ .unsqueeze(0)
117
+ .unsqueeze(2)
118
+ .repeat(batch_size, 1, 1, 1)
119
+ )
120
+
121
+ cos = cos[0, :, 0, :]
122
+ sin = sin[0, :, 0, :]
123
+ return (
124
+ cos.to(dtype=x.dtype),
125
+ sin.to(dtype=x.dtype),
126
+ )
127
+
128
+
129
+ def rotate_half(x):
130
+ """Rotates half the hidden dims of the input."""
131
+ x1 = x[..., : x.shape[-1] // 2]
132
+ x2 = x[..., x.shape[-1] // 2 :]
133
+ return torch.cat((-x2, x1), dim=-1)
134
+
135
+
136
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
137
+ """Applies Rotary Position Embedding to the query and key tensors.
138
+
139
+ Args:
140
+ q (`torch.Tensor`): The query tensor.
141
+ k (`torch.Tensor`): The key tensor.
142
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
143
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
144
+ position_ids (`torch.Tensor`):
145
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
146
+ used to pass offsetted position ids when working with a KV-cache.
147
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
148
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
149
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
150
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
151
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
152
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
153
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
154
+ Returns:
155
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
156
+ """
157
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
158
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
159
+
160
+ b, h, s, d = q.shape
161
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
162
+
163
+ b, h, s, d = k.shape
164
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
165
+
166
+ q_embed = (q * cos) + (rotate_half(q) * sin)
167
+ k_embed = (k * cos) + (rotate_half(k) * sin)
168
+ return q_embed, k_embed
169
+
170
+
171
+ class MLP(nn.Module):
172
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
173
+ super().__init__()
174
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
175
+ self.intermediate_size = (
176
+ config.intermediate_size if intermediate_size is None else intermediate_size
177
+ )
178
+
179
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
180
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
181
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
182
+ self.act_fn = ACT2FN[config.hidden_act]
183
+
184
+ def forward(self, x):
185
+ output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
186
+ return output
187
+
188
+
189
+ class MoEGate(nn.Module):
190
+ def __init__(self, config):
191
+ super().__init__()
192
+ self.top_k = config.num_experts_per_tok
193
+ self.routed_scaling_factor = config.routed_scaling_factor
194
+
195
+ self.norm_topk_prob = config.norm_topk_prob
196
+ self.weight = nn.Parameter(
197
+ torch.empty((config.num_routed_experts, config.hidden_size))
198
+ )
199
+
200
+ def forward(self, hidden_states):
201
+ bsz, seq_len, h = hidden_states.shape
202
+ hidden_states = hidden_states.view(-1, h)
203
+ logits = F.linear(
204
+ hidden_states.to(torch.float32), self.weight.to(torch.float32), None
205
+ )
206
+ scores = logits.sigmoid()
207
+ scores_for_choice = scores.view(bsz * seq_len, -1)
208
+ _, topk_idx = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)
209
+ topk_weight = scores.gather(1, topk_idx)
210
+
211
+ if self.top_k > 1 and self.norm_topk_prob:
212
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
213
+ topk_weight = topk_weight / denominator
214
+ topk_weight = topk_weight * self.routed_scaling_factor
215
+
216
+ return topk_idx, topk_weight
217
+
218
+
219
+ class PanguUltraMoE(nn.Module):
220
+ def __init__(self, config):
221
+ super().__init__()
222
+ self.num_shared_experts = config.num_shared_experts
223
+ self.num_routed_experts = config.num_routed_experts
224
+ self.experts = nn.ModuleList(
225
+ [
226
+ MLP(config, intermediate_size=config.moe_intermediate_size)
227
+ for i in range(self.num_routed_experts)
228
+ ]
229
+ )
230
+ self.gate = MoEGate(config)
231
+ if self.num_shared_experts is not None:
232
+ intermediate_size = config.moe_intermediate_size * self.num_shared_experts
233
+ self.shared_experts = MLP(
234
+ config=config, intermediate_size=intermediate_size
235
+ )
236
+
237
+ def forward(self, hidden_states):
238
+ if self.num_shared_experts is not None:
239
+ shared_output = self.shared_experts(hidden_states)
240
+ input_shape = hidden_states.shape
241
+ topk_ids, topk_weight = self.gate(hidden_states)
242
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
243
+ counts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
244
+ counts.scatter_(1, topk_ids, 1)
245
+ tokens_per_expert = counts.sum(dim=0)
246
+ idxs = topk_ids.view(-1).argsort()
247
+ sorted_tokens = hidden_states[idxs // topk_ids.shape[1]]
248
+ tokens_per_expert = tokens_per_expert.cpu().numpy()
249
+
250
+ output_hidden_states = []
251
+ start_idx = 0
252
+ for i, num_tokens in enumerate(tokens_per_expert):
253
+ end_idx = start_idx + num_tokens
254
+ if num_tokens == 0:
255
+ continue
256
+ expert = self.experts[i]
257
+ tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
258
+ expert_out = expert(tokens_for_this_expert)
259
+ output_hidden_states.append(expert_out)
260
+ start_idx = end_idx
261
+
262
+ if len(output_hidden_states) > 0:
263
+ cat_hidden_states = torch.cat(output_hidden_states, dim=0)
264
+ else:
265
+ cat_hidden_states = sorted_tokens.new_empty(0)
266
+
267
+ final_hidden_states = torch.empty_like(cat_hidden_states)
268
+ final_hidden_states[idxs] = cat_hidden_states
269
+ final_out = final_hidden_states.view(*topk_ids.shape, -1).to(topk_weight.dtype)
270
+ final_out = (
271
+ final_out.mul_(topk_weight.unsqueeze(dim=-1))
272
+ .sum(dim=1)
273
+ .to(final_hidden_states.dtype)
274
+ ).view(*input_shape)
275
+ if self.num_shared_experts is not None:
276
+ final_out = final_out + shared_output
277
+ return final_out
278
+
279
+
280
+ class PanguUltraMoEAttention(nn.Module):
281
+ def __init__(self, config: PanguUltraMoEConfig, layer_idx: Optional[int] = None):
282
+ super().__init__()
283
+ self.layer_idx = layer_idx
284
+
285
+ self.attention_dropout = config.attention_dropout
286
+ self.hidden_size = config.hidden_size
287
+ self.num_heads = config.num_attention_heads
288
+ self.attention_q_lora_dim = config.attention_q_lora_dim
289
+ self.attention_qk_rope_dim = config.attention_qk_rope_dim
290
+ self.attention_kv_lora_dim = config.attention_kv_lora_dim
291
+ self.attention_v_dim = config.attention_v_dim
292
+ self.attention_qk_dim = config.attention_qk_dim
293
+ self.q_head_dim = config.attention_qk_dim + config.attention_qk_rope_dim
294
+
295
+ if self.attention_q_lora_dim is None:
296
+ self.q_proj = nn.Linear(
297
+ self.hidden_size, self.num_heads * self.q_head_dim, bias=False
298
+ )
299
+ else:
300
+ self.q_a_proj = nn.Linear(
301
+ self.hidden_size, config.attention_q_lora_dim, bias=False
302
+ )
303
+ self.q_a_layernorm = PanguUltraMoERMSNorm(config.attention_q_lora_dim)
304
+ self.q_b_proj = nn.Linear(
305
+ config.attention_q_lora_dim,
306
+ self.num_heads * self.q_head_dim,
307
+ bias=False,
308
+ )
309
+ self.kv_a_proj_with_mqa = nn.Linear(
310
+ self.hidden_size,
311
+ config.attention_kv_lora_dim + config.attention_qk_rope_dim,
312
+ bias=False,
313
+ )
314
+ self.kv_a_layernorm = PanguUltraMoERMSNorm(config.attention_kv_lora_dim)
315
+ self.kv_b_proj = nn.Linear(
316
+ config.attention_kv_lora_dim,
317
+ self.num_heads * (config.attention_qk_dim + self.attention_v_dim),
318
+ bias=False,
319
+ )
320
+ self.o_proj = nn.Linear(
321
+ self.num_heads * self.attention_v_dim,
322
+ self.hidden_size,
323
+ bias=False,
324
+ )
325
+ self.rotary_emb = PanguUltraMoERotaryEmbedding(
326
+ self.attention_qk_rope_dim,
327
+ max_position_embeddings=config.max_position_embeddings,
328
+ base=config.rope_theta,
329
+ )
330
+ self.softmax_scale = self.q_head_dim ** (-0.5)
331
+
332
+ def forward(
333
+ self,
334
+ hidden_states: torch.Tensor,
335
+ attention_mask: Optional[torch.Tensor] = None,
336
+ position_ids: Optional[torch.LongTensor] = None,
337
+ past_key_value: Optional[Cache] = None,
338
+ output_attentions: bool = False,
339
+ use_cache: bool = False,
340
+ **kwargs,
341
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
342
+ bsz, q_len, _ = hidden_states.size()
343
+
344
+ q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
345
+ q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
346
+ q_nope, q_pe = torch.split(
347
+ q, [self.attention_qk_dim, self.attention_qk_rope_dim], dim=-1
348
+ )
349
+
350
+ latent_kv = self.kv_a_proj_with_mqa(hidden_states)
351
+ kv_a, k_pe = torch.split(
352
+ latent_kv, [self.attention_kv_lora_dim, self.attention_qk_rope_dim], dim=-1
353
+ )
354
+ k_pe = k_pe.view(bsz, q_len, 1, self.attention_qk_rope_dim).transpose(1, 2)
355
+ kv = (
356
+ self.kv_b_proj(self.kv_a_layernorm(kv_a))
357
+ .view(
358
+ bsz, q_len, self.num_heads, self.attention_qk_dim + self.attention_v_dim
359
+ )
360
+ .transpose(1, 2)
361
+ )
362
+ kv_seq_len = kv.shape[-2]
363
+ if past_key_value is not None:
364
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
365
+ cos, sin = self.rotary_emb(kv, kv_seq_len)
366
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
367
+
368
+ k_nope, value = torch.split(
369
+ kv, [self.attention_qk_dim, self.attention_v_dim], dim=-1
370
+ )
371
+
372
+ def concat_nope_pe(nope, pe):
373
+ states = torch.empty(
374
+ [bsz, self.num_heads, q_len, self.q_head_dim],
375
+ dtype=nope.dtype,
376
+ device=nope.device,
377
+ )
378
+ states[:, :, :, : self.attention_qk_dim] = nope
379
+ states[:, :, :, self.attention_qk_dim :] = pe
380
+ return states
381
+
382
+ query = concat_nope_pe(q_nope, q_pe)
383
+ key = concat_nope_pe(k_nope, k_pe)
384
+
385
+ if past_key_value is not None:
386
+ key, value = past_key_value.update(
387
+ key, value, self.layer_idx, {"sin": sin, "cos": cos}
388
+ )
389
+
390
+ attn_weights = (
391
+ torch.matmul(query, key.transpose(2, 3)) * self.softmax_scale
392
+ + attention_mask
393
+ )
394
+ attn_weights = nn.functional.softmax(
395
+ attn_weights, dim=-1, dtype=torch.float32
396
+ ).to(query.dtype)
397
+ attn_weights = nn.functional.dropout(
398
+ attn_weights, p=self.attention_dropout, training=self.training
399
+ )
400
+ attn_output = torch.matmul(attn_weights, value)
401
+ attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
402
+ attn_output = self.o_proj(attn_output)
403
+
404
+ return attn_output, past_key_value
405
+
406
+
407
+ class PanguUltraMoEDecoderLayer(nn.Module):
408
+ def __init__(self, config: PanguUltraMoEConfig, layer_idx: int):
409
+ super().__init__()
410
+ self.hidden_size = config.hidden_size
411
+ self.self_attn = PanguUltraMoEAttention(config=config, layer_idx=layer_idx)
412
+
413
+ self.mlp = (
414
+ PanguUltraMoE(config)
415
+ if (
416
+ config.num_routed_experts is not None
417
+ and layer_idx >= config.num_dense_layers
418
+ )
419
+ else MLP(config)
420
+ )
421
+ self.input_layernorm = PanguUltraMoERMSNorm(
422
+ config.hidden_size, epsilon=config.rms_norm_eps
423
+ )
424
+ self.post_attention_layernorm = PanguUltraMoERMSNorm(
425
+ config.hidden_size, epsilon=config.rms_norm_eps
426
+ )
427
+ if getattr(config, "sandwich_norm", False):
428
+ self.sandwich_norm = True
429
+ self.pre_mlp_layernorm = PanguUltraMoERMSNorm(
430
+ config.hidden_size, epsilon=config.rms_norm_eps
431
+ )
432
+ self.post_mlp_layernorm = PanguUltraMoERMSNorm(
433
+ config.hidden_size, epsilon=config.rms_norm_eps
434
+ )
435
+ else:
436
+ self.sandwich_norm = False
437
+
438
+ def forward(
439
+ self,
440
+ hidden_states: torch.Tensor,
441
+ attention_mask: Optional[torch.Tensor] = None,
442
+ position_ids: Optional[torch.LongTensor] = None,
443
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
444
+ use_cache: Optional[bool] = False,
445
+ **kwargs,
446
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
447
+ residual = hidden_states
448
+ hidden_states = self.input_layernorm(hidden_states)
449
+
450
+ hidden_states, present_key_value = self.self_attn(
451
+ hidden_states=hidden_states,
452
+ attention_mask=attention_mask,
453
+ position_ids=position_ids,
454
+ past_key_value=past_key_value,
455
+ use_cache=use_cache,
456
+ **kwargs,
457
+ )
458
+ if self.sandwich_norm:
459
+ hidden_states = self.post_attention_layernorm(hidden_states)
460
+ hidden_states = residual + hidden_states
461
+ residual = hidden_states
462
+ hidden_states = self.pre_mlp_layernorm(hidden_states)
463
+ else:
464
+ hidden_states = residual + hidden_states
465
+ residual = hidden_states
466
+ hidden_states = self.post_attention_layernorm(hidden_states)
467
+
468
+ hidden_states = self.mlp(hidden_states)
469
+
470
+ if self.sandwich_norm:
471
+ hidden_states = self.post_mlp_layernorm(hidden_states)
472
+ hidden_states = residual + hidden_states
473
+
474
+ return (hidden_states, present_key_value)
475
+
476
+
477
+ class PanguUltraMoEPreTrainedModel(PreTrainedModel):
478
+ config_class = PanguUltraMoEConfig
479
+ base_model_prefix = "model"
480
+ supports_gradient_checkpointing = True
481
+ _no_split_modules = ["PanguUltraMoEDecoderLayer"]
482
+ _skip_keys_device_placement = "past_key_values"
483
+ _supports_cache_class = True
484
+
485
+ def _init_weights(self, module):
486
+ std = self.config.initializer_range
487
+ self._initialize_linear(module, std)
488
+ self._initialize_embedding(module, std)
489
+
490
+ def _initialize_linear(self, module, std):
491
+ if isinstance(module, nn.Linear):
492
+ module.weight.data.normal_(mean=0.0, std=std)
493
+ if module.bias is not None:
494
+ module.bias.data.zero_()
495
+
496
+ def _initialize_embedding(self, module, std):
497
+ if isinstance(module, nn.Embedding):
498
+ module.weight.data.normal_(mean=0.0, std=std)
499
+ if module.padding_idx is not None:
500
+ module.weight.data[module.padding_idx].zero_()
501
+
502
+
503
+ class PanguUltraMoEModel(PanguUltraMoEPreTrainedModel):
504
+ def __init__(self, config: PanguUltraMoEConfig):
505
+ super().__init__(config)
506
+
507
+ self.vocab_size = config.vocab_size
508
+ self.hidden_size = config.hidden_size
509
+ self.padding_idx = config.pad_token_id
510
+ self.layer_num = config.num_hidden_layers
511
+ self.epsilon = config.rms_norm_eps
512
+
513
+ self.embed_tokens = nn.Embedding(
514
+ self.vocab_size, self.hidden_size, self.padding_idx
515
+ )
516
+ self.layers = nn.ModuleList(
517
+ [PanguUltraMoEDecoderLayer(config, idx) for idx in range(self.layer_num)]
518
+ )
519
+ self.norm = PanguUltraMoERMSNorm(self.hidden_size, epsilon=self.epsilon)
520
+ self.gradient_checkpointing = False
521
+
522
+ self.post_init()
523
+
524
+ def get_input_embeddings(self):
525
+ return self.embed_tokens
526
+
527
+ def set_input_embeddings(self, value):
528
+ self.embed_tokens = value
529
+
530
+ def forward(
531
+ self,
532
+ input_ids: torch.LongTensor = None,
533
+ attention_mask: Optional[torch.Tensor] = None,
534
+ position_ids: Optional[torch.LongTensor] = None,
535
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
536
+ inputs_embeds: Optional[torch.FloatTensor] = None,
537
+ use_cache: Optional[bool] = None,
538
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
539
+ if input_ids is not None and inputs_embeds is not None:
540
+ raise ValueError("You have to specify input_ids or inputs_embeds.")
541
+
542
+ if input_ids is not None:
543
+ hidden_states = self.embed_tokens(input_ids)
544
+ batch_size, seq_length = input_ids.size()
545
+ else:
546
+ hidden_states = inputs_embeds
547
+ batch_size, seq_length = inputs_embeds.size()
548
+
549
+ if position_ids is None:
550
+ position_ids = torch.arange(
551
+ seq_length, dtype=torch.long, device=hidden_states.device
552
+ ).unsqueeze(0)
553
+
554
+ past_key_values_length = 0
555
+ if use_cache:
556
+ use_legacy_cache = not isinstance(past_key_values, Cache)
557
+ if use_legacy_cache:
558
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
559
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
560
+ position_ids += past_key_values_length
561
+
562
+ attention_mask = _prepare_4d_causal_attention_mask(
563
+ attention_mask,
564
+ (batch_size, seq_length),
565
+ hidden_states,
566
+ past_key_values_length,
567
+ )
568
+
569
+ for decoder_layer in self.layers:
570
+ hidden_states, present_key_value = decoder_layer(
571
+ hidden_states,
572
+ attention_mask=attention_mask,
573
+ position_ids=position_ids,
574
+ past_key_value=past_key_values,
575
+ use_cache=use_cache,
576
+ )
577
+
578
+ hidden_states = self.norm(hidden_states)
579
+
580
+ if use_cache and use_legacy_cache:
581
+ present_key_value = present_key_value.to_legacy_cache()
582
+
583
+ return BaseModelOutputWithPast(
584
+ last_hidden_state=hidden_states,
585
+ past_key_values=present_key_value,
586
+ )
587
+
588
+
589
+ class PanguUltraMoEForCausalLM(PanguUltraMoEPreTrainedModel):
590
+ _tied_weights_keys = ["lm_head.weight"]
591
+
592
+ def __init__(self, config):
593
+ super().__init__(config)
594
+ self.model = PanguUltraMoEModel(config)
595
+ self.vocab_size = config.vocab_size
596
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
597
+
598
+ self.post_init()
599
+
600
+ def get_input_embeddings(self):
601
+ return self.model.embed_tokens
602
+
603
+ def set_input_embeddings(self, value):
604
+ self.model.embed_tokens = value
605
+
606
+ def get_output_embeddings(self):
607
+ return self.lm_head
608
+
609
+ def set_output_embeddings(self, new_embeddings):
610
+ self.lm_head = new_embeddings
611
+
612
+ def set_decoder(self, decoder):
613
+ self.model = decoder
614
+
615
+ def get_decoder(self):
616
+ return self.model
617
+
618
+ def forward(
619
+ self,
620
+ input_ids: torch.LongTensor = None,
621
+ attention_mask: Optional[torch.Tensor] = None,
622
+ position_ids: Optional[torch.LongTensor] = None,
623
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
624
+ inputs_embeds: Optional[torch.FloatTensor] = None,
625
+ labels: Optional[torch.LongTensor] = None,
626
+ use_cache: Optional[bool] = None,
627
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
628
+
629
+ outputs = self.model(
630
+ input_ids=input_ids,
631
+ attention_mask=attention_mask,
632
+ position_ids=position_ids,
633
+ past_key_values=past_key_values,
634
+ inputs_embeds=inputs_embeds,
635
+ use_cache=use_cache,
636
+ )
637
+
638
+ logits = self.lm_head(outputs[0])
639
+ logits = logits.float()
640
+
641
+ loss = None
642
+ if labels is not None:
643
+ loss = self.loss_function(
644
+ logits=logits, labels=labels, vocab_size=self.vocab_size
645
+ )
646
+
647
+ return CausalLMOutputWithPast(
648
+ loss=loss,
649
+ logits=logits,
650
+ past_key_values=outputs.past_key_values,
651
+ hidden_states=outputs.hidden_states,
652
+ attentions=outputs.attentions,
653
+ )
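For orientation, a minimal usage sketch of `PanguUltraMoEForCausalLM` through the `transformers` auto classes follows. The checkpoint path, dtype, and device settings are placeholders/assumptions, not the repository's documented workflow (see the `inference/` scripts for that), and the full 718B MoE checkpoint will not fit on a single device, so this is illustrative only.

```python
# Illustrative sketch: load the custom classes shipped in this repo via trust_remote_code.
# "path/to/openPangu-Ultra-MoE-718B" is a placeholder for a local checkpoint directory.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/openPangu-Ultra-MoE-718B"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # assumption about the weight dtype
    device_map="auto",
    trust_remote_code=True,      # picks up PanguUltraMoEForCausalLM from modeling_openpangu_moe.py
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```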
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[unused10]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
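Note that the end-of-sequence token is mapped to `[unused10]` rather than `</s>`, matching the end-of-turn marker used by the chat template later in this commit. A quick verification sketch, assuming a tokenizer already loaded as in the example above:

```python
# Sketch: check the special-token mapping declared in special_tokens_map.json.
# Assumes `tokenizer` was loaded with AutoTokenizer.from_pretrained(..., trust_remote_code=True).
assert tokenizer.bos_token == "<s>"
assert tokenizer.eos_token == "[unused10]"   # end-of-turn marker, not "</s>"
assert tokenizer.pad_token == "<unk>"
assert tokenizer.unk_token == "<unk>"
print(tokenizer.convert_tokens_to_ids(tokenizer.eos_token))  # id used to stop generation
```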
tokenization_openpangu.py ADDED
@@ -0,0 +1,273 @@
+ # coding=utf-8
+ # Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+ def convert_bool(string):
+     if isinstance(string, str):
+         if string.lower() == "true":
+             return True
+         elif string.lower() == "false":
+             return False
+         else:
+             return string
+     else:
+         return string
+
+
+ class PanguUltraMoETokenizer(PreTrainedTokenizer):
+     """
+     Construct a tokenizer based on SentencePiece.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     model_input_names = ["input_ids", "attention_mask"]
+     _auto_class = "AutoTokenizer"
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         pad_token="</s>",
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         add_bos_token=True,
+         add_eos_token=False,
+         decode_with_prefix_space=False,
+         clean_up_tokenization_spaces=False,
+         **kwargs,
+     ):
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs,
+         )
+         self.vocab_file = vocab_file
+         self.add_bos_token = convert_bool(add_bos_token)
+         self.add_eos_token = add_eos_token
+         self.decode_with_prefix_space = decode_with_prefix_space
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         self._no_prefix_space_tokens = None
+
+     """ Initialisation"""
+
+     @property
+     def no_prefix_space_tokens(self):
+         if self._no_prefix_space_tokens is None:
+             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+             self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+         return self._no_prefix_space_tokens
+
+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()
+
+     @property
+     def bos_token_id(self) -> Optional[int]:
+         return self.sp_model.bos_id()
+
+     @property
+     def eos_token_id(self) -> Optional[int]:
+         return super().eos_token_id
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text):
+         """Returns a tokenized string."""
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def _maybe_add_prefix_space(self, tokens, decoded):
+         if tokens and tokens[0] not in self.no_prefix_space_tokens:
+             return " " + decoded
+         else:
+             return decoded
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) in a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 # Decode the current sub-tokens first
+                 if current_sub_tokens:
+                     out_string += self.sp_model.decode(current_sub_tokens)
+                     current_sub_tokens = []
+                 # Append the special token without adding extra spaces
+                 out_string += token
+                 prev_is_special = True
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         # Decode any remaining sub-tokens
+         if current_sub_tokens:
+             out_string += self.sp_model.decode(current_sub_tokens)
+         # Clean up leading and trailing spaces
+         if self.clean_up_tokenization_spaces:
+             out_string = self.clean_up_tokenization(out_string)
+         out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+         return out_string[1:]
+
+     # Override decode to set spaces_between_special_tokens to False as default
+     def decode(self,
+                token_ids,
+                spaces_between_special_tokens: bool = False,
+                **kwargs):
+         return super().decode(
+             token_ids=token_ids,
+             spaces_between_special_tokens=spaces_between_special_tokens,
+             **kwargs,
+         )
+
+     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return ("",)
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         if self.add_bos_token:
+             bos_token_ids = [self.bos_token_id]
+         else:
+             bos_token_ids = []
+
+         output = bos_token_ids + token_ids_0
+
+         if token_ids_1 is not None:
+             output = output + token_ids_1
+
+         if self.add_eos_token:
+             output = output + [self.eos_token_id]
+
+         return output
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is None:
+             return [1] + ([0] * len(token_ids_0)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. This model does
+         not make use of token type ids, therefore a list of zeros is returned.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of zeros.
+         """
+         eos = [self.eos_token_id]
+
+         if token_ids_1 is None:
+             return len(token_ids_0 + eos) * [0]
+         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
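A round-trip sketch for the SentencePiece-backed tokenizer above; the checkpoint path is a placeholder, and note that `tokenizer_config.json` below ships with `add_bos_token`/`add_eos_token` disabled, so no special tokens are inserted here unless enabled explicitly.

```python
# Sketch: encode/decode round trip with PanguUltraMoETokenizer.
# "path/to/openPangu-Ultra-MoE-718B" is a placeholder for a local directory containing tokenizer.model.
from transformers import AutoTokenizer

model_path = "path/to/openPangu-Ultra-MoE-718B"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

ids = tokenizer("openPangu tokenizer test")["input_ids"]
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))  # SentencePiece pieces, word-initial pieces prefixed with "▁"
print(tokenizer.decode(ids))                 # should reproduce the input text
```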
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"add_bos_token": false, "add_eos_token": false, "add_prefix_space": true, "added_tokens_decoder": {"0": {"content": "<unk>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "1": {"content": "<s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "2": {"content": "</s>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45806": {"content": "<|User|>:", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45813": {"content": "<|Bot|>:", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45830": {"content": "[unused0]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45840": {"content": "[unused1]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45846": {"content": "[unused2]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45849": {"content": "[unused3]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45861": {"content": "[unused4]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45866": {"content": "[unused5]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45874": {"content": "[unused6]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45883": {"content": "[unused7]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45884": {"content": "[unused8]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45887": {"content": "[unused9]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45892": {"content": "[unused10]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45920": {"content": "[unused11]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45932": {"content": "[unused12]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45938": {"content": "[unused13]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45953": {"content": "[unused14]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45968": {"content": "[unused15]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45974": {"content": "[unused16]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45982": {"content": "[unused17]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "45986": {"content": "[unused18]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46005": {"content": "[unused19]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46007": {"content": "[unused20]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46014": {"content": "[unused21]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, 
"special": true}, "46017": {"content": "[unused22]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46028": {"content": "[unused23]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46032": {"content": "[unused24]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46081": {"content": "[unused25]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46086": {"content": "[unused26]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46101": {"content": "[unused27]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46183": {"content": "[unused28]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46230": {"content": "[unused29]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46245": {"content": "[unused30]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "46257": {"content": "[unused31]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "144208": {"content": "[unused32]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}, "144209": {"content": "[unused33]", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}}, "auto_map": {"AutoTokenizer": ["tokenization_openpangu.PanguUltraMoETokenizer", null]}, "bos_token": "<s>", "clean_up_tokenization_spaces": false, "eos_token": "[unused10]", "legacy": true, "model_max_length": 1000000000000000019884624838656, "pad_token": "<unk>", "sp_model_kwargs": {}, "spaces_between_special_tokens": false, "tokenizer_class": "PanguUltraMoETokenizer", "unk_token": "<unk>", "use_default_system_prompt": false, "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '[unused9]系统:[unused10]' }}{% endif %}{% if message['role'] == 'system' %}{{ '[unused9]系统:' + message['content'] + '[unused10]' }}{% endif %}{% if message['role'] == 'assistant' %}{{'[unused9]助手:' + message['content'] + '[unused10]'}}{% endif %}{% if message['role'] == 'tool' %}{{'[unused9]工具:' + message['content'] + '[unused10]'}}{% endif %}{% if message['role'] == 'function' %}{{'[unused9]方法:' + message['content'] + '[unused10]'}}{% endif %}{% if message['role'] == 'user' %}{{'[unused9]用户:' + message['content'] + '[unused10]'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[unused9]助手:' }}{% endif %}"}